From aad4b41a34191b07ad0cb78b4009b46fd23382f7 Mon Sep 17 00:00:00 2001 From: Leo Stone Date: Tue, 22 Oct 2024 21:12:01 -0700 Subject: [PATCH 0001/1450] Documentation: ieee802154: fix grammar Fix grammar where it improves readability. Signed-off-by: Leo Stone Reviewed-by: Miquel Raynal Reviewed-by: Simon Horman Reviewed-by: Bagas Sanjaya Link: https://lore.kernel.org/20241023041203.35313-1-leocstone@gmail.com Signed-off-by: Stefan Schmidt --- Documentation/networking/ieee802154.rst | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/Documentation/networking/ieee802154.rst b/Documentation/networking/ieee802154.rst index c652d383fe102..743c0a80e3096 100644 --- a/Documentation/networking/ieee802154.rst +++ b/Documentation/networking/ieee802154.rst @@ -72,7 +72,8 @@ exports a management (e.g. MLME) and data API. possibly with some kinds of acceleration like automatic CRC computation and comparison, automagic ACK handling, address matching, etc. -Those types of devices require different approach to be hooked into Linux kernel. +Each type of device requires a different approach to be hooked into the Linux +kernel. HardMAC ------- @@ -81,10 +82,10 @@ See the header include/net/ieee802154_netdev.h. You have to implement Linux net_device, with .type = ARPHRD_IEEE802154. Data is exchanged with socket family code via plain sk_buffs. On skb reception skb->cb must contain additional info as described in the struct ieee802154_mac_cb. During packet transmission -the skb->cb is used to provide additional data to device's header_ops->create -function. Be aware that this data can be overridden later (when socket code -submits skb to qdisc), so if you need something from that cb later, you should -store info in the skb->data on your own. +the skb->cb is used to provide additional data to the device's +header_ops->create function. Be aware that this data can be overridden later +(when socket code submits skb to qdisc), so if you need something from that cb +later, you should store info in the skb->data on your own. To hook the MLME interface you have to populate the ml_priv field of your net_device with a pointer to struct ieee802154_mlme_ops instance. The fields @@ -94,8 +95,9 @@ All other fields are required. SoftMAC ------- -The MAC is the middle layer in the IEEE 802.15.4 Linux stack. This moment it -provides interface for drivers registration and management of slave interfaces. +The MAC is the middle layer in the IEEE 802.15.4 Linux stack. At the moment, it +provides an interface for driver registration and management of slave +interfaces. NOTE: Currently the only monitor device type is supported - it's IEEE 802.15.4 stack interface for network sniffers (e.g. WireShark). From 34cd3bdffa111eb483c12c0a55d77ad0634bb5f3 Mon Sep 17 00:00:00 2001 From: Balaji Pothunoori Date: Tue, 29 Oct 2024 14:03:40 +0530 Subject: [PATCH 0002/1450] wifi: ath11k: Suspend hardware before firmware mode off for WCN6750 During rmmod, the ath11k host driver sends a QMI MODE OFF command to firmware. As part of this command, firmware initiates WLAN de-initialization and accesses certain UMAC registers during this process. Currently, on WCN6750 WLAN hardware, the system is in a sleep state when firmware receives the QMI MODE OFF command. This results in a firmware/hardware reset while accessing the UMAC hardware registers during sleep state. To avoid this, add logic to send WCN6750 hardware specific WMI_PDEV_SUSPEND_AND_DISABLE_INTR command to firmware prior to sending the QMI MODE OFF command. This will cause firmware to cease all activities and put the device in a powered-on state that prevents access to registers which have been powered off. Signed-off-by: Balaji Pothunoori Acked-by: Jeff Johnson Link: https://patch.msgid.link/20241029083340.3010798-1-quic_bpothuno@quicinc.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath11k/core.c | 45 ++++++++++++++++++++++++++ drivers/net/wireless/ath/ath11k/hw.h | 1 + 2 files changed, 46 insertions(+) diff --git a/drivers/net/wireless/ath/ath11k/core.c b/drivers/net/wireless/ath/ath11k/core.c index be67382c00f6d..a9aefb1a705d7 100644 --- a/drivers/net/wireless/ath/ath11k/core.c +++ b/drivers/net/wireless/ath/ath11k/core.c @@ -123,6 +123,7 @@ static const struct ath11k_hw_params ath11k_hw_params[] = { .tx_ring_size = DP_TCL_DATA_RING_SIZE, .smp2p_wow_exit = false, .support_dual_stations = false, + .pdev_suspend = false, }, { .hw_rev = ATH11K_HW_IPQ6018_HW10, @@ -207,6 +208,7 @@ static const struct ath11k_hw_params ath11k_hw_params[] = { .smp2p_wow_exit = false, .support_fw_mac_sequence = false, .support_dual_stations = false, + .pdev_suspend = false, }, { .name = "qca6390 hw2.0", @@ -296,6 +298,7 @@ static const struct ath11k_hw_params ath11k_hw_params[] = { .smp2p_wow_exit = false, .support_fw_mac_sequence = true, .support_dual_stations = true, + .pdev_suspend = false, }, { .name = "qcn9074 hw1.0", @@ -379,6 +382,7 @@ static const struct ath11k_hw_params ath11k_hw_params[] = { .smp2p_wow_exit = false, .support_fw_mac_sequence = false, .support_dual_stations = false, + .pdev_suspend = false, }, { .name = "wcn6855 hw2.0", @@ -468,6 +472,7 @@ static const struct ath11k_hw_params ath11k_hw_params[] = { .smp2p_wow_exit = false, .support_fw_mac_sequence = true, .support_dual_stations = true, + .pdev_suspend = false, }, { .name = "wcn6855 hw2.1", @@ -555,6 +560,7 @@ static const struct ath11k_hw_params ath11k_hw_params[] = { .smp2p_wow_exit = false, .support_fw_mac_sequence = true, .support_dual_stations = true, + .pdev_suspend = false, }, { .name = "wcn6750 hw1.0", @@ -637,6 +643,7 @@ static const struct ath11k_hw_params ath11k_hw_params[] = { .smp2p_wow_exit = true, .support_fw_mac_sequence = true, .support_dual_stations = false, + .pdev_suspend = true, }, { .hw_rev = ATH11K_HW_IPQ5018_HW10, @@ -719,6 +726,7 @@ static const struct ath11k_hw_params ath11k_hw_params[] = { .smp2p_wow_exit = false, .support_fw_mac_sequence = false, .support_dual_stations = false, + .pdev_suspend = false, }, { .name = "qca2066 hw2.1", @@ -808,6 +816,7 @@ static const struct ath11k_hw_params ath11k_hw_params[] = { .smp2p_wow_exit = false, .support_fw_mac_sequence = true, .support_dual_stations = true, + .pdev_suspend = false, }, }; @@ -1669,11 +1678,47 @@ static int ath11k_core_pdev_create(struct ath11k_base *ab) return ret; } +static void ath11k_core_pdev_suspend_target(struct ath11k_base *ab) +{ + struct ath11k *ar; + struct ath11k_pdev *pdev; + unsigned long time_left; + int ret; + int i; + + if (!ab->hw_params.pdev_suspend) + return; + + for (i = 0; i < ab->num_radios; i++) { + pdev = &ab->pdevs[i]; + ar = pdev->ar; + + reinit_completion(&ab->htc_suspend); + + ret = ath11k_wmi_pdev_suspend(ar, WMI_PDEV_SUSPEND_AND_DISABLE_INTR, + pdev->pdev_id); + if (ret) { + ath11k_warn(ab, "could not suspend target :%d\n", ret); + /* pointless to try other pdevs */ + return; + } + + time_left = wait_for_completion_timeout(&ab->htc_suspend, 3 * HZ); + + if (!time_left) { + ath11k_warn(ab, "suspend timed out - target pause event never came\n"); + /* pointless to try other pdevs */ + return; + } + } +} + static void ath11k_core_pdev_destroy(struct ath11k_base *ab) { ath11k_spectral_deinit(ab); ath11k_thermal_unregister(ab); ath11k_mac_unregister(ab); + ath11k_core_pdev_suspend_target(ab); ath11k_hif_irq_disable(ab); ath11k_dp_pdev_free(ab); ath11k_debugfs_pdev_destroy(ab); diff --git a/drivers/net/wireless/ath/ath11k/hw.h b/drivers/net/wireless/ath/ath11k/hw.h index 300322535766e..52d9f4c13b136 100644 --- a/drivers/net/wireless/ath/ath11k/hw.h +++ b/drivers/net/wireless/ath/ath11k/hw.h @@ -227,6 +227,7 @@ struct ath11k_hw_params { bool smp2p_wow_exit; bool support_fw_mac_sequence; bool support_dual_stations; + bool pdev_suspend; }; struct ath11k_hw_ops { From 38db1ae301c37b3c9599f4f67cd495b6f3a9010e Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Mon, 28 Oct 2024 07:08:40 -0700 Subject: [PATCH 0003/1450] wifi: ath12k: mark QMI driver event helpers as noinline As described in [1], compiling the ath12k driver using clang with KASAN enabled warns about some functions with excessive stack usage, with the worst case being: drivers/net/wireless/ath/ath12k/qmi.c:3546:13: warning: stack frame size (2456) exceeds limit (1024) in 'ath12k_qmi_driver_event_work' [-Wframe-larger-than] Nathan [2] highlighted work done by Arnd [3] to address similar issues in other portions of the kernel. ath12k_qmi_driver_event_work() itself is a pretty lightweight function, but it dispatches to several other functions which do the real work: ath12k_qmi_driver_event_work() ath12k_qmi_event_server_arrive() ath12k_qmi_host_cap_send() ath12k_qmi_event_mem_request() ath12k_qmi_respond_fw_mem_request() ath12k_qmi_event_load_bdf() ath12k_qmi_request_target_cap() ath12k_qmi_load_bdf_qmi() ath12k_qmi_wlanfw_m3_info_send() Mark all of those underlying functions as 'noinline_for_stack' to prevent them from being inlined in ath12k_qmi_driver_event_work(), thereby eliminating the excessive stack usage. Link: https://msgid.link/bc214795-1c51-4cb7-922f-67d6ef98bff2@quicinc.com # [1] Link: https://msgid.link/20241025223321.GA3647469@thelio-3990X # [2] Link: https://lore.kernel.org/all/?q=f:arnd@kernel.org+Wframe-larger-than # [3] Acked-by: Kalle Valo Link: https://patch.msgid.link/20241028-ath12k_qmi_driver_event_work-v1-1-0d532eb593fa@quicinc.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/qmi.c | 34 ++++++++++++++++++++------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/qmi.c b/drivers/net/wireless/ath/ath12k/qmi.c index b93ce9f87f612..d2d9d03c7a28e 100644 --- a/drivers/net/wireless/ath/ath12k/qmi.c +++ b/drivers/net/wireless/ath/ath12k/qmi.c @@ -2066,7 +2066,9 @@ static void ath12k_host_cap_parse_mlo(struct ath12k_base *ab, req->mlo_chip_info_valid = 1; } -static int ath12k_qmi_host_cap_send(struct ath12k_base *ab) +/* clang stack usage explodes if this is inlined */ +static noinline_for_stack +int ath12k_qmi_host_cap_send(struct ath12k_base *ab) { struct qmi_wlanfw_host_cap_req_msg_v01 req = {}; struct qmi_wlanfw_host_cap_resp_msg_v01 resp = {}; @@ -2275,7 +2277,9 @@ static int ath12k_qmi_fw_ind_register_send(struct ath12k_base *ab) return ret; } -static int ath12k_qmi_respond_fw_mem_request(struct ath12k_base *ab) +/* clang stack usage explodes if this is inlined */ +static noinline_for_stack +int ath12k_qmi_respond_fw_mem_request(struct ath12k_base *ab) { struct qmi_wlanfw_respond_mem_req_msg_v01 *req; struct qmi_wlanfw_respond_mem_resp_msg_v01 resp = {}; @@ -2433,7 +2437,9 @@ static int ath12k_qmi_alloc_target_mem_chunk(struct ath12k_base *ab) return 0; } -static int ath12k_qmi_request_target_cap(struct ath12k_base *ab) +/* clang stack usage explodes if this is inlined */ +static noinline_for_stack +int ath12k_qmi_request_target_cap(struct ath12k_base *ab) { struct qmi_wlanfw_cap_req_msg_v01 req = {}; struct qmi_wlanfw_cap_resp_msg_v01 resp = {}; @@ -2619,8 +2625,10 @@ static int ath12k_qmi_load_file_target_mem(struct ath12k_base *ab, return ret; } -static int ath12k_qmi_load_bdf_qmi(struct ath12k_base *ab, - enum ath12k_qmi_bdf_type type) +/* clang stack usage explodes if this is inlined */ +static noinline_for_stack +int ath12k_qmi_load_bdf_qmi(struct ath12k_base *ab, + enum ath12k_qmi_bdf_type type) { struct device *dev = ab->dev; char filename[ATH12K_QMI_MAX_BDF_FILE_NAME_SIZE]; @@ -2791,7 +2799,9 @@ static int ath12k_qmi_m3_load(struct ath12k_base *ab) return ret; } -static int ath12k_qmi_wlanfw_m3_info_send(struct ath12k_base *ab) +/* clang stack usage explodes if this is inlined */ +static noinline_for_stack +int ath12k_qmi_wlanfw_m3_info_send(struct ath12k_base *ab) { struct m3_mem_region *m3_mem = &ab->qmi.m3_mem; struct qmi_wlanfw_m3_info_req_msg_v01 req = {}; @@ -3079,7 +3089,9 @@ ath12k_qmi_driver_event_post(struct ath12k_qmi *qmi, return 0; } -static int ath12k_qmi_event_server_arrive(struct ath12k_qmi *qmi) +/* clang stack usage explodes if this is inlined */ +static noinline_for_stack +int ath12k_qmi_event_server_arrive(struct ath12k_qmi *qmi) { struct ath12k_base *ab = qmi->ab; int ret; @@ -3101,7 +3113,9 @@ static int ath12k_qmi_event_server_arrive(struct ath12k_qmi *qmi) return ret; } -static int ath12k_qmi_event_mem_request(struct ath12k_qmi *qmi) +/* clang stack usage explodes if this is inlined */ +static noinline_for_stack +int ath12k_qmi_event_mem_request(struct ath12k_qmi *qmi) { struct ath12k_base *ab = qmi->ab; int ret; @@ -3115,7 +3129,9 @@ static int ath12k_qmi_event_mem_request(struct ath12k_qmi *qmi) return ret; } -static int ath12k_qmi_event_load_bdf(struct ath12k_qmi *qmi) +/* clang stack usage explodes if this is inlined */ +static noinline_for_stack +int ath12k_qmi_event_load_bdf(struct ath12k_qmi *qmi) { struct ath12k_base *ab = qmi->ab; int ret; From 757cc46520091ca103b9a948ddbdfa660a10879d Mon Sep 17 00:00:00 2001 From: Kalle Valo Date: Fri, 1 Nov 2024 17:16:58 +0200 Subject: [PATCH 0004/1450] wifi: ath12k: ath12k_mac_vdev_create(): use goto for error handling In commit 477cabfdb776 ("wifi: ath12k: modify link arvif creation and removal for MLO") I had accidentally left one personal TODO comment about using goto instead of ret. Switch to use goto to be consistent with the error handling in the function. Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Kalle Valo Acked-by: Jeff Johnson Link: https://patch.msgid.link/20241101151705.165987-2-kvalo@kernel.org Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/mac.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index d493ec812055f..fa48200f012e8 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -7034,8 +7034,7 @@ int ath12k_mac_vdev_create(struct ath12k *ar, struct ath12k_link_vif *arvif) ret = ath12k_wait_for_peer_delete_done(ar, arvif->vdev_id, arvif->bssid); if (ret) - /* KVALO: why not goto err? */ - return ret; + goto err_vdev_del; ar->num_peers--; } From 1ea0cdee6fb3a498e7413cb2a28a918464d33d48 Mon Sep 17 00:00:00 2001 From: Sriram R Date: Fri, 1 Nov 2024 17:16:59 +0200 Subject: [PATCH 0005/1450] wifi: ath12k: MLO vdev bringup changes Add changes to add the link vdevs dynamically whenever a channel is assigned from mac80211 for a link vdev. During vdev create, update ML address of the vdev to firmware using the new WMI parameter (WMI_TAG_MLO_VDEV_CREATE_PARAMS). During vdev start, notify the firmware that this link vdev is newly added and also indicate all its known partners so that the firmware can take necessary actions to internally update the partners on the new link being added. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.0.1-00029-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Sriram R Co-developed-by: Rameshkumar Sundaram Signed-off-by: Rameshkumar Sundaram Signed-off-by: Kalle Valo Acked-by: Jeff Johnson Link: https://patch.msgid.link/20241101151705.165987-3-kvalo@kernel.org Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/mac.c | 87 ++++++++++++++++++++++++++- drivers/net/wireless/ath/ath12k/wmi.c | 85 +++++++++++++++++++++++++- drivers/net/wireless/ath/ath12k/wmi.h | 63 +++++++++++++++++++ 3 files changed, 230 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index fa48200f012e8..1a97eafaa3d24 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -648,6 +648,18 @@ struct ath12k *ath12k_mac_get_ar_by_pdev_id(struct ath12k_base *ab, u32 pdev_id) return NULL; } +static bool ath12k_mac_is_ml_arvif(struct ath12k_link_vif *arvif) +{ + struct ath12k_vif *ahvif = arvif->ahvif; + + lockdep_assert_wiphy(ahvif->ah->hw->wiphy); + + if (ahvif->vif->valid_links & BIT(arvif->link_id)) + return true; + + return false; +} + static struct ath12k *ath12k_mac_get_ar_by_chan(struct ieee80211_hw *hw, struct ieee80211_channel *channel) { @@ -1498,7 +1510,8 @@ static int ath12k_mac_setup_bcn_tmpl_ema(struct ath12k_link_vif *arvif) tx_ahvif = ath12k_vif_to_ahvif(ahvif->vif->mbssid_tx_vif); tx_arvif = &tx_ahvif->deflink; beacons = ieee80211_beacon_get_template_ema_list(ath12k_ar_to_hw(tx_arvif->ar), - tx_ahvif->vif, 0); + tx_ahvif->vif, + tx_arvif->link_id); if (!beacons || !beacons->cnt) { ath12k_warn(arvif->ar->ab, "failed to get ema beacon templates from mac80211\n"); @@ -1563,7 +1576,7 @@ static int ath12k_mac_setup_bcn_tmpl(struct ath12k_link_vif *arvif) } bcn = ieee80211_beacon_get_template(ath12k_ar_to_hw(tx_arvif->ar), tx_ahvif->vif, - &offs, 0); + &offs, tx_arvif->link_id); if (!bcn) { ath12k_warn(ab, "failed to get beacon template from mac80211\n"); return -EPERM; @@ -1644,7 +1657,7 @@ static void ath12k_control_beaconing(struct ath12k_link_vif *arvif, ahvif->aid = 0; - ether_addr_copy(arvif->bssid, info->bssid); + ether_addr_copy(arvif->bssid, info->addr); params.vdev_id = arvif->vdev_id; params.aid = ahvif->aid; @@ -6658,6 +6671,8 @@ static int ath12k_mac_setup_vdev_create_arg(struct ath12k_link_vif *arvif, struct ath12k_vif *ahvif = arvif->ahvif; int ret; + lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy); + arg->if_id = arvif->vdev_id; arg->type = ahvif->vdev_type; arg->subtype = ahvif->vdev_subtype; @@ -6689,6 +6704,17 @@ static int ath12k_mac_setup_vdev_create_arg(struct ath12k_link_vif *arvif, } arg->if_stats_id = ath12k_mac_get_vdev_stats_id(arvif); + + if (ath12k_mac_is_ml_arvif(arvif)) { + if (hweight16(ahvif->vif->valid_links) > ATH12K_WMI_MLO_MAX_LINKS) { + ath12k_warn(ar->ab, "too many MLO links during setting up vdev: %d", + ahvif->vif->valid_links); + return -EINVAL; + } + + ether_addr_copy(arg->mld_addr, ahvif->vif->addr); + } + return 0; } @@ -7626,6 +7652,58 @@ ath12k_mac_check_down_grade_phy_mode(struct ath12k *ar, return down_mode; } +static void +ath12k_mac_mlo_get_vdev_args(struct ath12k_link_vif *arvif, + struct wmi_ml_arg *ml_arg) +{ + struct ath12k_vif *ahvif = arvif->ahvif; + struct wmi_ml_partner_info *partner_info; + struct ieee80211_bss_conf *link_conf; + struct ath12k_link_vif *arvif_p; + unsigned long links; + u8 link_id; + + lockdep_assert_wiphy(ahvif->ah->hw->wiphy); + + if (!ath12k_mac_is_ml_arvif(arvif)) + return; + + if (hweight16(ahvif->vif->valid_links) > ATH12K_WMI_MLO_MAX_LINKS) + return; + + ml_arg->enabled = true; + + /* Driver always add a new link via VDEV START, FW takes + * care of internally adding this link to existing + * link vdevs which are advertised as partners below + */ + ml_arg->link_add = true; + partner_info = ml_arg->partner_info; + + links = ahvif->links_map; + for_each_set_bit(link_id, &links, IEEE80211_MLD_MAX_NUM_LINKS) { + arvif_p = wiphy_dereference(ahvif->ah->hw->wiphy, ahvif->link[link_id]); + + if (WARN_ON(!arvif_p)) + continue; + + if (arvif == arvif_p) + continue; + + link_conf = wiphy_dereference(ahvif->ah->hw->wiphy, + ahvif->vif->link_conf[arvif_p->link_id]); + + if (!link_conf) + continue; + + partner_info->vdev_id = arvif_p->vdev_id; + partner_info->hw_link_id = arvif_p->ar->pdev->hw_link_id; + ether_addr_copy(partner_info->addr, link_conf->addr); + ml_arg->num_partner_links++; + partner_info++; + } +} + static int ath12k_mac_vdev_start_restart(struct ath12k_link_vif *arvif, struct ieee80211_chanctx_conf *ctx, @@ -7704,6 +7782,9 @@ ath12k_mac_vdev_start_restart(struct ath12k_link_vif *arvif, arg.passive |= !!(chandef->chan->flags & IEEE80211_CHAN_NO_IR); + if (!restart) + ath12k_mac_mlo_get_vdev_args(arvif, &arg.ml); + ath12k_dbg(ab, ATH12K_DBG_MAC, "mac vdev %d start center_freq %d phymode %s punct_bitmap 0x%x\n", arg.vdev_id, arg.freq, diff --git a/drivers/net/wireless/ath/ath12k/wmi.c b/drivers/net/wireless/ath/ath12k/wmi.c index dced2aa9ba1a3..e089b58bbea1b 100644 --- a/drivers/net/wireless/ath/ath12k/wmi.c +++ b/drivers/net/wireless/ath/ath12k/wmi.c @@ -821,6 +821,8 @@ int ath12k_wmi_vdev_create(struct ath12k *ar, u8 *macaddr, struct wmi_vdev_create_cmd *cmd; struct sk_buff *skb; struct ath12k_wmi_vdev_txrx_streams_params *txrx_streams; + bool is_ml_vdev = is_valid_ether_addr(args->mld_addr); + struct wmi_vdev_create_mlo_params *ml_params; struct wmi_tlv *tlv; int ret, len; void *ptr; @@ -830,7 +832,8 @@ int ath12k_wmi_vdev_create(struct ath12k *ar, u8 *macaddr, * both the bands. */ len = sizeof(*cmd) + TLV_HDR_SIZE + - (WMI_NUM_SUPPORTED_BAND_MAX * sizeof(*txrx_streams)); + (WMI_NUM_SUPPORTED_BAND_MAX * sizeof(*txrx_streams)) + + (is_ml_vdev ? TLV_HDR_SIZE + sizeof(*ml_params) : 0); skb = ath12k_wmi_alloc_skb(wmi->wmi_ab, len); if (!skb) @@ -879,6 +882,21 @@ int ath12k_wmi_vdev_create(struct ath12k *ar, u8 *macaddr, txrx_streams->supported_rx_streams = cpu_to_le32(args->chains[NL80211_BAND_5GHZ].rx); + ptr += WMI_NUM_SUPPORTED_BAND_MAX * sizeof(*txrx_streams); + + if (is_ml_vdev) { + tlv = ptr; + tlv->header = ath12k_wmi_tlv_hdr(WMI_TAG_ARRAY_STRUCT, + sizeof(*ml_params)); + ptr += TLV_HDR_SIZE; + ml_params = ptr; + + ml_params->tlv_header = + ath12k_wmi_tlv_cmd_hdr(WMI_TAG_MLO_VDEV_CREATE_PARAMS, + sizeof(*ml_params)); + ether_addr_copy(ml_params->mld_macaddr.addr, args->mld_addr); + } + ath12k_dbg(ar->ab, ATH12K_DBG_WMI, "WMI vdev create: id %d type %d subtype %d macaddr %pM pdevid %d\n", args->if_id, args->type, args->subtype, @@ -1020,19 +1038,27 @@ static void ath12k_wmi_put_wmi_channel(struct ath12k_wmi_channel_params *chan, int ath12k_wmi_vdev_start(struct ath12k *ar, struct wmi_vdev_start_req_arg *arg, bool restart) { + struct wmi_vdev_start_mlo_params *ml_params; + struct wmi_partner_link_info *partner_info; struct ath12k_wmi_pdev *wmi = ar->wmi; struct wmi_vdev_start_request_cmd *cmd; struct sk_buff *skb; struct ath12k_wmi_channel_params *chan; struct wmi_tlv *tlv; void *ptr; - int ret, len; + int ret, len, i, ml_arg_size = 0; if (WARN_ON(arg->ssid_len > sizeof(cmd->ssid.ssid))) return -EINVAL; len = sizeof(*cmd) + sizeof(*chan) + TLV_HDR_SIZE; + if (!restart && arg->ml.enabled) { + ml_arg_size = TLV_HDR_SIZE + sizeof(*ml_params) + + TLV_HDR_SIZE + (arg->ml.num_partner_links * + sizeof(*partner_info)); + len += ml_arg_size; + } skb = ath12k_wmi_alloc_skb(wmi->wmi_ab, len); if (!skb) return -ENOMEM; @@ -1085,6 +1111,61 @@ int ath12k_wmi_vdev_start(struct ath12k *ar, struct wmi_vdev_start_req_arg *arg, ptr += sizeof(*tlv); + if (ml_arg_size) { + tlv = ptr; + tlv->header = ath12k_wmi_tlv_hdr(WMI_TAG_ARRAY_STRUCT, + sizeof(*ml_params)); + ptr += TLV_HDR_SIZE; + + ml_params = ptr; + + ml_params->tlv_header = + ath12k_wmi_tlv_cmd_hdr(WMI_TAG_MLO_VDEV_START_PARAMS, + sizeof(*ml_params)); + + ml_params->flags = le32_encode_bits(arg->ml.enabled, + ATH12K_WMI_FLAG_MLO_ENABLED) | + le32_encode_bits(arg->ml.assoc_link, + ATH12K_WMI_FLAG_MLO_ASSOC_LINK) | + le32_encode_bits(arg->ml.mcast_link, + ATH12K_WMI_FLAG_MLO_MCAST_VDEV) | + le32_encode_bits(arg->ml.link_add, + ATH12K_WMI_FLAG_MLO_LINK_ADD); + + ath12k_dbg(ar->ab, ATH12K_DBG_WMI, "vdev %d start ml flags 0x%x\n", + arg->vdev_id, ml_params->flags); + + ptr += sizeof(*ml_params); + + tlv = ptr; + tlv->header = ath12k_wmi_tlv_hdr(WMI_TAG_ARRAY_STRUCT, + arg->ml.num_partner_links * + sizeof(*partner_info)); + ptr += TLV_HDR_SIZE; + + partner_info = ptr; + + for (i = 0; i < arg->ml.num_partner_links; i++) { + partner_info->tlv_header = + ath12k_wmi_tlv_cmd_hdr(WMI_TAG_MLO_PARTNER_LINK_PARAMS, + sizeof(*partner_info)); + partner_info->vdev_id = + cpu_to_le32(arg->ml.partner_info[i].vdev_id); + partner_info->hw_link_id = + cpu_to_le32(arg->ml.partner_info[i].hw_link_id); + ether_addr_copy(partner_info->vdev_addr.addr, + arg->ml.partner_info[i].addr); + + ath12k_dbg(ar->ab, ATH12K_DBG_WMI, "partner vdev %d hw_link_id %d macaddr%pM\n", + partner_info->vdev_id, partner_info->hw_link_id, + partner_info->vdev_addr.addr); + + partner_info++; + } + + ptr = partner_info; + } + ath12k_dbg(ar->ab, ATH12K_DBG_WMI, "vdev %s id 0x%x freq 0x%x mode 0x%x\n", restart ? "restart" : "start", arg->vdev_id, arg->freq, arg->mode); diff --git a/drivers/net/wireless/ath/ath12k/wmi.h b/drivers/net/wireless/ath/ath12k/wmi.h index 6f55dbdf629db..0ddd7ce973851 100644 --- a/drivers/net/wireless/ath/ath12k/wmi.h +++ b/drivers/net/wireless/ath/ath12k/wmi.h @@ -1929,6 +1929,19 @@ enum wmi_tlv_tag { WMI_TAG_REGULATORY_RULE_EXT_STRUCT = 0x3A9, WMI_TAG_REG_CHAN_LIST_CC_EXT_EVENT, WMI_TAG_EHT_RATE_SET = 0x3C4, + WMI_TAG_DCS_AWGN_INT_TYPE = 0x3C5, + WMI_TAG_MLO_TX_SEND_PARAMS, + WMI_TAG_MLO_PARTNER_LINK_PARAMS, + WMI_TAG_MLO_PARTNER_LINK_PARAMS_PEER_ASSOC, + WMI_TAG_MLO_SETUP_CMD = 0x3C9, + WMI_TAG_MLO_SETUP_COMPLETE_EVENT, + WMI_TAG_MLO_READY_CMD, + WMI_TAG_MLO_TEARDOWN_CMD, + WMI_TAG_MLO_TEARDOWN_COMPLETE, + WMI_TAG_MLO_PEER_ASSOC_PARAMS = 0x3D0, + WMI_TAG_MLO_PEER_CREATE_PARAMS = 0x3D5, + WMI_TAG_MLO_VDEV_START_PARAMS = 0x3D6, + WMI_TAG_MLO_VDEV_CREATE_PARAMS = 0x3D7, WMI_TAG_PDEV_SET_BIOS_SAR_TABLE_CMD = 0x3D8, WMI_TAG_PDEV_SET_BIOS_GEO_TABLE_CMD = 0x3D9, WMI_TAG_PDEV_SET_BIOS_INTERFACE_CMD = 0x3FB, @@ -2740,6 +2753,7 @@ struct ath12k_wmi_vdev_create_arg { u8 if_stats_id; u32 mbssid_flags; u32 mbssid_tx_vdev_id; + u8 mld_addr[ETH_ALEN]; }; #define ATH12K_MAX_VDEV_STATS_ID 0x30 @@ -2766,6 +2780,33 @@ struct ath12k_wmi_vdev_txrx_streams_params { __le32 supported_rx_streams; } __packed; +struct wmi_vdev_create_mlo_params { + __le32 tlv_header; + struct ath12k_wmi_mac_addr_params mld_macaddr; +} __packed; + +#define ATH12K_WMI_FLAG_MLO_ENABLED BIT(0) +#define ATH12K_WMI_FLAG_MLO_ASSOC_LINK BIT(1) +#define ATH12K_WMI_FLAG_MLO_PRIMARY_UMAC BIT(2) +#define ATH12K_WMI_FLAG_MLO_LOGICAL_LINK_IDX_VALID BIT(3) +#define ATH12K_WMI_FLAG_MLO_PEER_ID_VALID BIT(4) +#define ATH12K_WMI_FLAG_MLO_MCAST_VDEV BIT(5) +#define ATH12K_WMI_FLAG_MLO_EMLSR_SUPPORT BIT(6) +#define ATH12K_WMI_FLAG_MLO_FORCED_INACTIVE BIT(7) +#define ATH12K_WMI_FLAG_MLO_LINK_ADD BIT(8) + +struct wmi_vdev_start_mlo_params { + __le32 tlv_header; + __le32 flags; +} __packed; + +struct wmi_partner_link_info { + __le32 tlv_header; + __le32 vdev_id; + __le32 hw_link_id; + struct ath12k_wmi_mac_addr_params vdev_addr; +} __packed; + struct wmi_vdev_delete_cmd { __le32 tlv_header; __le32 vdev_id; @@ -2909,6 +2950,27 @@ enum wmi_phy_mode { MODE_MAX = 33, }; +#define ATH12K_WMI_MLO_MAX_LINKS 4 + +struct wmi_ml_partner_info { + u32 vdev_id; + u32 hw_link_id; + u8 addr[ETH_ALEN]; + bool assoc_link; + bool primary_umac; + bool logical_link_idx_valid; + u32 logical_link_idx; +}; + +struct wmi_ml_arg { + bool enabled; + bool assoc_link; + bool mcast_link; + bool link_add; + u8 num_partner_links; + struct wmi_ml_partner_info partner_info[ATH12K_WMI_MLO_MAX_LINKS]; +}; + struct wmi_vdev_start_req_arg { u32 vdev_id; u32 freq; @@ -2946,6 +3008,7 @@ struct wmi_vdev_start_req_arg { u32 mbssid_flags; u32 mbssid_tx_vdev_id; u32 punct_bitmap; + struct wmi_ml_arg ml; }; struct ath12k_wmi_peer_create_arg { From c8a98ed160e54ae629364a1efd5379bed839d633 Mon Sep 17 00:00:00 2001 From: Sriram R Date: Fri, 1 Nov 2024 17:17:00 +0200 Subject: [PATCH 0006/1450] wifi: ath12k: Refactor sta state machine Refactor ath12k_mac_op_sta_state(), with generic wrappers which can be used for both multi link stations and non-ML stations. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.0.1-00029-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Sriram R Signed-off-by: Harshitha Prem Signed-off-by: Kalle Valo Acked-by: Jeff Johnson Link: https://patch.msgid.link/20241101151705.165987-4-kvalo@kernel.org Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/core.h | 3 + drivers/net/wireless/ath/ath12k/mac.c | 343 +++++++++++++++++-------- 2 files changed, 244 insertions(+), 102 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/core.h b/drivers/net/wireless/ath/ath12k/core.h index 3bf31ee5b9fa4..23e27d1198595 100644 --- a/drivers/net/wireless/ath/ath12k/core.h +++ b/drivers/net/wireless/ath/ath12k/core.h @@ -469,6 +469,9 @@ struct ath12k_link_sta { struct ath12k_link_vif *arvif; struct ath12k_sta *ahsta; + /* link address similar to ieee80211_link_sta */ + u8 addr[ETH_ALEN]; + /* the following are protected by ar->data_lock */ u32 changed; /* IEEE80211_RC_* */ u32 bw; diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index 1a97eafaa3d24..73a80b55b229f 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -4505,10 +4505,10 @@ ath12k_mac_set_peer_vht_fixed_rate(struct ath12k_link_vif *arvif, return ret; } -static int ath12k_station_assoc(struct ath12k *ar, - struct ath12k_link_vif *arvif, - struct ath12k_link_sta *arsta, - bool reassoc) +static int ath12k_mac_station_assoc(struct ath12k *ar, + struct ath12k_link_vif *arvif, + struct ath12k_link_sta *arsta, + bool reassoc) { struct ieee80211_vif *vif = ath12k_ahvif_to_vif(arvif->ahvif); struct ieee80211_sta *sta = ath12k_ahsta_to_sta(arsta->ahsta); @@ -4595,28 +4595,19 @@ static int ath12k_station_assoc(struct ath12k *ar, return 0; } -static int ath12k_station_disassoc(struct ath12k *ar, - struct ath12k_link_vif *arvif, - struct ath12k_link_sta *arsta) +static int ath12k_mac_station_disassoc(struct ath12k *ar, + struct ath12k_link_vif *arvif, + struct ath12k_link_sta *arsta) { struct ieee80211_sta *sta = ath12k_ahsta_to_sta(arsta->ahsta); - int ret; lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy); if (!sta->wme) { arvif->num_legacy_stations--; - ret = ath12k_recalc_rtscts_prot(arvif); - if (ret) - return ret; + return ath12k_recalc_rtscts_prot(arvif); } - ret = ath12k_clear_peer_keys(arvif, sta->addr); - if (ret) { - ath12k_warn(ar->ab, "failed to clear all peer keys for vdev %i: %d\n", - arvif->vdev_id, ret); - return ret; - } return 0; } @@ -4812,6 +4803,147 @@ static void ath12k_mac_dec_num_stations(struct ath12k_link_vif *arvif, ar->num_stations--; } +static void ath12k_mac_station_post_remove(struct ath12k *ar, + struct ath12k_link_vif *arvif, + struct ath12k_link_sta *arsta) +{ + struct ieee80211_vif *vif = ath12k_ahvif_to_vif(arvif->ahvif); + struct ieee80211_sta *sta = ath12k_ahsta_to_sta(arsta->ahsta); + struct ath12k_sta *ahsta = arsta->ahsta; + struct ath12k_peer *peer; + + lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy); + + ath12k_mac_dec_num_stations(arvif, arsta); + + spin_lock_bh(&ar->ab->base_lock); + + peer = ath12k_peer_find(ar->ab, arvif->vdev_id, sta->addr); + if (peer && peer->sta == sta) { + ath12k_warn(ar->ab, "Found peer entry %pM n vdev %i after it was supposedly removed\n", + vif->addr, arvif->vdev_id); + peer->sta = NULL; + list_del(&peer->list); + kfree(peer); + ar->num_peers--; + } + + spin_unlock_bh(&ar->ab->base_lock); + + kfree(arsta->rx_stats); + arsta->rx_stats = NULL; + + if (arsta->link_id < IEEE80211_MLD_MAX_NUM_LINKS) { + ahsta->links_map &= ~(BIT(arsta->link_id)); + rcu_assign_pointer(ahsta->link[arsta->link_id], NULL); + synchronize_rcu(); + arsta->link_id = ATH12K_INVALID_LINK_ID; + arsta->ahsta = NULL; + } +} + +static int ath12k_mac_station_unauthorize(struct ath12k *ar, + struct ath12k_link_vif *arvif, + struct ath12k_link_sta *arsta) +{ + struct ath12k_peer *peer; + int ret; + + lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy); + + spin_lock_bh(&ar->ab->base_lock); + + peer = ath12k_peer_find(ar->ab, arvif->vdev_id, arsta->addr); + if (peer) + peer->is_authorized = false; + + spin_unlock_bh(&ar->ab->base_lock); + + /* Driver must clear the keys during the state change from + * IEEE80211_STA_AUTHORIZED to IEEE80211_STA_ASSOC, since after + * returning from here, mac80211 is going to delete the keys + * in __sta_info_destroy_part2(). This will ensure that the driver does + * not retain stale key references after mac80211 deletes the keys. + */ + ret = ath12k_clear_peer_keys(arvif, arsta->addr); + if (ret) { + ath12k_warn(ar->ab, "failed to clear all peer keys for vdev %i: %d\n", + arvif->vdev_id, ret); + return ret; + } + + return 0; +} + +static int ath12k_mac_station_authorize(struct ath12k *ar, + struct ath12k_link_vif *arvif, + struct ath12k_link_sta *arsta) +{ + struct ath12k_peer *peer; + struct ieee80211_vif *vif = ath12k_ahvif_to_vif(arvif->ahvif); + struct ieee80211_sta *sta = ath12k_ahsta_to_sta(arsta->ahsta); + int ret; + + lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy); + + spin_lock_bh(&ar->ab->base_lock); + + peer = ath12k_peer_find(ar->ab, arvif->vdev_id, sta->addr); + if (peer) + peer->is_authorized = true; + + spin_unlock_bh(&ar->ab->base_lock); + + if (vif->type == NL80211_IFTYPE_STATION && arvif->is_up) { + ret = ath12k_wmi_set_peer_param(ar, sta->addr, + arvif->vdev_id, + WMI_PEER_AUTHORIZE, + 1); + if (ret) { + ath12k_warn(ar->ab, "Unable to authorize peer %pM vdev %d: %d\n", + sta->addr, arvif->vdev_id, ret); + return ret; + } + } + + return 0; +} + +static int ath12k_mac_station_remove(struct ath12k *ar, + struct ath12k_link_vif *arvif, + struct ath12k_link_sta *arsta) +{ + struct ieee80211_sta *sta = ath12k_ahsta_to_sta(arsta->ahsta); + struct ath12k_vif *ahvif = arvif->ahvif; + int ret; + + lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy); + + wiphy_work_cancel(ar->ah->hw->wiphy, &arsta->update_wk); + + if (ahvif->vdev_type == WMI_VDEV_TYPE_STA) { + ath12k_bss_disassoc(ar, arvif); + ret = ath12k_mac_vdev_stop(arvif); + if (ret) + ath12k_warn(ar->ab, "failed to stop vdev %i: %d\n", + arvif->vdev_id, ret); + } + + ath12k_dp_peer_cleanup(ar, arvif->vdev_id, sta->addr); + + ret = ath12k_peer_delete(ar, arvif->vdev_id, sta->addr); + if (ret) + ath12k_warn(ar->ab, "Failed to delete peer: %pM for VDEV: %d\n", + sta->addr, arvif->vdev_id); + else + ath12k_dbg(ar->ab, ATH12K_DBG_MAC, "Removed peer: %pM for VDEV: %d\n", + sta->addr, arvif->vdev_id); + + ath12k_mac_station_post_remove(ar, arvif, arsta); + + return ret; +} + static int ath12k_mac_station_add(struct ath12k *ar, struct ath12k_link_vif *arvif, struct ath12k_link_sta *arsta) @@ -4919,31 +5051,37 @@ static u32 ath12k_mac_ieee80211_sta_bw_to_wmi(struct ath12k *ar, return bw; } -static int ath12k_mac_op_sta_state(struct ieee80211_hw *hw, - struct ieee80211_vif *vif, - struct ieee80211_sta *sta, - enum ieee80211_sta_state old_state, - enum ieee80211_sta_state new_state) +static int ath12k_mac_handle_link_sta_state(struct ieee80211_hw *hw, + struct ath12k_link_vif *arvif, + struct ath12k_link_sta *arsta, + enum ieee80211_sta_state old_state, + enum ieee80211_sta_state new_state) { - struct ath12k_vif *ahvif = ath12k_vif_to_ahvif(vif); - struct ath12k_sta *ahsta = ath12k_sta_to_ahsta(sta); - struct ath12k *ar; - struct ath12k_link_vif *arvif; - struct ath12k_link_sta *arsta; - struct ath12k_peer *peer; + struct ath12k *ar = arvif->ar; + struct ieee80211_vif *vif = ath12k_ahvif_to_vif(arvif->ahvif); + struct ieee80211_sta *sta = ath12k_ahsta_to_sta(arsta->ahsta); + struct ath12k_sta *ahsta = arsta->ahsta; int ret = 0; lockdep_assert_wiphy(hw->wiphy); - arvif = &ahvif->deflink; - arsta = &ahsta->deflink; + /* IEEE80211_STA_NONE -> IEEE80211_STA_NOTEXIST: Remove the station + * from driver + */ + if ((old_state == IEEE80211_STA_NONE && + new_state == IEEE80211_STA_NOTEXIST)) { + /* ML sta needs separate handling */ + if (sta->mlo) + return 0; - ar = ath12k_get_ar_by_vif(hw, vif); - if (!ar) { - WARN_ON_ONCE(1); - return -EINVAL; + ret = ath12k_mac_station_remove(ar, arvif, arsta); + if (ret) { + ath12k_warn(ar->ab, "Failed to remove station: %pM for VDEV: %d\n", + arsta->addr, arvif->vdev_id); + } } + /* IEEE80211_STA_NOTEXIST -> IEEE80211_STA_NONE: Add new station to driver */ if (old_state == IEEE80211_STA_NOTEXIST && new_state == IEEE80211_STA_NONE) { memset(arsta, 0, sizeof(*arsta)); @@ -4961,56 +5099,16 @@ static int ath12k_mac_op_sta_state(struct ieee80211_hw *hw, if (ret) ath12k_warn(ar->ab, "Failed to add station: %pM for VDEV: %d\n", sta->addr, arvif->vdev_id); - } else if ((old_state == IEEE80211_STA_NONE && - new_state == IEEE80211_STA_NOTEXIST)) { - wiphy_work_cancel(hw->wiphy, &arsta->update_wk); - if (ahvif->vdev_type == WMI_VDEV_TYPE_STA) { - ath12k_bss_disassoc(ar, arvif); - ret = ath12k_mac_vdev_stop(arvif); - if (ret) - ath12k_warn(ar->ab, "failed to stop vdev %i: %d\n", - arvif->vdev_id, ret); - } - ath12k_dp_peer_cleanup(ar, arvif->vdev_id, sta->addr); - - ret = ath12k_peer_delete(ar, arvif->vdev_id, sta->addr); - if (ret) - ath12k_warn(ar->ab, "Failed to delete peer: %pM for VDEV: %d\n", - sta->addr, arvif->vdev_id); - else - ath12k_dbg(ar->ab, ATH12K_DBG_MAC, "Removed peer: %pM for VDEV: %d\n", - sta->addr, arvif->vdev_id); - - ath12k_mac_dec_num_stations(arvif, arsta); - spin_lock_bh(&ar->ab->base_lock); - peer = ath12k_peer_find(ar->ab, arvif->vdev_id, sta->addr); - if (peer && peer->sta == sta) { - ath12k_warn(ar->ab, "Found peer entry %pM n vdev %i after it was supposedly removed\n", - vif->addr, arvif->vdev_id); - peer->sta = NULL; - list_del(&peer->list); - kfree(peer); - ar->num_peers--; - } - spin_unlock_bh(&ar->ab->base_lock); - - kfree(arsta->rx_stats); - arsta->rx_stats = NULL; - - if (arsta->link_id < IEEE80211_MLD_MAX_NUM_LINKS) { - rcu_assign_pointer(ahsta->link[arsta->link_id], NULL); - synchronize_rcu(); - ahsta->links_map &= ~(BIT(arsta->link_id)); - arsta->link_id = ATH12K_INVALID_LINK_ID; - arsta->ahsta = NULL; - } + /* IEEE80211_STA_AUTH -> IEEE80211_STA_ASSOC: Send station assoc command for + * peer associated to AP/Mesh/ADHOC vif type. + */ } else if (old_state == IEEE80211_STA_AUTH && new_state == IEEE80211_STA_ASSOC && (vif->type == NL80211_IFTYPE_AP || vif->type == NL80211_IFTYPE_MESH_POINT || vif->type == NL80211_IFTYPE_ADHOC)) { - ret = ath12k_station_assoc(ar, arvif, arsta, false); + ret = ath12k_mac_station_assoc(ar, arvif, arsta, false); if (ret) ath12k_warn(ar->ab, "Failed to associate station: %pM\n", sta->addr); @@ -5021,40 +5119,32 @@ static int ath12k_mac_op_sta_state(struct ieee80211_hw *hw, arsta->bw_prev = sta->deflink.bandwidth; spin_unlock_bh(&ar->data_lock); + + /* IEEE80211_STA_ASSOC -> IEEE80211_STA_AUTHORIZED: set peer status as + * authorized + */ } else if (old_state == IEEE80211_STA_ASSOC && new_state == IEEE80211_STA_AUTHORIZED) { - spin_lock_bh(&ar->ab->base_lock); - - peer = ath12k_peer_find(ar->ab, arvif->vdev_id, sta->addr); - if (peer) - peer->is_authorized = true; - - spin_unlock_bh(&ar->ab->base_lock); + ret = ath12k_mac_station_authorize(ar, arvif, arsta); + if (ret) + ath12k_warn(ar->ab, "Failed to authorize station: %pM\n", + sta->addr); - if (vif->type == NL80211_IFTYPE_STATION && arvif->is_up) { - ret = ath12k_wmi_set_peer_param(ar, sta->addr, - arvif->vdev_id, - WMI_PEER_AUTHORIZE, - 1); - if (ret) - ath12k_warn(ar->ab, "Unable to authorize peer %pM vdev %d: %d\n", - sta->addr, arvif->vdev_id, ret); - } + /* IEEE80211_STA_AUTHORIZED -> IEEE80211_STA_ASSOC: station may be in removal, + * deauthorize it. + */ } else if (old_state == IEEE80211_STA_AUTHORIZED && new_state == IEEE80211_STA_ASSOC) { - spin_lock_bh(&ar->ab->base_lock); - - peer = ath12k_peer_find(ar->ab, arvif->vdev_id, sta->addr); - if (peer) - peer->is_authorized = false; - - spin_unlock_bh(&ar->ab->base_lock); + ath12k_mac_station_unauthorize(ar, arvif, arsta); + /* IEEE80211_STA_ASSOC -> IEEE80211_STA_AUTH: disassoc peer connected to + * AP/mesh/ADHOC vif type. + */ } else if (old_state == IEEE80211_STA_ASSOC && new_state == IEEE80211_STA_AUTH && (vif->type == NL80211_IFTYPE_AP || vif->type == NL80211_IFTYPE_MESH_POINT || vif->type == NL80211_IFTYPE_ADHOC)) { - ret = ath12k_station_disassoc(ar, arvif, arsta); + ret = ath12k_mac_station_disassoc(ar, arvif, arsta); if (ret) ath12k_warn(ar->ab, "Failed to disassociate station: %pM\n", sta->addr); @@ -5063,6 +5153,55 @@ static int ath12k_mac_op_sta_state(struct ieee80211_hw *hw, return ret; } +static int ath12k_mac_op_sta_state(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_sta *sta, + enum ieee80211_sta_state old_state, + enum ieee80211_sta_state new_state) +{ + struct ath12k_vif *ahvif = ath12k_vif_to_ahvif(vif); + struct ath12k_sta *ahsta = ath12k_sta_to_ahsta(sta); + struct ath12k_link_vif *arvif; + struct ath12k_link_sta *arsta; + int ret; + u8 link_id = 0; + + lockdep_assert_wiphy(hw->wiphy); + + if (ieee80211_vif_is_mld(vif) && sta->valid_links) { + WARN_ON(!sta->mlo && hweight16(sta->valid_links) != 1); + link_id = ffs(sta->valid_links) - 1; + } + + /* Handle for non-ML station */ + if (!sta->mlo) { + arvif = wiphy_dereference(hw->wiphy, ahvif->link[link_id]); + arsta = &ahsta->deflink; + arsta->ahsta = ahsta; + + if (WARN_ON(!arvif || !arsta)) { + ret = -EINVAL; + goto exit; + } + + /* vdev might be in deleted */ + if (WARN_ON(!arvif->ar)) { + ret = -EINVAL; + goto exit; + } + + ret = ath12k_mac_handle_link_sta_state(hw, arvif, arsta, + old_state, new_state); + if (ret) + goto exit; + } + + ret = 0; + +exit: + return ret; +} + static int ath12k_mac_op_sta_set_txpwr(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta) From a2189d2b8005cd9f3a440512af087eb9b62c103e Mon Sep 17 00:00:00 2001 From: Kalle Valo Date: Fri, 1 Nov 2024 17:17:01 +0200 Subject: [PATCH 0007/1450] wifi: ath12k: introduce ath12k_hw_warn() In the following patch we need to use ath12k_warn() but don't easily have access to struct ath12k_base (ab) but do have access to struct ath12k_hw (ah). So add a new warning helper ath12_hw_warn() which takes the latter but the log output is still identical but uses the struct device pointer stored to struct ath12k_hw. Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Kalle Valo Acked-by: Jeff Johnson Link: https://patch.msgid.link/20241101151705.165987-5-kvalo@kernel.org Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/core.h | 2 ++ drivers/net/wireless/ath/ath12k/debug.c | 6 +++--- drivers/net/wireless/ath/ath12k/debug.h | 5 ++++- drivers/net/wireless/ath/ath12k/mac.c | 2 ++ 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/core.h b/drivers/net/wireless/ath/ath12k/core.h index 23e27d1198595..e084e231f753c 100644 --- a/drivers/net/wireless/ath/ath12k/core.h +++ b/drivers/net/wireless/ath/ath12k/core.h @@ -691,6 +691,8 @@ struct ath12k { struct ath12k_hw { struct ieee80211_hw *hw; + struct device *dev; + /* Protect the write operation of the hardware state ath12k_hw::state * between hardware start<=>reconfigure<=>stop transitions. */ diff --git a/drivers/net/wireless/ath/ath12k/debug.c b/drivers/net/wireless/ath/ath12k/debug.c index fe5a732ba9ec9..ff6eaeafa092c 100644 --- a/drivers/net/wireless/ath/ath12k/debug.c +++ b/drivers/net/wireless/ath/ath12k/debug.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: BSD-3-Clause-Clear /* * Copyright (c) 2018-2021 The Linux Foundation. All rights reserved. - * Copyright (c) 2021-2023 Qualcomm Innovation Center, Inc. All rights reserved. + * Copyright (c) 2021-2024 Qualcomm Innovation Center, Inc. All rights reserved. */ #include @@ -36,7 +36,7 @@ void ath12k_err(struct ath12k_base *ab, const char *fmt, ...) va_end(args); } -void ath12k_warn(struct ath12k_base *ab, const char *fmt, ...) +void __ath12k_warn(struct device *dev, const char *fmt, ...) { struct va_format vaf = { .fmt = fmt, @@ -45,7 +45,7 @@ void ath12k_warn(struct ath12k_base *ab, const char *fmt, ...) va_start(args, fmt); vaf.va = &args; - dev_warn_ratelimited(ab->dev, "%pV", &vaf); + dev_warn_ratelimited(dev, "%pV", &vaf); /* TODO: Trace the log */ va_end(args); } diff --git a/drivers/net/wireless/ath/ath12k/debug.h b/drivers/net/wireless/ath/ath12k/debug.h index f7005917362c6..90e801136bc65 100644 --- a/drivers/net/wireless/ath/ath12k/debug.h +++ b/drivers/net/wireless/ath/ath12k/debug.h @@ -31,7 +31,10 @@ enum ath12k_debug_mask { __printf(2, 3) void ath12k_info(struct ath12k_base *ab, const char *fmt, ...); __printf(2, 3) void ath12k_err(struct ath12k_base *ab, const char *fmt, ...); -__printf(2, 3) void ath12k_warn(struct ath12k_base *ab, const char *fmt, ...); +__printf(2, 3) void __ath12k_warn(struct device *dev, const char *fmt, ...); + +#define ath12k_warn(ab, fmt, ...) __ath12k_warn((ab)->dev, fmt, ##__VA_ARGS__) +#define ath12k_hw_warn(ah, fmt, ...) __ath12k_warn((ah)->dev, fmt, ##__VA_ARGS__) extern unsigned int ath12k_debug_mask; diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index 73a80b55b229f..5b8d4aa0eefdc 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -10179,6 +10179,8 @@ int ath12k_mac_allocate(struct ath12k_base *ab) goto err; } + ah->dev = ab->dev; + ab->ah[i] = ah; } From 7fd8b4cbde65bf65d32e1a6615ae8160cf305ef8 Mon Sep 17 00:00:00 2001 From: Sriram R Date: Fri, 1 Nov 2024 17:17:02 +0200 Subject: [PATCH 0008/1450] wifi: ath12k: Add helpers for multi link peer creation and deletion Add helper functions for multi link peer addition and deletion. And add address validation to ensure we are not creating link peers (belonging to different clients) with same MLD address. To aid in this validation for faster lookup, add a new list of ML peers to struct ath12k_hw::ml_peers and use the same for parsing for the above address validation use cases. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.0.1-00029-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Sriram R Signed-off-by: Harshitha Prem Signed-off-by: Kalle Valo Acked-by: Jeff Johnson Link: https://patch.msgid.link/20241101151705.165987-6-kvalo@kernel.org Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/core.h | 13 ++++ drivers/net/wireless/ath/ath12k/mac.c | 2 + drivers/net/wireless/ath/ath12k/peer.c | 98 ++++++++++++++++++++++++++ drivers/net/wireless/ath/ath12k/peer.h | 8 +++ 4 files changed, 121 insertions(+) diff --git a/drivers/net/wireless/ath/ath12k/core.h b/drivers/net/wireless/ath/ath12k/core.h index e084e231f753c..7324dae3fcb88 100644 --- a/drivers/net/wireless/ath/ath12k/core.h +++ b/drivers/net/wireless/ath/ath12k/core.h @@ -63,6 +63,13 @@ #define ATH12K_RECONFIGURE_TIMEOUT_HZ (10 * HZ) #define ATH12K_RECOVER_START_TIMEOUT_HZ (20 * HZ) +#define ATH12K_MAX_SOCS 3 +#define ATH12K_INVALID_GROUP_ID 0xFF +#define ATH12K_INVALID_DEVICE_ID 0xFF + +#define ATH12K_MAX_MLO_PEERS 256 +#define ATH12K_MLO_PEER_ID_INVALID 0xFFFF + enum ath12k_bdf_search { ATH12K_BDF_SEARCH_DEFAULT, ATH12K_BDF_SEARCH_BUS_AND_BOARD, @@ -496,6 +503,7 @@ struct ath12k_sta { struct ath12k_link_sta __rcu *link[IEEE80211_MLD_MAX_NUM_LINKS]; /* indicates bitmap of link sta created in FW */ u16 links_map; + u16 ml_peer_id; }; #define ATH12K_MIN_5G_FREQ 4150 @@ -703,6 +711,11 @@ struct ath12k_hw { u8 num_radio; + DECLARE_BITMAP(free_ml_peer_id_map, ATH12K_MAX_MLO_PEERS); + + /* protected by wiphy_lock() */ + struct list_head ml_peers; + /* Keep last */ struct ath12k radio[] __aligned(sizeof(void *)); }; diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index 5b8d4aa0eefdc..61d05ffa364a5 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -5091,6 +5091,7 @@ static int ath12k_mac_handle_link_sta_state(struct ieee80211_hw *hw, ahsta->links_map = BIT(arsta->link_id); arsta->ahsta = ahsta; arsta->arvif = arvif; + ether_addr_copy(arsta->addr, sta->addr); wiphy_work_init(&arsta->update_wk, ath12k_sta_rc_update_wk); synchronize_rcu(); @@ -10110,6 +10111,7 @@ static struct ath12k_hw *ath12k_mac_hw_allocate(struct ath12k_base *ab, ah->num_radio = num_pdev_map; mutex_init(&ah->hw_mutex); + INIT_LIST_HEAD(&ah->ml_peers); for (i = 0; i < num_pdev_map; i++) { ab = pdev_map[i].ab; diff --git a/drivers/net/wireless/ath/ath12k/peer.c b/drivers/net/wireless/ath/ath12k/peer.c index 7a62665b8af93..2ad19baf0664a 100644 --- a/drivers/net/wireless/ath/ath12k/peer.c +++ b/drivers/net/wireless/ath/ath12k/peer.c @@ -8,6 +8,22 @@ #include "peer.h" #include "debug.h" +static struct ath12k_ml_peer *ath12k_peer_ml_find(struct ath12k_hw *ah, const u8 *addr) +{ + struct ath12k_ml_peer *ml_peer; + + lockdep_assert_wiphy(ah->hw->wiphy); + + list_for_each_entry(ml_peer, &ah->ml_peers, list) { + if (!ether_addr_equal(ml_peer->addr, addr)) + continue; + + return ml_peer; + } + + return NULL; +} + struct ath12k_peer *ath12k_peer_find(struct ath12k_base *ab, int vdev_id, const u8 *addr) { @@ -341,3 +357,85 @@ int ath12k_peer_create(struct ath12k *ar, struct ath12k_link_vif *arvif, return 0; } + +static u16 ath12k_peer_ml_alloc(struct ath12k_hw *ah) +{ + u16 ml_peer_id; + + lockdep_assert_wiphy(ah->hw->wiphy); + + for (ml_peer_id = 0; ml_peer_id < ATH12K_MAX_MLO_PEERS; ml_peer_id++) { + if (test_bit(ml_peer_id, ah->free_ml_peer_id_map)) + continue; + + set_bit(ml_peer_id, ah->free_ml_peer_id_map); + break; + } + + if (ml_peer_id == ATH12K_MAX_MLO_PEERS) + ml_peer_id = ATH12K_MLO_PEER_ID_INVALID; + + return ml_peer_id; +} + +int ath12k_peer_ml_create(struct ath12k_hw *ah, struct ieee80211_sta *sta) +{ + struct ath12k_sta *ahsta = ath12k_sta_to_ahsta(sta); + struct ath12k_ml_peer *ml_peer; + + lockdep_assert_wiphy(ah->hw->wiphy); + + if (!sta->mlo) + return -EINVAL; + + ml_peer = ath12k_peer_ml_find(ah, sta->addr); + if (ml_peer) { + ath12k_hw_warn(ah, "ML peer %d exists already, unable to add new entry for %pM", + ml_peer->id, sta->addr); + return -EEXIST; + } + + ml_peer = kzalloc(sizeof(*ml_peer), GFP_ATOMIC); + if (!ml_peer) + return -ENOMEM; + + ahsta->ml_peer_id = ath12k_peer_ml_alloc(ah); + + if (ahsta->ml_peer_id == ATH12K_MLO_PEER_ID_INVALID) { + ath12k_hw_warn(ah, "unable to allocate ML peer id for sta %pM", + sta->addr); + kfree(ml_peer); + return -ENOMEM; + } + + ether_addr_copy(ml_peer->addr, sta->addr); + ml_peer->id = ahsta->ml_peer_id; + list_add(&ml_peer->list, &ah->ml_peers); + + return 0; +} + +int ath12k_peer_ml_delete(struct ath12k_hw *ah, struct ieee80211_sta *sta) +{ + struct ath12k_sta *ahsta = ath12k_sta_to_ahsta(sta); + struct ath12k_ml_peer *ml_peer; + + lockdep_assert_wiphy(ah->hw->wiphy); + + if (!sta->mlo) + return -EINVAL; + + clear_bit(ahsta->ml_peer_id, ah->free_ml_peer_id_map); + ahsta->ml_peer_id = ATH12K_MLO_PEER_ID_INVALID; + + ml_peer = ath12k_peer_ml_find(ah, sta->addr); + if (!ml_peer) { + ath12k_hw_warn(ah, "ML peer for %pM not found", sta->addr); + return -EINVAL; + } + + list_del(&ml_peer->list); + kfree(ml_peer); + + return 0; +} diff --git a/drivers/net/wireless/ath/ath12k/peer.h b/drivers/net/wireless/ath/ath12k/peer.h index b955f0cdf5989..085246ca938d9 100644 --- a/drivers/net/wireless/ath/ath12k/peer.h +++ b/drivers/net/wireless/ath/ath12k/peer.h @@ -49,6 +49,12 @@ struct ath12k_peer { bool dp_setup_done; }; +struct ath12k_ml_peer { + struct list_head list; + u8 addr[ETH_ALEN]; + u16 id; +}; + void ath12k_peer_unmap_event(struct ath12k_base *ab, u16 peer_id); void ath12k_peer_map_event(struct ath12k_base *ab, u8 vdev_id, u16 peer_id, u8 *mac_addr, u16 ast_hash, u16 hw_peer_id); @@ -66,5 +72,7 @@ int ath12k_wait_for_peer_delete_done(struct ath12k *ar, u32 vdev_id, const u8 *addr); bool ath12k_peer_exist_by_vdev_id(struct ath12k_base *ab, int vdev_id); struct ath12k_peer *ath12k_peer_find_by_ast(struct ath12k_base *ab, int ast_hash); +int ath12k_peer_ml_create(struct ath12k_hw *ah, struct ieee80211_sta *sta); +int ath12k_peer_ml_delete(struct ath12k_hw *ah, struct ieee80211_sta *sta); #endif /* _PEER_H_ */ From 0660e1e2ed5ff493f1e383a32d28db2b7d8490f7 Mon Sep 17 00:00:00 2001 From: Sriram R Date: Fri, 1 Nov 2024 17:17:03 +0200 Subject: [PATCH 0009/1450] wifi: ath12k: add multi-link flag in peer create command Driver should indicate to firmware whether a peer is multi-link or not in peer create command using multi-link flag. Add changes to support WMI_TAG_MLO_PEER_CREATE_PARAMS in WMI_PEER_CREATE_CMDID. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.0.1-00029-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Sriram R Signed-off-by: Harshitha Prem Signed-off-by: Kalle Valo Acked-by: Jeff Johnson Link: https://patch.msgid.link/20241101151705.165987-7-kvalo@kernel.org Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/mac.c | 5 +++-- drivers/net/wireless/ath/ath12k/wmi.c | 27 +++++++++++++++++++++++---- drivers/net/wireless/ath/ath12k/wmi.h | 6 ++++++ 3 files changed, 32 insertions(+), 6 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index 61d05ffa364a5..076e4da4875f8 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -4969,8 +4969,9 @@ static int ath12k_mac_station_add(struct ath12k *ar, } peer_param.vdev_id = arvif->vdev_id; - peer_param.peer_addr = sta->addr; + peer_param.peer_addr = arsta->addr; peer_param.peer_type = WMI_PEER_TYPE_DEFAULT; + peer_param.ml_enabled = sta->mlo; ret = ath12k_peer_create(ar, arvif, sta, &peer_param); if (ret) { @@ -7005,7 +7006,7 @@ int ath12k_mac_vdev_create(struct ath12k *ar, struct ath12k_link_vif *arvif) struct ath12k_vif *ahvif = arvif->ahvif; struct ieee80211_vif *vif = ath12k_ahvif_to_vif(ahvif); struct ath12k_wmi_vdev_create_arg vdev_arg = {0}; - struct ath12k_wmi_peer_create_arg peer_param; + struct ath12k_wmi_peer_create_arg peer_param = {0}; struct ieee80211_bss_conf *link_conf; u32 param_id, param_value; u16 nss; diff --git a/drivers/net/wireless/ath/ath12k/wmi.c b/drivers/net/wireless/ath/ath12k/wmi.c index e089b58bbea1b..0583d832fac7c 100644 --- a/drivers/net/wireless/ath/ath12k/wmi.c +++ b/drivers/net/wireless/ath/ath12k/wmi.c @@ -1230,9 +1230,14 @@ int ath12k_wmi_send_peer_create_cmd(struct ath12k *ar, struct ath12k_wmi_pdev *wmi = ar->wmi; struct wmi_peer_create_cmd *cmd; struct sk_buff *skb; - int ret; + int ret, len; + struct wmi_peer_create_mlo_params *ml_param; + void *ptr; + struct wmi_tlv *tlv; - skb = ath12k_wmi_alloc_skb(wmi->wmi_ab, sizeof(*cmd)); + len = sizeof(*cmd) + TLV_HDR_SIZE + sizeof(*ml_param); + + skb = ath12k_wmi_alloc_skb(wmi->wmi_ab, len); if (!skb) return -ENOMEM; @@ -1244,9 +1249,23 @@ int ath12k_wmi_send_peer_create_cmd(struct ath12k *ar, cmd->peer_type = cpu_to_le32(arg->peer_type); cmd->vdev_id = cpu_to_le32(arg->vdev_id); + ptr = skb->data + sizeof(*cmd); + tlv = ptr; + tlv->header = ath12k_wmi_tlv_hdr(WMI_TAG_ARRAY_STRUCT, + sizeof(*ml_param)); + ptr += TLV_HDR_SIZE; + ml_param = ptr; + ml_param->tlv_header = + ath12k_wmi_tlv_cmd_hdr(WMI_TAG_MLO_PEER_CREATE_PARAMS, + sizeof(*ml_param)); + if (arg->ml_enabled) + ml_param->flags = cpu_to_le32(ATH12K_WMI_FLAG_MLO_ENABLED); + + ptr += sizeof(*ml_param); + ath12k_dbg(ar->ab, ATH12K_DBG_WMI, - "WMI peer create vdev_id %d peer_addr %pM\n", - arg->vdev_id, arg->peer_addr); + "WMI peer create vdev_id %d peer_addr %pM ml_flags 0x%x\n", + arg->vdev_id, arg->peer_addr, ml_param->flags); ret = ath12k_wmi_cmd_send(wmi, skb, WMI_PEER_CREATE_CMDID); if (ret) { diff --git a/drivers/net/wireless/ath/ath12k/wmi.h b/drivers/net/wireless/ath/ath12k/wmi.h index 0ddd7ce973851..2378d94b24098 100644 --- a/drivers/net/wireless/ath/ath12k/wmi.h +++ b/drivers/net/wireless/ath/ath12k/wmi.h @@ -3015,6 +3015,12 @@ struct ath12k_wmi_peer_create_arg { const u8 *peer_addr; u32 peer_type; u32 vdev_id; + bool ml_enabled; +}; + +struct wmi_peer_create_mlo_params { + __le32 tlv_header; + __le32 flags; }; struct ath12k_wmi_pdev_set_regdomain_arg { From c20dbc8c68b38fe702e9fbc4748aa117194f8963 Mon Sep 17 00:00:00 2001 From: Sriram R Date: Fri, 1 Nov 2024 17:17:04 +0200 Subject: [PATCH 0010/1450] wifi: ath12k: add helper to find multi-link station Multi-link stations are identified in driver using the multi-link peer id and they have ATH12K_PEER_ML_ID_VALID bit set in the id. Add a helper to find multi-link station using the multi-link peer id. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.0.1-00029-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Sriram R Signed-off-by: Harshitha Prem Signed-off-by: Kalle Valo Acked-by: Jeff Johnson Link: https://patch.msgid.link/20241101151705.165987-8-kvalo@kernel.org Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/peer.c | 17 +++++++++++++++++ drivers/net/wireless/ath/ath12k/peer.h | 4 ++++ 2 files changed, 21 insertions(+) diff --git a/drivers/net/wireless/ath/ath12k/peer.c b/drivers/net/wireless/ath/ath12k/peer.c index 2ad19baf0664a..0e86847edd6e1 100644 --- a/drivers/net/wireless/ath/ath12k/peer.c +++ b/drivers/net/wireless/ath/ath12k/peer.c @@ -79,6 +79,20 @@ struct ath12k_peer *ath12k_peer_find_by_addr(struct ath12k_base *ab, return NULL; } +static struct ath12k_peer *ath12k_peer_find_by_ml_id(struct ath12k_base *ab, + int ml_peer_id) +{ + struct ath12k_peer *peer; + + lockdep_assert_held(&ab->base_lock); + + list_for_each_entry(peer, &ab->peers, list) + if (ml_peer_id == peer->ml_id) + return peer; + + return NULL; +} + struct ath12k_peer *ath12k_peer_find_by_id(struct ath12k_base *ab, int peer_id) { @@ -86,6 +100,9 @@ struct ath12k_peer *ath12k_peer_find_by_id(struct ath12k_base *ab, lockdep_assert_held(&ab->base_lock); + if (peer_id & ATH12K_PEER_ML_ID_VALID) + return ath12k_peer_find_by_ml_id(ab, peer_id); + list_for_each_entry(peer, &ab->peers, list) if (peer_id == peer->peer_id) return peer; diff --git a/drivers/net/wireless/ath/ath12k/peer.h b/drivers/net/wireless/ath/ath12k/peer.h index 085246ca938d9..c28aca5d88a04 100644 --- a/drivers/net/wireless/ath/ath12k/peer.h +++ b/drivers/net/wireless/ath/ath12k/peer.h @@ -19,6 +19,8 @@ struct ppdu_user_delayba { u32 resp_rate_flags; }; +#define ATH12K_PEER_ML_ID_VALID BIT(13) + struct ath12k_peer { struct list_head list; struct ieee80211_sta *sta; @@ -47,6 +49,8 @@ struct ath12k_peer { /* protected by ab->data_lock */ bool dp_setup_done; + + u16 ml_id; }; struct ath12k_ml_peer { From 507f8e730100822b75290bbf96135f0e789da9cc Mon Sep 17 00:00:00 2001 From: Sriram R Date: Fri, 1 Nov 2024 17:17:05 +0200 Subject: [PATCH 0011/1450] wifi: ath12k: Add MLO peer assoc command support Add changes to send MLO peer assoc command with partner link details and primary umac details. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.0.1-00029-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Sriram R Signed-off-by: Harshitha Prem Signed-off-by: Kalle Valo Acked-by: Jeff Johnson Link: https://patch.msgid.link/20241101151705.165987-9-kvalo@kernel.org Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/core.h | 7 +++ drivers/net/wireless/ath/ath12k/mac.c | 62 ++++++++++++++++++++ drivers/net/wireless/ath/ath12k/wmi.c | 79 ++++++++++++++++++++++++-- drivers/net/wireless/ath/ath12k/wmi.h | 46 +++++++++++++++ 4 files changed, 188 insertions(+), 6 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/core.h b/drivers/net/wireless/ath/ath12k/core.h index 7324dae3fcb88..8dbdf6818f58c 100644 --- a/drivers/net/wireless/ath/ath12k/core.h +++ b/drivers/net/wireless/ath/ath12k/core.h @@ -495,9 +495,16 @@ struct ath12k_link_sta { struct ath12k_rx_peer_stats *rx_stats; struct ath12k_wbm_tx_stats *wbm_tx_stats; u32 bw_prev; + + /* For now the assoc link will be considered primary */ + bool is_assoc_link; + + /* for firmware use only */ + u8 link_idx; }; struct ath12k_sta { + struct ath12k_vif *ahvif; enum hal_pn_type pn_type; struct ath12k_link_sta deflink; struct ath12k_link_sta __rcu *link[IEEE80211_MLD_MAX_NUM_LINKS]; diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index 076e4da4875f8..ad27a2552a2c9 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -2859,6 +2859,67 @@ static void ath12k_peer_assoc_h_eht(struct ath12k *ar, arg->punct_bitmap = ~arvif->punct_bitmap; } +static void ath12k_peer_assoc_h_mlo(struct ath12k_link_sta *arsta, + struct ath12k_wmi_peer_assoc_arg *arg) +{ + struct ieee80211_sta *sta = ath12k_ahsta_to_sta(arsta->ahsta); + struct peer_assoc_mlo_params *ml = &arg->ml; + struct ath12k_sta *ahsta = arsta->ahsta; + struct ath12k_link_sta *arsta_p; + struct ath12k_link_vif *arvif; + unsigned long links; + u8 link_id; + int i; + + if (!sta->mlo || ahsta->ml_peer_id == ATH12K_MLO_PEER_ID_INVALID) + return; + + ml->enabled = true; + ml->assoc_link = arsta->is_assoc_link; + + /* For now considering the primary umac based on assoc link */ + ml->primary_umac = arsta->is_assoc_link; + ml->peer_id_valid = true; + ml->logical_link_idx_valid = true; + + ether_addr_copy(ml->mld_addr, sta->addr); + ml->logical_link_idx = arsta->link_idx; + ml->ml_peer_id = ahsta->ml_peer_id; + ml->ieee_link_id = arsta->link_id; + ml->num_partner_links = 0; + links = ahsta->links_map; + + rcu_read_lock(); + + i = 0; + + for_each_set_bit(link_id, &links, IEEE80211_MLD_MAX_NUM_LINKS) { + if (i >= ATH12K_WMI_MLO_MAX_LINKS) + break; + + arsta_p = rcu_dereference(ahsta->link[link_id]); + arvif = rcu_dereference(ahsta->ahvif->link[link_id]); + + if (arsta_p == arsta) + continue; + + if (!arvif->is_started) + continue; + + ml->partner_info[i].vdev_id = arvif->vdev_id; + ml->partner_info[i].hw_link_id = arvif->ar->pdev->hw_link_id; + ml->partner_info[i].assoc_link = arsta_p->is_assoc_link; + ml->partner_info[i].primary_umac = arsta_p->is_assoc_link; + ml->partner_info[i].logical_link_idx_valid = true; + ml->partner_info[i].logical_link_idx = arsta_p->link_idx; + ml->num_partner_links++; + + i++; + } + + rcu_read_unlock(); +} + static void ath12k_peer_assoc_prepare(struct ath12k *ar, struct ath12k_link_vif *arvif, struct ath12k_link_sta *arsta, @@ -2883,6 +2944,7 @@ static void ath12k_peer_assoc_prepare(struct ath12k *ar, ath12k_peer_assoc_h_qos(ar, arvif, arsta, arg); ath12k_peer_assoc_h_phymode(ar, arvif, arsta, arg); ath12k_peer_assoc_h_smps(arsta, arg); + ath12k_peer_assoc_h_mlo(arsta, arg); /* TODO: amsdu_disable req? */ } diff --git a/drivers/net/wireless/ath/ath12k/wmi.c b/drivers/net/wireless/ath/ath12k/wmi.c index 0583d832fac7c..50ed7e72f178f 100644 --- a/drivers/net/wireless/ath/ath12k/wmi.c +++ b/drivers/net/wireless/ath/ath12k/wmi.c @@ -2101,12 +2101,15 @@ int ath12k_wmi_send_peer_assoc_cmd(struct ath12k *ar, struct ath12k_wmi_vht_rate_set_params *mcs; struct ath12k_wmi_he_rate_set_params *he_mcs; struct ath12k_wmi_eht_rate_set_params *eht_mcs; + struct wmi_peer_assoc_mlo_params *ml_params; + struct wmi_peer_assoc_mlo_partner_info_params *partner_info; struct sk_buff *skb; struct wmi_tlv *tlv; void *ptr; u32 peer_legacy_rates_align; u32 peer_ht_rates_align; int i, ret, len; + __le32 v; peer_legacy_rates_align = roundup(arg->peer_legacy_rates.num_rates, sizeof(u32)); @@ -2118,8 +2121,13 @@ int ath12k_wmi_send_peer_assoc_cmd(struct ath12k *ar, TLV_HDR_SIZE + (peer_ht_rates_align * sizeof(u8)) + sizeof(*mcs) + TLV_HDR_SIZE + (sizeof(*he_mcs) * arg->peer_he_mcs_count) + - TLV_HDR_SIZE + (sizeof(*eht_mcs) * arg->peer_eht_mcs_count) + - TLV_HDR_SIZE + TLV_HDR_SIZE; + TLV_HDR_SIZE + (sizeof(*eht_mcs) * arg->peer_eht_mcs_count); + + if (arg->ml.enabled) + len += TLV_HDR_SIZE + sizeof(*ml_params) + + TLV_HDR_SIZE + (arg->ml.num_partner_links * sizeof(*partner_info)); + else + len += (2 * TLV_HDR_SIZE); skb = ath12k_wmi_alloc_skb(wmi->wmi_ab, len); if (!skb) @@ -2243,12 +2251,38 @@ int ath12k_wmi_send_peer_assoc_cmd(struct ath12k *ar, ptr += sizeof(*he_mcs); } - /* MLO header tag with 0 length */ - len = 0; tlv = ptr; + len = arg->ml.enabled ? sizeof(*ml_params) : 0; tlv->header = ath12k_wmi_tlv_hdr(WMI_TAG_ARRAY_STRUCT, len); ptr += TLV_HDR_SIZE; + if (!len) + goto skip_ml_params; + + ml_params = ptr; + ml_params->tlv_header = ath12k_wmi_tlv_cmd_hdr(WMI_TAG_MLO_PEER_ASSOC_PARAMS, + len); + ml_params->flags = cpu_to_le32(ATH12K_WMI_FLAG_MLO_ENABLED); + + if (arg->ml.assoc_link) + ml_params->flags |= cpu_to_le32(ATH12K_WMI_FLAG_MLO_ASSOC_LINK); + + if (arg->ml.primary_umac) + ml_params->flags |= cpu_to_le32(ATH12K_WMI_FLAG_MLO_PRIMARY_UMAC); + + if (arg->ml.logical_link_idx_valid) + ml_params->flags |= + cpu_to_le32(ATH12K_WMI_FLAG_MLO_LOGICAL_LINK_IDX_VALID); + + if (arg->ml.peer_id_valid) + ml_params->flags |= cpu_to_le32(ATH12K_WMI_FLAG_MLO_PEER_ID_VALID); + ether_addr_copy(ml_params->mld_addr.addr, arg->ml.mld_addr); + ml_params->logical_link_idx = cpu_to_le32(arg->ml.logical_link_idx); + ml_params->ml_peer_id = cpu_to_le32(arg->ml.ml_peer_id); + ml_params->ieee_link_id = cpu_to_le32(arg->ml.ieee_link_id); + ptr += sizeof(*ml_params); + +skip_ml_params: /* Loop through the EHT rate set */ len = arg->peer_eht_mcs_count * sizeof(*eht_mcs); tlv = ptr; @@ -2265,12 +2299,45 @@ int ath12k_wmi_send_peer_assoc_cmd(struct ath12k *ar, ptr += sizeof(*eht_mcs); } - /* ML partner links tag with 0 length */ - len = 0; tlv = ptr; + len = arg->ml.enabled ? arg->ml.num_partner_links * sizeof(*partner_info) : 0; + /* fill ML Partner links */ tlv->header = ath12k_wmi_tlv_hdr(WMI_TAG_ARRAY_STRUCT, len); ptr += TLV_HDR_SIZE; + if (len == 0) + goto send; + + for (i = 0; i < arg->ml.num_partner_links; i++) { + u32 cmd = WMI_TAG_MLO_PARTNER_LINK_PARAMS_PEER_ASSOC; + + partner_info = ptr; + partner_info->tlv_header = ath12k_wmi_tlv_cmd_hdr(cmd, + sizeof(*partner_info)); + partner_info->vdev_id = cpu_to_le32(arg->ml.partner_info[i].vdev_id); + partner_info->hw_link_id = + cpu_to_le32(arg->ml.partner_info[i].hw_link_id); + partner_info->flags = cpu_to_le32(ATH12K_WMI_FLAG_MLO_ENABLED); + + if (arg->ml.partner_info[i].assoc_link) + partner_info->flags |= + cpu_to_le32(ATH12K_WMI_FLAG_MLO_ASSOC_LINK); + + if (arg->ml.partner_info[i].primary_umac) + partner_info->flags |= + cpu_to_le32(ATH12K_WMI_FLAG_MLO_PRIMARY_UMAC); + + if (arg->ml.partner_info[i].logical_link_idx_valid) { + v = cpu_to_le32(ATH12K_WMI_FLAG_MLO_LINK_ID_VALID); + partner_info->flags |= v; + } + + partner_info->logical_link_idx = + cpu_to_le32(arg->ml.partner_info[i].logical_link_idx); + ptr += sizeof(*partner_info); + } + +send: ath12k_dbg(ar->ab, ATH12K_DBG_WMI, "wmi peer assoc vdev id %d assoc id %d peer mac %pM peer_flags %x rate_caps %x peer_caps %x listen_intval %d ht_caps %x max_mpdu %d nss %d phymode %d peer_mpdu_density %d vht_caps %x he cap_info %x he ops %x he cap_info_ext %x he phy %x %x %x peer_bw_rxnss_override %x peer_flags_ext %x eht mac_cap %x %x eht phy_cap %x %x %x\n", cmd->vdev_id, cmd->peer_associd, arg->peer_mac, diff --git a/drivers/net/wireless/ath/ath12k/wmi.h b/drivers/net/wireless/ath/ath12k/wmi.h index 2378d94b24098..05aa9754118a0 100644 --- a/drivers/net/wireless/ath/ath12k/wmi.h +++ b/drivers/net/wireless/ath/ath12k/wmi.h @@ -3687,6 +3687,24 @@ struct wmi_vdev_install_key_arg { #define WMI_HECAP_TXRX_MCS_NSS_IDX_160 1 #define WMI_HECAP_TXRX_MCS_NSS_IDX_80_80 2 +#define ATH12K_WMI_MLO_MAX_PARTNER_LINKS \ + (ATH12K_WMI_MLO_MAX_LINKS + ATH12K_MAX_NUM_BRIDGE_LINKS - 1) + +struct peer_assoc_mlo_params { + bool enabled; + bool assoc_link; + bool primary_umac; + bool peer_id_valid; + bool logical_link_idx_valid; + bool bridge_peer; + u8 mld_addr[ETH_ALEN]; + u32 logical_link_idx; + u32 ml_peer_id; + u32 ieee_link_id; + u8 num_partner_links; + struct wmi_ml_partner_info partner_info[ATH12K_WMI_MLO_MAX_LINKS]; +}; + struct wmi_rate_set_arg { u32 num_rates; u8 rates[WMI_MAX_SUPPORTED_RATES]; @@ -3761,8 +3779,36 @@ struct ath12k_wmi_peer_assoc_arg { u32 peer_eht_tx_mcs_set[WMI_MAX_EHTCAP_RATE_SET]; struct ath12k_wmi_ppe_threshold_arg peer_eht_ppet; u32 punct_bitmap; + bool is_assoc; + struct peer_assoc_mlo_params ml; }; +#define ATH12K_WMI_FLAG_MLO_ENABLED BIT(0) +#define ATH12K_WMI_FLAG_MLO_ASSOC_LINK BIT(1) +#define ATH12K_WMI_FLAG_MLO_PRIMARY_UMAC BIT(2) +#define ATH12K_WMI_FLAG_MLO_LINK_ID_VALID BIT(3) +#define ATH12K_WMI_FLAG_MLO_PEER_ID_VALID BIT(4) + +struct wmi_peer_assoc_mlo_partner_info_params { + __le32 tlv_header; + __le32 vdev_id; + __le32 hw_link_id; + __le32 flags; + __le32 logical_link_idx; +} __packed; + +struct wmi_peer_assoc_mlo_params { + __le32 tlv_header; + __le32 flags; + struct ath12k_wmi_mac_addr_params mld_addr; + __le32 logical_link_idx; + __le32 ml_peer_id; + __le32 ieee_link_id; + __le32 emlsr_trans_timeout_us; + __le32 emlsr_trans_delay_us; + __le32 emlsr_padding_delay_us; +} __packed; + struct wmi_peer_assoc_complete_cmd { __le32 tlv_header; struct ath12k_wmi_mac_addr_params peer_macaddr; From 1053987a6bce68127504cda476ee56b97c9109d9 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Mon, 11 Nov 2024 13:47:24 +0300 Subject: [PATCH 0012/1450] wifi: ath9k: miscellaneous spelling fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Correct spelling here and there as suggested by codespell. Signed-off-by: Dmitry Antipov Acked-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20241111104724.484586-1-dmantipov@yandex.ru Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath9k/antenna.c | 2 +- drivers/net/wireless/ath/ath9k/ar9002_hw.c | 2 +- drivers/net/wireless/ath/ath9k/ar9003_hw.c | 2 +- drivers/net/wireless/ath/ath9k/ar9003_mci.c | 4 ++-- drivers/net/wireless/ath/ath9k/ar9003_phy.h | 2 +- drivers/net/wireless/ath/ath9k/channel.c | 2 +- drivers/net/wireless/ath/ath9k/common-spectral.c | 2 +- drivers/net/wireless/ath/ath9k/dfs.c | 2 +- drivers/net/wireless/ath/ath9k/hif_usb.c | 2 +- drivers/net/wireless/ath/ath9k/hw.c | 4 ++-- drivers/net/wireless/ath/ath9k/hw.h | 2 +- drivers/net/wireless/ath/ath9k/mac.h | 2 +- drivers/net/wireless/ath/ath9k/main.c | 2 +- drivers/net/wireless/ath/ath9k/wow.c | 6 +++--- drivers/net/wireless/ath/ath9k/xmit.c | 2 +- 15 files changed, 19 insertions(+), 19 deletions(-) diff --git a/drivers/net/wireless/ath/ath9k/antenna.c b/drivers/net/wireless/ath/ath9k/antenna.c index acc84e6711b0e..e5e274bc9e687 100644 --- a/drivers/net/wireless/ath/ath9k/antenna.c +++ b/drivers/net/wireless/ath/ath9k/antenna.c @@ -193,7 +193,7 @@ static void ath_lnaconf_alt_good_scan(struct ath_ant_comb *antcomb, static void ath_ant_set_alt_ratio(struct ath_ant_comb *antcomb, struct ath_hw_antcomb_conf *conf) { - /* set alt to the conf with maximun ratio */ + /* set alt to the conf with maximum ratio */ if (antcomb->first_ratio && antcomb->second_ratio) { if (antcomb->rssi_second > antcomb->rssi_third) { /* first alt*/ diff --git a/drivers/net/wireless/ath/ath9k/ar9002_hw.c b/drivers/net/wireless/ath/ath9k/ar9002_hw.c index d08ea0b28530e..b262244800410 100644 --- a/drivers/net/wireless/ath/ath9k/ar9002_hw.c +++ b/drivers/net/wireless/ath/ath9k/ar9002_hw.c @@ -395,7 +395,7 @@ static void ar9002_hw_init_hang_checks(struct ath_hw *ah) ah->config.hw_hang_checks |= HW_MAC_HANG; } -/* Sets up the AR5008/AR9001/AR9002 hardware familiy callbacks */ +/* Sets up the AR5008/AR9001/AR9002 hardware family callbacks */ int ar9002_hw_attach_ops(struct ath_hw *ah) { struct ath_hw_private_ops *priv_ops = ath9k_hw_private_ops(ah); diff --git a/drivers/net/wireless/ath/ath9k/ar9003_hw.c b/drivers/net/wireless/ath/ath9k/ar9003_hw.c index e9bd13eeee92f..6595eca749977 100644 --- a/drivers/net/wireless/ath/ath9k/ar9003_hw.c +++ b/drivers/net/wireless/ath/ath9k/ar9003_hw.c @@ -1170,7 +1170,7 @@ static bool ar9003_hw_detect_mac_hang(struct ath_hw *ah) return false; } -/* Sets up the AR9003 hardware familiy callbacks */ +/* Sets up the AR9003 hardware family callbacks */ void ar9003_hw_attach_ops(struct ath_hw *ah) { struct ath_hw_private_ops *priv_ops = ath9k_hw_private_ops(ah); diff --git a/drivers/net/wireless/ath/ath9k/ar9003_mci.c b/drivers/net/wireless/ath/ath9k/ar9003_mci.c index 2b9c07961cd71..3f0543e55d9ba 100644 --- a/drivers/net/wireless/ath/ath9k/ar9003_mci.c +++ b/drivers/net/wireless/ath/ath9k/ar9003_mci.c @@ -637,7 +637,7 @@ static u32 ar9003_mci_wait_for_gpm(struct ath_hw *ah, u8 gpm_type, * same time. Since BT's calibration doesn't happen * that often, we'll let BT completes calibration then * we continue to wait for cal_grant from BT. - * Orginal: Wait BT_CAL_GRANT. + * Original: Wait BT_CAL_GRANT. * New: Receive BT_CAL_REQ -> send WLAN_CAL_GRANT->wait * BT_CAL_DONE -> Wait BT_CAL_GRANT. */ @@ -747,7 +747,7 @@ int ar9003_mci_end_reset(struct ath_hw *ah, struct ath9k_channel *chan, * BT is sleeping. Check if BT wakes up during * WLAN calibration. If BT wakes up during * WLAN calibration, need to go through all - * message exchanges again and recal. + * message exchanges again and recalibrate. */ REG_WRITE(ah, AR_MCI_INTERRUPT_RX_MSG_RAW, (AR_MCI_INTERRUPT_RX_MSG_REMOTE_RESET | diff --git a/drivers/net/wireless/ath/ath9k/ar9003_phy.h b/drivers/net/wireless/ath/ath9k/ar9003_phy.h index ad72a30b67c3b..e13873fb8e2fe 100644 --- a/drivers/net/wireless/ath/ath9k/ar9003_phy.h +++ b/drivers/net/wireless/ath/ath9k/ar9003_phy.h @@ -246,7 +246,7 @@ /* - * MRC Feild Definitions + * MRC Field Definitions */ #define AR_PHY_SGI_DSC_MAN 0x0007FFF0 #define AR_PHY_SGI_DSC_MAN_S 4 diff --git a/drivers/net/wireless/ath/ath9k/channel.c b/drivers/net/wireless/ath/ath9k/channel.c index 571062f2e82a7..02237d106f8c6 100644 --- a/drivers/net/wireless/ath/ath9k/channel.c +++ b/drivers/net/wireless/ath/ath9k/channel.c @@ -17,7 +17,7 @@ #include "ath9k.h" /* Set/change channels. If the channel is really being changed, it's done - * by reseting the chip. To accomplish this we must first cleanup any pending + * by resetting the chip. To accomplish this we must first cleanup any pending * DMA, then restart stuff. */ static int ath_set_channel(struct ath_softc *sc) diff --git a/drivers/net/wireless/ath/ath9k/common-spectral.c b/drivers/net/wireless/ath/ath9k/common-spectral.c index 4b27445a5fb87..628eeec4b82fe 100644 --- a/drivers/net/wireless/ath/ath9k/common-spectral.c +++ b/drivers/net/wireless/ath/ath9k/common-spectral.c @@ -734,7 +734,7 @@ void ath9k_cmn_spectral_scan_trigger(struct ath_common *common, ATH9K_RX_FILTER_PHYRADAR | ATH9K_RX_FILTER_PHYERR); - /* TODO: usually this should not be neccesary, but for some reason + /* TODO: usually this should not be necessary, but for some reason * (or in some mode?) the trigger must be called after the * configuration, otherwise the register will have its values reset * (on my ar9220 to value 0x01002310) diff --git a/drivers/net/wireless/ath/ath9k/dfs.c b/drivers/net/wireless/ath/ath9k/dfs.c index 3689e12db9f74..2fb73a5e1d514 100644 --- a/drivers/net/wireless/ath/ath9k/dfs.c +++ b/drivers/net/wireless/ath/ath9k/dfs.c @@ -79,7 +79,7 @@ static int ath9k_get_max_index_ht40(struct ath9k_dfs_fft_40 *fft, const int DFS_UPPER_BIN_OFFSET = 64; /* if detected radar on both channels, select the significant one */ if (is_ctl && is_ext) { - /* first check wether channels have 'strong' bins */ + /* first check whether channels have 'strong' bins */ is_ctl = fft_bitmap_weight(fft->lower_bins) != 0; is_ext = fft_bitmap_weight(fft->upper_bins) != 0; diff --git a/drivers/net/wireless/ath/ath9k/hif_usb.c b/drivers/net/wireless/ath/ath9k/hif_usb.c index 7265766cddbde..fe9abe8cd268f 100644 --- a/drivers/net/wireless/ath/ath9k/hif_usb.c +++ b/drivers/net/wireless/ath/ath9k/hif_usb.c @@ -1198,7 +1198,7 @@ static int ath9k_hif_request_firmware(struct hif_device_usb *hif_dev, filename = FIRMWARE_AR9271; /* expected fw locations: - * - htc_9271.fw (stable version 1.3, depricated) + * - htc_9271.fw (stable version 1.3, deprecated) */ snprintf(hif_dev->fw_name, sizeof(hif_dev->fw_name), "%s", filename); diff --git a/drivers/net/wireless/ath/ath9k/hw.c b/drivers/net/wireless/ath/ath9k/hw.c index e2bef099adb30..a25eacabc664a 100644 --- a/drivers/net/wireless/ath/ath9k/hw.c +++ b/drivers/net/wireless/ath/ath9k/hw.c @@ -2149,7 +2149,7 @@ static void ath9k_set_power_network_sleep(struct ath_hw *ah) /* When chip goes into network sleep, it could be waken * up by MCI_INT interrupt caused by BT's HW messages - * (LNA_xxx, CONT_xxx) which chould be in a very fast + * (LNA_xxx, CONT_xxx) which could be in a very fast * rate (~100us). This will cause chip to leave and * re-enter network sleep mode frequently, which in * consequence will have WLAN MCI HW to generate lots of @@ -2544,7 +2544,7 @@ int ath9k_hw_fill_cap_info(struct ath_hw *ah) pCap->tx_chainmask = ah->eep_ops->get_eeprom(ah, EEP_TX_MASK); /* - * For AR9271 we will temporarilly uses the rx chainmax as read from + * For AR9271 we will temporarily use the rx chainmax as read from * the EEPROM. */ if ((ah->hw_version.devid == AR5416_DEVID_PCI) && diff --git a/drivers/net/wireless/ath/ath9k/hw.h b/drivers/net/wireless/ath/ath9k/hw.h index 450ab19b1d4e8..e2cbf3f00da07 100644 --- a/drivers/net/wireless/ath/ath9k/hw.h +++ b/drivers/net/wireless/ath/ath9k/hw.h @@ -282,7 +282,7 @@ enum ath9k_hw_caps { * an exact user defined pattern or de-authentication/disassoc pattern. * @ATH9K_HW_WOW_PATTERN_MATCH_DWORD: device requires the first four * bytes of the pattern for user defined pattern, de-authentication and - * disassociation patterns for all types of possible frames recieved + * disassociation patterns for all types of possible frames received * of those types. */ diff --git a/drivers/net/wireless/ath/ath9k/mac.h b/drivers/net/wireless/ath/ath9k/mac.h index f03d792732da7..16203e7ecf290 100644 --- a/drivers/net/wireless/ath/ath9k/mac.h +++ b/drivers/net/wireless/ath/ath9k/mac.h @@ -251,7 +251,7 @@ struct ath_desc { * when the descriptor is specifically marked to generate * an interrupt with this flag. Descriptors should be * marked periodically to insure timely replenishing of the - * supply needed for sending frames. Defering interrupts + * supply needed for sending frames. Deferring interrupts * reduces system load and potentially allows more concurrent * work to be done but if done to aggressively can cause * senders to backup. When the hardware queue is left too diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c index b92c89dad8dea..dd79107828588 100644 --- a/drivers/net/wireless/ath/ath9k/main.c +++ b/drivers/net/wireless/ath/ath9k/main.c @@ -1001,7 +1001,7 @@ static bool ath9k_uses_beacons(int type) static void ath9k_vif_iter_set_beacon(struct ath9k_vif_iter_data *iter_data, struct ieee80211_vif *vif) { - /* Use the first (configured) interface, but prefering AP interfaces. */ + /* Use the first (configured) interface, but preferring AP interfaces. */ if (!iter_data->primary_beacon_vif) { iter_data->primary_beacon_vif = vif; } else { diff --git a/drivers/net/wireless/ath/ath9k/wow.c b/drivers/net/wireless/ath/ath9k/wow.c index 8d0b1730a9d5b..ed4152cd44f0e 100644 --- a/drivers/net/wireless/ath/ath9k/wow.c +++ b/drivers/net/wireless/ath/ath9k/wow.c @@ -60,7 +60,7 @@ static int ath9k_wow_add_disassoc_deauth_pattern(struct ath_softc *sc) memset(dis_deauth_mask, 0, MAX_PATTERN_SIZE); /* - * Create Dissassociate / Deauthenticate packet filter + * Create Disassociate / Deauthenticate packet filter * * 2 bytes 2 byte 6 bytes 6 bytes 6 bytes * +--------------+----------+---------+--------+--------+---- @@ -70,7 +70,7 @@ static int ath9k_wow_add_disassoc_deauth_pattern(struct ath_softc *sc) * The above is the management frame format for disassociate/ * deauthenticate pattern, from this we need to match the first byte * of 'Frame Control' and DA, SA, and BSSID fields - * (skipping 2nd byte of FC and Duration feild. + * (skipping 2nd byte of FC and Duration field. * * Disassociate pattern * -------------------- @@ -225,7 +225,7 @@ int ath9k_suspend(struct ieee80211_hw *hw, ath9k_stop_btcoex(sc); /* - * Enable wake up on recieving disassoc/deauth + * Enable wake up on receiving disassoc/deauth * frame by default. */ ret = ath9k_wow_add_disassoc_deauth_pattern(sc); diff --git a/drivers/net/wireless/ath/ath9k/xmit.c b/drivers/net/wireless/ath/ath9k/xmit.c index 35aa47a9db90b..0a24439dd30d7 100644 --- a/drivers/net/wireless/ath/ath9k/xmit.c +++ b/drivers/net/wireless/ath/ath9k/xmit.c @@ -557,7 +557,7 @@ static void ath_tx_complete_aggr(struct ath_softc *sc, struct ath_txq *txq, /* * AR5416 can become deaf/mute when BA * issue happens. Chip needs to be reset. - * But AP code may have sychronization issues + * But AP code may have synchronization issues * when perform internal reset in this routine. * Only enable reset in STA mode for now. */ From eb8c0534713865d190856f10bfc97cf0b88475b1 Mon Sep 17 00:00:00 2001 From: Karol Przybylski Date: Tue, 5 Nov 2024 11:11:31 +0100 Subject: [PATCH 0013/1450] wifi: ath12k: Fix for out-of bound access error Selfgen stats are placed in a buffer using print_array_to_buf_index() function. Array length parameter passed to the function is too big, resulting in possible out-of bound memory error. Decreasing buffer size by one fixes faulty upper bound of passed array. Discovered in coverity scan, CID 1600742 and CID 1600758 Signed-off-by: Karol Przybylski Acked-by: Kalle Valo Link: https://patch.msgid.link/20241105101132.374372-1-karprzy7@gmail.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/debugfs_htt_stats.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.c b/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.c index c9980c0193d1d..43ea87e981f42 100644 --- a/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.c +++ b/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.c @@ -1562,7 +1562,8 @@ ath12k_htt_print_tx_selfgen_ac_stats_tlv(const void *tag_buf, u16 tag_len, le32_to_cpu(htt_stats_buf->ac_mu_mimo_ndp)); len += print_array_to_buf_index(buf, len, "ac_mu_mimo_brpollX_tried = ", 1, htt_stats_buf->ac_mu_mimo_brpoll, - ATH12K_HTT_TX_NUM_AC_MUMIMO_USER_STATS, "\n\n"); + ATH12K_HTT_TX_NUM_AC_MUMIMO_USER_STATS - 1, + "\n\n"); stats_req->buf_len = len; } @@ -1590,7 +1591,7 @@ ath12k_htt_print_tx_selfgen_ax_stats_tlv(const void *tag_buf, u16 tag_len, le32_to_cpu(htt_stats_buf->ax_mu_mimo_ndp)); len += print_array_to_buf_index(buf, len, "ax_mu_mimo_brpollX_tried = ", 1, htt_stats_buf->ax_mu_mimo_brpoll, - ATH12K_HTT_TX_NUM_AX_MUMIMO_USER_STATS, "\n"); + ATH12K_HTT_TX_NUM_AX_MUMIMO_USER_STATS - 1, "\n"); len += scnprintf(buf + len, buf_len - len, "ax_basic_trigger = %u\n", le32_to_cpu(htt_stats_buf->ax_basic_trigger)); len += scnprintf(buf + len, buf_len - len, "ax_ulmumimo_total_trigger = %u\n", From 78e154d42f2c72905fe66a400847e1b2b101b7b2 Mon Sep 17 00:00:00 2001 From: Balaji Pothunoori Date: Wed, 30 Oct 2024 17:16:25 +0530 Subject: [PATCH 0014/1450] wifi: ath11k: Fix unexpected return buffer manager error for WCN6750/WCN6855 The following error messages were encountered while parsing fragmented RX packets for WCN6750/WCN6855: ath11k 17a10040.wifi: invalid return buffer manager 4 This issue arose due to a hardcoded check for HAL_RX_BUF_RBM_SW3_BM introduced in 'commit 71c748b5e01e ("ath11k: Fix unexpected return buffer manager error for QCA6390")' For WCN6750 and WCN6855, the return buffer manager ID should be HAL_RX_BUF_RBM_SW1_BM. The incorrect conditional check caused fragmented packets to be dropped, resulting in the above error log. Fix this by adding a check for HAL_RX_BUF_RBM_SW1_BM. Tested-on: WCN6750 hw1.0 AHB WLAN.MSL.2.0.c2-00258-QCAMSLSWPL-1 Tested-on: WCN6855 hw2.1 WLAN.HSP.1.1-04479-QCAHSPSWPL_V1_V2_SILICONZ_IOE-1 Fixes: 71c748b5e01e ("ath11k: Fix unexpected return buffer manager error for QCA6390") Signed-off-by: Balaji Pothunoori Acked-by: Jeff Johnson Acked-by: Kalle Valo Link: https://patch.msgid.link/20241030114625.2416942-1-quic_bpothuno@quicinc.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath11k/dp_rx.c | 1 + drivers/net/wireless/ath/ath11k/hal_rx.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath11k/dp_rx.c b/drivers/net/wireless/ath/ath11k/dp_rx.c index c087d8a0f5b25..176bbc5d95a62 100644 --- a/drivers/net/wireless/ath/ath11k/dp_rx.c +++ b/drivers/net/wireless/ath/ath11k/dp_rx.c @@ -3872,6 +3872,7 @@ int ath11k_dp_process_rx_err(struct ath11k_base *ab, struct napi_struct *napi, ath11k_hal_rx_msdu_link_info_get(link_desc_va, &num_msdus, msdu_cookies, &rbm); if (rbm != HAL_RX_BUF_RBM_WBM_IDLE_DESC_LIST && + rbm != HAL_RX_BUF_RBM_SW1_BM && rbm != HAL_RX_BUF_RBM_SW3_BM) { ab->soc_stats.invalid_rbm++; ath11k_warn(ab, "invalid return buffer manager %d\n", rbm); diff --git a/drivers/net/wireless/ath/ath11k/hal_rx.c b/drivers/net/wireless/ath/ath11k/hal_rx.c index 8f7dd43dc1bd8..753bd93f02123 100644 --- a/drivers/net/wireless/ath/ath11k/hal_rx.c +++ b/drivers/net/wireless/ath/ath11k/hal_rx.c @@ -372,7 +372,8 @@ int ath11k_hal_wbm_desc_parse_err(struct ath11k_base *ab, void *desc, ret_buf_mgr = FIELD_GET(BUFFER_ADDR_INFO1_RET_BUF_MGR, wbm_desc->buf_addr_info.info1); - if (ret_buf_mgr != HAL_RX_BUF_RBM_SW3_BM) { + if (ret_buf_mgr != HAL_RX_BUF_RBM_SW1_BM && + ret_buf_mgr != HAL_RX_BUF_RBM_SW3_BM) { ab->soc_stats.invalid_rbm++; return -EINVAL; } From 7b5ce65d90187f0944e70dc5741aa0edfac926f4 Mon Sep 17 00:00:00 2001 From: Larry Finger Date: Wed, 6 Nov 2024 15:55:31 +0200 Subject: [PATCH 0015/1450] wifi: rtw88: 8821au: Add additional devices to the USB_DEVICE list These are the entries that Nick Morrow provided. From https://github.com/morrownr/8821au-20210708 Signed-off-by: Larry Finger Signed-off-by: Bitterblue Smith Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/da05b866-a9ff-428c-a008-35e8cf200a98@gmail.com --- .../net/wireless/realtek/rtw88/rtw8821au.c | 52 ++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/realtek/rtw88/rtw8821au.c b/drivers/net/wireless/realtek/rtw88/rtw8821au.c index 730018773e1c3..a01744b64e8da 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8821au.c +++ b/drivers/net/wireless/realtek/rtw88/rtw8821au.c @@ -9,8 +9,58 @@ #include "usb.h" static const struct usb_device_id rtw_8821au_id_table[] = { - { USB_DEVICE_AND_INTERFACE_INFO(0x2357, 0x011e, 0xff, 0xff, 0xff), + { USB_DEVICE_AND_INTERFACE_INFO(RTW_USB_VENDOR_ID_REALTEK, 0x0811, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8821a_hw_spec) }, + { USB_DEVICE_AND_INTERFACE_INFO(RTW_USB_VENDOR_ID_REALTEK, 0x0820, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8821a_hw_spec) }, + { USB_DEVICE_AND_INTERFACE_INFO(RTW_USB_VENDOR_ID_REALTEK, 0x0821, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8821a_hw_spec) }, + { USB_DEVICE_AND_INTERFACE_INFO(RTW_USB_VENDOR_ID_REALTEK, 0x8822, 0xff, 0xff, 0xff), .driver_info = (kernel_ulong_t)&(rtw8821a_hw_spec) }, + { USB_DEVICE_AND_INTERFACE_INFO(RTW_USB_VENDOR_ID_REALTEK, 0x0823, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8821a_hw_spec) }, + { USB_DEVICE_AND_INTERFACE_INFO(RTW_USB_VENDOR_ID_REALTEK, 0xa811, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8821a_hw_spec) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x0411, 0x0242, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8821a_hw_spec) }, /* Buffalo */ + { USB_DEVICE_AND_INTERFACE_INFO(0x0411, 0x029b, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8821a_hw_spec) }, /* Buffalo */ + { USB_DEVICE_AND_INTERFACE_INFO(0x04bb, 0x0953, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8821a_hw_spec) }, /* I-O DATA */ + { USB_DEVICE_AND_INTERFACE_INFO(0x056e, 0x4007, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8821a_hw_spec) }, /* ELECOM */ + { USB_DEVICE_AND_INTERFACE_INFO(0x056e, 0x400e, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8821a_hw_spec) }, /* ELECOM */ + { USB_DEVICE_AND_INTERFACE_INFO(0x056e, 0x400f, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8821a_hw_spec) }, /* ELECOM */ + { USB_DEVICE_AND_INTERFACE_INFO(0x0846, 0x9052, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8821a_hw_spec) }, /* Netgear */ + { USB_DEVICE_AND_INTERFACE_INFO(0x0e66, 0x0023, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8821a_hw_spec) }, /* HAWKING */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2001, 0x3314, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8821a_hw_spec) }, /* D-Link */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2001, 0x3318, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8821a_hw_spec) }, /* D-Link */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2019, 0xab32, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8821a_hw_spec) }, /* Planex */ + { USB_DEVICE_AND_INTERFACE_INFO(0x20f4, 0x804b, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8821a_hw_spec) }, /* TRENDnet */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2357, 0x011e, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8821a_hw_spec) }, /* TP Link */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2357, 0x011f, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8821a_hw_spec) }, /* TP Link */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2357, 0x0120, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8821a_hw_spec) }, /* TP Link */ + { USB_DEVICE_AND_INTERFACE_INFO(0x3823, 0x6249, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8821a_hw_spec) }, /* Obihai */ + { USB_DEVICE_AND_INTERFACE_INFO(0x7392, 0xa811, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8821a_hw_spec) }, /* Edimax */ + { USB_DEVICE_AND_INTERFACE_INFO(0x7392, 0xa812, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8821a_hw_spec) }, /* Edimax */ + { USB_DEVICE_AND_INTERFACE_INFO(0x7392, 0xa813, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8821a_hw_spec) }, /* Edimax */ + { USB_DEVICE_AND_INTERFACE_INFO(0x7392, 0xb611, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8821a_hw_spec) }, /* Edimax */ {}, }; MODULE_DEVICE_TABLE(usb, rtw_8821au_id_table); From 1ee6ff9ae3c1a9eda9081f9db04f85d3a7352d38 Mon Sep 17 00:00:00 2001 From: Nick Morrow Date: Wed, 6 Nov 2024 15:57:10 +0200 Subject: [PATCH 0016/1450] wifi: rtw88: 8812au: Add more device IDs From https://github.com/morrownr/8812au-20210820. Signed-off-by: Nick Morrow Signed-off-by: Bitterblue Smith Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/31b7ee6b-f96d-43e0-a32f-a9eb1174a0c1@gmail.com --- .../net/wireless/realtek/rtw88/rtw8812au.c | 68 ++++++++++++++++++- 1 file changed, 67 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/realtek/rtw88/rtw8812au.c b/drivers/net/wireless/realtek/rtw88/rtw8812au.c index 4da69590a4234..e18995f4cc78e 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8812au.c +++ b/drivers/net/wireless/realtek/rtw88/rtw8812au.c @@ -9,8 +9,74 @@ #include "usb.h" static const struct usb_device_id rtw_8812au_id_table[] = { - { USB_DEVICE_AND_INTERFACE_INFO(0x2604, 0x0012, 0xff, 0xff, 0xff), + { USB_DEVICE_AND_INTERFACE_INFO(RTW_USB_VENDOR_ID_REALTEK, 0x8812, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8812a_hw_spec) }, + { USB_DEVICE_AND_INTERFACE_INFO(RTW_USB_VENDOR_ID_REALTEK, 0x881a, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8812a_hw_spec) }, + { USB_DEVICE_AND_INTERFACE_INFO(RTW_USB_VENDOR_ID_REALTEK, 0x881b, 0xff, 0xff, 0xff), .driver_info = (kernel_ulong_t)&(rtw8812a_hw_spec) }, + { USB_DEVICE_AND_INTERFACE_INFO(RTW_USB_VENDOR_ID_REALTEK, 0x881c, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8812a_hw_spec) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x0409, 0x0408, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8812a_hw_spec) }, /* NEC */ + { USB_DEVICE_AND_INTERFACE_INFO(0x0411, 0x025d, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8812a_hw_spec) }, /* Buffalo */ + { USB_DEVICE_AND_INTERFACE_INFO(0x04bb, 0x0952, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8812a_hw_spec) }, /* I-O DATA */ + { USB_DEVICE_AND_INTERFACE_INFO(0x050d, 0x1106, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8812a_hw_spec) }, /* Belkin */ + { USB_DEVICE_AND_INTERFACE_INFO(0x050d, 0x1109, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8812a_hw_spec) }, /* Belkin */ + { USB_DEVICE_AND_INTERFACE_INFO(0x0586, 0x3426, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8812a_hw_spec) }, /* ZyXEL */ + { USB_DEVICE_AND_INTERFACE_INFO(0x0789, 0x016e, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8812a_hw_spec) }, /* Logitec */ + { USB_DEVICE_AND_INTERFACE_INFO(0x07b8, 0x8812, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8812a_hw_spec) }, /* Abocom */ + { USB_DEVICE_AND_INTERFACE_INFO(0x0846, 0x9051, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8812a_hw_spec) }, /* Netgear */ + { USB_DEVICE_AND_INTERFACE_INFO(0x0b05, 0x17d2, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8812a_hw_spec) }, /* ASUS */ + { USB_DEVICE_AND_INTERFACE_INFO(0x0df6, 0x0074, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8812a_hw_spec) }, /* Sitecom */ + { USB_DEVICE_AND_INTERFACE_INFO(0x0e66, 0x0022, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8812a_hw_spec) }, /* Hawking */ + { USB_DEVICE_AND_INTERFACE_INFO(0x1058, 0x0632, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8812a_hw_spec) }, /* WD */ + { USB_DEVICE_AND_INTERFACE_INFO(0x13b1, 0x003f, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8812a_hw_spec) }, /* Linksys */ + { USB_DEVICE_AND_INTERFACE_INFO(0x148f, 0x9097, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8812a_hw_spec) }, /* Amped Wireless */ + { USB_DEVICE_AND_INTERFACE_INFO(0x1740, 0x0100, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8812a_hw_spec) }, /* EnGenius */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2001, 0x330e, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8812a_hw_spec) }, /* D-Link */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2001, 0x3313, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8812a_hw_spec) }, /* D-Link */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2001, 0x3315, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8812a_hw_spec) }, /* D-Link */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2001, 0x3316, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8812a_hw_spec) }, /* D-Link */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2019, 0xab30, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8812a_hw_spec) }, /* Planex */ + { USB_DEVICE_AND_INTERFACE_INFO(0x20f4, 0x805b, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8812a_hw_spec) }, /* TRENDnet */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2357, 0x0101, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8812a_hw_spec) }, /* TP-Link */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2357, 0x0103, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8812a_hw_spec) }, /* TP-Link */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2357, 0x010d, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8812a_hw_spec) }, /* TP-Link */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2357, 0x010e, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8812a_hw_spec) }, /* TP-Link */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2357, 0x010f, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8812a_hw_spec) }, /* TP-Link */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2357, 0x0122, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8812a_hw_spec) }, /* TP-Link */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2604, 0x0012, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8812a_hw_spec) }, /* Tenda */ + { USB_DEVICE_AND_INTERFACE_INFO(0x7392, 0xa822, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8812a_hw_spec) }, /* Edimax */ {}, }; MODULE_DEVICE_TABLE(usb, rtw_8812au_id_table); From d4c4903508f9e1b2bfec88f777718484e27343fb Mon Sep 17 00:00:00 2001 From: Nick Morrow Date: Thu, 7 Nov 2024 08:28:46 +0800 Subject: [PATCH 0017/1450] wifi: rtw88: Add additional USB IDs for RTL8812BU Add three additional USB IDs found in https://github.com/morrownr/88x2bu-20210702 to support more RTL8812BU devices. Signed-off-by: Nick Morrow Signed-off-by: Zenm Chen Reviewed-by: Mikhail Novosyolov Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241107002846.13748-1-zenmchen@gmail.com --- drivers/net/wireless/realtek/rtw88/rtw8822bu.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/wireless/realtek/rtw88/rtw8822bu.c b/drivers/net/wireless/realtek/rtw88/rtw8822bu.c index ab620a0b1dfc6..8883300fc6adb 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8822bu.c +++ b/drivers/net/wireless/realtek/rtw88/rtw8822bu.c @@ -67,6 +67,12 @@ static const struct usb_device_id rtw_8822bu_id_table[] = { .driver_info = (kernel_ulong_t)&(rtw8822b_hw_spec) }, /* LiteOn */ { USB_DEVICE_AND_INTERFACE_INFO(0x20f4, 0x808a, 0xff, 0xff, 0xff), .driver_info = (kernel_ulong_t)&(rtw8822b_hw_spec) }, /* TRENDnet TEW-808UBM */ + { USB_DEVICE_AND_INTERFACE_INFO(0x20f4, 0x805a, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8822b_hw_spec) }, /* TRENDnet TEW-805UBH */ + { USB_DEVICE_AND_INTERFACE_INFO(0x056e, 0x4011, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8822b_hw_spec) }, /* ELECOM WDB-867DU3S */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2c4e, 0x0107, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&(rtw8822b_hw_spec) }, /* Mercusys MA30H */ {}, }; MODULE_DEVICE_TABLE(usb, rtw_8822bu_id_table); From 5e5903a442bb889a62a0f5d89ac33e53ab08592c Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 6 Nov 2024 15:46:42 +0000 Subject: [PATCH 0018/1450] wifi: rtlwifi: rtl8821ae: phy: restore removed code to fix infinite loop A previous clean-up fix removed the assignment of v2 inside a while loop that turned it into an infinite loop. Fix this by restoring the assignment of v2 from array[] so that v2 is updated inside the loop. Fixes: cda37445718d ("wifi: rtlwifi: rtl8821ae: phy: remove some useless code") Signed-off-by: Colin Ian King Tested-by: Ping-Ke Shih Reviewed-by: Su Hui Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241106154642.1627886-1-colin.i.king@gmail.com --- drivers/net/wireless/realtek/rtlwifi/rtl8821ae/phy.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/phy.c b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/phy.c index 1be51ea3f3c82..9eddbada8af12 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/phy.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/phy.c @@ -2033,8 +2033,10 @@ static bool _rtl8821ae_phy_config_bb_with_pgheaderfile(struct ieee80211_hw *hw, if (!_rtl8821ae_check_condition(hw, v1)) { i += 2; /* skip the pair of expression*/ v2 = array[i+1]; - while (v2 != 0xDEAD) + while (v2 != 0xDEAD) { i += 3; + v2 = array[i + 1]; + } } } } From e73e11d303940119e41850a0452a0deda2cc4eb5 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Thu, 7 Nov 2024 10:33:18 -0300 Subject: [PATCH 0019/1450] wifi: rtlwifi: do not complete firmware loading needlessly The only code waiting for completion is driver removal, which will not be called when probe returns a failure. So this completion is unnecessary. Fixes: b0302aba812b ("rtlwifi: Convert to asynchronous firmware load") Signed-off-by: Thadeu Lima de Souza Cascardo Acked-by: Ping-Ke Shih Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241107133322.855112-2-cascardo@igalia.com --- drivers/net/wireless/realtek/rtlwifi/pci.c | 1 - drivers/net/wireless/realtek/rtlwifi/usb.c | 1 - 2 files changed, 2 deletions(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c b/drivers/net/wireless/realtek/rtlwifi/pci.c index 11709b6c83f1a..40fc3c297a8ac 100644 --- a/drivers/net/wireless/realtek/rtlwifi/pci.c +++ b/drivers/net/wireless/realtek/rtlwifi/pci.c @@ -2266,7 +2266,6 @@ int rtl_pci_probe(struct pci_dev *pdev, pci_iounmap(pdev, (void __iomem *)rtlpriv->io.pci_mem_start); pci_release_regions(pdev); - complete(&rtlpriv->firmware_loading_complete); fail1: if (hw) diff --git a/drivers/net/wireless/realtek/rtlwifi/usb.c b/drivers/net/wireless/realtek/rtlwifi/usb.c index d37a017b2b814..c3aa0cd9ff211 100644 --- a/drivers/net/wireless/realtek/rtlwifi/usb.c +++ b/drivers/net/wireless/realtek/rtlwifi/usb.c @@ -1040,7 +1040,6 @@ int rtl_usb_probe(struct usb_interface *intf, error_out2: _rtl_usb_io_handler_release(hw); usb_put_dev(udev); - complete(&rtlpriv->firmware_loading_complete); kfree(rtlpriv->usb_data); ieee80211_free_hw(hw); return -ENODEV; From 8559a9e0c457729fe3edb3176bbf7c7874f482b0 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Thu, 7 Nov 2024 10:33:19 -0300 Subject: [PATCH 0020/1450] wifi: rtlwifi: rtl8192se: rise completion of firmware loading as last step Just like in commit 4dfde294b979 ("rtlwifi: rise completion at the last step of firmware callback"), only signal completion once the function is finished. Otherwise, the module removal waiting for the completion could free the memory that the callback will still use before returning. Fixes: b0302aba812b ("rtlwifi: Convert to asynchronous firmware load") Signed-off-by: Thadeu Lima de Souza Cascardo Acked-by: Ping-Ke Shih Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241107133322.855112-3-cascardo@igalia.com --- drivers/net/wireless/realtek/rtlwifi/rtl8192se/sw.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192se/sw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8192se/sw.c index bbf8ff63dcedb..e63c67b1861b5 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8192se/sw.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192se/sw.c @@ -64,22 +64,23 @@ static void rtl92se_fw_cb(const struct firmware *firmware, void *context) rtl_dbg(rtlpriv, COMP_ERR, DBG_LOUD, "Firmware callback routine entered!\n"); - complete(&rtlpriv->firmware_loading_complete); if (!firmware) { pr_err("Firmware %s not available\n", fw_name); rtlpriv->max_fw_size = 0; - return; + goto exit; } if (firmware->size > rtlpriv->max_fw_size) { pr_err("Firmware is too big!\n"); rtlpriv->max_fw_size = 0; release_firmware(firmware); - return; + goto exit; } pfirmware = (struct rt_firmware *)rtlpriv->rtlhal.pfirmware; memcpy(pfirmware->sz_fw_tmpbuffer, firmware->data, firmware->size); pfirmware->sz_fw_tmpbufferlen = firmware->size; release_firmware(firmware); +exit: + complete(&rtlpriv->firmware_loading_complete); } static int rtl92s_init_sw_vars(struct ieee80211_hw *hw) From b4b26642b31ef282df6ff7ea8531985edfdef12a Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Thu, 7 Nov 2024 10:33:20 -0300 Subject: [PATCH 0021/1450] wifi: rtlwifi: wait for firmware loading before releasing memory At probe error path, the firmware loading work may have already been queued. In such a case, it will try to access memory allocated by the probe function, which is about to be released. In such paths, wait for the firmware worker to finish before releasing memory. Fixes: a7f7c15e945a ("rtlwifi: rtl8192cu: Free ieee80211_hw if probing fails") Signed-off-by: Thadeu Lima de Souza Cascardo Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241107133322.855112-4-cascardo@igalia.com --- drivers/net/wireless/realtek/rtlwifi/usb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/usb.c b/drivers/net/wireless/realtek/rtlwifi/usb.c index c3aa0cd9ff211..c27b116ccdff5 100644 --- a/drivers/net/wireless/realtek/rtlwifi/usb.c +++ b/drivers/net/wireless/realtek/rtlwifi/usb.c @@ -1028,13 +1028,15 @@ int rtl_usb_probe(struct usb_interface *intf, err = ieee80211_register_hw(hw); if (err) { pr_err("Can't register mac80211 hw.\n"); - goto error_out; + goto error_init_vars; } rtlpriv->mac80211.mac80211_registered = 1; set_bit(RTL_STATUS_INTERFACE_START, &rtlpriv->status); return 0; +error_init_vars: + wait_for_completion(&rtlpriv->firmware_loading_complete); error_out: rtl_deinit_core(hw); error_out2: From 00260350aed80c002df270c805ca443ec9a719a6 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Thu, 7 Nov 2024 10:33:21 -0300 Subject: [PATCH 0022/1450] wifi: rtlwifi: fix init_sw_vars leak when probe fails If ieee80211_register_hw fails, the memory allocated for the firmware will not be released. Call deinit_sw_vars as the function that undoes the allocationes done by init_sw_vars. Fixes: cefe3dfdb9f5 ("rtl8192cu: Call ieee80211_register_hw from rtl_usb_probe") Signed-off-by: Thadeu Lima de Souza Cascardo Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241107133322.855112-5-cascardo@igalia.com --- drivers/net/wireless/realtek/rtlwifi/usb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/realtek/rtlwifi/usb.c b/drivers/net/wireless/realtek/rtlwifi/usb.c index c27b116ccdff5..8ec687fab5721 100644 --- a/drivers/net/wireless/realtek/rtlwifi/usb.c +++ b/drivers/net/wireless/realtek/rtlwifi/usb.c @@ -1037,6 +1037,7 @@ int rtl_usb_probe(struct usb_interface *intf, error_init_vars: wait_for_completion(&rtlpriv->firmware_loading_complete); + rtlpriv->cfg->ops->deinit_sw_vars(hw); error_out: rtl_deinit_core(hw); error_out2: From f79bc5c67867c19ce2762e7934c20dbb835ed82c Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Thu, 7 Nov 2024 10:33:22 -0300 Subject: [PATCH 0023/1450] wifi: rtlwifi: usb: fix workqueue leak when probe fails rtl_init_core creates a workqueue that is then assigned to rtl_wq. rtl_deinit_core does not destroy it. It is left to rtl_usb_deinit, which must be called in the probe error path. Fixes: 2ca20f79e0d8 ("rtlwifi: Add usb driver") Fixes: 851639fdaeac ("rtlwifi: Modify some USB de-initialize code.") Signed-off-by: Thadeu Lima de Souza Cascardo Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241107133322.855112-6-cascardo@igalia.com --- drivers/net/wireless/realtek/rtlwifi/usb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/realtek/rtlwifi/usb.c b/drivers/net/wireless/realtek/rtlwifi/usb.c index 8ec687fab5721..0368ecea2e817 100644 --- a/drivers/net/wireless/realtek/rtlwifi/usb.c +++ b/drivers/net/wireless/realtek/rtlwifi/usb.c @@ -1039,6 +1039,7 @@ int rtl_usb_probe(struct usb_interface *intf, wait_for_completion(&rtlpriv->firmware_loading_complete); rtlpriv->cfg->ops->deinit_sw_vars(hw); error_out: + rtl_usb_deinit(hw); rtl_deinit_core(hw); error_out2: _rtl_usb_io_handler_release(hw); From 9c1df813e08832c3836c254bc8a2f83ff22dbc06 Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Mon, 11 Nov 2024 14:38:35 +0800 Subject: [PATCH 0024/1450] wifi: rtw89: pci: disable PCIE wake bit when PCIE deinit The PCIE wake bit is to control PCIE wake signal to host. When PCIE is going down, clear this bit to prevent waking up host unexpectedly. Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241111063835.15454-1-pkshih@realtek.com --- drivers/net/wireless/realtek/rtw89/pci.c | 16 +++++++++++++--- drivers/net/wireless/realtek/rtw89/pci.h | 9 +++++++++ drivers/net/wireless/realtek/rtw89/pci_be.c | 1 + 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw89/pci.c b/drivers/net/wireless/realtek/rtw89/pci.c index f923bec03d410..c3a027735d0f9 100644 --- a/drivers/net/wireless/realtek/rtw89/pci.c +++ b/drivers/net/wireless/realtek/rtw89/pci.c @@ -2516,7 +2516,7 @@ static int rtw89_pci_dphy_delay(struct rtw89_dev *rtwdev) PCIE_DPHY_DLY_25US, PCIE_PHY_GEN1); } -static void rtw89_pci_power_wake(struct rtw89_dev *rtwdev, bool pwr_up) +static void rtw89_pci_power_wake_ax(struct rtw89_dev *rtwdev, bool pwr_up) { if (pwr_up) rtw89_write32_set(rtwdev, R_AX_HCI_OPT_CTRL, BIT_WAKE_CTRL); @@ -2825,6 +2825,8 @@ static int rtw89_pci_ops_deinit(struct rtw89_dev *rtwdev) { const struct rtw89_pci_info *info = rtwdev->pci_info; + rtw89_pci_power_wake(rtwdev, false); + if (rtwdev->chip->chip_id == RTL8852A) { /* ltr sw trigger */ rtw89_write32_set(rtwdev, R_AX_LTR_CTRL_0, B_AX_APP_LTR_IDLE); @@ -2867,7 +2869,7 @@ static int rtw89_pci_ops_mac_pre_init_ax(struct rtw89_dev *rtwdev) return ret; } - rtw89_pci_power_wake(rtwdev, true); + rtw89_pci_power_wake_ax(rtwdev, true); rtw89_pci_autoload_hang(rtwdev); rtw89_pci_l12_vmain(rtwdev); rtw89_pci_gen2_force_ib(rtwdev); @@ -2912,6 +2914,13 @@ static int rtw89_pci_ops_mac_pre_init_ax(struct rtw89_dev *rtwdev) return 0; } +static int rtw89_pci_ops_mac_pre_deinit_ax(struct rtw89_dev *rtwdev) +{ + rtw89_pci_power_wake_ax(rtwdev, false); + + return 0; +} + int rtw89_pci_ltr_set(struct rtw89_dev *rtwdev, bool en) { u32 val; @@ -4325,7 +4334,7 @@ const struct rtw89_pci_gen_def rtw89_pci_gen_ax = { B_AX_RDU_INT}, .mac_pre_init = rtw89_pci_ops_mac_pre_init_ax, - .mac_pre_deinit = NULL, + .mac_pre_deinit = rtw89_pci_ops_mac_pre_deinit_ax, .mac_post_init = rtw89_pci_ops_mac_post_init_ax, .clr_idx_all = rtw89_pci_clr_idx_all_ax, @@ -4343,6 +4352,7 @@ const struct rtw89_pci_gen_def rtw89_pci_gen_ax = { .l1ss_set = rtw89_pci_l1ss_set_ax, .disable_eq = rtw89_pci_disable_eq_ax, + .power_wake = rtw89_pci_power_wake_ax, }; EXPORT_SYMBOL(rtw89_pci_gen_ax); diff --git a/drivers/net/wireless/realtek/rtw89/pci.h b/drivers/net/wireless/realtek/rtw89/pci.h index b68e2d82eea90..d52db4ca1b997 100644 --- a/drivers/net/wireless/realtek/rtw89/pci.h +++ b/drivers/net/wireless/realtek/rtw89/pci.h @@ -1290,6 +1290,7 @@ struct rtw89_pci_gen_def { void (*l1ss_set)(struct rtw89_dev *rtwdev, bool enable); void (*disable_eq)(struct rtw89_dev *rtwdev); + void (*power_wake)(struct rtw89_dev *rtwdev, bool pwr_up); }; #define RTW89_PCI_SSID(v, d, ssv, ssd, cust) \ @@ -1805,4 +1806,12 @@ static inline void rtw89_pci_disable_eq(struct rtw89_dev *rtwdev) gen_def->disable_eq(rtwdev); } +static inline void rtw89_pci_power_wake(struct rtw89_dev *rtwdev, bool pwr_up) +{ + const struct rtw89_pci_info *info = rtwdev->pci_info; + const struct rtw89_pci_gen_def *gen_def = info->gen_def; + + gen_def->power_wake(rtwdev, pwr_up); +} + #endif diff --git a/drivers/net/wireless/realtek/rtw89/pci_be.c b/drivers/net/wireless/realtek/rtw89/pci_be.c index 34154506f5d46..cd39eebe81861 100644 --- a/drivers/net/wireless/realtek/rtw89/pci_be.c +++ b/drivers/net/wireless/realtek/rtw89/pci_be.c @@ -691,5 +691,6 @@ const struct rtw89_pci_gen_def rtw89_pci_gen_be = { .l1ss_set = rtw89_pci_l1ss_set_be, .disable_eq = rtw89_pci_disable_eq_be, + .power_wake = _patch_pcie_power_wake_be, }; EXPORT_SYMBOL(rtw89_pci_gen_be); From abb541d1e52f1573f40bff2173fe6f8465b0f26c Mon Sep 17 00:00:00 2001 From: Kuan-Chung Chen Date: Mon, 11 Nov 2024 14:51:30 +0800 Subject: [PATCH 0025/1450] wifi: rtw89: sar: tweak 6GHz SAR subbands span Given that the 6GHz subband edges are not aligned, specific frequencies can span two adjacent subbands. We considered the need for this functionality outside of SAR and moved it to a common function. No logic change for existing chips. Signed-off-by: Kuan-Chung Chen Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241111065132.19587-2-pkshih@realtek.com --- drivers/net/wireless/realtek/rtw89/core.c | 47 +++++++++++++++++++ drivers/net/wireless/realtek/rtw89/core.h | 9 ++++ drivers/net/wireless/realtek/rtw89/sar.c | 57 +++-------------------- 3 files changed, 62 insertions(+), 51 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw89/core.c b/drivers/net/wireless/realtek/rtw89/core.c index e5b2968c1431f..f73704fc5f85e 100644 --- a/drivers/net/wireless/realtek/rtw89/core.c +++ b/drivers/net/wireless/realtek/rtw89/core.c @@ -203,6 +203,53 @@ static const struct ieee80211_iface_combination rtw89_iface_combs[] = { }, }; +#define RTW89_6GHZ_SPAN_HEAD 6145 +#define RTW89_6GHZ_SPAN_IDX(center_freq) \ + ((((int)(center_freq) - RTW89_6GHZ_SPAN_HEAD) / 5) / 2) + +#define RTW89_DECL_6GHZ_SPAN(center_freq, subband_l, subband_h) \ + [RTW89_6GHZ_SPAN_IDX(center_freq)] = { \ + .sar_subband_low = RTW89_SAR_6GHZ_ ## subband_l, \ + .sar_subband_high = RTW89_SAR_6GHZ_ ## subband_h, \ + } + +/* Since 6GHz subbands are not edge aligned, some cases span two subbands. + * In the following, we describe each of them with rtw89_6ghz_span. + */ +static const struct rtw89_6ghz_span rtw89_overlapping_6ghz[] = { + RTW89_DECL_6GHZ_SPAN(6145, SUBBAND_5_L, SUBBAND_5_H), + RTW89_DECL_6GHZ_SPAN(6165, SUBBAND_5_L, SUBBAND_5_H), + RTW89_DECL_6GHZ_SPAN(6185, SUBBAND_5_L, SUBBAND_5_H), + RTW89_DECL_6GHZ_SPAN(6505, SUBBAND_6, SUBBAND_7_L), + RTW89_DECL_6GHZ_SPAN(6525, SUBBAND_6, SUBBAND_7_L), + RTW89_DECL_6GHZ_SPAN(6545, SUBBAND_6, SUBBAND_7_L), + RTW89_DECL_6GHZ_SPAN(6665, SUBBAND_7_L, SUBBAND_7_H), + RTW89_DECL_6GHZ_SPAN(6705, SUBBAND_7_L, SUBBAND_7_H), + RTW89_DECL_6GHZ_SPAN(6825, SUBBAND_7_H, SUBBAND_8), + RTW89_DECL_6GHZ_SPAN(6865, SUBBAND_7_H, SUBBAND_8), + RTW89_DECL_6GHZ_SPAN(6875, SUBBAND_7_H, SUBBAND_8), + RTW89_DECL_6GHZ_SPAN(6885, SUBBAND_7_H, SUBBAND_8), +}; + +const struct rtw89_6ghz_span * +rtw89_get_6ghz_span(struct rtw89_dev *rtwdev, u32 center_freq) +{ + int idx; + + if (center_freq >= RTW89_6GHZ_SPAN_HEAD) { + idx = RTW89_6GHZ_SPAN_IDX(center_freq); + /* To decrease size of rtw89_overlapping_6ghz[], + * RTW89_6GHZ_SPAN_IDX() truncates the leading NULLs + * to make first span as index 0 of the table. So, if center + * frequency is less than the first one, it will get netative. + */ + if (idx >= 0 && idx < ARRAY_SIZE(rtw89_overlapping_6ghz)) + return &rtw89_overlapping_6ghz[idx]; + } + + return NULL; +} + bool rtw89_ra_report_to_bitrate(struct rtw89_dev *rtwdev, u8 rpt_rate, u16 *bitrate) { struct ieee80211_rate rate; diff --git a/drivers/net/wireless/realtek/rtw89/core.h b/drivers/net/wireless/realtek/rtw89/core.h index 5ad32eacd0d50..f76c05513d3c4 100644 --- a/drivers/net/wireless/realtek/rtw89/core.h +++ b/drivers/net/wireless/realtek/rtw89/core.h @@ -4597,6 +4597,13 @@ struct rtw89_sar_info { }; }; +struct rtw89_6ghz_span { + enum rtw89_sar_subband sar_subband_low; + enum rtw89_sar_subband sar_subband_high; +}; + +#define RTW89_SAR_SPAN_VALID(span) ((span)->sar_subband_high) + enum rtw89_tas_state { RTW89_TAS_STATE_DPR_OFF, RTW89_TAS_STATE_DPR_ON, @@ -6908,6 +6915,8 @@ struct rtw89_sta_link *rtw89_sta_set_link(struct rtw89_sta *rtwsta, unsigned int link_id); void rtw89_sta_unset_link(struct rtw89_sta *rtwsta, unsigned int link_id); void rtw89_core_set_chip_txpwr(struct rtw89_dev *rtwdev); +const struct rtw89_6ghz_span * +rtw89_get_6ghz_span(struct rtw89_dev *rtwdev, u32 center_freq); void rtw89_get_default_chandef(struct cfg80211_chan_def *chandef); void rtw89_get_channel_params(const struct cfg80211_chan_def *chandef, struct rtw89_chan *chan); diff --git a/drivers/net/wireless/realtek/rtw89/sar.c b/drivers/net/wireless/realtek/rtw89/sar.c index bcc287771b2a3..871f45a6508ca 100644 --- a/drivers/net/wireless/realtek/rtw89/sar.c +++ b/drivers/net/wireless/realtek/rtw89/sar.c @@ -42,7 +42,7 @@ static enum rtw89_sar_subband rtw89_sar_get_subband(struct rtw89_dev *rtwdev, /* freq 6875 (ch 185, 20MHz) spans RTW89_SAR_6GHZ_SUBBAND_7_H * and RTW89_SAR_6GHZ_SUBBAND_8, so directly describe it with - * struct rtw89_sar_span in the following. + * struct rtw89_6ghz_span. */ case 6895 ... 7115: @@ -50,63 +50,18 @@ static enum rtw89_sar_subband rtw89_sar_get_subband(struct rtw89_dev *rtwdev, } } -struct rtw89_sar_span { - enum rtw89_sar_subband subband_low; - enum rtw89_sar_subband subband_high; -}; - -#define RTW89_SAR_SPAN_VALID(span) ((span)->subband_high) - -#define RTW89_SAR_6GHZ_SPAN_HEAD 6145 -#define RTW89_SAR_6GHZ_SPAN_IDX(center_freq) \ - ((((int)(center_freq) - RTW89_SAR_6GHZ_SPAN_HEAD) / 5) / 2) - -#define RTW89_DECL_SAR_6GHZ_SPAN(center_freq, subband_l, subband_h) \ - [RTW89_SAR_6GHZ_SPAN_IDX(center_freq)] = { \ - .subband_low = RTW89_SAR_6GHZ_ ## subband_l, \ - .subband_high = RTW89_SAR_6GHZ_ ## subband_h, \ - } - -/* Since 6GHz SAR subbands are not edge aligned, some cases span two SAR - * subbands. In the following, we describe each of them with rtw89_sar_span. - */ -static const struct rtw89_sar_span rtw89_sar_overlapping_6ghz[] = { - RTW89_DECL_SAR_6GHZ_SPAN(6145, SUBBAND_5_L, SUBBAND_5_H), - RTW89_DECL_SAR_6GHZ_SPAN(6165, SUBBAND_5_L, SUBBAND_5_H), - RTW89_DECL_SAR_6GHZ_SPAN(6185, SUBBAND_5_L, SUBBAND_5_H), - RTW89_DECL_SAR_6GHZ_SPAN(6505, SUBBAND_6, SUBBAND_7_L), - RTW89_DECL_SAR_6GHZ_SPAN(6525, SUBBAND_6, SUBBAND_7_L), - RTW89_DECL_SAR_6GHZ_SPAN(6545, SUBBAND_6, SUBBAND_7_L), - RTW89_DECL_SAR_6GHZ_SPAN(6665, SUBBAND_7_L, SUBBAND_7_H), - RTW89_DECL_SAR_6GHZ_SPAN(6705, SUBBAND_7_L, SUBBAND_7_H), - RTW89_DECL_SAR_6GHZ_SPAN(6825, SUBBAND_7_H, SUBBAND_8), - RTW89_DECL_SAR_6GHZ_SPAN(6865, SUBBAND_7_H, SUBBAND_8), - RTW89_DECL_SAR_6GHZ_SPAN(6875, SUBBAND_7_H, SUBBAND_8), - RTW89_DECL_SAR_6GHZ_SPAN(6885, SUBBAND_7_H, SUBBAND_8), -}; - static int rtw89_query_sar_config_common(struct rtw89_dev *rtwdev, u32 center_freq, s32 *cfg) { struct rtw89_sar_cfg_common *rtwsar = &rtwdev->sar.cfg_common; - const struct rtw89_sar_span *span = NULL; enum rtw89_sar_subband subband_l, subband_h; - int idx; - - if (center_freq >= RTW89_SAR_6GHZ_SPAN_HEAD) { - idx = RTW89_SAR_6GHZ_SPAN_IDX(center_freq); - /* To decrease size of rtw89_sar_overlapping_6ghz[], - * RTW89_SAR_6GHZ_SPAN_IDX() truncates the leading NULLs - * to make first span as index 0 of the table. So, if center - * frequency is less than the first one, it will get netative. - */ - if (idx >= 0 && idx < ARRAY_SIZE(rtw89_sar_overlapping_6ghz)) - span = &rtw89_sar_overlapping_6ghz[idx]; - } + const struct rtw89_6ghz_span *span; + + span = rtw89_get_6ghz_span(rtwdev, center_freq); if (span && RTW89_SAR_SPAN_VALID(span)) { - subband_l = span->subband_low; - subband_h = span->subband_high; + subband_l = span->sar_subband_low; + subband_h = span->sar_subband_high; } else { subband_l = rtw89_sar_get_subband(rtwdev, center_freq); subband_h = subband_l; From f0f08a4456b5040e45282a59d9c4ea9f39cd2ef5 Mon Sep 17 00:00:00 2001 From: Kuan-Chung Chen Date: Mon, 11 Nov 2024 14:51:31 +0800 Subject: [PATCH 0026/1450] wifi: rtw89: introduce dynamic antenna gain feature Dynamic Antenna Gain (DAG) adjusts the transmit power based on the platform's antenna gain. This allows for higher transmit power when the antenna gain is lower, while still complying with regulatory limits. The driver reads the Realtek Antenna Gain (RTAG) data from BIOS, and DAG is only enabled when the regulatory domain allows it. Currently, it only supports 8852BE/8852BTE/8852CE. Signed-off-by: Kuan-Chung Chen Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241111065132.19587-3-pkshih@realtek.com --- drivers/net/wireless/realtek/rtw89/acpi.c | 47 ++++ drivers/net/wireless/realtek/rtw89/acpi.h | 9 + drivers/net/wireless/realtek/rtw89/core.c | 3 + drivers/net/wireless/realtek/rtw89/core.h | 33 +++ drivers/net/wireless/realtek/rtw89/debug.c | 4 + drivers/net/wireless/realtek/rtw89/phy.c | 216 +++++++++++++++++- drivers/net/wireless/realtek/rtw89/phy.h | 3 + drivers/net/wireless/realtek/rtw89/rtw8851b.c | 1 + drivers/net/wireless/realtek/rtw89/rtw8852a.c | 1 + drivers/net/wireless/realtek/rtw89/rtw8852b.c | 1 + .../net/wireless/realtek/rtw89/rtw8852bt.c | 1 + drivers/net/wireless/realtek/rtw89/rtw8852c.c | 1 + drivers/net/wireless/realtek/rtw89/rtw8922a.c | 1 + 13 files changed, 317 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw89/acpi.c b/drivers/net/wireless/realtek/rtw89/acpi.c index 908e980a4b72b..f5dedb12c1291 100644 --- a/drivers/net/wireless/realtek/rtw89/acpi.c +++ b/drivers/net/wireless/realtek/rtw89/acpi.c @@ -148,3 +148,50 @@ int rtw89_acpi_evaluate_dsm(struct rtw89_dev *rtwdev, ACPI_FREE(obj); return ret; } + +int rtw89_acpi_evaluate_rtag(struct rtw89_dev *rtwdev, + struct rtw89_acpi_rtag_result *res) +{ + struct acpi_buffer buf = {ACPI_ALLOCATE_BUFFER, NULL}; + acpi_handle root, handle; + union acpi_object *obj; + acpi_status status; + u32 buf_len; + int ret = 0; + + root = ACPI_HANDLE(rtwdev->dev); + if (!root) + return -EOPNOTSUPP; + + status = acpi_get_handle(root, (acpi_string)"RTAG", &handle); + if (ACPI_FAILURE(status)) + return -EIO; + + status = acpi_evaluate_object(handle, NULL, NULL, &buf); + if (ACPI_FAILURE(status)) + return -EIO; + + obj = buf.pointer; + if (obj->type != ACPI_TYPE_BUFFER) { + rtw89_debug(rtwdev, RTW89_DBG_ACPI, + "acpi: expect buffer but type: %d\n", obj->type); + ret = -EINVAL; + goto out; + } + + buf_len = obj->buffer.length; + if (buf_len != sizeof(*res)) { + rtw89_debug(rtwdev, RTW89_DBG_ACPI, "%s: invalid buffer length: %u\n", + __func__, buf_len); + ret = -EINVAL; + goto out; + } + + *res = *(struct rtw89_acpi_rtag_result *)obj->buffer.pointer; + + rtw89_hex_dump(rtwdev, RTW89_DBG_ACPI, "antenna_gain: ", res, sizeof(*res)); + +out: + ACPI_FREE(obj); + return ret; +} diff --git a/drivers/net/wireless/realtek/rtw89/acpi.h b/drivers/net/wireless/realtek/rtw89/acpi.h index d274be1775bf3..b43ab106e44df 100644 --- a/drivers/net/wireless/realtek/rtw89/acpi.h +++ b/drivers/net/wireless/realtek/rtw89/acpi.h @@ -63,8 +63,17 @@ struct rtw89_acpi_dsm_result { } u; }; +struct rtw89_acpi_rtag_result { + u8 tag[4]; + u8 revision; + __le32 domain; + u8 ant_gain_table[RTW89_ANT_GAIN_CHAIN_NUM][RTW89_ANT_GAIN_SUBBAND_NR]; +} __packed; + int rtw89_acpi_evaluate_dsm(struct rtw89_dev *rtwdev, enum rtw89_acpi_dsm_func func, struct rtw89_acpi_dsm_result *res); +int rtw89_acpi_evaluate_rtag(struct rtw89_dev *rtwdev, + struct rtw89_acpi_rtag_result *res); #endif diff --git a/drivers/net/wireless/realtek/rtw89/core.c b/drivers/net/wireless/realtek/rtw89/core.c index f73704fc5f85e..34034f44c050c 100644 --- a/drivers/net/wireless/realtek/rtw89/core.c +++ b/drivers/net/wireless/realtek/rtw89/core.c @@ -211,6 +211,8 @@ static const struct ieee80211_iface_combination rtw89_iface_combs[] = { [RTW89_6GHZ_SPAN_IDX(center_freq)] = { \ .sar_subband_low = RTW89_SAR_6GHZ_ ## subband_l, \ .sar_subband_high = RTW89_SAR_6GHZ_ ## subband_h, \ + .ant_gain_subband_low = RTW89_ANT_GAIN_6GHZ_ ## subband_l, \ + .ant_gain_subband_high = RTW89_ANT_GAIN_6GHZ_ ## subband_h, \ } /* Since 6GHz subbands are not edge aligned, some cases span two subbands. @@ -4802,6 +4804,7 @@ int rtw89_core_init(struct rtw89_dev *rtwdev) rtw89_ser_init(rtwdev); rtw89_entity_init(rtwdev); rtw89_tas_init(rtwdev); + rtw89_phy_ant_gain_init(rtwdev); return 0; } diff --git a/drivers/net/wireless/realtek/rtw89/core.h b/drivers/net/wireless/realtek/rtw89/core.h index f76c05513d3c4..bf7aff4268968 100644 --- a/drivers/net/wireless/realtek/rtw89/core.h +++ b/drivers/net/wireless/realtek/rtw89/core.h @@ -4255,6 +4255,7 @@ struct rtw89_chip_info { u16 support_bandwidths; bool support_unii4; bool support_rnr; + bool support_ant_gain; bool ul_tb_waveform_ctrl; bool ul_tb_pwr_diff; bool hw_sec_hdr; @@ -4597,12 +4598,43 @@ struct rtw89_sar_info { }; }; +enum rtw89_ant_gain_subband { + RTW89_ANT_GAIN_2GHZ_SUBBAND, + RTW89_ANT_GAIN_5GHZ_SUBBAND_1, /* U-NII-1 */ + RTW89_ANT_GAIN_5GHZ_SUBBAND_2, /* U-NII-2 */ + RTW89_ANT_GAIN_5GHZ_SUBBAND_2E, /* U-NII-2-Extended */ + RTW89_ANT_GAIN_5GHZ_SUBBAND_3_4, /* U-NII-3 and U-NII-4 */ + RTW89_ANT_GAIN_6GHZ_SUBBAND_5_L, /* U-NII-5 lower part */ + RTW89_ANT_GAIN_6GHZ_SUBBAND_5_H, /* U-NII-5 higher part */ + RTW89_ANT_GAIN_6GHZ_SUBBAND_6, /* U-NII-6 */ + RTW89_ANT_GAIN_6GHZ_SUBBAND_7_L, /* U-NII-7 lower part */ + RTW89_ANT_GAIN_6GHZ_SUBBAND_7_H, /* U-NII-7 higher part */ + RTW89_ANT_GAIN_6GHZ_SUBBAND_8, /* U-NII-8 */ + + RTW89_ANT_GAIN_SUBBAND_NR, +}; + +enum rtw89_ant_gain_domain_type { + RTW89_ANT_GAIN_ETSI = 0, + + RTW89_ANT_GAIN_DOMAIN_NUM, +}; + +#define RTW89_ANT_GAIN_CHAIN_NUM 2 +struct rtw89_ant_gain_info { + s8 offset[RTW89_ANT_GAIN_CHAIN_NUM][RTW89_ANT_GAIN_SUBBAND_NR]; + u32 regd_enabled; +}; + struct rtw89_6ghz_span { enum rtw89_sar_subband sar_subband_low; enum rtw89_sar_subband sar_subband_high; + enum rtw89_ant_gain_subband ant_gain_subband_low; + enum rtw89_ant_gain_subband ant_gain_subband_high; }; #define RTW89_SAR_SPAN_VALID(span) ((span)->sar_subband_high) +#define RTW89_ANT_GAIN_SPAN_VALID(span) ((span)->ant_gain_subband_high) enum rtw89_tas_state { RTW89_TAS_STATE_DPR_OFF, @@ -5643,6 +5675,7 @@ struct rtw89_dev { struct rtw89_regulatory_info regulatory; struct rtw89_sar_info sar; struct rtw89_tas_info tas; + struct rtw89_ant_gain_info ant_gain; struct rtw89_btc btc; enum rtw89_ps_mode ps_mode; diff --git a/drivers/net/wireless/realtek/rtw89/debug.c b/drivers/net/wireless/realtek/rtw89/debug.c index 6abd88fa80baf..fd86752d86f34 100644 --- a/drivers/net/wireless/realtek/rtw89/debug.c +++ b/drivers/net/wireless/realtek/rtw89/debug.c @@ -9,6 +9,7 @@ #include "fw.h" #include "mac.h" #include "pci.h" +#include "phy.h" #include "ps.h" #include "reg.h" #include "sar.h" @@ -882,6 +883,9 @@ static int rtw89_debug_priv_txpwr_table_get(struct seq_file *m, void *v) seq_puts(m, "[TAS]\n"); rtw89_print_tas(m, rtwdev); + seq_puts(m, "[DAG]\n"); + rtw89_print_ant_gain(m, rtwdev, chan); + tbl = dbgfs_txpwr_tables[chip_gen]; if (!tbl) { ret = -EOPNOTSUPP; diff --git a/drivers/net/wireless/realtek/rtw89/phy.c b/drivers/net/wireless/realtek/rtw89/phy.c index f24aca663cf00..e9a635c43a815 100644 --- a/drivers/net/wireless/realtek/rtw89/phy.c +++ b/drivers/net/wireless/realtek/rtw89/phy.c @@ -2,6 +2,7 @@ /* Copyright(c) 2019-2020 Realtek Corporation */ +#include "acpi.h" #include "chan.h" #include "coex.h" #include "debug.h" @@ -1854,6 +1855,211 @@ void rtw89_phy_write_reg3_tbl(struct rtw89_dev *rtwdev, } EXPORT_SYMBOL(rtw89_phy_write_reg3_tbl); +static u8 rtw89_phy_ant_gain_domain_to_regd(struct rtw89_dev *rtwdev, u8 ant_gain_regd) +{ + switch (ant_gain_regd) { + case RTW89_ANT_GAIN_ETSI: + return RTW89_ETSI; + default: + rtw89_debug(rtwdev, RTW89_DBG_TXPWR, + "unknown antenna gain domain: %d\n", + ant_gain_regd); + return RTW89_REGD_NUM; + } +} + +/* antenna gain in unit of 0.25 dbm */ +#define RTW89_ANT_GAIN_2GHZ_MIN -8 +#define RTW89_ANT_GAIN_2GHZ_MAX 14 +#define RTW89_ANT_GAIN_5GHZ_MIN -8 +#define RTW89_ANT_GAIN_5GHZ_MAX 20 +#define RTW89_ANT_GAIN_6GHZ_MIN -8 +#define RTW89_ANT_GAIN_6GHZ_MAX 20 + +#define RTW89_ANT_GAIN_REF_2GHZ 14 +#define RTW89_ANT_GAIN_REF_5GHZ 20 +#define RTW89_ANT_GAIN_REF_6GHZ 20 + +void rtw89_phy_ant_gain_init(struct rtw89_dev *rtwdev) +{ + struct rtw89_ant_gain_info *ant_gain = &rtwdev->ant_gain; + const struct rtw89_chip_info *chip = rtwdev->chip; + struct rtw89_acpi_rtag_result res = {}; + u32 domain; + int ret; + u8 i, j; + u8 regd; + u8 val; + + if (!chip->support_ant_gain) + return; + + ret = rtw89_acpi_evaluate_rtag(rtwdev, &res); + if (ret) { + rtw89_debug(rtwdev, RTW89_DBG_TXPWR, + "acpi: cannot eval rtag: %d\n", ret); + return; + } + + if (res.revision != 0) { + rtw89_debug(rtwdev, RTW89_DBG_TXPWR, + "unknown rtag revision: %d\n", res.revision); + return; + } + + domain = get_unaligned_le32(&res.domain); + + for (i = 0; i < RTW89_ANT_GAIN_DOMAIN_NUM; i++) { + if (!(domain & BIT(i))) + continue; + + regd = rtw89_phy_ant_gain_domain_to_regd(rtwdev, i); + if (regd >= RTW89_REGD_NUM) + continue; + ant_gain->regd_enabled |= BIT(regd); + } + + for (i = 0; i < RTW89_ANT_GAIN_CHAIN_NUM; i++) { + for (j = 0; j < RTW89_ANT_GAIN_SUBBAND_NR; j++) { + val = res.ant_gain_table[i][j]; + switch (j) { + default: + case RTW89_ANT_GAIN_2GHZ_SUBBAND: + val = RTW89_ANT_GAIN_REF_2GHZ - + clamp_t(s8, val, + RTW89_ANT_GAIN_2GHZ_MIN, + RTW89_ANT_GAIN_2GHZ_MAX); + break; + case RTW89_ANT_GAIN_5GHZ_SUBBAND_1: + case RTW89_ANT_GAIN_5GHZ_SUBBAND_2: + case RTW89_ANT_GAIN_5GHZ_SUBBAND_2E: + case RTW89_ANT_GAIN_5GHZ_SUBBAND_3_4: + val = RTW89_ANT_GAIN_REF_5GHZ - + clamp_t(s8, val, + RTW89_ANT_GAIN_5GHZ_MIN, + RTW89_ANT_GAIN_5GHZ_MAX); + break; + case RTW89_ANT_GAIN_6GHZ_SUBBAND_5_L: + case RTW89_ANT_GAIN_6GHZ_SUBBAND_5_H: + case RTW89_ANT_GAIN_6GHZ_SUBBAND_6: + case RTW89_ANT_GAIN_6GHZ_SUBBAND_7_L: + case RTW89_ANT_GAIN_6GHZ_SUBBAND_7_H: + case RTW89_ANT_GAIN_6GHZ_SUBBAND_8: + val = RTW89_ANT_GAIN_REF_6GHZ - + clamp_t(s8, val, + RTW89_ANT_GAIN_6GHZ_MIN, + RTW89_ANT_GAIN_6GHZ_MAX); + } + ant_gain->offset[i][j] = val; + } + } +} + +static +enum rtw89_ant_gain_subband rtw89_phy_ant_gain_get_subband(struct rtw89_dev *rtwdev, + u32 center_freq) +{ + switch (center_freq) { + default: + rtw89_debug(rtwdev, RTW89_DBG_TXPWR, + "center freq: %u to antenna gain subband is unhandled\n", + center_freq); + fallthrough; + case 2412 ... 2484: + return RTW89_ANT_GAIN_2GHZ_SUBBAND; + case 5180 ... 5240: + return RTW89_ANT_GAIN_5GHZ_SUBBAND_1; + case 5250 ... 5320: + return RTW89_ANT_GAIN_5GHZ_SUBBAND_2; + case 5500 ... 5720: + return RTW89_ANT_GAIN_5GHZ_SUBBAND_2E; + case 5745 ... 5885: + return RTW89_ANT_GAIN_5GHZ_SUBBAND_3_4; + case 5955 ... 6155: + return RTW89_ANT_GAIN_6GHZ_SUBBAND_5_L; + case 6175 ... 6415: + return RTW89_ANT_GAIN_6GHZ_SUBBAND_5_H; + case 6435 ... 6515: + return RTW89_ANT_GAIN_6GHZ_SUBBAND_6; + case 6535 ... 6695: + return RTW89_ANT_GAIN_6GHZ_SUBBAND_7_L; + case 6715 ... 6855: + return RTW89_ANT_GAIN_6GHZ_SUBBAND_7_H; + + /* freq 6875 (ch 185, 20MHz) spans RTW89_ANT_GAIN_6GHZ_SUBBAND_7_H + * and RTW89_ANT_GAIN_6GHZ_SUBBAND_8, so directly describe it with + * struct rtw89_6ghz_span. + */ + + case 6895 ... 7115: + return RTW89_ANT_GAIN_6GHZ_SUBBAND_8; + } +} + +static s8 rtw89_phy_ant_gain_query(struct rtw89_dev *rtwdev, + enum rtw89_rf_path path, u32 center_freq) +{ + struct rtw89_ant_gain_info *ant_gain = &rtwdev->ant_gain; + enum rtw89_ant_gain_subband subband_l, subband_h; + const struct rtw89_6ghz_span *span; + + span = rtw89_get_6ghz_span(rtwdev, center_freq); + + if (span && RTW89_ANT_GAIN_SPAN_VALID(span)) { + subband_l = span->ant_gain_subband_low; + subband_h = span->ant_gain_subband_high; + } else { + subband_l = rtw89_phy_ant_gain_get_subband(rtwdev, center_freq); + subband_h = subband_l; + } + + rtw89_debug(rtwdev, RTW89_DBG_TXPWR, + "center_freq %u: antenna gain subband {%u, %u}\n", + center_freq, subband_l, subband_h); + + return min(ant_gain->offset[path][subband_l], + ant_gain->offset[path][subband_h]); +} + +static s8 rtw89_phy_ant_gain_offset(struct rtw89_dev *rtwdev, u8 band, u32 center_freq) +{ + struct rtw89_ant_gain_info *ant_gain = &rtwdev->ant_gain; + const struct rtw89_chip_info *chip = rtwdev->chip; + u8 regd = rtw89_regd_get(rtwdev, band); + s8 offset_patha, offset_pathb; + + if (!chip->support_ant_gain) + return 0; + + if (!(ant_gain->regd_enabled & BIT(regd))) + return 0; + + offset_patha = rtw89_phy_ant_gain_query(rtwdev, RF_PATH_A, center_freq); + offset_pathb = rtw89_phy_ant_gain_query(rtwdev, RF_PATH_B, center_freq); + + return max(offset_patha, offset_pathb); +} + +void rtw89_print_ant_gain(struct seq_file *m, struct rtw89_dev *rtwdev, + const struct rtw89_chan *chan) +{ + struct rtw89_ant_gain_info *ant_gain = &rtwdev->ant_gain; + const struct rtw89_chip_info *chip = rtwdev->chip; + u8 regd = rtw89_regd_get(rtwdev, chan->band_type); + s8 offset_patha, offset_pathb; + + if (!chip->support_ant_gain || !(ant_gain->regd_enabled & BIT(regd))) { + seq_puts(m, "no DAG is applied\n"); + return; + } + + offset_patha = rtw89_phy_ant_gain_query(rtwdev, RF_PATH_A, chan->freq); + offset_pathb = rtw89_phy_ant_gain_query(rtwdev, RF_PATH_B, chan->freq); + + seq_printf(m, "ChainA offset: %d dBm\n", offset_patha); + seq_printf(m, "ChainB offset: %d dBm\n", offset_pathb); +} + static const u8 rtw89_rs_idx_num_ax[] = { [RTW89_RS_CCK] = RTW89_RATE_CCK_NUM, [RTW89_RS_OFDM] = RTW89_RATE_OFDM_NUM, @@ -2027,7 +2233,7 @@ s8 rtw89_phy_read_txpwr_limit(struct rtw89_dev *rtwdev, u8 band, u8 ch_idx = rtw89_channel_to_idx(rtwdev, band, ch); u8 regd = rtw89_regd_get(rtwdev, band); u8 reg6 = regulatory->reg_6ghz_power; - s8 lmt = 0, sar; + s8 lmt = 0, sar, offset; s8 cstr; switch (band) { @@ -2059,7 +2265,8 @@ s8 rtw89_phy_read_txpwr_limit(struct rtw89_dev *rtwdev, u8 band, return 0; } - lmt = rtw89_phy_txpwr_rf_to_mac(rtwdev, lmt); + offset = rtw89_phy_ant_gain_offset(rtwdev, band, freq); + lmt = rtw89_phy_txpwr_rf_to_mac(rtwdev, lmt + offset); sar = rtw89_query_sar(rtwdev, freq); cstr = rtw89_phy_get_tpe_constraint(rtwdev, band); @@ -2286,7 +2493,7 @@ s8 rtw89_phy_read_txpwr_limit_ru(struct rtw89_dev *rtwdev, u8 band, u8 ch_idx = rtw89_channel_to_idx(rtwdev, band, ch); u8 regd = rtw89_regd_get(rtwdev, band); u8 reg6 = regulatory->reg_6ghz_power; - s8 lmt_ru = 0, sar; + s8 lmt_ru = 0, sar, offset; s8 cstr; switch (band) { @@ -2318,7 +2525,8 @@ s8 rtw89_phy_read_txpwr_limit_ru(struct rtw89_dev *rtwdev, u8 band, return 0; } - lmt_ru = rtw89_phy_txpwr_rf_to_mac(rtwdev, lmt_ru); + offset = rtw89_phy_ant_gain_offset(rtwdev, band, freq); + lmt_ru = rtw89_phy_txpwr_rf_to_mac(rtwdev, lmt_ru + offset); sar = rtw89_query_sar(rtwdev, freq); cstr = rtw89_phy_get_tpe_constraint(rtwdev, band); diff --git a/drivers/net/wireless/realtek/rtw89/phy.h b/drivers/net/wireless/realtek/rtw89/phy.h index c683f4d7d29b0..2720cabfafe4b 100644 --- a/drivers/net/wireless/realtek/rtw89/phy.h +++ b/drivers/net/wireless/realtek/rtw89/phy.h @@ -826,6 +826,9 @@ s8 *rtw89_phy_raw_byr_seek(struct rtw89_dev *rtwdev, const struct rtw89_rate_desc *desc); s8 rtw89_phy_read_txpwr_byrate(struct rtw89_dev *rtwdev, u8 band, u8 bw, const struct rtw89_rate_desc *rate_desc); +void rtw89_phy_ant_gain_init(struct rtw89_dev *rtwdev); +void rtw89_print_ant_gain(struct seq_file *m, struct rtw89_dev *rtwdev, + const struct rtw89_chan *chan); void rtw89_phy_load_txpwr_byrate(struct rtw89_dev *rtwdev, const struct rtw89_txpwr_table *tbl); s8 rtw89_phy_read_txpwr_limit(struct rtw89_dev *rtwdev, u8 band, diff --git a/drivers/net/wireless/realtek/rtw89/rtw8851b.c b/drivers/net/wireless/realtek/rtw89/rtw8851b.c index 68c67a763f4d5..29c697a251d33 100644 --- a/drivers/net/wireless/realtek/rtw89/rtw8851b.c +++ b/drivers/net/wireless/realtek/rtw89/rtw8851b.c @@ -2479,6 +2479,7 @@ const struct rtw89_chip_info rtw8851b_chip_info = { BIT(NL80211_CHAN_WIDTH_40) | BIT(NL80211_CHAN_WIDTH_80), .support_unii4 = true, + .support_ant_gain = false, .ul_tb_waveform_ctrl = true, .ul_tb_pwr_diff = false, .hw_sec_hdr = false, diff --git a/drivers/net/wireless/realtek/rtw89/rtw8852a.c b/drivers/net/wireless/realtek/rtw89/rtw8852a.c index e647759ebd69c..fd59ee9f36265 100644 --- a/drivers/net/wireless/realtek/rtw89/rtw8852a.c +++ b/drivers/net/wireless/realtek/rtw89/rtw8852a.c @@ -2196,6 +2196,7 @@ const struct rtw89_chip_info rtw8852a_chip_info = { BIT(NL80211_CHAN_WIDTH_40) | BIT(NL80211_CHAN_WIDTH_80), .support_unii4 = false, + .support_ant_gain = false, .ul_tb_waveform_ctrl = false, .ul_tb_pwr_diff = false, .hw_sec_hdr = false, diff --git a/drivers/net/wireless/realtek/rtw89/rtw8852b.c b/drivers/net/wireless/realtek/rtw89/rtw8852b.c index 49a3191283168..76667d4f36312 100644 --- a/drivers/net/wireless/realtek/rtw89/rtw8852b.c +++ b/drivers/net/wireless/realtek/rtw89/rtw8852b.c @@ -834,6 +834,7 @@ const struct rtw89_chip_info rtw8852b_chip_info = { BIT(NL80211_CHAN_WIDTH_40) | BIT(NL80211_CHAN_WIDTH_80), .support_unii4 = true, + .support_ant_gain = true, .ul_tb_waveform_ctrl = true, .ul_tb_pwr_diff = false, .hw_sec_hdr = false, diff --git a/drivers/net/wireless/realtek/rtw89/rtw8852bt.c b/drivers/net/wireless/realtek/rtw89/rtw8852bt.c index 8767251332283..d8c0fb87b625e 100644 --- a/drivers/net/wireless/realtek/rtw89/rtw8852bt.c +++ b/drivers/net/wireless/realtek/rtw89/rtw8852bt.c @@ -767,6 +767,7 @@ const struct rtw89_chip_info rtw8852bt_chip_info = { BIT(NL80211_CHAN_WIDTH_40) | BIT(NL80211_CHAN_WIDTH_80), .support_unii4 = true, + .support_ant_gain = true, .ul_tb_waveform_ctrl = true, .ul_tb_pwr_diff = false, .hw_sec_hdr = false, diff --git a/drivers/net/wireless/realtek/rtw89/rtw8852c.c b/drivers/net/wireless/realtek/rtw89/rtw8852c.c index cde34f8e1e67f..b639710107519 100644 --- a/drivers/net/wireless/realtek/rtw89/rtw8852c.c +++ b/drivers/net/wireless/realtek/rtw89/rtw8852c.c @@ -2976,6 +2976,7 @@ const struct rtw89_chip_info rtw8852c_chip_info = { BIT(NL80211_CHAN_WIDTH_80) | BIT(NL80211_CHAN_WIDTH_160), .support_unii4 = true, + .support_ant_gain = true, .ul_tb_waveform_ctrl = false, .ul_tb_pwr_diff = true, .hw_sec_hdr = true, diff --git a/drivers/net/wireless/realtek/rtw89/rtw8922a.c b/drivers/net/wireless/realtek/rtw89/rtw8922a.c index 9a4db04a1967b..b3879b4859181 100644 --- a/drivers/net/wireless/realtek/rtw89/rtw8922a.c +++ b/drivers/net/wireless/realtek/rtw89/rtw8922a.c @@ -2746,6 +2746,7 @@ const struct rtw89_chip_info rtw8922a_chip_info = { BIT(NL80211_CHAN_WIDTH_80) | BIT(NL80211_CHAN_WIDTH_160), .support_unii4 = true, + .support_ant_gain = false, .ul_tb_waveform_ctrl = false, .ul_tb_pwr_diff = false, .hw_sec_hdr = true, From 50191eace88c4163c8990984a8a156f09f226d39 Mon Sep 17 00:00:00 2001 From: Kuan-Chung Chen Date: Mon, 11 Nov 2024 14:51:32 +0800 Subject: [PATCH 0027/1450] wifi: rtw89: handle different TX power between RF path The dynamic antenna gain (DAG) may independently apply different TX powers for each RF path. This can be accomplished by using the larger TX power as the reference path and adjusting the TX power of the other path based on the difference. Currently only 8852BE/8852BTE/ 8852CE are supported. Signed-off-by: Kuan-Chung Chen Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241111065132.19587-4-pkshih@realtek.com --- drivers/net/wireless/realtek/rtw89/core.h | 1 + drivers/net/wireless/realtek/rtw89/phy.c | 31 ++++++------ drivers/net/wireless/realtek/rtw89/phy.h | 23 +++++++++ drivers/net/wireless/realtek/rtw89/rtw8851b.c | 1 + drivers/net/wireless/realtek/rtw89/rtw8852a.c | 1 + drivers/net/wireless/realtek/rtw89/rtw8852b.c | 1 + .../wireless/realtek/rtw89/rtw8852b_common.c | 46 +++++++++++------- .../net/wireless/realtek/rtw89/rtw8852bt.c | 1 + drivers/net/wireless/realtek/rtw89/rtw8852c.c | 47 ++++++++++++------- drivers/net/wireless/realtek/rtw89/rtw8922a.c | 1 + 10 files changed, 107 insertions(+), 46 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw89/core.h b/drivers/net/wireless/realtek/rtw89/core.h index bf7aff4268968..ecccb51184be9 100644 --- a/drivers/net/wireless/realtek/rtw89/core.h +++ b/drivers/net/wireless/realtek/rtw89/core.h @@ -4297,6 +4297,7 @@ struct rtw89_chip_info { const struct rtw89_rfe_parms *dflt_parms; const struct rtw89_chanctx_listener *chanctx_listener; + u8 txpwr_factor_bb; u8 txpwr_factor_rf; u8 txpwr_factor_mac; diff --git a/drivers/net/wireless/realtek/rtw89/phy.c b/drivers/net/wireless/realtek/rtw89/phy.c index e9a635c43a815..be2f5338c3a0b 100644 --- a/drivers/net/wireless/realtek/rtw89/phy.c +++ b/drivers/net/wireless/realtek/rtw89/phy.c @@ -2040,6 +2040,23 @@ static s8 rtw89_phy_ant_gain_offset(struct rtw89_dev *rtwdev, u8 band, u32 cente return max(offset_patha, offset_pathb); } +s16 rtw89_phy_ant_gain_pwr_offset(struct rtw89_dev *rtwdev, + const struct rtw89_chan *chan) +{ + struct rtw89_ant_gain_info *ant_gain = &rtwdev->ant_gain; + u8 regd = rtw89_regd_get(rtwdev, chan->band_type); + s8 offset_patha, offset_pathb; + + if (!(ant_gain->regd_enabled & BIT(regd))) + return 0; + + offset_patha = rtw89_phy_ant_gain_query(rtwdev, RF_PATH_A, chan->freq); + offset_pathb = rtw89_phy_ant_gain_query(rtwdev, RF_PATH_B, chan->freq); + + return rtw89_phy_txpwr_rf_to_bb(rtwdev, offset_patha - offset_pathb); +} +EXPORT_SYMBOL(rtw89_phy_ant_gain_pwr_offset); + void rtw89_print_ant_gain(struct seq_file *m, struct rtw89_dev *rtwdev, const struct rtw89_chan *chan) { @@ -2123,20 +2140,6 @@ void rtw89_phy_load_txpwr_byrate(struct rtw89_dev *rtwdev, } EXPORT_SYMBOL(rtw89_phy_load_txpwr_byrate); -static s8 rtw89_phy_txpwr_rf_to_mac(struct rtw89_dev *rtwdev, s8 txpwr_rf) -{ - const struct rtw89_chip_info *chip = rtwdev->chip; - - return txpwr_rf >> (chip->txpwr_factor_rf - chip->txpwr_factor_mac); -} - -static s8 rtw89_phy_txpwr_dbm_to_mac(struct rtw89_dev *rtwdev, s8 dbm) -{ - const struct rtw89_chip_info *chip = rtwdev->chip; - - return clamp_t(s16, dbm << chip->txpwr_factor_mac, -64, 63); -} - static s8 rtw89_phy_txpwr_dbm_without_tolerance(s8 dbm) { const u8 tssi_deviation_point = 0; diff --git a/drivers/net/wireless/realtek/rtw89/phy.h b/drivers/net/wireless/realtek/rtw89/phy.h index 2720cabfafe4b..f4ef7f5fb0814 100644 --- a/drivers/net/wireless/realtek/rtw89/phy.h +++ b/drivers/net/wireless/realtek/rtw89/phy.h @@ -827,6 +827,8 @@ s8 *rtw89_phy_raw_byr_seek(struct rtw89_dev *rtwdev, s8 rtw89_phy_read_txpwr_byrate(struct rtw89_dev *rtwdev, u8 band, u8 bw, const struct rtw89_rate_desc *rate_desc); void rtw89_phy_ant_gain_init(struct rtw89_dev *rtwdev); +s16 rtw89_phy_ant_gain_pwr_offset(struct rtw89_dev *rtwdev, + const struct rtw89_chan *chan); void rtw89_print_ant_gain(struct seq_file *m, struct rtw89_dev *rtwdev, const struct rtw89_chan *chan); void rtw89_phy_load_txpwr_byrate(struct rtw89_dev *rtwdev, @@ -899,6 +901,27 @@ void rtw89_phy_set_txpwr_limit_ru(struct rtw89_dev *rtwdev, phy->set_txpwr_limit_ru(rtwdev, chan, phy_idx); } +static inline s8 rtw89_phy_txpwr_rf_to_bb(struct rtw89_dev *rtwdev, s8 txpwr_rf) +{ + const struct rtw89_chip_info *chip = rtwdev->chip; + + return txpwr_rf << (chip->txpwr_factor_bb - chip->txpwr_factor_rf); +} + +static inline s8 rtw89_phy_txpwr_rf_to_mac(struct rtw89_dev *rtwdev, s8 txpwr_rf) +{ + const struct rtw89_chip_info *chip = rtwdev->chip; + + return txpwr_rf >> (chip->txpwr_factor_rf - chip->txpwr_factor_mac); +} + +static inline s8 rtw89_phy_txpwr_dbm_to_mac(struct rtw89_dev *rtwdev, s8 dbm) +{ + const struct rtw89_chip_info *chip = rtwdev->chip; + + return clamp_t(s16, dbm << chip->txpwr_factor_mac, -64, 63); +} + void rtw89_phy_ra_assoc(struct rtw89_dev *rtwdev, struct rtw89_sta_link *rtwsta_link); void rtw89_phy_ra_update(struct rtw89_dev *rtwdev); void rtw89_phy_ra_update_sta(struct rtw89_dev *rtwdev, struct ieee80211_sta *sta, diff --git a/drivers/net/wireless/realtek/rtw89/rtw8851b.c b/drivers/net/wireless/realtek/rtw89/rtw8851b.c index 29c697a251d33..1ed4e64cbd2c8 100644 --- a/drivers/net/wireless/realtek/rtw89/rtw8851b.c +++ b/drivers/net/wireless/realtek/rtw89/rtw8851b.c @@ -2464,6 +2464,7 @@ const struct rtw89_chip_info rtw8851b_chip_info = { .nctl_post_table = &rtw8851b_nctl_post_defs_tbl, .dflt_parms = &rtw89_8851b_dflt_parms, .rfe_parms_conf = rtw89_8851b_rfe_parms_conf, + .txpwr_factor_bb = 3, .txpwr_factor_rf = 2, .txpwr_factor_mac = 1, .dig_table = NULL, diff --git a/drivers/net/wireless/realtek/rtw89/rtw8852a.c b/drivers/net/wireless/realtek/rtw89/rtw8852a.c index fd59ee9f36265..a7105a288bc47 100644 --- a/drivers/net/wireless/realtek/rtw89/rtw8852a.c +++ b/drivers/net/wireless/realtek/rtw89/rtw8852a.c @@ -2181,6 +2181,7 @@ const struct rtw89_chip_info rtw8852a_chip_info = { .nctl_post_table = NULL, .dflt_parms = &rtw89_8852a_dflt_parms, .rfe_parms_conf = NULL, + .txpwr_factor_bb = 3, .txpwr_factor_rf = 2, .txpwr_factor_mac = 1, .dig_table = &rtw89_8852a_phy_dig_table, diff --git a/drivers/net/wireless/realtek/rtw89/rtw8852b.c b/drivers/net/wireless/realtek/rtw89/rtw8852b.c index 76667d4f36312..ebc853a905dd2 100644 --- a/drivers/net/wireless/realtek/rtw89/rtw8852b.c +++ b/drivers/net/wireless/realtek/rtw89/rtw8852b.c @@ -819,6 +819,7 @@ const struct rtw89_chip_info rtw8852b_chip_info = { .nctl_post_table = NULL, .dflt_parms = &rtw89_8852b_dflt_parms, .rfe_parms_conf = NULL, + .txpwr_factor_bb = 3, .txpwr_factor_rf = 2, .txpwr_factor_mac = 1, .dig_table = NULL, diff --git a/drivers/net/wireless/realtek/rtw89/rtw8852b_common.c b/drivers/net/wireless/realtek/rtw89/rtw8852b_common.c index f4aa4437fb759..012739d97f71a 100644 --- a/drivers/net/wireless/realtek/rtw89/rtw8852b_common.c +++ b/drivers/net/wireless/realtek/rtw89/rtw8852b_common.c @@ -1206,24 +1206,25 @@ void __rtw8852bx_set_channel_bb(struct rtw89_dev *rtwdev, const struct rtw89_cha } static u32 rtw8852bx_bb_cal_txpwr_ref(struct rtw89_dev *rtwdev, - enum rtw89_phy_idx phy_idx, s16 ref) + enum rtw89_phy_idx phy_idx, + s16 ref, u16 pwr_ofst_decrease) { const u16 tssi_16dbm_cw = 0x12c; const u8 base_cw_0db = 0x27; - const s8 ofst_int = 0; s16 pwr_s10_3; s16 rf_pwr_cw; u16 bb_pwr_cw; u32 pwr_cw; u32 tssi_ofst_cw; - pwr_s10_3 = (ref << 1) + (s16)(ofst_int) + (s16)(base_cw_0db << 3); + pwr_s10_3 = (ref << 1) + (s16)(base_cw_0db << 3) - pwr_ofst_decrease; bb_pwr_cw = u16_get_bits(pwr_s10_3, GENMASK(2, 0)); rf_pwr_cw = u16_get_bits(pwr_s10_3, GENMASK(8, 3)); rf_pwr_cw = clamp_t(s16, rf_pwr_cw, 15, 63); pwr_cw = (rf_pwr_cw << 3) | bb_pwr_cw; - tssi_ofst_cw = (u32)((s16)tssi_16dbm_cw + (ref << 1) - (16 << 3)); + tssi_ofst_cw = (u32)((s16)tssi_16dbm_cw + (ref << 1) - (16 << 3)) - + pwr_ofst_decrease; rtw89_debug(rtwdev, RTW89_DBG_TXPWR, "[TXPWR] tssi_ofst_cw=%d rf_cw=0x%x bb_cw=0x%x\n", tssi_ofst_cw, rf_pwr_cw, bb_pwr_cw); @@ -1234,10 +1235,11 @@ static u32 rtw8852bx_bb_cal_txpwr_ref(struct rtw89_dev *rtwdev, } static void rtw8852bx_set_txpwr_ref(struct rtw89_dev *rtwdev, - enum rtw89_phy_idx phy_idx) + enum rtw89_phy_idx phy_idx, s16 pwr_ofst) { static const u32 addr[RF_PATH_NUM_8852BX] = {0x5800, 0x7800}; const u32 mask = B_DPD_TSSI_CW | B_DPD_PWR_CW | B_DPD_REF; + u16 ofst_dec[RF_PATH_NUM_8852BX]; const u8 ofst_ofdm = 0x4; const u8 ofst_cck = 0x8; const s16 ref_ofdm = 0; @@ -1250,19 +1252,20 @@ static void rtw8852bx_set_txpwr_ref(struct rtw89_dev *rtwdev, rtw89_mac_txpwr_write32_mask(rtwdev, phy_idx, R_AX_PWR_RATE_CTRL, B_AX_PWR_REF, 0x0); - rtw89_debug(rtwdev, RTW89_DBG_TXPWR, "[TXPWR] set bb ofdm txpwr ref\n"); - val = rtw8852bx_bb_cal_txpwr_ref(rtwdev, phy_idx, ref_ofdm); + ofst_dec[RF_PATH_A] = pwr_ofst > 0 ? 0 : abs(pwr_ofst); + ofst_dec[RF_PATH_B] = pwr_ofst > 0 ? pwr_ofst : 0; - for (i = 0; i < RF_PATH_NUM_8852BX; i++) - rtw89_phy_write32_idx(rtwdev, addr[i] + ofst_ofdm, mask, val, - phy_idx); + rtw89_debug(rtwdev, RTW89_DBG_TXPWR, "[TXPWR] set bb ofdm txpwr ref\n"); + for (i = 0; i < RF_PATH_NUM_8852BX; i++) { + val = rtw8852bx_bb_cal_txpwr_ref(rtwdev, phy_idx, ref_ofdm, ofst_dec[i]); + rtw89_phy_write32_idx(rtwdev, addr[i] + ofst_ofdm, mask, val, phy_idx); + } rtw89_debug(rtwdev, RTW89_DBG_TXPWR, "[TXPWR] set bb cck txpwr ref\n"); - val = rtw8852bx_bb_cal_txpwr_ref(rtwdev, phy_idx, ref_cck); - - for (i = 0; i < RF_PATH_NUM_8852BX; i++) - rtw89_phy_write32_idx(rtwdev, addr[i] + ofst_cck, mask, val, - phy_idx); + for (i = 0; i < RF_PATH_NUM_8852BX; i++) { + val = rtw8852bx_bb_cal_txpwr_ref(rtwdev, phy_idx, ref_cck, ofst_dec[i]); + rtw89_phy_write32_idx(rtwdev, addr[i] + ofst_cck, mask, val, phy_idx); + } } static void rtw8852bx_bb_set_tx_shape_dfir(struct rtw89_dev *rtwdev, @@ -1333,6 +1336,16 @@ static void rtw8852bx_set_tx_shape(struct rtw89_dev *rtwdev, tx_shape_ofdm); } +static void rtw8852bx_set_txpwr_diff(struct rtw89_dev *rtwdev, + const struct rtw89_chan *chan, + enum rtw89_phy_idx phy_idx) +{ + s16 pwr_ofst; + + pwr_ofst = rtw89_phy_ant_gain_pwr_offset(rtwdev, chan); + rtw8852bx_set_txpwr_ref(rtwdev, phy_idx, pwr_ofst); +} + static void __rtw8852bx_set_txpwr(struct rtw89_dev *rtwdev, const struct rtw89_chan *chan, enum rtw89_phy_idx phy_idx) @@ -1342,12 +1355,13 @@ static void __rtw8852bx_set_txpwr(struct rtw89_dev *rtwdev, rtw8852bx_set_tx_shape(rtwdev, chan, phy_idx); rtw89_phy_set_txpwr_limit(rtwdev, chan, phy_idx); rtw89_phy_set_txpwr_limit_ru(rtwdev, chan, phy_idx); + rtw8852bx_set_txpwr_diff(rtwdev, chan, phy_idx); } static void __rtw8852bx_set_txpwr_ctrl(struct rtw89_dev *rtwdev, enum rtw89_phy_idx phy_idx) { - rtw8852bx_set_txpwr_ref(rtwdev, phy_idx); + rtw8852bx_set_txpwr_ref(rtwdev, phy_idx, 0); } static diff --git a/drivers/net/wireless/realtek/rtw89/rtw8852bt.c b/drivers/net/wireless/realtek/rtw89/rtw8852bt.c index d8c0fb87b625e..cd1385ff80030 100644 --- a/drivers/net/wireless/realtek/rtw89/rtw8852bt.c +++ b/drivers/net/wireless/realtek/rtw89/rtw8852bt.c @@ -752,6 +752,7 @@ const struct rtw89_chip_info rtw8852bt_chip_info = { .nctl_post_table = NULL, .dflt_parms = NULL, .rfe_parms_conf = NULL, + .txpwr_factor_bb = 3, .txpwr_factor_rf = 2, .txpwr_factor_mac = 1, .dig_table = NULL, diff --git a/drivers/net/wireless/realtek/rtw89/rtw8852c.c b/drivers/net/wireless/realtek/rtw89/rtw8852c.c index b639710107519..c7d39499ca75a 100644 --- a/drivers/net/wireless/realtek/rtw89/rtw8852c.c +++ b/drivers/net/wireless/realtek/rtw89/rtw8852c.c @@ -1882,9 +1882,9 @@ static void rtw8852c_rfk_track(struct rtw89_dev *rtwdev) } static u32 rtw8852c_bb_cal_txpwr_ref(struct rtw89_dev *rtwdev, - enum rtw89_phy_idx phy_idx, s16 ref) + enum rtw89_phy_idx phy_idx, + s16 ref, u16 pwr_ofst_decrease) { - s8 ofst_int = 0; u8 base_cw_0db = 0x27; u16 tssi_16dbm_cw = 0x12c; s16 pwr_s10_3 = 0; @@ -1893,13 +1893,14 @@ static u32 rtw8852c_bb_cal_txpwr_ref(struct rtw89_dev *rtwdev, u32 pwr_cw = 0; u32 tssi_ofst_cw = 0; - pwr_s10_3 = (ref << 1) + (s16)(ofst_int) + (s16)(base_cw_0db << 3); + pwr_s10_3 = (ref << 1) + (s16)(base_cw_0db << 3) - pwr_ofst_decrease; bb_pwr_cw = FIELD_GET(GENMASK(2, 0), pwr_s10_3); rf_pwr_cw = FIELD_GET(GENMASK(8, 3), pwr_s10_3); rf_pwr_cw = clamp_t(s16, rf_pwr_cw, 15, 63); pwr_cw = (rf_pwr_cw << 3) | bb_pwr_cw; - tssi_ofst_cw = (u32)((s16)tssi_16dbm_cw + (ref << 1) - (16 << 3)); + tssi_ofst_cw = (u32)((s16)tssi_16dbm_cw + (ref << 1) - (16 << 3)) - + pwr_ofst_decrease; rtw89_debug(rtwdev, RTW89_DBG_TXPWR, "[TXPWR] tssi_ofst_cw=%d rf_cw=0x%x bb_cw=0x%x\n", tssi_ofst_cw, rf_pwr_cw, bb_pwr_cw); @@ -1943,9 +1944,10 @@ void rtw8852c_set_txpwr_ul_tb_offset(struct rtw89_dev *rtwdev, } static void rtw8852c_set_txpwr_ref(struct rtw89_dev *rtwdev, - enum rtw89_phy_idx phy_idx) + enum rtw89_phy_idx phy_idx, s16 pwr_ofst) { static const u32 addr[RF_PATH_NUM_8852C] = {0x5800, 0x7800}; + u16 ofst_dec[RF_PATH_NUM_8852C]; const u32 mask = 0x7FFFFFF; const u8 ofst_ofdm = 0x4; const u8 ofst_cck = 0x8; @@ -1959,19 +1961,20 @@ static void rtw8852c_set_txpwr_ref(struct rtw89_dev *rtwdev, rtw89_mac_txpwr_write32_mask(rtwdev, phy_idx, R_AX_PWR_RATE_CTRL, GENMASK(27, 10), 0x0); - rtw89_debug(rtwdev, RTW89_DBG_TXPWR, "[TXPWR] set bb ofdm txpwr ref\n"); - val = rtw8852c_bb_cal_txpwr_ref(rtwdev, phy_idx, ref_ofdm); + ofst_dec[RF_PATH_A] = pwr_ofst > 0 ? 0 : abs(pwr_ofst); + ofst_dec[RF_PATH_B] = pwr_ofst > 0 ? pwr_ofst : 0; - for (i = 0; i < RF_PATH_NUM_8852C; i++) - rtw89_phy_write32_idx(rtwdev, addr[i] + ofst_ofdm, mask, val, - phy_idx); + rtw89_debug(rtwdev, RTW89_DBG_TXPWR, "[TXPWR] set bb ofdm txpwr ref\n"); + for (i = 0; i < RF_PATH_NUM_8852C; i++) { + val = rtw8852c_bb_cal_txpwr_ref(rtwdev, phy_idx, ref_ofdm, ofst_dec[i]); + rtw89_phy_write32_idx(rtwdev, addr[i] + ofst_ofdm, mask, val, phy_idx); + } rtw89_debug(rtwdev, RTW89_DBG_TXPWR, "[TXPWR] set bb cck txpwr ref\n"); - val = rtw8852c_bb_cal_txpwr_ref(rtwdev, phy_idx, ref_cck); - - for (i = 0; i < RF_PATH_NUM_8852C; i++) - rtw89_phy_write32_idx(rtwdev, addr[i] + ofst_cck, mask, val, - phy_idx); + for (i = 0; i < RF_PATH_NUM_8852C; i++) { + val = rtw8852c_bb_cal_txpwr_ref(rtwdev, phy_idx, ref_cck, ofst_dec[i]); + rtw89_phy_write32_idx(rtwdev, addr[i] + ofst_cck, mask, val, phy_idx); + } } static void rtw8852c_bb_set_tx_shape_dfir(struct rtw89_dev *rtwdev, @@ -2052,6 +2055,16 @@ static void rtw8852c_set_tx_shape(struct rtw89_dev *rtwdev, B_P1_DAC_COMP_POST_DPD_EN); } +static void rtw8852c_set_txpwr_diff(struct rtw89_dev *rtwdev, + const struct rtw89_chan *chan, + enum rtw89_phy_idx phy_idx) +{ + s16 pwr_ofst; + + pwr_ofst = rtw89_phy_ant_gain_pwr_offset(rtwdev, chan); + rtw8852c_set_txpwr_ref(rtwdev, phy_idx, pwr_ofst); +} + static void rtw8852c_set_txpwr(struct rtw89_dev *rtwdev, const struct rtw89_chan *chan, enum rtw89_phy_idx phy_idx) @@ -2061,12 +2074,13 @@ static void rtw8852c_set_txpwr(struct rtw89_dev *rtwdev, rtw8852c_set_tx_shape(rtwdev, chan, phy_idx); rtw89_phy_set_txpwr_limit(rtwdev, chan, phy_idx); rtw89_phy_set_txpwr_limit_ru(rtwdev, chan, phy_idx); + rtw8852c_set_txpwr_diff(rtwdev, chan, phy_idx); } static void rtw8852c_set_txpwr_ctrl(struct rtw89_dev *rtwdev, enum rtw89_phy_idx phy_idx) { - rtw8852c_set_txpwr_ref(rtwdev, phy_idx); + rtw8852c_set_txpwr_ref(rtwdev, phy_idx, 0); } static void @@ -2959,6 +2973,7 @@ const struct rtw89_chip_info rtw8852c_chip_info = { .dflt_parms = &rtw89_8852c_dflt_parms, .rfe_parms_conf = NULL, .chanctx_listener = &rtw8852c_chanctx_listener, + .txpwr_factor_bb = 3, .txpwr_factor_rf = 2, .txpwr_factor_mac = 1, .dig_table = NULL, diff --git a/drivers/net/wireless/realtek/rtw89/rtw8922a.c b/drivers/net/wireless/realtek/rtw89/rtw8922a.c index b3879b4859181..a5333099668a1 100644 --- a/drivers/net/wireless/realtek/rtw89/rtw8922a.c +++ b/drivers/net/wireless/realtek/rtw89/rtw8922a.c @@ -2729,6 +2729,7 @@ const struct rtw89_chip_info rtw8922a_chip_info = { .nctl_post_table = NULL, .dflt_parms = NULL, /* load parm from fw */ .rfe_parms_conf = NULL, /* load parm from fw */ + .txpwr_factor_bb = 3, .txpwr_factor_rf = 2, .txpwr_factor_mac = 1, .dig_table = NULL, From 31be3175bd7be89e39c82b3973c9d4ff55a17583 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 7 Nov 2024 15:08:33 +0100 Subject: [PATCH 0028/1450] wifi: rtl8xxxu: add more missing rtl8192cu USB IDs The rtl8xxxu has all the rtl8192cu USB IDs from rtlwifi/rtl8192cu/sw.c except for the following 10, add these to the untested section so they can be used with the rtl8xxxu as the rtl8192cu are well supported. This fixes these wifi modules not working on distributions which have disabled CONFIG_RTL8192CU replacing it with CONFIG_RTL8XXXU_UNTESTED, like Fedora. Closes: https://bugzilla.redhat.com/show_bug.cgi?id=2321540 Cc: stable@vger.kernel.org Cc: Peter Robinson Signed-off-by: Hans de Goede Reviewed-by: Peter Robinson Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241107140833.274986-1-hdegoede@redhat.com --- drivers/net/wireless/realtek/rtl8xxxu/core.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/drivers/net/wireless/realtek/rtl8xxxu/core.c b/drivers/net/wireless/realtek/rtl8xxxu/core.c index f95898f68d68a..4ce0c05c51291 100644 --- a/drivers/net/wireless/realtek/rtl8xxxu/core.c +++ b/drivers/net/wireless/realtek/rtl8xxxu/core.c @@ -8147,6 +8147,8 @@ static const struct usb_device_id dev_table[] = { .driver_info = (unsigned long)&rtl8192cu_fops}, {USB_DEVICE_AND_INTERFACE_INFO(USB_VENDOR_ID_REALTEK, 0x817e, 0xff, 0xff, 0xff), .driver_info = (unsigned long)&rtl8192cu_fops}, +{USB_DEVICE_AND_INTERFACE_INFO(USB_VENDOR_ID_REALTEK, 0x8186, 0xff, 0xff, 0xff), + .driver_info = (unsigned long)&rtl8192cu_fops}, {USB_DEVICE_AND_INTERFACE_INFO(USB_VENDOR_ID_REALTEK, 0x818a, 0xff, 0xff, 0xff), .driver_info = (unsigned long)&rtl8192cu_fops}, {USB_DEVICE_AND_INTERFACE_INFO(USB_VENDOR_ID_REALTEK, 0x317f, 0xff, 0xff, 0xff), @@ -8157,12 +8159,18 @@ static const struct usb_device_id dev_table[] = { .driver_info = (unsigned long)&rtl8192cu_fops}, {USB_DEVICE_AND_INTERFACE_INFO(0x050d, 0x1102, 0xff, 0xff, 0xff), .driver_info = (unsigned long)&rtl8192cu_fops}, +{USB_DEVICE_AND_INTERFACE_INFO(0x050d, 0x11f2, 0xff, 0xff, 0xff), + .driver_info = (unsigned long)&rtl8192cu_fops}, {USB_DEVICE_AND_INTERFACE_INFO(0x06f8, 0xe033, 0xff, 0xff, 0xff), .driver_info = (unsigned long)&rtl8192cu_fops}, +{USB_DEVICE_AND_INTERFACE_INFO(0x07b8, 0x8188, 0xff, 0xff, 0xff), + .driver_info = (unsigned long)&rtl8192cu_fops}, {USB_DEVICE_AND_INTERFACE_INFO(0x07b8, 0x8189, 0xff, 0xff, 0xff), .driver_info = (unsigned long)&rtl8192cu_fops}, {USB_DEVICE_AND_INTERFACE_INFO(0x0846, 0x9041, 0xff, 0xff, 0xff), .driver_info = (unsigned long)&rtl8192cu_fops}, +{USB_DEVICE_AND_INTERFACE_INFO(0x0846, 0x9043, 0xff, 0xff, 0xff), + .driver_info = (unsigned long)&rtl8192cu_fops}, {USB_DEVICE_AND_INTERFACE_INFO(0x0b05, 0x17ba, 0xff, 0xff, 0xff), .driver_info = (unsigned long)&rtl8192cu_fops}, {USB_DEVICE_AND_INTERFACE_INFO(USB_VENDOR_ID_REALTEK, 0x1e1e, 0xff, 0xff, 0xff), @@ -8179,6 +8187,10 @@ static const struct usb_device_id dev_table[] = { .driver_info = (unsigned long)&rtl8192cu_fops}, {USB_DEVICE_AND_INTERFACE_INFO(0x13d3, 0x3357, 0xff, 0xff, 0xff), .driver_info = (unsigned long)&rtl8192cu_fops}, +{USB_DEVICE_AND_INTERFACE_INFO(0x13d3, 0x3358, 0xff, 0xff, 0xff), + .driver_info = (unsigned long)&rtl8192cu_fops}, +{USB_DEVICE_AND_INTERFACE_INFO(0x13d3, 0x3359, 0xff, 0xff, 0xff), + .driver_info = (unsigned long)&rtl8192cu_fops}, {USB_DEVICE_AND_INTERFACE_INFO(0x2001, 0x330b, 0xff, 0xff, 0xff), .driver_info = (unsigned long)&rtl8192cu_fops}, {USB_DEVICE_AND_INTERFACE_INFO(0x2019, 0x4902, 0xff, 0xff, 0xff), @@ -8193,6 +8205,8 @@ static const struct usb_device_id dev_table[] = { .driver_info = (unsigned long)&rtl8192cu_fops}, {USB_DEVICE_AND_INTERFACE_INFO(0x4856, 0x0091, 0xff, 0xff, 0xff), .driver_info = (unsigned long)&rtl8192cu_fops}, +{USB_DEVICE_AND_INTERFACE_INFO(0x9846, 0x9041, 0xff, 0xff, 0xff), + .driver_info = (unsigned long)&rtl8192cu_fops}, {USB_DEVICE_AND_INTERFACE_INFO(0xcdab, 0x8010, 0xff, 0xff, 0xff), .driver_info = (unsigned long)&rtl8192cu_fops}, {USB_DEVICE_AND_INTERFACE_INFO(0x04f2, 0xaff7, 0xff, 0xff, 0xff), @@ -8218,6 +8232,8 @@ static const struct usb_device_id dev_table[] = { .driver_info = (unsigned long)&rtl8192cu_fops}, {USB_DEVICE_AND_INTERFACE_INFO(0x0586, 0x341f, 0xff, 0xff, 0xff), .driver_info = (unsigned long)&rtl8192cu_fops}, +{USB_DEVICE_AND_INTERFACE_INFO(0x06f8, 0xe033, 0xff, 0xff, 0xff), + .driver_info = (unsigned long)&rtl8192cu_fops}, {USB_DEVICE_AND_INTERFACE_INFO(0x06f8, 0xe035, 0xff, 0xff, 0xff), .driver_info = (unsigned long)&rtl8192cu_fops}, {USB_DEVICE_AND_INTERFACE_INFO(0x0b05, 0x17ab, 0xff, 0xff, 0xff), @@ -8226,6 +8242,8 @@ static const struct usb_device_id dev_table[] = { .driver_info = (unsigned long)&rtl8192cu_fops}, {USB_DEVICE_AND_INTERFACE_INFO(0x0df6, 0x0070, 0xff, 0xff, 0xff), .driver_info = (unsigned long)&rtl8192cu_fops}, +{USB_DEVICE_AND_INTERFACE_INFO(0x0df6, 0x0077, 0xff, 0xff, 0xff), + .driver_info = (unsigned long)&rtl8192cu_fops}, {USB_DEVICE_AND_INTERFACE_INFO(0x0789, 0x016d, 0xff, 0xff, 0xff), .driver_info = (unsigned long)&rtl8192cu_fops}, {USB_DEVICE_AND_INTERFACE_INFO(0x07aa, 0x0056, 0xff, 0xff, 0xff), @@ -8248,6 +8266,8 @@ static const struct usb_device_id dev_table[] = { .driver_info = (unsigned long)&rtl8192cu_fops}, {USB_DEVICE_AND_INTERFACE_INFO(0x2001, 0x330a, 0xff, 0xff, 0xff), .driver_info = (unsigned long)&rtl8192cu_fops}, +{USB_DEVICE_AND_INTERFACE_INFO(0x2001, 0x330d, 0xff, 0xff, 0xff), + .driver_info = (unsigned long)&rtl8192cu_fops}, {USB_DEVICE_AND_INTERFACE_INFO(0x2019, 0xab2b, 0xff, 0xff, 0xff), .driver_info = (unsigned long)&rtl8192cu_fops}, {USB_DEVICE_AND_INTERFACE_INFO(0x20f4, 0x624d, 0xff, 0xff, 0xff), From 2c87309ea741341c6722efdf1fb3f50dd427c823 Mon Sep 17 00:00:00 2001 From: Keisuke Nishimura Date: Tue, 29 Oct 2024 19:27:12 +0100 Subject: [PATCH 0029/1450] ieee802154: ca8210: Add missing check for kfifo_alloc() in ca8210_probe() ca8210_test_interface_init() returns the result of kfifo_alloc(), which can be non-zero in case of an error. The caller, ca8210_probe(), should check the return value and do error-handling if it fails. Fixes: ded845a781a5 ("ieee802154: Add CA8210 IEEE 802.15.4 device driver") Signed-off-by: Keisuke Nishimura Reviewed-by: Simon Horman Reviewed-by: Miquel Raynal Link: https://lore.kernel.org/20241029182712.318271-1-keisuke.nishimura@inria.fr Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/ca8210.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ieee802154/ca8210.c b/drivers/net/ieee802154/ca8210.c index e685a7f946f0f..753215ebc67c7 100644 --- a/drivers/net/ieee802154/ca8210.c +++ b/drivers/net/ieee802154/ca8210.c @@ -3072,7 +3072,11 @@ static int ca8210_probe(struct spi_device *spi_device) spi_set_drvdata(priv->spi, priv); if (IS_ENABLED(CONFIG_IEEE802154_CA8210_DEBUGFS)) { cascoda_api_upstream = ca8210_test_int_driver_write; - ca8210_test_interface_init(priv); + ret = ca8210_test_interface_init(priv); + if (ret) { + dev_crit(&spi_device->dev, "ca8210_test_interface_init failed\n"); + goto error; + } } else { cascoda_api_upstream = NULL; } From eb09fbeb48709fe66c0d708aed81e910a577a30a Mon Sep 17 00:00:00 2001 From: Lizhi Xu Date: Wed, 13 Nov 2024 17:51:29 +0800 Subject: [PATCH 0030/1450] mac802154: check local interfaces before deleting sdata list syzkaller reported a corrupted list in ieee802154_if_remove. [1] Remove an IEEE 802.15.4 network interface after unregister an IEEE 802.15.4 hardware device from the system. CPU0 CPU1 ==== ==== genl_family_rcv_msg_doit ieee802154_unregister_hw ieee802154_del_iface ieee802154_remove_interfaces rdev_del_virtual_intf_deprecated list_del(&sdata->list) ieee802154_if_remove list_del_rcu The net device has been unregistered, since the rcu grace period, unregistration must be run before ieee802154_if_remove. To avoid this issue, add a check for local->interfaces before deleting sdata list. [1] kernel BUG at lib/list_debug.c:58! Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 0 UID: 0 PID: 6277 Comm: syz-executor157 Not tainted 6.12.0-rc6-syzkaller-00005-g557329bcecc2 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 RIP: 0010:__list_del_entry_valid_or_report+0xf4/0x140 lib/list_debug.c:56 Code: e8 a1 7e 00 07 90 0f 0b 48 c7 c7 e0 37 60 8c 4c 89 fe e8 8f 7e 00 07 90 0f 0b 48 c7 c7 40 38 60 8c 4c 89 fe e8 7d 7e 00 07 90 <0f> 0b 48 c7 c7 a0 38 60 8c 4c 89 fe e8 6b 7e 00 07 90 0f 0b 48 c7 RSP: 0018:ffffc9000490f3d0 EFLAGS: 00010246 RAX: 000000000000004e RBX: dead000000000122 RCX: d211eee56bb28d00 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: ffff88805b278dd8 R08: ffffffff8174a12c R09: 1ffffffff2852f0d R10: dffffc0000000000 R11: fffffbfff2852f0e R12: dffffc0000000000 R13: dffffc0000000000 R14: dead000000000100 R15: ffff88805b278cc0 FS: 0000555572f94380(0000) GS:ffff8880b8600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000056262e4a3000 CR3: 0000000078496000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __list_del_entry_valid include/linux/list.h:124 [inline] __list_del_entry include/linux/list.h:215 [inline] list_del_rcu include/linux/rculist.h:157 [inline] ieee802154_if_remove+0x86/0x1e0 net/mac802154/iface.c:687 rdev_del_virtual_intf_deprecated net/ieee802154/rdev-ops.h:24 [inline] ieee802154_del_iface+0x2c0/0x5c0 net/ieee802154/nl-phy.c:323 genl_family_rcv_msg_doit net/netlink/genetlink.c:1115 [inline] genl_family_rcv_msg net/netlink/genetlink.c:1195 [inline] genl_rcv_msg+0xb14/0xec0 net/netlink/genetlink.c:1210 netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2551 genl_rcv+0x28/0x40 net/netlink/genetlink.c:1219 netlink_unicast_kernel net/netlink/af_netlink.c:1331 [inline] netlink_unicast+0x7f6/0x990 net/netlink/af_netlink.c:1357 netlink_sendmsg+0x8e4/0xcb0 net/netlink/af_netlink.c:1901 sock_sendmsg_nosec net/socket.c:729 [inline] __sock_sendmsg+0x221/0x270 net/socket.c:744 ____sys_sendmsg+0x52a/0x7e0 net/socket.c:2607 ___sys_sendmsg net/socket.c:2661 [inline] __sys_sendmsg+0x292/0x380 net/socket.c:2690 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Reported-and-tested-by: syzbot+985f827280dc3a6e7e92@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=985f827280dc3a6e7e92 Signed-off-by: Lizhi Xu Reviewed-by: Miquel Raynal Link: https://lore.kernel.org/20241113095129.1457225-1-lizhi.xu@windriver.com Signed-off-by: Stefan Schmidt --- net/mac802154/iface.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index c0e2da5072bea..9e4631fade90c 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -684,6 +684,10 @@ void ieee802154_if_remove(struct ieee802154_sub_if_data *sdata) ASSERT_RTNL(); mutex_lock(&sdata->local->iflist_mtx); + if (list_empty(&sdata->local->interfaces)) { + mutex_unlock(&sdata->local->iflist_mtx); + return; + } list_del_rcu(&sdata->list); mutex_unlock(&sdata->local->iflist_mtx); From b5f871ab4913b2403a7cdcbcde16d39d0b071fb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Wed, 6 Nov 2024 13:41:44 +0100 Subject: [PATCH 0031/1450] wifi: ath9k: Add RX inactivity detection and reset chip when it occurs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some ath9k chips can, seemingly at random, end up in a state which can be described as "deaf". No or nearly no interrupts are generated anymore for incoming packets. Existing links either break down after a while and new links will not be established. The circumstances leading to this "deafness" is still unclear, but some particular chips (especially 2-stream 11n SoCs, but also others) can go 'deaf' when running AP or mesh (or both) after some time. It's probably a hardware issue, and doing a channel scan to trigger a chip reset (which one normally can't do on an AP interface) recovers the hardware. The only way the driver can detect this state, is by detecting if there has been no RX activity for a while. In this case we can proactively reset the chip (which only takes a small number of milliseconds, so shouldn't interrupt things too much if it has been idle for several seconds), which functions as a workaround. OpenWrt, and various derivatives, have been carrying versions of this workaround for years, that were never upstreamed. One version[0], written by Felix Fietkau, used a simple counter and only reset if there was precisely zero RX activity for a long period of time. This had the problem that in some cases a small number of interrupts would appear even if the device was otherwise not responsive. For this reason, another version[1], written by Simon Wunderlich and Sven Eckelmann, used a time-based approach to calculate the average number of RX interrupts over a longer (four-second) interval, and reset the chip when seeing less than one interrupt per second over this period. However, that version relied on debugfs counters to keep track of the number of interrupts, which means it didn't work at all if debugfs was not enabled. This patch unifies the two versions: it uses the same approach as Felix' patch to count the number of RX handler invocations, but uses the same time-based windowing approach as Simon and Sven's patch to still handle the case where occasional interrupts appear but the device is otherwise deaf. Since this is based on ideas by all three people, but not actually directly derived from any of the patches, I'm including Suggested-by tags from Simon, Sven and Felix below, which should hopefully serve as proper credit. [0] https://patchwork.kernel.org/project/linux-wireless/patch/20170125163654.66431-3-nbd@nbd.name/ [1] https://patchwork.kernel.org/project/linux-wireless/patch/20161117083614.19188-2-sven.eckelmann@open-mesh.com/ Suggested-by: Simon Wunderlich Suggested-by: Sven Eckelmann Suggested-by: Felix Fietkau Signed-off-by: Toke Høiland-Jørgensen Tested-by: Sven Eckelmann Reviewed-by: Sven Eckelmann Tested-by: Issam Hamdi Acked-by: Simon Wunderlich Link: https://patch.msgid.link/20241106-ath9k-deaf-detection-v1-1-736a150d2425@redhat.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath9k/ath9k.h | 2 ++ drivers/net/wireless/ath/ath9k/debug.c | 1 + drivers/net/wireless/ath/ath9k/debug.h | 1 + drivers/net/wireless/ath/ath9k/link.c | 33 ++++++++++++++++++++++++-- drivers/net/wireless/ath/ath9k/main.c | 1 + 5 files changed, 36 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/ath/ath9k/ath9k.h b/drivers/net/wireless/ath/ath9k/ath9k.h index 29ca65a732a66..bcfc8df0efe5b 100644 --- a/drivers/net/wireless/ath/ath9k/ath9k.h +++ b/drivers/net/wireless/ath/ath9k/ath9k.h @@ -1018,6 +1018,8 @@ struct ath_softc { u8 gtt_cnt; u32 intrstatus; + u32 rx_active_check_time; + u32 rx_active_count; u16 ps_flags; /* PS_* */ bool ps_enabled; bool ps_idle; diff --git a/drivers/net/wireless/ath/ath9k/debug.c b/drivers/net/wireless/ath/ath9k/debug.c index eff894958a738..74a0134075cf9 100644 --- a/drivers/net/wireless/ath/ath9k/debug.c +++ b/drivers/net/wireless/ath/ath9k/debug.c @@ -750,6 +750,7 @@ static int read_file_reset(struct seq_file *file, void *data) [RESET_TYPE_CALIBRATION] = "Calibration error", [RESET_TX_DMA_ERROR] = "Tx DMA stop error", [RESET_RX_DMA_ERROR] = "Rx DMA stop error", + [RESET_TYPE_RX_INACTIVE] = "Rx path inactive", }; int i; diff --git a/drivers/net/wireless/ath/ath9k/debug.h b/drivers/net/wireless/ath/ath9k/debug.h index 389459c04d14a..cb3e75969875a 100644 --- a/drivers/net/wireless/ath/ath9k/debug.h +++ b/drivers/net/wireless/ath/ath9k/debug.h @@ -53,6 +53,7 @@ enum ath_reset_type { RESET_TYPE_CALIBRATION, RESET_TX_DMA_ERROR, RESET_RX_DMA_ERROR, + RESET_TYPE_RX_INACTIVE, __RESET_TYPE_MAX }; diff --git a/drivers/net/wireless/ath/ath9k/link.c b/drivers/net/wireless/ath/ath9k/link.c index d1e5767aab3cb..d078a59d7d3cd 100644 --- a/drivers/net/wireless/ath/ath9k/link.c +++ b/drivers/net/wireless/ath/ath9k/link.c @@ -50,7 +50,36 @@ static bool ath_tx_complete_check(struct ath_softc *sc) "tx hung, resetting the chip\n"); ath9k_queue_reset(sc, RESET_TYPE_TX_HANG); return false; +} + +#define RX_INACTIVE_CHECK_INTERVAL (4 * MSEC_PER_SEC) + +static bool ath_hw_rx_inactive_check(struct ath_softc *sc) +{ + struct ath_common *common = ath9k_hw_common(sc->sc_ah); + u32 interval, count; + + interval = jiffies_to_msecs(jiffies - sc->rx_active_check_time); + count = sc->rx_active_count; + + if (interval < RX_INACTIVE_CHECK_INTERVAL) + return true; /* too soon to check */ + sc->rx_active_count = 0; + sc->rx_active_check_time = jiffies; + + /* Need at least one interrupt per second, and we should only react if + * we are within a factor two of the expected interval + */ + if (interval > RX_INACTIVE_CHECK_INTERVAL * 2 || + count >= interval / MSEC_PER_SEC) + return true; + + ath_dbg(common, RESET, + "RX inactivity detected. Schedule chip reset\n"); + ath9k_queue_reset(sc, RESET_TYPE_RX_INACTIVE); + + return false; } void ath_hw_check_work(struct work_struct *work) @@ -58,8 +87,8 @@ void ath_hw_check_work(struct work_struct *work) struct ath_softc *sc = container_of(work, struct ath_softc, hw_check_work.work); - if (!ath_hw_check(sc) || - !ath_tx_complete_check(sc)) + if (!ath_hw_check(sc) || !ath_tx_complete_check(sc) || + !ath_hw_rx_inactive_check(sc)) return; ieee80211_queue_delayed_work(sc->hw, &sc->hw_check_work, diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c index dd79107828588..5a3309bda1bdc 100644 --- a/drivers/net/wireless/ath/ath9k/main.c +++ b/drivers/net/wireless/ath/ath9k/main.c @@ -453,6 +453,7 @@ void ath9k_tasklet(struct tasklet_struct *t) ath_rx_tasklet(sc, 0, true); ath_rx_tasklet(sc, 0, false); + sc->rx_active_count++; } if (status & ATH9K_INT_TX) { From a93d125ebfdd530ea1980a45b7ad2e9471b82c87 Mon Sep 17 00:00:00 2001 From: Dinesh Karthikeyan Date: Fri, 15 Nov 2024 11:58:51 +0530 Subject: [PATCH 0032/1450] wifi: ath12k: Support Downlink Pager Stats Add support to request downlink pager stats from firmware through HTT stats type 36. These stats give paging information like number of pages, their timestamp, number of locked and free pages, synchronous and asynchronous locked pages. Note: MCC firmware version - WLAN.HMT.1.0-03427-QCAHMTSWPL_V1.0_V2.0_SILICONZ-1.15378.4 responds to the event requesting stats, but it does not give any data. Sample output: ------------- echo 36 > /sys/kernel/debug/ath12k/pci-0000\:06\:00.0/mac0/htt_stats_type cat /sys/kernel/debug/ath12k/pci-0000\:06\:00.0/mac0/htt_stats HTT_DLPAGER_STATS_TLV: ASYNC locked pages = 2 SYNC locked pages = 0 Total locked pages = 2 Total free pages = 127 LOCKED PAGES HISTORY last_locked_page_idx = 0 Index - 0 ; Page Number - 8495 ; Num of pages - 1 ; Timestamp - 4031009360us Index - 1 ; Page Number - 7219 ; Num of pages - 2 ; Timestamp - 885379515us Index - 2 ; Page Number - 0 ; Num of pages - 0 ; Timestamp - 0us Index - 3 ; Page Number - 0 ; Num of pages - 0 ; Timestamp - 0us ..... UNLOCKED PAGES HISTORY last_unlocked_page_idx = 0 Index - 0 ; Page Number - 7144 ; Num of pages - 2 ; Timestamp - 4032070008us Index - 1 ; Page Number - 7214 ; Num of pages - 2 ; Timestamp - 885379512us Index - 2 ; Page Number - 0 ; Num of pages - 0 ; Timestamp - 0us Index - 3 ; Page Number - 0 ; Num of pages - 0 ; Timestamp - 0us ..... Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.0.1-00029-QCAHKSWPL_SILICONZ-1 Signed-off-by: Dinesh Karthikeyan Signed-off-by: Roopni Devanathan Acked-by: Jeff Johnson Link: https://patch.msgid.link/20241115062854.1919672-2-quic_rdevanat@quicinc.com Signed-off-by: Jeff Johnson --- .../wireless/ath/ath12k/debugfs_htt_stats.c | 86 ++++++++++++++++++- .../wireless/ath/ath12k/debugfs_htt_stats.h | 31 +++++++ 2 files changed, 116 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.c b/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.c index 43ea87e981f42..ba06aed3d880b 100644 --- a/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.c +++ b/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.c @@ -2543,6 +2543,88 @@ ath12k_htt_print_pdev_obss_pd_stats_tlv(const void *tag_buf, u16 tag_len, stats_req->buf_len = len; } +static void ath12k_htt_print_dlpager_entry(const struct ath12k_htt_pgs_info *pg_info, + int idx, char *str_buf) +{ + u64 page_timestamp; + u16 index = 0; + + page_timestamp = ath12k_le32hilo_to_u64(pg_info->ts_msb, pg_info->ts_lsb); + + index += snprintf(&str_buf[index], ATH12K_HTT_MAX_STRING_LEN - index, + "Index - %u ; Page Number - %u ; ", + idx, le32_to_cpu(pg_info->page_num)); + index += snprintf(&str_buf[index], ATH12K_HTT_MAX_STRING_LEN - index, + "Num of pages - %u ; Timestamp - %lluus\n", + le32_to_cpu(pg_info->num_pgs), page_timestamp); +} + +static void +ath12k_htt_print_dlpager_stats_tlv(const void *tag_buf, u16 tag_len, + struct debug_htt_stats_req *stats_req) +{ + const struct ath12k_htt_dl_pager_stats_tlv *stat_buf = tag_buf; + u32 len = stats_req->buf_len; + u32 buf_len = ATH12K_HTT_STATS_BUF_SIZE; + u32 dword_lock, dword_unlock; + int i; + u8 *buf = stats_req->buf; + u8 pg_locked; + u8 pg_unlock; + char str_buf[ATH12K_HTT_MAX_STRING_LEN] = {0}; + + if (tag_len < sizeof(*stat_buf)) + return; + + dword_lock = le32_get_bits(stat_buf->info2, + ATH12K_HTT_DLPAGER_TOTAL_LOCK_PAGES_INFO2); + dword_unlock = le32_get_bits(stat_buf->info2, + ATH12K_HTT_DLPAGER_TOTAL_FREE_PAGES_INFO2); + + pg_locked = ATH12K_HTT_STATS_PAGE_LOCKED; + pg_unlock = ATH12K_HTT_STATS_PAGE_UNLOCKED; + + len += scnprintf(buf + len, buf_len - len, "HTT_DLPAGER_STATS_TLV:\n"); + len += scnprintf(buf + len, buf_len - len, "ASYNC locked pages = %u\n", + le32_get_bits(stat_buf->info0, + ATH12K_HTT_DLPAGER_ASYNC_LOCK_PG_CNT_INFO0)); + len += scnprintf(buf + len, buf_len - len, "SYNC locked pages = %u\n", + le32_get_bits(stat_buf->info0, + ATH12K_HTT_DLPAGER_SYNC_LOCK_PG_CNT_INFO0)); + len += scnprintf(buf + len, buf_len - len, "Total locked pages = %u\n", + le32_get_bits(stat_buf->info1, + ATH12K_HTT_DLPAGER_TOTAL_LOCK_PAGES_INFO1)); + len += scnprintf(buf + len, buf_len - len, "Total free pages = %u\n", + le32_get_bits(stat_buf->info1, + ATH12K_HTT_DLPAGER_TOTAL_FREE_PAGES_INFO1)); + + len += scnprintf(buf + len, buf_len - len, "\nLOCKED PAGES HISTORY\n"); + len += scnprintf(buf + len, buf_len - len, "last_locked_page_idx = %u\n", + dword_lock ? dword_lock - 1 : (ATH12K_PAGER_MAX - 1)); + + for (i = 0; i < ATH12K_PAGER_MAX; i++) { + memset(str_buf, 0x0, ATH12K_HTT_MAX_STRING_LEN); + ath12k_htt_print_dlpager_entry(&stat_buf->pgs_info[pg_locked][i], + i, str_buf); + len += scnprintf(buf + len, buf_len - len, "%s", str_buf); + } + + len += scnprintf(buf + len, buf_len - len, "\nUNLOCKED PAGES HISTORY\n"); + len += scnprintf(buf + len, buf_len - len, "last_unlocked_page_idx = %u\n", + dword_unlock ? dword_unlock - 1 : ATH12K_PAGER_MAX - 1); + + for (i = 0; i < ATH12K_PAGER_MAX; i++) { + memset(str_buf, 0x0, ATH12K_HTT_MAX_STRING_LEN); + ath12k_htt_print_dlpager_entry(&stat_buf->pgs_info[pg_unlock][i], + i, str_buf); + len += scnprintf(buf + len, buf_len - len, "%s", str_buf); + } + + len += scnprintf(buf + len, buf_len - len, "\n"); + + stats_req->buf_len = len; +} + static void ath12k_htt_print_dmac_reset_stats_tlv(const void *tag_buf, u16 tag_len, struct debug_htt_stats_req *stats_req) @@ -2562,7 +2644,6 @@ ath12k_htt_print_dmac_reset_stats_tlv(const void *tag_buf, u16 tag_len, time = ath12k_le32hilo_to_u64(htt_stats_buf->reset_time_hi_ms, htt_stats_buf->reset_time_lo_ms); len += scnprintf(buf + len, buf_len - len, "reset_time_ms = %llu\n", time); - time = ath12k_le32hilo_to_u64(htt_stats_buf->disengage_time_hi_ms, htt_stats_buf->disengage_time_lo_ms); len += scnprintf(buf + len, buf_len - len, "disengage_time_ms = %llu\n", time); @@ -2870,6 +2951,9 @@ static int ath12k_dbg_htt_ext_stats_parse(struct ath12k_base *ab, case HTT_STATS_PDEV_OBSS_PD_TAG: ath12k_htt_print_pdev_obss_pd_stats_tlv(tag_buf, len, stats_req); break; + case HTT_STATS_DLPAGER_STATS_TAG: + ath12k_htt_print_dlpager_stats_tlv(tag_buf, len, stats_req); + break; case HTT_STATS_DMAC_RESET_STATS_TAG: ath12k_htt_print_dmac_reset_stats_tlv(tag_buf, len, stats_req); break; diff --git a/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.h b/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.h index ac86cab234ecb..dfb6538585d51 100644 --- a/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.h +++ b/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.h @@ -135,6 +135,7 @@ enum ath12k_dbg_htt_ext_stats_type { ATH12K_DBG_HTT_EXT_STATS_PDEV_TX_MU = 17, ATH12K_DBG_HTT_EXT_STATS_PDEV_CCA_STATS = 19, ATH12K_DBG_HTT_EXT_STATS_PDEV_OBSS_PD_STATS = 23, + ATH12K_DBG_HTT_EXT_STATS_DLPAGER_STATS = 36, ATH12K_DBG_HTT_EXT_STATS_SOC_ERROR = 45, ATH12K_DBG_HTT_EXT_STATS_PDEV_SCHED_ALGO = 49, ATH12K_DBG_HTT_EXT_STATS_MANDATORY_MUOFDMA = 51, @@ -194,6 +195,7 @@ enum ath12k_dbg_htt_tlv_tag { HTT_STATS_PDEV_CTRL_PATH_TX_STATS_TAG = 102, HTT_STATS_TX_SELFGEN_AC_SCHED_STATUS_STATS_TAG = 111, HTT_STATS_TX_SELFGEN_AX_SCHED_STATUS_STATS_TAG = 112, + HTT_STATS_DLPAGER_STATS_TAG = 120, HTT_STATS_MU_PPDU_DIST_TAG = 129, HTT_STATS_TX_PDEV_MUMIMO_GRP_STATS_TAG = 130, HTT_STATS_TX_PDEV_RATE_STATS_BE_OFDMA_TAG = 135, @@ -1054,6 +1056,35 @@ struct ath12k_htt_pdev_obss_pd_stats_tlv { __le32 num_sr_ppdu_abort_flush_cnt; } __packed; +enum ath12k_htt_stats_page_lock_state { + ATH12K_HTT_STATS_PAGE_LOCKED = 0, + ATH12K_HTT_STATS_PAGE_UNLOCKED = 1, + ATH12K_NUM_PG_LOCK_STATE +}; + +#define ATH12K_PAGER_MAX 10 + +#define ATH12K_HTT_DLPAGER_ASYNC_LOCK_PG_CNT_INFO0 GENMASK(7, 0) +#define ATH12K_HTT_DLPAGER_SYNC_LOCK_PG_CNT_INFO0 GENMASK(15, 8) +#define ATH12K_HTT_DLPAGER_TOTAL_LOCK_PAGES_INFO1 GENMASK(15, 0) +#define ATH12K_HTT_DLPAGER_TOTAL_FREE_PAGES_INFO1 GENMASK(31, 16) +#define ATH12K_HTT_DLPAGER_TOTAL_LOCK_PAGES_INFO2 GENMASK(15, 0) +#define ATH12K_HTT_DLPAGER_TOTAL_FREE_PAGES_INFO2 GENMASK(31, 16) + +struct ath12k_htt_pgs_info { + __le32 page_num; + __le32 num_pgs; + __le32 ts_lsb; + __le32 ts_msb; +} __packed; + +struct ath12k_htt_dl_pager_stats_tlv { + __le32 info0; + __le32 info1; + __le32 info2; + struct ath12k_htt_pgs_info pgs_info[ATH12K_NUM_PG_LOCK_STATE][ATH12K_PAGER_MAX]; +} __packed; + struct ath12k_htt_dmac_reset_stats_tlv { __le32 reset_count; __le32 reset_time_lo_ms; From c3527cdfcdf70dcd613c4582283e28e4f81df9ea Mon Sep 17 00:00:00 2001 From: Dinesh Karthikeyan Date: Fri, 15 Nov 2024 11:58:52 +0530 Subject: [PATCH 0033/1450] wifi: ath12k: Support phy counter and TPC stats Add support to request counters and Transmission Power Control (TPC) stats through HTT stats type 37. These stats give information about counters like received packet count, CRC pass count, error count, transmit abort count, etc., about counter reset like reset cause, channel frequency, number and mode, channel flags, etc., about TPC like transmit power scale, maximum transmit power, gain cap, EIRP, etc. Note: MCC firmware version - WLAN.HMT.1.0-03427-QCAHMTSWPL_V1.0_V2.0_SILICONZ-1.15378.4 does not support HTT stats type 37, i.e., the firmware does not respond to the command requesting stats. Sample output: ------------- echo 37 > /sys/kernel/debug/ath12k/pci-0000\:06\:00.0/mac0/htt_stats_type cat /sys/kernel/debug/ath12k/pci-0000\:06\:00.0/mac0/htt_stats HTT_PHY_STATS_TLV: bdf_nf_chain[0] = -92 bdf_nf_chain[1] = -94 bdf_nf_chain[2] = -94 bdf_nf_chain[3] = -93 ..... HTT_PHY_COUNTERS_TLV: rx_ofdma_timing_err_cnt = 18068 rx_cck_fail_cnt = 0 mactx_abort_cnt = 2612 macrx_abort_cnt = 0 ..... HTT_PHY_RESET_STATS_TLV: pdev_id = 0 chan_mhz = 0 chan_band_center_freq1 = 0 chan_band_center_freq2 = 0 ..... HTT_PHY_RESET_COUNTERS_TLV: pdev_id = 0 cf_active_low_fail_cnt = 0 cf_active_low_pass_cnt = 0 phy_off_through_vreg_cnt = 0 ..... HTT_PHY_TPC_STATS_TLV: pdev_id = 0 tx_power_scale = 0 tx_power_scale_db = 0 min_negative_tx_power = 0 ..... Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.0.1-00029-QCAHKSWPL_SILICONZ-1 Signed-off-by: Dinesh Karthikeyan Signed-off-by: Roopni Devanathan Acked-by: Jeff Johnson Acked-by: Kalle Valo Link: https://patch.msgid.link/20241115062854.1919672-3-quic_rdevanat@quicinc.com Signed-off-by: Jeff Johnson --- .../wireless/ath/ath12k/debugfs_htt_stats.c | 248 ++++++++++++++++++ .../wireless/ath/ath12k/debugfs_htt_stats.h | 98 +++++++ 2 files changed, 346 insertions(+) diff --git a/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.c b/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.c index ba06aed3d880b..78b9d8efa9578 100644 --- a/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.c +++ b/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.c @@ -2625,6 +2625,239 @@ ath12k_htt_print_dlpager_stats_tlv(const void *tag_buf, u16 tag_len, stats_req->buf_len = len; } +static void +ath12k_htt_print_phy_stats_tlv(const void *tag_buf, u16 tag_len, + struct debug_htt_stats_req *stats_req) +{ + const struct ath12k_htt_phy_stats_tlv *htt_stats_buf = tag_buf; + u32 len = stats_req->buf_len; + u32 buf_len = ATH12K_HTT_STATS_BUF_SIZE; + u8 *buf = stats_req->buf, i; + + if (tag_len < sizeof(*htt_stats_buf)) + return; + + len += scnprintf(buf + len, buf_len - len, "HTT_PHY_STATS_TLV:\n"); + for (i = 0; i < ATH12K_HTT_STATS_MAX_CHAINS; i++) + len += scnprintf(buf + len, buf_len - len, "bdf_nf_chain[%d] = %d\n", + i, a_sle32_to_cpu(htt_stats_buf->nf_chain[i])); + for (i = 0; i < ATH12K_HTT_STATS_MAX_CHAINS; i++) + len += scnprintf(buf + len, buf_len - len, "runtime_nf_chain[%d] = %d\n", + i, a_sle32_to_cpu(htt_stats_buf->runtime_nf_chain[i])); + len += scnprintf(buf + len, buf_len - len, "false_radar_cnt = %u / %u (mins)\n", + le32_to_cpu(htt_stats_buf->false_radar_cnt), + le32_to_cpu(htt_stats_buf->fw_run_time)); + len += scnprintf(buf + len, buf_len - len, "radar_cs_cnt = %u\n", + le32_to_cpu(htt_stats_buf->radar_cs_cnt)); + len += scnprintf(buf + len, buf_len - len, "ani_level = %d\n\n", + a_sle32_to_cpu(htt_stats_buf->ani_level)); + + stats_req->buf_len = len; +} + +static void +ath12k_htt_print_phy_counters_tlv(const void *tag_buf, u16 tag_len, + struct debug_htt_stats_req *stats_req) +{ + const struct ath12k_htt_phy_counters_tlv *htt_stats_buf = tag_buf; + u32 len = stats_req->buf_len; + u32 buf_len = ATH12K_HTT_STATS_BUF_SIZE; + u8 *buf = stats_req->buf; + + if (tag_len < sizeof(*htt_stats_buf)) + return; + + len += scnprintf(buf + len, buf_len - len, "HTT_PHY_COUNTERS_TLV:\n"); + len += scnprintf(buf + len, buf_len - len, "rx_ofdma_timing_err_cnt = %u\n", + le32_to_cpu(htt_stats_buf->rx_ofdma_timing_err_cnt)); + len += scnprintf(buf + len, buf_len - len, "rx_cck_fail_cnt = %u\n", + le32_to_cpu(htt_stats_buf->rx_cck_fail_cnt)); + len += scnprintf(buf + len, buf_len - len, "mactx_abort_cnt = %u\n", + le32_to_cpu(htt_stats_buf->mactx_abort_cnt)); + len += scnprintf(buf + len, buf_len - len, "macrx_abort_cnt = %u\n", + le32_to_cpu(htt_stats_buf->macrx_abort_cnt)); + len += scnprintf(buf + len, buf_len - len, "phytx_abort_cnt = %u\n", + le32_to_cpu(htt_stats_buf->phytx_abort_cnt)); + len += scnprintf(buf + len, buf_len - len, "phyrx_abort_cnt = %u\n", + le32_to_cpu(htt_stats_buf->phyrx_abort_cnt)); + len += scnprintf(buf + len, buf_len - len, "phyrx_defer_abort_cnt = %u\n", + le32_to_cpu(htt_stats_buf->phyrx_defer_abort_cnt)); + len += scnprintf(buf + len, buf_len - len, "rx_gain_adj_lstf_event_cnt = %u\n", + le32_to_cpu(htt_stats_buf->rx_gain_adj_lstf_event_cnt)); + len += scnprintf(buf + len, buf_len - len, "rx_gain_adj_non_legacy_cnt = %u\n", + le32_to_cpu(htt_stats_buf->rx_gain_adj_non_legacy_cnt)); + len += print_array_to_buf(buf, len, "rx_pkt_cnt", htt_stats_buf->rx_pkt_cnt, + ATH12K_HTT_MAX_RX_PKT_CNT, "\n"); + len += print_array_to_buf(buf, len, "rx_pkt_crc_pass_cnt", + htt_stats_buf->rx_pkt_crc_pass_cnt, + ATH12K_HTT_MAX_RX_PKT_CRC_PASS_CNT, "\n"); + len += print_array_to_buf(buf, len, "per_blk_err_cnt", + htt_stats_buf->per_blk_err_cnt, + ATH12K_HTT_MAX_PER_BLK_ERR_CNT, "\n"); + len += print_array_to_buf(buf, len, "rx_ota_err_cnt", + htt_stats_buf->rx_ota_err_cnt, + ATH12K_HTT_MAX_RX_OTA_ERR_CNT, "\n\n"); + + stats_req->buf_len = len; +} + +static void +ath12k_htt_print_phy_reset_stats_tlv(const void *tag_buf, u16 tag_len, + struct debug_htt_stats_req *stats_req) +{ + const struct ath12k_htt_phy_reset_stats_tlv *htt_stats_buf = tag_buf; + u32 len = stats_req->buf_len; + u32 buf_len = ATH12K_HTT_STATS_BUF_SIZE; + u8 *buf = stats_req->buf; + + if (tag_len < sizeof(*htt_stats_buf)) + return; + + len += scnprintf(buf + len, buf_len - len, "HTT_PHY_RESET_STATS_TLV:\n"); + len += scnprintf(buf + len, buf_len - len, "pdev_id = %u\n", + le32_to_cpu(htt_stats_buf->pdev_id)); + len += scnprintf(buf + len, buf_len - len, "chan_mhz = %u\n", + le32_to_cpu(htt_stats_buf->chan_mhz)); + len += scnprintf(buf + len, buf_len - len, "chan_band_center_freq1 = %u\n", + le32_to_cpu(htt_stats_buf->chan_band_center_freq1)); + len += scnprintf(buf + len, buf_len - len, "chan_band_center_freq2 = %u\n", + le32_to_cpu(htt_stats_buf->chan_band_center_freq2)); + len += scnprintf(buf + len, buf_len - len, "chan_phy_mode = %u\n", + le32_to_cpu(htt_stats_buf->chan_phy_mode)); + len += scnprintf(buf + len, buf_len - len, "chan_flags = 0x%0x\n", + le32_to_cpu(htt_stats_buf->chan_flags)); + len += scnprintf(buf + len, buf_len - len, "chan_num = %u\n", + le32_to_cpu(htt_stats_buf->chan_num)); + len += scnprintf(buf + len, buf_len - len, "reset_cause = 0x%0x\n", + le32_to_cpu(htt_stats_buf->reset_cause)); + len += scnprintf(buf + len, buf_len - len, "prev_reset_cause = 0x%0x\n", + le32_to_cpu(htt_stats_buf->prev_reset_cause)); + len += scnprintf(buf + len, buf_len - len, "phy_warm_reset_src = 0x%0x\n", + le32_to_cpu(htt_stats_buf->phy_warm_reset_src)); + len += scnprintf(buf + len, buf_len - len, "rx_gain_tbl_mode = %d\n", + le32_to_cpu(htt_stats_buf->rx_gain_tbl_mode)); + len += scnprintf(buf + len, buf_len - len, "xbar_val = 0x%0x\n", + le32_to_cpu(htt_stats_buf->xbar_val)); + len += scnprintf(buf + len, buf_len - len, "force_calibration = %u\n", + le32_to_cpu(htt_stats_buf->force_calibration)); + len += scnprintf(buf + len, buf_len - len, "phyrf_mode = %u\n", + le32_to_cpu(htt_stats_buf->phyrf_mode)); + len += scnprintf(buf + len, buf_len - len, "phy_homechan = %u\n", + le32_to_cpu(htt_stats_buf->phy_homechan)); + len += scnprintf(buf + len, buf_len - len, "phy_tx_ch_mask = 0x%0x\n", + le32_to_cpu(htt_stats_buf->phy_tx_ch_mask)); + len += scnprintf(buf + len, buf_len - len, "phy_rx_ch_mask = 0x%0x\n", + le32_to_cpu(htt_stats_buf->phy_rx_ch_mask)); + len += scnprintf(buf + len, buf_len - len, "phybb_ini_mask = 0x%0x\n", + le32_to_cpu(htt_stats_buf->phybb_ini_mask)); + len += scnprintf(buf + len, buf_len - len, "phyrf_ini_mask = 0x%0x\n", + le32_to_cpu(htt_stats_buf->phyrf_ini_mask)); + len += scnprintf(buf + len, buf_len - len, "phy_dfs_en_mask = 0x%0x\n", + le32_to_cpu(htt_stats_buf->phy_dfs_en_mask)); + len += scnprintf(buf + len, buf_len - len, "phy_sscan_en_mask = 0x%0x\n", + le32_to_cpu(htt_stats_buf->phy_sscan_en_mask)); + len += scnprintf(buf + len, buf_len - len, "phy_synth_sel_mask = 0x%0x\n", + le32_to_cpu(htt_stats_buf->phy_synth_sel_mask)); + len += scnprintf(buf + len, buf_len - len, "phy_adfs_freq = %u\n", + le32_to_cpu(htt_stats_buf->phy_adfs_freq)); + len += scnprintf(buf + len, buf_len - len, "cck_fir_settings = 0x%0x\n", + le32_to_cpu(htt_stats_buf->cck_fir_settings)); + len += scnprintf(buf + len, buf_len - len, "phy_dyn_pri_chan = %u\n", + le32_to_cpu(htt_stats_buf->phy_dyn_pri_chan)); + len += scnprintf(buf + len, buf_len - len, "cca_thresh = 0x%0x\n", + le32_to_cpu(htt_stats_buf->cca_thresh)); + len += scnprintf(buf + len, buf_len - len, "dyn_cca_status = %u\n", + le32_to_cpu(htt_stats_buf->dyn_cca_status)); + len += scnprintf(buf + len, buf_len - len, "rxdesense_thresh_hw = 0x%x\n", + le32_to_cpu(htt_stats_buf->rxdesense_thresh_hw)); + len += scnprintf(buf + len, buf_len - len, "rxdesense_thresh_sw = 0x%x\n\n", + le32_to_cpu(htt_stats_buf->rxdesense_thresh_sw)); + + stats_req->buf_len = len; +} + +static void +ath12k_htt_print_phy_reset_counters_tlv(const void *tag_buf, u16 tag_len, + struct debug_htt_stats_req *stats_req) +{ + const struct ath12k_htt_phy_reset_counters_tlv *htt_stats_buf = tag_buf; + u32 len = stats_req->buf_len; + u32 buf_len = ATH12K_HTT_STATS_BUF_SIZE; + u8 *buf = stats_req->buf; + + if (tag_len < sizeof(*htt_stats_buf)) + return; + + len += scnprintf(buf + len, buf_len - len, "HTT_PHY_RESET_COUNTERS_TLV:\n"); + len += scnprintf(buf + len, buf_len - len, "pdev_id = %u\n", + le32_to_cpu(htt_stats_buf->pdev_id)); + len += scnprintf(buf + len, buf_len - len, "cf_active_low_fail_cnt = %u\n", + le32_to_cpu(htt_stats_buf->cf_active_low_fail_cnt)); + len += scnprintf(buf + len, buf_len - len, "cf_active_low_pass_cnt = %u\n", + le32_to_cpu(htt_stats_buf->cf_active_low_pass_cnt)); + len += scnprintf(buf + len, buf_len - len, "phy_off_through_vreg_cnt = %u\n", + le32_to_cpu(htt_stats_buf->phy_off_through_vreg_cnt)); + len += scnprintf(buf + len, buf_len - len, "force_calibration_cnt = %u\n", + le32_to_cpu(htt_stats_buf->force_calibration_cnt)); + len += scnprintf(buf + len, buf_len - len, "rf_mode_switch_phy_off_cnt = %u\n", + le32_to_cpu(htt_stats_buf->rf_mode_switch_phy_off_cnt)); + len += scnprintf(buf + len, buf_len - len, "temperature_recal_cnt = %u\n\n", + le32_to_cpu(htt_stats_buf->temperature_recal_cnt)); + + stats_req->buf_len = len; +} + +static void +ath12k_htt_print_phy_tpc_stats_tlv(const void *tag_buf, u16 tag_len, + struct debug_htt_stats_req *stats_req) +{ + const struct ath12k_htt_phy_tpc_stats_tlv *htt_stats_buf = tag_buf; + u32 len = stats_req->buf_len; + u32 buf_len = ATH12K_HTT_STATS_BUF_SIZE; + u8 *buf = stats_req->buf; + + if (tag_len < sizeof(*htt_stats_buf)) + return; + + len += scnprintf(buf + len, buf_len - len, "HTT_PHY_TPC_STATS_TLV:\n"); + len += scnprintf(buf + len, buf_len - len, "pdev_id = %u\n", + le32_to_cpu(htt_stats_buf->pdev_id)); + len += scnprintf(buf + len, buf_len - len, "tx_power_scale = %u\n", + le32_to_cpu(htt_stats_buf->tx_power_scale)); + len += scnprintf(buf + len, buf_len - len, "tx_power_scale_db = %u\n", + le32_to_cpu(htt_stats_buf->tx_power_scale_db)); + len += scnprintf(buf + len, buf_len - len, "min_negative_tx_power = %d\n", + le32_to_cpu(htt_stats_buf->min_negative_tx_power)); + len += scnprintf(buf + len, buf_len - len, "reg_ctl_domain = %u\n", + le32_to_cpu(htt_stats_buf->reg_ctl_domain)); + len += scnprintf(buf + len, buf_len - len, "twice_max_rd_power = %u\n", + le32_to_cpu(htt_stats_buf->twice_max_rd_power)); + len += scnprintf(buf + len, buf_len - len, "max_tx_power = %u\n", + le32_to_cpu(htt_stats_buf->max_tx_power)); + len += scnprintf(buf + len, buf_len - len, "home_max_tx_power = %u\n", + le32_to_cpu(htt_stats_buf->home_max_tx_power)); + len += scnprintf(buf + len, buf_len - len, "psd_power = %d\n", + le32_to_cpu(htt_stats_buf->psd_power)); + len += scnprintf(buf + len, buf_len - len, "eirp_power = %u\n", + le32_to_cpu(htt_stats_buf->eirp_power)); + len += scnprintf(buf + len, buf_len - len, "power_type_6ghz = %u\n", + le32_to_cpu(htt_stats_buf->power_type_6ghz)); + len += print_array_to_buf(buf, len, "max_reg_allowed_power", + htt_stats_buf->max_reg_allowed_power, + ATH12K_HTT_STATS_MAX_CHAINS, "\n"); + len += print_array_to_buf(buf, len, "max_reg_allowed_power_6ghz", + htt_stats_buf->max_reg_allowed_power_6ghz, + ATH12K_HTT_STATS_MAX_CHAINS, "\n"); + len += print_array_to_buf(buf, len, "sub_band_cfreq", + htt_stats_buf->sub_band_cfreq, + ATH12K_HTT_MAX_CH_PWR_INFO_SIZE, "\n"); + len += print_array_to_buf(buf, len, "sub_band_txpower", + htt_stats_buf->sub_band_txpower, + ATH12K_HTT_MAX_CH_PWR_INFO_SIZE, "\n\n"); + + stats_req->buf_len = len; +} + static void ath12k_htt_print_dmac_reset_stats_tlv(const void *tag_buf, u16 tag_len, struct debug_htt_stats_req *stats_req) @@ -2954,6 +3187,21 @@ static int ath12k_dbg_htt_ext_stats_parse(struct ath12k_base *ab, case HTT_STATS_DLPAGER_STATS_TAG: ath12k_htt_print_dlpager_stats_tlv(tag_buf, len, stats_req); break; + case HTT_STATS_PHY_STATS_TAG: + ath12k_htt_print_phy_stats_tlv(tag_buf, len, stats_req); + break; + case HTT_STATS_PHY_COUNTERS_TAG: + ath12k_htt_print_phy_counters_tlv(tag_buf, len, stats_req); + break; + case HTT_STATS_PHY_RESET_STATS_TAG: + ath12k_htt_print_phy_reset_stats_tlv(tag_buf, len, stats_req); + break; + case HTT_STATS_PHY_RESET_COUNTERS_TAG: + ath12k_htt_print_phy_reset_counters_tlv(tag_buf, len, stats_req); + break; + case HTT_STATS_PHY_TPC_STATS_TAG: + ath12k_htt_print_phy_tpc_stats_tlv(tag_buf, len, stats_req); + break; case HTT_STATS_DMAC_RESET_STATS_TAG: ath12k_htt_print_dmac_reset_stats_tlv(tag_buf, len, stats_req); break; diff --git a/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.h b/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.h index dfb6538585d51..10d7ca9c02f42 100644 --- a/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.h +++ b/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.h @@ -136,6 +136,7 @@ enum ath12k_dbg_htt_ext_stats_type { ATH12K_DBG_HTT_EXT_STATS_PDEV_CCA_STATS = 19, ATH12K_DBG_HTT_EXT_STATS_PDEV_OBSS_PD_STATS = 23, ATH12K_DBG_HTT_EXT_STATS_DLPAGER_STATS = 36, + ATH12K_DBG_HTT_EXT_PHY_COUNTERS_AND_PHY_STATS = 37, ATH12K_DBG_HTT_EXT_STATS_SOC_ERROR = 45, ATH12K_DBG_HTT_EXT_STATS_PDEV_SCHED_ALGO = 49, ATH12K_DBG_HTT_EXT_STATS_MANDATORY_MUOFDMA = 51, @@ -196,6 +197,10 @@ enum ath12k_dbg_htt_tlv_tag { HTT_STATS_TX_SELFGEN_AC_SCHED_STATUS_STATS_TAG = 111, HTT_STATS_TX_SELFGEN_AX_SCHED_STATUS_STATS_TAG = 112, HTT_STATS_DLPAGER_STATS_TAG = 120, + HTT_STATS_PHY_COUNTERS_TAG = 121, + HTT_STATS_PHY_STATS_TAG = 122, + HTT_STATS_PHY_RESET_COUNTERS_TAG = 123, + HTT_STATS_PHY_RESET_STATS_TAG = 124, HTT_STATS_MU_PPDU_DIST_TAG = 129, HTT_STATS_TX_PDEV_MUMIMO_GRP_STATS_TAG = 130, HTT_STATS_TX_PDEV_RATE_STATS_BE_OFDMA_TAG = 135, @@ -203,6 +208,7 @@ enum ath12k_dbg_htt_tlv_tag { HTT_STATS_TX_SELFGEN_BE_STATS_TAG = 138, HTT_STATS_TX_SELFGEN_BE_SCHED_STATUS_STATS_TAG = 139, HTT_STATS_DMAC_RESET_STATS_TAG = 155, + HTT_STATS_PHY_TPC_STATS_TAG = 157, HTT_STATS_PDEV_SCHED_ALGO_OFDMA_STATS_TAG = 165, HTT_STATS_MAX_TAG, @@ -1085,6 +1091,98 @@ struct ath12k_htt_dl_pager_stats_tlv { struct ath12k_htt_pgs_info pgs_info[ATH12K_NUM_PG_LOCK_STATE][ATH12K_PAGER_MAX]; } __packed; +#define ATH12K_HTT_STATS_MAX_CHAINS 8 +#define ATH12K_HTT_MAX_RX_PKT_CNT 8 +#define ATH12K_HTT_MAX_RX_PKT_CRC_PASS_CNT 8 +#define ATH12K_HTT_MAX_PER_BLK_ERR_CNT 20 +#define ATH12K_HTT_MAX_RX_OTA_ERR_CNT 14 +#define ATH12K_HTT_MAX_CH_PWR_INFO_SIZE 16 + +struct ath12k_htt_phy_stats_tlv { + a_sle32 nf_chain[ATH12K_HTT_STATS_MAX_CHAINS]; + __le32 false_radar_cnt; + __le32 radar_cs_cnt; + a_sle32 ani_level; + __le32 fw_run_time; + a_sle32 runtime_nf_chain[ATH12K_HTT_STATS_MAX_CHAINS]; +} __packed; + +struct ath12k_htt_phy_counters_tlv { + __le32 rx_ofdma_timing_err_cnt; + __le32 rx_cck_fail_cnt; + __le32 mactx_abort_cnt; + __le32 macrx_abort_cnt; + __le32 phytx_abort_cnt; + __le32 phyrx_abort_cnt; + __le32 phyrx_defer_abort_cnt; + __le32 rx_gain_adj_lstf_event_cnt; + __le32 rx_gain_adj_non_legacy_cnt; + __le32 rx_pkt_cnt[ATH12K_HTT_MAX_RX_PKT_CNT]; + __le32 rx_pkt_crc_pass_cnt[ATH12K_HTT_MAX_RX_PKT_CRC_PASS_CNT]; + __le32 per_blk_err_cnt[ATH12K_HTT_MAX_PER_BLK_ERR_CNT]; + __le32 rx_ota_err_cnt[ATH12K_HTT_MAX_RX_OTA_ERR_CNT]; +} __packed; + +struct ath12k_htt_phy_reset_stats_tlv { + __le32 pdev_id; + __le32 chan_mhz; + __le32 chan_band_center_freq1; + __le32 chan_band_center_freq2; + __le32 chan_phy_mode; + __le32 chan_flags; + __le32 chan_num; + __le32 reset_cause; + __le32 prev_reset_cause; + __le32 phy_warm_reset_src; + __le32 rx_gain_tbl_mode; + __le32 xbar_val; + __le32 force_calibration; + __le32 phyrf_mode; + __le32 phy_homechan; + __le32 phy_tx_ch_mask; + __le32 phy_rx_ch_mask; + __le32 phybb_ini_mask; + __le32 phyrf_ini_mask; + __le32 phy_dfs_en_mask; + __le32 phy_sscan_en_mask; + __le32 phy_synth_sel_mask; + __le32 phy_adfs_freq; + __le32 cck_fir_settings; + __le32 phy_dyn_pri_chan; + __le32 cca_thresh; + __le32 dyn_cca_status; + __le32 rxdesense_thresh_hw; + __le32 rxdesense_thresh_sw; +} __packed; + +struct ath12k_htt_phy_reset_counters_tlv { + __le32 pdev_id; + __le32 cf_active_low_fail_cnt; + __le32 cf_active_low_pass_cnt; + __le32 phy_off_through_vreg_cnt; + __le32 force_calibration_cnt; + __le32 rf_mode_switch_phy_off_cnt; + __le32 temperature_recal_cnt; +} __packed; + +struct ath12k_htt_phy_tpc_stats_tlv { + __le32 pdev_id; + __le32 tx_power_scale; + __le32 tx_power_scale_db; + __le32 min_negative_tx_power; + __le32 reg_ctl_domain; + __le32 max_reg_allowed_power[ATH12K_HTT_STATS_MAX_CHAINS]; + __le32 max_reg_allowed_power_6ghz[ATH12K_HTT_STATS_MAX_CHAINS]; + __le32 twice_max_rd_power; + __le32 max_tx_power; + __le32 home_max_tx_power; + __le32 psd_power; + __le32 eirp_power; + __le32 power_type_6ghz; + __le32 sub_band_cfreq[ATH12K_HTT_MAX_CH_PWR_INFO_SIZE]; + __le32 sub_band_txpower[ATH12K_HTT_MAX_CH_PWR_INFO_SIZE]; +} __packed; + struct ath12k_htt_dmac_reset_stats_tlv { __le32 reset_count; __le32 reset_time_lo_ms; From 3a660e7fa44d556f41cbef6d2430f7227ef3f3ef Mon Sep 17 00:00:00 2001 From: Dinesh Karthikeyan Date: Fri, 15 Nov 2024 11:58:53 +0530 Subject: [PATCH 0034/1450] wifi: ath12k: Support SoC Common Stats Add support to request SoC stat from firmware through HTT stat type 38. This stat gives drop count of SoC. Note: MCC firmware version - WLAN.HMT.1.0-03427-QCAHMTSWPL_V1.0_V2.0_SILICONZ-1.15378.4 does not support tag HTT_STATS_SOC_TXRX_STATS_COMMON_TAG(125). Sample output: ------------- echo 38 > /sys/kernel/debug/ath12k/pci-0000\:06\:00.0/mac0/htt_stats_type cat /sys/kernel/debug/ath12k/pci-0000\:06\:00.0/mac0/htt_stats HTT_SOC_COMMON_STATS_TLV: soc_drop_count = 0x0000000000000000 Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.0.1-00029-QCAHKSWPL_SILICONZ-1 Signed-off-by: Dinesh Karthikeyan Signed-off-by: Roopni Devanathan Acked-by: Jeff Johnson Acked-by: Kalle Valo Link: https://patch.msgid.link/20241115062854.1919672-4-quic_rdevanat@quicinc.com Signed-off-by: Jeff Johnson --- .../wireless/ath/ath12k/debugfs_htt_stats.c | 26 +++++++++++++++++++ .../wireless/ath/ath12k/debugfs_htt_stats.h | 7 +++++ 2 files changed, 33 insertions(+) diff --git a/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.c b/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.c index 78b9d8efa9578..f310e95a65f77 100644 --- a/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.c +++ b/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.c @@ -2858,6 +2858,29 @@ ath12k_htt_print_phy_tpc_stats_tlv(const void *tag_buf, u16 tag_len, stats_req->buf_len = len; } +static void +ath12k_htt_print_soc_txrx_stats_common_tlv(const void *tag_buf, u16 tag_len, + struct debug_htt_stats_req *stats_req) +{ + const struct ath12k_htt_t2h_soc_txrx_stats_common_tlv *htt_stats_buf = tag_buf; + u64 drop_count; + u32 len = stats_req->buf_len; + u32 buf_len = ATH12K_HTT_STATS_BUF_SIZE; + u8 *buf = stats_req->buf; + + if (tag_len < sizeof(*htt_stats_buf)) + return; + + drop_count = ath12k_le32hilo_to_u64(htt_stats_buf->inv_peers_msdu_drop_count_hi, + htt_stats_buf->inv_peers_msdu_drop_count_lo); + + len += scnprintf(buf + len, buf_len - len, "HTT_SOC_COMMON_STATS_TLV:\n"); + len += scnprintf(buf + len, buf_len - len, "soc_drop_count = %llu\n\n", + drop_count); + + stats_req->buf_len = len; +} + static void ath12k_htt_print_dmac_reset_stats_tlv(const void *tag_buf, u16 tag_len, struct debug_htt_stats_req *stats_req) @@ -3202,6 +3225,9 @@ static int ath12k_dbg_htt_ext_stats_parse(struct ath12k_base *ab, case HTT_STATS_PHY_TPC_STATS_TAG: ath12k_htt_print_phy_tpc_stats_tlv(tag_buf, len, stats_req); break; + case HTT_STATS_SOC_TXRX_STATS_COMMON_TAG: + ath12k_htt_print_soc_txrx_stats_common_tlv(tag_buf, len, stats_req); + break; case HTT_STATS_DMAC_RESET_STATS_TAG: ath12k_htt_print_dmac_reset_stats_tlv(tag_buf, len, stats_req); break; diff --git a/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.h b/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.h index 10d7ca9c02f42..c07b60636c22d 100644 --- a/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.h +++ b/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.h @@ -137,6 +137,7 @@ enum ath12k_dbg_htt_ext_stats_type { ATH12K_DBG_HTT_EXT_STATS_PDEV_OBSS_PD_STATS = 23, ATH12K_DBG_HTT_EXT_STATS_DLPAGER_STATS = 36, ATH12K_DBG_HTT_EXT_PHY_COUNTERS_AND_PHY_STATS = 37, + ATH12K_DBG_HTT_EXT_VDEVS_TXRX_STATS = 38, ATH12K_DBG_HTT_EXT_STATS_SOC_ERROR = 45, ATH12K_DBG_HTT_EXT_STATS_PDEV_SCHED_ALGO = 49, ATH12K_DBG_HTT_EXT_STATS_MANDATORY_MUOFDMA = 51, @@ -201,6 +202,7 @@ enum ath12k_dbg_htt_tlv_tag { HTT_STATS_PHY_STATS_TAG = 122, HTT_STATS_PHY_RESET_COUNTERS_TAG = 123, HTT_STATS_PHY_RESET_STATS_TAG = 124, + HTT_STATS_SOC_TXRX_STATS_COMMON_TAG = 125, HTT_STATS_MU_PPDU_DIST_TAG = 129, HTT_STATS_TX_PDEV_MUMIMO_GRP_STATS_TAG = 130, HTT_STATS_TX_PDEV_RATE_STATS_BE_OFDMA_TAG = 135, @@ -1183,6 +1185,11 @@ struct ath12k_htt_phy_tpc_stats_tlv { __le32 sub_band_txpower[ATH12K_HTT_MAX_CH_PWR_INFO_SIZE]; } __packed; +struct ath12k_htt_t2h_soc_txrx_stats_common_tlv { + __le32 inv_peers_msdu_drop_count_hi; + __le32 inv_peers_msdu_drop_count_lo; +} __packed; + struct ath12k_htt_dmac_reset_stats_tlv { __le32 reset_count; __le32 reset_time_lo_ms; From c8f314703bcbade1bcef8dfee9ffc6a1d66b9d8f Mon Sep 17 00:00:00 2001 From: Dinesh Karthikeyan Date: Fri, 15 Nov 2024 11:58:54 +0530 Subject: [PATCH 0035/1450] wifi: ath12k: Support Transmit PER Rate Stats Add support to request per rate stats through HTT stats type 40. These stats give information about rates of PPDUs and MPDUs for single user and for OFDMA and MUMIMO technologies corresponding to multiple users. Sample output: ------------- echo 40 > /sys/kernel/debug/ath12k/pci-0000\:06\:00.0/mac0/htt_stats_type cat /sys/kernel/debug/ath12k/pci-0000\:06\:00.0/mac0/htt_stats HTT_TX_PER_STATS: PER_STATS_SU: PER per BW: ppdus_tried_su = 0:0 1:0 2:0 3:0 4:0 ppdus_ack_failed_su = 0:0 1:0 2:0 3:0 4:0 mpdus_tried_su = 0:0 1:0 2:0 3:0 4:0 mpdus_failed_su = 0:0 1:0 2:0 3:0 4:0 PER per NSS: ppdus_tried_su = 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:0 ppdus_ack_failed_su = 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:0 mpdus_tried_su = 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:0 mpdus_failed_su = 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:0 PER per MCS: ppdus_tried_su = 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:0 8:0 9:0 10:0 11:0 12:0 13:0 ppdus_ack_failed_su = 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:0 8:0 9:0 10:0 11:0 12:0 13:0 mpdus_tried_su = 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:0 8:0 9:0 10:0 11:0 12:0 13:0 mpdus_failed_su = 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:0 8:0 9:0 10:0 11:0 12:0 13:0 ..... Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.0.1-00029-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0-03427-QCAHMTSWPL_V1.0_V2.0_SILICONZ-1.15378.4 Signed-off-by: Dinesh Karthikeyan Signed-off-by: Roopni Devanathan Acked-by: Jeff Johnson Acked-by: Kalle Valo Link: https://patch.msgid.link/20241115062854.1919672-5-quic_rdevanat@quicinc.com Signed-off-by: Jeff Johnson --- .../wireless/ath/ath12k/debugfs_htt_stats.c | 267 +++++++++++++++++- .../wireless/ath/ath12k/debugfs_htt_stats.h | 68 ++++- 2 files changed, 332 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.c b/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.c index f310e95a65f77..92ff53767b2fc 100644 --- a/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.c +++ b/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.c @@ -48,6 +48,28 @@ print_array_to_buf(u8 *buf, u32 offset, const char *header, footer); } +static const char *ath12k_htt_ax_tx_rx_ru_size_to_str(u8 ru_size) +{ + switch (ru_size) { + case ATH12K_HTT_TX_RX_PDEV_STATS_AX_RU_SIZE_26: + return "26"; + case ATH12K_HTT_TX_RX_PDEV_STATS_AX_RU_SIZE_52: + return "52"; + case ATH12K_HTT_TX_RX_PDEV_STATS_AX_RU_SIZE_106: + return "106"; + case ATH12K_HTT_TX_RX_PDEV_STATS_AX_RU_SIZE_242: + return "242"; + case ATH12K_HTT_TX_RX_PDEV_STATS_AX_RU_SIZE_484: + return "484"; + case ATH12K_HTT_TX_RX_PDEV_STATS_AX_RU_SIZE_996: + return "996"; + case ATH12K_HTT_TX_RX_PDEV_STATS_AX_RU_SIZE_996x2: + return "996x2"; + default: + return "unknown"; + } +} + static const char *ath12k_htt_be_tx_rx_ru_size_to_str(u8 ru_size) { switch (ru_size) { @@ -88,6 +110,17 @@ static const char *ath12k_htt_be_tx_rx_ru_size_to_str(u8 ru_size) } } +static const char* +ath12k_tx_ru_size_to_str(enum ath12k_htt_stats_ru_type ru_type, u8 ru_size) +{ + if (ru_type == ATH12K_HTT_STATS_RU_TYPE_SINGLE_RU_ONLY) + return ath12k_htt_ax_tx_rx_ru_size_to_str(ru_size); + else if (ru_type == ATH12K_HTT_STATS_RU_TYPE_SINGLE_AND_MULTI_RU) + return ath12k_htt_be_tx_rx_ru_size_to_str(ru_size); + else + return "unknown"; +} + static void htt_print_tx_pdev_stats_cmn_tlv(const void *tag_buf, u16 tag_len, struct debug_htt_stats_req *stats_req) @@ -2881,6 +2914,235 @@ ath12k_htt_print_soc_txrx_stats_common_tlv(const void *tag_buf, u16 tag_len, stats_req->buf_len = len; } +static void +ath12k_htt_print_tx_per_rate_stats_tlv(const void *tag_buf, u16 tag_len, + struct debug_htt_stats_req *stats_req) +{ + const struct ath12k_htt_tx_per_rate_stats_tlv *stats_buf = tag_buf; + u32 len = stats_req->buf_len; + u32 buf_len = ATH12K_HTT_STATS_BUF_SIZE; + u32 ru_size_cnt = 0; + u32 rc_mode, ru_type; + u8 *buf = stats_req->buf, i; + const char *mode_prefix; + + if (tag_len < sizeof(*stats_buf)) + return; + + rc_mode = le32_to_cpu(stats_buf->rc_mode); + ru_type = le32_to_cpu(stats_buf->ru_type); + + switch (rc_mode) { + case ATH12K_HTT_STATS_RC_MODE_DLSU: + len += scnprintf(buf + len, buf_len - len, "HTT_TX_PER_STATS:\n"); + len += scnprintf(buf + len, buf_len - len, "\nPER_STATS_SU:\n"); + mode_prefix = "su"; + break; + case ATH12K_HTT_STATS_RC_MODE_DLMUMIMO: + len += scnprintf(buf + len, buf_len - len, "\nPER_STATS_DL_MUMIMO:\n"); + mode_prefix = "mu"; + break; + case ATH12K_HTT_STATS_RC_MODE_DLOFDMA: + len += scnprintf(buf + len, buf_len - len, "\nPER_STATS_DL_OFDMA:\n"); + mode_prefix = "ofdma"; + if (ru_type == ATH12K_HTT_STATS_RU_TYPE_SINGLE_RU_ONLY) + ru_size_cnt = ATH12K_HTT_TX_RX_PDEV_STATS_NUM_AX_RU_SIZE_CNTRS; + else if (ru_type == ATH12K_HTT_STATS_RU_TYPE_SINGLE_AND_MULTI_RU) + ru_size_cnt = ATH12K_HTT_TX_RX_PDEV_NUM_BE_RU_SIZE_CNTRS; + break; + case ATH12K_HTT_STATS_RC_MODE_ULMUMIMO: + len += scnprintf(buf + len, buf_len - len, "HTT_RX_PER_STATS:\n"); + len += scnprintf(buf + len, buf_len - len, "\nPER_STATS_UL_MUMIMO:\n"); + mode_prefix = "ulmu"; + break; + case ATH12K_HTT_STATS_RC_MODE_ULOFDMA: + len += scnprintf(buf + len, buf_len - len, "\nPER_STATS_UL_OFDMA:\n"); + mode_prefix = "ulofdma"; + if (ru_type == ATH12K_HTT_STATS_RU_TYPE_SINGLE_RU_ONLY) + ru_size_cnt = ATH12K_HTT_TX_RX_PDEV_STATS_NUM_AX_RU_SIZE_CNTRS; + else if (ru_type == ATH12K_HTT_STATS_RU_TYPE_SINGLE_AND_MULTI_RU) + ru_size_cnt = ATH12K_HTT_TX_RX_PDEV_NUM_BE_RU_SIZE_CNTRS; + break; + default: + return; + } + + len += scnprintf(buf + len, buf_len - len, "\nPER per BW:\n"); + if (rc_mode == ATH12K_HTT_STATS_RC_MODE_ULOFDMA || + rc_mode == ATH12K_HTT_STATS_RC_MODE_ULMUMIMO) + len += scnprintf(buf + len, buf_len - len, "data_ppdus_%s = ", + mode_prefix); + else + len += scnprintf(buf + len, buf_len - len, "ppdus_tried_%s = ", + mode_prefix); + for (i = 0; i < ATH12K_HTT_TX_PDEV_STATS_NUM_BW_CNTRS; i++) + len += scnprintf(buf + len, buf_len - len, " %u:%u ", i, + le32_to_cpu(stats_buf->per_bw[i].ppdus_tried)); + len += scnprintf(buf + len, buf_len - len, " %u:%u\n", i, + le32_to_cpu(stats_buf->per_bw320.ppdus_tried)); + + if (rc_mode == ATH12K_HTT_STATS_RC_MODE_ULOFDMA || + rc_mode == ATH12K_HTT_STATS_RC_MODE_ULMUMIMO) + len += scnprintf(buf + len, buf_len - len, "non_data_ppdus_%s = ", + mode_prefix); + else + len += scnprintf(buf + len, buf_len - len, "ppdus_ack_failed_%s = ", + mode_prefix); + for (i = 0; i < ATH12K_HTT_TX_PDEV_STATS_NUM_BW_CNTRS; i++) + len += scnprintf(buf + len, buf_len - len, " %u:%u ", i, + le32_to_cpu(stats_buf->per_bw[i].ppdus_ack_failed)); + len += scnprintf(buf + len, buf_len - len, " %u:%u\n", i, + le32_to_cpu(stats_buf->per_bw320.ppdus_ack_failed)); + + len += scnprintf(buf + len, buf_len - len, "mpdus_tried_%s = ", mode_prefix); + for (i = 0; i < ATH12K_HTT_TX_PDEV_STATS_NUM_BW_CNTRS; i++) + len += scnprintf(buf + len, buf_len - len, " %u:%u ", i, + le32_to_cpu(stats_buf->per_bw[i].mpdus_tried)); + len += scnprintf(buf + len, buf_len - len, " %u:%u\n", i, + le32_to_cpu(stats_buf->per_bw320.mpdus_tried)); + + len += scnprintf(buf + len, buf_len - len, "mpdus_failed_%s = ", mode_prefix); + for (i = 0; i < ATH12K_HTT_TX_PDEV_STATS_NUM_BW_CNTRS; i++) + len += scnprintf(buf + len, buf_len - len, " %u:%u", i, + le32_to_cpu(stats_buf->per_bw[i].mpdus_failed)); + len += scnprintf(buf + len, buf_len - len, " %u:%u\n", i, + le32_to_cpu(stats_buf->per_bw320.mpdus_failed)); + + len += scnprintf(buf + len, buf_len - len, "\nPER per NSS:\n"); + if (rc_mode == ATH12K_HTT_STATS_RC_MODE_ULOFDMA || + rc_mode == ATH12K_HTT_STATS_RC_MODE_ULMUMIMO) + len += scnprintf(buf + len, buf_len - len, "data_ppdus_%s = ", + mode_prefix); + else + len += scnprintf(buf + len, buf_len - len, "ppdus_tried_%s = ", + mode_prefix); + for (i = 0; i < ATH12K_HTT_PDEV_STAT_NUM_SPATIAL_STREAMS; i++) + len += scnprintf(buf + len, buf_len - len, " %u:%u ", i + 1, + le32_to_cpu(stats_buf->per_nss[i].ppdus_tried)); + len += scnprintf(buf + len, buf_len - len, "\n"); + + if (rc_mode == ATH12K_HTT_STATS_RC_MODE_ULOFDMA || + rc_mode == ATH12K_HTT_STATS_RC_MODE_ULMUMIMO) + len += scnprintf(buf + len, buf_len - len, "non_data_ppdus_%s = ", + mode_prefix); + else + len += scnprintf(buf + len, buf_len - len, "ppdus_ack_failed_%s = ", + mode_prefix); + for (i = 0; i < ATH12K_HTT_PDEV_STAT_NUM_SPATIAL_STREAMS; i++) + len += scnprintf(buf + len, buf_len - len, " %u:%u ", i + 1, + le32_to_cpu(stats_buf->per_nss[i].ppdus_ack_failed)); + len += scnprintf(buf + len, buf_len - len, "\n"); + + len += scnprintf(buf + len, buf_len - len, "mpdus_tried_%s = ", mode_prefix); + for (i = 0; i < ATH12K_HTT_PDEV_STAT_NUM_SPATIAL_STREAMS; i++) + len += scnprintf(buf + len, buf_len - len, " %u:%u ", i + 1, + le32_to_cpu(stats_buf->per_nss[i].mpdus_tried)); + len += scnprintf(buf + len, buf_len - len, "\n"); + + len += scnprintf(buf + len, buf_len - len, "mpdus_failed_%s = ", mode_prefix); + for (i = 0; i < ATH12K_HTT_PDEV_STAT_NUM_SPATIAL_STREAMS; i++) + len += scnprintf(buf + len, buf_len - len, " %u:%u ", i + 1, + le32_to_cpu(stats_buf->per_nss[i].mpdus_failed)); + len += scnprintf(buf + len, buf_len - len, "\n"); + + len += scnprintf(buf + len, buf_len - len, "\nPER per MCS:\n"); + if (rc_mode == ATH12K_HTT_STATS_RC_MODE_ULOFDMA || + rc_mode == ATH12K_HTT_STATS_RC_MODE_ULMUMIMO) + len += scnprintf(buf + len, buf_len - len, "data_ppdus_%s = ", + mode_prefix); + else + len += scnprintf(buf + len, buf_len - len, "ppdus_tried_%s = ", + mode_prefix); + for (i = 0; i < ATH12K_HTT_TXBF_RATE_STAT_NUM_MCS_CNTRS; i++) + len += scnprintf(buf + len, buf_len - len, " %u:%u ", i, + le32_to_cpu(stats_buf->per_mcs[i].ppdus_tried)); + len += scnprintf(buf + len, buf_len - len, "\n"); + + if (rc_mode == ATH12K_HTT_STATS_RC_MODE_ULOFDMA || + rc_mode == ATH12K_HTT_STATS_RC_MODE_ULMUMIMO) + len += scnprintf(buf + len, buf_len - len, "non_data_ppdus_%s = ", + mode_prefix); + else + len += scnprintf(buf + len, buf_len - len, "ppdus_ack_failed_%s = ", + mode_prefix); + for (i = 0; i < ATH12K_HTT_TXBF_RATE_STAT_NUM_MCS_CNTRS; i++) + len += scnprintf(buf + len, buf_len - len, " %u:%u ", i, + le32_to_cpu(stats_buf->per_mcs[i].ppdus_ack_failed)); + len += scnprintf(buf + len, buf_len - len, "\n"); + + len += scnprintf(buf + len, buf_len - len, "mpdus_tried_%s = ", mode_prefix); + for (i = 0; i < ATH12K_HTT_TXBF_RATE_STAT_NUM_MCS_CNTRS; i++) + len += scnprintf(buf + len, buf_len - len, " %u:%u ", i, + le32_to_cpu(stats_buf->per_mcs[i].mpdus_tried)); + len += scnprintf(buf + len, buf_len - len, "\n"); + + len += scnprintf(buf + len, buf_len - len, "mpdus_failed_%s = ", mode_prefix); + for (i = 0; i < ATH12K_HTT_TXBF_RATE_STAT_NUM_MCS_CNTRS; i++) + len += scnprintf(buf + len, buf_len - len, " %u:%u ", i, + le32_to_cpu(stats_buf->per_mcs[i].mpdus_failed)); + len += scnprintf(buf + len, buf_len - len, "\n"); + + if ((rc_mode == ATH12K_HTT_STATS_RC_MODE_DLOFDMA || + rc_mode == ATH12K_HTT_STATS_RC_MODE_ULOFDMA) && + ru_type != ATH12K_HTT_STATS_RU_TYPE_INVALID) { + len += scnprintf(buf + len, buf_len - len, "\nPER per RU:\n"); + + if (rc_mode == ATH12K_HTT_STATS_RC_MODE_ULOFDMA) + len += scnprintf(buf + len, buf_len - len, "data_ppdus_%s = ", + mode_prefix); + else + len += scnprintf(buf + len, buf_len - len, "ppdus_tried_%s = ", + mode_prefix); + for (i = 0; i < ru_size_cnt; i++) + len += scnprintf(buf + len, buf_len - len, " %s:%u ", + ath12k_tx_ru_size_to_str(ru_type, i), + le32_to_cpu(stats_buf->ru[i].ppdus_tried)); + len += scnprintf(buf + len, buf_len - len, "\n"); + + if (rc_mode == ATH12K_HTT_STATS_RC_MODE_ULOFDMA) + len += scnprintf(buf + len, buf_len - len, + "non_data_ppdus_%s = ", mode_prefix); + else + len += scnprintf(buf + len, buf_len - len, + "ppdus_ack_failed_%s = ", mode_prefix); + for (i = 0; i < ru_size_cnt; i++) + len += scnprintf(buf + len, buf_len - len, " %s:%u ", + ath12k_tx_ru_size_to_str(ru_type, i), + le32_to_cpu(stats_buf->ru[i].ppdus_ack_failed)); + len += scnprintf(buf + len, buf_len - len, "\n"); + + len += scnprintf(buf + len, buf_len - len, "mpdus_tried_%s = ", + mode_prefix); + for (i = 0; i < ru_size_cnt; i++) + len += scnprintf(buf + len, buf_len - len, " %s:%u ", + ath12k_tx_ru_size_to_str(ru_type, i), + le32_to_cpu(stats_buf->ru[i].mpdus_tried)); + len += scnprintf(buf + len, buf_len - len, "\n"); + + len += scnprintf(buf + len, buf_len - len, "mpdus_failed_%s = ", + mode_prefix); + for (i = 0; i < ru_size_cnt; i++) + len += scnprintf(buf + len, buf_len - len, " %s:%u ", + ath12k_tx_ru_size_to_str(ru_type, i), + le32_to_cpu(stats_buf->ru[i].mpdus_failed)); + len += scnprintf(buf + len, buf_len - len, "\n\n"); + } + + if (rc_mode == ATH12K_HTT_STATS_RC_MODE_DLMUMIMO) { + len += scnprintf(buf + len, buf_len - len, "\nlast_probed_bw = %u\n", + le32_to_cpu(stats_buf->last_probed_bw)); + len += scnprintf(buf + len, buf_len - len, "last_probed_nss = %u\n", + le32_to_cpu(stats_buf->last_probed_nss)); + len += scnprintf(buf + len, buf_len - len, "last_probed_mcs = %u\n", + le32_to_cpu(stats_buf->last_probed_mcs)); + len += print_array_to_buf(buf, len, "MU Probe count per RC MODE", + stats_buf->probe_cnt, + ATH12K_HTT_RC_MODE_2D_COUNT, "\n\n"); + } + + stats_req->buf_len = len; +} + static void ath12k_htt_print_dmac_reset_stats_tlv(const void *tag_buf, u16 tag_len, struct debug_htt_stats_req *stats_req) @@ -3018,7 +3280,7 @@ ath12k_htt_print_tx_pdev_rate_stats_be_ofdma_tlv(const void *tag_buf, u16 tag_le len += scnprintf(buf + len, buf_len - len, "\n"); len += print_array_to_buf_index(buf, len, "be_ofdma_tx_nss = ", 1, htt_stats_buf->be_ofdma_tx_nss, - ATH12K_HTT_TX_PDEV_STATS_NUM_SPATIAL_STREAMS, + ATH12K_HTT_PDEV_STAT_NUM_SPATIAL_STREAMS, "\n"); len += print_array_to_buf(buf, len, "be_ofdma_tx_bw", htt_stats_buf->be_ofdma_tx_bw, @@ -3228,6 +3490,9 @@ static int ath12k_dbg_htt_ext_stats_parse(struct ath12k_base *ab, case HTT_STATS_SOC_TXRX_STATS_COMMON_TAG: ath12k_htt_print_soc_txrx_stats_common_tlv(tag_buf, len, stats_req); break; + case HTT_STATS_PER_RATE_STATS_TAG: + ath12k_htt_print_tx_per_rate_stats_tlv(tag_buf, len, stats_req); + break; case HTT_STATS_DMAC_RESET_STATS_TAG: ath12k_htt_print_dmac_reset_stats_tlv(tag_buf, len, stats_req); break; diff --git a/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.h b/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.h index c07b60636c22d..d199bed0a9d1a 100644 --- a/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.h +++ b/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.h @@ -138,6 +138,7 @@ enum ath12k_dbg_htt_ext_stats_type { ATH12K_DBG_HTT_EXT_STATS_DLPAGER_STATS = 36, ATH12K_DBG_HTT_EXT_PHY_COUNTERS_AND_PHY_STATS = 37, ATH12K_DBG_HTT_EXT_VDEVS_TXRX_STATS = 38, + ATH12K_DBG_HTT_EXT_PDEV_PER_STATS = 40, ATH12K_DBG_HTT_EXT_STATS_SOC_ERROR = 45, ATH12K_DBG_HTT_EXT_STATS_PDEV_SCHED_ALGO = 49, ATH12K_DBG_HTT_EXT_STATS_MANDATORY_MUOFDMA = 51, @@ -203,6 +204,7 @@ enum ath12k_dbg_htt_tlv_tag { HTT_STATS_PHY_RESET_COUNTERS_TAG = 123, HTT_STATS_PHY_RESET_STATS_TAG = 124, HTT_STATS_SOC_TXRX_STATS_COMMON_TAG = 125, + HTT_STATS_PER_RATE_STATS_TAG = 128, HTT_STATS_MU_PPDU_DIST_TAG = 129, HTT_STATS_TX_PDEV_MUMIMO_GRP_STATS_TAG = 130, HTT_STATS_TX_PDEV_RATE_STATS_BE_OFDMA_TAG = 135, @@ -1221,6 +1223,10 @@ struct ath12k_htt_pdev_sched_algo_ofdma_stats_tlv { __le32 dlofdma_disabled_consec_no_mpdus_success[ATH12K_HTT_NUM_AC_WMM]; } __packed; +#define ATH12K_HTT_TX_PDEV_STATS_NUM_BW_CNTRS 4 +#define ATH12K_HTT_PDEV_STAT_NUM_SPATIAL_STREAMS 8 +#define ATH12K_HTT_TXBF_RATE_STAT_NUM_MCS_CNTRS 14 + enum ATH12K_HTT_TX_RX_PDEV_STATS_BE_RU_SIZE { ATH12K_HTT_TX_RX_PDEV_STATS_BE_RU_SIZE_26, ATH12K_HTT_TX_RX_PDEV_STATS_BE_RU_SIZE_52, @@ -1241,7 +1247,65 @@ enum ATH12K_HTT_TX_RX_PDEV_STATS_BE_RU_SIZE { ATH12K_HTT_TX_RX_PDEV_NUM_BE_RU_SIZE_CNTRS, }; -#define ATH12K_HTT_TX_PDEV_STATS_NUM_SPATIAL_STREAMS 8 +enum ATH12K_HTT_RC_MODE { + ATH12K_HTT_RC_MODE_SU_OL, + ATH12K_HTT_RC_MODE_SU_BF, + ATH12K_HTT_RC_MODE_MU1_INTF, + ATH12K_HTT_RC_MODE_MU2_INTF, + ATH12K_HTT_RC_MODE_MU3_INTF, + ATH12K_HTT_RC_MODE_MU4_INTF, + ATH12K_HTT_RC_MODE_MU5_INTF, + ATH12K_HTT_RC_MODE_MU6_INTF, + ATH12K_HTT_RC_MODE_MU7_INTF, + ATH12K_HTT_RC_MODE_2D_COUNT +}; + +enum ATH12K_HTT_TX_RX_PDEV_STATS_AX_RU_SIZE { + ATH12K_HTT_TX_RX_PDEV_STATS_AX_RU_SIZE_26, + ATH12K_HTT_TX_RX_PDEV_STATS_AX_RU_SIZE_52, + ATH12K_HTT_TX_RX_PDEV_STATS_AX_RU_SIZE_106, + ATH12K_HTT_TX_RX_PDEV_STATS_AX_RU_SIZE_242, + ATH12K_HTT_TX_RX_PDEV_STATS_AX_RU_SIZE_484, + ATH12K_HTT_TX_RX_PDEV_STATS_AX_RU_SIZE_996, + ATH12K_HTT_TX_RX_PDEV_STATS_AX_RU_SIZE_996x2, + ATH12K_HTT_TX_RX_PDEV_STATS_NUM_AX_RU_SIZE_CNTRS +}; + +enum ath12k_htt_stats_rc_mode { + ATH12K_HTT_STATS_RC_MODE_DLSU = 0, + ATH12K_HTT_STATS_RC_MODE_DLMUMIMO = 1, + ATH12K_HTT_STATS_RC_MODE_DLOFDMA = 2, + ATH12K_HTT_STATS_RC_MODE_ULMUMIMO = 3, + ATH12K_HTT_STATS_RC_MODE_ULOFDMA = 4, +}; + +enum ath12k_htt_stats_ru_type { + ATH12K_HTT_STATS_RU_TYPE_INVALID, + ATH12K_HTT_STATS_RU_TYPE_SINGLE_RU_ONLY, + ATH12K_HTT_STATS_RU_TYPE_SINGLE_AND_MULTI_RU, +}; + +struct ath12k_htt_tx_rate_stats { + __le32 ppdus_tried; + __le32 ppdus_ack_failed; + __le32 mpdus_tried; + __le32 mpdus_failed; +} __packed; + +struct ath12k_htt_tx_per_rate_stats_tlv { + __le32 rc_mode; + __le32 last_probed_mcs; + __le32 last_probed_nss; + __le32 last_probed_bw; + struct ath12k_htt_tx_rate_stats per_bw[ATH12K_HTT_TX_PDEV_STATS_NUM_BW_CNTRS]; + struct ath12k_htt_tx_rate_stats per_nss[ATH12K_HTT_PDEV_STAT_NUM_SPATIAL_STREAMS]; + struct ath12k_htt_tx_rate_stats per_mcs[ATH12K_HTT_TXBF_RATE_STAT_NUM_MCS_CNTRS]; + struct ath12k_htt_tx_rate_stats per_bw320; + __le32 probe_cnt[ATH12K_HTT_RC_MODE_2D_COUNT]; + __le32 ru_type; + struct ath12k_htt_tx_rate_stats ru[ATH12K_HTT_TX_RX_PDEV_NUM_BE_RU_SIZE_CNTRS]; +} __packed; + #define ATH12K_HTT_TX_PDEV_NUM_BE_MCS_CNTRS 16 #define ATH12K_HTT_TX_PDEV_NUM_BE_BW_CNTRS 5 #define ATH12K_HTT_TX_PDEV_NUM_EHT_SIG_MCS_CNTRS 4 @@ -1251,7 +1315,7 @@ struct ath12k_htt_tx_pdev_rate_stats_be_ofdma_tlv { __le32 mac_id__word; __le32 be_ofdma_tx_ldpc; __le32 be_ofdma_tx_mcs[ATH12K_HTT_TX_PDEV_NUM_BE_MCS_CNTRS]; - __le32 be_ofdma_tx_nss[ATH12K_HTT_TX_PDEV_STATS_NUM_SPATIAL_STREAMS]; + __le32 be_ofdma_tx_nss[ATH12K_HTT_PDEV_STAT_NUM_SPATIAL_STREAMS]; __le32 be_ofdma_tx_bw[ATH12K_HTT_TX_PDEV_NUM_BE_BW_CNTRS]; __le32 gi[ATH12K_HTT_TX_PDEV_NUM_GI_CNTRS][ATH12K_HTT_TX_PDEV_NUM_BE_MCS_CNTRS]; __le32 be_ofdma_tx_ru_size[ATH12K_HTT_TX_RX_PDEV_NUM_BE_RU_SIZE_CNTRS]; From ea58aae8458480d6391c404e62f1a4b30462b9c3 Mon Sep 17 00:00:00 2001 From: Sidhanta Sahu Date: Tue, 5 Nov 2024 10:58:54 +0530 Subject: [PATCH 0036/1450] wifi: ath12k: Support MBSSID Control Frame Stats Add support to request MBSSID control frame stats from firmware through HTT stats type 54. These stats give information such as basic trigger, BSR trigger, multi-user RTS and uplink MUMIMO trigger within and across various BSS. Note: WCN7850 firmware version - WLAN.HMT.1.0-03427-QCAHMTSWPL_V1.0_V2.0_SILICONZ-1.15378.4 does not support HTT stats type 54. Sample output: ------------- echo 54 > /sys/kernel/debug/ath12k/pci-0000\:06\:00.0/mac0/htt_stats_type cat /sys/kernel/debug/ath12k/pci-0000\:06\:00.0/mac0/htt_stats HTT_MBSSID_CTRL_FRAME_STATS_TLV: mac_id = 0 basic_trigger_across_bss = 0 basic_trigger_within_bss = 0 bsr_trigger_across_bss = 0 bsr_trigger_within_bss = 0 mu_rts_across_bss = 0 mu_rts_within_bss = 0 ul_mumimo_trigger_across_bss = 0 ul_mumimo_trigger_within_bss = 0 Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.1.1-00214-QCAHKSWPL_SILICONZ-1 Signed-off-by: Sidhanta Sahu Signed-off-by: Roopni Devanathan Acked-by: Kalle Valo Link: https://patch.msgid.link/20241105052854.2118987-1-quic_rdevanat@quicinc.com Signed-off-by: Jeff Johnson --- .../wireless/ath/ath12k/debugfs_htt_stats.c | 43 +++++++++++++++++++ .../wireless/ath/ath12k/debugfs_htt_stats.h | 14 ++++++ 2 files changed, 57 insertions(+) diff --git a/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.c b/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.c index 92ff53767b2fc..0bc945c7a93aa 100644 --- a/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.c +++ b/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.c @@ -3296,6 +3296,45 @@ ath12k_htt_print_tx_pdev_rate_stats_be_ofdma_tlv(const void *tag_buf, u16 tag_le stats_req->buf_len = len; } +static void +ath12k_htt_print_pdev_mbssid_ctrl_frame_stats_tlv(const void *tag_buf, u16 tag_len, + struct debug_htt_stats_req *stats_req) +{ + const struct ath12k_htt_pdev_mbssid_ctrl_frame_tlv *htt_stats_buf = tag_buf; + u8 *buf = stats_req->buf; + u32 len = stats_req->buf_len; + u32 buf_len = ATH12K_HTT_STATS_BUF_SIZE; + u32 mac_id_word; + + if (tag_len < sizeof(*htt_stats_buf)) + return; + + mac_id_word = le32_to_cpu(htt_stats_buf->mac_id__word); + + len += scnprintf(buf + len, buf_len - len, "HTT_MBSSID_CTRL_FRAME_STATS_TLV:\n"); + len += scnprintf(buf + len, buf_len - len, "mac_id = %u\n", + u32_get_bits(mac_id_word, ATH12K_HTT_STATS_MAC_ID)); + len += scnprintf(buf + len, buf_len - len, "basic_trigger_across_bss = %u\n", + le32_to_cpu(htt_stats_buf->basic_trigger_across_bss)); + len += scnprintf(buf + len, buf_len - len, "basic_trigger_within_bss = %u\n", + le32_to_cpu(htt_stats_buf->basic_trigger_within_bss)); + len += scnprintf(buf + len, buf_len - len, "bsr_trigger_across_bss = %u\n", + le32_to_cpu(htt_stats_buf->bsr_trigger_across_bss)); + len += scnprintf(buf + len, buf_len - len, "bsr_trigger_within_bss = %u\n", + le32_to_cpu(htt_stats_buf->bsr_trigger_within_bss)); + len += scnprintf(buf + len, buf_len - len, "mu_rts_across_bss = %u\n", + le32_to_cpu(htt_stats_buf->mu_rts_across_bss)); + len += scnprintf(buf + len, buf_len - len, "mu_rts_within_bss = %u\n", + le32_to_cpu(htt_stats_buf->mu_rts_within_bss)); + len += scnprintf(buf + len, buf_len - len, "ul_mumimo_trigger_across_bss = %u\n", + le32_to_cpu(htt_stats_buf->ul_mumimo_trigger_across_bss)); + len += scnprintf(buf + len, buf_len - len, + "ul_mumimo_trigger_within_bss = %u\n\n", + le32_to_cpu(htt_stats_buf->ul_mumimo_trigger_within_bss)); + + stats_req->buf_len = len; +} + static int ath12k_dbg_htt_ext_stats_parse(struct ath12k_base *ab, u16 tag, u16 len, const void *tag_buf, void *user_data) @@ -3502,6 +3541,10 @@ static int ath12k_dbg_htt_ext_stats_parse(struct ath12k_base *ab, case HTT_STATS_TX_PDEV_RATE_STATS_BE_OFDMA_TAG: ath12k_htt_print_tx_pdev_rate_stats_be_ofdma_tlv(tag_buf, len, stats_req); break; + case HTT_STATS_PDEV_MBSSID_CTRL_FRAME_STATS_TAG: + ath12k_htt_print_pdev_mbssid_ctrl_frame_stats_tlv(tag_buf, len, + stats_req); + break; default: break; } diff --git a/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.h b/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.h index d199bed0a9d1a..cf3c88f8d1b24 100644 --- a/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.h +++ b/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.h @@ -142,6 +142,7 @@ enum ath12k_dbg_htt_ext_stats_type { ATH12K_DBG_HTT_EXT_STATS_SOC_ERROR = 45, ATH12K_DBG_HTT_EXT_STATS_PDEV_SCHED_ALGO = 49, ATH12K_DBG_HTT_EXT_STATS_MANDATORY_MUOFDMA = 51, + ATH12K_DGB_HTT_EXT_STATS_PDEV_MBSSID_CTRL_FRAME = 54, /* keep this last */ ATH12K_DBG_HTT_NUM_EXT_STATS, @@ -214,6 +215,7 @@ enum ath12k_dbg_htt_tlv_tag { HTT_STATS_DMAC_RESET_STATS_TAG = 155, HTT_STATS_PHY_TPC_STATS_TAG = 157, HTT_STATS_PDEV_SCHED_ALGO_OFDMA_STATS_TAG = 165, + HTT_STATS_PDEV_MBSSID_CTRL_FRAME_STATS_TAG = 176, HTT_STATS_MAX_TAG, }; @@ -1322,4 +1324,16 @@ struct ath12k_htt_tx_pdev_rate_stats_be_ofdma_tlv { __le32 be_ofdma_eht_sig_mcs[ATH12K_HTT_TX_PDEV_NUM_EHT_SIG_MCS_CNTRS]; } __packed; +struct ath12k_htt_pdev_mbssid_ctrl_frame_tlv { + __le32 mac_id__word; + __le32 basic_trigger_across_bss; + __le32 basic_trigger_within_bss; + __le32 bsr_trigger_across_bss; + __le32 bsr_trigger_within_bss; + __le32 mu_rts_across_bss; + __le32 mu_rts_within_bss; + __le32 ul_mumimo_trigger_across_bss; + __le32 ul_mumimo_trigger_within_bss; +} __packed; + #endif From 82a35723a67c29f685d7b518962154a73b7163a2 Mon Sep 17 00:00:00 2001 From: Bitterblue Smith Date: Thu, 14 Nov 2024 17:46:08 +0200 Subject: [PATCH 0037/1450] wifi: rtw88: usb: Support USB 3 with RTL8812AU Add the function to automatically switch the RTL8812AU into USB 3 mode. Signed-off-by: Bitterblue Smith Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/19cda72b-f1f1-4b69-8369-0e4376b646bf@gmail.com --- drivers/net/wireless/realtek/rtw88/usb.c | 44 ++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw88/usb.c b/drivers/net/wireless/realtek/rtw88/usb.c index 8d6db68246f1d..db1769a4b617b 100644 --- a/drivers/net/wireless/realtek/rtw88/usb.c +++ b/drivers/net/wireless/realtek/rtw88/usb.c @@ -930,6 +930,32 @@ static void rtw_usb_intf_deinit(struct rtw_dev *rtwdev, usb_set_intfdata(intf, NULL); } +static int rtw_usb_switch_mode_old(struct rtw_dev *rtwdev) +{ + struct rtw_usb *rtwusb = rtw_get_usb_priv(rtwdev); + enum usb_device_speed cur_speed = rtwusb->udev->speed; + u8 hci_opt; + + if (cur_speed == USB_SPEED_HIGH) { + hci_opt = rtw_read8(rtwdev, REG_HCI_OPT_CTRL); + + if ((hci_opt & (BIT(2) | BIT(3))) != BIT(3)) { + rtw_write8(rtwdev, REG_HCI_OPT_CTRL, 0x8); + rtw_write8(rtwdev, REG_SYS_SDIO_CTRL, 0x2); + rtw_write8(rtwdev, REG_ACLK_MON, 0x1); + rtw_write8(rtwdev, 0x3d, 0x3); + /* usb disconnect */ + rtw_write8(rtwdev, REG_SYS_PW_CTRL + 1, 0x80); + return 1; + } + } else if (cur_speed == USB_SPEED_SUPER) { + rtw_write8_clr(rtwdev, REG_SYS_SDIO_CTRL, BIT(1)); + rtw_write8_clr(rtwdev, REG_ACLK_MON, BIT(0)); + } + + return 0; +} + static int rtw_usb_switch_mode_new(struct rtw_dev *rtwdev) { enum usb_device_speed cur_speed; @@ -979,11 +1005,22 @@ static int rtw_usb_switch_mode_new(struct rtw_dev *rtwdev) return 1; } +static bool rtw_usb3_chip_old(u8 chip_id) +{ + return chip_id == RTW_CHIP_TYPE_8812A; +} + +static bool rtw_usb3_chip_new(u8 chip_id) +{ + return chip_id == RTW_CHIP_TYPE_8822C || + chip_id == RTW_CHIP_TYPE_8822B; +} + static int rtw_usb_switch_mode(struct rtw_dev *rtwdev) { u8 id = rtwdev->chip->id; - if (id != RTW_CHIP_TYPE_8822C && id != RTW_CHIP_TYPE_8822B) + if (!rtw_usb3_chip_new(id) && !rtw_usb3_chip_old(id)) return 0; if (!rtwdev->efuse.usb_mode_switch) { @@ -998,7 +1035,10 @@ static int rtw_usb_switch_mode(struct rtw_dev *rtwdev) return 0; } - return rtw_usb_switch_mode_new(rtwdev); + if (rtw_usb3_chip_old(id)) + return rtw_usb_switch_mode_old(rtwdev); + else + return rtw_usb_switch_mode_new(rtwdev); } int rtw_usb_probe(struct usb_interface *intf, const struct usb_device_id *id) From ce5dea83ee8f945203144fb891fdcb978216e45a Mon Sep 17 00:00:00 2001 From: Bitterblue Smith Date: Thu, 14 Nov 2024 17:48:09 +0200 Subject: [PATCH 0038/1450] wifi: rtw88: usb: Enable RX aggregation for 8821au/8812au USB RX aggregation improves the RX speed on certain ARM systems, like the NanoPi NEO Core2. With RTL8811AU, before: 30 Mbps, after: 224 Mbps. The out-of-tree driver uses aggregation size of 7 in USB 3 mode, but that doesn't work here. rtw88 advertises support for receiving AMSDU in AMPDU, so the AP sends larger frames, up to ~5100 bytes. With a size of 7 RTL8812AU frequently tries to aggregate more frames than will fit in 32768 bytes. Use a size of 6 instead. Signed-off-by: Bitterblue Smith Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/63012163-a425-4b15-b830-43f279c06b73@gmail.com --- drivers/net/wireless/realtek/rtw88/usb.c | 28 ++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/drivers/net/wireless/realtek/rtw88/usb.c b/drivers/net/wireless/realtek/rtw88/usb.c index db1769a4b617b..be193c7add77a 100644 --- a/drivers/net/wireless/realtek/rtw88/usb.c +++ b/drivers/net/wireless/realtek/rtw88/usb.c @@ -789,6 +789,30 @@ static void rtw_usb_dynamic_rx_agg_v1(struct rtw_dev *rtwdev, bool enable) rtw_write16(rtwdev, REG_RXDMA_AGG_PG_TH, val16); } +static void rtw_usb_dynamic_rx_agg_v2(struct rtw_dev *rtwdev, bool enable) +{ + struct rtw_usb *rtwusb = rtw_get_usb_priv(rtwdev); + u8 size, timeout; + u16 val16; + + if (!enable) { + size = 0x0; + timeout = 0x1; + } else if (rtwusb->udev->speed == USB_SPEED_SUPER) { + size = 0x6; + timeout = 0x1a; + } else { + size = 0x5; + timeout = 0x20; + } + + val16 = u16_encode_bits(size, BIT_RXDMA_AGG_PG_TH) | + u16_encode_bits(timeout, BIT_DMA_AGG_TO_V1); + + rtw_write16(rtwdev, REG_RXDMA_AGG_PG_TH, val16); + rtw_write8_set(rtwdev, REG_TXDMA_PQ_MAP, BIT_RXDMA_AGG_EN); +} + static void rtw_usb_dynamic_rx_agg(struct rtw_dev *rtwdev, bool enable) { switch (rtwdev->chip->id) { @@ -797,6 +821,10 @@ static void rtw_usb_dynamic_rx_agg(struct rtw_dev *rtwdev, bool enable) case RTW_CHIP_TYPE_8821C: rtw_usb_dynamic_rx_agg_v1(rtwdev, enable); break; + case RTW_CHIP_TYPE_8821A: + case RTW_CHIP_TYPE_8812A: + rtw_usb_dynamic_rx_agg_v2(rtwdev, enable); + break; case RTW_CHIP_TYPE_8723D: /* Doesn't like aggregation. */ break; From 1cfa6d4e5bd9bfb15d165d8d843163363929ba1b Mon Sep 17 00:00:00 2001 From: P Praneesh Date: Tue, 19 Nov 2024 22:15:16 +0530 Subject: [PATCH 0039/1450] wifi: ath12k: Fix endianness issue in struct hal_tlv_64_hdr struct hal_tlv_64_hdr has a 64-bit member that should be in little-endian format, but the current definition uses host byte order. Fix this by changing the definition and updating the corresponding helper functions used for the byte order conversion. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.3.1-00173-QCAHKSWPL_SILICONZ-1 Signed-off-by: P Praneesh Acked-by: Kalle Valo Acked-by: Jeff Johnson Link: https://patch.msgid.link/20241119164516.756478-1-quic_ppranees@quicinc.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/dp_rx.c | 2 +- drivers/net/wireless/ath/ath12k/hal_desc.h | 2 +- drivers/net/wireless/ath/ath12k/hal_rx.c | 12 ++++++------ 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/dp_rx.c b/drivers/net/wireless/ath/ath12k/dp_rx.c index 9ae579e505572..0fb39c174475d 100644 --- a/drivers/net/wireless/ath/ath12k/dp_rx.c +++ b/drivers/net/wireless/ath/ath12k/dp_rx.c @@ -3912,7 +3912,7 @@ void ath12k_dp_rx_process_reo_status(struct ath12k_base *ab) ath12k_hal_srng_access_begin(ab, srng); while ((hdr = ath12k_hal_srng_dst_get_next_entry(ab, srng))) { - tag = u64_get_bits(hdr->tl, HAL_SRNG_TLV_HDR_TAG); + tag = le64_get_bits(hdr->tl, HAL_SRNG_TLV_HDR_TAG); switch (tag) { case HAL_REO_GET_QUEUE_STATS_STATUS: diff --git a/drivers/net/wireless/ath/ath12k/hal_desc.h b/drivers/net/wireless/ath/ath12k/hal_desc.h index 739f73370015e..a460d432288fa 100644 --- a/drivers/net/wireless/ath/ath12k/hal_desc.h +++ b/drivers/net/wireless/ath/ath12k/hal_desc.h @@ -581,7 +581,7 @@ struct hal_tlv_hdr { #define HAL_TLV_64_HDR_LEN GENMASK(21, 10) struct hal_tlv_64_hdr { - u64 tl; + __le64 tl; u8 value[]; } __packed; diff --git a/drivers/net/wireless/ath/ath12k/hal_rx.c b/drivers/net/wireless/ath/ath12k/hal_rx.c index f7c1aaa3b5d47..ac17d6223fa7f 100644 --- a/drivers/net/wireless/ath/ath12k/hal_rx.c +++ b/drivers/net/wireless/ath/ath12k/hal_rx.c @@ -26,8 +26,8 @@ static int ath12k_hal_reo_cmd_queue_stats(struct hal_tlv_64_hdr *tlv, { struct hal_reo_get_queue_stats *desc; - tlv->tl = u32_encode_bits(HAL_REO_GET_QUEUE_STATS, HAL_TLV_HDR_TAG) | - u32_encode_bits(sizeof(*desc), HAL_TLV_HDR_LEN); + tlv->tl = le64_encode_bits(HAL_REO_GET_QUEUE_STATS, HAL_TLV_HDR_TAG) | + le64_encode_bits(sizeof(*desc), HAL_TLV_HDR_LEN); desc = (struct hal_reo_get_queue_stats *)tlv->value; memset_startat(desc, 0, queue_addr_lo); @@ -59,8 +59,8 @@ static int ath12k_hal_reo_cmd_flush_cache(struct ath12k_hal *hal, hal->current_blk_index = avail_slot; } - tlv->tl = u32_encode_bits(HAL_REO_FLUSH_CACHE, HAL_TLV_HDR_TAG) | - u32_encode_bits(sizeof(*desc), HAL_TLV_HDR_LEN); + tlv->tl = le64_encode_bits(HAL_REO_FLUSH_CACHE, HAL_TLV_HDR_TAG) | + le64_encode_bits(sizeof(*desc), HAL_TLV_HDR_LEN); desc = (struct hal_reo_flush_cache *)tlv->value; memset_startat(desc, 0, cache_addr_lo); @@ -97,8 +97,8 @@ static int ath12k_hal_reo_cmd_update_rx_queue(struct hal_tlv_64_hdr *tlv, { struct hal_reo_update_rx_queue *desc; - tlv->tl = u32_encode_bits(HAL_REO_UPDATE_RX_REO_QUEUE, HAL_TLV_HDR_TAG) | - u32_encode_bits(sizeof(*desc), HAL_TLV_HDR_LEN); + tlv->tl = le64_encode_bits(HAL_REO_UPDATE_RX_REO_QUEUE, HAL_TLV_HDR_TAG) | + le64_encode_bits(sizeof(*desc), HAL_TLV_HDR_LEN); desc = (struct hal_reo_update_rx_queue *)tlv->value; memset_startat(desc, 0, queue_addr_lo); From 6200d947f050efdba4090dfefd8a01981363d954 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Barnab=C3=A1s=20Cz=C3=A9m=C3=A1n?= Date: Mon, 4 Nov 2024 21:00:35 +0100 Subject: [PATCH 0040/1450] wifi: wcn36xx: fix channel survey memory allocation size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KASAN reported a memory allocation issue in wcn->chan_survey due to incorrect size calculation. This commit uses kcalloc to allocate memory for wcn->chan_survey, ensuring proper initialization and preventing the use of uninitialized values when there are no frames on the channel. Fixes: 29696e0aa413 ("wcn36xx: Track SNR and RSSI for each RX frame") Signed-off-by: Barnabás Czémán Acked-by: Loic Poulain Reviewed-by: Bryan O'Donoghue Link: https://patch.msgid.link/20241104-wcn36xx-memory-allocation-v1-1-5ec901cf37b6@mainlining.org Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/wcn36xx/main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/wcn36xx/main.c b/drivers/net/wireless/ath/wcn36xx/main.c index 408776562a7e5..cd36cab6db75d 100644 --- a/drivers/net/wireless/ath/wcn36xx/main.c +++ b/drivers/net/wireless/ath/wcn36xx/main.c @@ -1590,7 +1590,10 @@ static int wcn36xx_probe(struct platform_device *pdev) } n_channels = wcn_band_2ghz.n_channels + wcn_band_5ghz.n_channels; - wcn->chan_survey = devm_kmalloc(wcn->dev, n_channels, GFP_KERNEL); + wcn->chan_survey = devm_kcalloc(wcn->dev, + n_channels, + sizeof(struct wcn36xx_chan_survey), + GFP_KERNEL); if (!wcn->chan_survey) { ret = -ENOMEM; goto out_wq; From 733a8c69ded704616b864d30d2531d090ee7a57e Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Mon, 11 Nov 2024 10:01:49 +0300 Subject: [PATCH 0041/1450] wifi: ath11k: cleanup struct ath11k_vif Remove set but otherwise unused 'tx_seq_no' member of 'struct ath11k_vif', adjust 'ath11k_control_beaconing()' accordingly. This field was actually unused since an initial commit of the driver. Compile tested only. Signed-off-by: Dmitry Antipov Acked-by: Kalle Valo Link: https://patch.msgid.link/20241111070152.85140-1-dmantipov@yandex.ru Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath11k/core.h | 1 - drivers/net/wireless/ath/ath11k/mac.c | 2 -- 2 files changed, 3 deletions(-) diff --git a/drivers/net/wireless/ath/ath11k/core.h b/drivers/net/wireless/ath/ath11k/core.h index 09c37e19a1680..5ab1fdd211446 100644 --- a/drivers/net/wireless/ath/ath11k/core.h +++ b/drivers/net/wireless/ath/ath11k/core.h @@ -370,7 +370,6 @@ struct ath11k_vif { struct ath11k *ar; struct ieee80211_vif *vif; - u16 tx_seq_no; struct wmi_wmm_params_all_arg wmm_params; struct list_head list; union { diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c index e6acbff067496..9757ac4aae50a 100644 --- a/drivers/net/wireless/ath/ath11k/mac.c +++ b/drivers/net/wireless/ath/ath11k/mac.c @@ -1697,8 +1697,6 @@ static void ath11k_control_beaconing(struct ath11k_vif *arvif, return; } - arvif->tx_seq_no = 0x1000; - arvif->aid = 0; ether_addr_copy(arvif->bssid, info->bssid); From 95e5de4aae8ca1af851fc922a854bbe822bf2dd4 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Mon, 11 Nov 2024 10:01:50 +0300 Subject: [PATCH 0042/1450] wifi: ath11k: cleanup struct ath11k_reg_tpc_power_info Remove unused 'ap_constraint_power' field of 'struct ath11k_reg_tpc_power_info' and adjust related comment. Compile tested only. Fixes: 6f4e235be655 ("wifi: ath11k: add parse of transmit power envelope element") Signed-off-by: Dmitry Antipov Acked-by: Kalle Valo Link: https://patch.msgid.link/20241111070152.85140-2-dmantipov@yandex.ru Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath11k/core.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/wireless/ath/ath11k/core.h b/drivers/net/wireless/ath/ath11k/core.h index 5ab1fdd211446..b1440a70a886b 100644 --- a/drivers/net/wireless/ath/ath11k/core.h +++ b/drivers/net/wireless/ath/ath11k/core.h @@ -340,7 +340,6 @@ struct ath11k_chan_power_info { * @ap_power_type: type of power (SP/LPI/VLP) * @num_pwr_levels: number of power levels * @reg_max: Array of maximum TX power (dBm) per PSD value - * @ap_constraint_power: AP constraint power (dBm) * @tpe: TPE values processed from TPE IE * @chan_power_info: power info to send to firmware */ @@ -350,7 +349,6 @@ struct ath11k_reg_tpc_power_info { enum wmi_reg_6ghz_ap_type ap_power_type; u8 num_pwr_levels; u8 reg_max[ATH11K_NUM_PWR_LEVELS]; - u8 ap_constraint_power; s8 tpe[ATH11K_NUM_PWR_LEVELS]; struct ath11k_chan_power_info chan_power_info[ATH11K_NUM_PWR_LEVELS]; }; From 93962446ef907cb169b089d0ff3f356e7ce004ab Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Mon, 11 Nov 2024 10:01:51 +0300 Subject: [PATCH 0043/1450] wifi: ath11k: cleanup struct ath11k_mon_data Remove initialized but otherwise unused 'rx_status_q' member of 'struct ath11k_mon_data' and adjust 'ath11k_dp_rx_pdev_mon_status_attach' accordingly. Compile tested only. Fixes: 67a9d399fcb0 ("ath11k: enable RX PPDU stats in monitor co-exist mode") Signed-off-by: Dmitry Antipov Acked-by: Kalle Valo Link: https://patch.msgid.link/20241111070152.85140-3-dmantipov@yandex.ru Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath11k/dp.h | 1 - drivers/net/wireless/ath/ath11k/dp_rx.c | 2 -- 2 files changed, 3 deletions(-) diff --git a/drivers/net/wireless/ath/ath11k/dp.h b/drivers/net/wireless/ath/ath11k/dp.h index 65d2bc0687c88..f777314db8b36 100644 --- a/drivers/net/wireless/ath/ath11k/dp.h +++ b/drivers/net/wireless/ath/ath11k/dp.h @@ -165,7 +165,6 @@ struct ath11k_mon_data { struct ath11k_pdev_mon_stats rx_mon_stats; /* lock for monitor data */ spinlock_t mon_lock; - struct sk_buff_head rx_status_q; }; struct ath11k_pdev_dp { diff --git a/drivers/net/wireless/ath/ath11k/dp_rx.c b/drivers/net/wireless/ath/ath11k/dp_rx.c index 176bbc5d95a62..238b0e5e92a0e 100644 --- a/drivers/net/wireless/ath/ath11k/dp_rx.c +++ b/drivers/net/wireless/ath/ath11k/dp_rx.c @@ -5703,8 +5703,6 @@ static int ath11k_dp_rx_pdev_mon_status_attach(struct ath11k *ar) struct ath11k_pdev_dp *dp = &ar->dp; struct ath11k_mon_data *pmon = (struct ath11k_mon_data *)&dp->mon_data; - skb_queue_head_init(&pmon->rx_status_q); - pmon->mon_ppdu_status = DP_PPDU_STATUS_START; memset(&pmon->rx_mon_stats, 0, From 8f5e8e7efb135fc648abbb572bd86d0c96819eaf Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Mon, 11 Nov 2024 10:01:52 +0300 Subject: [PATCH 0044/1450] wifi: ath11k: miscellaneous spelling fixes Correct spelling here and there as suggested by codespell. Signed-off-by: Dmitry Antipov Acked-by: Kalle Valo Link: https://patch.msgid.link/20241111070152.85140-4-dmantipov@yandex.ru Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath11k/hal.h | 6 +++--- drivers/net/wireless/ath/ath11k/mac.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/ath/ath11k/hal.h b/drivers/net/wireless/ath/ath11k/hal.h index dc8bbe0730170..601542410c752 100644 --- a/drivers/net/wireless/ath/ath11k/hal.h +++ b/drivers/net/wireless/ath/ath11k/hal.h @@ -700,7 +700,7 @@ enum hal_rx_buf_return_buf_manager { #define HAL_REO_CMD_FLG_UNBLK_RESOURCE BIT(7) #define HAL_REO_CMD_FLG_UNBLK_CACHE BIT(8) -/* Should be matching with HAL_REO_UPD_RX_QUEUE_INFO0_UPD_* feilds */ +/* Should be matching with HAL_REO_UPD_RX_QUEUE_INFO0_UPD_* fields */ #define HAL_REO_CMD_UPD0_RX_QUEUE_NUM BIT(8) #define HAL_REO_CMD_UPD0_VLD BIT(9) #define HAL_REO_CMD_UPD0_ALDC BIT(10) @@ -725,7 +725,7 @@ enum hal_rx_buf_return_buf_manager { #define HAL_REO_CMD_UPD0_PN_VALID BIT(29) #define HAL_REO_CMD_UPD0_PN BIT(30) -/* Should be matching with HAL_REO_UPD_RX_QUEUE_INFO1_* feilds */ +/* Should be matching with HAL_REO_UPD_RX_QUEUE_INFO1_* fields */ #define HAL_REO_CMD_UPD1_VLD BIT(16) #define HAL_REO_CMD_UPD1_ALDC GENMASK(18, 17) #define HAL_REO_CMD_UPD1_DIS_DUP_DETECTION BIT(19) @@ -741,7 +741,7 @@ enum hal_rx_buf_return_buf_manager { #define HAL_REO_CMD_UPD1_PN_HANDLE_ENABLE BIT(30) #define HAL_REO_CMD_UPD1_IGNORE_AMPDU_FLG BIT(31) -/* Should be matching with HAL_REO_UPD_RX_QUEUE_INFO2_* feilds */ +/* Should be matching with HAL_REO_UPD_RX_QUEUE_INFO2_* fields */ #define HAL_REO_CMD_UPD2_SVLD BIT(10) #define HAL_REO_CMD_UPD2_SSN GENMASK(22, 11) #define HAL_REO_CMD_UPD2_SEQ_2K_ERR BIT(23) diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c index 9757ac4aae50a..31ae9b384a292 100644 --- a/drivers/net/wireless/ath/ath11k/mac.c +++ b/drivers/net/wireless/ath/ath11k/mac.c @@ -2228,7 +2228,7 @@ static void ath11k_peer_assoc_h_vht(struct ath11k *ar, __le16_to_cpu(vht_cap->vht_mcs.tx_mcs_map), vht_mcs_mask); /* In IPQ8074 platform, VHT mcs rate 10 and 11 is enabled by default. - * VHT mcs rate 10 and 11 is not suppoerted in 11ac standard. + * VHT mcs rate 10 and 11 is not supported in 11ac standard. * so explicitly disable the VHT MCS rate 10 and 11 in 11ac mode. */ arg->tx_mcs_set &= ~IEEE80211_VHT_MCS_SUPPORT_0_11_MASK; @@ -6950,7 +6950,7 @@ static void ath11k_mac_op_remove_interface(struct ieee80211_hw *hw, /* Recalc txpower for remaining vdev */ ath11k_mac_txpower_recalc(ar); - /* TODO: recal traffic pause state based on the available vdevs */ + /* TODO: recalc traffic pause state based on the available vdevs */ mutex_unlock(&ar->conf_mutex); } From e238638e6f24109c0c7639f4a2db023388bd1b76 Mon Sep 17 00:00:00 2001 From: Miaoqing Pan Date: Thu, 31 Oct 2024 08:05:41 +0800 Subject: [PATCH 0045/1450] wifi: ath11k: add support for QCA6698AQ QCA6698AQ IP core is the same as WCN6855 hw2.1, they share the same PCI device ID, the same major and minor version numbers, the same register address, and same HAL descriptors, etc. The most significant difference is that QCA6698AQ has different RF, IPA, thermal, etc. Follow the approach done in commit 5dc9d1a55e95 ("wifi: ath11k: add support for QCA2066"), enumerate the subversion number to identify the specific card. Tested-on: QCA6698AQ hw2.1 PCI WLAN.HSP.1.1-04479-QCAHSPSWPL_V1_V2_SILICONZ_IOE-1 Signed-off-by: Miaoqing Pan Acked-by: Kalle Valo Acked-by: Jeff Johnson Link: https://patch.msgid.link/20241031000541.3331606-1-quic_miaoqing@quicinc.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath11k/core.c | 87 ++++++++++++++++++++++++++ drivers/net/wireless/ath/ath11k/core.h | 1 + drivers/net/wireless/ath/ath11k/mhi.c | 1 + drivers/net/wireless/ath/ath11k/pci.c | 3 + drivers/net/wireless/ath/ath11k/pcic.c | 13 +++- 5 files changed, 104 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath11k/core.c b/drivers/net/wireless/ath/ath11k/core.c index a9aefb1a705d7..c576bbba52bf1 100644 --- a/drivers/net/wireless/ath/ath11k/core.c +++ b/drivers/net/wireless/ath/ath11k/core.c @@ -811,6 +811,93 @@ static const struct ath11k_hw_params ath11k_hw_params[] = { .end = 0x0177ffff, }, + .tcl_ring_retry = true, + .tx_ring_size = DP_TCL_DATA_RING_SIZE, + .smp2p_wow_exit = false, + .support_fw_mac_sequence = true, + .support_dual_stations = true, + }, + { + .name = "qca6698aq hw2.1", + .hw_rev = ATH11K_HW_QCA6698AQ_HW21, + .fw = { + .dir = "QCA6698AQ/hw2.1", + .board_size = 256 * 1024, + .cal_offset = 128 * 1024, + }, + .max_radios = 3, + .bdf_addr = 0x4B0C0000, + .hw_ops = &wcn6855_ops, + .ring_mask = &ath11k_hw_ring_mask_qca6390, + .internal_sleep_clock = true, + .regs = &wcn6855_regs, + .qmi_service_ins_id = ATH11K_QMI_WLFW_SERVICE_INS_ID_V01_QCA6390, + .host_ce_config = ath11k_host_ce_config_qca6390, + .ce_count = 9, + .target_ce_config = ath11k_target_ce_config_wlan_qca6390, + .target_ce_count = 9, + .svc_to_ce_map = ath11k_target_service_to_ce_map_wlan_qca6390, + .svc_to_ce_map_len = 14, + .single_pdev_only = true, + .rxdma1_enable = false, + .num_rxdma_per_pdev = 2, + .rx_mac_buf_ring = true, + .vdev_start_delay = true, + .htt_peer_map_v2 = false, + + .spectral = { + .fft_sz = 0, + .fft_pad_sz = 0, + .summary_pad_sz = 0, + .fft_hdr_len = 0, + .max_fft_bins = 0, + .fragment_160mhz = false, + }, + + .interface_modes = BIT(NL80211_IFTYPE_STATION) | + BIT(NL80211_IFTYPE_AP) | + BIT(NL80211_IFTYPE_P2P_DEVICE) | + BIT(NL80211_IFTYPE_P2P_CLIENT) | + BIT(NL80211_IFTYPE_P2P_GO), + .supports_monitor = false, + .supports_shadow_regs = true, + .idle_ps = true, + .supports_sta_ps = true, + .coldboot_cal_mm = false, + .coldboot_cal_ftm = false, + .cbcal_restart_fw = false, + .fw_mem_mode = 0, + .num_vdevs = 2 + 1, + .num_peers = 512, + .supports_suspend = true, + .hal_desc_sz = sizeof(struct hal_rx_desc_wcn6855), + .supports_regdb = true, + .fix_l1ss = false, + .credit_flow = true, + .max_tx_ring = DP_TCL_NUM_RING_MAX_QCA6390, + .hal_params = &ath11k_hw_hal_params_qca6390, + .supports_dynamic_smps_6ghz = false, + .alloc_cacheable_memory = false, + .supports_rssi_stats = true, + .fw_wmi_diag_event = true, + .current_cc_support = true, + .dbr_debug_support = false, + .global_reset = true, + .bios_sar_capa = &ath11k_hw_sar_capa_wcn6855, + .m3_fw_support = true, + .fixed_bdf_addr = false, + .fixed_mem_region = false, + .static_window_map = false, + .hybrid_bus_type = false, + .fixed_fw_mem = false, + .support_off_channel_tx = true, + .supports_multi_bssid = true, + + .sram_dump = { + .start = 0x01400000, + .end = 0x0177ffff, + }, + .tcl_ring_retry = true, .tx_ring_size = DP_TCL_DATA_RING_SIZE, .smp2p_wow_exit = false, diff --git a/drivers/net/wireless/ath/ath11k/core.h b/drivers/net/wireless/ath/ath11k/core.h index b1440a70a886b..a9dc7fe7765a1 100644 --- a/drivers/net/wireless/ath/ath11k/core.h +++ b/drivers/net/wireless/ath/ath11k/core.h @@ -148,6 +148,7 @@ enum ath11k_hw_rev { ATH11K_HW_WCN6750_HW10, ATH11K_HW_IPQ5018_HW10, ATH11K_HW_QCA2066_HW21, + ATH11K_HW_QCA6698AQ_HW21, }; enum ath11k_firmware_mode { diff --git a/drivers/net/wireless/ath/ath11k/mhi.c b/drivers/net/wireless/ath/ath11k/mhi.c index 6974a551883fc..6e45f464a429b 100644 --- a/drivers/net/wireless/ath/ath11k/mhi.c +++ b/drivers/net/wireless/ath/ath11k/mhi.c @@ -398,6 +398,7 @@ int ath11k_mhi_register(struct ath11k_pci *ab_pci) case ATH11K_HW_WCN6855_HW20: case ATH11K_HW_WCN6855_HW21: case ATH11K_HW_QCA2066_HW21: + case ATH11K_HW_QCA6698AQ_HW21: ath11k_mhi_config = &ath11k_mhi_config_qca6390; break; default: diff --git a/drivers/net/wireless/ath/ath11k/pci.c b/drivers/net/wireless/ath/ath11k/pci.c index be9d2c69cc413..b93f04973ad79 100644 --- a/drivers/net/wireless/ath/ath11k/pci.c +++ b/drivers/net/wireless/ath/ath11k/pci.c @@ -846,6 +846,9 @@ static int ath11k_pci_probe(struct pci_dev *pdev, case 0x1019D0E1: ab->hw_rev = ATH11K_HW_QCA2066_HW21; break; + case 0x001e60e1: + ab->hw_rev = ATH11K_HW_QCA6698AQ_HW21; + break; default: ab->hw_rev = ATH11K_HW_WCN6855_HW21; } diff --git a/drivers/net/wireless/ath/ath11k/pcic.c b/drivers/net/wireless/ath/ath11k/pcic.c index debe7c5919ef0..3fe77310c71fc 100644 --- a/drivers/net/wireless/ath/ath11k/pcic.c +++ b/drivers/net/wireless/ath/ath11k/pcic.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: BSD-3-Clause-Clear /* * Copyright (c) 2019-2021 The Linux Foundation. All rights reserved. - * Copyright (c) 2021-2023 Qualcomm Innovation Center, Inc. All rights reserved. + * Copyright (c) 2021-2024 Qualcomm Innovation Center, Inc. All rights reserved. */ #include "core.h" @@ -126,6 +126,17 @@ static const struct ath11k_msi_config ath11k_msi_config[] = { }, .hw_rev = ATH11K_HW_QCA2066_HW21, }, + { + .total_vectors = 32, + .total_users = 4, + .users = (struct ath11k_msi_user[]) { + { .name = "MHI", .num_vectors = 3, .base_vector = 0 }, + { .name = "CE", .num_vectors = 10, .base_vector = 3 }, + { .name = "WAKE", .num_vectors = 1, .base_vector = 13 }, + { .name = "DP", .num_vectors = 18, .base_vector = 14 }, + }, + .hw_rev = ATH11K_HW_QCA6698AQ_HW21, + }, }; int ath11k_pcic_init_msi_config(struct ath11k_base *ab) From 47c7ebfba30f242a24a89317c74eada47adfaa95 Mon Sep 17 00:00:00 2001 From: Roopni Devanathan Date: Mon, 18 Nov 2024 09:27:22 +0530 Subject: [PATCH 0046/1450] wifi: ath12k: Fix inappropriate use of print_array_to_buf_index() Currently in ath12k_htt_print_tx_pdev_mumimo_grp_stats_tlv() the htt_stats_buf->ul_mumimo_grp_best_usrs array is printed using print_array_to_buf_index() with a stats_index of 1. This is meant to convey the semantic that first entry in ul_mumimo_grp_best_usrs is associated with user 1. However, unlike some of the other "usr" arrays which have that semantic, ul_mumimo_grp_best_usrs does not have that semantic. Instead the first entry corresponds to user 0. Fix the issue by calling the API - print_array_to_buf(), instead of print_array_to_buf_index(). Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.0.1-00029-QCAHKSWPL_SILICONZ-1 Signed-off-by: Roopni Devanathan Acked-by: Jeff Johnson Acked-by: Kalle Valo Link: https://patch.msgid.link/20241118035722.1755373-1-quic_rdevanat@quicinc.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/debugfs_htt_stats.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.c b/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.c index 0bc945c7a93aa..d8f137bfba7b8 100644 --- a/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.c +++ b/drivers/net/wireless/ath/ath12k/debugfs_htt_stats.c @@ -2310,9 +2310,9 @@ ath12k_htt_print_tx_pdev_mumimo_grp_stats_tlv(const void *tag_buf, u16 tag_len, len += print_array_to_buf(buf, len, "ul_mumimo_grp_best_grp_size", htt_stats_buf->ul_mumimo_grp_best_grp_size, ATH12K_HTT_STATS_NUM_MAX_MUMIMO_SZ, "\n"); - len += print_array_to_buf_index(buf, len, "ul_mumimo_grp_best_num_usrs = ", 1, - htt_stats_buf->ul_mumimo_grp_best_usrs, - ATH12K_HTT_TX_NUM_AX_MUMIMO_USER_STATS, "\n"); + len += print_array_to_buf(buf, len, "ul_mumimo_grp_best_num_usrs = ", + htt_stats_buf->ul_mumimo_grp_best_usrs, + ATH12K_HTT_TX_NUM_AX_MUMIMO_USER_STATS, "\n"); len += print_array_to_buf(buf, len, "ul_mumimo_grp_tputs_observed (per bin = 300 mbps)", htt_stats_buf->ul_mumimo_grp_tputs, From 8ea1d2072ad1a9c24b326b50ebdc2c810c4b2cce Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Tue, 19 Nov 2024 07:47:38 -0800 Subject: [PATCH 0047/1450] wifi: ath11k: mark some QMI driver event helpers as noinline When compiling the ath11k driver using clang with KASAN enabled, the following warning is observed: drivers/net/wireless/ath/ath11k/qmi.c:3199:13: warning: stack frame size (1560) exceeds limit (1024) in 'ath11k_qmi_driver_event_work' [-Wframe-larger-than] This is similar to the issue found in ath12k/qmi.c that was discussed in [1] and fixed with [2]. The issue is that clang inlining can explode stack usage. Just as in ath12k, ath11k_qmi_driver_event_work() itself is a pretty lightweight function, but it dispatches to several other functions which do the real work: ath11k_qmi_driver_event_work() ath11k_qmi_event_server_arrive() ath11k_qmi_fw_ind_register_send() ath11k_qmi_host_cap_send() * ath11k_qmi_event_load_bdf() ath11k_qmi_event_mem_request() ath11k_qmi_respond_fw_mem_request() ath11k_qmi_event_load_bdf() ath11k_qmi_wlanfw_m3_info_send() * ath11k_qmi_m3_load() ath11k_qmi_process_coldboot_calibration() Of these, the two marked with * have non-trivial stack usage. Mark those functions as 'noinline_for_stack' to prevent them from being inlined in ath12k_qmi_driver_event_work(), thereby eliminating the excessive stack usage. Note that this approach is a bit more "surgical" than the ath12k approach as only the two functions with the largest stack usage are modified. Compile tested only. Link: https://msgid.link/bc214795-1c51-4cb7-922f-67d6ef98bff2@quicinc.com # [1] Link: https://patch.msgid.link/20241028-ath12k_qmi_driver_event_work-v1-1-0d532eb593fa@quicinc.com # [2] Acked-by: Kalle Valo Link: https://patch.msgid.link/20241119-ath11k-noinline-v1-1-4ec0a8aa30b2@quicinc.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath11k/qmi.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/ath/ath11k/qmi.c b/drivers/net/wireless/ath/ath11k/qmi.c index 7a22483b35cd9..5759fc5213162 100644 --- a/drivers/net/wireless/ath/ath11k/qmi.c +++ b/drivers/net/wireless/ath/ath11k/qmi.c @@ -1704,7 +1704,9 @@ static const struct qmi_elem_info qmi_wlfw_fw_init_done_ind_msg_v01_ei[] = { }, }; -static int ath11k_qmi_host_cap_send(struct ath11k_base *ab) +/* clang stack usage explodes if this is inlined */ +static noinline_for_stack +int ath11k_qmi_host_cap_send(struct ath11k_base *ab) { struct qmi_wlanfw_host_cap_req_msg_v01 req; struct qmi_wlanfw_host_cap_resp_msg_v01 resp; @@ -2570,7 +2572,9 @@ static void ath11k_qmi_m3_free(struct ath11k_base *ab) m3_mem->size = 0; } -static int ath11k_qmi_wlanfw_m3_info_send(struct ath11k_base *ab) +/* clang stack usage explodes if this is inlined */ +static noinline_for_stack +int ath11k_qmi_wlanfw_m3_info_send(struct ath11k_base *ab) { struct m3_mem_region *m3_mem = &ab->qmi.m3_mem; struct qmi_wlanfw_m3_info_req_msg_v01 req; From 500d7ec88652ba7316e7fba334754e39e3177e4a Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Tue, 19 Nov 2024 07:47:39 -0800 Subject: [PATCH 0048/1450] wifi: ath11k: mark ath11k_dp_rx_mon_mpdu_pop() as noinline When compiling the ath11k driver using clang with KASAN enabled, the following warning is observed: drivers/net/wireless/ath/ath11k/dp_rx.c:5244:5: warning: stack frame size (1304) exceeds limit (1024) in 'ath11k_dp_rx_process_mon_status' [-Wframe-larger-than] This is similar to the issue found in ath12k/qmi.c that was discussed in [1] and fixed with [2]. The issue is that clang inlining can explode stack usage. ath11k_dp_rx_process_mon_status() itself is a pretty lightweight function, but it dispatches to several other functions which do the real work: ath11k_dp_rx_process_mon_status() ath11k_dp_rx_reap_mon_status_ring() ath11k_dp_rx_mon_dest_process() ath11k_dp_rx_mon_mpdu_pop() * ath11k_dp_rx_mon_deliver() ath11k_dp_rx_mon_merg_msdus() ath11k_dp_rx_deliver_msdu() ath11k_dp_rx_update_peer_stats() Of these, only ath11k_dp_rx_mon_mpdu_pop() has non-trivial stack usage, so mark that function as 'noinline_for_stack' to prevent it from being inlined in ath11k_dp_rx_process_mon_status(), thereby eliminating the excessive stack usage. Compile tested only. Link: https://msgid.link/bc214795-1c51-4cb7-922f-67d6ef98bff2@quicinc.com # [1] Link: https://patch.msgid.link/20241028-ath12k_qmi_driver_event_work-v1-1-0d532eb593fa@quicinc.com # [2] Acked-by: Kalle Valo Link: https://patch.msgid.link/20241119-ath11k-noinline-v1-2-4ec0a8aa30b2@quicinc.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath11k/dp_rx.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/ath/ath11k/dp_rx.c b/drivers/net/wireless/ath/ath11k/dp_rx.c index 238b0e5e92a0e..8517994effce0 100644 --- a/drivers/net/wireless/ath/ath11k/dp_rx.c +++ b/drivers/net/wireless/ath/ath11k/dp_rx.c @@ -4691,11 +4691,12 @@ static void ath11k_dp_mon_get_buf_len(struct hal_rx_msdu_desc_info *info, } } -static u32 -ath11k_dp_rx_mon_mpdu_pop(struct ath11k *ar, int mac_id, - void *ring_entry, struct sk_buff **head_msdu, - struct sk_buff **tail_msdu, u32 *npackets, - u32 *ppdu_id) +/* clang stack usage explodes if this is inlined */ +static noinline_for_stack +u32 ath11k_dp_rx_mon_mpdu_pop(struct ath11k *ar, int mac_id, + void *ring_entry, struct sk_buff **head_msdu, + struct sk_buff **tail_msdu, u32 *npackets, + u32 *ppdu_id) { struct ath11k_pdev_dp *dp = &ar->dp; struct ath11k_mon_data *pmon = (struct ath11k_mon_data *)&dp->mon_data; From 4ba72ff2919cad90e1963b708ce23b92120613ff Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Tue, 19 Nov 2024 07:47:40 -0800 Subject: [PATCH 0049/1450] wifi: ath11k: mark ath11k_wow_convert_8023_to_80211() as noinline When compiling the ath11k driver using clang with KASAN enabled, the following warning is observed: drivers/net/wireless/ath/ath11k/wow.c:672:5: warning: stack frame size (1336) exceeds limit (1024) in 'ath11k_wow_op_suspend' [-Wframe-larger-than] This is similar to the issue found in ath12k/qmi.c that was discussed in [1] and fixed with [2]. The issue is that clang inlining can explode stack usage. ath11k_wow_op_suspend() itself is a pretty lightweight function, but it dispatches to several other functions which do the real work. One path in particular is: ath11k_wow_op_suspend() ath11k_wow_set_wakeups() ath11k_vif_wow_set_wakeups() ath11k_wow_convert_8023_to_80211() Of these, ath11k_wow_convert_8023_to_80211() has non-trivial stack usage, so mark it as 'noinline_for_stack' to prevent it from being inlined in ath11k_wow_op_suspend(), thereby eliminating the excessive stack usage. Compile tested only. Link: https://msgid.link/bc214795-1c51-4cb7-922f-67d6ef98bff2@quicinc.com # [1] Link: https://patch.msgid.link/20241028-ath12k_qmi_driver_event_work-v1-1-0d532eb593fa@quicinc.com # [2] Acked-by: Kalle Valo Link: https://patch.msgid.link/20241119-ath11k-noinline-v1-3-4ec0a8aa30b2@quicinc.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath11k/wow.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/ath/ath11k/wow.c b/drivers/net/wireless/ath/ath11k/wow.c index 827085a926b27..b6f08755129fd 100644 --- a/drivers/net/wireless/ath/ath11k/wow.c +++ b/drivers/net/wireless/ath/ath11k/wow.c @@ -148,8 +148,10 @@ static int ath11k_wow_cleanup(struct ath11k *ar) * 802.11: |4B|dest mac(6B)| 6B |src mac(6B)| 8B |type(2B)| body... | * +--+------------+----+-----------+---------------+-----------+ */ -static void ath11k_wow_convert_8023_to_80211(struct cfg80211_pkt_pattern *new, - const struct cfg80211_pkt_pattern *old) +/* clang stack usage explodes if this is inlined */ +static noinline_for_stack +void ath11k_wow_convert_8023_to_80211(struct cfg80211_pkt_pattern *new, + const struct cfg80211_pkt_pattern *old) { u8 hdr_8023_pattern[ETH_HLEN] = {}; u8 hdr_8023_bit_mask[ETH_HLEN] = {}; From bc7acc0bd0f94c26bc0defc902311794a3d0fae9 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Wed, 20 Nov 2024 15:31:16 -0800 Subject: [PATCH 0050/1450] of: property: fw_devlink: Do not use interrupt-parent directly commit 7f00be96f125 ("of: property: Add device link support for interrupt-parent, dmas and -gpio(s)") started adding device links for the interrupt-parent property. commit 4104ca776ba3 ("of: property: Add fw_devlink support for interrupts") and commit f265f06af194 ("of: property: Fix fw_devlink handling of interrupts/interrupts-extended") later added full support for parsing the interrupts and interrupts-extended properties, which includes looking up the node of the parent domain. This made the handler for the interrupt-parent property redundant. In fact, creating device links based solely on interrupt-parent is problematic, because it can create spurious cycles. A node may have this property without itself being an interrupt controller or consumer. For example, this property is often present in the root node or a /soc bus node to set the default interrupt parent for child nodes. However, it is incorrect for the bus to depend on the interrupt controller, as some of the bus's children may not be interrupt consumers at all or may have a different interrupt parent. Resolving these spurious dependency cycles can cause an incorrect probe order for interrupt controller drivers. This was observed on a RISC-V system with both an APLIC and IMSIC under /soc, where interrupt-parent in /soc points to the APLIC, and the APLIC msi-parent points to the IMSIC. fw_devlink found three dependency cycles and attempted to probe the APLIC before the IMSIC. After applying this patch, there were no dependency cycles and the probe order was correct. Acked-by: Marc Zyngier Cc: stable@vger.kernel.org Fixes: 4104ca776ba3 ("of: property: Add fw_devlink support for interrupts") Signed-off-by: Samuel Holland Link: https://lore.kernel.org/r/20241120233124.3649382-1-samuel.holland@sifive.com Signed-off-by: Rob Herring (Arm) --- drivers/of/property.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/of/property.c b/drivers/of/property.c index 519bf9229e613..cfc8aea002e43 100644 --- a/drivers/of/property.c +++ b/drivers/of/property.c @@ -1286,7 +1286,6 @@ DEFINE_SIMPLE_PROP(iommus, "iommus", "#iommu-cells") DEFINE_SIMPLE_PROP(mboxes, "mboxes", "#mbox-cells") DEFINE_SIMPLE_PROP(io_channels, "io-channels", "#io-channel-cells") DEFINE_SIMPLE_PROP(io_backends, "io-backends", "#io-backend-cells") -DEFINE_SIMPLE_PROP(interrupt_parent, "interrupt-parent", NULL) DEFINE_SIMPLE_PROP(dmas, "dmas", "#dma-cells") DEFINE_SIMPLE_PROP(power_domains, "power-domains", "#power-domain-cells") DEFINE_SIMPLE_PROP(hwlocks, "hwlocks", "#hwlock-cells") @@ -1432,7 +1431,6 @@ static const struct supplier_bindings of_supplier_bindings[] = { { .parse_prop = parse_mboxes, }, { .parse_prop = parse_io_channels, }, { .parse_prop = parse_io_backends, }, - { .parse_prop = parse_interrupt_parent, }, { .parse_prop = parse_dmas, .optional = true, }, { .parse_prop = parse_power_domains, }, { .parse_prop = parse_hwlocks, }, From 8e6f8bc286031291dac9ff09977e60c9a069bf80 Mon Sep 17 00:00:00 2001 From: Sriram R Date: Thu, 21 Nov 2024 17:57:59 +0200 Subject: [PATCH 0051/1450] wifi: ath12k: Add MLO station state change handling Add changes to handle multi-link station state change with proper link handling and add code changes for ML peer creation, peer deletion. In ath12k_mac_assign_link_sta() initialise all arsta fields first and only then call rcu_assign_pointer(). This is to make sure that readers don't have access to arsta which is still modified. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.3.1-00173-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Sriram R Signed-off-by: Harshitha Prem Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241121155806.1862733-2-kvalo@kernel.org --- drivers/net/wireless/ath/ath12k/core.h | 4 + drivers/net/wireless/ath/ath12k/mac.c | 225 ++++++++++++++++++++----- drivers/net/wireless/ath/ath12k/mac.h | 1 + drivers/net/wireless/ath/ath12k/peer.c | 83 ++++++++- drivers/net/wireless/ath/ath12k/peer.h | 1 + 5 files changed, 271 insertions(+), 43 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/core.h b/drivers/net/wireless/ath/ath12k/core.h index 8dbdf6818f58c..c1d5e93b679a8 100644 --- a/drivers/net/wireless/ath/ath12k/core.h +++ b/drivers/net/wireless/ath/ath12k/core.h @@ -510,7 +510,11 @@ struct ath12k_sta { struct ath12k_link_sta __rcu *link[IEEE80211_MLD_MAX_NUM_LINKS]; /* indicates bitmap of link sta created in FW */ u16 links_map; + u8 assoc_link_id; u16 ml_peer_id; + u8 num_peer; + + enum ieee80211_sta_state state; }; #define ATH12K_MIN_5G_FREQ 4150 diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index ad27a2552a2c9..d796185c84311 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -1251,7 +1251,7 @@ static int ath12k_mac_monitor_stop(struct ath12k *ar) return ret; } -static int ath12k_mac_vdev_stop(struct ath12k_link_vif *arvif) +int ath12k_mac_vdev_stop(struct ath12k_link_vif *arvif) { struct ath12k_vif *ahvif = arvif->ahvif; struct ath12k *ar = arvif->ar; @@ -4832,6 +4832,35 @@ static void ath12k_sta_rc_update_wk(struct wiphy *wiphy, struct wiphy_work *wk) } } +static void ath12k_mac_free_unassign_link_sta(struct ath12k_hw *ah, + struct ath12k_sta *ahsta, + u8 link_id) +{ + struct ath12k_link_sta *arsta; + + lockdep_assert_wiphy(ah->hw->wiphy); + + if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS)) + return; + + arsta = wiphy_dereference(ah->hw->wiphy, ahsta->link[link_id]); + if (WARN_ON(!arsta)) + return; + + ahsta->links_map &= ~BIT(link_id); + rcu_assign_pointer(ahsta->link[link_id], NULL); + synchronize_rcu(); + + if (arsta == &ahsta->deflink) { + arsta->link_id = ATH12K_INVALID_LINK_ID; + arsta->ahsta = NULL; + arsta->arvif = NULL; + return; + } + + kfree(arsta); +} + static int ath12k_mac_inc_num_stations(struct ath12k_link_vif *arvif, struct ath12k_link_sta *arsta) { @@ -4871,7 +4900,6 @@ static void ath12k_mac_station_post_remove(struct ath12k *ar, { struct ieee80211_vif *vif = ath12k_ahvif_to_vif(arvif->ahvif); struct ieee80211_sta *sta = ath12k_ahsta_to_sta(arsta->ahsta); - struct ath12k_sta *ahsta = arsta->ahsta; struct ath12k_peer *peer; lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy); @@ -4894,14 +4922,6 @@ static void ath12k_mac_station_post_remove(struct ath12k *ar, kfree(arsta->rx_stats); arsta->rx_stats = NULL; - - if (arsta->link_id < IEEE80211_MLD_MAX_NUM_LINKS) { - ahsta->links_map &= ~(BIT(arsta->link_id)); - rcu_assign_pointer(ahsta->link[arsta->link_id], NULL); - synchronize_rcu(); - arsta->link_id = ATH12K_INVALID_LINK_ID; - arsta->ahsta = NULL; - } } static int ath12k_mac_station_unauthorize(struct ath12k *ar, @@ -4977,7 +4997,7 @@ static int ath12k_mac_station_remove(struct ath12k *ar, { struct ieee80211_sta *sta = ath12k_ahsta_to_sta(arsta->ahsta); struct ath12k_vif *ahvif = arvif->ahvif; - int ret; + int ret = 0; lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy); @@ -4991,6 +5011,9 @@ static int ath12k_mac_station_remove(struct ath12k *ar, arvif->vdev_id, ret); } + if (sta->mlo) + return ret; + ath12k_dp_peer_cleanup(ar, arvif->vdev_id, sta->addr); ret = ath12k_peer_delete(ar, arvif->vdev_id, sta->addr); @@ -5003,6 +5026,10 @@ static int ath12k_mac_station_remove(struct ath12k *ar, ath12k_mac_station_post_remove(ar, arvif, arsta); + if (sta->valid_links) + ath12k_mac_free_unassign_link_sta(ahvif->ah, + arsta->ahsta, arsta->link_id); + return ret; } @@ -5114,51 +5141,112 @@ static u32 ath12k_mac_ieee80211_sta_bw_to_wmi(struct ath12k *ar, return bw; } +static int ath12k_mac_assign_link_sta(struct ath12k_hw *ah, + struct ath12k_sta *ahsta, + struct ath12k_link_sta *arsta, + struct ath12k_vif *ahvif, + u8 link_id) +{ + struct ieee80211_sta *sta = ath12k_ahsta_to_sta(ahsta); + struct ieee80211_link_sta *link_sta; + struct ath12k_link_vif *arvif; + + lockdep_assert_wiphy(ah->hw->wiphy); + + if (!arsta || link_id >= IEEE80211_MLD_MAX_NUM_LINKS) + return -EINVAL; + + arvif = wiphy_dereference(ah->hw->wiphy, ahvif->link[link_id]); + if (!arvif) + return -EINVAL; + + memset(arsta, 0, sizeof(*arsta)); + + link_sta = wiphy_dereference(ah->hw->wiphy, sta->link[link_id]); + if (!link_sta) + return -EINVAL; + + ether_addr_copy(arsta->addr, link_sta->addr); + + /* logical index of the link sta in order of creation */ + arsta->link_idx = ahsta->num_peer++; + + arsta->link_id = link_id; + ahsta->links_map |= BIT(arsta->link_id); + arsta->arvif = arvif; + arsta->ahsta = ahsta; + wiphy_work_init(&arsta->update_wk, ath12k_sta_rc_update_wk); + + rcu_assign_pointer(ahsta->link[link_id], arsta); + + return 0; +} + +static void ath12k_mac_ml_station_remove(struct ath12k_vif *ahvif, + struct ath12k_sta *ahsta) +{ + struct ieee80211_sta *sta = ath12k_ahsta_to_sta(ahsta); + struct ath12k_hw *ah = ahvif->ah; + struct ath12k_link_vif *arvif; + struct ath12k_link_sta *arsta; + unsigned long links; + struct ath12k *ar; + u8 link_id; + + lockdep_assert_wiphy(ah->hw->wiphy); + + ath12k_peer_mlo_link_peers_delete(ahvif, ahsta); + + /* validate link station removal and clear arsta links */ + links = ahsta->links_map; + for_each_set_bit(link_id, &links, IEEE80211_MLD_MAX_NUM_LINKS) { + arvif = wiphy_dereference(ah->hw->wiphy, ahvif->link[link_id]); + arsta = wiphy_dereference(ah->hw->wiphy, ahsta->link[link_id]); + if (!arvif || !arsta) + continue; + + ar = arvif->ar; + + ath12k_mac_station_post_remove(ar, arvif, arsta); + + ath12k_mac_free_unassign_link_sta(ah, ahsta, link_id); + } + + ath12k_peer_ml_delete(ah, sta); +} + static int ath12k_mac_handle_link_sta_state(struct ieee80211_hw *hw, struct ath12k_link_vif *arvif, struct ath12k_link_sta *arsta, enum ieee80211_sta_state old_state, enum ieee80211_sta_state new_state) { - struct ath12k *ar = arvif->ar; struct ieee80211_vif *vif = ath12k_ahvif_to_vif(arvif->ahvif); struct ieee80211_sta *sta = ath12k_ahsta_to_sta(arsta->ahsta); - struct ath12k_sta *ahsta = arsta->ahsta; + struct ath12k *ar = arvif->ar; int ret = 0; lockdep_assert_wiphy(hw->wiphy); + ath12k_dbg(ar->ab, ATH12K_DBG_MAC, "mac handle link %u sta %pM state %d -> %d\n", + arsta->link_id, arsta->addr, old_state, new_state); + /* IEEE80211_STA_NONE -> IEEE80211_STA_NOTEXIST: Remove the station * from driver */ if ((old_state == IEEE80211_STA_NONE && new_state == IEEE80211_STA_NOTEXIST)) { - /* ML sta needs separate handling */ - if (sta->mlo) - return 0; - ret = ath12k_mac_station_remove(ar, arvif, arsta); if (ret) { ath12k_warn(ar->ab, "Failed to remove station: %pM for VDEV: %d\n", arsta->addr, arvif->vdev_id); + goto exit; } } /* IEEE80211_STA_NOTEXIST -> IEEE80211_STA_NONE: Add new station to driver */ if (old_state == IEEE80211_STA_NOTEXIST && new_state == IEEE80211_STA_NONE) { - memset(arsta, 0, sizeof(*arsta)); - rcu_assign_pointer(ahsta->link[0], arsta); - /* TODO use appropriate link id once MLO support is added */ - arsta->link_id = ATH12K_DEFAULT_LINK_ID; - ahsta->links_map = BIT(arsta->link_id); - arsta->ahsta = ahsta; - arsta->arvif = arvif; - ether_addr_copy(arsta->addr, sta->addr); - wiphy_work_init(&arsta->update_wk, ath12k_sta_rc_update_wk); - - synchronize_rcu(); - ret = ath12k_mac_station_add(ar, arvif, arsta); if (ret) ath12k_warn(ar->ab, "Failed to add station: %pM for VDEV: %d\n", @@ -5200,6 +5288,7 @@ static int ath12k_mac_handle_link_sta_state(struct ieee80211_hw *hw, } else if (old_state == IEEE80211_STA_AUTHORIZED && new_state == IEEE80211_STA_ASSOC) { ath12k_mac_station_unauthorize(ar, arvif, arsta); + /* IEEE80211_STA_ASSOC -> IEEE80211_STA_AUTH: disassoc peer connected to * AP/mesh/ADHOC vif type. */ @@ -5214,6 +5303,7 @@ static int ath12k_mac_handle_link_sta_state(struct ieee80211_hw *hw, sta->addr); } +exit: return ret; } @@ -5225,10 +5315,12 @@ static int ath12k_mac_op_sta_state(struct ieee80211_hw *hw, { struct ath12k_vif *ahvif = ath12k_vif_to_ahvif(vif); struct ath12k_sta *ahsta = ath12k_sta_to_ahsta(sta); + struct ath12k_hw *ah = ath12k_hw_to_ah(hw); struct ath12k_link_vif *arvif; struct ath12k_link_sta *arsta; - int ret; + unsigned long valid_links; u8 link_id = 0; + int ret; lockdep_assert_wiphy(hw->wiphy); @@ -5237,32 +5329,83 @@ static int ath12k_mac_op_sta_state(struct ieee80211_hw *hw, link_id = ffs(sta->valid_links) - 1; } - /* Handle for non-ML station */ - if (!sta->mlo) { - arvif = wiphy_dereference(hw->wiphy, ahvif->link[link_id]); + /* IEEE80211_STA_NOTEXIST -> IEEE80211_STA_NONE: + * New station add received. If this is a ML station then + * ahsta->links_map will be zero and sta->valid_links will be 1. + * Assign default link to the first link sta. + */ + if (old_state == IEEE80211_STA_NOTEXIST && + new_state == IEEE80211_STA_NONE) { + memset(ahsta, 0, sizeof(*ahsta)); + arsta = &ahsta->deflink; - arsta->ahsta = ahsta; - if (WARN_ON(!arvif || !arsta)) { - ret = -EINVAL; - goto exit; + /* ML sta */ + if (sta->mlo && !ahsta->links_map && + (hweight16(sta->valid_links) == 1)) { + ret = ath12k_peer_ml_create(ah, sta); + if (ret) { + ath12k_hw_warn(ah, "unable to create ML peer for sta %pM", + sta->addr); + goto exit; + } } - /* vdev might be in deleted */ - if (WARN_ON(!arvif->ar)) { - ret = -EINVAL; + ret = ath12k_mac_assign_link_sta(ah, ahsta, arsta, ahvif, + link_id); + if (ret) { + ath12k_hw_warn(ah, "unable assign link %d for sta %pM", + link_id, sta->addr); goto exit; } + /* above arsta will get memset, hence do this after assign + * link sta + */ + if (sta->mlo) { + arsta->is_assoc_link = true; + ahsta->assoc_link_id = link_id; + } + } + + /* Handle all the other state transitions in generic way */ + valid_links = ahsta->links_map; + for_each_set_bit(link_id, &valid_links, IEEE80211_MLD_MAX_NUM_LINKS) { + arvif = wiphy_dereference(hw->wiphy, ahvif->link[link_id]); + arsta = wiphy_dereference(hw->wiphy, ahsta->link[link_id]); + /* some assumptions went wrong! */ + if (WARN_ON(!arvif || !arsta)) + continue; + + /* vdev might be in deleted */ + if (WARN_ON(!arvif->ar)) + continue; + ret = ath12k_mac_handle_link_sta_state(hw, arvif, arsta, old_state, new_state); - if (ret) + if (ret) { + ath12k_hw_warn(ah, "unable to move link sta %d of sta %pM from state %d to %d", + link_id, arsta->addr, old_state, new_state); goto exit; + } } + /* IEEE80211_STA_NONE -> IEEE80211_STA_NOTEXIST: + * Remove the station from driver (handle ML sta here since that + * needs special handling. Normal sta will be handled in generic + * handler below + */ + if (old_state == IEEE80211_STA_NONE && + new_state == IEEE80211_STA_NOTEXIST && sta->mlo) + ath12k_mac_ml_station_remove(ahvif, ahsta); + ret = 0; exit: + /* update the state if everything went well */ + if (!ret) + ahsta->state = new_state; + return ret; } diff --git a/drivers/net/wireless/ath/ath12k/mac.h b/drivers/net/wireless/ath/ath12k/mac.h index d382337ba6498..c13630ee479a1 100644 --- a/drivers/net/wireless/ath/ath12k/mac.h +++ b/drivers/net/wireless/ath/ath12k/mac.h @@ -89,5 +89,6 @@ int ath12k_mac_vif_set_keepalive(struct ath12k_link_vif *arvif, enum wmi_sta_keepalive_method method, u32 interval); u8 ath12k_mac_get_target_pdev_id(struct ath12k *ar); +int ath12k_mac_vdev_stop(struct ath12k_link_vif *arvif); #endif diff --git a/drivers/net/wireless/ath/ath12k/peer.c b/drivers/net/wireless/ath/ath12k/peer.c index 0e86847edd6e1..ffbc1265ccc1b 100644 --- a/drivers/net/wireless/ath/ath12k/peer.c +++ b/drivers/net/wireless/ath/ath12k/peer.c @@ -264,8 +264,9 @@ int ath12k_wait_for_peer_delete_done(struct ath12k *ar, u32 vdev_id, return 0; } -int ath12k_peer_delete(struct ath12k *ar, u32 vdev_id, u8 *addr) +static int ath12k_peer_delete_send(struct ath12k *ar, u32 vdev_id, const u8 *addr) { + struct ath12k_base *ab = ar->ab; int ret; lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy); @@ -274,12 +275,25 @@ int ath12k_peer_delete(struct ath12k *ar, u32 vdev_id, u8 *addr) ret = ath12k_wmi_send_peer_delete_cmd(ar, addr, vdev_id); if (ret) { - ath12k_warn(ar->ab, + ath12k_warn(ab, "failed to delete peer vdev_id %d addr %pM ret %d\n", vdev_id, addr, ret); return ret; } + return 0; +} + +int ath12k_peer_delete(struct ath12k *ar, u32 vdev_id, u8 *addr) +{ + int ret; + + lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy); + + ret = ath12k_peer_delete_send(ar, vdev_id, addr); + if (ret) + return ret; + ret = ath12k_wait_for_peer_delete_done(ar, vdev_id, addr); if (ret) return ret; @@ -456,3 +470,68 @@ int ath12k_peer_ml_delete(struct ath12k_hw *ah, struct ieee80211_sta *sta) return 0; } + +int ath12k_peer_mlo_link_peers_delete(struct ath12k_vif *ahvif, struct ath12k_sta *ahsta) +{ + struct ieee80211_sta *sta = ath12k_ahsta_to_sta(ahsta); + struct ath12k_hw *ah = ahvif->ah; + struct ath12k_link_vif *arvif; + struct ath12k_link_sta *arsta; + unsigned long links; + struct ath12k *ar; + int ret, err_ret = 0; + u8 link_id; + + lockdep_assert_wiphy(ah->hw->wiphy); + + if (!sta->mlo) + return -EINVAL; + + /* FW expects delete of all link peers at once before waiting for reception + * of peer unmap or delete responses + */ + links = ahsta->links_map; + for_each_set_bit(link_id, &links, IEEE80211_MLD_MAX_NUM_LINKS) { + arvif = wiphy_dereference(ah->hw->wiphy, ahvif->link[link_id]); + arsta = wiphy_dereference(ah->hw->wiphy, ahsta->link[link_id]); + if (!arvif || !arsta) + continue; + + ar = arvif->ar; + if (!ar) + continue; + + ath12k_dp_peer_cleanup(ar, arvif->vdev_id, arsta->addr); + + ret = ath12k_peer_delete_send(ar, arvif->vdev_id, arsta->addr); + if (ret) { + ath12k_warn(ar->ab, + "failed to delete peer vdev_id %d addr %pM ret %d\n", + arvif->vdev_id, arsta->addr, ret); + err_ret = ret; + continue; + } + } + + /* Ensure all link peers are deleted and unmapped */ + links = ahsta->links_map; + for_each_set_bit(link_id, &links, IEEE80211_MLD_MAX_NUM_LINKS) { + arvif = wiphy_dereference(ah->hw->wiphy, ahvif->link[link_id]); + arsta = wiphy_dereference(ah->hw->wiphy, ahsta->link[link_id]); + if (!arvif || !arsta) + continue; + + ar = arvif->ar; + if (!ar) + continue; + + ret = ath12k_wait_for_peer_delete_done(ar, arvif->vdev_id, arsta->addr); + if (ret) { + err_ret = ret; + continue; + } + ar->num_peers--; + } + + return err_ret; +} diff --git a/drivers/net/wireless/ath/ath12k/peer.h b/drivers/net/wireless/ath/ath12k/peer.h index c28aca5d88a04..e398d5a3fdc8b 100644 --- a/drivers/net/wireless/ath/ath12k/peer.h +++ b/drivers/net/wireless/ath/ath12k/peer.h @@ -78,5 +78,6 @@ bool ath12k_peer_exist_by_vdev_id(struct ath12k_base *ab, int vdev_id); struct ath12k_peer *ath12k_peer_find_by_ast(struct ath12k_base *ab, int ast_hash); int ath12k_peer_ml_create(struct ath12k_hw *ah, struct ieee80211_sta *sta); int ath12k_peer_ml_delete(struct ath12k_hw *ah, struct ieee80211_sta *sta); +int ath12k_peer_mlo_link_peers_delete(struct ath12k_vif *ahvif, struct ath12k_sta *ahsta); #endif /* _PEER_H_ */ From a27fa6148dacc79451e523c2694bc0a673b1be05 Mon Sep 17 00:00:00 2001 From: Sriram R Date: Thu, 21 Nov 2024 17:58:00 +0200 Subject: [PATCH 0052/1450] wifi: ath12k: support change_sta_links() mac80211 op Add ath12k_mac_op_change_sta_links() for adding and removing link station. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.3.1-00173-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Sriram R Signed-off-by: Harshitha Prem Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241121155806.1862733-3-kvalo@kernel.org --- drivers/net/wireless/ath/ath12k/mac.c | 97 ++++++++++++++++++++++++++- 1 file changed, 96 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index d796185c84311..d92a5e0afe2e4 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -5554,6 +5554,101 @@ static void ath12k_mac_op_sta_rc_update(struct ieee80211_hw *hw, rcu_read_unlock(); } +static struct ath12k_link_sta *ath12k_mac_alloc_assign_link_sta(struct ath12k_hw *ah, + struct ath12k_sta *ahsta, + struct ath12k_vif *ahvif, + u8 link_id) +{ + struct ath12k_link_sta *arsta; + int ret; + + lockdep_assert_wiphy(ah->hw->wiphy); + + if (link_id >= IEEE80211_MLD_MAX_NUM_LINKS) + return NULL; + + arsta = wiphy_dereference(ah->hw->wiphy, ahsta->link[link_id]); + if (arsta) + return NULL; + + arsta = kmalloc(sizeof(*arsta), GFP_KERNEL); + if (!arsta) + return NULL; + + ret = ath12k_mac_assign_link_sta(ah, ahsta, arsta, ahvif, link_id); + if (ret) { + kfree(arsta); + return NULL; + } + + return arsta; +} + +static int ath12k_mac_op_change_sta_links(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_sta *sta, + u16 old_links, u16 new_links) +{ + struct ath12k_vif *ahvif = ath12k_vif_to_ahvif(vif); + struct ath12k_sta *ahsta = ath12k_sta_to_ahsta(sta); + struct ath12k_hw *ah = hw->priv; + struct ath12k_link_vif *arvif; + struct ath12k_link_sta *arsta; + unsigned long valid_links; + struct ath12k *ar; + u8 link_id; + int ret; + + lockdep_assert_wiphy(hw->wiphy); + + if (!sta->valid_links) + return -EINVAL; + + /* Firmware does not support removal of one of link stas. All sta + * would be removed during ML STA delete in sta_state(), hence link + * sta removal is not handled here. + */ + if (new_links < old_links) + return 0; + + if (ahsta->ml_peer_id == ATH12K_MLO_PEER_ID_INVALID) { + ath12k_hw_warn(ah, "unable to add link for ml sta %pM", sta->addr); + return -EINVAL; + } + + /* this op is expected only after initial sta insertion with default link */ + if (WARN_ON(ahsta->links_map == 0)) + return -EINVAL; + + valid_links = new_links; + for_each_set_bit(link_id, &valid_links, IEEE80211_MLD_MAX_NUM_LINKS) { + if (ahsta->links_map & BIT(link_id)) + continue; + + arvif = wiphy_dereference(hw->wiphy, ahvif->link[link_id]); + arsta = ath12k_mac_alloc_assign_link_sta(ah, ahsta, ahvif, link_id); + + if (!arvif || !arsta) { + ath12k_hw_warn(ah, "Failed to alloc/assign link sta"); + continue; + } + + ar = arvif->ar; + if (!ar) + continue; + + ret = ath12k_mac_station_add(ar, arvif, arsta); + if (ret) { + ath12k_warn(ar->ab, "Failed to add station: %pM for VDEV: %d\n", + arsta->addr, arvif->vdev_id); + ath12k_mac_free_unassign_link_sta(ah, ahsta, link_id); + return ret; + } + } + + return 0; +} + static int ath12k_conf_tx_uapsd(struct ath12k_link_vif *arvif, u16 ac, bool enable) { @@ -9604,7 +9699,7 @@ static const struct ieee80211_ops ath12k_ops = { .sta_statistics = ath12k_mac_op_sta_statistics, .remain_on_channel = ath12k_mac_op_remain_on_channel, .cancel_remain_on_channel = ath12k_mac_op_cancel_remain_on_channel, - + .change_sta_links = ath12k_mac_op_change_sta_links, #ifdef CONFIG_PM .suspend = ath12k_wow_op_suspend, .resume = ath12k_wow_op_resume, From ea4192553850cc6b46d5676a9514f759ef3dee0d Mon Sep 17 00:00:00 2001 From: Sriram R Date: Thu, 21 Nov 2024 17:58:01 +0200 Subject: [PATCH 0053/1450] wifi: ath12k: add primary link for data path operations In case of Multi-link operation, data path peer setup and tid setup should be done only for primary link of multi-link station. Add changes to introduce primary link is peer. Currently, association link will be considered as primary link. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.3.1-00173-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Sriram R Signed-off-by: Harshitha Prem Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241121155806.1862733-4-kvalo@kernel.org --- drivers/net/wireless/ath/ath12k/dp.c | 5 +++++ drivers/net/wireless/ath/ath12k/dp_rx.c | 10 ++++++++++ drivers/net/wireless/ath/ath12k/peer.c | 23 +++++++++++++++++++++++ drivers/net/wireless/ath/ath12k/peer.h | 8 ++++++++ 4 files changed, 46 insertions(+) diff --git a/drivers/net/wireless/ath/ath12k/dp.c b/drivers/net/wireless/ath/ath12k/dp.c index c99e9ceb1a6e8..23326e2dfe8d2 100644 --- a/drivers/net/wireless/ath/ath12k/dp.c +++ b/drivers/net/wireless/ath/ath12k/dp.c @@ -41,6 +41,11 @@ void ath12k_dp_peer_cleanup(struct ath12k *ar, int vdev_id, const u8 *addr) return; } + if (!peer->primary_link) { + spin_unlock_bh(&ab->base_lock); + return; + } + ath12k_dp_rx_peer_tid_cleanup(ar, peer); crypto_free_shash(peer->tfm_mmic); peer->dp_setup_done = false; diff --git a/drivers/net/wireless/ath/ath12k/dp_rx.c b/drivers/net/wireless/ath/ath12k/dp_rx.c index 0fb39c174475d..da3ebdf094c3c 100644 --- a/drivers/net/wireless/ath/ath12k/dp_rx.c +++ b/drivers/net/wireless/ath/ath12k/dp_rx.c @@ -940,6 +940,11 @@ int ath12k_dp_rx_peer_tid_setup(struct ath12k *ar, const u8 *peer_mac, int vdev_ return -ENOENT; } + if (!peer->primary_link) { + spin_unlock_bh(&ab->base_lock); + return 0; + } + if (ab->hw_params->reoq_lut_support && !dp->reoq_lut.vaddr) { spin_unlock_bh(&ab->base_lock); ath12k_warn(ab, "reo qref table is not setup\n"); @@ -2781,6 +2786,11 @@ int ath12k_dp_rx_peer_frag_setup(struct ath12k *ar, const u8 *peer_mac, int vdev return -ENOENT; } + if (!peer->primary_link) { + spin_unlock_bh(&ab->base_lock); + return 0; + } + for (i = 0; i <= IEEE80211_NUM_TIDS; i++) { rx_tid = &peer->rx_tid[i]; rx_tid->ab = ab; diff --git a/drivers/net/wireless/ath/ath12k/peer.c b/drivers/net/wireless/ath/ath12k/peer.c index ffbc1265ccc1b..25905498e8fb3 100644 --- a/drivers/net/wireless/ath/ath12k/peer.c +++ b/drivers/net/wireless/ath/ath12k/peer.c @@ -313,7 +313,11 @@ int ath12k_peer_create(struct ath12k *ar, struct ath12k_link_vif *arvif, struct ath12k_wmi_peer_create_arg *arg) { struct ieee80211_vif *vif = ath12k_ahvif_to_vif(arvif->ahvif); + struct ath12k_link_sta *arsta; + u8 link_id = arvif->link_id; struct ath12k_peer *peer; + struct ath12k_sta *ahsta; + u16 ml_peer_id; int ret; lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy); @@ -379,6 +383,25 @@ int ath12k_peer_create(struct ath12k *ar, struct ath12k_link_vif *arvif, arvif->ast_idx = peer->hw_peer_id; } + if (sta) { + ahsta = ath12k_sta_to_ahsta(sta); + arsta = wiphy_dereference(ath12k_ar_to_hw(ar)->wiphy, + ahsta->link[link_id]); + + /* Fill ML info into created peer */ + if (sta->mlo) { + ml_peer_id = ahsta->ml_peer_id; + peer->ml_id = ml_peer_id | ATH12K_PEER_ML_ID_VALID; + ether_addr_copy(peer->ml_addr, sta->addr); + + /* the assoc link is considered primary for now */ + peer->primary_link = arsta->is_assoc_link; + } else { + peer->ml_id = ATH12K_MLO_PEER_ID_INVALID; + peer->primary_link = true; + } + } + peer->sec_type = HAL_ENCRYPT_TYPE_OPEN; peer->sec_type_grp = HAL_ENCRYPT_TYPE_OPEN; diff --git a/drivers/net/wireless/ath/ath12k/peer.h b/drivers/net/wireless/ath/ath12k/peer.h index e398d5a3fdc8b..a39e943bd66b1 100644 --- a/drivers/net/wireless/ath/ath12k/peer.h +++ b/drivers/net/wireless/ath/ath12k/peer.h @@ -51,6 +51,14 @@ struct ath12k_peer { bool dp_setup_done; u16 ml_id; + + /* any other ML info common for all partners can be added + * here and would be same for all partner peers. + */ + u8 ml_addr[ETH_ALEN]; + + /* To ensure only certain work related to dp is done once */ + bool primary_link; }; struct ath12k_ml_peer { From 061097e5732dc478ef0e57995fae307e1b95ed62 Mon Sep 17 00:00:00 2001 From: Sriram R Date: Thu, 21 Nov 2024 17:58:02 +0200 Subject: [PATCH 0054/1450] wifi: ath12k: use arsta instead of sta Currently, struct ieee80211_sta (sta) is used for many WMI and mac80211 ops but for multi link station, driver should use struct ath12k_link_sta (arsta) instead of sta. Add changes to use arsta object for WMI commands and other mac80211 ops. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.3.1-00173-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Sriram R Signed-off-by: Harshitha Prem Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241121155806.1862733-5-kvalo@kernel.org --- drivers/net/wireless/ath/ath12k/mac.c | 144 ++++++++++++++------------ 1 file changed, 77 insertions(+), 67 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index d92a5e0afe2e4..01932aeab4f34 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -1771,7 +1771,7 @@ static void ath12k_peer_assoc_h_basic(struct ath12k *ar, else aid = sta->aid; - ether_addr_copy(arg->peer_mac, sta->addr); + ether_addr_copy(arg->peer_mac, arsta->addr); arg->vdev_id = arvif->vdev_id; arg->peer_associd = aid; arg->auth_flag = true; @@ -2163,7 +2163,7 @@ static void ath12k_peer_assoc_h_vht(struct ath12k *ar, arg->tx_max_mcs_nss = 0xFF; ath12k_dbg(ar->ab, ATH12K_DBG_MAC, "mac vht peer %pM max_mpdu %d flags 0x%x\n", - sta->addr, arg->peer_max_mpdu, arg->peer_flags); + arsta->addr, arg->peer_max_mpdu, arg->peer_flags); /* TODO: rxnss_override */ } @@ -2459,7 +2459,7 @@ static void ath12k_peer_assoc_h_qos(struct ath12k *ar, } ath12k_dbg(ar->ab, ATH12K_DBG_MAC, "mac peer %pM qos %d\n", - sta->addr, arg->qos_flag); + arsta->addr, arg->qos_flag); } static int ath12k_peer_assoc_qos_ap(struct ath12k *ar, @@ -2499,26 +2499,26 @@ static int ath12k_peer_assoc_qos_ap(struct ath12k *ar, arg.param = WMI_AP_PS_PEER_PARAM_UAPSD; arg.value = uapsd; - ret = ath12k_wmi_send_set_ap_ps_param_cmd(ar, sta->addr, &arg); + ret = ath12k_wmi_send_set_ap_ps_param_cmd(ar, arsta->addr, &arg); if (ret) goto err; arg.param = WMI_AP_PS_PEER_PARAM_MAX_SP; arg.value = max_sp; - ret = ath12k_wmi_send_set_ap_ps_param_cmd(ar, sta->addr, &arg); + ret = ath12k_wmi_send_set_ap_ps_param_cmd(ar, arsta->addr, &arg); if (ret) goto err; /* TODO: revisit during testing */ arg.param = WMI_AP_PS_PEER_PARAM_SIFS_RESP_FRMTYPE; arg.value = DISABLE_SIFS_RESPONSE_TRIGGER; - ret = ath12k_wmi_send_set_ap_ps_param_cmd(ar, sta->addr, &arg); + ret = ath12k_wmi_send_set_ap_ps_param_cmd(ar, arsta->addr, &arg); if (ret) goto err; arg.param = WMI_AP_PS_PEER_PARAM_SIFS_RESP_UAPSD; arg.value = DISABLE_SIFS_RESPONSE_TRIGGER; - ret = ath12k_wmi_send_set_ap_ps_param_cmd(ar, sta->addr, &arg); + ret = ath12k_wmi_send_set_ap_ps_param_cmd(ar, arsta->addr, &arg); if (ret) goto err; @@ -2705,7 +2705,7 @@ static void ath12k_peer_assoc_h_phymode(struct ath12k *ar, } ath12k_dbg(ar->ab, ATH12K_DBG_MAC, "mac peer %pM phymode %s\n", - sta->addr, ath12k_mac_phymode_str(phymode)); + arsta->addr, ath12k_mac_phymode_str(phymode)); arg->peer_phymode = phymode; WARN_ON(phymode == MODE_UNKNOWN); @@ -4295,7 +4295,7 @@ static int ath12k_mac_set_key(struct ath12k *ar, enum set_key_cmd cmd, return 1; if (sta) - peer_addr = sta->addr; + peer_addr = arsta->addr; else if (ahvif->vdev_type == WMI_VDEV_TYPE_STA) peer_addr = vif->bss_conf.bssid; else @@ -4526,7 +4526,6 @@ ath12k_mac_set_peer_vht_fixed_rate(struct ath12k_link_vif *arvif, const struct cfg80211_bitrate_mask *mask, enum nl80211_band band) { - struct ieee80211_sta *sta = ath12k_ahsta_to_sta(arsta->ahsta); struct ath12k *ar = arvif->ar; u8 vht_rate, nss; u32 rate_code; @@ -4545,24 +4544,24 @@ ath12k_mac_set_peer_vht_fixed_rate(struct ath12k_link_vif *arvif, if (!nss) { ath12k_warn(ar->ab, "No single VHT Fixed rate found to set for %pM", - sta->addr); + arsta->addr); return -EINVAL; } ath12k_dbg(ar->ab, ATH12K_DBG_MAC, "Setting Fixed VHT Rate for peer %pM. Device will not switch to any other selected rates", - sta->addr); + arsta->addr); rate_code = ATH12K_HW_RATE_CODE(vht_rate, nss - 1, WMI_RATE_PREAMBLE_VHT); - ret = ath12k_wmi_set_peer_param(ar, sta->addr, + ret = ath12k_wmi_set_peer_param(ar, arsta->addr, arvif->vdev_id, WMI_PEER_PARAM_FIXED_RATE, rate_code); if (ret) ath12k_warn(ar->ab, "failed to update STA %pM Fixed Rate %d: %d\n", - sta->addr, rate_code, ret); + arsta->addr, rate_code, ret); return ret; } @@ -4580,12 +4579,16 @@ static int ath12k_mac_station_assoc(struct ath12k *ar, enum nl80211_band band; struct cfg80211_bitrate_mask *mask; u8 num_vht_rates; + u8 link_id = arvif->link_id; lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy); if (WARN_ON(ath12k_mac_vif_link_chan(vif, arvif->link_id, &def))) return -EPERM; + if (WARN_ON(!rcu_access_pointer(sta->link[link_id]))) + return -EINVAL; + band = def.chan->band; mask = &arvif->bitrate_mask; @@ -4599,13 +4602,13 @@ static int ath12k_mac_station_assoc(struct ath12k *ar, ret = ath12k_wmi_send_peer_assoc_cmd(ar, &peer_arg); if (ret) { ath12k_warn(ar->ab, "failed to run peer assoc for STA %pM vdev %i: %d\n", - sta->addr, arvif->vdev_id, ret); + arsta->addr, arvif->vdev_id, ret); return ret; } if (!wait_for_completion_timeout(&ar->peer_assoc_done, 1 * HZ)) { ath12k_warn(ar->ab, "failed to get peer assoc conf event for %pM vdev %i\n", - sta->addr, arvif->vdev_id); + arsta->addr, arvif->vdev_id); return -ETIMEDOUT; } @@ -4629,7 +4632,7 @@ static int ath12k_mac_station_assoc(struct ath12k *ar, if (reassoc) return 0; - ret = ath12k_setup_peer_smps(ar, arvif, sta->addr, + ret = ath12k_setup_peer_smps(ar, arvif, arsta->addr, &sta->deflink.ht_cap, &sta->deflink.he_6ghz_capa); if (ret) { @@ -4649,7 +4652,7 @@ static int ath12k_mac_station_assoc(struct ath12k *ar, ret = ath12k_peer_assoc_qos_ap(ar, arvif, arsta); if (ret) { ath12k_warn(ar->ab, "failed to set qos params for STA %pM for vdev %i: %d\n", - sta->addr, arvif->vdev_id, ret); + arsta->addr, arvif->vdev_id, ret); return ret; } } @@ -4732,65 +4735,65 @@ static void ath12k_sta_rc_update_wk(struct wiphy *wiphy, struct wiphy_work *wk) * WMI_PEER_CHWIDTH */ ath12k_dbg(ar->ab, ATH12K_DBG_MAC, "mac bandwidth upgrade for sta %pM new %d old %d\n", - sta->addr, bw, bw_prev); - err = ath12k_wmi_set_peer_param(ar, sta->addr, + arsta->addr, bw, bw_prev); + err = ath12k_wmi_set_peer_param(ar, arsta->addr, arvif->vdev_id, WMI_PEER_PHYMODE, peer_phymode); if (err) { ath12k_warn(ar->ab, "failed to update STA %pM to peer phymode %d: %d\n", - sta->addr, peer_phymode, err); + arsta->addr, peer_phymode, err); return; } - err = ath12k_wmi_set_peer_param(ar, sta->addr, + err = ath12k_wmi_set_peer_param(ar, arsta->addr, arvif->vdev_id, WMI_PEER_CHWIDTH, bw); if (err) ath12k_warn(ar->ab, "failed to update STA %pM to peer bandwidth %d: %d\n", - sta->addr, bw, err); + arsta->addr, bw, err); } else { /* When we downgrade bandwidth this will conflict with phymode * and cause to trigger firmware crash. In this case we send * WMI_PEER_CHWIDTH followed by WMI_PEER_PHYMODE */ ath12k_dbg(ar->ab, ATH12K_DBG_MAC, "mac bandwidth downgrade for sta %pM new %d old %d\n", - sta->addr, bw, bw_prev); - err = ath12k_wmi_set_peer_param(ar, sta->addr, + arsta->addr, bw, bw_prev); + err = ath12k_wmi_set_peer_param(ar, arsta->addr, arvif->vdev_id, WMI_PEER_CHWIDTH, bw); if (err) { ath12k_warn(ar->ab, "failed to update STA %pM peer to bandwidth %d: %d\n", - sta->addr, bw, err); + arsta->addr, bw, err); return; } - err = ath12k_wmi_set_peer_param(ar, sta->addr, + err = ath12k_wmi_set_peer_param(ar, arsta->addr, arvif->vdev_id, WMI_PEER_PHYMODE, peer_phymode); if (err) ath12k_warn(ar->ab, "failed to update STA %pM to peer phymode %d: %d\n", - sta->addr, peer_phymode, err); + arsta->addr, peer_phymode, err); } } if (changed & IEEE80211_RC_NSS_CHANGED) { ath12k_dbg(ar->ab, ATH12K_DBG_MAC, "mac update sta %pM nss %d\n", - sta->addr, nss); + arsta->addr, nss); - err = ath12k_wmi_set_peer_param(ar, sta->addr, arvif->vdev_id, + err = ath12k_wmi_set_peer_param(ar, arsta->addr, arvif->vdev_id, WMI_PEER_NSS, nss); if (err) ath12k_warn(ar->ab, "failed to update STA %pM nss %d: %d\n", - sta->addr, nss, err); + arsta->addr, nss, err); } if (changed & IEEE80211_RC_SMPS_CHANGED) { ath12k_dbg(ar->ab, ATH12K_DBG_MAC, "mac update sta %pM smps %d\n", - sta->addr, smps); + arsta->addr, smps); - err = ath12k_wmi_set_peer_param(ar, sta->addr, arvif->vdev_id, + err = ath12k_wmi_set_peer_param(ar, arsta->addr, arvif->vdev_id, WMI_PEER_MIMO_PS_STATE, smps); if (err) ath12k_warn(ar->ab, "failed to update STA %pM smps %d: %d\n", - sta->addr, smps, err); + arsta->addr, smps, err); } if (changed & IEEE80211_RC_SUPP_RATES_CHANGED) { @@ -4823,11 +4826,11 @@ static void ath12k_sta_rc_update_wk(struct wiphy *wiphy, struct wiphy_work *wk) err = ath12k_wmi_send_peer_assoc_cmd(ar, &peer_arg); if (err) ath12k_warn(ar->ab, "failed to run peer assoc for STA %pM vdev %i: %d\n", - sta->addr, arvif->vdev_id, err); + arsta->addr, arvif->vdev_id, err); if (!wait_for_completion_timeout(&ar->peer_assoc_done, 1 * HZ)) ath12k_warn(ar->ab, "failed to get peer assoc conf event for %pM vdev %i\n", - sta->addr, arvif->vdev_id); + arsta->addr, arvif->vdev_id); } } } @@ -4908,7 +4911,7 @@ static void ath12k_mac_station_post_remove(struct ath12k *ar, spin_lock_bh(&ar->ab->base_lock); - peer = ath12k_peer_find(ar->ab, arvif->vdev_id, sta->addr); + peer = ath12k_peer_find(ar->ab, arvif->vdev_id, arsta->addr); if (peer && peer->sta == sta) { ath12k_warn(ar->ab, "Found peer entry %pM n vdev %i after it was supposedly removed\n", vif->addr, arvif->vdev_id); @@ -4963,27 +4966,26 @@ static int ath12k_mac_station_authorize(struct ath12k *ar, { struct ath12k_peer *peer; struct ieee80211_vif *vif = ath12k_ahvif_to_vif(arvif->ahvif); - struct ieee80211_sta *sta = ath12k_ahsta_to_sta(arsta->ahsta); int ret; lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy); spin_lock_bh(&ar->ab->base_lock); - peer = ath12k_peer_find(ar->ab, arvif->vdev_id, sta->addr); + peer = ath12k_peer_find(ar->ab, arvif->vdev_id, arsta->addr); if (peer) peer->is_authorized = true; spin_unlock_bh(&ar->ab->base_lock); if (vif->type == NL80211_IFTYPE_STATION && arvif->is_up) { - ret = ath12k_wmi_set_peer_param(ar, sta->addr, + ret = ath12k_wmi_set_peer_param(ar, arsta->addr, arvif->vdev_id, WMI_PEER_AUTHORIZE, 1); if (ret) { ath12k_warn(ar->ab, "Unable to authorize peer %pM vdev %d: %d\n", - sta->addr, arvif->vdev_id, ret); + arsta->addr, arvif->vdev_id, ret); return ret; } } @@ -5014,15 +5016,15 @@ static int ath12k_mac_station_remove(struct ath12k *ar, if (sta->mlo) return ret; - ath12k_dp_peer_cleanup(ar, arvif->vdev_id, sta->addr); + ath12k_dp_peer_cleanup(ar, arvif->vdev_id, arsta->addr); - ret = ath12k_peer_delete(ar, arvif->vdev_id, sta->addr); + ret = ath12k_peer_delete(ar, arvif->vdev_id, arsta->addr); if (ret) ath12k_warn(ar->ab, "Failed to delete peer: %pM for VDEV: %d\n", - sta->addr, arvif->vdev_id); + arsta->addr, arvif->vdev_id); else ath12k_dbg(ar->ab, ATH12K_DBG_MAC, "Removed peer: %pM for VDEV: %d\n", - sta->addr, arvif->vdev_id); + arsta->addr, arvif->vdev_id); ath12k_mac_station_post_remove(ar, arvif, arsta); @@ -5040,7 +5042,7 @@ static int ath12k_mac_station_add(struct ath12k *ar, struct ath12k_base *ab = ar->ab; struct ieee80211_vif *vif = ath12k_ahvif_to_vif(arvif->ahvif); struct ieee80211_sta *sta = ath12k_ahsta_to_sta(arsta->ahsta); - struct ath12k_wmi_peer_create_arg peer_param; + struct ath12k_wmi_peer_create_arg peer_param = {0}; int ret; lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy); @@ -5065,28 +5067,28 @@ static int ath12k_mac_station_add(struct ath12k *ar, ret = ath12k_peer_create(ar, arvif, sta, &peer_param); if (ret) { ath12k_warn(ab, "Failed to add peer: %pM for VDEV: %d\n", - sta->addr, arvif->vdev_id); + arsta->addr, arvif->vdev_id); goto free_peer; } ath12k_dbg(ab, ATH12K_DBG_MAC, "Added peer: %pM for VDEV: %d\n", - sta->addr, arvif->vdev_id); + arsta->addr, arvif->vdev_id); if (ieee80211_vif_is_mesh(vif)) { - ret = ath12k_wmi_set_peer_param(ar, sta->addr, + ret = ath12k_wmi_set_peer_param(ar, arsta->addr, arvif->vdev_id, WMI_PEER_USE_4ADDR, 1); if (ret) { ath12k_warn(ab, "failed to STA %pM 4addr capability: %d\n", - sta->addr, ret); + arsta->addr, ret); goto free_peer; } } - ret = ath12k_dp_peer_setup(ar, arvif->vdev_id, sta->addr); + ret = ath12k_dp_peer_setup(ar, arvif->vdev_id, arsta->addr); if (ret) { ath12k_warn(ab, "failed to setup dp for peer %pM on vdev %i (%d)\n", - sta->addr, arvif->vdev_id, ret); + arsta->addr, arvif->vdev_id, ret); goto free_peer; } @@ -5103,7 +5105,7 @@ static int ath12k_mac_station_add(struct ath12k *ar, return 0; free_peer: - ath12k_peer_delete(ar, arvif->vdev_id, sta->addr); + ath12k_peer_delete(ar, arvif->vdev_id, arsta->addr); dec_num_station: ath12k_mac_dec_num_stations(arvif, arsta); exit: @@ -5175,6 +5177,8 @@ static int ath12k_mac_assign_link_sta(struct ath12k_hw *ah, ahsta->links_map |= BIT(arsta->link_id); arsta->arvif = arvif; arsta->ahsta = ahsta; + ahsta->ahvif = ahvif; + wiphy_work_init(&arsta->update_wk, ath12k_sta_rc_update_wk); rcu_assign_pointer(ahsta->link[link_id], arsta); @@ -5250,7 +5254,7 @@ static int ath12k_mac_handle_link_sta_state(struct ieee80211_hw *hw, ret = ath12k_mac_station_add(ar, arvif, arsta); if (ret) ath12k_warn(ar->ab, "Failed to add station: %pM for VDEV: %d\n", - sta->addr, arvif->vdev_id); + arsta->addr, arvif->vdev_id); /* IEEE80211_STA_AUTH -> IEEE80211_STA_ASSOC: Send station assoc command for * peer associated to AP/Mesh/ADHOC vif type. @@ -5263,7 +5267,7 @@ static int ath12k_mac_handle_link_sta_state(struct ieee80211_hw *hw, ret = ath12k_mac_station_assoc(ar, arvif, arsta, false); if (ret) ath12k_warn(ar->ab, "Failed to associate station: %pM\n", - sta->addr); + arsta->addr); spin_lock_bh(&ar->data_lock); @@ -5280,7 +5284,7 @@ static int ath12k_mac_handle_link_sta_state(struct ieee80211_hw *hw, ret = ath12k_mac_station_authorize(ar, arvif, arsta); if (ret) ath12k_warn(ar->ab, "Failed to authorize station: %pM\n", - sta->addr); + arsta->addr); /* IEEE80211_STA_AUTHORIZED -> IEEE80211_STA_ASSOC: station may be in removal, * deauthorize it. @@ -5300,7 +5304,7 @@ static int ath12k_mac_handle_link_sta_state(struct ieee80211_hw *hw, ret = ath12k_mac_station_disassoc(ar, arvif, arsta); if (ret) ath12k_warn(ar->ab, "Failed to disassociate station: %pM\n", - sta->addr); + arsta->addr); } exit: @@ -5413,16 +5417,22 @@ static int ath12k_mac_op_sta_set_txpwr(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta) { - struct ath12k_hw *ah = ath12k_hw_to_ah(hw); + struct ath12k_sta *ahsta = ath12k_sta_to_ahsta(sta); struct ath12k *ar; struct ath12k_vif *ahvif = ath12k_vif_to_ahvif(vif); struct ath12k_link_vif *arvif; + struct ath12k_link_sta *arsta; + u8 link_id; int ret; s16 txpwr; lockdep_assert_wiphy(hw->wiphy); - arvif = &ahvif->deflink; + /* TODO: use link id from mac80211 once that's implemented */ + link_id = 0; + + arvif = wiphy_dereference(hw->wiphy, ahvif->link[link_id]); + arsta = wiphy_dereference(hw->wiphy, ahsta->link[link_id]); if (sta->deflink.txpwr.type == NL80211_TX_POWER_AUTOMATIC) { txpwr = 0; @@ -5439,9 +5449,9 @@ static int ath12k_mac_op_sta_set_txpwr(struct ieee80211_hw *hw, goto out; } - ar = ath12k_ah_to_ar(ah, 0); + ar = arvif->ar; - ret = ath12k_wmi_set_peer_param(ar, sta->addr, arvif->vdev_id, + ret = ath12k_wmi_set_peer_param(ar, arsta->addr, arvif->vdev_id, WMI_PEER_USE_FIXED_PWR, txpwr); if (ret) { ath12k_warn(ar->ab, "failed to set tx power for station ret: %d\n", @@ -5494,12 +5504,12 @@ static void ath12k_mac_op_sta_rc_update(struct ieee80211_hw *hw, } spin_lock_bh(&ar->ab->base_lock); - peer = ath12k_peer_find(ar->ab, arvif->vdev_id, sta->addr); + peer = ath12k_peer_find(ar->ab, arvif->vdev_id, arsta->addr); if (!peer) { spin_unlock_bh(&ar->ab->base_lock); rcu_read_unlock(); ath12k_warn(ar->ab, "mac sta rc update failed to find peer %pM on vdev %i\n", - sta->addr, arvif->vdev_id); + arsta->addr, arvif->vdev_id); return; } @@ -5507,7 +5517,7 @@ static void ath12k_mac_op_sta_rc_update(struct ieee80211_hw *hw, ath12k_dbg(ar->ab, ATH12K_DBG_MAC, "mac sta rc update for %pM changed %08x bw %d nss %d smps %d\n", - sta->addr, changed, sta->deflink.bandwidth, sta->deflink.rx_nss, + arsta->addr, changed, sta->deflink.bandwidth, sta->deflink.rx_nss, sta->deflink.smps_mode); spin_lock_bh(&ar->data_lock); @@ -5537,7 +5547,7 @@ static void ath12k_mac_op_sta_rc_update(struct ieee80211_hw *hw, break; default: ath12k_warn(ar->ab, "Invalid smps %d in sta rc update for %pM\n", - sta->deflink.smps_mode, sta->addr); + sta->deflink.smps_mode, arsta->addr); smps = WMI_PEER_SMPS_PS_NONE; break; } @@ -9121,14 +9131,14 @@ static void ath12k_mac_disable_peer_fixed_rate(void *data, if (arsta->arvif != arvif) return; - ret = ath12k_wmi_set_peer_param(ar, sta->addr, + ret = ath12k_wmi_set_peer_param(ar, arsta->addr, arvif->vdev_id, WMI_PEER_PARAM_FIXED_RATE, WMI_FIXED_RATE_NONE); if (ret) ath12k_warn(ar->ab, "failed to disable peer fixed rate for STA %pM ret %d\n", - sta->addr, ret); + arsta->addr, ret); } static int From a0300e6bcfd4bc3ea9865856a3abad5f9dd6fd89 Mon Sep 17 00:00:00 2001 From: Rameshkumar Sundaram Date: Thu, 21 Nov 2024 17:58:03 +0200 Subject: [PATCH 0055/1450] wifi: ath12k: add reo queue lookup table for ML peers Currently reoqueue tid setup uses lookup table (LUT) during peer association, but for ML peer there will be multiple link peers (belonging to different underlying firmware) affiliated to each other. Hence the reo queue should be setup only on one of the links which is the primary link. Add changes to create separate ML reo queue lookup table for ML peers and use the same while setting up rx tid for ML peer's primary link. For ML peers use ml_peer_id instead of peer_id to setup/lookup the reo queue entry in the LUT. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.3.1-00173-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Rameshkumar Sundaram Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241121155806.1862733-6-kvalo@kernel.org --- drivers/net/wireless/ath/ath12k/dp.c | 41 ++++++++++++++++----- drivers/net/wireless/ath/ath12k/dp.h | 1 + drivers/net/wireless/ath/ath12k/dp_rx.c | 48 ++++++++++++++++++------- drivers/net/wireless/ath/ath12k/peer.c | 2 ++ drivers/net/wireless/ath/ath12k/peer.h | 2 +- 5 files changed, 71 insertions(+), 23 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/dp.c b/drivers/net/wireless/ath/ath12k/dp.c index 23326e2dfe8d2..328be2c635d67 100644 --- a/drivers/net/wireless/ath/ath12k/dp.c +++ b/drivers/net/wireless/ath/ath12k/dp.c @@ -1265,15 +1265,23 @@ static void ath12k_dp_reoq_lut_cleanup(struct ath12k_base *ab) if (!ab->hw_params->reoq_lut_support) return; - if (!dp->reoq_lut.vaddr) - return; - - dma_free_coherent(ab->dev, DP_REOQ_LUT_SIZE, - dp->reoq_lut.vaddr, dp->reoq_lut.paddr); - dp->reoq_lut.vaddr = NULL; - - ath12k_hif_write32(ab, - HAL_SEQ_WCSS_UMAC_REO_REG + HAL_REO1_QDESC_LUT_BASE0(ab), 0); + if (dp->reoq_lut.vaddr) { + ath12k_hif_write32(ab, + HAL_SEQ_WCSS_UMAC_REO_REG + + HAL_REO1_QDESC_LUT_BASE0(ab), 0); + dma_free_coherent(ab->dev, DP_REOQ_LUT_SIZE, + dp->reoq_lut.vaddr, dp->reoq_lut.paddr); + dp->reoq_lut.vaddr = NULL; + } + + if (dp->ml_reoq_lut.vaddr) { + ath12k_hif_write32(ab, + HAL_SEQ_WCSS_UMAC_REO_REG + + HAL_REO1_QDESC_LUT_BASE1(ab), 0); + dma_free_coherent(ab->dev, DP_REOQ_LUT_SIZE, + dp->ml_reoq_lut.vaddr, dp->ml_reoq_lut.paddr); + dp->ml_reoq_lut.vaddr = NULL; + } } void ath12k_dp_free(struct ath12k_base *ab) @@ -1599,8 +1607,23 @@ static int ath12k_dp_reoq_lut_setup(struct ath12k_base *ab) return -ENOMEM; } + dp->ml_reoq_lut.vaddr = dma_alloc_coherent(ab->dev, + DP_REOQ_LUT_SIZE, + &dp->ml_reoq_lut.paddr, + GFP_KERNEL | __GFP_ZERO); + if (!dp->ml_reoq_lut.vaddr) { + ath12k_warn(ab, "failed to allocate memory for ML reoq table"); + dma_free_coherent(ab->dev, DP_REOQ_LUT_SIZE, + dp->reoq_lut.vaddr, dp->reoq_lut.paddr); + dp->reoq_lut.vaddr = NULL; + return -ENOMEM; + } + ath12k_hif_write32(ab, HAL_SEQ_WCSS_UMAC_REO_REG + HAL_REO1_QDESC_LUT_BASE0(ab), dp->reoq_lut.paddr); + ath12k_hif_write32(ab, HAL_SEQ_WCSS_UMAC_REO_REG + HAL_REO1_QDESC_LUT_BASE1(ab), + dp->ml_reoq_lut.paddr >> 8); + return 0; } diff --git a/drivers/net/wireless/ath/ath12k/dp.h b/drivers/net/wireless/ath/ath12k/dp.h index 2e05fc19410e8..a120b7a8477d8 100644 --- a/drivers/net/wireless/ath/ath12k/dp.h +++ b/drivers/net/wireless/ath/ath12k/dp.h @@ -368,6 +368,7 @@ struct ath12k_dp { struct dp_rxdma_mon_ring rxdma_mon_buf_ring; struct dp_rxdma_mon_ring tx_mon_buf_ring; struct ath12k_reo_q_addr_lut reoq_lut; + struct ath12k_reo_q_addr_lut ml_reoq_lut; }; /* HTT definitions */ diff --git a/drivers/net/wireless/ath/ath12k/dp_rx.c b/drivers/net/wireless/ath/ath12k/dp_rx.c index da3ebdf094c3c..70680f2124e52 100644 --- a/drivers/net/wireless/ath/ath12k/dp_rx.c +++ b/drivers/net/wireless/ath/ath12k/dp_rx.c @@ -740,15 +740,22 @@ static void ath12k_peer_rx_tid_qref_setup(struct ath12k_base *ab, u16 peer_id, u { struct ath12k_reo_queue_ref *qref; struct ath12k_dp *dp = &ab->dp; + bool ml_peer = false; if (!ab->hw_params->reoq_lut_support) return; - /* TODO: based on ML peer or not, select the LUT. below assumes non - * ML peer - */ - qref = (struct ath12k_reo_queue_ref *)dp->reoq_lut.vaddr + - (peer_id * (IEEE80211_NUM_TIDS + 1) + tid); + if (peer_id & ATH12K_PEER_ML_ID_VALID) { + peer_id &= ~ATH12K_PEER_ML_ID_VALID; + ml_peer = true; + } + + if (ml_peer) + qref = (struct ath12k_reo_queue_ref *)dp->ml_reoq_lut.vaddr + + (peer_id * (IEEE80211_NUM_TIDS + 1) + tid); + else + qref = (struct ath12k_reo_queue_ref *)dp->reoq_lut.vaddr + + (peer_id * (IEEE80211_NUM_TIDS + 1) + tid); qref->info0 = u32_encode_bits(lower_32_bits(paddr), BUFFER_ADDR_INFO0_ADDR); @@ -761,15 +768,22 @@ static void ath12k_peer_rx_tid_qref_reset(struct ath12k_base *ab, u16 peer_id, u { struct ath12k_reo_queue_ref *qref; struct ath12k_dp *dp = &ab->dp; + bool ml_peer = false; if (!ab->hw_params->reoq_lut_support) return; - /* TODO: based on ML peer or not, select the LUT. below assumes non - * ML peer - */ - qref = (struct ath12k_reo_queue_ref *)dp->reoq_lut.vaddr + - (peer_id * (IEEE80211_NUM_TIDS + 1) + tid); + if (peer_id & ATH12K_PEER_ML_ID_VALID) { + peer_id &= ~ATH12K_PEER_ML_ID_VALID; + ml_peer = true; + } + + if (ml_peer) + qref = (struct ath12k_reo_queue_ref *)dp->ml_reoq_lut.vaddr + + (peer_id * (IEEE80211_NUM_TIDS + 1) + tid); + else + qref = (struct ath12k_reo_queue_ref *)dp->reoq_lut.vaddr + + (peer_id * (IEEE80211_NUM_TIDS + 1) + tid); qref->info0 = u32_encode_bits(0, BUFFER_ADDR_INFO0_ADDR); qref->info1 = u32_encode_bits(0, BUFFER_ADDR_INFO1_ADDR) | @@ -802,7 +816,10 @@ void ath12k_dp_rx_peer_tid_delete(struct ath12k *ar, rx_tid->vaddr = NULL; } - ath12k_peer_rx_tid_qref_reset(ar->ab, peer->peer_id, tid); + if (peer->mlo) + ath12k_peer_rx_tid_qref_reset(ar->ab, peer->ml_id, tid); + else + ath12k_peer_rx_tid_qref_reset(ar->ab, peer->peer_id, tid); rx_tid->active = false; } @@ -945,7 +962,8 @@ int ath12k_dp_rx_peer_tid_setup(struct ath12k *ar, const u8 *peer_mac, int vdev_ return 0; } - if (ab->hw_params->reoq_lut_support && !dp->reoq_lut.vaddr) { + if (ab->hw_params->reoq_lut_support && + (!dp->reoq_lut.vaddr || !dp->ml_reoq_lut.vaddr)) { spin_unlock_bh(&ab->base_lock); ath12k_warn(ab, "reo qref table is not setup\n"); return -EINVAL; @@ -1026,7 +1044,11 @@ int ath12k_dp_rx_peer_tid_setup(struct ath12k *ar, const u8 *peer_mac, int vdev_ /* Update the REO queue LUT at the corresponding peer id * and tid with qaddr. */ - ath12k_peer_rx_tid_qref_setup(ab, peer->peer_id, tid, paddr); + if (peer->mlo) + ath12k_peer_rx_tid_qref_setup(ab, peer->ml_id, tid, paddr); + else + ath12k_peer_rx_tid_qref_setup(ab, peer->peer_id, tid, paddr); + spin_unlock_bh(&ab->base_lock); } else { spin_unlock_bh(&ab->base_lock); diff --git a/drivers/net/wireless/ath/ath12k/peer.c b/drivers/net/wireless/ath/ath12k/peer.c index 25905498e8fb3..5763c5a40cfc3 100644 --- a/drivers/net/wireless/ath/ath12k/peer.c +++ b/drivers/net/wireless/ath/ath12k/peer.c @@ -396,9 +396,11 @@ int ath12k_peer_create(struct ath12k *ar, struct ath12k_link_vif *arvif, /* the assoc link is considered primary for now */ peer->primary_link = arsta->is_assoc_link; + peer->mlo = true; } else { peer->ml_id = ATH12K_MLO_PEER_ID_INVALID; peer->primary_link = true; + peer->mlo = false; } } diff --git a/drivers/net/wireless/ath/ath12k/peer.h b/drivers/net/wireless/ath/ath12k/peer.h index a39e943bd66b1..7e6231cb2b529 100644 --- a/drivers/net/wireless/ath/ath12k/peer.h +++ b/drivers/net/wireless/ath/ath12k/peer.h @@ -46,7 +46,7 @@ struct ath12k_peer { struct ppdu_user_delayba ppdu_stats_delayba; bool delayba_flag; bool is_authorized; - + bool mlo; /* protected by ab->data_lock */ bool dp_setup_done; From aaac8850a07f9072ed62f54b0e5fcb14c8e0d044 Mon Sep 17 00:00:00 2001 From: Rameshkumar Sundaram Date: Thu, 21 Nov 2024 17:58:04 +0200 Subject: [PATCH 0056/1450] wifi: ath12k: modify chanctx iterators for MLO Currently ath12k's chanctx iterator functions use deflink of given ahvif and bss_conf of corresponding vif to make sure the iterator returns intended vif. An ML vif can have multiple affiliated links each having its own channel context, hence iterate through the links of the given ahvif and use the link objects (arvif and link_conf) to make sure the chan ctx iterator returns intended link of the given vif. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.3.1-00173-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Rameshkumar Sundaram Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241121155806.1862733-7-kvalo@kernel.org --- drivers/net/wireless/ath/ath12k/mac.c | 80 +++++++++++++++++++-------- 1 file changed, 56 insertions(+), 24 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index 01932aeab4f34..956d43520510f 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -8309,19 +8309,32 @@ ath12k_mac_change_chanctx_cnt_iter(void *data, u8 *mac, { struct ath12k_vif *ahvif = ath12k_vif_to_ahvif(vif); struct ath12k_mac_change_chanctx_arg *arg = data; + struct ieee80211_bss_conf *link_conf; struct ath12k_link_vif *arvif; + unsigned long links_map; + u8 link_id; lockdep_assert_wiphy(ahvif->ah->hw->wiphy); - arvif = &ahvif->deflink; + links_map = ahvif->links_map; + for_each_set_bit(link_id, &links_map, IEEE80211_MLD_MAX_NUM_LINKS) { + arvif = wiphy_dereference(ahvif->ah->hw->wiphy, ahvif->link[link_id]); + if (WARN_ON(!arvif)) + continue; - if (arvif->ar != arg->ar) - return; + if (arvif->ar != arg->ar) + continue; - if (rcu_access_pointer(vif->bss_conf.chanctx_conf) != arg->ctx) - return; + link_conf = wiphy_dereference(ahvif->ah->hw->wiphy, + vif->link_conf[link_id]); + if (WARN_ON(!link_conf)) + continue; + + if (rcu_access_pointer(link_conf->chanctx_conf) != arg->ctx) + continue; - arg->n_vifs++; + arg->n_vifs++; + } } static void @@ -8330,27 +8343,41 @@ ath12k_mac_change_chanctx_fill_iter(void *data, u8 *mac, { struct ath12k_vif *ahvif = ath12k_vif_to_ahvif(vif); struct ath12k_mac_change_chanctx_arg *arg = data; + struct ieee80211_bss_conf *link_conf; struct ieee80211_chanctx_conf *ctx; struct ath12k_link_vif *arvif; + unsigned long links_map; + u8 link_id; lockdep_assert_wiphy(ahvif->ah->hw->wiphy); - arvif = &ahvif->deflink; + links_map = ahvif->links_map; + for_each_set_bit(link_id, &links_map, IEEE80211_MLD_MAX_NUM_LINKS) { + arvif = wiphy_dereference(ahvif->ah->hw->wiphy, ahvif->link[link_id]); + if (WARN_ON(!arvif)) + continue; - if (arvif->ar != arg->ar) - return; + if (arvif->ar != arg->ar) + continue; - ctx = rcu_access_pointer(vif->bss_conf.chanctx_conf); - if (ctx != arg->ctx) - return; + link_conf = wiphy_dereference(ahvif->ah->hw->wiphy, + vif->link_conf[arvif->link_id]); + if (WARN_ON(!link_conf)) + continue; - if (WARN_ON(arg->next_vif == arg->n_vifs)) - return; + ctx = rcu_access_pointer(link_conf->chanctx_conf); + if (ctx != arg->ctx) + continue; + + if (WARN_ON(arg->next_vif == arg->n_vifs)) + return; - arg->vifs[arg->next_vif].vif = vif; - arg->vifs[arg->next_vif].old_ctx = ctx; - arg->vifs[arg->next_vif].new_ctx = ctx; - arg->next_vif++; + arg->vifs[arg->next_vif].vif = vif; + arg->vifs[arg->next_vif].old_ctx = ctx; + arg->vifs[arg->next_vif].new_ctx = ctx; + arg->vifs[arg->next_vif].link_conf = link_conf; + arg->next_vif++; + } } static u32 ath12k_mac_nlwidth_to_wmiwidth(enum nl80211_chan_width width) @@ -8410,10 +8437,12 @@ ath12k_mac_update_vif_chan(struct ath12k *ar, int n_vifs) { struct ath12k_wmi_vdev_up_params params = {}; + struct ieee80211_bss_conf *link_conf; struct ath12k_base *ab = ar->ab; struct ath12k_link_vif *arvif; struct ieee80211_vif *vif; struct ath12k_vif *ahvif; + u8 link_id; int ret; int i; bool monitor_vif = false; @@ -8423,7 +8452,10 @@ ath12k_mac_update_vif_chan(struct ath12k *ar, for (i = 0; i < n_vifs; i++) { vif = vifs[i].vif; ahvif = ath12k_vif_to_ahvif(vif); - arvif = &ahvif->deflink; + link_conf = vifs[i].link_conf; + link_id = link_conf->link_id; + arvif = wiphy_dereference(ath12k_ar_to_hw(ar)->wiphy, + ahvif->link[link_id]); if (vif->type == NL80211_IFTYPE_MONITOR) monitor_vif = true; @@ -8476,13 +8508,13 @@ ath12k_mac_update_vif_chan(struct ath12k *ar, params.aid = ahvif->aid; params.bssid = arvif->bssid; if (vif->mbssid_tx_vif) { - struct ath12k_vif *ahvif = + struct ath12k_vif *tx_ahvif = ath12k_vif_to_ahvif(vif->mbssid_tx_vif); - struct ath12k_link_vif *arvif = &ahvif->deflink; + struct ath12k_link_vif *tx_arvif = &tx_ahvif->deflink; - params.tx_bssid = arvif->bssid; - params.nontx_profile_idx = vif->bss_conf.bssid_index; - params.nontx_profile_cnt = 1 << vif->bss_conf.bssid_indicator; + params.tx_bssid = tx_arvif->bssid; + params.nontx_profile_idx = link_conf->bssid_index; + params.nontx_profile_cnt = 1 << link_conf->bssid_indicator; } ret = ath12k_wmi_vdev_up(arvif->ar, ¶ms); if (ret) { From 3952657848c035855007f7a430a753e123935b3a Mon Sep 17 00:00:00 2001 From: Sriram R Date: Thu, 21 Nov 2024 17:58:05 +0200 Subject: [PATCH 0057/1450] wifi: ath12k: Use mac80211 vif's link_conf instead of bss_conf Currently mac80211 vif's bss_conf is used to fetch any vif related configurations in driver but with MLO multiple links are affiliated to a vif and corresponding link configs are present in vif->link_conf[]. Fetch link_conf for corresponding link from vif and use the same for configurations. Add ath12k_mac_get_link_bss_conf() helper to fetch link_conf from arvif. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.3.1-00173-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Sriram R Signed-off-by: Rameshkumar Sundaram Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241121155806.1862733-8-kvalo@kernel.org --- drivers/net/wireless/ath/ath12k/mac.c | 149 ++++++++++++++++++++++---- drivers/net/wireless/ath/ath12k/wmi.c | 16 ++- 2 files changed, 141 insertions(+), 24 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index 956d43520510f..827a1700e8ba0 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -501,6 +501,24 @@ static int ath12k_mac_vif_link_chan(struct ieee80211_vif *vif, u8 link_id, return 0; } +static struct ieee80211_bss_conf * +ath12k_mac_get_link_bss_conf(struct ath12k_link_vif *arvif) +{ + struct ieee80211_vif *vif = arvif->ahvif->vif; + struct ieee80211_bss_conf *link_conf; + struct ath12k *ar = arvif->ar; + + lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy); + + if (arvif->link_id >= IEEE80211_MLD_MAX_NUM_LINKS) + return NULL; + + link_conf = wiphy_dereference(ath12k_ar_to_hw(ar)->wiphy, + vif->link_conf[arvif->link_id]); + + return link_conf; +} + static bool ath12k_mac_bitrate_is_cck(int bitrate) { switch (bitrate) { @@ -1498,7 +1516,7 @@ static void ath12k_mac_set_arvif_ies(struct ath12k_link_vif *arvif, struct sk_bu static int ath12k_mac_setup_bcn_tmpl_ema(struct ath12k_link_vif *arvif) { struct ath12k_vif *ahvif = arvif->ahvif; - struct ieee80211_bss_conf *bss_conf = &ahvif->vif->bss_conf; + struct ieee80211_bss_conf *bss_conf; struct ath12k_wmi_bcn_tmpl_ema_arg ema_args; struct ieee80211_ema_beacons *beacons; struct ath12k_link_vif *tx_arvif; @@ -1507,6 +1525,14 @@ static int ath12k_mac_setup_bcn_tmpl_ema(struct ath12k_link_vif *arvif) int ret = 0; u8 i; + bss_conf = ath12k_mac_get_link_bss_conf(arvif); + if (!bss_conf) { + ath12k_warn(arvif->ar->ab, + "failed to get link bss conf to update bcn tmpl for vif %pM link %u\n", + ahvif->vif->addr, arvif->link_id); + return -ENOLINK; + } + tx_ahvif = ath12k_vif_to_ahvif(ahvif->vif->mbssid_tx_vif); tx_arvif = &tx_ahvif->deflink; beacons = ieee80211_beacon_get_template_ema_list(ath12k_ar_to_hw(tx_arvif->ar), @@ -1553,6 +1579,7 @@ static int ath12k_mac_setup_bcn_tmpl(struct ath12k_link_vif *arvif) { struct ath12k_vif *ahvif = arvif->ahvif; struct ieee80211_vif *vif = ath12k_ahvif_to_vif(ahvif); + struct ieee80211_bss_conf *link_conf; struct ath12k_link_vif *tx_arvif = arvif; struct ath12k *ar = arvif->ar; struct ath12k_base *ab = ar->ab; @@ -1565,13 +1592,20 @@ static int ath12k_mac_setup_bcn_tmpl(struct ath12k_link_vif *arvif) if (ahvif->vdev_type != WMI_VDEV_TYPE_AP) return 0; + link_conf = ath12k_mac_get_link_bss_conf(arvif); + if (!link_conf) { + ath12k_warn(ar->ab, "unable to access bss link conf to set bcn tmpl for vif %pM link %u\n", + vif->addr, arvif->link_id); + return -ENOLINK; + } + if (vif->mbssid_tx_vif) { tx_ahvif = ath12k_vif_to_ahvif(vif->mbssid_tx_vif); tx_arvif = &tx_ahvif->deflink; if (tx_arvif != arvif && arvif->is_up) return 0; - if (vif->bss_conf.ema_ap) + if (link_conf->ema_ap) return ath12k_mac_setup_bcn_tmpl_ema(arvif); } @@ -1586,7 +1620,7 @@ static int ath12k_mac_setup_bcn_tmpl(struct ath12k_link_vif *arvif) ath12k_mac_set_arvif_ies(arvif, bcn, 0, NULL); } else { ath12k_mac_set_arvif_ies(arvif, bcn, - ahvif->vif->bss_conf.bssid_index, + link_conf->bssid_index, &nontx_profile_found); if (!nontx_profile_found) ath12k_warn(ab, @@ -1762,6 +1796,7 @@ static void ath12k_peer_assoc_h_basic(struct ath12k *ar, struct ieee80211_vif *vif = ath12k_ahvif_to_vif(arvif->ahvif); struct ieee80211_sta *sta = ath12k_ahsta_to_sta(arsta->ahsta); struct ieee80211_hw *hw = ath12k_ar_to_hw(ar); + struct ieee80211_bss_conf *bss_conf; u32 aid; lockdep_assert_wiphy(hw->wiphy); @@ -1778,7 +1813,15 @@ static void ath12k_peer_assoc_h_basic(struct ath12k *ar, /* TODO: STA WAR in ath10k for listen interval required? */ arg->peer_listen_intval = hw->conf.listen_interval; arg->peer_nss = 1; - arg->peer_caps = vif->bss_conf.assoc_capability; + + bss_conf = ath12k_mac_get_link_bss_conf(arvif); + if (!bss_conf) { + ath12k_warn(ar->ab, "unable to access bss link conf in peer assoc for vif %pM link %u\n", + vif->addr, arvif->link_id); + return; + } + + arg->peer_caps = bss_conf->assoc_capability; } static void ath12k_peer_assoc_h_crypto(struct ath12k *ar, @@ -1788,7 +1831,7 @@ static void ath12k_peer_assoc_h_crypto(struct ath12k *ar, { struct ieee80211_vif *vif = ath12k_ahvif_to_vif(arvif->ahvif); struct ieee80211_sta *sta = ath12k_ahsta_to_sta(arsta->ahsta); - struct ieee80211_bss_conf *info = &vif->bss_conf; + struct ieee80211_bss_conf *info; struct cfg80211_chan_def def; struct cfg80211_bss *bss; struct ieee80211_hw *hw = ath12k_ar_to_hw(ar); @@ -1797,6 +1840,13 @@ static void ath12k_peer_assoc_h_crypto(struct ath12k *ar, lockdep_assert_wiphy(hw->wiphy); + info = ath12k_mac_get_link_bss_conf(arvif); + if (!info) { + ath12k_warn(ar->ab, "unable to access bss link conf for peer assoc crypto for vif %pM link %u\n", + vif->addr, arvif->link_id); + return; + } + if (WARN_ON(ath12k_mac_vif_link_chan(vif, arvif->link_id, &def))) return; @@ -2176,6 +2226,7 @@ static void ath12k_peer_assoc_h_he(struct ath12k *ar, struct ieee80211_vif *vif = ath12k_ahvif_to_vif(arvif->ahvif); struct ieee80211_sta *sta = ath12k_ahsta_to_sta(arsta->ahsta); const struct ieee80211_sta_he_cap *he_cap = &sta->deflink.he_cap; + struct ieee80211_bss_conf *link_conf; int i; u8 ampdu_factor, max_nss; u8 rx_mcs_80 = IEEE80211_HE_MCS_NOT_SUPPORTED; @@ -2184,6 +2235,13 @@ static void ath12k_peer_assoc_h_he(struct ath12k *ar, bool support_160; u16 v; + link_conf = ath12k_mac_get_link_bss_conf(arvif); + if (!link_conf) { + ath12k_warn(ar->ab, "unable to access bss link conf in peer assoc he for vif %pM link %u", + vif->addr, arvif->link_id); + return; + } + if (!he_cap->has_he) return; @@ -2227,7 +2285,7 @@ static void ath12k_peer_assoc_h_he(struct ath12k *ar, sizeof(he_cap->he_cap_elem.mac_cap_info)); memcpy(&arg->peer_he_cap_phyinfo, he_cap->he_cap_elem.phy_cap_info, sizeof(he_cap->he_cap_elem.phy_cap_info)); - arg->peer_he_ops = vif->bss_conf.he_oper.params; + arg->peer_he_ops = link_conf->he_oper.params; /* the top most byte is used to indicate BSS color info */ arg->peer_he_ops &= 0xffffff; @@ -3133,6 +3191,7 @@ static void ath12k_recalculate_mgmt_rate(struct ath12k *ar, struct ieee80211_vif *vif = ath12k_ahvif_to_vif(arvif->ahvif); struct ieee80211_hw *hw = ath12k_ar_to_hw(ar); const struct ieee80211_supported_band *sband; + struct ieee80211_bss_conf *bss_conf; u8 basic_rate_idx; int hw_rate_code; u32 vdev_param; @@ -3141,8 +3200,15 @@ static void ath12k_recalculate_mgmt_rate(struct ath12k *ar, lockdep_assert_wiphy(hw->wiphy); + bss_conf = ath12k_mac_get_link_bss_conf(arvif); + if (!bss_conf) { + ath12k_warn(ar->ab, "unable to access bss link conf in mgmt rate calc for vif %pM link %u\n", + vif->addr, arvif->link_id); + return; + } + sband = hw->wiphy->bands[def->chan->band]; - basic_rate_idx = ffs(vif->bss_conf.basic_rates) - 1; + basic_rate_idx = ffs(bss_conf->basic_rates) - 1; bitrate = sband->bitrates[basic_rate_idx].bitrate; hw_rate_code = ath12k_mac_get_rate_hw_value(bitrate); @@ -3226,6 +3292,7 @@ static void ath12k_mac_op_vif_cfg_changed(struct ieee80211_hw *hw, { struct ath12k_vif *ahvif = ath12k_vif_to_ahvif(vif); unsigned long links = ahvif->links_map; + struct ieee80211_bss_conf *info; struct ath12k_link_vif *arvif; struct ath12k *ar; u8 link_id; @@ -3246,10 +3313,15 @@ static void ath12k_mac_op_vif_cfg_changed(struct ieee80211_hw *hw, ar = arvif->ar; - if (vif->cfg.assoc) - ath12k_bss_assoc(ar, arvif, &vif->bss_conf); - else + if (vif->cfg.assoc) { + info = ath12k_mac_get_link_bss_conf(arvif); + if (!info) + continue; + + ath12k_bss_assoc(ar, arvif, info); + } else { ath12k_bss_disassoc(ar, arvif); + } } } } @@ -3260,6 +3332,7 @@ static void ath12k_mac_vif_setup_ps(struct ath12k_link_vif *arvif) struct ieee80211_vif *vif = arvif->ahvif->vif; struct ieee80211_conf *conf = &ath12k_ar_to_hw(ar)->conf; enum wmi_sta_powersave_param param; + struct ieee80211_bss_conf *info; enum wmi_sta_ps_mode psmode; int ret; int timeout; @@ -3277,8 +3350,15 @@ static void ath12k_mac_vif_setup_ps(struct ath12k_link_vif *arvif) timeout = conf->dynamic_ps_timeout; if (timeout == 0) { + info = ath12k_mac_get_link_bss_conf(arvif); + if (!info) { + ath12k_warn(ar->ab, "unable to access bss link conf in setup ps for vif %pM link %u\n", + vif->addr, arvif->link_id); + return; + } + /* firmware doesn't like 0 */ - timeout = ieee80211_tu_to_usec(vif->bss_conf.beacon_int) / 1000; + timeout = ieee80211_tu_to_usec(info->beacon_int) / 1000; } ret = ath12k_wmi_set_sta_ps_param(ar, arvif->vdev_id, param, @@ -3389,8 +3469,8 @@ static void ath12k_mac_bss_info_changed(struct ath12k *ar, if (changed & BSS_CHANGED_BEACON_ENABLED) { ath12k_control_beaconing(arvif, info); - if (arvif->is_up && vif->bss_conf.he_support && - vif->bss_conf.he_oper.params) { + if (arvif->is_up && info->he_support && + info->he_oper.params) { /* TODO: Extend to support 1024 BA Bitmap size */ ret = ath12k_wmi_vdev_set_param_cmd(ar, arvif->vdev_id, WMI_VDEV_PARAM_BA_MODE, @@ -3401,7 +3481,7 @@ static void ath12k_mac_bss_info_changed(struct ath12k *ar, arvif->vdev_id); param_id = WMI_VDEV_PARAM_HEOPS_0_31; - param_value = vif->bss_conf.he_oper.params; + param_value = info->he_oper.params; ret = ath12k_wmi_vdev_set_param_cmd(ar, arvif->vdev_id, param_id, param_value); ath12k_dbg(ar->ab, ATH12K_DBG_MAC, @@ -3493,12 +3573,12 @@ static void ath12k_mac_bss_info_changed(struct ath12k *ar, if (changed & BSS_CHANGED_MCAST_RATE && !ath12k_mac_vif_link_chan(vif, arvif->link_id, &def)) { band = def.chan->band; - mcast_rate = vif->bss_conf.mcast_rate[band]; + mcast_rate = info->mcast_rate[band]; if (mcast_rate > 0) rateidx = mcast_rate - 1; else - rateidx = ffs(vif->bss_conf.basic_rates) - 1; + rateidx = ffs(info->basic_rates) - 1; if (ar->pdev->cap.supported_bands & WMI_HOST_WLAN_5G_CAP) rateidx += ATH12K_MAC_FIRST_OFDM_RATE_IDX; @@ -4278,6 +4358,7 @@ static int ath12k_mac_set_key(struct ath12k *ar, enum set_key_cmd cmd, { struct ath12k_vif *ahvif = arvif->ahvif; struct ieee80211_vif *vif = ath12k_ahvif_to_vif(ahvif); + struct ieee80211_bss_conf *link_conf; struct ieee80211_sta *sta = NULL; struct ath12k_base *ab = ar->ab; struct ath12k_peer *peer; @@ -4294,12 +4375,19 @@ static int ath12k_mac_set_key(struct ath12k *ar, enum set_key_cmd cmd, if (test_bit(ATH12K_FLAG_HW_CRYPTO_DISABLED, &ab->dev_flags)) return 1; + link_conf = ath12k_mac_get_link_bss_conf(arvif); + if (!link_conf) { + ath12k_warn(ab, "unable to access bss link conf in set key for vif %pM link %u\n", + vif->addr, arvif->link_id); + return -ENOLINK; + } + if (sta) peer_addr = arsta->addr; else if (ahvif->vdev_type == WMI_VDEV_TYPE_STA) - peer_addr = vif->bss_conf.bssid; + peer_addr = link_conf->bssid; else - peer_addr = vif->addr; + peer_addr = link_conf->addr; key->hw_key_idx = key->keyidx; @@ -7086,6 +7174,7 @@ static int ath12k_mac_setup_vdev_params_mbssid(struct ath12k_link_vif *arvif, { struct ath12k_vif *ahvif = arvif->ahvif; struct ieee80211_vif *tx_vif = ahvif->vif->mbssid_tx_vif; + struct ieee80211_bss_conf *link_conf; struct ath12k *ar = arvif->ar; struct ath12k_link_vif *tx_arvif; struct ath12k_vif *tx_ahvif; @@ -7093,10 +7182,17 @@ static int ath12k_mac_setup_vdev_params_mbssid(struct ath12k_link_vif *arvif, if (!tx_vif) return 0; + link_conf = ath12k_mac_get_link_bss_conf(arvif); + if (!link_conf) { + ath12k_warn(ar->ab, "unable to access bss link conf in set mbssid params for vif %pM link %u\n", + ahvif->vif->addr, arvif->link_id); + return -ENOLINK; + } + tx_ahvif = ath12k_vif_to_ahvif(tx_vif); tx_arvif = &tx_ahvif->deflink; - if (ahvif->vif->bss_conf.nontransmitted) { + if (link_conf->nontransmitted) { if (ar->ah->hw->wiphy != ieee80211_vif_to_wdev(tx_vif)->wiphy) return -EINVAL; @@ -7108,7 +7204,7 @@ static int ath12k_mac_setup_vdev_params_mbssid(struct ath12k_link_vif *arvif, return -EINVAL; } - if (ahvif->vif->bss_conf.ema_ap) + if (link_conf->ema_ap) *flags |= WMI_VDEV_MBSSID_FLAGS_EMA_MODE; return 0; @@ -7476,7 +7572,7 @@ int ath12k_mac_vdev_create(struct ath12k *ar, struct ath12k_link_vif *arvif) break; } - arvif->txpower = vif->bss_conf.txpower; + arvif->txpower = link_conf->txpower; ret = ath12k_mac_txpower_recalc(ar); if (ret) goto err_peer_del; @@ -8165,11 +8261,18 @@ ath12k_mac_vdev_start_restart(struct ath12k_link_vif *arvif, struct wmi_vdev_start_req_arg arg = {}; const struct cfg80211_chan_def *chandef = &ctx->def; struct ath12k_vif *ahvif = arvif->ahvif; - int he_support = ahvif->vif->bss_conf.he_support; + struct ieee80211_bss_conf *link_conf; int ret; lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy); + link_conf = ath12k_mac_get_link_bss_conf(arvif); + if (!link_conf) { + ath12k_warn(ar->ab, "unable to access bss link conf in vdev start for vif %pM link %u\n", + ahvif->vif->addr, arvif->link_id); + return -ENOLINK; + } + reinit_completion(&ar->vdev_setup_done); arg.vdev_id = arvif->vdev_id; @@ -8221,7 +8324,7 @@ ath12k_mac_vdev_start_restart(struct ath12k_link_vif *arvif, spin_unlock_bh(&ab->base_lock); /* TODO: Notify if secondary 80Mhz also needs radar detection */ - if (he_support) { + if (link_conf->he_support) { ret = ath12k_set_he_mu_sounding_mode(ar, arvif); if (ret) { ath12k_warn(ar->ab, "failed to set he mode vdev %i\n", diff --git a/drivers/net/wireless/ath/ath12k/wmi.c b/drivers/net/wireless/ath/ath12k/wmi.c index 50ed7e72f178f..402ae477da61b 100644 --- a/drivers/net/wireless/ath/ath12k/wmi.c +++ b/drivers/net/wireless/ath/ath12k/wmi.c @@ -6854,6 +6854,7 @@ ath12k_wmi_process_csa_switch_count_event(struct ath12k_base *ab, const u32 *vdev_ids) { int i; + struct ieee80211_bss_conf *conf; struct ath12k_link_vif *arvif; struct ath12k_vif *ahvif; @@ -6872,7 +6873,20 @@ ath12k_wmi_process_csa_switch_count_event(struct ath12k_base *ab, } ahvif = arvif->ahvif; - if (arvif->is_up && ahvif->vif->bss_conf.csa_active) + if (arvif->link_id > IEEE80211_MLD_MAX_NUM_LINKS) { + ath12k_warn(ab, "Invalid CSA switch count even link id: %d\n", + arvif->link_id); + continue; + } + + conf = rcu_dereference(ahvif->vif->link_conf[arvif->link_id]); + if (!conf) { + ath12k_warn(ab, "unable to access bss link conf in process csa for vif %pM link %u\n", + ahvif->vif->addr, arvif->link_id); + continue; + } + + if (arvif->is_up && conf->csa_active) ieee80211_csa_finish(ahvif->vif, 0); } rcu_read_unlock(); From 63fdb90642eec9439dd13c93c4b5c184b60a50cd Mon Sep 17 00:00:00 2001 From: Sriram R Date: Thu, 21 Nov 2024 17:58:06 +0200 Subject: [PATCH 0058/1450] wifi: ath12k: Use mac80211 sta's link_sta instead of deflink Currently mac80211's struct ieee80211_sta deflink is used to fetch any sta related configurations in driver. With MLO multiple link sta's (struct ieee80211_link_sta) are affiliated to an ML sta and corresponding link configs are present in sta->link[]. Fetch link sta of corresponding link from ML sta and use the same for configurations. Add ath12k_mac_get_link_sta() helper to fetch ieee80211_link_sta from arsta. But as ath12k_mac_op_sta_rc_update() is called in atomic context the helper cannot be used and instead rcu_dereference() has to be called directly. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.3.1-00173-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Sriram R Co-developed-by: Rameshkumar Sundaram Signed-off-by: Rameshkumar Sundaram Signed-off-by: Kalle Valo Acked-by: Jeff Johnson Link: https://patch.msgid.link/20241121155806.1862733-9-kvalo@kernel.org --- drivers/net/wireless/ath/ath12k/mac.c | 292 ++++++++++++++++++-------- 1 file changed, 209 insertions(+), 83 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index 827a1700e8ba0..60702bf071415 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -519,6 +519,23 @@ ath12k_mac_get_link_bss_conf(struct ath12k_link_vif *arvif) return link_conf; } +static struct ieee80211_link_sta *ath12k_mac_get_link_sta(struct ath12k_link_sta *arsta) +{ + struct ath12k_sta *ahsta = arsta->ahsta; + struct ieee80211_sta *sta = ath12k_ahsta_to_sta(ahsta); + struct ieee80211_link_sta *link_sta; + + lockdep_assert_wiphy(ahsta->ahvif->ah->hw->wiphy); + + if (arsta->link_id >= IEEE80211_MLD_MAX_NUM_LINKS) + return NULL; + + link_sta = wiphy_dereference(ahsta->ahvif->ah->hw->wiphy, + sta->link[arsta->link_id]); + + return link_sta; +} + static bool ath12k_mac_bitrate_is_cck(int bitrate) { switch (bitrate) { @@ -1902,6 +1919,7 @@ static void ath12k_peer_assoc_h_rates(struct ath12k *ar, struct ieee80211_vif *vif = ath12k_ahvif_to_vif(arvif->ahvif); struct ieee80211_sta *sta = ath12k_ahsta_to_sta(arsta->ahsta); struct wmi_rate_set_arg *rateset = &arg->peer_legacy_rates; + struct ieee80211_link_sta *link_sta; struct cfg80211_chan_def def; const struct ieee80211_supported_band *sband; const struct ieee80211_rate *rates; @@ -1916,9 +1934,16 @@ static void ath12k_peer_assoc_h_rates(struct ath12k *ar, if (WARN_ON(ath12k_mac_vif_link_chan(vif, arvif->link_id, &def))) return; + link_sta = ath12k_mac_get_link_sta(arsta); + if (!link_sta) { + ath12k_warn(ar->ab, "unable to access link sta in peer assoc rates for sta %pM link %u\n", + sta->addr, arsta->link_id); + return; + } + band = def.chan->band; sband = hw->wiphy->bands[band]; - ratemask = sta->deflink.supp_rates[band]; + ratemask = link_sta->supp_rates[band]; ratemask &= arvif->bitrate_mask.control[band].legacy; rates = sband->bitrates; @@ -1965,7 +1990,8 @@ static void ath12k_peer_assoc_h_ht(struct ath12k *ar, { struct ieee80211_vif *vif = ath12k_ahvif_to_vif(arvif->ahvif); struct ieee80211_sta *sta = ath12k_ahsta_to_sta(arsta->ahsta); - const struct ieee80211_sta_ht_cap *ht_cap = &sta->deflink.ht_cap; + const struct ieee80211_sta_ht_cap *ht_cap; + struct ieee80211_link_sta *link_sta; struct cfg80211_chan_def def; enum nl80211_band band; const u8 *ht_mcs_mask; @@ -1978,6 +2004,14 @@ static void ath12k_peer_assoc_h_ht(struct ath12k *ar, if (WARN_ON(ath12k_mac_vif_link_chan(vif, arvif->link_id, &def))) return; + link_sta = ath12k_mac_get_link_sta(arsta); + if (!link_sta) { + ath12k_warn(ar->ab, "unable to access link sta in peer assoc ht for sta %pM link %u\n", + sta->addr, arsta->link_id); + return; + } + + ht_cap = &link_sta->ht_cap; if (!ht_cap->ht_supported) return; @@ -2001,7 +2035,7 @@ static void ath12k_peer_assoc_h_ht(struct ath12k *ar, if (ht_cap->cap & IEEE80211_HT_CAP_LDPC_CODING) arg->ldpc_flag = true; - if (sta->deflink.bandwidth >= IEEE80211_STA_RX_BW_40) { + if (link_sta->bandwidth >= IEEE80211_STA_RX_BW_40) { arg->bw_40 = true; arg->peer_rate_caps |= WMI_HOST_RC_CW40_FLAG; } @@ -2051,7 +2085,7 @@ static void ath12k_peer_assoc_h_ht(struct ath12k *ar, arg->peer_ht_rates.rates[i] = i; } else { arg->peer_ht_rates.num_rates = n; - arg->peer_nss = min(sta->deflink.rx_nss, max_nss); + arg->peer_nss = min(link_sta->rx_nss, max_nss); } ath12k_dbg(ar->ab, ATH12K_DBG_MAC, "mac ht peer %pM mcs cnt %d nss %d\n", @@ -2127,7 +2161,8 @@ static void ath12k_peer_assoc_h_vht(struct ath12k *ar, { struct ieee80211_vif *vif = ath12k_ahvif_to_vif(arvif->ahvif); struct ieee80211_sta *sta = ath12k_ahsta_to_sta(arsta->ahsta); - const struct ieee80211_sta_vht_cap *vht_cap = &sta->deflink.vht_cap; + const struct ieee80211_sta_vht_cap *vht_cap; + struct ieee80211_link_sta *link_sta; struct cfg80211_chan_def def; enum nl80211_band band; const u16 *vht_mcs_mask; @@ -2141,6 +2176,14 @@ static void ath12k_peer_assoc_h_vht(struct ath12k *ar, if (WARN_ON(ath12k_mac_vif_link_chan(vif, arvif->link_id, &def))) return; + link_sta = ath12k_mac_get_link_sta(arsta); + if (!link_sta) { + ath12k_warn(ar->ab, "unable to access link sta in peer assoc vht for sta %pM link %u\n", + sta->addr, arsta->link_id); + return; + } + + vht_cap = &link_sta->vht_cap; if (!vht_cap->vht_supported) return; @@ -2173,10 +2216,10 @@ static void ath12k_peer_assoc_h_vht(struct ath12k *ar, (1U << (IEEE80211_HT_MAX_AMPDU_FACTOR + ampdu_factor)) - 1); - if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_80) + if (link_sta->bandwidth == IEEE80211_STA_RX_BW_80) arg->bw_80 = true; - if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_160) + if (link_sta->bandwidth == IEEE80211_STA_RX_BW_160) arg->bw_160 = true; /* Calculate peer NSS capability from VHT capabilities if STA @@ -2190,7 +2233,7 @@ static void ath12k_peer_assoc_h_vht(struct ath12k *ar, vht_mcs_mask[i]) max_nss = i + 1; } - arg->peer_nss = min(sta->deflink.rx_nss, max_nss); + arg->peer_nss = min(link_sta->rx_nss, max_nss); arg->rx_max_rate = __le16_to_cpu(vht_cap->vht_mcs.rx_highest); arg->rx_mcs_set = __le16_to_cpu(vht_cap->vht_mcs.rx_mcs_map); arg->tx_max_rate = __le16_to_cpu(vht_cap->vht_mcs.tx_highest); @@ -2225,8 +2268,9 @@ static void ath12k_peer_assoc_h_he(struct ath12k *ar, { struct ieee80211_vif *vif = ath12k_ahvif_to_vif(arvif->ahvif); struct ieee80211_sta *sta = ath12k_ahsta_to_sta(arsta->ahsta); - const struct ieee80211_sta_he_cap *he_cap = &sta->deflink.he_cap; + const struct ieee80211_sta_he_cap *he_cap; struct ieee80211_bss_conf *link_conf; + struct ieee80211_link_sta *link_sta; int i; u8 ampdu_factor, max_nss; u8 rx_mcs_80 = IEEE80211_HE_MCS_NOT_SUPPORTED; @@ -2242,6 +2286,14 @@ static void ath12k_peer_assoc_h_he(struct ath12k *ar, return; } + link_sta = ath12k_mac_get_link_sta(arsta); + if (!link_sta) { + ath12k_warn(ar->ab, "unable to access link sta in peer assoc he for sta %pM link %u\n", + sta->addr, arsta->link_id); + return; + } + + he_cap = &link_sta->he_cap; if (!he_cap->has_he) return; @@ -2279,7 +2331,7 @@ static void ath12k_peer_assoc_h_he(struct ath12k *ar, else max_nss = rx_mcs_80; - arg->peer_nss = min(sta->deflink.rx_nss, max_nss); + arg->peer_nss = min(link_sta->rx_nss, max_nss); memcpy(&arg->peer_he_cap_macinfo, he_cap->he_cap_elem.mac_cap_info, sizeof(he_cap->he_cap_elem.mac_cap_info)); @@ -2306,10 +2358,10 @@ static void ath12k_peer_assoc_h_he(struct ath12k *ar, IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_MASK); if (ampdu_factor) { - if (sta->deflink.vht_cap.vht_supported) + if (link_sta->vht_cap.vht_supported) arg->peer_max_mpdu = (1 << (IEEE80211_HE_VHT_MAX_AMPDU_FACTOR + ampdu_factor)) - 1; - else if (sta->deflink.ht_cap.ht_supported) + else if (link_sta->ht_cap.ht_supported) arg->peer_max_mpdu = (1 << (IEEE80211_HE_HT_MAX_AMPDU_FACTOR + ampdu_factor)) - 1; } @@ -2350,7 +2402,7 @@ static void ath12k_peer_assoc_h_he(struct ath12k *ar, if (he_cap->he_cap_elem.mac_cap_info[0] & IEEE80211_HE_MAC_CAP0_TWT_REQ) arg->twt_requester = true; - switch (sta->deflink.bandwidth) { + switch (link_sta->bandwidth) { case IEEE80211_STA_RX_BW_160: if (he_cap->he_cap_elem.phy_cap_info[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G) { @@ -2390,7 +2442,8 @@ static void ath12k_peer_assoc_h_he_6ghz(struct ath12k *ar, { struct ieee80211_vif *vif = ath12k_ahvif_to_vif(arvif->ahvif); struct ieee80211_sta *sta = ath12k_ahsta_to_sta(arsta->ahsta); - const struct ieee80211_sta_he_cap *he_cap = &sta->deflink.he_cap; + const struct ieee80211_sta_he_cap *he_cap; + struct ieee80211_link_sta *link_sta; struct cfg80211_chan_def def; enum nl80211_band band; u8 ampdu_factor, mpdu_density; @@ -2400,22 +2453,31 @@ static void ath12k_peer_assoc_h_he_6ghz(struct ath12k *ar, band = def.chan->band; - if (!arg->he_flag || band != NL80211_BAND_6GHZ || !sta->deflink.he_6ghz_capa.capa) + link_sta = ath12k_mac_get_link_sta(arsta); + if (!link_sta) { + ath12k_warn(ar->ab, "unable to access link sta in peer assoc he 6ghz for sta %pM link %u\n", + sta->addr, arsta->link_id); + return; + } + + he_cap = &link_sta->he_cap; + + if (!arg->he_flag || band != NL80211_BAND_6GHZ || !link_sta->he_6ghz_capa.capa) return; - if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_40) + if (link_sta->bandwidth == IEEE80211_STA_RX_BW_40) arg->bw_40 = true; - if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_80) + if (link_sta->bandwidth == IEEE80211_STA_RX_BW_80) arg->bw_80 = true; - if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_160) + if (link_sta->bandwidth == IEEE80211_STA_RX_BW_160) arg->bw_160 = true; - if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_320) + if (link_sta->bandwidth == IEEE80211_STA_RX_BW_320) arg->bw_320 = true; - arg->peer_he_caps_6ghz = le16_to_cpu(sta->deflink.he_6ghz_capa.capa); + arg->peer_he_caps_6ghz = le16_to_cpu(link_sta->he_6ghz_capa.capa); mpdu_density = u32_get_bits(arg->peer_he_caps_6ghz, IEEE80211_HE_6GHZ_CAP_MIN_MPDU_START); @@ -2459,10 +2521,23 @@ static void ath12k_peer_assoc_h_smps(struct ath12k_link_sta *arsta, struct ath12k_wmi_peer_assoc_arg *arg) { struct ieee80211_sta *sta = ath12k_ahsta_to_sta(arsta->ahsta); - const struct ieee80211_he_6ghz_capa *he_6ghz_capa = &sta->deflink.he_6ghz_capa; - const struct ieee80211_sta_ht_cap *ht_cap = &sta->deflink.ht_cap; + const struct ieee80211_he_6ghz_capa *he_6ghz_capa; + struct ath12k_link_vif *arvif = arsta->arvif; + const struct ieee80211_sta_ht_cap *ht_cap; + struct ieee80211_link_sta *link_sta; + struct ath12k *ar = arvif->ar; int smps; + link_sta = ath12k_mac_get_link_sta(arsta); + if (!link_sta) { + ath12k_warn(ar->ab, "unable to access link sta in peer assoc he for sta %pM link %u\n", + sta->addr, arsta->link_id); + return; + } + + he_6ghz_capa = &link_sta->he_6ghz_capa; + ht_cap = &link_sta->ht_cap; + if (!ht_cap->ht_supported && !he_6ghz_capa->capa) return; @@ -2588,17 +2663,17 @@ static int ath12k_peer_assoc_qos_ap(struct ath12k *ar, return ret; } -static bool ath12k_mac_sta_has_ofdm_only(struct ieee80211_sta *sta) +static bool ath12k_mac_sta_has_ofdm_only(struct ieee80211_link_sta *sta) { - return sta->deflink.supp_rates[NL80211_BAND_2GHZ] >> + return sta->supp_rates[NL80211_BAND_2GHZ] >> ATH12K_MAC_FIRST_OFDM_RATE_IDX; } static enum wmi_phy_mode ath12k_mac_get_phymode_vht(struct ath12k *ar, - struct ieee80211_sta *sta) + struct ieee80211_link_sta *link_sta) { - if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_160) { - switch (sta->deflink.vht_cap.cap & + if (link_sta->bandwidth == IEEE80211_STA_RX_BW_160) { + switch (link_sta->vht_cap.cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) { case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ: return MODE_11AC_VHT160; @@ -2610,74 +2685,74 @@ static enum wmi_phy_mode ath12k_mac_get_phymode_vht(struct ath12k *ar, } } - if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_80) + if (link_sta->bandwidth == IEEE80211_STA_RX_BW_80) return MODE_11AC_VHT80; - if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_40) + if (link_sta->bandwidth == IEEE80211_STA_RX_BW_40) return MODE_11AC_VHT40; - if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_20) + if (link_sta->bandwidth == IEEE80211_STA_RX_BW_20) return MODE_11AC_VHT20; return MODE_UNKNOWN; } static enum wmi_phy_mode ath12k_mac_get_phymode_he(struct ath12k *ar, - struct ieee80211_sta *sta) + struct ieee80211_link_sta *link_sta) { - if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_160) { - if (sta->deflink.he_cap.he_cap_elem.phy_cap_info[0] & + if (link_sta->bandwidth == IEEE80211_STA_RX_BW_160) { + if (link_sta->he_cap.he_cap_elem.phy_cap_info[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G) return MODE_11AX_HE160; - else if (sta->deflink.he_cap.he_cap_elem.phy_cap_info[0] & + else if (link_sta->he_cap.he_cap_elem.phy_cap_info[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G) return MODE_11AX_HE80_80; /* not sure if this is a valid case? */ return MODE_11AX_HE160; } - if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_80) + if (link_sta->bandwidth == IEEE80211_STA_RX_BW_80) return MODE_11AX_HE80; - if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_40) + if (link_sta->bandwidth == IEEE80211_STA_RX_BW_40) return MODE_11AX_HE40; - if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_20) + if (link_sta->bandwidth == IEEE80211_STA_RX_BW_20) return MODE_11AX_HE20; return MODE_UNKNOWN; } static enum wmi_phy_mode ath12k_mac_get_phymode_eht(struct ath12k *ar, - struct ieee80211_sta *sta) + struct ieee80211_link_sta *link_sta) { - if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_320) - if (sta->deflink.eht_cap.eht_cap_elem.phy_cap_info[0] & + if (link_sta->bandwidth == IEEE80211_STA_RX_BW_320) + if (link_sta->eht_cap.eht_cap_elem.phy_cap_info[0] & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ) return MODE_11BE_EHT320; - if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_160) { - if (sta->deflink.he_cap.he_cap_elem.phy_cap_info[0] & + if (link_sta->bandwidth == IEEE80211_STA_RX_BW_160) { + if (link_sta->he_cap.he_cap_elem.phy_cap_info[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G) return MODE_11BE_EHT160; - if (sta->deflink.he_cap.he_cap_elem.phy_cap_info[0] & + if (link_sta->he_cap.he_cap_elem.phy_cap_info[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G) return MODE_11BE_EHT80_80; ath12k_warn(ar->ab, "invalid EHT PHY capability info for 160 Mhz: %d\n", - sta->deflink.he_cap.he_cap_elem.phy_cap_info[0]); + link_sta->he_cap.he_cap_elem.phy_cap_info[0]); return MODE_11BE_EHT160; } - if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_80) + if (link_sta->bandwidth == IEEE80211_STA_RX_BW_80) return MODE_11BE_EHT80; - if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_40) + if (link_sta->bandwidth == IEEE80211_STA_RX_BW_40) return MODE_11BE_EHT40; - if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_20) + if (link_sta->bandwidth == IEEE80211_STA_RX_BW_20) return MODE_11BE_EHT20; return MODE_UNKNOWN; @@ -2688,6 +2763,7 @@ static void ath12k_peer_assoc_h_phymode(struct ath12k *ar, struct ath12k_link_sta *arsta, struct ath12k_wmi_peer_assoc_arg *arg) { + struct ieee80211_link_sta *link_sta; struct cfg80211_chan_def def; enum nl80211_band band; const u8 *ht_mcs_mask; @@ -2706,33 +2782,40 @@ static void ath12k_peer_assoc_h_phymode(struct ath12k *ar, ht_mcs_mask = arvif->bitrate_mask.control[band].ht_mcs; vht_mcs_mask = arvif->bitrate_mask.control[band].vht_mcs; + link_sta = ath12k_mac_get_link_sta(arsta); + if (!link_sta) { + ath12k_warn(ar->ab, "unable to access link sta in peer assoc he for sta %pM link %u\n", + sta->addr, arsta->link_id); + return; + } + switch (band) { case NL80211_BAND_2GHZ: - if (sta->deflink.eht_cap.has_eht) { - if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_40) + if (link_sta->eht_cap.has_eht) { + if (link_sta->bandwidth == IEEE80211_STA_RX_BW_40) phymode = MODE_11BE_EHT40_2G; else phymode = MODE_11BE_EHT20_2G; - } else if (sta->deflink.he_cap.has_he) { - if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_80) + } else if (link_sta->he_cap.has_he) { + if (link_sta->bandwidth == IEEE80211_STA_RX_BW_80) phymode = MODE_11AX_HE80_2G; - else if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_40) + else if (link_sta->bandwidth == IEEE80211_STA_RX_BW_40) phymode = MODE_11AX_HE40_2G; else phymode = MODE_11AX_HE20_2G; - } else if (sta->deflink.vht_cap.vht_supported && + } else if (link_sta->vht_cap.vht_supported && !ath12k_peer_assoc_h_vht_masked(vht_mcs_mask)) { - if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_40) + if (link_sta->bandwidth == IEEE80211_STA_RX_BW_40) phymode = MODE_11AC_VHT40; else phymode = MODE_11AC_VHT20; - } else if (sta->deflink.ht_cap.ht_supported && + } else if (link_sta->ht_cap.ht_supported && !ath12k_peer_assoc_h_ht_masked(ht_mcs_mask)) { - if (sta->deflink.bandwidth == IEEE80211_STA_RX_BW_40) + if (link_sta->bandwidth == IEEE80211_STA_RX_BW_40) phymode = MODE_11NG_HT40; else phymode = MODE_11NG_HT20; - } else if (ath12k_mac_sta_has_ofdm_only(sta)) { + } else if (ath12k_mac_sta_has_ofdm_only(link_sta)) { phymode = MODE_11G; } else { phymode = MODE_11B; @@ -2741,16 +2824,16 @@ static void ath12k_peer_assoc_h_phymode(struct ath12k *ar, case NL80211_BAND_5GHZ: case NL80211_BAND_6GHZ: /* Check EHT first */ - if (sta->deflink.eht_cap.has_eht) { - phymode = ath12k_mac_get_phymode_eht(ar, sta); - } else if (sta->deflink.he_cap.has_he) { - phymode = ath12k_mac_get_phymode_he(ar, sta); - } else if (sta->deflink.vht_cap.vht_supported && + if (link_sta->eht_cap.has_eht) { + phymode = ath12k_mac_get_phymode_eht(ar, link_sta); + } else if (link_sta->he_cap.has_he) { + phymode = ath12k_mac_get_phymode_he(ar, link_sta); + } else if (link_sta->vht_cap.vht_supported && !ath12k_peer_assoc_h_vht_masked(vht_mcs_mask)) { - phymode = ath12k_mac_get_phymode_vht(ar, sta); - } else if (sta->deflink.ht_cap.ht_supported && + phymode = ath12k_mac_get_phymode_vht(ar, link_sta); + } else if (link_sta->ht_cap.ht_supported && !ath12k_peer_assoc_h_ht_masked(ht_mcs_mask)) { - if (sta->deflink.bandwidth >= IEEE80211_STA_RX_BW_40) + if (link_sta->bandwidth >= IEEE80211_STA_RX_BW_40) phymode = MODE_11NA_HT40; else phymode = MODE_11NA_HT20; @@ -2838,15 +2921,25 @@ static void ath12k_peer_assoc_h_eht(struct ath12k *ar, struct ath12k_wmi_peer_assoc_arg *arg) { struct ieee80211_sta *sta = ath12k_ahsta_to_sta(arsta->ahsta); - const struct ieee80211_sta_eht_cap *eht_cap = &sta->deflink.eht_cap; - const struct ieee80211_sta_he_cap *he_cap = &sta->deflink.he_cap; const struct ieee80211_eht_mcs_nss_supp_20mhz_only *bw_20; const struct ieee80211_eht_mcs_nss_supp_bw *bw; + const struct ieee80211_sta_eht_cap *eht_cap; + const struct ieee80211_sta_he_cap *he_cap; + struct ieee80211_link_sta *link_sta; u32 *rx_mcs, *tx_mcs; lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy); - if (!sta->deflink.he_cap.has_he || !eht_cap->has_eht) + link_sta = ath12k_mac_get_link_sta(arsta); + if (!link_sta) { + ath12k_warn(ar->ab, "unable to access link sta in peer assoc eht for sta %pM link %u\n", + sta->addr, arsta->link_id); + return; + } + + eht_cap = &link_sta->eht_cap; + he_cap = &link_sta->he_cap; + if (!he_cap->has_he || !eht_cap->has_eht) return; arg->eht_flag = true; @@ -2865,7 +2958,7 @@ static void ath12k_peer_assoc_h_eht(struct ath12k *ar, rx_mcs = arg->peer_eht_rx_mcs_set; tx_mcs = arg->peer_eht_tx_mcs_set; - switch (sta->deflink.bandwidth) { + switch (link_sta->bandwidth) { case IEEE80211_STA_RX_BW_320: bw = &eht_cap->eht_mcs_nss_supp.bw._320; ath12k_mac_set_eht_mcs(bw->rx_tx_mcs9_max_nss, @@ -4662,6 +4755,7 @@ static int ath12k_mac_station_assoc(struct ath12k *ar, struct ieee80211_vif *vif = ath12k_ahvif_to_vif(arvif->ahvif); struct ieee80211_sta *sta = ath12k_ahsta_to_sta(arsta->ahsta); struct ath12k_wmi_peer_assoc_arg peer_arg; + struct ieee80211_link_sta *link_sta; int ret; struct cfg80211_chan_def def; enum nl80211_band band; @@ -4707,7 +4801,13 @@ static int ath12k_mac_station_assoc(struct ath12k *ar, * fixed param. * Note that all other rates and NSS will be disabled for this peer. */ - if (sta->deflink.vht_cap.vht_supported && num_vht_rates == 1) { + link_sta = ath12k_mac_get_link_sta(arsta); + if (!link_sta) { + ath12k_warn(ar->ab, "unable to access link sta in station assoc\n"); + return -EINVAL; + } + + if (link_sta->vht_cap.vht_supported && num_vht_rates == 1) { ret = ath12k_mac_set_peer_vht_fixed_rate(arvif, arsta, mask, band); if (ret) @@ -4721,8 +4821,7 @@ static int ath12k_mac_station_assoc(struct ath12k *ar, return 0; ret = ath12k_setup_peer_smps(ar, arvif, arsta->addr, - &sta->deflink.ht_cap, - &sta->deflink.he_6ghz_capa); + &link_sta->ht_cap, &link_sta->he_6ghz_capa); if (ret) { ath12k_warn(ar->ab, "failed to setup peer SMPS for vdev %d: %d\n", arvif->vdev_id, ret); @@ -4766,6 +4865,7 @@ static int ath12k_mac_station_disassoc(struct ath12k *ar, static void ath12k_sta_rc_update_wk(struct wiphy *wiphy, struct wiphy_work *wk) { + struct ieee80211_link_sta *link_sta; struct ath12k *ar; struct ath12k_link_vif *arvif; struct ieee80211_sta *sta; @@ -4900,7 +5000,14 @@ static void ath12k_sta_rc_update_wk(struct wiphy *wiphy, struct wiphy_work *wk) * TODO: Check RATEMASK_CMDID to support auto rates selection * across HT/VHT and for multiple VHT MCS support. */ - if (sta->deflink.vht_cap.vht_supported && num_vht_rates == 1) { + link_sta = ath12k_mac_get_link_sta(arsta); + if (!link_sta) { + ath12k_warn(ar->ab, "unable to access link sta in peer assoc he for sta %pM link %u\n", + sta->addr, arsta->link_id); + return; + } + + if (link_sta->vht_cap.vht_supported && num_vht_rates == 1) { ath12k_mac_set_peer_vht_fixed_rate(arvif, arsta, mask, band); } else { @@ -5603,10 +5710,23 @@ static void ath12k_mac_op_sta_rc_update(struct ieee80211_hw *hw, spin_unlock_bh(&ar->ab->base_lock); + if (arsta->link_id >= IEEE80211_MLD_MAX_NUM_LINKS) { + rcu_read_unlock(); + return; + } + + link_sta = rcu_dereference(sta->link[arsta->link_id]); + if (!link_sta) { + rcu_read_unlock(); + ath12k_warn(ar->ab, "unable to access link sta in rc update for sta %pM link %u\n", + sta->addr, arsta->link_id); + return; + } + ath12k_dbg(ar->ab, ATH12K_DBG_MAC, "mac sta rc update for %pM changed %08x bw %d nss %d smps %d\n", - arsta->addr, changed, sta->deflink.bandwidth, sta->deflink.rx_nss, - sta->deflink.smps_mode); + arsta->addr, changed, link_sta->bandwidth, link_sta->rx_nss, + link_sta->smps_mode); spin_lock_bh(&ar->data_lock); @@ -5617,12 +5737,12 @@ static void ath12k_mac_op_sta_rc_update(struct ieee80211_hw *hw, } if (changed & IEEE80211_RC_NSS_CHANGED) - arsta->nss = sta->deflink.rx_nss; + arsta->nss = link_sta->rx_nss; if (changed & IEEE80211_RC_SMPS_CHANGED) { smps = WMI_PEER_SMPS_PS_NONE; - switch (sta->deflink.smps_mode) { + switch (link_sta->smps_mode) { case IEEE80211_SMPS_AUTOMATIC: case IEEE80211_SMPS_OFF: smps = WMI_PEER_SMPS_PS_NONE; @@ -5634,8 +5754,8 @@ static void ath12k_mac_op_sta_rc_update(struct ieee80211_hw *hw, smps = WMI_PEER_SMPS_DYNAMIC; break; default: - ath12k_warn(ar->ab, "Invalid smps %d in sta rc update for %pM\n", - sta->deflink.smps_mode, arsta->addr); + ath12k_warn(ar->ab, "Invalid smps %d in sta rc update for %pM link %u\n", + link_sta->smps_mode, arsta->addr, link_sta->link_id); smps = WMI_PEER_SMPS_PS_NONE; break; } @@ -9241,10 +9361,11 @@ static void ath12k_mac_set_bitrate_mask_iter(void *data, { struct ath12k_link_vif *arvif = data; struct ath12k_sta *ahsta = ath12k_sta_to_ahsta(sta); - struct ath12k_link_sta *arsta = &ahsta->deflink; + struct ath12k_link_sta *arsta; struct ath12k *ar = arvif->ar; - if (arsta->arvif != arvif) + arsta = rcu_dereference(ahsta->link[arvif->link_id]); + if (!arsta || arsta->arvif != arvif) return; spin_lock_bh(&ar->data_lock); @@ -9259,11 +9380,16 @@ static void ath12k_mac_disable_peer_fixed_rate(void *data, { struct ath12k_link_vif *arvif = data; struct ath12k_sta *ahsta = ath12k_sta_to_ahsta(sta); - struct ath12k_link_sta *arsta = &ahsta->deflink; + struct ath12k_link_sta *arsta; struct ath12k *ar = arvif->ar; int ret; - if (arsta->arvif != arvif) + lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy); + + arsta = wiphy_dereference(ath12k_ar_to_hw(ar)->wiphy, + ahsta->link[arvif->link_id]); + + if (!arsta || arsta->arvif != arvif) return; ret = ath12k_wmi_set_peer_param(ar, arsta->addr, From 144c6cd24b3556e6e7a14271cee57a42ebf97450 Mon Sep 17 00:00:00 2001 From: Zong-Zhe Yang Date: Wed, 20 Nov 2024 11:40:49 +0800 Subject: [PATCH 0059/1450] wifi: rtw89: 8922a: configure AP_LINK_PS if FW supports After FW v0.35.46.0, for AP mode, RTL8922A FW supports a new FW feature, called NOTIFY_AP_INFO, to notify driver information related to AP mode. And, one function of it is to monitor PS states of remote stations. Once one of them changes, FW will send a C2H event to tell driver. With this FW feature, we can declare AP_LINK_PS. For now, driver still needs to determine if a frame is ps-poll or U-APSD trigger. So, add the corresponding RX handling in driver, which activates only when at least one AP is running. Signed-off-by: Zong-Zhe Yang Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241120034054.13575-2-pkshih@realtek.com --- drivers/net/wireless/realtek/rtw89/core.c | 42 ++++++++++++ drivers/net/wireless/realtek/rtw89/core.h | 37 +++++++++++ drivers/net/wireless/realtek/rtw89/fw.c | 66 +++++++++++++++++++ drivers/net/wireless/realtek/rtw89/fw.h | 19 ++++++ drivers/net/wireless/realtek/rtw89/mac.c | 50 ++++++++++++++ drivers/net/wireless/realtek/rtw89/mac.h | 7 ++ drivers/net/wireless/realtek/rtw89/mac80211.c | 13 +++- drivers/net/wireless/realtek/rtw89/ser.c | 1 + 8 files changed, 234 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/realtek/rtw89/core.c b/drivers/net/wireless/realtek/rtw89/core.c index 34034f44c050c..c99111df90a7e 100644 --- a/drivers/net/wireless/realtek/rtw89/core.c +++ b/drivers/net/wireless/realtek/rtw89/core.c @@ -2749,6 +2749,41 @@ static void rtw89_core_flush_ppdu_rx_queue(struct rtw89_dev *rtwdev, } } +static +void rtw89_core_rx_pkt_hdl(struct rtw89_dev *rtwdev, const struct sk_buff *skb, + const struct rtw89_rx_desc_info *desc) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; + struct rtw89_sta_link *rtwsta_link; + struct ieee80211_sta *sta; + struct rtw89_sta *rtwsta; + u8 macid = desc->mac_id; + + if (!refcount_read(&rtwdev->refcount_ap_info)) + return; + + rcu_read_lock(); + + rtwsta_link = rtw89_assoc_link_rcu_dereference(rtwdev, macid); + if (!rtwsta_link) + goto out; + + rtwsta = rtwsta_link->rtwsta; + if (!test_bit(RTW89_REMOTE_STA_IN_PS, rtwsta->flags)) + goto out; + + sta = rtwsta_to_sta(rtwsta); + if (ieee80211_is_pspoll(hdr->frame_control)) + ieee80211_sta_pspoll(sta); + else if (ieee80211_has_pm(hdr->frame_control) && + (ieee80211_is_data_qos(hdr->frame_control) || + ieee80211_is_qos_nullfunc(hdr->frame_control))) + ieee80211_sta_uapsd_trigger(sta, ieee80211_get_tid(hdr)); + +out: + rcu_read_unlock(); +} + void rtw89_core_rx(struct rtw89_dev *rtwdev, struct rtw89_rx_desc_info *desc_info, struct sk_buff *skb) @@ -2771,6 +2806,7 @@ void rtw89_core_rx(struct rtw89_dev *rtwdev, rx_status = IEEE80211_SKB_RXCB(skb); memset(rx_status, 0, sizeof(*rx_status)); rtw89_core_update_rx_status(rtwdev, desc_info, rx_status); + rtw89_core_rx_pkt_hdl(rtwdev, skb, desc_info); if (desc_info->long_rxdesc && BIT(desc_info->frame_type) & PPDU_FILTER_BITMAP) skb_queue_tail(&ppdu_sts->rx_queue[band], skb); @@ -3748,6 +3784,8 @@ int rtw89_core_sta_link_disassoc(struct rtw89_dev *rtwdev, { const struct ieee80211_vif *vif = rtwvif_link_to_vif(rtwvif_link); + rtw89_assoc_link_clr(rtwsta_link); + if (vif->type == NL80211_IFTYPE_STATION) rtw89_fw_h2c_set_bcn_fltr_cfg(rtwdev, rtwvif_link, false); @@ -3883,6 +3921,7 @@ int rtw89_core_sta_link_assoc(struct rtw89_dev *rtwdev, rtw89_fw_h2c_set_bcn_fltr_cfg(rtwdev, rtwvif_link, true); } + rtw89_assoc_link_set(rtwsta_link); return ret; } @@ -5150,6 +5189,9 @@ static int rtw89_core_register_hw(struct rtw89_dev *rtwdev) if (RTW89_CHK_FW_FEATURE(BEACON_FILTER, &rtwdev->fw)) ieee80211_hw_set(hw, CONNECTION_MONITOR); + if (RTW89_CHK_FW_FEATURE(NOTIFY_AP_INFO, &rtwdev->fw)) + ieee80211_hw_set(hw, AP_LINK_PS); + hw->wiphy->interface_modes = BIT(NL80211_IFTYPE_STATION) | BIT(NL80211_IFTYPE_AP) | BIT(NL80211_IFTYPE_P2P_CLIENT) | diff --git a/drivers/net/wireless/realtek/rtw89/core.h b/drivers/net/wireless/realtek/rtw89/core.h index ecccb51184be9..3e93b53fd67ba 100644 --- a/drivers/net/wireless/realtek/rtw89/core.h +++ b/drivers/net/wireless/realtek/rtw89/core.h @@ -4452,6 +4452,7 @@ enum rtw89_fw_feature { RTW89_FW_FEATURE_RFK_PRE_NOTIFY_V0, RTW89_FW_FEATURE_RFK_RXDCK_V0, RTW89_FW_FEATURE_NO_WOW_CPU_IO_RX, + RTW89_FW_FEATURE_NOTIFY_AP_INFO, }; struct rtw89_fw_suit { @@ -5596,6 +5597,9 @@ struct rtw89_dev { struct rtw89_rfe_data *rfe_data; enum rtw89_custid custid; + struct rtw89_sta_link __rcu *assoc_link_on_macid[RTW89_MAX_MAC_ID_NUM]; + refcount_t refcount_ap_info; + /* ensures exclusive access from mac80211 callbacks */ struct mutex mutex; struct list_head rtwvifs_list; @@ -5730,10 +5734,18 @@ static inline bool rtw89_vif_assign_link_is_valid(struct rtw89_vif_link **rtwvif for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) \ if (rtw89_vif_assign_link_is_valid(&(rtwvif_link), rtwvif, link_id)) +enum rtw89_sta_flags { + RTW89_REMOTE_STA_IN_PS, + + NUM_OF_RTW89_STA_FLAGS, +}; + struct rtw89_sta { struct rtw89_dev *rtwdev; struct rtw89_vif *rtwvif; + DECLARE_BITMAP(flags, NUM_OF_RTW89_STA_FLAGS); + bool disassoc; struct sk_buff_head roc_queue; @@ -5811,6 +5823,31 @@ u8 rtw89_sta_link_inst_get_index(struct rtw89_sta_link *rtwsta_link) return rtwsta_link - rtwsta->links_inst; } +static inline void rtw89_assoc_link_set(struct rtw89_sta_link *rtwsta_link) +{ + struct rtw89_sta *rtwsta = rtwsta_link->rtwsta; + struct rtw89_dev *rtwdev = rtwsta->rtwdev; + + rcu_assign_pointer(rtwdev->assoc_link_on_macid[rtwsta_link->mac_id], + rtwsta_link); +} + +static inline void rtw89_assoc_link_clr(struct rtw89_sta_link *rtwsta_link) +{ + struct rtw89_sta *rtwsta = rtwsta_link->rtwsta; + struct rtw89_dev *rtwdev = rtwsta->rtwdev; + + rcu_assign_pointer(rtwdev->assoc_link_on_macid[rtwsta_link->mac_id], + NULL); + synchronize_rcu(); +} + +static inline struct rtw89_sta_link * +rtw89_assoc_link_rcu_dereference(struct rtw89_dev *rtwdev, u8 macid) +{ + return rcu_dereference(rtwdev->assoc_link_on_macid[macid]); +} + static inline int rtw89_hci_tx_write(struct rtw89_dev *rtwdev, struct rtw89_core_tx_request *tx_req) { diff --git a/drivers/net/wireless/realtek/rtw89/fw.c b/drivers/net/wireless/realtek/rtw89/fw.c index 2191c037d72e4..7bda9aab382cf 100644 --- a/drivers/net/wireless/realtek/rtw89/fw.c +++ b/drivers/net/wireless/realtek/rtw89/fw.c @@ -728,6 +728,7 @@ static const struct __fw_feat_cfg fw_feat_tbl[] = { __CFG_FW_FEAT(RTL8922A, ge, 0, 35, 22, 0, WOW_REASON_V1), __CFG_FW_FEAT(RTL8922A, lt, 0, 35, 31, 0, RFK_PRE_NOTIFY_V0), __CFG_FW_FEAT(RTL8922A, lt, 0, 35, 42, 0, RFK_RXDCK_V0), + __CFG_FW_FEAT(RTL8922A, ge, 0, 35, 46, 0, NOTIFY_AP_INFO), }; static void rtw89_fw_iterate_feature_cfg(struct rtw89_fw_info *fw, @@ -8164,6 +8165,71 @@ int rtw89_fw_h2c_mrc_upd_duration(struct rtw89_dev *rtwdev, return 0; } +static int rtw89_fw_h2c_ap_info(struct rtw89_dev *rtwdev, bool en) +{ + struct rtw89_h2c_ap_info *h2c; + u32 len = sizeof(*h2c); + struct sk_buff *skb; + int ret; + + skb = rtw89_fw_h2c_alloc_skb_with_hdr(rtwdev, len); + if (!skb) { + rtw89_err(rtwdev, "failed to alloc skb for ap info\n"); + return -ENOMEM; + } + + skb_put(skb, len); + h2c = (struct rtw89_h2c_ap_info *)skb->data; + + h2c->w0 = le32_encode_bits(en, RTW89_H2C_AP_INFO_W0_PWR_INT_EN); + + rtw89_h2c_pkt_set_hdr(rtwdev, skb, FWCMD_TYPE_H2C, + H2C_CAT_MAC, + H2C_CL_AP, + H2C_FUNC_AP_INFO, 0, 0, + len); + + ret = rtw89_h2c_tx(rtwdev, skb, false); + if (ret) { + rtw89_err(rtwdev, "failed to send h2c\n"); + dev_kfree_skb_any(skb); + return -EBUSY; + } + + return 0; +} + +int rtw89_fw_h2c_ap_info_refcount(struct rtw89_dev *rtwdev, bool en) +{ + int ret; + + if (en) { + if (refcount_inc_not_zero(&rtwdev->refcount_ap_info)) + return 0; + } else { + if (!refcount_dec_and_test(&rtwdev->refcount_ap_info)) + return 0; + } + + ret = rtw89_fw_h2c_ap_info(rtwdev, en); + if (ret) { + if (!test_bit(RTW89_FLAG_SER_HANDLING, rtwdev->flags)) + return ret; + + /* During recovery, neither driver nor stack has full error + * handling, so show a warning, but return 0 with refcount + * increased normally. It can avoid underflow when calling + * with @en == false later. + */ + rtw89_warn(rtwdev, "h2c ap_info failed during SER\n"); + } + + if (en) + refcount_set(&rtwdev->refcount_ap_info, 1); + + return 0; +} + static bool __fw_txpwr_entry_zero_ext(const void *ext_ptr, u8 ext_len) { static const u8 zeros[U8_MAX] = {}; diff --git a/drivers/net/wireless/realtek/rtw89/fw.h b/drivers/net/wireless/realtek/rtw89/fw.h index efa63d4448210..9106bcce18518 100644 --- a/drivers/net/wireless/realtek/rtw89/fw.h +++ b/drivers/net/wireless/realtek/rtw89/fw.h @@ -3466,6 +3466,12 @@ struct rtw89_h2c_wow_aoac { __le32 w0; } __packed; +struct rtw89_h2c_ap_info { + __le32 w0; +} __packed; + +#define RTW89_H2C_AP_INFO_W0_PWR_INT_EN BIT(0) + #define RTW89_C2H_HEADER_LEN 8 struct rtw89_c2h_hdr { @@ -3725,6 +3731,14 @@ struct rtw89_c2h_wow_aoac_report { #define RTW89_C2H_WOW_AOAC_RPT_REKEY_IDX BIT(0) +struct rtw89_c2h_pwr_int_notify { + struct rtw89_c2h_hdr hdr; + __le32 w2; +} __packed; + +#define RTW89_C2H_PWR_INT_NOTIFY_W2_MACID GENMASK(15, 0) +#define RTW89_C2H_PWR_INT_NOTIFY_W2_PWR_STATUS BIT(16) + struct rtw89_h2c_tx_duty { __le32 w0; __le32 w1; @@ -4168,6 +4182,10 @@ enum rtw89_mrc_h2c_func { #define RTW89_MRC_WAIT_COND_REQ_TSF \ RTW89_MRC_WAIT_COND(0 /* don't care */, H2C_FUNC_MRC_REQ_TSF) +/* CLASS 36 - AP */ +#define H2C_CL_AP 0x24 +#define H2C_FUNC_AP_INFO 0x0 + #define H2C_CAT_OUTSRC 0x2 #define H2C_CL_OUTSRC_RA 0x1 @@ -4697,6 +4715,7 @@ int rtw89_fw_h2c_mrc_sync(struct rtw89_dev *rtwdev, const struct rtw89_fw_mrc_sync_arg *arg); int rtw89_fw_h2c_mrc_upd_duration(struct rtw89_dev *rtwdev, const struct rtw89_fw_mrc_upd_duration_arg *arg); +int rtw89_fw_h2c_ap_info_refcount(struct rtw89_dev *rtwdev, bool en); static inline void rtw89_fw_h2c_init_ba_cam(struct rtw89_dev *rtwdev) { diff --git a/drivers/net/wireless/realtek/rtw89/mac.c b/drivers/net/wireless/realtek/rtw89/mac.c index 7907b84d204b3..03fc214402710 100644 --- a/drivers/net/wireless/realtek/rtw89/mac.c +++ b/drivers/net/wireless/realtek/rtw89/mac.c @@ -5364,6 +5364,39 @@ rtw89_mac_c2h_mrc_status_rpt(struct rtw89_dev *rtwdev, struct sk_buff *c2h, u32 rtw89_complete_cond(wait, cond, &data); } +static void +rtw89_mac_c2h_pwr_int_notify(struct rtw89_dev *rtwdev, struct sk_buff *skb, u32 len) +{ + const struct rtw89_c2h_pwr_int_notify *c2h; + struct rtw89_sta_link *rtwsta_link; + struct ieee80211_sta *sta; + struct rtw89_sta *rtwsta; + u16 macid; + bool ps; + + c2h = (const struct rtw89_c2h_pwr_int_notify *)skb->data; + macid = le32_get_bits(c2h->w2, RTW89_C2H_PWR_INT_NOTIFY_W2_MACID); + ps = le32_get_bits(c2h->w2, RTW89_C2H_PWR_INT_NOTIFY_W2_PWR_STATUS); + + rcu_read_lock(); + + rtwsta_link = rtw89_assoc_link_rcu_dereference(rtwdev, macid); + if (unlikely(!rtwsta_link)) + goto out; + + rtwsta = rtwsta_link->rtwsta; + if (ps) + set_bit(RTW89_REMOTE_STA_IN_PS, rtwsta->flags); + else + clear_bit(RTW89_REMOTE_STA_IN_PS, rtwsta->flags); + + sta = rtwsta_to_sta(rtwsta); + ieee80211_sta_ps_transition(sta, ps); + +out: + rcu_read_unlock(); +} + static void (* const rtw89_mac_c2h_ofld_handler[])(struct rtw89_dev *rtwdev, struct sk_buff *c2h, u32 len) = { @@ -5409,6 +5442,12 @@ void (* const rtw89_mac_c2h_wow_handler[])(struct rtw89_dev *rtwdev, [RTW89_MAC_C2H_FUNC_AOAC_REPORT] = rtw89_mac_c2h_wow_aoac_rpt, }; +static +void (* const rtw89_mac_c2h_ap_handler[])(struct rtw89_dev *rtwdev, + struct sk_buff *c2h, u32 len) = { + [RTW89_MAC_C2H_FUNC_PWR_INT_NOTIFY] = rtw89_mac_c2h_pwr_int_notify, +}; + static void rtw89_mac_c2h_scanofld_rsp_atomic(struct rtw89_dev *rtwdev, struct sk_buff *skb) { @@ -5463,6 +5502,13 @@ bool rtw89_mac_c2h_chk_atomic(struct rtw89_dev *rtwdev, struct sk_buff *c2h, return true; case RTW89_MAC_C2H_CLASS_WOW: return true; + case RTW89_MAC_C2H_CLASS_AP: + switch (func) { + default: + return false; + case RTW89_MAC_C2H_FUNC_PWR_INT_NOTIFY: + return true; + } } } @@ -5493,6 +5539,10 @@ void rtw89_mac_c2h_handle(struct rtw89_dev *rtwdev, struct sk_buff *skb, if (func < NUM_OF_RTW89_MAC_C2H_FUNC_WOW) handler = rtw89_mac_c2h_wow_handler[func]; break; + case RTW89_MAC_C2H_CLASS_AP: + if (func < NUM_OF_RTW89_MAC_C2H_FUNC_AP) + handler = rtw89_mac_c2h_ap_handler[func]; + break; case RTW89_MAC_C2H_CLASS_FWDBG: return; default: diff --git a/drivers/net/wireless/realtek/rtw89/mac.h b/drivers/net/wireless/realtek/rtw89/mac.h index 18579c0205484..81507274a97e2 100644 --- a/drivers/net/wireless/realtek/rtw89/mac.h +++ b/drivers/net/wireless/realtek/rtw89/mac.h @@ -426,6 +426,12 @@ enum rtw89_mac_c2h_wow_func { NUM_OF_RTW89_MAC_C2H_FUNC_WOW, }; +enum rtw89_mac_c2h_ap_func { + RTW89_MAC_C2H_FUNC_PWR_INT_NOTIFY = 0, + + NUM_OF_RTW89_MAC_C2H_FUNC_AP, +}; + enum rtw89_mac_c2h_class { RTW89_MAC_C2H_CLASS_INFO = 0x0, RTW89_MAC_C2H_CLASS_OFLD = 0x1, @@ -434,6 +440,7 @@ enum rtw89_mac_c2h_class { RTW89_MAC_C2H_CLASS_MCC = 0x4, RTW89_MAC_C2H_CLASS_FWDBG = 0x5, RTW89_MAC_C2H_CLASS_MRC = 0xe, + RTW89_MAC_C2H_CLASS_AP = 0x18, RTW89_MAC_C2H_CLASS_MAX, }; diff --git a/drivers/net/wireless/realtek/rtw89/mac80211.c b/drivers/net/wireless/realtek/rtw89/mac80211.c index 619d2d3771d52..8dc475f6fdf7b 100644 --- a/drivers/net/wireless/realtek/rtw89/mac80211.c +++ b/drivers/net/wireless/realtek/rtw89/mac80211.c @@ -775,6 +775,7 @@ static int rtw89_ops_start_ap(struct ieee80211_hw *hw, struct rtw89_vif *rtwvif = vif_to_rtwvif(vif); struct rtw89_vif_link *rtwvif_link; const struct rtw89_chan *chan; + int ret = 0; mutex_lock(&rtwdev->mutex); @@ -783,6 +784,7 @@ static int rtw89_ops_start_ap(struct ieee80211_hw *hw, rtw89_err(rtwdev, "%s: rtwvif link (link_id %u) is not active\n", __func__, link_conf->link_id); + ret = -ENOLINK; goto out; } @@ -804,12 +806,18 @@ static int rtw89_ops_start_ap(struct ieee80211_hw *hw, rtw89_fw_h2c_cam(rtwdev, rtwvif_link, NULL, NULL); rtw89_chip_rfk_channel(rtwdev, rtwvif_link); + if (RTW89_CHK_FW_FEATURE(NOTIFY_AP_INFO, &rtwdev->fw)) { + ret = rtw89_fw_h2c_ap_info_refcount(rtwdev, true); + if (ret) + goto out; + } + rtw89_queue_chanctx_work(rtwdev); out: mutex_unlock(&rtwdev->mutex); - return 0; + return ret; } static @@ -830,6 +838,9 @@ void rtw89_ops_stop_ap(struct ieee80211_hw *hw, struct ieee80211_vif *vif, goto out; } + if (RTW89_CHK_FW_FEATURE(NOTIFY_AP_INFO, &rtwdev->fw)) + rtw89_fw_h2c_ap_info_refcount(rtwdev, false); + rtw89_mac_stop_ap(rtwdev, rtwvif_link); rtw89_chip_h2c_assoc_cmac_tbl(rtwdev, rtwvif_link, NULL); rtw89_fw_h2c_join_info(rtwdev, rtwvif_link, NULL, true); diff --git a/drivers/net/wireless/realtek/rtw89/ser.c b/drivers/net/wireless/realtek/rtw89/ser.c index 7b203bb7f151a..26a944d3b6727 100644 --- a/drivers/net/wireless/realtek/rtw89/ser.c +++ b/drivers/net/wireless/realtek/rtw89/ser.c @@ -365,6 +365,7 @@ static void ser_reset_mac_binding(struct rtw89_dev *rtwdev) ser_reset_vif(rtwdev, rtwvif); rtwdev->total_sta_assoc = 0; + refcount_set(&rtwdev->refcount_ap_info, 0); } /* hal function */ From c821a8af435ca1b029ebdf0520cfb5c35b5e3d77 Mon Sep 17 00:00:00 2001 From: Zong-Zhe Yang Date: Wed, 20 Nov 2024 11:40:50 +0800 Subject: [PATCH 0060/1450] wifi: rtw89: register ops of can_activate_links Register mac80211 ops of can_activate_links which is required when we are ready to enable multiple active links. Signed-off-by: Zong-Zhe Yang Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241120034054.13575-3-pkshih@realtek.com --- drivers/net/wireless/realtek/rtw89/core.h | 1 + drivers/net/wireless/realtek/rtw89/mac80211.c | 25 +++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/drivers/net/wireless/realtek/rtw89/core.h b/drivers/net/wireless/realtek/rtw89/core.h index 3e93b53fd67ba..dcd2ccadac96c 100644 --- a/drivers/net/wireless/realtek/rtw89/core.h +++ b/drivers/net/wireless/realtek/rtw89/core.h @@ -830,6 +830,7 @@ enum rtw89_phy_idx { }; #define __RTW89_MLD_MAX_LINK_NUM 2 +#define RTW89_MLD_NON_STA_LINK_NUM 1 enum rtw89_chanctx_idx { RTW89_CHANCTX_0 = 0, diff --git a/drivers/net/wireless/realtek/rtw89/mac80211.c b/drivers/net/wireless/realtek/rtw89/mac80211.c index 8dc475f6fdf7b..40ff5aa041fa6 100644 --- a/drivers/net/wireless/realtek/rtw89/mac80211.c +++ b/drivers/net/wireless/realtek/rtw89/mac80211.c @@ -1484,6 +1484,30 @@ static int rtw89_ops_set_tid_config(struct ieee80211_hw *hw, return 0; } +static bool rtw89_can_work_on_links(struct rtw89_dev *rtwdev, + struct ieee80211_vif *vif, u16 links) +{ + struct rtw89_vif *rtwvif = vif_to_rtwvif(vif); + u8 w = hweight16(links); + + if (vif->type != NL80211_IFTYPE_STATION && + w > RTW89_MLD_NON_STA_LINK_NUM) + return false; + + return w <= rtwvif->links_inst_valid_num; +} + +static bool rtw89_ops_can_activate_links(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + u16 active_links) +{ + struct rtw89_dev *rtwdev = hw->priv; + + guard(mutex)(&rtwdev->mutex); + + return rtw89_can_work_on_links(rtwdev, vif, active_links); +} + #ifdef CONFIG_PM static int rtw89_ops_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) @@ -1611,6 +1635,7 @@ const struct ieee80211_ops rtw89_ops = { .set_sar_specs = rtw89_ops_set_sar_specs, .link_sta_rc_update = rtw89_ops_sta_rc_update, .set_tid_config = rtw89_ops_set_tid_config, + .can_activate_links = rtw89_ops_can_activate_links, #ifdef CONFIG_PM .suspend = rtw89_ops_suspend, .resume = rtw89_ops_resume, From d2b387bdca4684052d58f81667fe6fd6c746faca Mon Sep 17 00:00:00 2001 From: Zong-Zhe Yang Date: Wed, 20 Nov 2024 11:40:51 +0800 Subject: [PATCH 0061/1450] wifi: rtw89: implement ops of change vif/sta links To support MLO, implement change_vif_links() and change_sta_links() ops. Basically, we follow arguments to set/clear links. One special thing is that when vif is idle, i.e. no connection, link id 0 is set up by us for default uses. So, when bitmap of vif links change from 0x0 to non-zero, we clear the default one first. And when bitmap of vif links change from non-zero to 0x0, we set up a default one at the end. Signed-off-by: Zong-Zhe Yang Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241120034054.13575-4-pkshih@realtek.com --- drivers/net/wireless/realtek/rtw89/core.h | 17 ++ drivers/net/wireless/realtek/rtw89/mac80211.c | 226 +++++++++++++++++- 2 files changed, 238 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw89/core.h b/drivers/net/wireless/realtek/rtw89/core.h index dcd2ccadac96c..bcfefd7fa01ea 100644 --- a/drivers/net/wireless/realtek/rtw89/core.h +++ b/drivers/net/wireless/realtek/rtw89/core.h @@ -5700,10 +5700,17 @@ struct rtw89_dev { u8 priv[] __aligned(sizeof(void *)); }; +struct rtw89_link_conf_container { + struct ieee80211_bss_conf *link_conf[IEEE80211_MLD_MAX_NUM_LINKS]; +}; + +#define RTW89_VIF_IDLE_LINK_ID 0 + struct rtw89_vif { struct rtw89_dev *rtwdev; struct list_head list; struct list_head mgnt_entry; + struct rtw89_link_conf_container __rcu *snap_link_confs; u8 mac_addr[ETH_ALEN]; __be32 ip_addr; @@ -6273,9 +6280,19 @@ static inline struct ieee80211_bss_conf * __rtw89_vif_rcu_dereference_link(struct rtw89_vif_link *rtwvif_link, bool *nolink) { struct ieee80211_vif *vif = rtwvif_link_to_vif(rtwvif_link); + struct rtw89_vif *rtwvif = rtwvif_link->rtwvif; + struct rtw89_link_conf_container *snap; struct ieee80211_bss_conf *bss_conf; + snap = rcu_dereference(rtwvif->snap_link_confs); + if (snap) { + bss_conf = snap->link_conf[rtwvif_link->link_id]; + goto out; + } + bss_conf = rcu_dereference(vif->link_conf[rtwvif_link->link_id]); + +out: if (unlikely(!bss_conf)) { *nolink = true; return &vif->bss_conf; diff --git a/drivers/net/wireless/realtek/rtw89/mac80211.c b/drivers/net/wireless/realtek/rtw89/mac80211.c index 40ff5aa041fa6..a4e47ef22b9b8 100644 --- a/drivers/net/wireless/realtek/rtw89/mac80211.c +++ b/drivers/net/wireless/realtek/rtw89/mac80211.c @@ -202,7 +202,7 @@ static int rtw89_ops_add_interface(struct ieee80211_hw *hw, rtw89_traffic_stats_init(rtwdev, &rtwvif->stats); - rtwvif_link = rtw89_vif_set_link(rtwvif, 0); + rtwvif_link = rtw89_vif_set_link(rtwvif, RTW89_VIF_IDLE_LINK_ID); if (!rtwvif_link) { ret = -EINVAL; goto release_port; @@ -218,7 +218,7 @@ static int rtw89_ops_add_interface(struct ieee80211_hw *hw, return 0; unset_link: - rtw89_vif_unset_link(rtwvif, 0); + rtw89_vif_unset_link(rtwvif, RTW89_VIF_IDLE_LINK_ID); release_port: list_del_init(&rtwvif->list); rtw89_core_release_bit_map(rtwdev->hw_port, port); @@ -246,17 +246,17 @@ static void rtw89_ops_remove_interface(struct ieee80211_hw *hw, mutex_lock(&rtwdev->mutex); - rtwvif_link = rtwvif->links[0]; + rtwvif_link = rtwvif->links[RTW89_VIF_IDLE_LINK_ID]; if (unlikely(!rtwvif_link)) { rtw89_err(rtwdev, "%s: rtwvif link (link_id %u) is not active\n", - __func__, 0); + __func__, RTW89_VIF_IDLE_LINK_ID); goto bottom; } __rtw89_ops_remove_iface_link(rtwdev, rtwvif_link); - rtw89_vif_unset_link(rtwvif, 0); + rtw89_vif_unset_link(rtwvif, RTW89_VIF_IDLE_LINK_ID); bottom: list_del_init(&rtwvif->list); @@ -1508,6 +1508,220 @@ static bool rtw89_ops_can_activate_links(struct ieee80211_hw *hw, return rtw89_can_work_on_links(rtwdev, vif, active_links); } +static void __rtw89_ops_clr_vif_links(struct rtw89_dev *rtwdev, + struct rtw89_vif *rtwvif, + unsigned long clr_links) +{ + struct rtw89_vif_link *rtwvif_link; + unsigned int link_id; + + for_each_set_bit(link_id, &clr_links, IEEE80211_MLD_MAX_NUM_LINKS) { + rtwvif_link = rtwvif->links[link_id]; + if (unlikely(!rtwvif_link)) + continue; + + __rtw89_ops_remove_iface_link(rtwdev, rtwvif_link); + + rtw89_vif_unset_link(rtwvif, link_id); + } +} + +static int __rtw89_ops_set_vif_links(struct rtw89_dev *rtwdev, + struct rtw89_vif *rtwvif, + unsigned long set_links) +{ + struct rtw89_vif_link *rtwvif_link; + unsigned int link_id; + int ret; + + for_each_set_bit(link_id, &set_links, IEEE80211_MLD_MAX_NUM_LINKS) { + rtwvif_link = rtw89_vif_set_link(rtwvif, link_id); + if (!rtwvif_link) + return -EINVAL; + + ret = __rtw89_ops_add_iface_link(rtwdev, rtwvif_link); + if (ret) { + rtw89_err(rtwdev, "%s: failed to add iface (link id %u)\n", + __func__, link_id); + return ret; + } + } + + return 0; +} + +static +int rtw89_ops_change_vif_links(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + u16 old_links, u16 new_links, + struct ieee80211_bss_conf *old[IEEE80211_MLD_MAX_NUM_LINKS]) +{ + struct rtw89_dev *rtwdev = hw->priv; + struct rtw89_vif *rtwvif = vif_to_rtwvif(vif); + unsigned long clr_links = old_links & ~new_links; + unsigned long set_links = new_links & ~old_links; + bool removing_links = !old_links || clr_links; + struct rtw89_link_conf_container *snap; + int ret = 0; + int i; + + guard(mutex)(&rtwdev->mutex); + + rtw89_debug(rtwdev, RTW89_DBG_STATE, + "%s: old_links (0x%08x) -> new_links (0x%08x)\n", + __func__, old_links, new_links); + + if (!rtw89_can_work_on_links(rtwdev, vif, new_links)) + return -EOPNOTSUPP; + + if (removing_links) { + snap = kzalloc(sizeof(*snap), GFP_KERNEL); + if (!snap) + return -ENOMEM; + + for (i = 0; i < ARRAY_SIZE(snap->link_conf); i++) + snap->link_conf[i] = old[i]; + + rcu_assign_pointer(rtwvif->snap_link_confs, snap); + } + + /* might depend on @snap; don't change order */ + rtw89_leave_ips_by_hwflags(rtwdev); + + if (rtwdev->scanning) + rtw89_hw_scan_abort(rtwdev, rtwdev->scan_info.scanning_vif); + + if (!old_links) + __rtw89_ops_clr_vif_links(rtwdev, rtwvif, + BIT(RTW89_VIF_IDLE_LINK_ID)); + else if (clr_links) + __rtw89_ops_clr_vif_links(rtwdev, rtwvif, clr_links); + + if (removing_links) { + /* @snap is required if and only if during removing links. + * However, it's done here. So, cleanup @snap immediately. + */ + rcu_assign_pointer(rtwvif->snap_link_confs, NULL); + + /* The pointers in @old will free after this function return, + * so synchronously wait for all readers of snap to be done. + */ + synchronize_rcu(); + kfree(snap); + } + + if (set_links) { + ret = __rtw89_ops_set_vif_links(rtwdev, rtwvif, set_links); + if (ret) + __rtw89_ops_clr_vif_links(rtwdev, rtwvif, set_links); + } else if (!new_links) { + ret = __rtw89_ops_set_vif_links(rtwdev, rtwvif, + BIT(RTW89_VIF_IDLE_LINK_ID)); + if (ret) + __rtw89_ops_clr_vif_links(rtwdev, rtwvif, + BIT(RTW89_VIF_IDLE_LINK_ID)); + } + + rtw89_enter_ips_by_hwflags(rtwdev); + return ret; +} + +static void __rtw89_ops_clr_sta_links(struct rtw89_dev *rtwdev, + struct rtw89_sta *rtwsta, + unsigned long clr_links) +{ + struct rtw89_vif_link *rtwvif_link; + struct rtw89_sta_link *rtwsta_link; + unsigned int link_id; + + for_each_set_bit(link_id, &clr_links, IEEE80211_MLD_MAX_NUM_LINKS) { + rtwsta_link = rtwsta->links[link_id]; + if (unlikely(!rtwsta_link)) + continue; + + rtwvif_link = rtwsta_link->rtwvif_link; + + rtw89_core_sta_link_disassoc(rtwdev, rtwvif_link, rtwsta_link); + rtw89_core_sta_link_disconnect(rtwdev, rtwvif_link, rtwsta_link); + rtw89_core_sta_link_remove(rtwdev, rtwvif_link, rtwsta_link); + + rtw89_sta_unset_link(rtwsta, link_id); + } +} + +static int __rtw89_ops_set_sta_links(struct rtw89_dev *rtwdev, + struct rtw89_sta *rtwsta, + unsigned long set_links) +{ + struct rtw89_vif_link *rtwvif_link; + struct rtw89_sta_link *rtwsta_link; + unsigned int link_id; + int ret; + + for_each_set_bit(link_id, &set_links, IEEE80211_MLD_MAX_NUM_LINKS) { + rtwsta_link = rtw89_sta_set_link(rtwsta, link_id); + if (!rtwsta_link) + return -EINVAL; + + rtwvif_link = rtwsta_link->rtwvif_link; + + ret = rtw89_core_sta_link_add(rtwdev, rtwvif_link, rtwsta_link); + if (ret) { + rtw89_err(rtwdev, "%s: failed to add sta (link id %u)\n", + __func__, link_id); + return ret; + } + + rtw89_vif_type_mapping(rtwvif_link, true); + + ret = rtw89_core_sta_link_assoc(rtwdev, rtwvif_link, rtwsta_link); + if (ret) { + rtw89_err(rtwdev, "%s: failed to assoc sta (link id %u)\n", + __func__, link_id); + return ret; + } + + __rtw89_ops_bss_link_assoc(rtwdev, rtwvif_link); + } + + return 0; +} + +static +int rtw89_ops_change_sta_links(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_sta *sta, + u16 old_links, u16 new_links) +{ + struct rtw89_dev *rtwdev = hw->priv; + struct rtw89_sta *rtwsta = sta_to_rtwsta(sta); + unsigned long clr_links = old_links & ~new_links; + unsigned long set_links = new_links & ~old_links; + int ret = 0; + + guard(mutex)(&rtwdev->mutex); + + rtw89_debug(rtwdev, RTW89_DBG_STATE, + "%s: old_links (0x%08x) -> new_links (0x%08x)\n", + __func__, old_links, new_links); + + if (!rtw89_can_work_on_links(rtwdev, vif, new_links)) + return -EOPNOTSUPP; + + rtw89_leave_ps_mode(rtwdev); + + if (clr_links) + __rtw89_ops_clr_sta_links(rtwdev, rtwsta, clr_links); + + if (set_links) { + ret = __rtw89_ops_set_sta_links(rtwdev, rtwsta, set_links); + if (ret) + __rtw89_ops_clr_sta_links(rtwdev, rtwsta, set_links); + } + + return ret; +} + #ifdef CONFIG_PM static int rtw89_ops_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) @@ -1636,6 +1850,8 @@ const struct ieee80211_ops rtw89_ops = { .link_sta_rc_update = rtw89_ops_sta_rc_update, .set_tid_config = rtw89_ops_set_tid_config, .can_activate_links = rtw89_ops_can_activate_links, + .change_vif_links = rtw89_ops_change_vif_links, + .change_sta_links = rtw89_ops_change_sta_links, #ifdef CONFIG_PM .suspend = rtw89_ops_suspend, .resume = rtw89_ops_resume, From a6db83bef0f587494fcc9cc5a9ec6e66ea13236e Mon Sep 17 00:00:00 2001 From: Zong-Zhe Yang Date: Wed, 20 Nov 2024 11:40:52 +0800 Subject: [PATCH 0062/1450] wifi: rtw89: apply MLD pairwise key to dynamically active links In MLD connection, a pairwise key should work on all active links. And, we take just one entry in security CAM for one pairwise key. (It means we will reuse one single entry for all links.) Originally, we already applied the security CAM entry of pairwise key to deflink's address CAM. However, links can be activated dynamically. So now for pairwise keys, each rtw89_sta records the IDs of the security CAM entries. Then, when driver is notified that some links are active via change_sta_links(), we apply target pairwise keys to them according to the record. Signed-off-by: Zong-Zhe Yang Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241120034054.13575-5-pkshih@realtek.com --- drivers/net/wireless/realtek/rtw89/cam.c | 32 ++++++++++++++++--- drivers/net/wireless/realtek/rtw89/cam.h | 5 +++ drivers/net/wireless/realtek/rtw89/core.h | 4 +++ drivers/net/wireless/realtek/rtw89/mac80211.c | 16 ++++++++++ 4 files changed, 53 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw89/cam.c b/drivers/net/wireless/realtek/rtw89/cam.c index 8ef59994c0dbd..8fa1e6c1ce139 100644 --- a/drivers/net/wireless/realtek/rtw89/cam.c +++ b/drivers/net/wireless/realtek/rtw89/cam.c @@ -135,8 +135,8 @@ static int rtw89_cam_get_avail_sec_cam(struct rtw89_dev *rtwdev, } static int rtw89_cam_get_addr_cam_key_idx(struct rtw89_addr_cam_entry *addr_cam, - struct rtw89_sec_cam_entry *sec_cam, - struct ieee80211_key_conf *key, + const struct rtw89_sec_cam_entry *sec_cam, + const struct ieee80211_key_conf *key, u8 *key_idx) { u8 idx; @@ -246,8 +246,8 @@ static int __rtw89_cam_detach_sec_cam(struct rtw89_dev *rtwdev, static int __rtw89_cam_attach_sec_cam(struct rtw89_dev *rtwdev, struct rtw89_vif_link *rtwvif_link, struct rtw89_sta_link *rtwsta_link, - struct ieee80211_key_conf *key, - struct rtw89_sec_cam_entry *sec_cam) + const struct ieee80211_key_conf *key, + const struct rtw89_sec_cam_entry *sec_cam) { struct rtw89_addr_cam_entry *addr_cam; u8 key_idx = 0; @@ -286,6 +286,22 @@ static int __rtw89_cam_attach_sec_cam(struct rtw89_dev *rtwdev, return 0; } +int rtw89_cam_attach_link_sec_cam(struct rtw89_dev *rtwdev, + struct rtw89_vif_link *rtwvif_link, + struct rtw89_sta_link *rtwsta_link, + u8 sec_cam_idx) +{ + struct rtw89_cam_info *cam_info = &rtwdev->cam_info; + const struct rtw89_sec_cam_entry *sec_cam; + + sec_cam = cam_info->sec_entries[sec_cam_idx]; + if (!sec_cam) + return -ENOENT; + + return __rtw89_cam_attach_sec_cam(rtwdev, rtwvif_link, rtwsta_link, + sec_cam->key_conf, sec_cam); +} + static int rtw89_cam_detach_sec_cam(struct rtw89_dev *rtwdev, struct ieee80211_vif *vif, struct ieee80211_sta *sta, @@ -306,6 +322,9 @@ static int rtw89_cam_detach_sec_cam(struct rtw89_dev *rtwdev, rtwvif = vif_to_rtwvif(vif); + if (rtwsta) + clear_bit(sec_cam->sec_cam_idx, rtwsta->pairwise_sec_cam_map); + rtw89_vif_for_each_link(rtwvif, rtwvif_link, link_id) { rtwsta_link = rtwsta ? rtwsta->links[link_id] : NULL; if (rtwsta && !rtwsta_link) @@ -369,6 +388,8 @@ static int rtw89_cam_attach_sec_cam(struct rtw89_dev *rtwdev, return ret; } + set_bit(sec_cam->sec_cam_idx, rtwsta->pairwise_sec_cam_map); + return 0; } @@ -410,6 +431,9 @@ static int rtw89_cam_sec_key_install(struct rtw89_dev *rtwdev, sec_cam->len = RTW89_SEC_CAM_LEN; sec_cam->ext_key = ext_key; memcpy(sec_cam->key, key->key, key->keylen); + + sec_cam->key_conf = key; + ret = rtw89_cam_send_sec_key_cmd(rtwdev, sec_cam); if (ret) { rtw89_err(rtwdev, "failed to send sec key cmd: %d\n", ret); diff --git a/drivers/net/wireless/realtek/rtw89/cam.h b/drivers/net/wireless/realtek/rtw89/cam.h index 3134ebf088258..8fd2d776408ea 100644 --- a/drivers/net/wireless/realtek/rtw89/cam.h +++ b/drivers/net/wireless/realtek/rtw89/cam.h @@ -578,4 +578,9 @@ int rtw89_cam_sec_key_del(struct rtw89_dev *rtwdev, void rtw89_cam_bssid_changed(struct rtw89_dev *rtwdev, struct rtw89_vif_link *rtwvif_link); void rtw89_cam_reset_keys(struct rtw89_dev *rtwdev); +int rtw89_cam_attach_link_sec_cam(struct rtw89_dev *rtwdev, + struct rtw89_vif_link *rtwvif_link, + struct rtw89_sta_link *rtwsta_link, + u8 sec_cam_idx); + #endif diff --git a/drivers/net/wireless/realtek/rtw89/core.h b/drivers/net/wireless/realtek/rtw89/core.h index bcfefd7fa01ea..409cbdc6b92ae 100644 --- a/drivers/net/wireless/realtek/rtw89/core.h +++ b/drivers/net/wireless/realtek/rtw89/core.h @@ -3359,6 +3359,8 @@ struct rtw89_sec_cam_entry { u8 spp_mode : 1; /* 256 bits */ u8 key[32]; + + struct ieee80211_key_conf *key_conf; }; struct rtw89_sta_link { @@ -5761,6 +5763,8 @@ struct rtw89_sta { struct rtw89_ampdu_params ampdu_params[IEEE80211_NUM_TIDS]; DECLARE_BITMAP(ampdu_map, IEEE80211_NUM_TIDS); + DECLARE_BITMAP(pairwise_sec_cam_map, RTW89_MAX_SEC_CAM_NUM); + u8 links_inst_valid_num; DECLARE_BITMAP(links_inst_map, __RTW89_MLD_MAX_LINK_NUM); struct rtw89_sta_link *links[IEEE80211_MLD_MAX_NUM_LINKS]; diff --git a/drivers/net/wireless/realtek/rtw89/mac80211.c b/drivers/net/wireless/realtek/rtw89/mac80211.c index a4e47ef22b9b8..bf7a674bce284 100644 --- a/drivers/net/wireless/realtek/rtw89/mac80211.c +++ b/drivers/net/wireless/realtek/rtw89/mac80211.c @@ -509,6 +509,7 @@ static int __rtw89_ops_sta_add(struct rtw89_dev *rtwdev, rtw89_core_txq_init(rtwdev, sta->txq[i]); skb_queue_head_init(&rtwsta->roc_queue); + bitmap_zero(rtwsta->pairwise_sec_cam_map, RTW89_MAX_SEC_CAM_NUM); rtwsta_link = rtw89_sta_set_link(rtwsta, sta->deflink.link_id); if (!rtwsta_link) { @@ -1656,6 +1657,7 @@ static int __rtw89_ops_set_sta_links(struct rtw89_dev *rtwdev, struct rtw89_vif_link *rtwvif_link; struct rtw89_sta_link *rtwsta_link; unsigned int link_id; + u8 sec_cam_idx; int ret; for_each_set_bit(link_id, &set_links, IEEE80211_MLD_MAX_NUM_LINKS) { @@ -1682,6 +1684,20 @@ static int __rtw89_ops_set_sta_links(struct rtw89_dev *rtwdev, } __rtw89_ops_bss_link_assoc(rtwdev, rtwvif_link); + + for_each_set_bit(sec_cam_idx, rtwsta->pairwise_sec_cam_map, + RTW89_MAX_SEC_CAM_NUM) { + ret = rtw89_cam_attach_link_sec_cam(rtwdev, + rtwvif_link, + rtwsta_link, + sec_cam_idx); + if (ret) { + rtw89_err(rtwdev, + "%s: failed to apply pairwise key (link id %u)\n", + __func__, link_id); + return ret; + } + } } return 0; From 55709b195464e59bd5c51abf25fa243d7a8b7a3e Mon Sep 17 00:00:00 2001 From: Zong-Zhe Yang Date: Wed, 20 Nov 2024 11:40:53 +0800 Subject: [PATCH 0063/1450] wifi: rtw89: pass target link_id to ieee80211_gtk_rekey_add() When calling ieee80211_gtk_rekey_add(), pass the target link_id instead of always -1. Signed-off-by: Zong-Zhe Yang Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241120034054.13575-6-pkshih@realtek.com --- drivers/net/wireless/realtek/rtw89/wow.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/realtek/rtw89/wow.c b/drivers/net/wireless/realtek/rtw89/wow.c index 3e81fd974ec18..1e1dbb20d47ad 100644 --- a/drivers/net/wireless/realtek/rtw89/wow.c +++ b/drivers/net/wireless/realtek/rtw89/wow.c @@ -620,7 +620,10 @@ static struct ieee80211_key_conf *rtw89_wow_gtk_rekey(struct rtw89_dev *rtwdev, * need to unlock mutex */ mutex_unlock(&rtwdev->mutex); - key = ieee80211_gtk_rekey_add(wow_vif, rekey_conf, -1); + if (ieee80211_vif_is_mld(wow_vif)) + key = ieee80211_gtk_rekey_add(wow_vif, rekey_conf, rtwvif_link->link_id); + else + key = ieee80211_gtk_rekey_add(wow_vif, rekey_conf, -1); mutex_lock(&rtwdev->mutex); kfree(rekey_conf); From f79257f5b97199a08d5c2c039bf4908323f9dd92 Mon Sep 17 00:00:00 2001 From: Zong-Zhe Yang Date: Wed, 20 Nov 2024 11:40:54 +0800 Subject: [PATCH 0064/1450] wifi: rtw89: pass target link_id to ieee80211_nullfunc_get() When calling ieee80211_nullfunc_get(), pass the target link_id instead of always -1. Signed-off-by: Zong-Zhe Yang Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241120034054.13575-7-pkshih@realtek.com --- drivers/net/wireless/realtek/rtw89/core.c | 3 ++- drivers/net/wireless/realtek/rtw89/fw.c | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw89/core.c b/drivers/net/wireless/realtek/rtw89/core.c index c99111df90a7e..6f9b4f0b2748f 100644 --- a/drivers/net/wireless/realtek/rtw89/core.c +++ b/drivers/net/wireless/realtek/rtw89/core.c @@ -3216,6 +3216,7 @@ static int rtw89_core_send_nullfunc(struct rtw89_dev *rtwdev, struct rtw89_vif_link *rtwvif_link, bool qos, bool ps) { struct ieee80211_vif *vif = rtwvif_link_to_vif(rtwvif_link); + int link_id = ieee80211_vif_is_mld(vif) ? rtwvif_link->link_id : -1; struct ieee80211_sta *sta; struct ieee80211_hdr *hdr; struct sk_buff *skb; @@ -3231,7 +3232,7 @@ static int rtw89_core_send_nullfunc(struct rtw89_dev *rtwdev, goto out; } - skb = ieee80211_nullfunc_get(rtwdev->hw, vif, -1, qos); + skb = ieee80211_nullfunc_get(rtwdev->hw, vif, link_id, qos); if (!skb) { ret = -ENOMEM; goto out; diff --git a/drivers/net/wireless/realtek/rtw89/fw.c b/drivers/net/wireless/realtek/rtw89/fw.c index 7bda9aab382cf..cbd759c844e55 100644 --- a/drivers/net/wireless/realtek/rtw89/fw.c +++ b/drivers/net/wireless/realtek/rtw89/fw.c @@ -2415,6 +2415,7 @@ static int rtw89_fw_h2c_add_general_pkt(struct rtw89_dev *rtwdev, u8 *id) { struct ieee80211_vif *vif = rtwvif_link_to_vif(rtwvif_link); + int link_id = ieee80211_vif_is_mld(vif) ? rtwvif_link->link_id : -1; struct rtw89_pktofld_info *info; struct sk_buff *skb; int ret; @@ -2431,10 +2432,10 @@ static int rtw89_fw_h2c_add_general_pkt(struct rtw89_dev *rtwdev, skb = ieee80211_proberesp_get(rtwdev->hw, vif); break; case RTW89_PKT_OFLD_TYPE_NULL_DATA: - skb = ieee80211_nullfunc_get(rtwdev->hw, vif, -1, false); + skb = ieee80211_nullfunc_get(rtwdev->hw, vif, link_id, false); break; case RTW89_PKT_OFLD_TYPE_QOS_NULL: - skb = ieee80211_nullfunc_get(rtwdev->hw, vif, -1, true); + skb = ieee80211_nullfunc_get(rtwdev->hw, vif, link_id, true); break; case RTW89_PKT_OFLD_TYPE_EAPOL_KEY: skb = rtw89_eapol_get(rtwdev, rtwvif_link); From 1a75e81baf4f1b322f3498ffd373eaada8e60589 Mon Sep 17 00:00:00 2001 From: Andrea della Porta Date: Sun, 24 Nov 2024 11:05:36 +0100 Subject: [PATCH 0065/1450] of/unittest: Add empty dma-ranges address translation tests Intermediate DT PCI nodes dynamically generated by enabling CONFIG_PCI_DYNAMIC_OF_NODES have empty dma-ranges property. PCI address specifiers have 3 cells and when dma-ranges is missing or empty, of_translate_one() is currently dropping the flag portion of PCI addresses which are subnodes of the aforementioned ones, failing the translation. Add new tests covering this case. With this test, we get 1 new failure which is fixed in subsequent commit: FAIL of_unittest_pci_empty_dma_ranges():1245 for_each_of_pci_range wrong CPU addr (ffffffffffffffff) on node /testcase-data/address-tests2/pcie@d1070000/pci@0,0/dev@0,0/local-bus@0 Signed-off-by: Andrea della Porta Link: https://lore.kernel.org/r/08f8fee4fdc0379240fda2f4a0e6f11ebf9647a8.1732441813.git.andrea.porta@suse.com Signed-off-by: Rob Herring (Arm) --- drivers/of/unittest-data/tests-address.dtsi | 2 ++ drivers/of/unittest.c | 39 +++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/drivers/of/unittest-data/tests-address.dtsi b/drivers/of/unittest-data/tests-address.dtsi index 3344f15c3755c..f02a181bb1259 100644 --- a/drivers/of/unittest-data/tests-address.dtsi +++ b/drivers/of/unittest-data/tests-address.dtsi @@ -114,6 +114,7 @@ device_type = "pci"; ranges = <0x82000000 0 0xe8000000 0 0xe8000000 0 0x7f00000>, <0x81000000 0 0x00000000 0 0xefff0000 0 0x0010000>; + dma-ranges = <0x43000000 0x10 0x00 0x00 0x00 0x00 0x10000000>; reg = <0x00000000 0xd1070000 0x20000>; pci@0,0 { @@ -142,6 +143,7 @@ #size-cells = <0x01>; ranges = <0xa0000000 0 0 0 0x2000000>, <0xb0000000 1 0 0 0x1000000>; + dma-ranges = <0xc0000000 0x43000000 0x10 0x00 0x10000000>; dev@e0000000 { reg = <0xa0001000 0x1000>, diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c index daf9a2dddd7e0..80483e38d7b4d 100644 --- a/drivers/of/unittest.c +++ b/drivers/of/unittest.c @@ -1213,6 +1213,44 @@ static void __init of_unittest_pci_dma_ranges(void) of_node_put(np); } +static void __init of_unittest_pci_empty_dma_ranges(void) +{ + struct device_node *np; + struct of_pci_range range; + struct of_pci_range_parser parser; + + if (!IS_ENABLED(CONFIG_PCI)) + return; + + np = of_find_node_by_path("/testcase-data/address-tests2/pcie@d1070000/pci@0,0/dev@0,0/local-bus@0"); + if (!np) { + pr_err("missing testcase data\n"); + return; + } + + if (of_pci_dma_range_parser_init(&parser, np)) { + pr_err("missing dma-ranges property\n"); + return; + } + + /* + * Get the dma-ranges from the device tree + */ + for_each_of_pci_range(&parser, &range) { + unittest(range.size == 0x10000000, + "for_each_of_pci_range wrong size on node %pOF size=%llx\n", + np, range.size); + unittest(range.cpu_addr == 0x00000000, + "for_each_of_pci_range wrong CPU addr (%llx) on node %pOF", + range.cpu_addr, np); + unittest(range.pci_addr == 0xc0000000, + "for_each_of_pci_range wrong DMA addr (%llx) on node %pOF", + range.pci_addr, np); + } + + of_node_put(np); +} + static void __init of_unittest_bus_ranges(void) { struct device_node *np; @@ -4272,6 +4310,7 @@ static int __init of_unittest(void) of_unittest_dma_get_max_cpu_address(); of_unittest_parse_dma_ranges(); of_unittest_pci_dma_ranges(); + of_unittest_pci_empty_dma_ranges(); of_unittest_bus_ranges(); of_unittest_bus_3cell_ranges(); of_unittest_reg(); From 7f05e20b989ac33c9c0f8c2028ec0a566493548f Mon Sep 17 00:00:00 2001 From: Andrea della Porta Date: Sun, 24 Nov 2024 11:05:37 +0100 Subject: [PATCH 0066/1450] of: address: Preserve the flags portion on 1:1 dma-ranges mapping A missing or empty dma-ranges in a DT node implies a 1:1 mapping for dma translations. In this specific case, the current behaviour is to zero out the entire specifier so that the translation could be carried on as an offset from zero. This includes address specifier that has flags (e.g. PCI ranges). Once the flags portion has been zeroed, the translation chain is broken since the mapping functions will check the upcoming address specifier against mismatching flags, always failing the 1:1 mapping and its entire purpose of always succeeding. Set to zero only the address portion while passing the flags through. Fixes: dbbdee94734b ("of/address: Merge all of the bus translation code") Cc: stable@vger.kernel.org Signed-off-by: Andrea della Porta Tested-by: Herve Codina Link: https://lore.kernel.org/r/e51ae57874e58a9b349c35e2e877425ebc075d7a.1732441813.git.andrea.porta@suse.com Signed-off-by: Rob Herring (Arm) --- drivers/of/address.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/of/address.c b/drivers/of/address.c index c5b925ac469f1..5b7ee3ed5296b 100644 --- a/drivers/of/address.c +++ b/drivers/of/address.c @@ -459,7 +459,8 @@ static int of_translate_one(const struct device_node *parent, const struct of_bu } if (ranges == NULL || rlen == 0) { offset = of_read_number(addr, na); - memset(addr, 0, pna * 4); + /* set address to zero, pass flags through */ + memset(addr + pbus->flag_cells, 0, (pna - pbus->flag_cells) * 4); pr_debug("empty ranges; 1:1 translation\n"); goto finish; } From 61a6ba233fe1198e9eacc9ca1d1cbdb27f70cee5 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Wed, 13 Nov 2024 16:56:13 -0600 Subject: [PATCH 0067/1450] dt-bindings: Unify "fsl,liodn" type definitions The type definition of "fsl,liodn" is defined as uint32 in crypto/fsl,sec-v4.0.yaml and uint32-array in soc/fsl/fsl,bman.yaml, soc/fsl/fsl,qman-portal.yaml, and soc/fsl/fsl,qman.yaml. Unify the type to be uint32-array and constraint the single entry cases. Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20241113225614.1782862-1-robh@kernel.org Signed-off-by: Rob Herring (Arm) --- .../devicetree/bindings/crypto/fsl,sec-v4.0.yaml | 10 ++++++---- .../devicetree/bindings/soc/fsl/fsl,qman-portal.yaml | 2 ++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/Documentation/devicetree/bindings/crypto/fsl,sec-v4.0.yaml b/Documentation/devicetree/bindings/crypto/fsl,sec-v4.0.yaml index 9c8c9991f29ad..f0c4a7c83568a 100644 --- a/Documentation/devicetree/bindings/crypto/fsl,sec-v4.0.yaml +++ b/Documentation/devicetree/bindings/crypto/fsl,sec-v4.0.yaml @@ -114,8 +114,9 @@ patternProperties: table that specifies the PPID to LIODN mapping. Needed if the PAMU is used. Value is a 12 bit value where value is a LIODN ID for this JR. This property is normally set by boot firmware. - $ref: /schemas/types.yaml#/definitions/uint32 - maximum: 0xfff + $ref: /schemas/types.yaml#/definitions/uint32-array + items: + - maximum: 0xfff '^rtic@[0-9a-f]+$': type: object @@ -186,8 +187,9 @@ patternProperties: Needed if the PAMU is used. Value is a 12 bit value where value is a LIODN ID for this JR. This property is normally set by boot firmware. - $ref: /schemas/types.yaml#/definitions/uint32 - maximum: 0xfff + $ref: /schemas/types.yaml#/definitions/uint32-array + items: + - maximum: 0xfff fsl,rtic-region: description: diff --git a/Documentation/devicetree/bindings/soc/fsl/fsl,qman-portal.yaml b/Documentation/devicetree/bindings/soc/fsl/fsl,qman-portal.yaml index 17016184143f2..e459fec02ba83 100644 --- a/Documentation/devicetree/bindings/soc/fsl/fsl,qman-portal.yaml +++ b/Documentation/devicetree/bindings/soc/fsl/fsl,qman-portal.yaml @@ -35,6 +35,7 @@ properties: fsl,liodn: $ref: /schemas/types.yaml#/definitions/uint32-array + maxItems: 2 description: See pamu.txt. Two LIODN(s). DQRR LIODN (DLIODN) and Frame LIODN (FLIODN) @@ -69,6 +70,7 @@ patternProperties: type: object properties: fsl,liodn: + $ref: /schemas/types.yaml#/definitions/uint32-array description: See pamu.txt, PAMU property used for static LIODN assignment fsl,iommu-parent: From 56dcbf0b520796e26b2bbe5686bdd305ad924954 Mon Sep 17 00:00:00 2001 From: Kalle Valo Date: Tue, 26 Nov 2024 19:11:30 +0200 Subject: [PATCH 0068/1450] wifi: ath12k: convert struct ath12k::wmi_mgmt_tx_work to struct wiphy_work To simplify locking for the next patches convert struct ath12k::wmi_mgmt_tx_work to use wiphy_work. After this ath12k_mgmt_over_wmi_tx_work() is called with wiphy_lock() taken. In ath12k_core_suspend() we need to take wiphy_lock() because ath12k_mac_wait_tx_complete() requires it. Also add lockdep_assert_wiphy() to document when wiphy_lock() is held. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.3.1-00173-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241126171139.2350704-2-kvalo@kernel.org --- drivers/net/wireless/ath/ath12k/core.c | 6 ++++++ drivers/net/wireless/ath/ath12k/core.h | 2 +- drivers/net/wireless/ath/ath12k/mac.c | 20 ++++++++++++++++---- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/core.c b/drivers/net/wireless/ath/ath12k/core.c index c57322221e1d0..263a7c7891225 100644 --- a/drivers/net/wireless/ath/ath12k/core.c +++ b/drivers/net/wireless/ath/ath12k/core.c @@ -79,11 +79,17 @@ int ath12k_core_suspend(struct ath12k_base *ab) ar = ab->pdevs[i].ar; if (!ar) continue; + + wiphy_lock(ath12k_ar_to_hw(ar)->wiphy); + ret = ath12k_mac_wait_tx_complete(ar); if (ret) { + wiphy_unlock(ath12k_ar_to_hw(ar)->wiphy); ath12k_warn(ab, "failed to wait tx complete: %d\n", ret); return ret; } + + wiphy_unlock(ath12k_ar_to_hw(ar)->wiphy); } /* PM framework skips suspend_late/resume_early callbacks diff --git a/drivers/net/wireless/ath/ath12k/core.h b/drivers/net/wireless/ath/ath12k/core.h index c1d5e93b679a8..5be977008319b 100644 --- a/drivers/net/wireless/ath/ath12k/core.h +++ b/drivers/net/wireless/ath/ath12k/core.h @@ -679,7 +679,7 @@ struct ath12k { struct work_struct regd_update_work; - struct work_struct wmi_mgmt_tx_work; + struct wiphy_work wmi_mgmt_tx_work; struct sk_buff_head wmi_mgmt_tx_queue; struct ath12k_wow wow; diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index 60702bf071415..a6fe998c177eb 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -6726,6 +6726,8 @@ static void ath12k_mgmt_over_wmi_tx_drop(struct ath12k *ar, struct sk_buff *skb) { int num_mgmt; + lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy); + ieee80211_free_txskb(ath12k_ar_to_hw(ar), skb); num_mgmt = atomic_dec_if_positive(&ar->num_pending_mgmt_tx); @@ -6787,6 +6789,8 @@ static int ath12k_mac_mgmt_tx_wmi(struct ath12k *ar, struct ath12k_link_vif *arv int buf_id; int ret; + lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy); + ATH12K_SKB_CB(skb)->ar = ar; spin_lock_bh(&ar->txmgmt_idr_lock); buf_id = idr_alloc(&ar->txmgmt_idr, skb, 0, @@ -6841,7 +6845,7 @@ static void ath12k_mgmt_over_wmi_tx_purge(struct ath12k *ar) ath12k_mgmt_over_wmi_tx_drop(ar, skb); } -static void ath12k_mgmt_over_wmi_tx_work(struct work_struct *work) +static void ath12k_mgmt_over_wmi_tx_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ath12k *ar = container_of(work, struct ath12k, wmi_mgmt_tx_work); struct ath12k_skb_cb *skb_cb; @@ -6850,6 +6854,8 @@ static void ath12k_mgmt_over_wmi_tx_work(struct work_struct *work) struct sk_buff *skb; int ret; + lockdep_assert_wiphy(wiphy); + while ((skb = skb_dequeue(&ar->wmi_mgmt_tx_queue)) != NULL) { skb_cb = ATH12K_SKB_CB(skb); if (!skb_cb->vif) { @@ -6904,7 +6910,7 @@ static int ath12k_mac_mgmt_tx(struct ath12k *ar, struct sk_buff *skb, skb_queue_tail(q, skb); atomic_inc(&ar->num_pending_mgmt_tx); - ieee80211_queue_work(ath12k_ar_to_hw(ar), &ar->wmi_mgmt_tx_work); + wiphy_work_queue(ath12k_ar_to_hw(ar)->wiphy, &ar->wmi_mgmt_tx_work); return 0; } @@ -6981,10 +6987,12 @@ static void ath12k_mac_op_tx(struct ieee80211_hw *hw, void ath12k_mac_drain_tx(struct ath12k *ar) { + lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy); + /* make sure rcu-protected mac80211 tx path itself is drained */ synchronize_net(); - cancel_work_sync(&ar->wmi_mgmt_tx_work); + wiphy_work_cancel(ath12k_ar_to_hw(ar)->wiphy, &ar->wmi_mgmt_tx_work); ath12k_mgmt_over_wmi_tx_purge(ar); } @@ -7101,6 +7109,8 @@ static void ath12k_drain_tx(struct ath12k_hw *ah) struct ath12k *ar; int i; + lockdep_assert_wiphy(ah->hw->wiphy); + for_each_ar(ah, ar, i) ath12k_mac_drain_tx(ar); } @@ -9134,6 +9144,8 @@ static int ath12k_mac_flush(struct ath12k *ar) int ath12k_mac_wait_tx_complete(struct ath12k *ar) { + lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy); + ath12k_mac_drain_tx(ar); return ath12k_mac_flush(ar); } @@ -10604,7 +10616,7 @@ static void ath12k_mac_setup(struct ath12k *ar) INIT_DELAYED_WORK(&ar->scan.timeout, ath12k_scan_timeout_work); INIT_WORK(&ar->regd_update_work, ath12k_regd_update_work); - INIT_WORK(&ar->wmi_mgmt_tx_work, ath12k_mgmt_over_wmi_tx_work); + wiphy_work_init(&ar->wmi_mgmt_tx_work, ath12k_mgmt_over_wmi_tx_work); skb_queue_head_init(&ar->wmi_mgmt_tx_queue); } From 648a121bafa3f4487254ab8e9e298f12540f0603 Mon Sep 17 00:00:00 2001 From: Sriram R Date: Tue, 26 Nov 2024 19:11:31 +0200 Subject: [PATCH 0069/1450] wifi: ath12k: ath12k_mac_op_tx(): MLO support For a frame transmission for an ML vif, mac80211 mentions transmit link id in the tx control info. Use it to convert the RA/TA to the corresponding link sta and link vif address before enqueueing the frame for transmission. For 802.3 data frames, always enqueue the frame on the primary (assoc) link id. Firmware does the link selection, builds 802.11 header and therefore the address translation too. Also ensure right link vif is used for WMI based management transmission and add comments to document when RCU read lock is held. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.3.1-00173-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Sriram R Co-developed-by: Rameshkumar Sundaram Signed-off-by: Rameshkumar Sundaram Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241126171139.2350704-3-kvalo@kernel.org --- drivers/net/wireless/ath/ath12k/core.h | 1 + drivers/net/wireless/ath/ath12k/mac.c | 139 ++++++++++++++++++++++++- 2 files changed, 137 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/core.h b/drivers/net/wireless/ath/ath12k/core.h index 5be977008319b..e246e3d3c162d 100644 --- a/drivers/net/wireless/ath/ath12k/core.h +++ b/drivers/net/wireless/ath/ath12k/core.h @@ -122,6 +122,7 @@ struct ath12k_skb_cb { dma_addr_t paddr_ext_desc; u32 cipher; u8 flags; + u8 link_id; }; struct ath12k_skb_rxcb { diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index a6fe998c177eb..5ca96cb86d174 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -6848,6 +6848,7 @@ static void ath12k_mgmt_over_wmi_tx_purge(struct ath12k *ar) static void ath12k_mgmt_over_wmi_tx_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ath12k *ar = container_of(work, struct ath12k, wmi_mgmt_tx_work); + struct ath12k_hw *ah = ar->ah; struct ath12k_skb_cb *skb_cb; struct ath12k_vif *ahvif; struct ath12k_link_vif *arvif; @@ -6865,7 +6866,15 @@ static void ath12k_mgmt_over_wmi_tx_work(struct wiphy *wiphy, struct wiphy_work } ahvif = ath12k_vif_to_ahvif(skb_cb->vif); - arvif = &ahvif->deflink; + if (!(ahvif->links_map & BIT(skb_cb->link_id))) { + ath12k_warn(ar->ab, + "invalid linkid %u in mgmt over wmi tx with linkmap 0x%x\n", + skb_cb->link_id, ahvif->links_map); + ath12k_mgmt_over_wmi_tx_drop(ar, skb); + continue; + } + + arvif = wiphy_dereference(ah->hw->wiphy, ahvif->link[skb_cb->link_id]); if (ar->allocated_vdev_map & (1LL << arvif->vdev_id)) { ret = ath12k_mac_mgmt_tx_wmi(ar, arvif, skb); if (ret) { @@ -6875,8 +6884,9 @@ static void ath12k_mgmt_over_wmi_tx_work(struct wiphy *wiphy, struct wiphy_work } } else { ath12k_warn(ar->ab, - "dropping mgmt frame for vdev %d, is_started %d\n", + "dropping mgmt frame for vdev %d link %u is_started %d\n", arvif->vdev_id, + skb_cb->link_id, arvif->is_started); ath12k_mgmt_over_wmi_tx_drop(ar, skb); } @@ -6936,6 +6946,105 @@ static void ath12k_mac_add_p2p_noa_ie(struct ath12k *ar, spin_unlock_bh(&ar->data_lock); } +/* Note: called under rcu_read_lock() */ +static u8 ath12k_mac_get_tx_link(struct ieee80211_sta *sta, struct ieee80211_vif *vif, + u8 link, struct sk_buff *skb, u32 info_flags) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; + struct ath12k_vif *ahvif = ath12k_vif_to_ahvif(vif); + struct ieee80211_link_sta *link_sta; + struct ieee80211_bss_conf *bss_conf; + struct ath12k_sta *ahsta; + + /* Use the link id passed or the default vif link */ + if (!sta) { + if (link != IEEE80211_LINK_UNSPECIFIED) + return link; + + return ahvif->deflink.link_id; + } + + ahsta = ath12k_sta_to_ahsta(sta); + + /* Below translation ensures we pass proper A2 & A3 for non ML clients. + * Also it assumes for now support only for MLO AP in this path + */ + if (!sta->mlo) { + link = ahsta->deflink.link_id; + + if (info_flags & IEEE80211_TX_CTL_HW_80211_ENCAP) + return link; + + bss_conf = rcu_dereference(vif->link_conf[link]); + if (bss_conf) { + ether_addr_copy(hdr->addr2, bss_conf->addr); + if (!ieee80211_has_tods(hdr->frame_control) && + !ieee80211_has_fromds(hdr->frame_control)) + ether_addr_copy(hdr->addr3, bss_conf->addr); + } + + return link; + } + + /* enqueue eth enacap & data frames on primary link, FW does link + * selection and address translation. + */ + if (info_flags & IEEE80211_TX_CTL_HW_80211_ENCAP || + ieee80211_is_data(hdr->frame_control)) + return ahsta->assoc_link_id; + + /* 802.11 frame cases */ + if (link == IEEE80211_LINK_UNSPECIFIED) + link = ahsta->deflink.link_id; + + if (!ieee80211_is_mgmt(hdr->frame_control)) + return link; + + /* Perform address conversion for ML STA Tx */ + bss_conf = rcu_dereference(vif->link_conf[link]); + link_sta = rcu_dereference(sta->link[link]); + + if (bss_conf && link_sta) { + ether_addr_copy(hdr->addr1, link_sta->addr); + ether_addr_copy(hdr->addr2, bss_conf->addr); + + if (vif->type == NL80211_IFTYPE_STATION && bss_conf->bssid) + ether_addr_copy(hdr->addr3, bss_conf->bssid); + else if (vif->type == NL80211_IFTYPE_AP) + ether_addr_copy(hdr->addr3, bss_conf->addr); + + return link; + } + + if (bss_conf) { + /* In certain cases where a ML sta associated and added subset of + * links on which the ML AP is active, but now sends some frame + * (ex. Probe request) on a different link which is active in our + * MLD but was not added during previous association, we can + * still honor the Tx to that ML STA via the requested link. + * The control would reach here in such case only when that link + * address is same as the MLD address or in worst case clients + * used MLD address at TA wrongly which would have helped + * identify the ML sta object and pass it here. + * If the link address of that STA is different from MLD address, + * then the sta object would be NULL and control won't reach + * here but return at the start of the function itself with !sta + * check. Also this would not need any translation at hdr->addr1 + * from MLD to link address since the RA is the MLD address + * (same as that link address ideally) already. + */ + ether_addr_copy(hdr->addr2, bss_conf->addr); + + if (vif->type == NL80211_IFTYPE_STATION && bss_conf->bssid) + ether_addr_copy(hdr->addr3, bss_conf->bssid); + else if (vif->type == NL80211_IFTYPE_AP) + ether_addr_copy(hdr->addr3, bss_conf->addr); + } + + return link; +} + +/* Note: called under rcu_read_lock() */ static void ath12k_mac_op_tx(struct ieee80211_hw *hw, struct ieee80211_tx_control *control, struct sk_buff *skb) @@ -6945,13 +7054,16 @@ static void ath12k_mac_op_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif = info->control.vif; struct ath12k_vif *ahvif = ath12k_vif_to_ahvif(vif); struct ath12k_link_vif *arvif = &ahvif->deflink; - struct ath12k *ar = arvif->ar; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; struct ieee80211_key_conf *key = info->control.hw_key; + struct ieee80211_sta *sta = control->sta; u32 info_flags = info->flags; + struct ath12k *ar; bool is_prb_rsp; + u8 link_id; int ret; + link_id = u32_get_bits(info->control.flags, IEEE80211_TX_CTRL_MLO_LINK); memset(skb_cb, 0, sizeof(*skb_cb)); skb_cb->vif = vif; @@ -6960,6 +7072,27 @@ static void ath12k_mac_op_tx(struct ieee80211_hw *hw, skb_cb->flags |= ATH12K_SKB_CIPHER_SET; } + /* handle only for MLO case, use deflink for non MLO case */ + if (ieee80211_vif_is_mld(vif)) { + link_id = ath12k_mac_get_tx_link(sta, vif, link_id, skb, info_flags); + if (link_id >= IEEE80211_MLD_MAX_NUM_LINKS) { + ieee80211_free_txskb(hw, skb); + return; + } + } else { + link_id = 0; + } + + arvif = rcu_dereference(ahvif->link[link_id]); + if (!arvif || !arvif->ar) { + ath12k_warn(ahvif->ah, "failed to find arvif link id %u for frame transmission", + link_id); + ieee80211_free_txskb(hw, skb); + return; + } + + ar = arvif->ar; + skb_cb->link_id = link_id; is_prb_rsp = ieee80211_is_probe_resp(hdr->frame_control); if (info_flags & IEEE80211_TX_CTL_HW_80211_ENCAP) { From 2197feb0249d308bbb0ba0443bd45511cdec190a Mon Sep 17 00:00:00 2001 From: Sriram R Date: Tue, 26 Nov 2024 19:11:32 +0200 Subject: [PATCH 0070/1450] wifi: ath12k: ath12k_mac_op_flush(): MLO support Currently when tx flush is requested for an vif only packets corresponding to deflink are flushed, with MLO multiple link arvif could be affiliated to the ML vif and packets corresponding to all of them should be flushed. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.3.1-00173-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Sriram R Co-developed-by: Maharaja Kennadyrajan Signed-off-by: Maharaja Kennadyrajan Signed-off-by: Rameshkumar Sundaram Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241126171139.2350704-4-kvalo@kernel.org --- drivers/net/wireless/ath/ath12k/mac.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index 5ca96cb86d174..595e8110ab86c 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -9287,7 +9287,11 @@ static void ath12k_mac_op_flush(struct ieee80211_hw *hw, struct ieee80211_vif *v u32 queues, bool drop) { struct ath12k_hw *ah = ath12k_hw_to_ah(hw); + struct ath12k_link_vif *arvif; + struct ath12k_vif *ahvif; + unsigned long links; struct ath12k *ar; + u8 link_id; int i; lockdep_assert_wiphy(hw->wiphy); @@ -9302,12 +9306,18 @@ static void ath12k_mac_op_flush(struct ieee80211_hw *hw, struct ieee80211_vif *v return; } - ar = ath12k_get_ar_by_vif(hw, vif); + for_each_ar(ah, ar, i) + wiphy_work_flush(hw->wiphy, &ar->wmi_mgmt_tx_work); - if (!ar) - return; + ahvif = ath12k_vif_to_ahvif(vif); + links = ahvif->links_map; + for_each_set_bit(link_id, &links, IEEE80211_MLD_MAX_NUM_LINKS) { + arvif = wiphy_dereference(hw->wiphy, ahvif->link[link_id]); + if (!(arvif && arvif->ar)) + continue; - ath12k_mac_flush(ar); + ath12k_mac_flush(arvif->ar); + } } static int From 5419ef950da4a76c54c91129f16c292fc65da56b Mon Sep 17 00:00:00 2001 From: Sriram R Date: Tue, 26 Nov 2024 19:11:33 +0200 Subject: [PATCH 0071/1450] wifi: ath12k: ath12k_mac_op_ampdu_action(): MLO support Apply tid queue setup based on all link stations on receiving ampdu action params for an ML Station. Modify ath12k_get_ar_by_vif() to fetch ar based on link arvif inside ahvif. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.3.1-00173-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Sriram R Signed-off-by: Rameshkumar Sundaram Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241126171139.2350704-5-kvalo@kernel.org --- drivers/net/wireless/ath/ath12k/dp_rx.c | 36 +++++++++--- drivers/net/wireless/ath/ath12k/dp_rx.h | 6 +- drivers/net/wireless/ath/ath12k/mac.c | 76 ++++++++++++++----------- 3 files changed, 76 insertions(+), 42 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/dp_rx.c b/drivers/net/wireless/ath/ath12k/dp_rx.c index 70680f2124e52..b24d1de4aabb9 100644 --- a/drivers/net/wireless/ath/ath12k/dp_rx.c +++ b/drivers/net/wireless/ath/ath12k/dp_rx.c @@ -1065,15 +1065,25 @@ int ath12k_dp_rx_peer_tid_setup(struct ath12k *ar, const u8 *peer_mac, int vdev_ } int ath12k_dp_rx_ampdu_start(struct ath12k *ar, - struct ieee80211_ampdu_params *params) + struct ieee80211_ampdu_params *params, + u8 link_id) { struct ath12k_base *ab = ar->ab; struct ath12k_sta *ahsta = ath12k_sta_to_ahsta(params->sta); - struct ath12k_link_sta *arsta = &ahsta->deflink; - int vdev_id = arsta->arvif->vdev_id; + struct ath12k_link_sta *arsta; + int vdev_id; int ret; - ret = ath12k_dp_rx_peer_tid_setup(ar, params->sta->addr, vdev_id, + lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy); + + arsta = wiphy_dereference(ath12k_ar_to_hw(ar)->wiphy, + ahsta->link[link_id]); + if (!arsta) + return -ENOLINK; + + vdev_id = arsta->arvif->vdev_id; + + ret = ath12k_dp_rx_peer_tid_setup(ar, arsta->addr, vdev_id, params->tid, params->buf_size, params->ssn, arsta->ahsta->pn_type); if (ret) @@ -1083,19 +1093,29 @@ int ath12k_dp_rx_ampdu_start(struct ath12k *ar, } int ath12k_dp_rx_ampdu_stop(struct ath12k *ar, - struct ieee80211_ampdu_params *params) + struct ieee80211_ampdu_params *params, + u8 link_id) { struct ath12k_base *ab = ar->ab; struct ath12k_peer *peer; struct ath12k_sta *ahsta = ath12k_sta_to_ahsta(params->sta); - struct ath12k_link_sta *arsta = &ahsta->deflink; - int vdev_id = arsta->arvif->vdev_id; + struct ath12k_link_sta *arsta; + int vdev_id; bool active; int ret; + lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy); + + arsta = wiphy_dereference(ath12k_ar_to_hw(ar)->wiphy, + ahsta->link[link_id]); + if (!arsta) + return -ENOLINK; + + vdev_id = arsta->arvif->vdev_id; + spin_lock_bh(&ab->base_lock); - peer = ath12k_peer_find(ab, vdev_id, params->sta->addr); + peer = ath12k_peer_find(ab, vdev_id, arsta->addr); if (!peer) { spin_unlock_bh(&ab->base_lock); ath12k_warn(ab, "failed to find the peer to stop rx aggregation\n"); diff --git a/drivers/net/wireless/ath/ath12k/dp_rx.h b/drivers/net/wireless/ath/ath12k/dp_rx.h index bfd4f814553e7..1ce82088c9540 100644 --- a/drivers/net/wireless/ath/ath12k/dp_rx.h +++ b/drivers/net/wireless/ath/ath12k/dp_rx.h @@ -85,9 +85,11 @@ static inline u32 ath12k_he_gi_to_nl80211_he_gi(u8 sgi) } int ath12k_dp_rx_ampdu_start(struct ath12k *ar, - struct ieee80211_ampdu_params *params); + struct ieee80211_ampdu_params *params, + u8 link_id); int ath12k_dp_rx_ampdu_stop(struct ath12k *ar, - struct ieee80211_ampdu_params *params); + struct ieee80211_ampdu_params *params, + u8 link_id); int ath12k_dp_rx_peer_pn_replay_config(struct ath12k_link_vif *arvif, const u8 *peer_addr, enum set_key_cmd key_cmd, diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index 595e8110ab86c..ec8209f172613 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -725,11 +725,14 @@ static struct ath12k *ath12k_get_ar_by_ctx(struct ieee80211_hw *hw, } static struct ath12k *ath12k_get_ar_by_vif(struct ieee80211_hw *hw, - struct ieee80211_vif *vif) + struct ieee80211_vif *vif, + u8 link_id) { struct ath12k_vif *ahvif = ath12k_vif_to_ahvif(vif); - struct ath12k_link_vif *arvif = &ahvif->deflink; struct ath12k_hw *ah = ath12k_hw_to_ah(hw); + struct ath12k_link_vif *arvif; + + lockdep_assert_wiphy(hw->wiphy); /* If there is one pdev within ah, then we return * ar directly. @@ -737,7 +740,11 @@ static struct ath12k *ath12k_get_ar_by_vif(struct ieee80211_hw *hw, if (ah->num_radio == 1) return ah->radio; - if (arvif->is_created) + if (!(ahvif->links_map & BIT(link_id))) + return NULL; + + arvif = wiphy_dereference(hw->wiphy, ahvif->link[link_id]); + if (arvif && arvif->is_created) return arvif->ar; return NULL; @@ -5667,6 +5674,7 @@ static void ath12k_mac_op_sta_rc_update(struct ieee80211_hw *hw, struct ath12k *ar; struct ath12k_sta *ahsta = ath12k_sta_to_ahsta(sta); struct ath12k_vif *ahvif = ath12k_vif_to_ahvif(vif); + struct ath12k_hw *ah = ath12k_hw_to_ah(hw); struct ath12k_link_sta *arsta; struct ath12k_link_vif *arvif; struct ath12k_peer *peer; @@ -5676,20 +5684,17 @@ static void ath12k_mac_op_sta_rc_update(struct ieee80211_hw *hw, */ u8 link_id = ATH12K_DEFAULT_LINK_ID; - ar = ath12k_get_ar_by_vif(hw, vif); - if (!ar) { - WARN_ON_ONCE(1); - return; - } - rcu_read_lock(); arvif = rcu_dereference(ahvif->link[link_id]); if (!arvif) { - ath12k_warn(ar->ab, "mac sta rc update failed to fetch link vif on link id %u for peer %pM\n", - link_id, sta->addr); + ath12k_hw_warn(ah, "mac sta rc update failed to fetch link vif on link id %u for peer %pM\n", + link_id, sta->addr); rcu_read_unlock(); return; } + + ar = arvif->ar; + arsta = rcu_dereference(ahsta->link[link_id]); if (!arsta) { rcu_read_unlock(); @@ -8288,20 +8293,26 @@ static int ath12k_mac_op_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx return ret; } -static int ath12k_mac_ampdu_action(struct ath12k_link_vif *arvif, - struct ieee80211_ampdu_params *params) +static int ath12k_mac_ampdu_action(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_ampdu_params *params, + u8 link_id) { - struct ath12k *ar = arvif->ar; + struct ath12k *ar; int ret = -EINVAL; - lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy); + lockdep_assert_wiphy(hw->wiphy); + + ar = ath12k_get_ar_by_vif(hw, vif, link_id); + if (!ar) + return -EINVAL; switch (params->action) { case IEEE80211_AMPDU_RX_START: - ret = ath12k_dp_rx_ampdu_start(ar, params); + ret = ath12k_dp_rx_ampdu_start(ar, params, link_id); break; case IEEE80211_AMPDU_RX_STOP: - ret = ath12k_dp_rx_ampdu_stop(ar, params); + ret = ath12k_dp_rx_ampdu_stop(ar, params, link_id); break; case IEEE80211_AMPDU_TX_START: case IEEE80211_AMPDU_TX_STOP_CONT: @@ -8315,6 +8326,10 @@ static int ath12k_mac_ampdu_action(struct ath12k_link_vif *arvif, break; } + if (ret) + ath12k_warn(ar->ab, "unable to perform ampdu action %d for vif %pM link %u ret %d\n", + params->action, vif->addr, link_id, ret); + return ret; } @@ -8322,27 +8337,24 @@ static int ath12k_mac_op_ampdu_action(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_ampdu_params *params) { - struct ath12k_hw *ah = ath12k_hw_to_ah(hw); - struct ath12k *ar; - struct ath12k_vif *ahvif = ath12k_vif_to_ahvif(vif); - struct ath12k_link_vif *arvif; + struct ieee80211_sta *sta = params->sta; + struct ath12k_sta *ahsta = ath12k_sta_to_ahsta(sta); + unsigned long links_map = ahsta->links_map; int ret = -EINVAL; + u8 link_id; lockdep_assert_wiphy(hw->wiphy); - ar = ath12k_get_ar_by_vif(hw, vif); - if (!ar) - return -EINVAL; - - ar = ath12k_ah_to_ar(ah, 0); - arvif = &ahvif->deflink; + if (WARN_ON(!links_map)) + return ret; - ret = ath12k_mac_ampdu_action(arvif, params); - if (ret) - ath12k_warn(ar->ab, "pdev idx %d unable to perform ampdu action %d ret %d\n", - ar->pdev_idx, params->action, ret); + for_each_set_bit(link_id, &links_map, IEEE80211_MLD_MAX_NUM_LINKS) { + ret = ath12k_mac_ampdu_action(hw, vif, params, link_id); + if (ret) + return ret; + } - return ret; + return 0; } static int ath12k_mac_op_add_chanctx(struct ieee80211_hw *hw, From 85edf16384d12db938a09458d89662cdff87068e Mon Sep 17 00:00:00 2001 From: Rameshkumar Sundaram Date: Tue, 26 Nov 2024 19:11:34 +0200 Subject: [PATCH 0072/1450] wifi: ath12k: ath12k_mac_station_add(): fix potential rx_stats leak If peer creation fails ar->rx_stats needs to be freed in error handling. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.3.1-00173-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Rameshkumar Sundaram Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241126171139.2350704-6-kvalo@kernel.org --- drivers/net/wireless/ath/ath12k/mac.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index ec8209f172613..0b9dd50959e6b 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -5308,6 +5308,8 @@ static int ath12k_mac_station_add(struct ath12k *ar, free_peer: ath12k_peer_delete(ar, arvif->vdev_id, arsta->addr); + kfree(arsta->rx_stats); + arsta->rx_stats = NULL; dec_num_station: ath12k_mac_dec_num_stations(arvif, arsta); exit: From 90570ba4610bdb1db39ef45f2b271a9f89680a9d Mon Sep 17 00:00:00 2001 From: Sriram R Date: Tue, 26 Nov 2024 19:11:35 +0200 Subject: [PATCH 0073/1450] wifi: ath12k: do not return invalid link id for scan link When a scan request is received, driver selects a link id for which the arvif can be mapped. Same link is also used for getting the link conf address. Currently, we return 0 as link id for a non ML vif, which is correct since that is the default link id. Also when any of the link vif is active and the scan request is for a channel in the active link we return its link id. But, when we don't hit both of the above cases (i.e not a ML vif or no active link vif for the channel is present) we currently return 0 as the link id. Bu the problemis that this might not work out always, eg., when only one link (eg. linkid = 1) is added to vif, then we won't find any link conf for link id 0 in the vif resulting in scan failure. During AP bringup, such scan failure causes bringup issues. Hence avoid sending link id 0 as default. Rather use a default link for scan and default link address for the same. This scan vdev will either be deleted if another scan is requested on same vif or when AP is broughtup on same link or during interface cleanup. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.3.1-00173-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Sriram R Signed-off-by: Rameshkumar Sundaram Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241126171139.2350704-7-kvalo@kernel.org --- drivers/net/wireless/ath/ath12k/core.h | 3 +- drivers/net/wireless/ath/ath12k/mac.c | 65 +++++++++++++++++++------- drivers/net/wireless/ath/ath12k/mac.h | 6 +++ 3 files changed, 56 insertions(+), 18 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/core.h b/drivers/net/wireless/ath/ath12k/core.h index e246e3d3c162d..f4a710d495846 100644 --- a/drivers/net/wireless/ath/ath12k/core.h +++ b/drivers/net/wireless/ath/ath12k/core.h @@ -322,10 +322,11 @@ struct ath12k_vif { bool ps; struct ath12k_link_vif deflink; - struct ath12k_link_vif __rcu *link[IEEE80211_MLD_MAX_NUM_LINKS]; + struct ath12k_link_vif __rcu *link[ATH12K_NUM_MAX_LINKS]; struct ath12k_vif_cache *cache[IEEE80211_MLD_MAX_NUM_LINKS]; /* indicates bitmap of link vif created in FW */ u16 links_map; + u8 last_scan_link; /* Must be last - ends in a flexible-array member. * diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index 0b9dd50959e6b..5aff5ba7e3b4f 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -3792,6 +3792,9 @@ static void ath12k_ahvif_put_link_key_cache(struct ath12k_vif_cache *cache) static void ath12k_ahvif_put_link_cache(struct ath12k_vif *ahvif, u8 link_id) { + if (link_id >= IEEE80211_MLD_MAX_NUM_LINKS) + return; + ath12k_ahvif_put_link_key_cache(ahvif->cache[link_id]); kfree(ahvif->cache[link_id]); ahvif->cache[link_id] = NULL; @@ -3852,9 +3855,9 @@ static struct ath12k_link_vif *ath12k_mac_assign_link_vif(struct ath12k_hw *ah, arvif = &ahvif->deflink; } else { /* If this is the first link arvif being created for an ML VIF - * use the preallocated deflink memory + * use the preallocated deflink memory except for scan arvifs */ - if (!ahvif->links_map) { + if (!ahvif->links_map && link_id != ATH12K_DEFAULT_SCAN_LINK) { arvif = &ahvif->deflink; } else { arvif = (struct ath12k_link_vif *) @@ -4154,10 +4157,10 @@ ath12k_mac_find_link_id_by_ar(struct ath12k_vif *ahvif, struct ath12k *ar) return link_id; } - /* input ar is not assigned to any of the links, use link id - * 0 for scan vdev creation. + /* input ar is not assigned to any of the links of ML VIF, use scan + * link (15) for scan vdev creation. */ - return 0; + return ATH12K_DEFAULT_SCAN_LINK; } static int ath12k_mac_op_hw_scan(struct ieee80211_hw *hw, @@ -4188,7 +4191,7 @@ static int ath12k_mac_op_hw_scan(struct ieee80211_hw *hw, /* check if any of the links of ML VIF is already started on * radio(ar) correpsondig to given scan frequency and use it, - * if not use deflink(link 0) for scan purpose. + * if not use scan link (link 15) for scan purpose. */ link_id = ath12k_mac_find_link_id_by_ar(ahvif, ar); arvif = ath12k_mac_assign_link_vif(ah, vif, link_id); @@ -4298,6 +4301,13 @@ static int ath12k_mac_op_hw_scan(struct ieee80211_hw *hw, spin_unlock_bh(&ar->data_lock); } + /* As per cfg80211/mac80211 scan design, it allows only one + * scan at a time. Hence last_scan link id is used for + * tracking the link id on which the scan is been done on + * this vif. + */ + ahvif->last_scan_link = arvif->link_id; + /* Add a margin to account for event/command processing */ ieee80211_queue_delayed_work(ath12k_ar_to_hw(ar), &ar->scan.timeout, msecs_to_jiffies(arg->max_scan_time + @@ -4317,14 +4327,14 @@ static void ath12k_mac_op_cancel_hw_scan(struct ieee80211_hw *hw, struct ieee80211_vif *vif) { struct ath12k_vif *ahvif = ath12k_vif_to_ahvif(vif); + u16 link_id = ahvif->last_scan_link; struct ath12k_link_vif *arvif; struct ath12k *ar; lockdep_assert_wiphy(hw->wiphy); - arvif = &ahvif->deflink; - - if (!arvif->is_created) + arvif = wiphy_dereference(hw->wiphy, ahvif->link[link_id]); + if (!arvif || !arvif->is_created) return; ar = arvif->ar; @@ -7688,10 +7698,19 @@ int ath12k_mac_vdev_create(struct ath12k *ar, struct ath12k_link_vif *arvif) u16 nss; int i; int ret, vdev_id; + u8 link_id; lockdep_assert_wiphy(hw->wiphy); - link_conf = wiphy_dereference(hw->wiphy, vif->link_conf[arvif->link_id]); + /* If no link is active and scan vdev is requested + * use a default link conf for scan address purpose. + */ + if (arvif->link_id == ATH12K_DEFAULT_SCAN_LINK && vif->valid_links) + link_id = ffs(vif->valid_links) - 1; + else + link_id = arvif->link_id; + + link_conf = wiphy_dereference(hw->wiphy, vif->link_conf[link_id]); if (!link_conf) { ath12k_warn(ar->ab, "unable to access bss link conf in vdev create for vif %pM link %u\n", vif->addr, arvif->link_id); @@ -7971,7 +7990,9 @@ static struct ath12k *ath12k_mac_assign_vif_to_vdev(struct ieee80211_hw *hw, struct ath12k_link_vif *arvif, struct ieee80211_chanctx_conf *ctx) { - struct ieee80211_vif *vif = ath12k_ahvif_to_vif(arvif->ahvif); + struct ath12k_vif *ahvif = arvif->ahvif; + struct ieee80211_vif *vif = ath12k_ahvif_to_vif(ahvif); + struct ath12k_link_vif *scan_arvif; struct ath12k_hw *ah = hw->priv; struct ath12k *ar; struct ath12k_base *ab; @@ -7990,6 +8011,19 @@ static struct ath12k *ath12k_mac_assign_vif_to_vdev(struct ieee80211_hw *hw, if (!ar) return NULL; + /* cleanup the scan vdev if we are done scan on that ar + * and now we want to create for actual usage. + */ + if (ieee80211_vif_is_mld(vif)) { + scan_arvif = wiphy_dereference(hw->wiphy, + ahvif->link[ATH12K_DEFAULT_SCAN_LINK]); + if (scan_arvif && scan_arvif->ar == ar) { + ar->scan.vdev_id = -1; + ath12k_mac_remove_link_interface(hw, scan_arvif); + ath12k_mac_unassign_link_vif(scan_arvif); + } + } + if (arvif->ar) { /* This is not expected really */ if (WARN_ON(!arvif->is_created)) { @@ -8194,7 +8228,7 @@ static void ath12k_mac_op_remove_interface(struct ieee80211_hw *hw, lockdep_assert_wiphy(hw->wiphy); - for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { + for (link_id = 0; link_id < ATH12K_NUM_MAX_LINKS; link_id++) { /* if we cached some config but never received assign chanctx, * free the allocated cache. */ @@ -9042,11 +9076,8 @@ ath12k_mac_op_assign_vif_chanctx(struct ieee80211_hw *hw, return -ENOMEM; } - if (!arvif->is_started) { - ar = ath12k_mac_assign_vif_to_vdev(hw, arvif, ctx); - if (!ar) - return -EINVAL; - } else { + ar = ath12k_mac_assign_vif_to_vdev(hw, arvif, ctx); + if (!ar) { ath12k_warn(arvif->ar->ab, "failed to assign chanctx for vif %pM link id %u link vif is already started", vif->addr, link_id); return -EINVAL; diff --git a/drivers/net/wireless/ath/ath12k/mac.h b/drivers/net/wireless/ath/ath12k/mac.h index c13630ee479a1..abdc9a6c07402 100644 --- a/drivers/net/wireless/ath/ath12k/mac.h +++ b/drivers/net/wireless/ath/ath12k/mac.h @@ -44,6 +44,12 @@ struct ath12k_generic_iter { #define ATH12K_DEFAULT_LINK_ID 0 #define ATH12K_INVALID_LINK_ID 255 +/* Default link after the IEEE802.11 defined Max link id limit + * for driver usage purpose. + */ +#define ATH12K_DEFAULT_SCAN_LINK IEEE80211_MLD_MAX_NUM_LINKS +#define ATH12K_NUM_MAX_LINKS (IEEE80211_MLD_MAX_NUM_LINKS + 1) + enum ath12k_supported_bw { ATH12K_BW_20 = 0, ATH12K_BW_40 = 1, From 1833a2ce5d7df2b064e491d3e912da9fa0b85eb9 Mon Sep 17 00:00:00 2001 From: Aditya Kumar Singh Date: Tue, 26 Nov 2024 19:11:36 +0200 Subject: [PATCH 0074/1450] wifi: ath12k: ath12k_bss_assoc(): MLO support Currently, the ath12k_bss_assoc() function handles only deflink station connections. To support multi-link station connections, make the necessary changes to retrieve the required information from the link-level members. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.3.1-00173-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Aditya Kumar Singh Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241126171139.2350704-8-kvalo@kernel.org --- drivers/net/wireless/ath/ath12k/mac.c | 28 +++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index 5aff5ba7e3b4f..2bb5d79c66b19 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -3133,7 +3133,9 @@ static void ath12k_bss_assoc(struct ath12k *ar, struct ath12k_vif *ahvif = arvif->ahvif; struct ieee80211_vif *vif = ath12k_ahvif_to_vif(ahvif); struct ath12k_wmi_vdev_up_params params = {}; - struct ath12k_wmi_peer_assoc_arg peer_arg; + struct ath12k_wmi_peer_assoc_arg peer_arg = {}; + struct ieee80211_link_sta *link_sta; + u8 link_id = bss_conf->link_id; struct ath12k_link_sta *arsta; struct ieee80211_sta *ap_sta; struct ath12k_sta *ahsta; @@ -3143,27 +3145,38 @@ static void ath12k_bss_assoc(struct ath12k *ar, lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy); - ath12k_dbg(ar->ab, ATH12K_DBG_MAC, "mac vdev %i assoc bssid %pM aid %d\n", - arvif->vdev_id, arvif->bssid, ahvif->aid); + ath12k_dbg(ar->ab, ATH12K_DBG_MAC, + "mac vdev %i link id %u assoc bssid %pM aid %d\n", + arvif->vdev_id, link_id, arvif->bssid, ahvif->aid); rcu_read_lock(); - ap_sta = ieee80211_find_sta(vif, bss_conf->bssid); + /* During ML connection, cfg.ap_addr has the MLD address. For + * non-ML connection, it has the BSSID. + */ + ap_sta = ieee80211_find_sta(vif, vif->cfg.ap_addr); if (!ap_sta) { ath12k_warn(ar->ab, "failed to find station entry for bss %pM vdev %i\n", - bss_conf->bssid, arvif->vdev_id); + vif->cfg.ap_addr, arvif->vdev_id); rcu_read_unlock(); return; } ahsta = ath12k_sta_to_ahsta(ap_sta); - arsta = &ahsta->deflink; + arsta = wiphy_dereference(ath12k_ar_to_hw(ar)->wiphy, + ahsta->link[link_id]); if (WARN_ON(!arsta)) { rcu_read_unlock(); return; } + link_sta = ath12k_mac_get_link_sta(arsta); + if (WARN_ON(!link_sta)) { + rcu_read_unlock(); + return; + } + ath12k_peer_assoc_prepare(ar, arvif, arsta, &peer_arg, false); rcu_read_unlock(); @@ -3182,8 +3195,7 @@ static void ath12k_bss_assoc(struct ath12k *ar, } ret = ath12k_setup_peer_smps(ar, arvif, bss_conf->bssid, - &ap_sta->deflink.ht_cap, - &ap_sta->deflink.he_6ghz_capa); + &link_sta->ht_cap, &link_sta->he_6ghz_capa); if (ret) { ath12k_warn(ar->ab, "failed to setup peer SMPS for vdev %d: %d\n", arvif->vdev_id, ret); From aa80f12f3bedc2d73e4cc43554aee44c277cc938 Mon Sep 17 00:00:00 2001 From: Rameshkumar Sundaram Date: Tue, 26 Nov 2024 19:11:37 +0200 Subject: [PATCH 0075/1450] wifi: ath12k: defer vdev creation for MLO Currently for single radio devices (ah->num_radio == 1) ath12k_mac_op_add_interface() creates vdev and later hw scan and assign_vif_chanctx uses the same. For MLO, vdev create request should carry ML address which will not be known during ath12k_mac_op_add_interface() as vif will be marked as ML only after links are added to it. If hw scan is requested, the vdev will be deleted post hw scan and subsequent assign_vif_chanctx call will create new vdev with ML address. But in certain cases assign_vif_chanctx could be called without any prior hw scan request and reusing the previously created vdev causes a non-ML vdev to be used for an ML vif and firmware operates the vdev in non-ML mode. Fix this by deferring vdev creation for interface until hw scan or assign_vif_chanctx request is received from mac80211. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.3.1-00173-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Rameshkumar Sundaram Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241126171139.2350704-9-kvalo@kernel.org --- drivers/net/wireless/ath/ath12k/mac.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index 2bb5d79c66b19..ee804d4a3fd8d 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -8131,14 +8131,9 @@ static int ath12k_mac_op_add_interface(struct ieee80211_hw *hw, vif->hw_queue[i] = ATH12K_HW_DEFAULT_QUEUE; vif->driver_flags |= IEEE80211_VIF_SUPPORTS_UAPSD; - /* For non-ml vifs, vif->addr is the actual vdev address but for - * ML vif link(link BSSID) address is the vdev address and it can be a - * different one from vif->addr (i.e ML address). - * Defer vdev creation until assign_chanctx or hw_scan is initiated as driver + /* Defer vdev creation until assign_chanctx or hw_scan is initiated as driver * will not know if this interface is an ML vif at this point. */ - ath12k_mac_assign_vif_to_vdev(hw, arvif, NULL); - return 0; } From ad969bc9ee73fa9eda6223be2a7c0c6caf937d71 Mon Sep 17 00:00:00 2001 From: Kalle Valo Date: Tue, 26 Nov 2024 19:11:38 +0200 Subject: [PATCH 0076/1450] wifi: ath12k: ath12k_mac_op_set_key(): fix uninitialized symbol 'ret' Dan reported that in some cases the ret variable could be uninitialized. Fix that by removing the out label entirely and returning zero explicitly on succesful cases. Also remove the unnecessary else branches to follow more the style used in ath12k and now it's easier to see the error handling. No functional changes. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.3.1-00173-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/7e7afd00-ad84-4744-8d94-416bab7e7dd9@stanley.mountain/ Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241126171139.2350704-10-kvalo@kernel.org --- drivers/net/wireless/ath/ath12k/mac.c | 57 +++++++++++++++------------ 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index ee804d4a3fd8d..ae8a253c466cd 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -4657,6 +4657,7 @@ static int ath12k_mac_op_set_key(struct ieee80211_hw *hw, enum set_key_cmd cmd, if (sta) { ahsta = ath12k_sta_to_ahsta(sta); + /* For an ML STA Pairwise key is same for all associated link Stations, * hence do set key for all link STAs which are active. */ @@ -4679,41 +4680,47 @@ static int ath12k_mac_op_set_key(struct ieee80211_hw *hw, enum set_key_cmd cmd, if (ret) break; } - } else { - arsta = &ahsta->deflink; - arvif = arsta->arvif; - if (WARN_ON(!arvif)) { - ret = -EINVAL; - goto out; - } - ret = ath12k_mac_set_key(arvif->ar, cmd, arvif, arsta, key); - } - } else { - if (key->link_id >= 0 && key->link_id < IEEE80211_MLD_MAX_NUM_LINKS) { - link_id = key->link_id; - arvif = wiphy_dereference(hw->wiphy, ahvif->link[link_id]); - } else { - link_id = 0; - arvif = &ahvif->deflink; + return 0; } - if (!arvif || !arvif->is_created) { - cache = ath12k_ahvif_get_link_cache(ahvif, link_id); - if (!cache) - return -ENOSPC; + arsta = &ahsta->deflink; + arvif = arsta->arvif; + if (WARN_ON(!arvif)) + return -EINVAL; + + ret = ath12k_mac_set_key(arvif->ar, cmd, arvif, arsta, key); + if (ret) + return ret; + + return 0; + } + + if (key->link_id >= 0 && key->link_id < IEEE80211_MLD_MAX_NUM_LINKS) { + link_id = key->link_id; + arvif = wiphy_dereference(hw->wiphy, ahvif->link[link_id]); + } else { + link_id = 0; + arvif = &ahvif->deflink; + } - ret = ath12k_mac_update_key_cache(cache, cmd, sta, key); + if (!arvif || !arvif->is_created) { + cache = ath12k_ahvif_get_link_cache(ahvif, link_id); + if (!cache) + return -ENOSPC; + ret = ath12k_mac_update_key_cache(cache, cmd, sta, key); + if (ret) return ret; - } - ret = ath12k_mac_set_key(arvif->ar, cmd, arvif, NULL, key); + return 0; } -out: + ret = ath12k_mac_set_key(arvif->ar, cmd, arvif, NULL, key); + if (ret) + return ret; - return ret; + return 0; } static int From 8c2143702d0719a0357600bca0236900781ffc78 Mon Sep 17 00:00:00 2001 From: Kalle Valo Date: Tue, 26 Nov 2024 19:11:39 +0200 Subject: [PATCH 0077/1450] wifi: ath12k: ath12k_mac_op_sta_rc_update(): use mac80211 provided link id There's a todo comment to use mac80211 provided link id. As mac80211 now provides it use it in ath12k and remove the comment. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.3.1-00173-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241126171139.2350704-11-kvalo@kernel.org --- drivers/net/wireless/ath/ath12k/mac.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index ae8a253c466cd..8d42077078676 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -5696,10 +5696,10 @@ static int ath12k_mac_op_sta_set_txpwr(struct ieee80211_hw *hw, return ret; } -static void ath12k_mac_op_sta_rc_update(struct ieee80211_hw *hw, - struct ieee80211_vif *vif, - struct ieee80211_link_sta *link_sta, - u32 changed) +static void ath12k_mac_op_link_sta_rc_update(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_link_sta *link_sta, + u32 changed) { struct ieee80211_sta *sta = link_sta->sta; struct ath12k *ar; @@ -5710,27 +5710,23 @@ static void ath12k_mac_op_sta_rc_update(struct ieee80211_hw *hw, struct ath12k_link_vif *arvif; struct ath12k_peer *peer; u32 bw, smps; - /* TODO: use proper link id once link sta specific rc update support is - * available in mac80211. - */ - u8 link_id = ATH12K_DEFAULT_LINK_ID; rcu_read_lock(); - arvif = rcu_dereference(ahvif->link[link_id]); + arvif = rcu_dereference(ahvif->link[link_sta->link_id]); if (!arvif) { ath12k_hw_warn(ah, "mac sta rc update failed to fetch link vif on link id %u for peer %pM\n", - link_id, sta->addr); + link_sta->link_id, sta->addr); rcu_read_unlock(); return; } ar = arvif->ar; - arsta = rcu_dereference(ahsta->link[link_id]); + arsta = rcu_dereference(ahsta->link[link_sta->link_id]); if (!arsta) { rcu_read_unlock(); ath12k_warn(ar->ab, "mac sta rc update failed to fetch link sta on link id %u for peer %pM\n", - link_id, sta->addr); + link_sta->link_id, sta->addr); return; } spin_lock_bh(&ar->ab->base_lock); @@ -10165,7 +10161,7 @@ static const struct ieee80211_ops ath12k_ops = { .set_rekey_data = ath12k_mac_op_set_rekey_data, .sta_state = ath12k_mac_op_sta_state, .sta_set_txpwr = ath12k_mac_op_sta_set_txpwr, - .link_sta_rc_update = ath12k_mac_op_sta_rc_update, + .link_sta_rc_update = ath12k_mac_op_link_sta_rc_update, .conf_tx = ath12k_mac_op_conf_tx, .set_antenna = ath12k_mac_op_set_antenna, .get_antenna = ath12k_mac_op_get_antenna, From 130727c37b7e2495db10535f6ef00095783ad5a9 Mon Sep 17 00:00:00 2001 From: Pierre-Henry Moussay Date: Mon, 30 Sep 2024 10:54:30 +0100 Subject: [PATCH 0078/1450] dt-bindings: can: mpfs: add PIC64GX CAN compatibility PIC64GX CAN is compatible with the MPFS CAN, only add a fallback Signed-off-by: Pierre-Henry Moussay Acked-by: Conor Dooley Reviewed-by: Marc Kleine-Budde Link: https://patch.msgid.link/20240930095449.1813195-2-pierre-henry.moussay@microchip.com Signed-off-by: Marc Kleine-Budde --- .../devicetree/bindings/net/can/microchip,mpfs-can.yaml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/net/can/microchip,mpfs-can.yaml b/Documentation/devicetree/bindings/net/can/microchip,mpfs-can.yaml index 01e4d4a54df6a..1219c5cb601fe 100644 --- a/Documentation/devicetree/bindings/net/can/microchip,mpfs-can.yaml +++ b/Documentation/devicetree/bindings/net/can/microchip,mpfs-can.yaml @@ -15,7 +15,11 @@ allOf: properties: compatible: - const: microchip,mpfs-can + oneOf: + - items: + - const: microchip,pic64gx-can + - const: microchip,mpfs-can + - const: microchip,mpfs-can reg: maxItems: 1 From 79195755cdebffb085bd2b6c0767272ef39f53bb Mon Sep 17 00:00:00 2001 From: Sean Nyekjaer Date: Thu, 28 Nov 2024 09:29:21 +0100 Subject: [PATCH 0079/1450] dt-bindings: can: convert tcan4x5x.txt to DT schema Convert binding doc tcan4x5x.txt to yaml. Added during conversion, required clock-names cclk. Signed-off-by: Sean Nyekjaer Acked-by: Conor Dooley Reviewed-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20241128-convert-tcan-v3-1-bf2d8005bab5@geanix.com Signed-off-by: Marc Kleine-Budde --- .../devicetree/bindings/net/can/tcan4x5x.txt | 48 ----- .../bindings/net/can/ti,tcan4x5x.yaml | 191 ++++++++++++++++++ 2 files changed, 191 insertions(+), 48 deletions(-) delete mode 100644 Documentation/devicetree/bindings/net/can/tcan4x5x.txt create mode 100644 Documentation/devicetree/bindings/net/can/ti,tcan4x5x.yaml diff --git a/Documentation/devicetree/bindings/net/can/tcan4x5x.txt b/Documentation/devicetree/bindings/net/can/tcan4x5x.txt deleted file mode 100644 index 20c0572c98534..0000000000000 --- a/Documentation/devicetree/bindings/net/can/tcan4x5x.txt +++ /dev/null @@ -1,48 +0,0 @@ -Texas Instruments TCAN4x5x CAN Controller -================================================ - -This file provides device node information for the TCAN4x5x interface contains. - -Required properties: - - compatible: - "ti,tcan4552", "ti,tcan4x5x" - "ti,tcan4553", "ti,tcan4x5x" or - "ti,tcan4x5x" - - reg: 0 - - #address-cells: 1 - - #size-cells: 0 - - spi-max-frequency: Maximum frequency of the SPI bus the chip can - operate at should be less than or equal to 18 MHz. - - interrupt-parent: the phandle to the interrupt controller which provides - the interrupt. - - interrupts: interrupt specification for data-ready. - -See Documentation/devicetree/bindings/net/can/bosch,m_can.yaml for additional -required property details. - -Optional properties: - - reset-gpios: Hardwired output GPIO. If not defined then software - reset. - - device-state-gpios: Input GPIO that indicates if the device is in - a sleep state or if the device is active. Not - available with tcan4552/4553. - - device-wake-gpios: Wake up GPIO to wake up the TCAN device. Not - available with tcan4552/4553. - - wakeup-source: Leave the chip running when suspended, and configure - the RX interrupt to wake up the device. - -Example: -tcan4x5x: tcan4x5x@0 { - compatible = "ti,tcan4x5x"; - reg = <0>; - #address-cells = <1>; - #size-cells = <1>; - spi-max-frequency = <10000000>; - bosch,mram-cfg = <0x0 0 0 16 0 0 1 1>; - interrupt-parent = <&gpio1>; - interrupts = <14 IRQ_TYPE_LEVEL_LOW>; - device-state-gpios = <&gpio3 21 GPIO_ACTIVE_HIGH>; - device-wake-gpios = <&gpio1 15 GPIO_ACTIVE_HIGH>; - reset-gpios = <&gpio1 27 GPIO_ACTIVE_HIGH>; - wakeup-source; -}; diff --git a/Documentation/devicetree/bindings/net/can/ti,tcan4x5x.yaml b/Documentation/devicetree/bindings/net/can/ti,tcan4x5x.yaml new file mode 100644 index 0000000000000..afd9d315dea2a --- /dev/null +++ b/Documentation/devicetree/bindings/net/can/ti,tcan4x5x.yaml @@ -0,0 +1,191 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/net/can/ti,tcan4x5x.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Texas Instruments TCAN4x5x CAN Controller + +maintainers: + - Marc Kleine-Budde + +properties: + compatible: + oneOf: + - items: + - enum: + - ti,tcan4552 + - ti,tcan4553 + - const: ti,tcan4x5x + - const: ti,tcan4x5x + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + description: The GPIO parent interrupt. + + clocks: + maxItems: 1 + + clock-names: + items: + - const: cclk + + reset-gpios: + description: Hardwired output GPIO. If not defined then software reset. + maxItems: 1 + + device-state-gpios: + description: + Input GPIO that indicates if the device is in a sleep state or if the + device is active. Not available with tcan4552/4553. + maxItems: 1 + + device-wake-gpios: + description: + Wake up GPIO to wake up the TCAN device. + Not available with tcan4552/4553. + maxItems: 1 + + bosch,mram-cfg: + description: | + Message RAM configuration data. + Multiple M_CAN instances can share the same Message RAM + and each element(e.g Rx FIFO or Tx Buffer and etc) number + in Message RAM is also configurable, so this property is + telling driver how the shared or private Message RAM are + used by this M_CAN controller. + + The format should be as follows: + + The 'offset' is an address offset of the Message RAM where + the following elements start from. This is usually set to + 0x0 if you're using a private Message RAM. The remain cells + are used to specify how many elements are used for each FIFO/Buffer. + + M_CAN includes the following elements according to user manual: + 11-bit Filter 0-128 elements / 0-128 words + 29-bit Filter 0-64 elements / 0-128 words + Rx FIFO 0 0-64 elements / 0-1152 words + Rx FIFO 1 0-64 elements / 0-1152 words + Rx Buffers 0-64 elements / 0-1152 words + Tx Event FIFO 0-32 elements / 0-64 words + Tx Buffers 0-32 elements / 0-576 words + + Please refer to 2.4.1 Message RAM Configuration in Bosch + M_CAN user manual for details. + $ref: /schemas/types.yaml#/definitions/int32-array + items: + - description: The 'offset' is an address offset of the Message RAM where + the following elements start from. This is usually set to 0x0 if + you're using a private Message RAM. + default: 0 + - description: 11-bit Filter 0-128 elements / 0-128 words + minimum: 0 + maximum: 128 + - description: 29-bit Filter 0-64 elements / 0-128 words + minimum: 0 + maximum: 64 + - description: Rx FIFO 0 0-64 elements / 0-1152 words + minimum: 0 + maximum: 64 + - description: Rx FIFO 1 0-64 elements / 0-1152 words + minimum: 0 + maximum: 64 + - description: Rx Buffers 0-64 elements / 0-1152 words + minimum: 0 + maximum: 64 + - description: Tx Event FIFO 0-32 elements / 0-64 words + minimum: 0 + maximum: 32 + - description: Tx Buffers 0-32 elements / 0-576 words + minimum: 0 + maximum: 32 + minItems: 1 + + spi-max-frequency: + description: + Must be half or less of "clocks" frequency. + maximum: 18000000 + + wakeup-source: + $ref: /schemas/types.yaml#/definitions/flag + description: + Enable CAN remote wakeup. + +allOf: + - $ref: can-controller.yaml# + - $ref: /schemas/spi/spi-peripheral-props.yaml# + - if: + properties: + compatible: + contains: + enum: + - ti,tcan4552 + - ti,tcan4553 + then: + properties: + device-state-gpios: false + device-wake-gpios: false + +required: + - compatible + - reg + - interrupts + - clocks + - clock-names + - bosch,mram-cfg + +unevaluatedProperties: false + +examples: + - | + #include + #include + + spi { + #address-cells = <1>; + #size-cells = <0>; + + can@0 { + compatible = "ti,tcan4x5x"; + reg = <0>; + clocks = <&can0_osc>; + clock-names = "cclk"; + pinctrl-names = "default"; + pinctrl-0 = <&can0_pins>; + spi-max-frequency = <10000000>; + bosch,mram-cfg = <0x0 0 0 16 0 0 1 1>; + interrupt-parent = <&gpio1>; + interrupts = <14 IRQ_TYPE_LEVEL_LOW>; + device-state-gpios = <&gpio3 21 GPIO_ACTIVE_HIGH>; + device-wake-gpios = <&gpio1 15 GPIO_ACTIVE_HIGH>; + reset-gpios = <&gpio1 27 GPIO_ACTIVE_HIGH>; + wakeup-source; + }; + }; + - | + #include + #include + + spi { + #address-cells = <1>; + #size-cells = <0>; + + can@0 { + compatible = "ti,tcan4552", "ti,tcan4x5x"; + reg = <0>; + clocks = <&can0_osc>; + clock-names = "cclk"; + pinctrl-names = "default"; + pinctrl-0 = <&can0_pins>; + spi-max-frequency = <10000000>; + bosch,mram-cfg = <0x0 0 0 16 0 0 1 1>; + interrupt-parent = <&gpio1>; + interrupts = <14 IRQ_TYPE_LEVEL_LOW>; + reset-gpios = <&gpio1 27 GPIO_ACTIVE_HIGH>; + wakeup-source; + }; + }; From 6495567981be6f91eccb48e058ca88dd7acad181 Mon Sep 17 00:00:00 2001 From: Sean Nyekjaer Date: Thu, 14 Nov 2024 10:14:49 +0100 Subject: [PATCH 0080/1450] dt-bindings: can: tcan4x5x: Document the ti,nwkrq-voltage-vio option The nWKRQ pin supports an output voltage of either the internal reference voltage (3.6V) or the reference voltage of the digital interface 0-6V (VIO). Add the devicetree option ti,nwkrq-voltage-vio to set it to VIO. If this property is omitted the reset default, the internal reference voltage, is used. Signed-off-by: Sean Nyekjaer Reviewed-by: Rob Herring (Arm) Reviewed-by: Marc Kleine-Budde Link: https://patch.msgid.link/20241114-tcan-wkrqv-v5-1-a2d50833ed71@geanix.com Signed-off-by: Marc Kleine-Budde --- .../devicetree/bindings/net/can/ti,tcan4x5x.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Documentation/devicetree/bindings/net/can/ti,tcan4x5x.yaml b/Documentation/devicetree/bindings/net/can/ti,tcan4x5x.yaml index afd9d315dea2a..384e15da27136 100644 --- a/Documentation/devicetree/bindings/net/can/ti,tcan4x5x.yaml +++ b/Documentation/devicetree/bindings/net/can/ti,tcan4x5x.yaml @@ -110,6 +110,13 @@ properties: Must be half or less of "clocks" frequency. maximum: 18000000 + ti,nwkrq-voltage-vio: + type: boolean + description: + nWKRQ Pin GPO buffer voltage configuration. + Set nWKRQ to use VIO voltage rail. + When not set nWKRQ will use internal voltage rail. + wakeup-source: $ref: /schemas/types.yaml#/definitions/flag description: @@ -163,6 +170,7 @@ examples: device-state-gpios = <&gpio3 21 GPIO_ACTIVE_HIGH>; device-wake-gpios = <&gpio1 15 GPIO_ACTIVE_HIGH>; reset-gpios = <&gpio1 27 GPIO_ACTIVE_HIGH>; + ti,nwkrq-voltage-vio; wakeup-source; }; }; From fc38e9339c47d704934bc74e55c331f0d2d88583 Mon Sep 17 00:00:00 2001 From: Karthikeyan Periyasamy Date: Fri, 29 Nov 2024 13:20:33 +0200 Subject: [PATCH 0081/1450] wifi: ath12k: Refactor core startup In the upcoming hardware device group abstraction radios across different devices can be grouped together to support multi-link operation and register as a device group to mac80211. Currently, ath12k_mac_allocate() and ath12k_mac_register() are part of ath12k_core_start() and ath12k_core_pdev_create() respectively and are based on per device (struct ath12k_base). These APIs can be decoupled and moved out to ath12k_core_qmi_firmware_ready() itself. This refactor is helpful for device group abstraction when mac80211 allocate and register will be changed from per device (struct ath12k_base) to per device group (struct ath12k_hw_group). Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.3.1-00173-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Karthikeyan Periyasamy Signed-off-by: Harshitha Prem Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241128165026.2618331-2-kvalo@kernel.org --- drivers/net/wireless/ath/ath12k/core.c | 63 ++++++++++++-------------- drivers/net/wireless/ath/ath12k/dp.c | 7 +++ drivers/net/wireless/ath/ath12k/pci.c | 9 ++++ drivers/net/wireless/ath/ath12k/qmi.c | 4 ++ 4 files changed, 50 insertions(+), 33 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/core.c b/drivers/net/wireless/ath/ath12k/core.c index 263a7c7891225..5313b02673071 100644 --- a/drivers/net/wireless/ath/ath12k/core.c +++ b/drivers/net/wireless/ath/ath12k/core.c @@ -604,9 +604,10 @@ static void ath12k_core_stop(struct ath12k_base *ab) ath12k_acpi_stop(ab); + ath12k_dp_rx_pdev_reo_cleanup(ab); ath12k_hif_stop(ab); ath12k_wmi_detach(ab); - ath12k_dp_rx_pdev_reo_cleanup(ab); + ath12k_dp_free(ab); /* De-Init of components as needed */ } @@ -708,7 +709,7 @@ static int ath12k_core_soc_create(struct ath12k_base *ab) static void ath12k_core_soc_destroy(struct ath12k_base *ab) { - ath12k_dp_free(ab); + ath12k_hif_power_down(ab, false); ath12k_reg_free(ab); ath12k_debugfs_soc_destroy(ab); ath12k_qmi_deinit_service(ab); @@ -718,30 +719,17 @@ static int ath12k_core_pdev_create(struct ath12k_base *ab) { int ret; - ret = ath12k_mac_register(ab); - if (ret) { - ath12k_err(ab, "failed register the radio with mac80211: %d\n", ret); - return ret; - } - ret = ath12k_dp_pdev_alloc(ab); if (ret) { ath12k_err(ab, "failed to attach DP pdev: %d\n", ret); - goto err_mac_unregister; + return ret; } return 0; - -err_mac_unregister: - ath12k_mac_unregister(ab); - - return ret; } static void ath12k_core_pdev_destroy(struct ath12k_base *ab) { - ath12k_mac_unregister(ab); - ath12k_hif_irq_disable(ab); ath12k_dp_pdev_free(ab); } @@ -799,19 +787,12 @@ static int ath12k_core_start(struct ath12k_base *ab, goto err_hif_stop; } - ret = ath12k_mac_allocate(ab); - if (ret) { - ath12k_err(ab, "failed to create new hw device with mac80211 :%d\n", - ret); - goto err_hif_stop; - } - ath12k_dp_cc_config(ab); ret = ath12k_dp_rx_pdev_reo_setup(ab); if (ret) { ath12k_err(ab, "failed to initialize reo destination rings: %d\n", ret); - goto err_mac_destroy; + goto err_hif_stop; } ath12k_dp_hal_rx_desc_init(ab); @@ -854,8 +835,6 @@ static int ath12k_core_start(struct ath12k_base *ab, err_reo_cleanup: ath12k_dp_rx_pdev_reo_cleanup(ab); -err_mac_destroy: - ath12k_mac_destroy(ab); err_hif_stop: ath12k_hif_stop(ab); err_wmi_detach: @@ -909,28 +888,46 @@ int ath12k_core_qmi_firmware_ready(struct ath12k_base *ab) goto err_dp_free; } + ret = ath12k_mac_allocate(ab); + if (ret) { + ath12k_err(ab, "failed to create new hw device with mac80211 :%d\n", + ret); + goto err_core_stop; + } + + ret = ath12k_mac_register(ab); + if (ret) { + ath12k_err(ab, "failed register the radio with mac80211: %d\n", ret); + goto err_mac_destroy; + } + ret = ath12k_core_pdev_create(ab); if (ret) { ath12k_err(ab, "failed to create pdev core: %d\n", ret); - goto err_core_stop; + goto err_mac_unregister; } + ath12k_hif_irq_enable(ab); ret = ath12k_core_rfkill_config(ab); if (ret && ret != -EOPNOTSUPP) { ath12k_err(ab, "failed to config rfkill: %d\n", ret); - goto err_core_pdev_destroy; + goto err_hif_irq_disable; } mutex_unlock(&ab->core_lock); return 0; -err_core_pdev_destroy: +err_hif_irq_disable: + ath12k_hif_irq_disable(ab); ath12k_core_pdev_destroy(ab); +err_mac_unregister: + ath12k_mac_unregister(ab); +err_mac_destroy: + ath12k_mac_destroy(ab); err_core_stop: ath12k_core_stop(ab); - ath12k_mac_destroy(ab); err_dp_free: ath12k_dp_free(ab); mutex_unlock(&ab->core_lock); @@ -1270,15 +1267,15 @@ void ath12k_core_deinit(struct ath12k_base *ab) mutex_lock(&ab->core_lock); + ath12k_hif_irq_disable(ab); ath12k_core_pdev_destroy(ab); + ath12k_mac_unregister(ab); + ath12k_mac_destroy(ab); ath12k_core_stop(ab); mutex_unlock(&ab->core_lock); - ath12k_hif_power_down(ab, false); - ath12k_mac_destroy(ab); ath12k_core_soc_destroy(ab); - ath12k_fw_unmap(ab); } void ath12k_core_free(struct ath12k_base *ab) diff --git a/drivers/net/wireless/ath/ath12k/dp.c b/drivers/net/wireless/ath/ath12k/dp.c index 328be2c635d67..ce823b1c175f5 100644 --- a/drivers/net/wireless/ath/ath12k/dp.c +++ b/drivers/net/wireless/ath/ath12k/dp.c @@ -982,6 +982,9 @@ void ath12k_dp_pdev_free(struct ath12k_base *ab) { int i; + if (!ab->mon_reap_timer.function) + return; + del_timer_sync(&ab->mon_reap_timer); for (i = 0; i < ab->num_radios; i++) @@ -1289,6 +1292,9 @@ void ath12k_dp_free(struct ath12k_base *ab) struct ath12k_dp *dp = &ab->dp; int i; + if (!dp->ab) + return; + ath12k_dp_link_desc_cleanup(ab, dp->link_desc_banks, HAL_WBM_IDLE_LINK, &dp->wbm_idle_ring); @@ -1306,6 +1312,7 @@ void ath12k_dp_free(struct ath12k_base *ab) ath12k_dp_rx_free(ab); /* Deinit any SOC level resource */ + dp->ab = NULL; } void ath12k_dp_cc_config(struct ath12k_base *ab) diff --git a/drivers/net/wireless/ath/ath12k/pci.c b/drivers/net/wireless/ath/ath12k/pci.c index cf907550e6a4a..8dbc7377ae7ca 100644 --- a/drivers/net/wireless/ath/ath12k/pci.c +++ b/drivers/net/wireless/ath/ath12k/pci.c @@ -1123,6 +1123,9 @@ void ath12k_pci_ext_irq_enable(struct ath12k_base *ab) void ath12k_pci_ext_irq_disable(struct ath12k_base *ab) { + if (!test_bit(ATH12K_FLAG_EXT_IRQ_ENABLED, &ab->dev_flags)) + return; + __ath12k_pci_ext_irq_disable(ab); ath12k_pci_sync_ext_irqs(ab); } @@ -1147,6 +1150,11 @@ int ath12k_pci_hif_resume(struct ath12k_base *ab) void ath12k_pci_stop(struct ath12k_base *ab) { + struct ath12k_pci *ab_pci = ath12k_pci_priv(ab); + + if (!test_bit(ATH12K_PCI_FLAG_INIT_DONE, &ab_pci->flags)) + return; + ath12k_pci_ce_irq_disable_sync(ab); ath12k_ce_cleanup_pipes(ab); } @@ -1725,6 +1733,7 @@ static void ath12k_pci_remove(struct pci_dev *pdev) cancel_work_sync(&ab->reset_work); cancel_work_sync(&ab->dump_work); ath12k_core_deinit(ab); + ath12k_fw_unmap(ab); qmi_fail: ath12k_mhi_unregister(ab_pci); diff --git a/drivers/net/wireless/ath/ath12k/qmi.c b/drivers/net/wireless/ath/ath12k/qmi.c index d2d9d03c7a28e..f5388eae01dc9 100644 --- a/drivers/net/wireless/ath/ath12k/qmi.c +++ b/drivers/net/wireless/ath/ath12k/qmi.c @@ -3402,11 +3402,15 @@ int ath12k_qmi_init_service(struct ath12k_base *ab) void ath12k_qmi_deinit_service(struct ath12k_base *ab) { + if (!ab->qmi.ab) + return; + qmi_handle_release(&ab->qmi.handle); cancel_work_sync(&ab->qmi.event_work); destroy_workqueue(ab->qmi.event_wq); ath12k_qmi_m3_free(ab); ath12k_qmi_free_target_mem_chunk(ab); + ab->qmi.ab = NULL; } void ath12k_qmi_free_resource(struct ath12k_base *ab) From 016abac20b832ce2e2b8afbe2c9ef8158ad1cfe8 Mon Sep 17 00:00:00 2001 From: Karthikeyan Periyasamy Date: Fri, 29 Nov 2024 13:20:33 +0200 Subject: [PATCH 0082/1450] wifi: ath12k: add ath12k_ab_to_ah() and ath12k_ab_set_ah() Currently, one or more ath12k_hw is part of a device (struct ath12k_base) but in future, it would be part of device group abstraction (struct ath12k_hw_group), i.e., when multiple radios (ar) across different devices can be combined together in a device group (struct ath12k_hw_group). In order to facilitate the above transition, introduce helpers ath12k_ab_to_ah() and ath12k_ab_set_ah() to get and set values of ath12k_hw respectively. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.3.1-00173-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Karthikeyan Periyasamy Signed-off-by: Harshitha Prem Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241128165026.2618331-3-kvalo@kernel.org --- drivers/net/wireless/ath/ath12k/core.c | 8 ++++---- drivers/net/wireless/ath/ath12k/core.h | 11 +++++++++++ drivers/net/wireless/ath/ath12k/mac.c | 23 +++++++++++++---------- 3 files changed, 28 insertions(+), 14 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/core.c b/drivers/net/wireless/ath/ath12k/core.c index 5313b02673071..14d0aa26d850a 100644 --- a/drivers/net/wireless/ath/ath12k/core.c +++ b/drivers/net/wireless/ath/ath12k/core.c @@ -986,7 +986,7 @@ static void ath12k_rfkill_work(struct work_struct *work) spin_unlock_bh(&ab->base_lock); for (i = 0; i < ab->num_hw; i++) { - ah = ab->ah[i]; + ah = ath12k_ab_to_ah(ab, i); if (!ah) continue; @@ -1038,7 +1038,7 @@ static void ath12k_core_pre_reconfigure_recovery(struct ath12k_base *ab) set_bit(ATH12K_FLAG_CRASH_FLUSH, &ab->dev_flags); for (i = 0; i < ab->num_hw; i++) { - ah = ab->ah[i]; + ah = ath12k_ab_to_ah(ab, i); if (!ah || ah->state == ATH12K_HW_STATE_OFF) continue; @@ -1077,7 +1077,7 @@ static void ath12k_core_post_reconfigure_recovery(struct ath12k_base *ab) int i, j; for (i = 0; i < ab->num_hw; i++) { - ah = ab->ah[i]; + ah = ath12k_ab_to_ah(ab, i); if (!ah || ah->state == ATH12K_HW_STATE_OFF) continue; @@ -1131,7 +1131,7 @@ static void ath12k_core_restart(struct work_struct *work) if (ab->is_reset) { for (i = 0; i < ab->num_hw; i++) { - ah = ab->ah[i]; + ah = ath12k_ab_to_ah(ab, i); ieee80211_restart_hw(ah->hw); } } diff --git a/drivers/net/wireless/ath/ath12k/core.h b/drivers/net/wireless/ath/ath12k/core.h index f4a710d495846..ba52be1cfd0f8 100644 --- a/drivers/net/wireless/ath/ath12k/core.h +++ b/drivers/net/wireless/ath/ath12k/core.h @@ -1160,4 +1160,15 @@ static inline struct ieee80211_hw *ath12k_ar_to_hw(struct ath12k *ar) #define for_each_ar(ah, ar, index) \ for ((index) = 0; ((index) < (ah)->num_radio && \ ((ar) = &(ah)->radio[(index)])); (index)++) + +static inline struct ath12k_hw *ath12k_ab_to_ah(struct ath12k_base *ab, int idx) +{ + return ab->ah[idx]; +} + +static inline void ath12k_ab_set_ah(struct ath12k_base *ab, int idx, + struct ath12k_hw *ah) +{ + ab->ah[idx] = ah; +} #endif /* _CORE_H_ */ diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index 8d42077078676..8cafb67523c98 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -10832,7 +10832,7 @@ int ath12k_mac_register(struct ath12k_base *ab) ab->free_vdev_map = (1LL << (ab->num_radios * TARGET_NUM_VDEVS)) - 1; for (i = 0; i < ab->num_hw; i++) { - ah = ab->ah[i]; + ah = ath12k_ab_to_ah(ab, i); ret = ath12k_mac_hw_register(ah); if (ret) @@ -10843,7 +10843,7 @@ int ath12k_mac_register(struct ath12k_base *ab) err: for (i = i - 1; i >= 0; i--) { - ah = ab->ah[i]; + ah = ath12k_ab_to_ah(ab, i); if (!ah) continue; @@ -10859,7 +10859,7 @@ void ath12k_mac_unregister(struct ath12k_base *ab) int i; for (i = ab->num_hw - 1; i >= 0; i--) { - ah = ab->ah[i]; + ah = ath12k_ab_to_ah(ab, i); if (!ah) continue; @@ -10917,6 +10917,7 @@ static struct ath12k_hw *ath12k_mac_hw_allocate(struct ath12k_base *ab, void ath12k_mac_destroy(struct ath12k_base *ab) { struct ath12k_pdev *pdev; + struct ath12k_hw *ah; int i; for (i = 0; i < ab->num_radios; i++) { @@ -10928,11 +10929,12 @@ void ath12k_mac_destroy(struct ath12k_base *ab) } for (i = 0; i < ab->num_hw; i++) { - if (!ab->ah[i]) + ah = ath12k_ab_to_ah(ab, i); + if (!ah) continue; - ath12k_mac_hw_destroy(ab->ah[i]); - ab->ah[i] = NULL; + ath12k_mac_hw_destroy(ah); + ath12k_ab_set_ah(ab, i, NULL); } } @@ -10965,7 +10967,7 @@ int ath12k_mac_allocate(struct ath12k_base *ab) ah->dev = ab->dev; - ab->ah[i] = ah; + ath12k_ab_set_ah(ab, i, ah); } ath12k_dp_pdev_pre_alloc(ab); @@ -10974,11 +10976,12 @@ int ath12k_mac_allocate(struct ath12k_base *ab) err: for (i = i - 1; i >= 0; i--) { - if (!ab->ah[i]) + ah = ath12k_ab_to_ah(ab, i); + if (!ah) continue; - ath12k_mac_hw_destroy(ab->ah[i]); - ab->ah[i] = NULL; + ath12k_mac_hw_destroy(ah); + ath12k_ab_set_ah(ab, i, NULL); } return ret; From 17dd22aff52716eda49541bfec71a8f10bd7e514 Mon Sep 17 00:00:00 2001 From: Karthikeyan Periyasamy Date: Fri, 29 Nov 2024 13:20:33 +0200 Subject: [PATCH 0083/1450] wifi: ath12k: add ath12k_get_num_hw() Currently, one or more struct ath12k_hw is part of device (struct ath12k_base) but in future, ath12k_hw would be part of device group (struct ath12k_hw_group). Hence, num_hw under device would be moved to device group. To facilitate above transition, add helper ath12k_get_num_hw() to get the number of radios per device. In future, this helper will return the number of radios in a device group. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.3.1-00173-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Karthikeyan Periyasamy Signed-off-by: Harshitha Prem Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241128165026.2618331-4-kvalo@kernel.org --- drivers/net/wireless/ath/ath12k/core.c | 8 ++++---- drivers/net/wireless/ath/ath12k/core.h | 5 +++++ drivers/net/wireless/ath/ath12k/mac.c | 8 ++++---- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/core.c b/drivers/net/wireless/ath/ath12k/core.c index 14d0aa26d850a..386d42db29ac0 100644 --- a/drivers/net/wireless/ath/ath12k/core.c +++ b/drivers/net/wireless/ath/ath12k/core.c @@ -985,7 +985,7 @@ static void ath12k_rfkill_work(struct work_struct *work) rfkill_radio_on = ab->rfkill_radio_on; spin_unlock_bh(&ab->base_lock); - for (i = 0; i < ab->num_hw; i++) { + for (i = 0; i < ath12k_get_num_hw(ab); i++) { ah = ath12k_ab_to_ah(ab, i); if (!ah) continue; @@ -1037,7 +1037,7 @@ static void ath12k_core_pre_reconfigure_recovery(struct ath12k_base *ab) if (ab->is_reset) set_bit(ATH12K_FLAG_CRASH_FLUSH, &ab->dev_flags); - for (i = 0; i < ab->num_hw; i++) { + for (i = 0; i < ath12k_get_num_hw(ab); i++) { ah = ath12k_ab_to_ah(ab, i); if (!ah || ah->state == ATH12K_HW_STATE_OFF) continue; @@ -1076,7 +1076,7 @@ static void ath12k_core_post_reconfigure_recovery(struct ath12k_base *ab) struct ath12k *ar; int i, j; - for (i = 0; i < ab->num_hw; i++) { + for (i = 0; i < ath12k_get_num_hw(ab); i++) { ah = ath12k_ab_to_ah(ab, i); if (!ah || ah->state == ATH12K_HW_STATE_OFF) continue; @@ -1130,7 +1130,7 @@ static void ath12k_core_restart(struct work_struct *work) } if (ab->is_reset) { - for (i = 0; i < ab->num_hw; i++) { + for (i = 0; i < ath12k_get_num_hw(ab); i++) { ah = ath12k_ab_to_ah(ab, i); ieee80211_restart_hw(ah->hw); } diff --git a/drivers/net/wireless/ath/ath12k/core.h b/drivers/net/wireless/ath/ath12k/core.h index ba52be1cfd0f8..4bfc7a7cc8948 100644 --- a/drivers/net/wireless/ath/ath12k/core.h +++ b/drivers/net/wireless/ath/ath12k/core.h @@ -1171,4 +1171,9 @@ static inline void ath12k_ab_set_ah(struct ath12k_base *ab, int idx, { ab->ah[idx] = ah; } + +static inline int ath12k_get_num_hw(struct ath12k_base *ab) +{ + return ab->num_hw; +} #endif /* _CORE_H_ */ diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index 8cafb67523c98..129607ac6c1a1 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -10831,7 +10831,7 @@ int ath12k_mac_register(struct ath12k_base *ab) ab->cc_freq_hz = 320000; ab->free_vdev_map = (1LL << (ab->num_radios * TARGET_NUM_VDEVS)) - 1; - for (i = 0; i < ab->num_hw; i++) { + for (i = 0; i < ath12k_get_num_hw(ab); i++) { ah = ath12k_ab_to_ah(ab, i); ret = ath12k_mac_hw_register(ah); @@ -10858,7 +10858,7 @@ void ath12k_mac_unregister(struct ath12k_base *ab) struct ath12k_hw *ah; int i; - for (i = ab->num_hw - 1; i >= 0; i--) { + for (i = ath12k_get_num_hw(ab) - 1; i >= 0; i--) { ah = ath12k_ab_to_ah(ab, i); if (!ah) continue; @@ -10928,7 +10928,7 @@ void ath12k_mac_destroy(struct ath12k_base *ab) pdev->ar = NULL; } - for (i = 0; i < ab->num_hw; i++) { + for (i = 0; i < ath12k_get_num_hw(ab); i++) { ah = ath12k_ab_to_ah(ab, i); if (!ah) continue; @@ -10951,7 +10951,7 @@ int ath12k_mac_allocate(struct ath12k_base *ab) ab->num_hw = ab->num_radios; radio_per_hw = 1; - for (i = 0; i < ab->num_hw; i++) { + for (i = 0; i < ath12k_get_num_hw(ab); i++) { for (j = 0; j < radio_per_hw; j++) { pdev_map[j].ab = ab; pdev_map[j].pdev_idx = (i * radio_per_hw) + j; From 45e72c306c08d59d0dc42238a8571bbbf04823f5 Mon Sep 17 00:00:00 2001 From: Karthikeyan Periyasamy Date: Fri, 29 Nov 2024 13:20:33 +0200 Subject: [PATCH 0084/1450] wifi: ath12k: introduce QMI firmware ready flag When hardware device group abstraction is introduced, the QMI firmware ready event of different devices in a group can be received simultaneously. To indicate the firmware ready event is completed for a particular device in a group set a flag (ATH12K_FLAG_QMI_FW_READY_COMPLETE). This helps the upcoming hardware recovery implementation for hardware device group abstraction. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.3.1-00173-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Karthikeyan Periyasamy Signed-off-by: Harshitha Prem Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241128165026.2618331-5-kvalo@kernel.org --- drivers/net/wireless/ath/ath12k/core.c | 2 +- drivers/net/wireless/ath/ath12k/core.h | 1 + drivers/net/wireless/ath/ath12k/qmi.c | 12 +++++++++--- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/core.c b/drivers/net/wireless/ath/ath12k/core.c index 386d42db29ac0..4da147f7bfac6 100644 --- a/drivers/net/wireless/ath/ath12k/core.c +++ b/drivers/net/wireless/ath/ath12k/core.c @@ -1145,7 +1145,7 @@ static void ath12k_core_reset(struct work_struct *work) int reset_count, fail_cont_count; long time_left; - if (!(test_bit(ATH12K_FLAG_REGISTERED, &ab->dev_flags))) { + if (!(test_bit(ATH12K_FLAG_QMI_FW_READY_COMPLETE, &ab->dev_flags))) { ath12k_warn(ab, "ignore reset dev flags 0x%lx\n", ab->dev_flags); return; } diff --git a/drivers/net/wireless/ath/ath12k/core.h b/drivers/net/wireless/ath/ath12k/core.h index 4bfc7a7cc8948..9ddced140056c 100644 --- a/drivers/net/wireless/ath/ath12k/core.h +++ b/drivers/net/wireless/ath/ath12k/core.h @@ -228,6 +228,7 @@ enum ath12k_dev_flags { ATH12K_FLAG_HTC_SUSPEND_COMPLETE, ATH12K_FLAG_CE_IRQ_ENABLED, ATH12K_FLAG_EXT_IRQ_ENABLED, + ATH12K_FLAG_QMI_FW_READY_COMPLETE, }; struct ath12k_tx_conf { diff --git a/drivers/net/wireless/ath/ath12k/qmi.c b/drivers/net/wireless/ath/ath12k/qmi.c index f5388eae01dc9..77d8ee14bf33e 100644 --- a/drivers/net/wireless/ath/ath12k/qmi.c +++ b/drivers/net/wireless/ath/ath12k/qmi.c @@ -3033,6 +3033,8 @@ void ath12k_qmi_firmware_stop(struct ath12k_base *ab) { int ret; + clear_bit(ATH12K_FLAG_QMI_FW_READY_COMPLETE, &ab->dev_flags); + ret = ath12k_qmi_wlanfw_mode_send(ab, ATH12K_FIRMWARE_MODE_OFF); if (ret < 0) { ath12k_warn(ab, "qmi failed to send wlan mode off\n"); @@ -3336,7 +3338,7 @@ static void ath12k_qmi_driver_event_work(struct work_struct *work) break; case ATH12K_QMI_EVENT_FW_READY: clear_bit(ATH12K_FLAG_QMI_FAIL, &ab->dev_flags); - if (test_bit(ATH12K_FLAG_REGISTERED, &ab->dev_flags)) { + if (test_bit(ATH12K_FLAG_QMI_FW_READY_COMPLETE, &ab->dev_flags)) { if (ab->is_reset) ath12k_hal_dump_srng_stats(ab); queue_work(ab->workqueue, &ab->restart_work); @@ -3346,8 +3348,12 @@ static void ath12k_qmi_driver_event_work(struct work_struct *work) clear_bit(ATH12K_FLAG_CRASH_FLUSH, &ab->dev_flags); clear_bit(ATH12K_FLAG_RECOVERY, &ab->dev_flags); - ath12k_core_qmi_firmware_ready(ab); - set_bit(ATH12K_FLAG_REGISTERED, &ab->dev_flags); + ret = ath12k_core_qmi_firmware_ready(ab); + if (!ret) { + set_bit(ATH12K_FLAG_QMI_FW_READY_COMPLETE, + &ab->dev_flags); + set_bit(ATH12K_FLAG_REGISTERED, &ab->dev_flags); + } break; default: From b905bafdea21a75d75a96855edd9e0b6051eee30 Mon Sep 17 00:00:00 2001 From: Leo Stone Date: Sat, 30 Nov 2024 21:14:19 -0800 Subject: [PATCH 0085/1450] hfs: Sanity check the root record In the syzbot reproducer, the hfs_cat_rec for the root dir has type HFS_CDR_FIL after being read with hfs_bnode_read() in hfs_super_fill(). This indicates it should be used as an hfs_cat_file, which is 102 bytes. Only the first 70 bytes of that struct are initialized, however, because the entrylength passed into hfs_bnode_read() is still the length of a directory record. This causes uninitialized values to be used later on, when the hfs_cat_rec union is treated as the larger hfs_cat_file struct. Add a check to make sure the retrieved record has the correct type for the root directory (HFS_CDR_DIR), and make sure we load the correct number of bytes for a directory record. Reported-by: syzbot+2db3c7526ba68f4ea776@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=2db3c7526ba68f4ea776 Tested-by: syzbot+2db3c7526ba68f4ea776@syzkaller.appspotmail.com Tested-by: Leo Stone Signed-off-by: Leo Stone Link: https://lore.kernel.org/r/20241201051420.77858-1-leocstone@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/hfs/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 3bee9b5dba5e4..fe09c2093a93d 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -349,11 +349,13 @@ static int hfs_fill_super(struct super_block *sb, struct fs_context *fc) goto bail_no_root; res = hfs_cat_find_brec(sb, HFS_ROOT_CNID, &fd); if (!res) { - if (fd.entrylength > sizeof(rec) || fd.entrylength < 0) { + if (fd.entrylength != sizeof(rec.dir)) { res = -EIO; goto bail_hfs_find; } hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, fd.entrylength); + if (rec.type != HFS_CDR_DIR) + res = -EIO; } if (res) goto bail_hfs_find; From 60bc447c85f80d3184c7ac327e1d29e0b0a11d46 Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Mon, 2 Dec 2024 14:15:17 +0100 Subject: [PATCH 0086/1450] of: Add #address-cells/#size-cells in the device-tree root empty node On systems where ACPI is enabled or when a device-tree is not passed to the kernel by the bootloader, a device-tree root empty node is created. This device-tree root empty node does not have the #address-cells and the #size-cells properties This leads to the use of the default address cells and size cells values which are defined in the code to 1 for the address cells value and 1 for the size cells value. According to the devicetree specification and the OpenFirmware standard (IEEE 1275-1994) the default value for #address-cells should be 2. Also, according to the devicetree specification, the #address-cells and the #size-cells are required properties in the root node. The device tree compiler already uses 2 as default value for address cells and 1 for size cells. The powerpc PROM code also uses 2 as default value for address cells and 1 for size cells. Modern implementation should have the #address-cells and the #size-cells properties set and should not rely on default values. On x86, this root empty node is used and the code default values are used. In preparation of the support for device-tree overlay on PCI devices feature on x86 (i.e. the creation of the PCI root bus device-tree node), the default value for #address-cells needs to be updated. Indeed, on x86_64, addresses are on 64bits and the upper part of an address is needed for correct address translations. On x86_32 having the default value updated does not lead to issues while the upper part of a 64-bit value is zero. Changing the default value for all architectures may break device-tree compatibility. Indeed, existing dts file without the #address-cells property set in the root node will not be compatible with this modification. Instead of updating default values, add both required #address-cells and #size-cells properties in the device-tree empty node. Use 2 for both properties value in order to fully support 64-bit addresses and sizes on systems using this empty root node. Signed-off-by: Herve Codina Link: https://lore.kernel.org/r/20241202131522.142268-6-herve.codina@bootlin.com Signed-off-by: Rob Herring (Arm) --- drivers/of/empty_root.dts | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/of/empty_root.dts b/drivers/of/empty_root.dts index cf9e97a60f482..cbe169ba3db52 100644 --- a/drivers/of/empty_root.dts +++ b/drivers/of/empty_root.dts @@ -2,5 +2,12 @@ /dts-v1/; / { - + /* + * #address-cells/#size-cells are required properties at root node. + * Use 2 cells for both address cells and size cells in order to fully + * support 64-bit addresses and sizes on systems using this empty root + * node. + */ + #address-cells = <0x02>; + #size-cells = <0x02>; }; From c43ec96e8d34399bd9dab2f2dc316b904892133f Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Tue, 29 Oct 2024 08:28:45 +0000 Subject: [PATCH 0087/1450] dmaengine: at_xdmac: avoid null_prt_deref in at_xdmac_prep_dma_memset The at_xdmac_memset_create_desc may return NULL, which will lead to a null pointer dereference. For example, the len input is error, or the atchan->free_descs_list is empty and memory is exhausted. Therefore, add check to avoid this. Fixes: b206d9a23ac7 ("dmaengine: xdmac: Add memset support") Signed-off-by: Chen Ridong Link: https://lore.kernel.org/r/20241029082845.1185380-1-chenridong@huaweicloud.com Signed-off-by: Vinod Koul --- drivers/dma/at_xdmac.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c index 9c7b402200040..ba25c23164e78 100644 --- a/drivers/dma/at_xdmac.c +++ b/drivers/dma/at_xdmac.c @@ -1363,6 +1363,8 @@ at_xdmac_prep_dma_memset(struct dma_chan *chan, dma_addr_t dest, int value, return NULL; desc = at_xdmac_memset_create_desc(chan, atchan, dest, len, value); + if (!desc) + return NULL; list_add_tail(&desc->desc_node, &desc->descs_list); desc->tx_dma_desc.cookie = -EBUSY; From f0e870a0e9c5521f2952ea9f3ea9d3d122631a89 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 4 Nov 2024 11:50:50 +0200 Subject: [PATCH 0088/1450] dmaengine: dw: Select only supported masters for ACPI devices The recently submitted fix-commit revealed a problem in the iDMA 32-bit platform code. Even though the controller supported only a single master the dw_dma_acpi_filter() method hard-coded two master interfaces with IDs 0 and 1. As a result the sanity check implemented in the commit b336268dde75 ("dmaengine: dw: Add peripheral bus width verification") got incorrect interface data width and thus prevented the client drivers from configuring the DMA-channel with the EINVAL error returned. E.g., the next error was printed for the PXA2xx SPI controller driver trying to configure the requested channels: > [ 164.525604] pxa2xx_spi_pci 0000:00:07.1: DMA slave config failed > [ 164.536105] pxa2xx_spi_pci 0000:00:07.1: failed to get DMA TX descriptor > [ 164.543213] spidev spi-SPT0001:00: SPI transfer failed: -16 The problem would have been spotted much earlier if the iDMA 32-bit controller supported more than one master interfaces. But since it supports just a single master and the iDMA 32-bit specific code just ignores the master IDs in the CTLLO preparation method, the issue has been gone unnoticed so far. Fix the problem by specifying the default master ID for both memory and peripheral devices in the driver data. Thus the issue noticed for the iDMA 32-bit controllers will be eliminated and the ACPI-probed DW DMA controllers will be configured with the correct master ID by default. Cc: stable@vger.kernel.org Fixes: b336268dde75 ("dmaengine: dw: Add peripheral bus width verification") Fixes: 199244d69458 ("dmaengine: dw: add support of iDMA 32-bit hardware") Reported-by: Ferry Toth Closes: https://lore.kernel.org/dmaengine/ZuXbCKUs1iOqFu51@black.fi.intel.com/ Reported-by: Andy Shevchenko Closes: https://lore.kernel.org/dmaengine/ZuXgI-VcHpMgbZ91@black.fi.intel.com/ Tested-by: Ferry Toth Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20241104095142.157925-1-andriy.shevchenko@linux.intel.com Signed-off-by: Vinod Koul --- drivers/dma/dw/acpi.c | 6 ++++-- drivers/dma/dw/internal.h | 8 ++++++++ drivers/dma/dw/pci.c | 4 ++-- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/dma/dw/acpi.c b/drivers/dma/dw/acpi.c index c510c109d2c3a..b6452fffa657a 100644 --- a/drivers/dma/dw/acpi.c +++ b/drivers/dma/dw/acpi.c @@ -8,13 +8,15 @@ static bool dw_dma_acpi_filter(struct dma_chan *chan, void *param) { + struct dw_dma *dw = to_dw_dma(chan->device); + struct dw_dma_chip_pdata *data = dev_get_drvdata(dw->dma.dev); struct acpi_dma_spec *dma_spec = param; struct dw_dma_slave slave = { .dma_dev = dma_spec->dev, .src_id = dma_spec->slave_id, .dst_id = dma_spec->slave_id, - .m_master = 0, - .p_master = 1, + .m_master = data->m_master, + .p_master = data->p_master, }; return dw_dma_filter(chan, &slave); diff --git a/drivers/dma/dw/internal.h b/drivers/dma/dw/internal.h index 563ce73488db3..f1bd06a20cd61 100644 --- a/drivers/dma/dw/internal.h +++ b/drivers/dma/dw/internal.h @@ -51,11 +51,15 @@ struct dw_dma_chip_pdata { int (*probe)(struct dw_dma_chip *chip); int (*remove)(struct dw_dma_chip *chip); struct dw_dma_chip *chip; + u8 m_master; + u8 p_master; }; static __maybe_unused const struct dw_dma_chip_pdata dw_dma_chip_pdata = { .probe = dw_dma_probe, .remove = dw_dma_remove, + .m_master = 0, + .p_master = 1, }; static const struct dw_dma_platform_data idma32_pdata = { @@ -72,6 +76,8 @@ static __maybe_unused const struct dw_dma_chip_pdata idma32_chip_pdata = { .pdata = &idma32_pdata, .probe = idma32_dma_probe, .remove = idma32_dma_remove, + .m_master = 0, + .p_master = 0, }; static const struct dw_dma_platform_data xbar_pdata = { @@ -88,6 +94,8 @@ static __maybe_unused const struct dw_dma_chip_pdata xbar_chip_pdata = { .pdata = &xbar_pdata, .probe = idma32_dma_probe, .remove = idma32_dma_remove, + .m_master = 0, + .p_master = 0, }; #endif /* _DMA_DW_INTERNAL_H */ diff --git a/drivers/dma/dw/pci.c b/drivers/dma/dw/pci.c index ad2d4d012cf72..e8a0eb81726a5 100644 --- a/drivers/dma/dw/pci.c +++ b/drivers/dma/dw/pci.c @@ -56,10 +56,10 @@ static int dw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *pid) if (ret) return ret; - dw_dma_acpi_controller_register(chip->dw); - pci_set_drvdata(pdev, data); + dw_dma_acpi_controller_register(chip->dw); + return 0; } From 4b65d5322e1d8994acfdb9b867aa00bdb30d177b Mon Sep 17 00:00:00 2001 From: Binbin Zhou Date: Mon, 28 Oct 2024 17:34:13 +0800 Subject: [PATCH 0089/1450] dmaengine: loongson2-apb: Change GENMASK to GENMASK_ULL Fix the following smatch static checker warning: drivers/dma/loongson2-apb-dma.c:189 ls2x_dma_write_cmd() warn: was expecting a 64 bit value instead of '~(((0)) + (((~((0))) - (((1)) << (0)) + 1) & (~((0)) >> ((8 * 4) - 1 - (4)))))' The GENMASK macro used "unsigned long", which caused build issues when using a 32-bit toolchain because it would try to access bits > 31. This patch switches GENMASK to GENMASK_ULL, which uses "unsigned long long". Fixes: 71e7d3cb6e55 ("dmaengine: ls2x-apb: New driver for the Loongson LS2X APB DMA controller") Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/87cdc025-7246-4548-85ca-3d36fdc2be2d@stanley.mountain/ Signed-off-by: Binbin Zhou Link: https://lore.kernel.org/r/20241028093413.1145820-1-zhoubinbin@loongson.cn Signed-off-by: Vinod Koul --- drivers/dma/loongson2-apb-dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/loongson2-apb-dma.c b/drivers/dma/loongson2-apb-dma.c index 367ed34ce4da5..c528f02b9f848 100644 --- a/drivers/dma/loongson2-apb-dma.c +++ b/drivers/dma/loongson2-apb-dma.c @@ -31,7 +31,7 @@ #define LDMA_ASK_VALID BIT(2) #define LDMA_START BIT(3) /* DMA start operation */ #define LDMA_STOP BIT(4) /* DMA stop operation */ -#define LDMA_CONFIG_MASK GENMASK(4, 0) /* DMA controller config bits mask */ +#define LDMA_CONFIG_MASK GENMASK_ULL(4, 0) /* DMA controller config bits mask */ /* Bitfields in ndesc_addr field of HW descriptor */ #define LDMA_DESC_EN BIT(0) /*1: The next descriptor is valid */ From b32913a5609a36c230e9b091da26d38f8e80a056 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sat, 30 Nov 2024 15:53:49 +0100 Subject: [PATCH 0090/1450] ptp: Switch back to struct platform_driver::remove() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After commit 0edb555a65d1 ("platform: Make platform_driver::remove() return void") .remove() is (again) the right callback to implement for platform drivers. Convert all platform drivers below drivers/ptp to use .remove(), with the eventual goal to drop struct platform_driver::remove_new(). As .remove() and .remove_new() have the same prototypes, conversion is done by just changing the structure member name in the driver initializer. While touching these drivers, make the alignment of the touched initializers consistent. Signed-off-by: Uwe Kleine-König Acked-by: Richard Cochran Signed-off-by: Jakub Kicinski --- drivers/ptp/ptp_clockmatrix.c | 2 +- drivers/ptp/ptp_dte.c | 4 ++-- drivers/ptp/ptp_fc3.c | 2 +- drivers/ptp/ptp_idt82p33.c | 2 +- drivers/ptp/ptp_ines.c | 4 ++-- drivers/ptp/ptp_qoriq.c | 2 +- drivers/ptp/ptp_vmclock.c | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/ptp/ptp_clockmatrix.c b/drivers/ptp/ptp_clockmatrix.c index b6f1941308b17..fbb3fa8fc60b2 100644 --- a/drivers/ptp/ptp_clockmatrix.c +++ b/drivers/ptp/ptp_clockmatrix.c @@ -2471,7 +2471,7 @@ static struct platform_driver idtcm_driver = { .name = "8a3400x-phc", }, .probe = idtcm_probe, - .remove_new = idtcm_remove, + .remove = idtcm_remove, }; module_platform_driver(idtcm_driver); diff --git a/drivers/ptp/ptp_dte.c b/drivers/ptp/ptp_dte.c index 449ff90927beb..372168578a30d 100644 --- a/drivers/ptp/ptp_dte.c +++ b/drivers/ptp/ptp_dte.c @@ -326,8 +326,8 @@ static struct platform_driver ptp_dte_driver = { .pm = PTP_DTE_PM_OPS, .of_match_table = ptp_dte_of_match, }, - .probe = ptp_dte_probe, - .remove_new = ptp_dte_remove, + .probe = ptp_dte_probe, + .remove = ptp_dte_remove, }; module_platform_driver(ptp_dte_driver); diff --git a/drivers/ptp/ptp_fc3.c b/drivers/ptp/ptp_fc3.c index 879b82f035352..cfced36c70bcb 100644 --- a/drivers/ptp/ptp_fc3.c +++ b/drivers/ptp/ptp_fc3.c @@ -1003,7 +1003,7 @@ static struct platform_driver idtfc3_driver = { .name = "rc38xxx-phc", }, .probe = idtfc3_probe, - .remove_new = idtfc3_remove, + .remove = idtfc3_remove, }; module_platform_driver(idtfc3_driver); diff --git a/drivers/ptp/ptp_idt82p33.c b/drivers/ptp/ptp_idt82p33.c index d5732490ed9d1..b2fd94d4f8631 100644 --- a/drivers/ptp/ptp_idt82p33.c +++ b/drivers/ptp/ptp_idt82p33.c @@ -1461,7 +1461,7 @@ static struct platform_driver idt82p33_driver = { .name = "82p33x1x-phc", }, .probe = idt82p33_probe, - .remove_new = idt82p33_remove, + .remove = idt82p33_remove, }; module_platform_driver(idt82p33_driver); diff --git a/drivers/ptp/ptp_ines.c b/drivers/ptp/ptp_ines.c index 14a23d3a27f2c..3d723a2aa6bb0 100644 --- a/drivers/ptp/ptp_ines.c +++ b/drivers/ptp/ptp_ines.c @@ -781,8 +781,8 @@ static const struct of_device_id ines_ptp_ctrl_of_match[] = { MODULE_DEVICE_TABLE(of, ines_ptp_ctrl_of_match); static struct platform_driver ines_ptp_ctrl_driver = { - .probe = ines_ptp_ctrl_probe, - .remove_new = ines_ptp_ctrl_remove, + .probe = ines_ptp_ctrl_probe, + .remove = ines_ptp_ctrl_remove, .driver = { .name = "ines_ptp_ctrl", .of_match_table = ines_ptp_ctrl_of_match, diff --git a/drivers/ptp/ptp_qoriq.c b/drivers/ptp/ptp_qoriq.c index 879cfc1537ac8..4d488c1f1941b 100644 --- a/drivers/ptp/ptp_qoriq.c +++ b/drivers/ptp/ptp_qoriq.c @@ -670,7 +670,7 @@ static struct platform_driver ptp_qoriq_driver = { .of_match_table = match_table, }, .probe = ptp_qoriq_probe, - .remove_new = ptp_qoriq_remove, + .remove = ptp_qoriq_remove, }; module_platform_driver(ptp_qoriq_driver); diff --git a/drivers/ptp/ptp_vmclock.c b/drivers/ptp/ptp_vmclock.c index cdca8a3ad1aa1..0a2cfc8ad3c54 100644 --- a/drivers/ptp/ptp_vmclock.c +++ b/drivers/ptp/ptp_vmclock.c @@ -601,7 +601,7 @@ MODULE_DEVICE_TABLE(acpi, vmclock_acpi_ids); static struct platform_driver vmclock_platform_driver = { .probe = vmclock_probe, - .remove_new = vmclock_remove, + .remove = vmclock_remove, .driver = { .name = "vmclock", .acpi_match_table = vmclock_acpi_ids, From 989e0cdc0f18a594b25cabc60426d29659aeaf58 Mon Sep 17 00:00:00 2001 From: Brahmajit Das Date: Sat, 5 Oct 2024 01:21:32 +0530 Subject: [PATCH 0091/1450] fs/qnx6: Fix building with GCC 15 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit qnx6_checkroot() had been using weirdly spelled initializer - it needed to initialize 3-element arrays of char and it used NUL-padded 3-character string literals (i.e. 4-element initializers, with completely pointless zeroes at the end). That had been spotted by gcc-15[*]; prior to that gcc quietly dropped the 4th element of initializers. However, none of that had been needed in the first place - all this array is used for is checking that the first directory entry in root directory is "." and the second - "..". The check had been expressed as a loop, using that match_root[] array. Since there is no chance that we ever want to extend that list of entries, the entire thing is much too fancy for its own good; what we need is just a couple of explicit memcmp() and that's it. [*]: fs/qnx6/inode.c: In function ‘qnx6_checkroot’: fs/qnx6/inode.c:182:41: error: initializer-string for array of ‘char’ is too long [-Werror=unterminated-string-initialization] 182 | static char match_root[2][3] = {".\0\0", "..\0"}; | ^~~~~~~ fs/qnx6/inode.c:182:50: error: initializer-string for array of ‘char’ is too long [-Werror=unterminated-string-initialization] 182 | static char match_root[2][3] = {".\0\0", "..\0"}; | ^~~~~~ Signed-off-by: Brahmajit Das Link: https://lore.kernel.org/r/20241004195132.1393968-1-brahmajit.xyz@gmail.com Acked-by: Al Viro Signed-off-by: Christian Brauner --- fs/qnx6/inode.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 85925ec0051a9..3310d1ad4d0e9 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -179,8 +179,7 @@ static int qnx6_statfs(struct dentry *dentry, struct kstatfs *buf) */ static const char *qnx6_checkroot(struct super_block *s) { - static char match_root[2][3] = {".\0\0", "..\0"}; - int i, error = 0; + int error = 0; struct qnx6_dir_entry *dir_entry; struct inode *root = d_inode(s->s_root); struct address_space *mapping = root->i_mapping; @@ -189,11 +188,9 @@ static const char *qnx6_checkroot(struct super_block *s) if (IS_ERR(folio)) return "error reading root directory"; dir_entry = kmap_local_folio(folio, 0); - for (i = 0; i < 2; i++) { - /* maximum 3 bytes - due to match_root limitation */ - if (strncmp(dir_entry[i].de_fname, match_root[i], 3)) - error = 1; - } + if (memcmp(dir_entry[0].de_fname, ".", 2) || + memcmp(dir_entry[1].de_fname, "..", 3)) + error = 1; folio_release_kmap(folio, dir_entry); if (error) return "error reading root directory."; From b77bd3ba762f34e5eb731134cf50e233d1060053 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Wed, 27 Nov 2024 16:06:05 -0300 Subject: [PATCH 0092/1450] ARM: imx: Re-introduce the PINCTRL selection Since commit 17d210018914 ("ARM: imx: Allow user to disable pinctrl"), the CONFIG_PINCTRL option is no longer implicitly selected, causing several i.MX SoC pinctrl drivers no longer getting selected by default. This causes boot regressions on the ARMv4, ARMv5, ARMv6 and ARMv7 i.MX SoCs. Fix it by selecting CONFIG_PINCTRL as before. This defeats the purpose of 7d210018914 ("ARM: imx: Allow user to disable pinctrl"), but it is the less invasive fix for the boot regressions. The attempt to build Layerscape without pinctrl can still be explored later as suggested by Arnd: "Overall, my best advice here is still to not change the way i.MX pinctrl works at all, but just fix Layerscape to not depend on i.MX. The reason for the 'select' here is clearly that the i.MX machines would fail to boot without pinctrl, and changing that because of Layerscape seems backwards." Fixes: 17d210018914 ("ARM: imx: Allow user to disable pinctrl") Reported-by: Guenter Roeck Closes: https://lore.kernel.org/linux-arm-kernel/49ff070a-ce67-42d7-84ec-8b54fd7e9742@roeck-us.net/ Signed-off-by: Fabio Estevam Acked-by: Arnd Bergmann Reviewed-by: Linus Walleij Tested-by: Guenter Roeck Link: https://lore.kernel.org/20241127190605.1367157-1-festevam@gmail.com Signed-off-by: Linus Walleij --- arch/arm/mach-imx/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/mach-imx/Kconfig b/arch/arm/mach-imx/Kconfig index e4fe059cd8614..dc47b2312127f 100644 --- a/arch/arm/mach-imx/Kconfig +++ b/arch/arm/mach-imx/Kconfig @@ -6,6 +6,7 @@ menuconfig ARCH_MXC select CLKSRC_IMX_GPT select GENERIC_IRQ_CHIP select GPIOLIB + select PINCTRL select PM_OPP if PM select SOC_BUS select SRAM From e8e7be7d212dc2bc83b8151e51088666a6c42092 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 2 Dec 2024 09:27:13 +0100 Subject: [PATCH 0093/1450] mctp i2c: drop check because i2c_unregister_device() is NULL safe No need to check the argument of i2c_unregister_device() because the function itself does it. Signed-off-by: Wolfram Sang Link: https://patch.msgid.link/20241202082713.9719-1-wsa+renesas@sang-engineering.com Signed-off-by: Paolo Abeni --- drivers/net/mctp/mctp-i2c.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/mctp/mctp-i2c.c b/drivers/net/mctp/mctp-i2c.c index d2b3f5a591418..e3dcdeacc12c5 100644 --- a/drivers/net/mctp/mctp-i2c.c +++ b/drivers/net/mctp/mctp-i2c.c @@ -177,8 +177,7 @@ static struct mctp_i2c_client *mctp_i2c_new_client(struct i2c_client *client) return mcli; err: if (mcli) { - if (mcli->client) - i2c_unregister_device(mcli->client); + i2c_unregister_device(mcli->client); kfree(mcli); } return ERR_PTR(rc); From 514b2262ade48a0503ac6aa03c3bfb8c5be69b21 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 16 Nov 2024 00:05:18 +0100 Subject: [PATCH 0094/1450] firmware: arm_scmi: Fix i.MX build dependency The newly added SCMI vendor driver references functions in the protocol driver but needs a Kconfig dependency to ensure it can link, essentially the Kconfig dependency needs to be reversed to match the link time dependency: | arm-linux-gnueabi-ld: sound/soc/fsl/fsl_mqs.o: in function `fsl_mqs_sm_write': | fsl_mqs.c:(.text+0x1aa): undefined reference to `scmi_imx_misc_ctrl_set' | arm-linux-gnueabi-ld: sound/soc/fsl/fsl_mqs.o: in function `fsl_mqs_sm_read': | fsl_mqs.c:(.text+0x1ee): undefined reference to `scmi_imx_misc_ctrl_get' This however only works after changing the dependency in the SND_SOC_FSL_MQS driver as well, which uses 'select IMX_SCMI_MISC_DRV' to turn on a driver it depends on. This is generally a bad idea, so the best solution is to change that into a dependency. To allow the ASoC driver to keep building with the SCMI support, this needs to be an optional dependency that enforces the link-time dependency if IMX_SCMI_MISC_DRV is a loadable module but not depend on it if that is disabled. Fixes: 61c9f03e22fc ("firmware: arm_scmi: Add initial support for i.MX MISC protocol") Fixes: 101c9023594a ("ASoC: fsl_mqs: Support accessing registers by scmi interface") Signed-off-by: Arnd Bergmann Acked-by: Mark Brown Acked-by: Shengjiu Wang Message-Id: <20241115230555.2435004-1-arnd@kernel.org> Signed-off-by: Sudeep Holla --- drivers/firmware/arm_scmi/vendors/imx/Kconfig | 1 + drivers/firmware/imx/Kconfig | 1 - sound/soc/fsl/Kconfig | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/firmware/arm_scmi/vendors/imx/Kconfig b/drivers/firmware/arm_scmi/vendors/imx/Kconfig index 2883ed24a84d6..a01bf5e47301d 100644 --- a/drivers/firmware/arm_scmi/vendors/imx/Kconfig +++ b/drivers/firmware/arm_scmi/vendors/imx/Kconfig @@ -15,6 +15,7 @@ config IMX_SCMI_BBM_EXT config IMX_SCMI_MISC_EXT tristate "i.MX SCMI MISC EXTENSION" depends on ARM_SCMI_PROTOCOL || (COMPILE_TEST && OF) + depends on IMX_SCMI_MISC_DRV default y if ARCH_MXC help This enables i.MX System MISC control logic such as gpio expander diff --git a/drivers/firmware/imx/Kconfig b/drivers/firmware/imx/Kconfig index 477d3f32d99a6..907cd149c40a8 100644 --- a/drivers/firmware/imx/Kconfig +++ b/drivers/firmware/imx/Kconfig @@ -25,7 +25,6 @@ config IMX_SCU config IMX_SCMI_MISC_DRV tristate "IMX SCMI MISC Protocol driver" - depends on IMX_SCMI_MISC_EXT || COMPILE_TEST default y if ARCH_MXC help The System Controller Management Interface firmware (SCMI FW) is diff --git a/sound/soc/fsl/Kconfig b/sound/soc/fsl/Kconfig index 8e88830e8e577..678540b782805 100644 --- a/sound/soc/fsl/Kconfig +++ b/sound/soc/fsl/Kconfig @@ -29,8 +29,8 @@ config SND_SOC_FSL_SAI config SND_SOC_FSL_MQS tristate "Medium Quality Sound (MQS) module support" depends on SND_SOC_FSL_SAI + depends on IMX_SCMI_MISC_DRV || !IMX_SCMI_MISC_DRV select REGMAP_MMIO - select IMX_SCMI_MISC_DRV if IMX_SCMI_MISC_EXT !=n help Say Y if you want to add Medium Quality Sound (MQS) support for the Freescale CPUs. From 239521712b2b568b99d5f0ef7c1f874d797f4a29 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Wed, 13 Nov 2024 16:56:31 -0600 Subject: [PATCH 0095/1450] dt-bindings: mtd: fixed-partitions: Fix "compression" typo The example erroneously has "compress" property rather than the documented "compression" property. Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20241113225632.1783241-1-robh@kernel.org Signed-off-by: Rob Herring (Arm) --- .../devicetree/bindings/mtd/partitions/fixed-partitions.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/mtd/partitions/fixed-partitions.yaml b/Documentation/devicetree/bindings/mtd/partitions/fixed-partitions.yaml index 058253d6d889c..62086366837cf 100644 --- a/Documentation/devicetree/bindings/mtd/partitions/fixed-partitions.yaml +++ b/Documentation/devicetree/bindings/mtd/partitions/fixed-partitions.yaml @@ -82,7 +82,7 @@ examples: uimage@100000 { reg = <0x0100000 0x200000>; - compress = "lzma"; + compression = "lzma"; }; }; From d7dfa7fde63dde4d2ec0083133efe2c6686c03ff Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Mon, 2 Dec 2024 17:58:19 +0100 Subject: [PATCH 0096/1450] of: Fix error path in of_parse_phandle_with_args_map() The current code uses some 'goto put;' to cancel the parsing operation and can lead to a return code value of 0 even on error cases. Indeed, some goto calls are done from a loop without setting the ret value explicitly before the goto call and so the ret value can be set to 0 due to operation done in previous loop iteration. For instance match can be set to 0 in the previous loop iteration (leading to a new iteration) but ret can also be set to 0 it the of_property_read_u32() call succeed. In that case if no match are found or if an error is detected the new iteration, the return value can be wrongly 0. Avoid those cases setting the ret value explicitly before the goto calls. Fixes: bd6f2fd5a1d5 ("of: Support parsing phandle argument lists through a nexus node") Cc: stable@vger.kernel.org Signed-off-by: Herve Codina Link: https://lore.kernel.org/r/20241202165819.158681-1-herve.codina@bootlin.com Signed-off-by: Rob Herring (Arm) --- drivers/of/base.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/of/base.c b/drivers/of/base.c index a8b0c42bdc8e9..44b1c8bf9cc0d 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -1471,8 +1471,10 @@ int of_parse_phandle_with_args_map(const struct device_node *np, map_len--; /* Check if not found */ - if (!new) + if (!new) { + ret = -EINVAL; goto put; + } if (!of_device_is_available(new)) match = 0; @@ -1482,17 +1484,20 @@ int of_parse_phandle_with_args_map(const struct device_node *np, goto put; /* Check for malformed properties */ - if (WARN_ON(new_size > MAX_PHANDLE_ARGS)) - goto put; - if (map_len < new_size) + if (WARN_ON(new_size > MAX_PHANDLE_ARGS) || + map_len < new_size) { + ret = -EINVAL; goto put; + } /* Move forward by new node's #-cells amount */ map += new_size; map_len -= new_size; } - if (!match) + if (!match) { + ret = -ENOENT; goto put; + } /* Get the -map-pass-thru property (optional) */ pass = of_get_property(cur, pass_name, NULL); From 793baff3f24f16dab9061045e23eea67724feae6 Mon Sep 17 00:00:00 2001 From: Honglei Wang Date: Fri, 29 Nov 2024 17:10:03 +0800 Subject: [PATCH 0097/1450] sched_ext: Add __weak to fix the build errors commit 5cbb302880f5 ("sched_ext: Rename scx_bpf_dispatch[_vtime]_from_dsq*() -> scx_bpf_dsq_move[_vtime]*()") introduced several new functions which caused compilation errors when compiled with clang. Let's fix this by adding __weak markers. Signed-off-by: Honglei Wang Signed-off-by: Tejun Heo Fixes: 5cbb302880f5 ("sched_ext: Rename scx_bpf_dispatch[_vtime]_from_dsq*() -> scx_bpf_dsq_move[_vtime]*()") Acked-by: Andrii Nakryiko --- tools/sched_ext/include/scx/common.bpf.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 2f36b7b6418d1..625f5b046776c 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -40,9 +40,9 @@ void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_fl void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak; u32 scx_bpf_dispatch_nr_slots(void) __ksym; void scx_bpf_dispatch_cancel(void) __ksym; -bool scx_bpf_dsq_move_to_local(u64 dsq_id) __ksym; -void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym; -void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym; +bool scx_bpf_dsq_move_to_local(u64 dsq_id) __ksym __weak; +void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak; +void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak; bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; u32 scx_bpf_reenqueue_local(void) __ksym; From b03917e02bf9861be887a7e67c399b3b014f88be Mon Sep 17 00:00:00 2001 From: Konstantin Andrikopoulos Date: Wed, 27 Nov 2024 15:07:38 +0000 Subject: [PATCH 0098/1450] rust: add safety comment in workqueue traits Add missing safety comments for the implementation of the unsafe traits WorkItemPointer and RawWorkItem for Arc in workqueue.rs Link: https://github.com/Rust-for-Linux/linux/issues/351. Co-developed-by: Vangelis Mamalakis Signed-off-by: Vangelis Mamalakis Suggested-by: Miguel Ojeda Reviewed-by: Alice Ryhl Signed-off-by: Konstantin Andrikopoulos Signed-off-by: Tejun Heo --- rust/kernel/workqueue.rs | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/rust/kernel/workqueue.rs b/rust/kernel/workqueue.rs index 4d1d2062f6eba..fd3e97192ed8f 100644 --- a/rust/kernel/workqueue.rs +++ b/rust/kernel/workqueue.rs @@ -519,7 +519,15 @@ impl_has_work! { impl{T} HasWork for ClosureWork { self.work } } -// SAFETY: TODO. +// SAFETY: The `__enqueue` implementation in RawWorkItem uses a `work_struct` initialized with the +// `run` method of this trait as the function pointer because: +// - `__enqueue` gets the `work_struct` from the `Work` field, using `T::raw_get_work`. +// - The only safe way to create a `Work` object is through `Work::new`. +// - `Work::new` makes sure that `T::Pointer::run` is passed to `init_work_with_key`. +// - Finally `Work` and `RawWorkItem` guarantee that the correct `Work` field +// will be used because of the ID const generic bound. This makes sure that `T::raw_get_work` +// uses the correct offset for the `Work` field, and `Work::new` picks the correct +// implementation of `WorkItemPointer` for `Arc`. unsafe impl WorkItemPointer for Arc where T: WorkItem, @@ -537,7 +545,13 @@ where } } -// SAFETY: TODO. +// SAFETY: The `work_struct` raw pointer is guaranteed to be valid for the duration of the call to +// the closure because we get it from an `Arc`, which means that the ref count will be at least 1, +// and we don't drop the `Arc` ourselves. If `queue_work_on` returns true, it is further guaranteed +// to be valid until a call to the function pointer in `work_struct` because we leak the memory it +// points to, and only reclaim it if the closure returns false, or in `WorkItemPointer::run`, which +// is what the function pointer in the `work_struct` must be pointing to, according to the safety +// requirements of `WorkItemPointer`. unsafe impl RawWorkItem for Arc where T: WorkItem, From ebf7f7d616818f2841c8aece14084e87d59bd8c7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 3 Dec 2024 17:16:43 -0800 Subject: [PATCH 0099/1450] Revert "ptp: Switch back to struct platform_driver::remove()" This reverts commit b32913a5609a36c230e9b091da26d38f8e80a056. Linus applied directly commit e70140ba0d2b ("Get rid of 'remove_new' relic from platform driver struct"), drop our local change to avoid conflicts. Link: https://lore.kernel.org/CAMuHMdV3J=o2x9G=1t_y97iv9eLsPfiej108vU6JHnn=AR-Nvw@mail.gmail.com Signed-off-by: Jakub Kicinski --- drivers/ptp/ptp_clockmatrix.c | 2 +- drivers/ptp/ptp_dte.c | 4 ++-- drivers/ptp/ptp_fc3.c | 2 +- drivers/ptp/ptp_idt82p33.c | 2 +- drivers/ptp/ptp_ines.c | 4 ++-- drivers/ptp/ptp_qoriq.c | 2 +- drivers/ptp/ptp_vmclock.c | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/ptp/ptp_clockmatrix.c b/drivers/ptp/ptp_clockmatrix.c index fbb3fa8fc60b2..b6f1941308b17 100644 --- a/drivers/ptp/ptp_clockmatrix.c +++ b/drivers/ptp/ptp_clockmatrix.c @@ -2471,7 +2471,7 @@ static struct platform_driver idtcm_driver = { .name = "8a3400x-phc", }, .probe = idtcm_probe, - .remove = idtcm_remove, + .remove_new = idtcm_remove, }; module_platform_driver(idtcm_driver); diff --git a/drivers/ptp/ptp_dte.c b/drivers/ptp/ptp_dte.c index 372168578a30d..449ff90927beb 100644 --- a/drivers/ptp/ptp_dte.c +++ b/drivers/ptp/ptp_dte.c @@ -326,8 +326,8 @@ static struct platform_driver ptp_dte_driver = { .pm = PTP_DTE_PM_OPS, .of_match_table = ptp_dte_of_match, }, - .probe = ptp_dte_probe, - .remove = ptp_dte_remove, + .probe = ptp_dte_probe, + .remove_new = ptp_dte_remove, }; module_platform_driver(ptp_dte_driver); diff --git a/drivers/ptp/ptp_fc3.c b/drivers/ptp/ptp_fc3.c index cfced36c70bcb..879b82f035352 100644 --- a/drivers/ptp/ptp_fc3.c +++ b/drivers/ptp/ptp_fc3.c @@ -1003,7 +1003,7 @@ static struct platform_driver idtfc3_driver = { .name = "rc38xxx-phc", }, .probe = idtfc3_probe, - .remove = idtfc3_remove, + .remove_new = idtfc3_remove, }; module_platform_driver(idtfc3_driver); diff --git a/drivers/ptp/ptp_idt82p33.c b/drivers/ptp/ptp_idt82p33.c index b2fd94d4f8631..d5732490ed9d1 100644 --- a/drivers/ptp/ptp_idt82p33.c +++ b/drivers/ptp/ptp_idt82p33.c @@ -1461,7 +1461,7 @@ static struct platform_driver idt82p33_driver = { .name = "82p33x1x-phc", }, .probe = idt82p33_probe, - .remove = idt82p33_remove, + .remove_new = idt82p33_remove, }; module_platform_driver(idt82p33_driver); diff --git a/drivers/ptp/ptp_ines.c b/drivers/ptp/ptp_ines.c index 3d723a2aa6bb0..14a23d3a27f2c 100644 --- a/drivers/ptp/ptp_ines.c +++ b/drivers/ptp/ptp_ines.c @@ -781,8 +781,8 @@ static const struct of_device_id ines_ptp_ctrl_of_match[] = { MODULE_DEVICE_TABLE(of, ines_ptp_ctrl_of_match); static struct platform_driver ines_ptp_ctrl_driver = { - .probe = ines_ptp_ctrl_probe, - .remove = ines_ptp_ctrl_remove, + .probe = ines_ptp_ctrl_probe, + .remove_new = ines_ptp_ctrl_remove, .driver = { .name = "ines_ptp_ctrl", .of_match_table = ines_ptp_ctrl_of_match, diff --git a/drivers/ptp/ptp_qoriq.c b/drivers/ptp/ptp_qoriq.c index 4d488c1f1941b..879cfc1537ac8 100644 --- a/drivers/ptp/ptp_qoriq.c +++ b/drivers/ptp/ptp_qoriq.c @@ -670,7 +670,7 @@ static struct platform_driver ptp_qoriq_driver = { .of_match_table = match_table, }, .probe = ptp_qoriq_probe, - .remove = ptp_qoriq_remove, + .remove_new = ptp_qoriq_remove, }; module_platform_driver(ptp_qoriq_driver); diff --git a/drivers/ptp/ptp_vmclock.c b/drivers/ptp/ptp_vmclock.c index 0a2cfc8ad3c54..cdca8a3ad1aa1 100644 --- a/drivers/ptp/ptp_vmclock.c +++ b/drivers/ptp/ptp_vmclock.c @@ -601,7 +601,7 @@ MODULE_DEVICE_TABLE(acpi, vmclock_acpi_ids); static struct platform_driver vmclock_platform_driver = { .probe = vmclock_probe, - .remove = vmclock_remove, + .remove_new = vmclock_remove, .driver = { .name = "vmclock", .acpi_match_table = vmclock_acpi_ids, From 2e20bf8cc05766dcd0357cdfcada49e1bc45512b Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 2 Dec 2024 21:14:35 +0100 Subject: [PATCH 0100/1450] r8169: remove unused flag RTL_FLAG_TASK_RESET_NO_QUEUE_WAKE After 854d71c555dfc3 ("r8169: remove original workaround for RTL8125 broken rx issue") this flag isn't used any longer. So remove it. Signed-off-by: Heiner Kallweit Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/d9dd214b-3027-4f60-b0e8-6f34a0c76582@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 739707a7b40fb..4b96b4ad81b96 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -623,7 +623,6 @@ struct rtl8169_tc_offsets { enum rtl_flag { RTL_FLAG_TASK_RESET_PENDING, - RTL_FLAG_TASK_RESET_NO_QUEUE_WAKE, RTL_FLAG_TASK_TX_TIMEOUT, RTL_FLAG_MAX }; @@ -4723,8 +4722,6 @@ static void rtl_task(struct work_struct *work) reset: rtl_reset_work(tp); netif_wake_queue(tp->dev); - } else if (test_and_clear_bit(RTL_FLAG_TASK_RESET_NO_QUEUE_WAKE, tp->wk.flags)) { - rtl_reset_work(tp); } } From bb18265c3aba92b91a1355609769f3e967b65dee Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 2 Dec 2024 21:20:02 +0100 Subject: [PATCH 0101/1450] r8169: remove support for chip version 11 This is a follow-up to 982300c115d2 ("r8169: remove detection of chip version 11 (early RTL8168b)"). Nobody complained yet, so remove support for this chip version. Signed-off-by: Heiner Kallweit Reviewed-by: Simon Horman Link: https://patch.msgid.link/b689ab6d-20b5-4b64-bd7e-531a0a972ba3@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169.h | 2 +- drivers/net/ethernet/realtek/r8169_main.c | 14 +------------- drivers/net/ethernet/realtek/r8169_phy_config.c | 10 ---------- 3 files changed, 2 insertions(+), 24 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169.h b/drivers/net/ethernet/realtek/r8169.h index be4c9622618d8..8904aae41acae 100644 --- a/drivers/net/ethernet/realtek/r8169.h +++ b/drivers/net/ethernet/realtek/r8169.h @@ -23,7 +23,7 @@ enum mac_version { RTL_GIGA_MAC_VER_08, RTL_GIGA_MAC_VER_09, RTL_GIGA_MAC_VER_10, - RTL_GIGA_MAC_VER_11, + /* support for RTL_GIGA_MAC_VER_11 has been removed */ /* RTL_GIGA_MAC_VER_12 was handled the same as VER_17 */ /* RTL_GIGA_MAC_VER_13 was merged with VER_10 */ RTL_GIGA_MAC_VER_14, diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 4b96b4ad81b96..cc14cd540f748 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -104,7 +104,6 @@ static const struct { [RTL_GIGA_MAC_VER_08] = {"RTL8102e" }, [RTL_GIGA_MAC_VER_09] = {"RTL8102e/RTL8103e" }, [RTL_GIGA_MAC_VER_10] = {"RTL8101e/RTL8100e" }, - [RTL_GIGA_MAC_VER_11] = {"RTL8168b/8111b" }, [RTL_GIGA_MAC_VER_14] = {"RTL8401" }, [RTL_GIGA_MAC_VER_17] = {"RTL8168b/8111b" }, [RTL_GIGA_MAC_VER_18] = {"RTL8168cp/8111cp" }, @@ -2335,7 +2334,7 @@ static enum mac_version rtl8169_get_mac_version(u16 xid, bool gmii) /* 8168B family. */ { 0x7c8, 0x380, RTL_GIGA_MAC_VER_17 }, - /* This one is very old and rare, let's see if anybody complains. + /* This one is very old and rare, support has been removed. * { 0x7c8, 0x300, RTL_GIGA_MAC_VER_11 }, */ @@ -3803,7 +3802,6 @@ static void rtl_hw_config(struct rtl8169_private *tp) [RTL_GIGA_MAC_VER_08] = rtl_hw_start_8102e_3, [RTL_GIGA_MAC_VER_09] = rtl_hw_start_8102e_2, [RTL_GIGA_MAC_VER_10] = NULL, - [RTL_GIGA_MAC_VER_11] = rtl_hw_start_8168b, [RTL_GIGA_MAC_VER_14] = rtl_hw_start_8401, [RTL_GIGA_MAC_VER_17] = rtl_hw_start_8168b, [RTL_GIGA_MAC_VER_18] = rtl_hw_start_8168cp_1, @@ -4679,12 +4677,6 @@ static irqreturn_t rtl8169_interrupt(int irq, void *dev_instance) if (status & LinkChg) phy_mac_interrupt(tp->phydev); - if (unlikely(status & RxFIFOOver && - tp->mac_version == RTL_GIGA_MAC_VER_11)) { - netif_stop_queue(tp->dev); - rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_PENDING); - } - rtl_irq_disable(tp); napi_schedule(&tp->napi); out: @@ -5100,9 +5092,6 @@ static void rtl_set_irq_mask(struct rtl8169_private *tp) if (tp->mac_version <= RTL_GIGA_MAC_VER_06) tp->irq_mask |= SYSErr | RxFIFOOver; - else if (tp->mac_version == RTL_GIGA_MAC_VER_11) - /* special workaround needed */ - tp->irq_mask |= RxFIFOOver; } static int rtl_alloc_irq(struct rtl8169_private *tp) @@ -5297,7 +5286,6 @@ static int rtl_jumbo_max(struct rtl8169_private *tp) case RTL_GIGA_MAC_VER_02 ... RTL_GIGA_MAC_VER_06: return JUMBO_7K; /* RTL8168b */ - case RTL_GIGA_MAC_VER_11: case RTL_GIGA_MAC_VER_17: return JUMBO_4K; /* RTL8168c */ diff --git a/drivers/net/ethernet/realtek/r8169_phy_config.c b/drivers/net/ethernet/realtek/r8169_phy_config.c index 5307c6ff4e252..b28b30390e84d 100644 --- a/drivers/net/ethernet/realtek/r8169_phy_config.c +++ b/drivers/net/ethernet/realtek/r8169_phy_config.c @@ -276,15 +276,6 @@ static void rtl8169sce_hw_phy_config(struct rtl8169_private *tp, rtl_writephy_batch(phydev, phy_reg_init); } -static void rtl8168bb_hw_phy_config(struct rtl8169_private *tp, - struct phy_device *phydev) -{ - phy_write(phydev, 0x1f, 0x0001); - phy_set_bits(phydev, 0x16, BIT(0)); - phy_write(phydev, 0x10, 0xf41b); - phy_write(phydev, 0x1f, 0x0000); -} - static void rtl8168bef_hw_phy_config(struct rtl8169_private *tp, struct phy_device *phydev) { @@ -1136,7 +1127,6 @@ void r8169_hw_phy_config(struct rtl8169_private *tp, struct phy_device *phydev, [RTL_GIGA_MAC_VER_08] = rtl8102e_hw_phy_config, [RTL_GIGA_MAC_VER_09] = rtl8102e_hw_phy_config, [RTL_GIGA_MAC_VER_10] = NULL, - [RTL_GIGA_MAC_VER_11] = rtl8168bb_hw_phy_config, [RTL_GIGA_MAC_VER_14] = rtl8401_hw_phy_config, [RTL_GIGA_MAC_VER_17] = rtl8168bef_hw_phy_config, [RTL_GIGA_MAC_VER_18] = rtl8168cp_1_hw_phy_config, From 6fe437cfe2cdc797b03f63b338a13fac96ed6a08 Mon Sep 17 00:00:00 2001 From: Levi Yun Date: Tue, 3 Dec 2024 14:31:08 +0000 Subject: [PATCH 0102/1450] firmware: arm_ffa: Fix the race around setting ffa_dev->properties Currently, ffa_dev->properties is set after the ffa_device_register() call return in ffa_setup_partitions(). This could potentially result in a race where the partition's properties is accessed while probing struct ffa_device before it is set. Update the ffa_device_register() to receive ffa_partition_info so all the data from the partition information received from the firmware can be updated into the struct ffa_device before the calling device_register() in ffa_device_register(). Fixes: e781858488b9 ("firmware: arm_ffa: Add initial FFA bus support for device enumeration") Signed-off-by: Levi Yun Message-Id: <20241203143109.1030514-2-yeoreum.yun@arm.com> Signed-off-by: Sudeep Holla --- drivers/firmware/arm_ffa/bus.c | 15 +++++++++++---- drivers/firmware/arm_ffa/driver.c | 7 +------ include/linux/arm_ffa.h | 13 ++++++++----- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/drivers/firmware/arm_ffa/bus.c b/drivers/firmware/arm_ffa/bus.c index eb17d03b66fec..dfda5ffc14db7 100644 --- a/drivers/firmware/arm_ffa/bus.c +++ b/drivers/firmware/arm_ffa/bus.c @@ -187,13 +187,18 @@ bool ffa_device_is_valid(struct ffa_device *ffa_dev) return valid; } -struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id, - const struct ffa_ops *ops) +struct ffa_device * +ffa_device_register(const struct ffa_partition_info *part_info, + const struct ffa_ops *ops) { int id, ret; + uuid_t uuid; struct device *dev; struct ffa_device *ffa_dev; + if (!part_info) + return NULL; + id = ida_alloc_min(&ffa_bus_id, 1, GFP_KERNEL); if (id < 0) return NULL; @@ -210,9 +215,11 @@ struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id, dev_set_name(&ffa_dev->dev, "arm-ffa-%d", id); ffa_dev->id = id; - ffa_dev->vm_id = vm_id; + ffa_dev->vm_id = part_info->id; + ffa_dev->properties = part_info->properties; ffa_dev->ops = ops; - uuid_copy(&ffa_dev->uuid, uuid); + import_uuid(&uuid, (u8 *)part_info->uuid); + uuid_copy(&ffa_dev->uuid, &uuid); ret = device_register(&ffa_dev->dev); if (ret) { diff --git a/drivers/firmware/arm_ffa/driver.c b/drivers/firmware/arm_ffa/driver.c index b14cbdae94e82..2c2ec3c35f156 100644 --- a/drivers/firmware/arm_ffa/driver.c +++ b/drivers/firmware/arm_ffa/driver.c @@ -1387,7 +1387,6 @@ static struct notifier_block ffa_bus_nb = { static int ffa_setup_partitions(void) { int count, idx, ret; - uuid_t uuid; struct ffa_device *ffa_dev; struct ffa_dev_part_info *info; struct ffa_partition_info *pbuf, *tpbuf; @@ -1406,23 +1405,19 @@ static int ffa_setup_partitions(void) xa_init(&drv_info->partition_info); for (idx = 0, tpbuf = pbuf; idx < count; idx++, tpbuf++) { - import_uuid(&uuid, (u8 *)tpbuf->uuid); - /* Note that if the UUID will be uuid_null, that will require * ffa_bus_notifier() to find the UUID of this partition id * with help of ffa_device_match_uuid(). FF-A v1.1 and above * provides UUID here for each partition as part of the * discovery API and the same is passed. */ - ffa_dev = ffa_device_register(&uuid, tpbuf->id, &ffa_drv_ops); + ffa_dev = ffa_device_register(tpbuf, &ffa_drv_ops); if (!ffa_dev) { pr_err("%s: failed to register partition ID 0x%x\n", __func__, tpbuf->id); continue; } - ffa_dev->properties = tpbuf->properties; - if (drv_info->version > FFA_VERSION_1_0 && !(tpbuf->properties & FFA_PARTITION_AARCH64_EXEC)) ffa_mode_32bit_set(ffa_dev); diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index a28e2a6a13d05..74169dd0f6594 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -166,9 +166,12 @@ static inline void *ffa_dev_get_drvdata(struct ffa_device *fdev) return dev_get_drvdata(&fdev->dev); } +struct ffa_partition_info; + #if IS_REACHABLE(CONFIG_ARM_FFA_TRANSPORT) -struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id, - const struct ffa_ops *ops); +struct ffa_device * +ffa_device_register(const struct ffa_partition_info *part_info, + const struct ffa_ops *ops); void ffa_device_unregister(struct ffa_device *ffa_dev); int ffa_driver_register(struct ffa_driver *driver, struct module *owner, const char *mod_name); @@ -176,9 +179,9 @@ void ffa_driver_unregister(struct ffa_driver *driver); bool ffa_device_is_valid(struct ffa_device *ffa_dev); #else -static inline -struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id, - const struct ffa_ops *ops) +static inline struct ffa_device * +ffa_device_register(const struct ffa_partition_info *part_info, + const struct ffa_ops *ops) { return NULL; } From ac1e21bd8c883aeac2f1835fc93b39c1e6838b35 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 3 Dec 2024 09:44:06 +0800 Subject: [PATCH 0103/1450] jbd2: increase IO priority for writing revoke records Commit '6a3afb6ac6df ("jbd2: increase the journal IO's priority")' increases the priority of journal I/O by marking I/O with the JBD2_JOURNAL_REQ_FLAGS. However, that commit missed the revoke buffers, so also addresses that kind of I/Os. Fixes: 6a3afb6ac6df ("jbd2: increase the journal IO's priority") Signed-off-by: Zhang Yi Link: https://lore.kernel.org/r/20241203014407.805916-2-yi.zhang@huaweicloud.com Reviewed-by: Kemeng Shi Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/jbd2/revoke.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 4556e46890244..ce63d5fde9c3a 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -654,7 +654,7 @@ static void flush_descriptor(journal_t *journal, set_buffer_jwrite(descriptor); BUFFER_TRACE(descriptor, "write"); set_buffer_dirty(descriptor); - write_dirty_buffer(descriptor, REQ_SYNC); + write_dirty_buffer(descriptor, JBD2_JOURNAL_REQ_FLAGS); } #endif From a0851ea9cd555c333795b85ddd908898b937c4e1 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 3 Dec 2024 09:44:07 +0800 Subject: [PATCH 0104/1450] jbd2: flush filesystem device before updating tail sequence When committing transaction in jbd2_journal_commit_transaction(), the disk caches for the filesystem device should be flushed before updating the journal tail sequence. However, this step is missed if the journal is not located on the filesystem device. As a result, the filesystem may become inconsistent following a power failure or system crash. Fix it by ensuring that the filesystem device is flushed appropriately. Fixes: 3339578f0578 ("jbd2: cleanup journal tail after transaction commit") Signed-off-by: Zhang Yi Link: https://lore.kernel.org/r/20241203014407.805916-3-yi.zhang@huaweicloud.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/jbd2/commit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 9153ff3a08e76..e8e80761ac730 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -772,9 +772,9 @@ void jbd2_journal_commit_transaction(journal_t *journal) /* * If the journal is not located on the file system device, * then we must flush the file system device before we issue - * the commit record + * the commit record and update the journal tail sequence. */ - if (commit_transaction->t_need_data_flush && + if ((commit_transaction->t_need_data_flush || update_tail) && (journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) blkdev_issue_flush(journal->j_fs_dev); From 790fb9956eead785b720ccc0851f09a5ca3a093e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 2 Dec 2024 09:20:04 -0800 Subject: [PATCH 0105/1450] linux/dmaengine.h: fix a few kernel-doc warnings The comment block for "Interleaved Transfer Request" should not begin with "/**" since it is not in kernel-doc format. Fix doc name for enum sum_check_flags. Fix all (4) missing struct member warnings. Use "Warning:" for one "Note:" in enum dma_desc_metadata_mode since scripts/kernel-doc does not allow more than one Note: per function or identifier description. This leaves around 49 kernel-doc warnings like: include/linux/dmaengine.h:43: warning: Enum value 'DMA_OUT_OF_ORDER' not described in enum 'dma_status' and another scripts/kernel-doc problem with it not being able to parse some typedefs. Fixes: b14dab792dee ("DMAEngine: Define interleaved transfer request api") Fixes: ad283ea4a3ce ("async_tx: add sum check flags") Fixes: 272420214d26 ("dmaengine: Add DMA_CTRL_REUSE") Fixes: f067025bc676 ("dmaengine: add support to provide error result from a DMA transation") Fixes: d38a8c622a1b ("dmaengine: prepare for generic 'unmap' data") Fixes: 5878853fc938 ("dmaengine: Add API function dmaengine_prep_peripheral_dma_vec()") Signed-off-by: Randy Dunlap Cc: Dan Williams Cc: Dave Jiang Cc: Paul Cercueil Cc: Nuno Sa Cc: Vinod Koul Cc: dmaengine@vger.kernel.org Link: https://lore.kernel.org/r/20241202172004.76020-1-rdunlap@infradead.org Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index b137fdb56093b..346251bf1026f 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -84,7 +84,7 @@ enum dma_transfer_direction { DMA_TRANS_NONE, }; -/** +/* * Interleaved Transfer Request * ---------------------------- * A chunk is collection of contiguous bytes to be transferred. @@ -223,7 +223,7 @@ enum sum_check_bits { }; /** - * enum pq_check_flags - result of async_{xor,pq}_zero_sum operations + * enum sum_check_flags - result of async_{xor,pq}_zero_sum operations * @SUM_CHECK_P_RESULT - 1 if xor zero sum error, 0 otherwise * @SUM_CHECK_Q_RESULT - 1 if reed-solomon zero sum error, 0 otherwise */ @@ -286,7 +286,7 @@ typedef struct { DECLARE_BITMAP(bits, DMA_TX_TYPE_END); } dma_cap_mask_t; * pointer to the engine's metadata area * 4. Read out the metadata from the pointer * - * Note: the two mode is not compatible and clients must use one mode for a + * Warning: the two modes are not compatible and clients must use one mode for a * descriptor. */ enum dma_desc_metadata_mode { @@ -594,9 +594,13 @@ struct dma_descriptor_metadata_ops { * @phys: physical address of the descriptor * @chan: target channel for this operation * @tx_submit: accept the descriptor, assign ordered cookie and mark the + * @desc_free: driver's callback function to free a resusable descriptor + * after completion * descriptor pending. To be pushed on .issue_pending() call * @callback: routine to call after this operation is complete + * @callback_result: error result from a DMA transaction * @callback_param: general parameter to pass to the callback routine + * @unmap: hook for generic DMA unmap data * @desc_metadata_mode: core managed metadata mode to protect mixed use of * DESC_METADATA_CLIENT or DESC_METADATA_ENGINE. Otherwise * DESC_METADATA_NONE @@ -827,6 +831,9 @@ struct dma_filter { * @device_prep_dma_memset: prepares a memset operation * @device_prep_dma_memset_sg: prepares a memset operation over a scatter list * @device_prep_dma_interrupt: prepares an end of chain interrupt operation + * @device_prep_peripheral_dma_vec: prepares a scatter-gather DMA transfer, + * where the address and size of each segment is located in one entry of + * the dma_vec array. * @device_prep_slave_sg: prepares a slave dma operation * @device_prep_dma_cyclic: prepare a cyclic dma operation suitable for audio. * The function takes a buffer of size buf_len. The callback function will From 8d55e8a16f019211163f1180fd9f9fbe05901900 Mon Sep 17 00:00:00 2001 From: Sasha Finkelstein Date: Sun, 24 Nov 2024 16:48:28 +0100 Subject: [PATCH 0106/1450] dmaengine: apple-admac: Avoid accessing registers in probe The ADMAC attached to the AOP has complex power sequencing, and is power gated when the probe callback runs. Move the register reads to other functions, where we can guarantee that the hardware is switched on. Fixes: 568aa6dd641f ("dmaengine: apple-admac: Allocate cache SRAM to channels") Signed-off-by: Sasha Finkelstein Link: https://lore.kernel.org/r/20241124-admac-power-v1-1-58f2165a4d55@gmail.com Signed-off-by: Vinod Koul --- drivers/dma/apple-admac.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/dma/apple-admac.c b/drivers/dma/apple-admac.c index c499173d80b20..bd49f03742912 100644 --- a/drivers/dma/apple-admac.c +++ b/drivers/dma/apple-admac.c @@ -153,6 +153,8 @@ static int admac_alloc_sram_carveout(struct admac_data *ad, { struct admac_sram *sram; int i, ret = 0, nblocks; + ad->txcache.size = readl_relaxed(ad->base + REG_TX_SRAM_SIZE); + ad->rxcache.size = readl_relaxed(ad->base + REG_RX_SRAM_SIZE); if (dir == DMA_MEM_TO_DEV) sram = &ad->txcache; @@ -912,12 +914,7 @@ static int admac_probe(struct platform_device *pdev) goto free_irq; } - ad->txcache.size = readl_relaxed(ad->base + REG_TX_SRAM_SIZE); - ad->rxcache.size = readl_relaxed(ad->base + REG_RX_SRAM_SIZE); - dev_info(&pdev->dev, "Audio DMA Controller\n"); - dev_info(&pdev->dev, "imprint %x TX cache %u RX cache %u\n", - readl_relaxed(ad->base + REG_IMPRINT), ad->txcache.size, ad->rxcache.size); return 0; From dcbef0798eb825cd584f7a93f62bed63f7fbbfc9 Mon Sep 17 00:00:00 2001 From: Lizhi Hou Date: Wed, 18 Sep 2024 11:10:22 -0700 Subject: [PATCH 0107/1450] dmaengine: amd: qdma: Remove using the private get and set dma_ops APIs The get_dma_ops and set_dma_ops APIs were never for driver to use. Remove these calls from QDMA driver. Instead, pass the DMA device pointer from the qdma_platdata structure. Fixes: 73d5fc92a11c ("dmaengine: amd: qdma: Add AMD QDMA driver") Signed-off-by: Lizhi Hou Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240918181022.2155715-1-lizhi.hou@amd.com Signed-off-by: Vinod Koul --- drivers/dma/amd/qdma/qdma.c | 28 +++++++++++--------------- include/linux/platform_data/amd_qdma.h | 2 ++ 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/drivers/dma/amd/qdma/qdma.c b/drivers/dma/amd/qdma/qdma.c index 6d9079458fe9c..66f00ad67351a 100644 --- a/drivers/dma/amd/qdma/qdma.c +++ b/drivers/dma/amd/qdma/qdma.c @@ -7,9 +7,9 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -492,18 +492,9 @@ static int qdma_device_verify(struct qdma_device *qdev) static int qdma_device_setup(struct qdma_device *qdev) { - struct device *dev = &qdev->pdev->dev; u32 ring_sz = QDMA_DEFAULT_RING_SIZE; int ret = 0; - while (dev && get_dma_ops(dev)) - dev = dev->parent; - if (!dev) { - qdma_err(qdev, "dma device not found"); - return -EINVAL; - } - set_dma_ops(&qdev->pdev->dev, get_dma_ops(dev)); - ret = qdma_setup_fmap_context(qdev); if (ret) { qdma_err(qdev, "Failed setup fmap context"); @@ -548,11 +539,12 @@ static void qdma_free_queue_resources(struct dma_chan *chan) { struct qdma_queue *queue = to_qdma_queue(chan); struct qdma_device *qdev = queue->qdev; - struct device *dev = qdev->dma_dev.dev; + struct qdma_platdata *pdata; qdma_clear_queue_context(queue); vchan_free_chan_resources(&queue->vchan); - dma_free_coherent(dev, queue->ring_size * QDMA_MM_DESC_SIZE, + pdata = dev_get_platdata(&qdev->pdev->dev); + dma_free_coherent(pdata->dma_dev, queue->ring_size * QDMA_MM_DESC_SIZE, queue->desc_base, queue->dma_desc_base); } @@ -565,6 +557,7 @@ static int qdma_alloc_queue_resources(struct dma_chan *chan) struct qdma_queue *queue = to_qdma_queue(chan); struct qdma_device *qdev = queue->qdev; struct qdma_ctxt_sw_desc desc; + struct qdma_platdata *pdata; size_t size; int ret; @@ -572,8 +565,9 @@ static int qdma_alloc_queue_resources(struct dma_chan *chan) if (ret) return ret; + pdata = dev_get_platdata(&qdev->pdev->dev); size = queue->ring_size * QDMA_MM_DESC_SIZE; - queue->desc_base = dma_alloc_coherent(qdev->dma_dev.dev, size, + queue->desc_base = dma_alloc_coherent(pdata->dma_dev, size, &queue->dma_desc_base, GFP_KERNEL); if (!queue->desc_base) { @@ -588,7 +582,7 @@ static int qdma_alloc_queue_resources(struct dma_chan *chan) if (ret) { qdma_err(qdev, "Failed to setup SW desc ctxt for %s", chan->name); - dma_free_coherent(qdev->dma_dev.dev, size, queue->desc_base, + dma_free_coherent(pdata->dma_dev, size, queue->desc_base, queue->dma_desc_base); return ret; } @@ -948,8 +942,9 @@ static int qdma_init_error_irq(struct qdma_device *qdev) static int qdmam_alloc_qintr_rings(struct qdma_device *qdev) { - u32 ctxt[QDMA_CTXT_REGMAP_LEN]; + struct qdma_platdata *pdata = dev_get_platdata(&qdev->pdev->dev); struct device *dev = &qdev->pdev->dev; + u32 ctxt[QDMA_CTXT_REGMAP_LEN]; struct qdma_intr_ring *ring; struct qdma_ctxt_intr intr_ctxt; u32 vector; @@ -969,7 +964,8 @@ static int qdmam_alloc_qintr_rings(struct qdma_device *qdev) ring->msix_id = qdev->err_irq_idx + i + 1; ring->ridx = i; ring->color = 1; - ring->base = dmam_alloc_coherent(dev, QDMA_INTR_RING_SIZE, + ring->base = dmam_alloc_coherent(pdata->dma_dev, + QDMA_INTR_RING_SIZE, &ring->dev_base, GFP_KERNEL); if (!ring->base) { qdma_err(qdev, "Failed to alloc intr ring %d", i); diff --git a/include/linux/platform_data/amd_qdma.h b/include/linux/platform_data/amd_qdma.h index 576d952f97edd..967a6ef31cf98 100644 --- a/include/linux/platform_data/amd_qdma.h +++ b/include/linux/platform_data/amd_qdma.h @@ -26,11 +26,13 @@ struct dma_slave_map; * @max_mm_channels: Maximum number of MM DMA channels in each direction * @device_map: DMA slave map * @irq_index: The index of first IRQ + * @dma_dev: The device pointer for dma operations */ struct qdma_platdata { u32 max_mm_channels; u32 irq_index; struct dma_slave_map *device_map; + struct device *dma_dev; }; #endif /* _PLATDATA_AMD_QDMA_H */ From eb867d797d294a00a092b5027d08439da68940b2 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 26 Nov 2024 15:10:31 +0200 Subject: [PATCH 0108/1450] RDMA/bnxt_re: Remove always true dattr validity check res->dattr is always valid at this point as it was initialized during device addition in bnxt_re_add_device(). This change is fixing the following smatch error: drivers/infiniband/hw/bnxt_re/qplib_fp.c:1090 bnxt_qplib_create_qp() error: we previously assumed 'res->dattr' could be null (see line 985) Fixes: 07f830ae4913 ("RDMA/bnxt_re: Adds MSN table capability for Gen P7 adapters") Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202411222329.YTrwonWi-lkp@intel.com/ Link: https://patch.msgid.link/be0d8836b64cba3e479fbcbca717acad04aae02e.1732626579.git.leonro@nvidia.com Acked-by: Selvin Xavier Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index e42abf5be6c0f..9af8aaadc99a8 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -1000,9 +1000,7 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) u32 tbl_indx; u16 nsge; - if (res->dattr) - qp->is_host_msn_tbl = _is_host_msn_table(res->dattr->dev_cap_flags2); - + qp->is_host_msn_tbl = _is_host_msn_table(res->dattr->dev_cap_flags2); sq->dbinfo.flags = 0; bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, CMDQ_BASE_OPCODE_CREATE_QP, From 0a92ea87bdd6f77ca4e17fe19649882cf5209edd Mon Sep 17 00:00:00 2001 From: Justin Chen Date: Thu, 24 Oct 2024 14:35:40 -0700 Subject: [PATCH 0109/1450] phy: usb: Toggle the PHY power during init When bringing up the PHY, it might be in a bad state if left powered. One case is we lose the PLL lock if the PLL is gated while the PHY is powered. Toggle the PHY power so we can start from a known state. Fixes: 4e5b9c9a73b3 ("phy: usb: Add support for new Synopsys USB controller on the 7216") Signed-off-by: Justin Chen Acked-by: Florian Fainelli Link: https://lore.kernel.org/r/20241024213540.1059412-1-justin.chen@broadcom.com Signed-off-by: Vinod Koul --- drivers/phy/broadcom/phy-brcm-usb-init-synopsys.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/phy/broadcom/phy-brcm-usb-init-synopsys.c b/drivers/phy/broadcom/phy-brcm-usb-init-synopsys.c index 950b7ae1d1a83..dc452610934ad 100644 --- a/drivers/phy/broadcom/phy-brcm-usb-init-synopsys.c +++ b/drivers/phy/broadcom/phy-brcm-usb-init-synopsys.c @@ -325,6 +325,12 @@ static void usb_init_common_7216(struct brcm_usb_init_params *params) void __iomem *ctrl = params->regs[BRCM_REGS_CTRL]; USB_CTRL_UNSET(ctrl, USB_PM, XHC_S2_CLK_SWITCH_EN); + + /* + * The PHY might be in a bad state if it is already powered + * up. Toggle the power just in case. + */ + USB_CTRL_SET(ctrl, USB_PM, USB_PWRDN); USB_CTRL_UNSET(ctrl, USB_PM, USB_PWRDN); /* 1 millisecond - for USB clocks to settle down */ From fbcbffbac994aca1264e3c14da96ac9bfd90466e Mon Sep 17 00:00:00 2001 From: Chukun Pan Date: Fri, 22 Nov 2024 15:30:06 +0800 Subject: [PATCH 0110/1450] phy: rockchip: naneng-combphy: fix phy reset Currently, the USB port via combophy on the RK3528/RK3588 SoC is broken. usb usb8-port1: Cannot enable. Maybe the USB cable is bad? This is due to the combphy of RK3528/RK3588 SoC has multiple resets, but only "phy resets" need assert and deassert, "apb resets" don't need. So change the driver to only match the phy resets, which is also what the vendor kernel does. Fixes: 7160820d742a ("phy: rockchip: add naneng combo phy for RK3568") Cc: FUKAUMI Naoki Cc: Michael Zimmermann Signed-off-by: Chukun Pan Reviewed-by: Heiko Stuebner Tested-by: FUKAUMI Naoki Link: https://lore.kernel.org/r/20241122073006.99309-2-amadeus@jmu.edu.cn Signed-off-by: Vinod Koul --- drivers/phy/rockchip/phy-rockchip-naneng-combphy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/phy/rockchip/phy-rockchip-naneng-combphy.c b/drivers/phy/rockchip/phy-rockchip-naneng-combphy.c index 0a9989e41237f..2eb3329ca23f6 100644 --- a/drivers/phy/rockchip/phy-rockchip-naneng-combphy.c +++ b/drivers/phy/rockchip/phy-rockchip-naneng-combphy.c @@ -309,7 +309,7 @@ static int rockchip_combphy_parse_dt(struct device *dev, struct rockchip_combphy priv->ext_refclk = device_property_present(dev, "rockchip,ext-refclk"); - priv->phy_rst = devm_reset_control_array_get_exclusive(dev); + priv->phy_rst = devm_reset_control_get(dev, "phy"); if (IS_ERR(priv->phy_rst)) return dev_err_probe(dev, PTR_ERR(priv->phy_rst), "failed to get phy reset\n"); From d0257e089d1bbd35c69b6c97ff73e3690ab149a9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 30 Nov 2024 13:06:41 +0300 Subject: [PATCH 0111/1450] RDMA/uverbs: Prevent integer overflow issue In the expression "cmd.wqe_size * cmd.wr_count", both variables are u32 values that come from the user so the multiplication can lead to integer wrapping. Then we pass the result to uverbs_request_next_ptr() which also could potentially wrap. The "cmd.sge_count * sizeof(struct ib_uverbs_sge)" multiplication can also overflow on 32bit systems although it's fine on 64bit systems. This patch does two things. First, I've re-arranged the condition in uverbs_request_next_ptr() so that the use controlled variable "len" is on one side of the comparison by itself without any math. Then I've modified all the callers to use size_mul() for the multiplications. Fixes: 67cdb40ca444 ("[IB] uverbs: Implement more commands") Cc: stable@vger.kernel.org Signed-off-by: Dan Carpenter Link: https://patch.msgid.link/b8765ab3-c2da-4611-aae0-ddd6ba173d23@stanley.mountain Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/uverbs_cmd.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 66b02fbf077ac..5ad14c39d48c9 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -161,7 +161,7 @@ static const void __user *uverbs_request_next_ptr(struct uverbs_req_iter *iter, { const void __user *res = iter->cur; - if (iter->cur + len > iter->end) + if (len > iter->end - iter->cur) return (void __force __user *)ERR_PTR(-ENOSPC); iter->cur += len; return res; @@ -2008,11 +2008,13 @@ static int ib_uverbs_post_send(struct uverbs_attr_bundle *attrs) ret = uverbs_request_start(attrs, &iter, &cmd, sizeof(cmd)); if (ret) return ret; - wqes = uverbs_request_next_ptr(&iter, cmd.wqe_size * cmd.wr_count); + wqes = uverbs_request_next_ptr(&iter, size_mul(cmd.wqe_size, + cmd.wr_count)); if (IS_ERR(wqes)) return PTR_ERR(wqes); - sgls = uverbs_request_next_ptr( - &iter, cmd.sge_count * sizeof(struct ib_uverbs_sge)); + sgls = uverbs_request_next_ptr(&iter, + size_mul(cmd.sge_count, + sizeof(struct ib_uverbs_sge))); if (IS_ERR(sgls)) return PTR_ERR(sgls); ret = uverbs_request_finish(&iter); @@ -2198,11 +2200,11 @@ ib_uverbs_unmarshall_recv(struct uverbs_req_iter *iter, u32 wr_count, if (wqe_size < sizeof(struct ib_uverbs_recv_wr)) return ERR_PTR(-EINVAL); - wqes = uverbs_request_next_ptr(iter, wqe_size * wr_count); + wqes = uverbs_request_next_ptr(iter, size_mul(wqe_size, wr_count)); if (IS_ERR(wqes)) return ERR_CAST(wqes); - sgls = uverbs_request_next_ptr( - iter, sge_count * sizeof(struct ib_uverbs_sge)); + sgls = uverbs_request_next_ptr(iter, size_mul(sge_count, + sizeof(struct ib_uverbs_sge))); if (IS_ERR(sgls)) return ERR_CAST(sgls); ret = uverbs_request_finish(iter); From 8886fb3240931a0afce82dea87edfe46bcb0a586 Mon Sep 17 00:00:00 2001 From: Krishna Kurapati Date: Tue, 12 Nov 2024 14:58:31 +0530 Subject: [PATCH 0112/1450] phy: qcom-qmp: Fix register name in RX Lane config of SC8280XP In RX Lane configuration sequence of SC8280XP, the register V5_RX_UCDR_FO_GAIN is incorrectly spelled as RX_UCDR_SO_GAIN and hence the programming sequence is wrong. Fix the register sequence accordingly to avoid any compliance failures. This has been tested on SA8775P by checking device mode enumeration in SuperSpeed. Cc: stable@vger.kernel.org Fixes: c0c7769cdae2 ("phy: qcom-qmp: Add SC8280XP USB3 UNI phy") Signed-off-by: Krishna Kurapati Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20241112092831.4110942-1-quic_kriskura@quicinc.com Signed-off-by: Vinod Koul --- drivers/phy/qualcomm/phy-qcom-qmp-usb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-usb.c b/drivers/phy/qualcomm/phy-qcom-qmp-usb.c index acd6075bf6d96..c9c3378407153 100644 --- a/drivers/phy/qualcomm/phy-qcom-qmp-usb.c +++ b/drivers/phy/qualcomm/phy-qcom-qmp-usb.c @@ -1052,7 +1052,7 @@ static const struct qmp_phy_init_tbl sc8280xp_usb3_uniphy_rx_tbl[] = { QMP_PHY_INIT_CFG(QSERDES_V5_RX_UCDR_FASTLOCK_FO_GAIN, 0x2f), QMP_PHY_INIT_CFG(QSERDES_V5_RX_UCDR_FASTLOCK_COUNT_LOW, 0xff), QMP_PHY_INIT_CFG(QSERDES_V5_RX_UCDR_FASTLOCK_COUNT_HIGH, 0x0f), - QMP_PHY_INIT_CFG(QSERDES_V5_RX_UCDR_SO_GAIN, 0x0a), + QMP_PHY_INIT_CFG(QSERDES_V5_RX_UCDR_FO_GAIN, 0x0a), QMP_PHY_INIT_CFG(QSERDES_V5_RX_VGA_CAL_CNTRL1, 0x54), QMP_PHY_INIT_CFG(QSERDES_V5_RX_VGA_CAL_CNTRL2, 0x0f), QMP_PHY_INIT_CFG(QSERDES_V5_RX_RX_EQU_ADAPTOR_CNTRL2, 0x0f), From 2de679ecd724b823c2cb58caab8508c7eec8aefc Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 11 Nov 2024 11:37:02 +0100 Subject: [PATCH 0113/1450] phy: stm32: work around constant-value overflow assertion FIELD_PREP() checks that a constant fits into the available bitfield, but if one of the two lookup tables in stm32_impedance_tune() does not find a matching entry, the index is out of range, which gcc correctly complains about: In file included from : In function 'stm32_impedance_tune', inlined from 'stm32_combophy_pll_init' at drivers/phy/st/phy-stm32-combophy.c:247:9: include/linux/compiler_types.h:517:38: error: call to '__compiletime_assert_447' declared with attribute error: FIELD_PREP: value too large for the field 517 | _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) | ^ include/linux/bitfield.h:68:3: note: in expansion of macro 'BUILD_BUG_ON_MSG' 68 | BUILD_BUG_ON_MSG(__builtin_constant_p(_val) ? \ 115 | __BF_FIELD_CHECK(_mask, 0ULL, _val, "FIELD_PREP: "); \ | ^~~~~~~~~~~~~~~~ drivers/phy/st/phy-stm32-combophy.c:162:8: note: in expansion of macro 'FIELD_PREP' 162 | FIELD_PREP(STM32MP25_PCIEPRG_IMPCTRL_VSWING, vswing_of)); | ^~~~~~~~~~ Rework this so the field value gets set inside of the loop and otherwise set to zero. Fixes: 47e1bb6b4ba0 ("phy: stm32: Add support for STM32MP25 COMBOPHY.") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20241111103712.3520611-1-arnd@kernel.org Signed-off-by: Vinod Koul --- drivers/phy/st/phy-stm32-combophy.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/drivers/phy/st/phy-stm32-combophy.c b/drivers/phy/st/phy-stm32-combophy.c index 765bb34fe3589..49e9fa90a6819 100644 --- a/drivers/phy/st/phy-stm32-combophy.c +++ b/drivers/phy/st/phy-stm32-combophy.c @@ -122,6 +122,7 @@ static int stm32_impedance_tune(struct stm32_combophy *combophy) u32 max_vswing = imp_lookup[imp_size - 1].vswing[vswing_size - 1]; u32 min_vswing = imp_lookup[0].vswing[0]; u32 val; + u32 regval; if (!of_property_read_u32(combophy->dev->of_node, "st,output-micro-ohms", &val)) { if (val < min_imp || val > max_imp) { @@ -129,16 +130,20 @@ static int stm32_impedance_tune(struct stm32_combophy *combophy) return -EINVAL; } - for (imp_of = 0; imp_of < ARRAY_SIZE(imp_lookup); imp_of++) - if (imp_lookup[imp_of].microohm <= val) + regval = 0; + for (imp_of = 0; imp_of < ARRAY_SIZE(imp_lookup); imp_of++) { + if (imp_lookup[imp_of].microohm <= val) { + regval = FIELD_PREP(STM32MP25_PCIEPRG_IMPCTRL_OHM, imp_of); break; + } + } dev_dbg(combophy->dev, "Set %u micro-ohms output impedance\n", imp_lookup[imp_of].microohm); regmap_update_bits(combophy->regmap, SYSCFG_PCIEPRGCR, STM32MP25_PCIEPRG_IMPCTRL_OHM, - FIELD_PREP(STM32MP25_PCIEPRG_IMPCTRL_OHM, imp_of)); + regval); } else { regmap_read(combophy->regmap, SYSCFG_PCIEPRGCR, &val); imp_of = FIELD_GET(STM32MP25_PCIEPRG_IMPCTRL_OHM, val); @@ -150,16 +155,20 @@ static int stm32_impedance_tune(struct stm32_combophy *combophy) return -EINVAL; } - for (vswing_of = 0; vswing_of < ARRAY_SIZE(imp_lookup[imp_of].vswing); vswing_of++) - if (imp_lookup[imp_of].vswing[vswing_of] >= val) + regval = 0; + for (vswing_of = 0; vswing_of < ARRAY_SIZE(imp_lookup[imp_of].vswing); vswing_of++) { + if (imp_lookup[imp_of].vswing[vswing_of] >= val) { + regval = FIELD_PREP(STM32MP25_PCIEPRG_IMPCTRL_VSWING, vswing_of); break; + } + } dev_dbg(combophy->dev, "Set %u microvolt swing\n", imp_lookup[imp_of].vswing[vswing_of]); regmap_update_bits(combophy->regmap, SYSCFG_PCIEPRGCR, STM32MP25_PCIEPRG_IMPCTRL_VSWING, - FIELD_PREP(STM32MP25_PCIEPRG_IMPCTRL_VSWING, vswing_of)); + regval); } return 0; From 1b5b7f3d29dc705bdeb3d2663df1b4617276491a Mon Sep 17 00:00:00 2001 From: Aloka Dixit Date: Wed, 27 Nov 2024 10:02:55 -0800 Subject: [PATCH 0114/1450] wifi: mac80211: fix variable used in for_each_sdata_link() Macro for_each_sdata_link() accepts input '_local' but uses 'local' in its processing. This currently works because all the functions calling this macro have declared 'local' as a variable themselves. But this results in compilation error when a new caller uses 'sdata->local' instead of declaring 'local' variable. Use '_local' instead of 'local' in for_each_sdata_link(). Signed-off-by: Aloka Dixit Link: https://patch.msgid.link/20241127180255.1460553-1-quic_alokad@quicinc.com Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index a00096dd787b3..534a20054151f 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1206,7 +1206,7 @@ struct ieee80211_sub_if_data *vif_to_sdata(struct ieee80211_vif *p) for (int ___link_id = 0; \ ___link_id < ARRAY_SIZE(___sdata->link); \ ___link_id++) \ - if ((_link = wiphy_dereference((local)->hw.wiphy, \ + if ((_link = wiphy_dereference((_local)->hw.wiphy, \ ___sdata->link[___link_id]))) static inline int From bee404e14477917c2e15f78b2ad1ea443939720c Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Tue, 3 Dec 2024 22:28:50 +0200 Subject: [PATCH 0115/1450] wifi: mac80211: Accept authentication frames on P2P device This is needed for PASN based P2P pairing. Signed-off-by: Andrei Otcheretianski Reviewed-by: Miriam Rachel Korenblit Link: https://patch.msgid.link/20241203222744.6ee3ef9f1374.Ib3074ffbe7b296e0f162b2543e84346b190dfbeb@changeid Signed-off-by: Johannes Berg --- net/mac80211/main.c | 9 +++++++-- net/mac80211/rx.c | 4 +++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/net/mac80211/main.c b/net/mac80211/main.c index ee1211a213d70..f13c14fa82e88 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -5,7 +5,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2017 Intel Deutschland GmbH - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation */ #include @@ -726,8 +726,13 @@ ieee80211_default_mgmt_stypes[NUM_NL80211_IFTYPES] = { }, [NL80211_IFTYPE_P2P_DEVICE] = { .tx = 0xffff, + /* + * To support P2P PASN pairing let user space register to rx + * also AUTH frames on P2P device interface. + */ .rx = BIT(IEEE80211_STYPE_ACTION >> 4) | - BIT(IEEE80211_STYPE_PROBE_REQ >> 4), + BIT(IEEE80211_STYPE_PROBE_REQ >> 4) | + BIT(IEEE80211_STYPE_AUTH >> 4), }, }; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 2bec18fc1b035..58c1b9a4e8b5d 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -4562,7 +4562,9 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) return ieee80211_is_public_action(hdr, skb->len) || ieee80211_is_probe_req(hdr->frame_control) || ieee80211_is_probe_resp(hdr->frame_control) || - ieee80211_is_beacon(hdr->frame_control); + ieee80211_is_beacon(hdr->frame_control) || + (ieee80211_is_auth(hdr->frame_control) && + ether_addr_equal(sdata->vif.addr, hdr->addr1)); case NL80211_IFTYPE_NAN: /* Currently no frames on NAN interface are allowed */ return false; From f42d22d3f79639c1b4e41daf28dad2505d6a5a8b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 22 Nov 2024 09:42:25 +0100 Subject: [PATCH 0116/1450] wifi: cfg80211: define and use wiphy guard Define a guard for the wiphy mutex, and use it in most code in cfg80211, though not all due to some interaction with RTNL and/or indentation. Suggested-by: Jeff Johnson Reviewed-by: Jeff Johnson Signed-off-by: Johannes Berg Link: https://patch.msgid.link/20241122094225.88765cbaab65.I610c9b14f36902e75e1d13f0db29f8bef2298804@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 + net/wireless/chan.c | 4 +- net/wireless/core.c | 42 +++-- net/wireless/mlme.c | 8 +- net/wireless/nl80211.c | 190 +++++++++------------- net/wireless/pmsr.c | 4 +- net/wireless/reg.c | 55 +++---- net/wireless/scan.c | 40 +++-- net/wireless/sme.c | 12 +- net/wireless/util.c | 7 +- net/wireless/wext-compat.c | 317 +++++++++++++------------------------ net/wireless/wext-sme.c | 43 ++--- 12 files changed, 278 insertions(+), 448 deletions(-) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 27acf1292a5ca..63e79a22a214f 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -6031,6 +6031,10 @@ static inline void wiphy_unlock(struct wiphy *wiphy) mutex_unlock(&wiphy->mtx); } +DEFINE_GUARD(wiphy, struct wiphy *, + mutex_lock(&_T->mtx), + mutex_unlock(&_T->mtx)) + struct wiphy_work; typedef void (*wiphy_work_func_t)(struct wiphy *, struct wiphy_work *); diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 40b6375a5de44..833ea73053a46 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -1039,10 +1039,10 @@ bool cfg80211_any_wiphy_oper_chan(struct wiphy *wiphy, if (!reg_dfs_domain_same(wiphy, &rdev->wiphy)) continue; - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + found = cfg80211_is_wiphy_oper_chan(&rdev->wiphy, chan) || cfg80211_offchan_chain_is_active(rdev, chan); - wiphy_unlock(&rdev->wiphy); if (found) return true; diff --git a/net/wireless/core.c b/net/wireless/core.c index afbdc549fb4a5..70857018f020a 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -191,7 +191,8 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev, return err; } - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (!wdev->netdev) continue; @@ -212,7 +213,6 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev, continue; nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE); } - wiphy_unlock(&rdev->wiphy); return 0; } @@ -221,9 +221,9 @@ static void cfg80211_rfkill_poll(struct rfkill *rfkill, void *data) { struct cfg80211_registered_device *rdev = data; - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + rdev_rfkill_poll(rdev); - wiphy_unlock(&rdev->wiphy); } void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, @@ -283,7 +283,7 @@ void cfg80211_shutdown_all_interfaces(struct wiphy *wiphy) /* otherwise, check iftype */ - wiphy_lock(wiphy); + guard(wiphy)(wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_P2P_DEVICE: @@ -295,8 +295,6 @@ void cfg80211_shutdown_all_interfaces(struct wiphy *wiphy) default: break; } - - wiphy_unlock(wiphy); } } EXPORT_SYMBOL_GPL(cfg80211_shutdown_all_interfaces); @@ -331,9 +329,9 @@ static void cfg80211_event_work(struct work_struct *work) rdev = container_of(work, struct cfg80211_registered_device, event_work); - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + cfg80211_process_rdev_events(rdev); - wiphy_unlock(&rdev->wiphy); } void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev) @@ -347,10 +345,10 @@ void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev) if (wdev->netdev) dev_close(wdev->netdev); - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + cfg80211_leave(rdev, wdev); cfg80211_remove_virtual_intf(rdev, wdev); - wiphy_unlock(&rdev->wiphy); } } } @@ -423,9 +421,9 @@ static void cfg80211_wiphy_work(struct work_struct *work) trace_wiphy_work_worker_start(&rdev->wiphy); - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); if (rdev->suspended) - goto out; + return; spin_lock_irq(&rdev->wiphy_work_lock); wk = list_first_entry_or_null(&rdev->wiphy_work_list, @@ -441,8 +439,6 @@ static void cfg80211_wiphy_work(struct work_struct *work) } else { spin_unlock_irq(&rdev->wiphy_work_lock); } -out: - wiphy_unlock(&rdev->wiphy); } /* exported functions */ @@ -1526,9 +1522,9 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, break; case NETDEV_REGISTER: if (!wdev->registered) { - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + cfg80211_register_wdev(rdev, wdev); - wiphy_unlock(&rdev->wiphy); } break; case NETDEV_UNREGISTER: @@ -1537,16 +1533,16 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, * so check wdev->registered. */ if (wdev->registered && !wdev->registering) { - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + _cfg80211_unregister_wdev(wdev, false); - wiphy_unlock(&rdev->wiphy); } break; case NETDEV_GOING_DOWN: - wiphy_lock(&rdev->wiphy); - cfg80211_leave(rdev, wdev); - cfg80211_remove_links(wdev); - wiphy_unlock(&rdev->wiphy); + scoped_guard(wiphy, &rdev->wiphy) { + cfg80211_leave(rdev, wdev); + cfg80211_remove_links(wdev); + } /* since we just did cfg80211_leave() nothing to do there */ cancel_work_sync(&wdev->disconnect_wk); cancel_work_sync(&wdev->pmsr_free_wk); diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index a5eb92d93074e..9d577523462d5 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -627,10 +627,10 @@ void cfg80211_mgmt_registrations_update_wk(struct work_struct *wk) rdev = container_of(wk, struct cfg80211_registered_device, mgmt_registrations_update_wk); - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) cfg80211_mgmt_registrations_update(wdev); - wiphy_unlock(&rdev->wiphy); } int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, @@ -1193,10 +1193,10 @@ cfg80211_background_cac_event(struct cfg80211_registered_device *rdev, const struct cfg80211_chan_def *chandef, enum nl80211_radar_event event) { - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + __cfg80211_background_cac_event(rdev, rdev->background_radar_wdev, chandef, event); - wiphy_unlock(&rdev->wiphy); } void cfg80211_background_cac_done_wk(struct work_struct *work) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 9d2edb71f981d..9590f9bd2ec0c 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3626,7 +3626,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) } else wdev = netdev->ieee80211_ptr; - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); /* * end workaround code, by now the rdev is available @@ -3639,32 +3639,24 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) rtnl_unlock(); if (result) - goto out; + return result; if (info->attrs[NL80211_ATTR_WIPHY_TXQ_PARAMS]) { struct ieee80211_txq_params txq_params; struct nlattr *tb[NL80211_TXQ_ATTR_MAX + 1]; - if (!rdev->ops->set_txq_params) { - result = -EOPNOTSUPP; - goto out; - } + if (!rdev->ops->set_txq_params) + return -EOPNOTSUPP; - if (!netdev) { - result = -EINVAL; - goto out; - } + if (!netdev) + return -EINVAL; if (netdev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && - netdev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) { - result = -EINVAL; - goto out; - } + netdev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) + return -EINVAL; - if (!netif_running(netdev)) { - result = -ENETDOWN; - goto out; - } + if (!netif_running(netdev)) + return -ENETDOWN; nla_for_each_nested(nl_txq_params, info->attrs[NL80211_ATTR_WIPHY_TXQ_PARAMS], @@ -3675,10 +3667,11 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) txq_params_policy, info->extack); if (result) - goto out; + return result; + result = parse_txq_params(tb, &txq_params); if (result) - goto out; + return result; txq_params.link_id = nl80211_link_id_or_invalid(info->attrs); @@ -3694,7 +3687,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) result = rdev_set_txq_params(rdev, netdev, &txq_params); if (result) - goto out; + return result; } } @@ -3711,7 +3704,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) } if (result) - goto out; + return result; } if (info->attrs[NL80211_ATTR_WIPHY_TX_POWER_SETTING]) { @@ -3722,19 +3715,15 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) if (!(rdev->wiphy.features & NL80211_FEATURE_VIF_TXPOWER)) txp_wdev = NULL; - if (!rdev->ops->set_tx_power) { - result = -EOPNOTSUPP; - goto out; - } + if (!rdev->ops->set_tx_power) + return -EOPNOTSUPP; idx = NL80211_ATTR_WIPHY_TX_POWER_SETTING; type = nla_get_u32(info->attrs[idx]); if (!info->attrs[NL80211_ATTR_WIPHY_TX_POWER_LEVEL] && - (type != NL80211_TX_POWER_AUTOMATIC)) { - result = -EINVAL; - goto out; - } + (type != NL80211_TX_POWER_AUTOMATIC)) + return -EINVAL; if (type != NL80211_TX_POWER_AUTOMATIC) { idx = NL80211_ATTR_WIPHY_TX_POWER_LEVEL; @@ -3743,7 +3732,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) result = rdev_set_tx_power(rdev, txp_wdev, type, mbm); if (result) - goto out; + return result; } if (info->attrs[NL80211_ATTR_WIPHY_ANTENNA_TX] && @@ -3752,10 +3741,8 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) if ((!rdev->wiphy.available_antennas_tx && !rdev->wiphy.available_antennas_rx) || - !rdev->ops->set_antenna) { - result = -EOPNOTSUPP; - goto out; - } + !rdev->ops->set_antenna) + return -EOPNOTSUPP; tx_ant = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_ANTENNA_TX]); rx_ant = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_ANTENNA_RX]); @@ -3763,17 +3750,15 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) /* reject antenna configurations which don't match the * available antenna masks, except for the "all" mask */ if ((~tx_ant && (tx_ant & ~rdev->wiphy.available_antennas_tx)) || - (~rx_ant && (rx_ant & ~rdev->wiphy.available_antennas_rx))) { - result = -EINVAL; - goto out; - } + (~rx_ant && (rx_ant & ~rdev->wiphy.available_antennas_rx))) + return -EINVAL; tx_ant = tx_ant & rdev->wiphy.available_antennas_tx; rx_ant = rx_ant & rdev->wiphy.available_antennas_rx; result = rdev_set_antenna(rdev, tx_ant, rx_ant); if (result) - goto out; + return result; } changed = 0; @@ -3795,10 +3780,8 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_WIPHY_FRAG_THRESHOLD]) { frag_threshold = nla_get_u32( info->attrs[NL80211_ATTR_WIPHY_FRAG_THRESHOLD]); - if (frag_threshold < 256) { - result = -EINVAL; - goto out; - } + if (frag_threshold < 256) + return -EINVAL; if (frag_threshold != (u32) -1) { /* @@ -3819,10 +3802,8 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) } if (info->attrs[NL80211_ATTR_WIPHY_COVERAGE_CLASS]) { - if (info->attrs[NL80211_ATTR_WIPHY_DYN_ACK]) { - result = -EINVAL; - goto out; - } + if (info->attrs[NL80211_ATTR_WIPHY_DYN_ACK]) + return -EINVAL; coverage_class = nla_get_u8( info->attrs[NL80211_ATTR_WIPHY_COVERAGE_CLASS]); @@ -3830,20 +3811,17 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) } if (info->attrs[NL80211_ATTR_WIPHY_DYN_ACK]) { - if (!(rdev->wiphy.features & NL80211_FEATURE_ACKTO_ESTIMATION)) { - result = -EOPNOTSUPP; - goto out; - } + if (!(rdev->wiphy.features & NL80211_FEATURE_ACKTO_ESTIMATION)) + return -EOPNOTSUPP; changed |= WIPHY_PARAM_DYN_ACK; } if (info->attrs[NL80211_ATTR_TXQ_LIMIT]) { if (!wiphy_ext_feature_isset(&rdev->wiphy, - NL80211_EXT_FEATURE_TXQS)) { - result = -EOPNOTSUPP; - goto out; - } + NL80211_EXT_FEATURE_TXQS)) + return -EOPNOTSUPP; + txq_limit = nla_get_u32( info->attrs[NL80211_ATTR_TXQ_LIMIT]); changed |= WIPHY_PARAM_TXQ_LIMIT; @@ -3851,10 +3829,9 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_TXQ_MEMORY_LIMIT]) { if (!wiphy_ext_feature_isset(&rdev->wiphy, - NL80211_EXT_FEATURE_TXQS)) { - result = -EOPNOTSUPP; - goto out; - } + NL80211_EXT_FEATURE_TXQS)) + return -EOPNOTSUPP; + txq_memory_limit = nla_get_u32( info->attrs[NL80211_ATTR_TXQ_MEMORY_LIMIT]); changed |= WIPHY_PARAM_TXQ_MEMORY_LIMIT; @@ -3862,10 +3839,9 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_TXQ_QUANTUM]) { if (!wiphy_ext_feature_isset(&rdev->wiphy, - NL80211_EXT_FEATURE_TXQS)) { - result = -EOPNOTSUPP; - goto out; - } + NL80211_EXT_FEATURE_TXQS)) + return -EOPNOTSUPP; + txq_quantum = nla_get_u32( info->attrs[NL80211_ATTR_TXQ_QUANTUM]); changed |= WIPHY_PARAM_TXQ_QUANTUM; @@ -3877,10 +3853,8 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) u8 old_coverage_class; u32 old_txq_limit, old_txq_memory_limit, old_txq_quantum; - if (!rdev->ops->set_wiphy_params) { - result = -EOPNOTSUPP; - goto out; - } + if (!rdev->ops->set_wiphy_params) + return -EOPNOTSUPP; old_retry_short = rdev->wiphy.retry_short; old_retry_long = rdev->wiphy.retry_long; @@ -3918,15 +3892,11 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) rdev->wiphy.txq_limit = old_txq_limit; rdev->wiphy.txq_memory_limit = old_txq_memory_limit; rdev->wiphy.txq_quantum = old_txq_quantum; - goto out; + return result; } } - result = 0; - -out: - wiphy_unlock(&rdev->wiphy); - return result; + return 0; } int nl80211_send_chandef(struct sk_buff *msg, const struct cfg80211_chan_def *chandef) @@ -4144,22 +4114,22 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback * if_idx = 0; - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (if_idx < if_start) { if_idx++; continue; } + if (nl80211_send_iface(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, rdev, wdev, - NL80211_CMD_NEW_INTERFACE) < 0) { - wiphy_unlock(&rdev->wiphy); + NL80211_CMD_NEW_INTERFACE) < 0) goto out; - } + if_idx++; } - wiphy_unlock(&rdev->wiphy); if_start = 0; wp_idx++; @@ -4517,16 +4487,13 @@ static int _nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; - int ret; /* to avoid failing a new interface creation due to pending removal */ cfg80211_destroy_ifaces(rdev); - wiphy_lock(&rdev->wiphy); - ret = _nl80211_new_interface(skb, info); - wiphy_unlock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); - return ret; + return _nl80211_new_interface(skb, info); } static int nl80211_del_interface(struct sk_buff *skb, struct genl_info *info) @@ -10098,7 +10065,7 @@ static int nl80211_start_radar_detection(struct sk_buff *skb, struct cfg80211_chan_def chandef; enum nl80211_dfs_regions dfs_region; unsigned int cac_time_ms; - int err = -EINVAL; + int err; flush_delayed_work(&rdev->dfs_update_channels_wk); @@ -10113,35 +10080,29 @@ static int nl80211_start_radar_detection(struct sk_buff *skb, return -EINVAL; } - wiphy_lock(wiphy); + guard(wiphy)(wiphy); dfs_region = reg_get_dfs_region(wiphy); if (dfs_region == NL80211_DFS_UNSET) - goto unlock; + return -EINVAL; err = nl80211_parse_chandef(rdev, info, &chandef); if (err) - goto unlock; + return err; err = cfg80211_chandef_dfs_required(wiphy, &chandef, wdev->iftype); if (err < 0) - goto unlock; + return err; - if (err == 0) { - err = -EINVAL; - goto unlock; - } + if (err == 0) + return -EINVAL; - if (!cfg80211_chandef_dfs_usable(wiphy, &chandef)) { - err = -EINVAL; - goto unlock; - } + if (!cfg80211_chandef_dfs_usable(wiphy, &chandef)) + return -EINVAL; - if (nla_get_flag(info->attrs[NL80211_ATTR_RADAR_BACKGROUND])) { - err = cfg80211_start_background_radar_detection(rdev, wdev, - &chandef); - goto unlock; - } + if (nla_get_flag(info->attrs[NL80211_ATTR_RADAR_BACKGROUND])) + return cfg80211_start_background_radar_detection(rdev, wdev, + &chandef); if (cfg80211_beaconing_iface_active(wdev)) { /* During MLO other link(s) can beacon, only the current link @@ -10151,26 +10112,19 @@ static int nl80211_start_radar_detection(struct sk_buff *skb, !wdev->links[link_id].ap.beacon_interval) { /* nothing */ } else { - err = -EBUSY; - goto unlock; + return -EBUSY; } } - if (wdev->links[link_id].cac_started) { - err = -EBUSY; - goto unlock; - } + if (wdev->links[link_id].cac_started) + return -EBUSY; /* CAC start is offloaded to HW and can't be started manually */ - if (wiphy_ext_feature_isset(wiphy, NL80211_EXT_FEATURE_DFS_OFFLOAD)) { - err = -EOPNOTSUPP; - goto unlock; - } + if (wiphy_ext_feature_isset(wiphy, NL80211_EXT_FEATURE_DFS_OFFLOAD)) + return -EOPNOTSUPP; - if (!rdev->ops->start_radar_detection) { - err = -EOPNOTSUPP; - goto unlock; - } + if (!rdev->ops->start_radar_detection) + return -EOPNOTSUPP; cac_time_ms = cfg80211_chandef_dfs_cac_time(&rdev->wiphy, &chandef); if (WARN_ON(!cac_time_ms)) @@ -10197,10 +10151,8 @@ static int nl80211_start_radar_detection(struct sk_buff *skb, wdev->links[link_id].cac_start_time = jiffies; wdev->links[link_id].cac_time_ms = cac_time_ms; } -unlock: - wiphy_unlock(wiphy); - return err; + return 0; } static int nl80211_notify_radar_detection(struct sk_buff *skb, diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c index 0396fa19bdf19..a117f5093ca29 100644 --- a/net/wireless/pmsr.c +++ b/net/wireless/pmsr.c @@ -630,9 +630,9 @@ void cfg80211_pmsr_free_wk(struct work_struct *work) struct wireless_dev *wdev = container_of(work, struct wireless_dev, pmsr_free_wk); - wiphy_lock(wdev->wiphy); + guard(wiphy)(wdev->wiphy); + cfg80211_pmsr_process_abort(wdev); - wiphy_unlock(wdev->wiphy); } void cfg80211_pmsr_wdev_down(struct wireless_dev *wdev) diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 1df65a5a44f7a..2dd0533e76605 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -2465,11 +2465,11 @@ static void reg_leave_invalid_chans(struct wiphy *wiphy) struct wireless_dev *wdev; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); - wiphy_lock(wiphy); + guard(wiphy)(wiphy); + list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) if (!reg_wdev_chan_valid(wiphy, wdev)) cfg80211_leave(rdev, wdev); - wiphy_unlock(wiphy); } static void reg_check_chans_work(struct work_struct *work) @@ -2649,13 +2649,11 @@ void wiphy_apply_custom_regulatory(struct wiphy *wiphy, return; rtnl_lock(); - wiphy_lock(wiphy); - - tmp = get_wiphy_regdom(wiphy); - rcu_assign_pointer(wiphy->regd, new_regd); - rcu_free_regdom(tmp); - - wiphy_unlock(wiphy); + scoped_guard(wiphy, wiphy) { + tmp = get_wiphy_regdom(wiphy); + rcu_assign_pointer(wiphy->regd, new_regd); + rcu_free_regdom(tmp); + } rtnl_unlock(); } EXPORT_SYMBOL(wiphy_apply_custom_regulatory); @@ -2825,9 +2823,9 @@ reg_process_hint_driver(struct wiphy *wiphy, tmp = get_wiphy_regdom(wiphy); ASSERT_RTNL(); - wiphy_lock(wiphy); - rcu_assign_pointer(wiphy->regd, regd); - wiphy_unlock(wiphy); + scoped_guard(wiphy, wiphy) { + rcu_assign_pointer(wiphy->regd, regd); + } rcu_free_regdom(tmp); } @@ -3205,9 +3203,9 @@ static void reg_process_self_managed_hints(void) ASSERT_RTNL(); for_each_rdev(rdev) { - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + reg_process_self_managed_hint(&rdev->wiphy); - wiphy_unlock(&rdev->wiphy); } reg_check_channels(); @@ -3600,14 +3598,12 @@ static bool is_wiphy_all_set_reg_flag(enum ieee80211_regulatory_flags flag) struct wireless_dev *wdev; for_each_rdev(rdev) { - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { - if (!(wdev->wiphy->regulatory_flags & flag)) { - wiphy_unlock(&rdev->wiphy); + if (!(wdev->wiphy->regulatory_flags & flag)) return false; - } } - wiphy_unlock(&rdev->wiphy); } return true; @@ -3883,19 +3879,18 @@ static int reg_set_rd_driver(const struct ieee80211_regdomain *rd, if (!driver_request->intersect) { ASSERT_RTNL(); - wiphy_lock(request_wiphy); - if (request_wiphy->regd) - tmp = get_wiphy_regdom(request_wiphy); - - regd = reg_copy_regd(rd); - if (IS_ERR(regd)) { - wiphy_unlock(request_wiphy); - return PTR_ERR(regd); + scoped_guard(wiphy, request_wiphy) { + if (request_wiphy->regd) + tmp = get_wiphy_regdom(request_wiphy); + + regd = reg_copy_regd(rd); + if (IS_ERR(regd)) + return PTR_ERR(regd); + + rcu_assign_pointer(request_wiphy->regd, regd); + rcu_free_regdom(tmp); } - rcu_assign_pointer(request_wiphy->regd, regd); - rcu_free_regdom(tmp); - wiphy_unlock(request_wiphy); reset_regdomains(false, rd); return 0; } diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 1c6fd45aa8093..d056248c43d2a 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -1238,7 +1238,8 @@ void cfg80211_sched_scan_results_wk(struct work_struct *work) rdev = container_of(work, struct cfg80211_registered_device, sched_scan_res_wk); - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + list_for_each_entry_safe(req, tmp, &rdev->sched_scan_req_list, list) { if (req->report_results) { req->report_results = false; @@ -1253,7 +1254,6 @@ void cfg80211_sched_scan_results_wk(struct work_struct *work) NL80211_CMD_SCHED_SCAN_RESULTS); } } - wiphy_unlock(&rdev->wiphy); } void cfg80211_sched_scan_results(struct wiphy *wiphy, u64 reqid) @@ -1288,9 +1288,9 @@ EXPORT_SYMBOL(cfg80211_sched_scan_stopped_locked); void cfg80211_sched_scan_stopped(struct wiphy *wiphy, u64 reqid) { - wiphy_lock(wiphy); + guard(wiphy)(wiphy); + cfg80211_sched_scan_stopped_locked(wiphy, reqid); - wiphy_unlock(wiphy); } EXPORT_SYMBOL(cfg80211_sched_scan_stopped); @@ -3565,10 +3565,8 @@ int cfg80211_wext_siwscan(struct net_device *dev, /* translate "Scan for SSID" request */ if (wreq) { if (wrqu->data.flags & IW_SCAN_THIS_ESSID) { - if (wreq->essid_len > IEEE80211_MAX_SSID_LEN) { - err = -EINVAL; - goto out; - } + if (wreq->essid_len > IEEE80211_MAX_SSID_LEN) + return -EINVAL; memcpy(creq->ssids[0].ssid, wreq->essid, wreq->essid_len); creq->ssids[0].ssid_len = wreq->essid_len; } @@ -3584,20 +3582,20 @@ int cfg80211_wext_siwscan(struct net_device *dev, eth_broadcast_addr(creq->bssid); - wiphy_lock(&rdev->wiphy); - - rdev->scan_req = creq; - err = rdev_scan(rdev, creq); - if (err) { - rdev->scan_req = NULL; - /* creq will be freed below */ - } else { - nl80211_send_scan_start(rdev, dev->ieee80211_ptr); - /* creq now owned by driver */ - creq = NULL; - dev_hold(dev); + scoped_guard(wiphy, &rdev->wiphy) { + rdev->scan_req = creq; + err = rdev_scan(rdev, creq); + if (err) { + rdev->scan_req = NULL; + /* creq will be freed below */ + } else { + nl80211_send_scan_start(rdev, dev->ieee80211_ptr); + /* creq now owned by driver */ + creq = NULL; + dev_hold(dev); + } } - wiphy_unlock(&rdev->wiphy); + out: kfree(creq); return err; diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 431da30817a6f..7a734c8085afc 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -251,7 +251,7 @@ void cfg80211_conn_work(struct work_struct *work) u8 bssid_buf[ETH_ALEN], *bssid = NULL; enum nl80211_timeout_reason treason; - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (!wdev->netdev) @@ -279,8 +279,6 @@ void cfg80211_conn_work(struct work_struct *work) __cfg80211_connect_result(wdev->netdev, &cr, false); } } - - wiphy_unlock(&rdev->wiphy); } static void cfg80211_step_auth_next(struct cfg80211_conn *conn, @@ -692,13 +690,13 @@ static bool cfg80211_is_all_idle(void) * as chan dfs state, etc. */ for_each_rdev(rdev) { - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (wdev->conn || wdev->connected || cfg80211_beaconing_iface_active(wdev)) is_all_idle = false; } - wiphy_unlock(&rdev->wiphy); } return is_all_idle; @@ -1582,7 +1580,7 @@ void cfg80211_autodisconnect_wk(struct work_struct *work) container_of(work, struct wireless_dev, disconnect_wk); struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); - wiphy_lock(wdev->wiphy); + guard(wiphy)(wdev->wiphy); if (wdev->conn_owner_nlportid) { switch (wdev->iftype) { @@ -1618,6 +1616,4 @@ void cfg80211_autodisconnect_wk(struct work_struct *work) break; } } - - wiphy_unlock(wdev->wiphy); } diff --git a/net/wireless/util.c b/net/wireless/util.c index 040d62051eb96..c7c6261c91461 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -2572,7 +2572,6 @@ int cfg80211_get_station(struct net_device *dev, const u8 *mac_addr, { struct cfg80211_registered_device *rdev; struct wireless_dev *wdev; - int ret; wdev = dev->ieee80211_ptr; if (!wdev) @@ -2584,11 +2583,9 @@ int cfg80211_get_station(struct net_device *dev, const u8 *mac_addr, memset(sinfo, 0, sizeof(*sinfo)); - wiphy_lock(&rdev->wiphy); - ret = rdev_get_station(rdev, dev, mac_addr, sinfo); - wiphy_unlock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); - return ret; + return rdev_get_station(rdev, dev, mac_addr, sinfo); } EXPORT_SYMBOL(cfg80211_get_station); diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 90d5c05926673..687f93664d1f1 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -39,7 +39,6 @@ int cfg80211_wext_siwmode(struct net_device *dev, struct iw_request_info *info, struct cfg80211_registered_device *rdev; struct vif_params vifparams; enum nl80211_iftype type; - int ret; rdev = wiphy_to_rdev(wdev->wiphy); @@ -62,11 +61,9 @@ int cfg80211_wext_siwmode(struct net_device *dev, struct iw_request_info *info, memset(&vifparams, 0, sizeof(vifparams)); - wiphy_lock(wdev->wiphy); - ret = cfg80211_change_iface(rdev, dev, type, &vifparams); - wiphy_unlock(wdev->wiphy); + guard(wiphy)(wdev->wiphy); - return ret; + return cfg80211_change_iface(rdev, dev, type, &vifparams); } int cfg80211_wext_giwmode(struct net_device *dev, struct iw_request_info *info, @@ -258,23 +255,17 @@ int cfg80211_wext_siwrts(struct net_device *dev, u32 orts = wdev->wiphy->rts_threshold; int err; - wiphy_lock(&rdev->wiphy); - if (rts->disabled || !rts->fixed) { + guard(wiphy)(&rdev->wiphy); + if (rts->disabled || !rts->fixed) wdev->wiphy->rts_threshold = (u32) -1; - } else if (rts->value < 0) { - err = -EINVAL; - goto out; - } else { + else if (rts->value < 0) + return -EINVAL; + else wdev->wiphy->rts_threshold = rts->value; - } err = rdev_set_wiphy_params(rdev, WIPHY_PARAM_RTS_THRESHOLD); - if (err) wdev->wiphy->rts_threshold = orts; - -out: - wiphy_unlock(&rdev->wiphy); return err; } @@ -302,12 +293,12 @@ int cfg80211_wext_siwfrag(struct net_device *dev, u32 ofrag = wdev->wiphy->frag_threshold; int err; - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + if (frag->disabled || !frag->fixed) { wdev->wiphy->frag_threshold = (u32) -1; } else if (frag->value < 256) { - err = -EINVAL; - goto out; + return -EINVAL; } else { /* Fragment length must be even, so strip LSB. */ wdev->wiphy->frag_threshold = frag->value & ~0x1; @@ -316,9 +307,6 @@ int cfg80211_wext_siwfrag(struct net_device *dev, err = rdev_set_wiphy_params(rdev, WIPHY_PARAM_FRAG_THRESHOLD); if (err) wdev->wiphy->frag_threshold = ofrag; -out: - wiphy_unlock(&rdev->wiphy); - return err; } @@ -352,7 +340,8 @@ static int cfg80211_wext_siwretry(struct net_device *dev, (retry->flags & IW_RETRY_TYPE) != IW_RETRY_LIMIT) return -EINVAL; - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + if (retry->flags & IW_RETRY_LONG) { wdev->wiphy->retry_long = retry->value; changed |= WIPHY_PARAM_RETRY_LONG; @@ -371,7 +360,6 @@ static int cfg80211_wext_siwretry(struct net_device *dev, wdev->wiphy->retry_short = oshort; wdev->wiphy->retry_long = olong; } - wiphy_unlock(&rdev->wiphy); return err; } @@ -578,9 +566,9 @@ static int cfg80211_wext_siwencode(struct net_device *dev, struct iw_point *erq = &wrqu->encoding; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); - int idx, err; - bool remove = false; struct key_params params; + bool remove = false; + int idx; if (wdev->iftype != NL80211_IFTYPE_STATION && wdev->iftype != NL80211_IFTYPE_ADHOC) @@ -592,11 +580,9 @@ static int cfg80211_wext_siwencode(struct net_device *dev, !rdev->ops->set_default_key) return -EOPNOTSUPP; - wiphy_lock(&rdev->wiphy); - if (wdev->valid_links) { - err = -EOPNOTSUPP; - goto out; - } + guard(wiphy)(&rdev->wiphy); + if (wdev->valid_links) + return -EOPNOTSUPP; idx = erq->flags & IW_ENCODE_INDEX; if (idx == 0) { @@ -604,8 +590,7 @@ static int cfg80211_wext_siwencode(struct net_device *dev, if (idx < 0) idx = 0; } else if (idx < 1 || idx > 4) { - err = -EINVAL; - goto out; + return -EINVAL; } else { idx--; } @@ -614,7 +599,8 @@ static int cfg80211_wext_siwencode(struct net_device *dev, remove = true; else if (erq->length == 0) { /* No key data - just set the default TX key index */ - err = 0; + int err = 0; + if (wdev->connected || (wdev->iftype == NL80211_IFTYPE_ADHOC && wdev->u.ibss.current_bss)) @@ -622,28 +608,22 @@ static int cfg80211_wext_siwencode(struct net_device *dev, true); if (!err) wdev->wext.default_key = idx; - goto out; + return err; } memset(¶ms, 0, sizeof(params)); params.key = keybuf; params.key_len = erq->length; - if (erq->length == 5) { + if (erq->length == 5) params.cipher = WLAN_CIPHER_SUITE_WEP40; - } else if (erq->length == 13) { + else if (erq->length == 13) params.cipher = WLAN_CIPHER_SUITE_WEP104; - } else if (!remove) { - err = -EINVAL; - goto out; - } - - err = cfg80211_set_encryption(rdev, dev, false, NULL, remove, - wdev->wext.default_key == -1, - idx, ¶ms); -out: - wiphy_unlock(&rdev->wiphy); + else if (!remove) + return -EINVAL; - return err; + return cfg80211_set_encryption(rdev, dev, false, NULL, remove, + wdev->wext.default_key == -1, + idx, ¶ms); } static int cfg80211_wext_siwencodeext(struct net_device *dev, @@ -659,7 +639,6 @@ static int cfg80211_wext_siwencodeext(struct net_device *dev, bool remove = false; struct key_params params; u32 cipher; - int ret; if (wdev->iftype != NL80211_IFTYPE_STATION && wdev->iftype != NL80211_IFTYPE_ADHOC) @@ -734,16 +713,13 @@ static int cfg80211_wext_siwencodeext(struct net_device *dev, params.seq_len = 6; } - wiphy_lock(wdev->wiphy); - ret = cfg80211_set_encryption( - rdev, dev, - !(ext->ext_flags & IW_ENCODE_EXT_GROUP_KEY), - addr, remove, - ext->ext_flags & IW_ENCODE_EXT_SET_TX_KEY, - idx, ¶ms); - wiphy_unlock(wdev->wiphy); + guard(wiphy)(wdev->wiphy); - return ret; + return cfg80211_set_encryption(rdev, dev, + !(ext->ext_flags & IW_ENCODE_EXT_GROUP_KEY), + addr, remove, + ext->ext_flags & IW_ENCODE_EXT_SET_TX_KEY, + idx, ¶ms); } static int cfg80211_wext_giwencode(struct net_device *dev, @@ -794,61 +770,41 @@ static int cfg80211_wext_siwfreq(struct net_device *dev, struct cfg80211_chan_def chandef = { .width = NL80211_CHAN_WIDTH_20_NOHT, }; - int freq, ret; + int freq; - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_STATION: - ret = cfg80211_mgd_wext_siwfreq(dev, info, wextfreq, extra); - break; + return cfg80211_mgd_wext_siwfreq(dev, info, wextfreq, extra); case NL80211_IFTYPE_ADHOC: - ret = cfg80211_ibss_wext_siwfreq(dev, info, wextfreq, extra); - break; + return cfg80211_ibss_wext_siwfreq(dev, info, wextfreq, extra); case NL80211_IFTYPE_MONITOR: freq = cfg80211_wext_freq(wextfreq); - if (freq < 0) { - ret = freq; - break; - } - if (freq == 0) { - ret = -EINVAL; - break; - } + if (freq < 0) + return freq; + if (freq == 0) + return -EINVAL; + chandef.center_freq1 = freq; chandef.chan = ieee80211_get_channel(&rdev->wiphy, freq); - if (!chandef.chan) { - ret = -EINVAL; - break; - } - ret = cfg80211_set_monitor_channel(rdev, dev, &chandef); - break; + if (!chandef.chan) + return -EINVAL; + return cfg80211_set_monitor_channel(rdev, dev, &chandef); case NL80211_IFTYPE_MESH_POINT: freq = cfg80211_wext_freq(wextfreq); - if (freq < 0) { - ret = freq; - break; - } - if (freq == 0) { - ret = -EINVAL; - break; - } + if (freq < 0) + return freq; + if (freq == 0) + return -EINVAL; chandef.center_freq1 = freq; chandef.chan = ieee80211_get_channel(&rdev->wiphy, freq); - if (!chandef.chan) { - ret = -EINVAL; - break; - } - ret = cfg80211_set_mesh_channel(rdev, wdev, &chandef); - break; + if (!chandef.chan) + return -EINVAL; + return cfg80211_set_mesh_channel(rdev, wdev, &chandef); default: - ret = -EOPNOTSUPP; - break; + return -EOPNOTSUPP; } - - wiphy_unlock(&rdev->wiphy); - - return ret; } static int cfg80211_wext_giwfreq(struct net_device *dev, @@ -861,35 +817,26 @@ static int cfg80211_wext_giwfreq(struct net_device *dev, struct cfg80211_chan_def chandef = {}; int ret; - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + switch (wdev->iftype) { case NL80211_IFTYPE_STATION: - ret = cfg80211_mgd_wext_giwfreq(dev, info, freq, extra); - break; + return cfg80211_mgd_wext_giwfreq(dev, info, freq, extra); case NL80211_IFTYPE_ADHOC: - ret = cfg80211_ibss_wext_giwfreq(dev, info, freq, extra); - break; + return cfg80211_ibss_wext_giwfreq(dev, info, freq, extra); case NL80211_IFTYPE_MONITOR: - if (!rdev->ops->get_channel) { - ret = -EINVAL; - break; - } + if (!rdev->ops->get_channel) + return -EINVAL; ret = rdev_get_channel(rdev, wdev, 0, &chandef); if (ret) - break; + return ret; freq->m = chandef.chan->center_freq; freq->e = 6; - ret = 0; - break; + return ret; default: - ret = -EINVAL; - break; + return -EINVAL; } - - wiphy_unlock(&rdev->wiphy); - - return ret; } static int cfg80211_wext_siwtxpower(struct net_device *dev, @@ -900,7 +847,6 @@ static int cfg80211_wext_siwtxpower(struct net_device *dev, struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); enum nl80211_tx_power_setting type; int dbm = 0; - int ret; if ((data->txpower.flags & IW_TXPOW_TYPE) != IW_TXPOW_DBM) return -EINVAL; @@ -942,11 +888,9 @@ static int cfg80211_wext_siwtxpower(struct net_device *dev, return 0; } - wiphy_lock(&rdev->wiphy); - ret = rdev_set_tx_power(rdev, wdev, type, DBM_TO_MBM(dbm)); - wiphy_unlock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); - return ret; + return rdev_set_tx_power(rdev, wdev, type, DBM_TO_MBM(dbm)); } static int cfg80211_wext_giwtxpower(struct net_device *dev, @@ -965,9 +909,9 @@ static int cfg80211_wext_giwtxpower(struct net_device *dev, if (!rdev->ops->get_tx_power) return -EOPNOTSUPP; - wiphy_lock(&rdev->wiphy); - err = rdev_get_tx_power(rdev, wdev, &val); - wiphy_unlock(&rdev->wiphy); + scoped_guard(wiphy, &rdev->wiphy) { + err = rdev_get_tx_power(rdev, wdev, &val); + } if (err) return err; @@ -1209,9 +1153,9 @@ static int cfg80211_wext_siwpower(struct net_device *dev, timeout = wrq->value / 1000; } - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + err = rdev_set_power_mgmt(rdev, dev, ps, timeout); - wiphy_unlock(&rdev->wiphy); if (err) return err; @@ -1244,8 +1188,8 @@ static int cfg80211_wext_siwrate(struct net_device *dev, struct cfg80211_bitrate_mask mask; u32 fixed, maxrate; struct ieee80211_supported_band *sband; - int band, ridx, ret; bool match = false; + int band, ridx; if (!rdev->ops->set_bitrate_mask) return -EOPNOTSUPP; @@ -1283,14 +1227,12 @@ static int cfg80211_wext_siwrate(struct net_device *dev, if (!match) return -EINVAL; - wiphy_lock(&rdev->wiphy); - if (dev->ieee80211_ptr->valid_links) - ret = -EOPNOTSUPP; - else - ret = rdev_set_bitrate_mask(rdev, dev, 0, NULL, &mask); - wiphy_unlock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); - return ret; + if (dev->ieee80211_ptr->valid_links) + return -EOPNOTSUPP; + + return rdev_set_bitrate_mask(rdev, dev, 0, NULL, &mask); } static int cfg80211_wext_giwrate(struct net_device *dev, @@ -1319,9 +1261,9 @@ static int cfg80211_wext_giwrate(struct net_device *dev, if (err) return err; - wiphy_lock(&rdev->wiphy); - err = rdev_get_station(rdev, dev, addr, &sinfo); - wiphy_unlock(&rdev->wiphy); + scoped_guard(wiphy, &rdev->wiphy) { + err = rdev_get_station(rdev, dev, addr, &sinfo); + } if (err) return err; @@ -1420,23 +1362,17 @@ static int cfg80211_wext_siwap(struct net_device *dev, struct sockaddr *ap_addr = &wrqu->ap_addr; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); - int ret; - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: - ret = cfg80211_ibss_wext_siwap(dev, info, ap_addr, extra); - break; + return cfg80211_ibss_wext_siwap(dev, info, ap_addr, extra); case NL80211_IFTYPE_STATION: - ret = cfg80211_mgd_wext_siwap(dev, info, ap_addr, extra); - break; + return cfg80211_mgd_wext_siwap(dev, info, ap_addr, extra); default: - ret = -EOPNOTSUPP; - break; + return -EOPNOTSUPP; } - wiphy_unlock(&rdev->wiphy); - - return ret; } static int cfg80211_wext_giwap(struct net_device *dev, @@ -1446,23 +1382,17 @@ static int cfg80211_wext_giwap(struct net_device *dev, struct sockaddr *ap_addr = &wrqu->ap_addr; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); - int ret; - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: - ret = cfg80211_ibss_wext_giwap(dev, info, ap_addr, extra); - break; + return cfg80211_ibss_wext_giwap(dev, info, ap_addr, extra); case NL80211_IFTYPE_STATION: - ret = cfg80211_mgd_wext_giwap(dev, info, ap_addr, extra); - break; + return cfg80211_mgd_wext_giwap(dev, info, ap_addr, extra); default: - ret = -EOPNOTSUPP; - break; + return -EOPNOTSUPP; } - wiphy_unlock(&rdev->wiphy); - - return ret; } static int cfg80211_wext_siwessid(struct net_device *dev, @@ -1472,23 +1402,17 @@ static int cfg80211_wext_siwessid(struct net_device *dev, struct iw_point *data = &wrqu->data; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); - int ret; - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: - ret = cfg80211_ibss_wext_siwessid(dev, info, data, ssid); - break; + return cfg80211_ibss_wext_siwessid(dev, info, data, ssid); case NL80211_IFTYPE_STATION: - ret = cfg80211_mgd_wext_siwessid(dev, info, data, ssid); - break; + return cfg80211_mgd_wext_siwessid(dev, info, data, ssid); default: - ret = -EOPNOTSUPP; - break; + return -EOPNOTSUPP; } - wiphy_unlock(&rdev->wiphy); - - return ret; } static int cfg80211_wext_giwessid(struct net_device *dev, @@ -1498,26 +1422,20 @@ static int cfg80211_wext_giwessid(struct net_device *dev, struct iw_point *data = &wrqu->data; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); - int ret; data->flags = 0; data->length = 0; - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: - ret = cfg80211_ibss_wext_giwessid(dev, info, data, ssid); - break; + return cfg80211_ibss_wext_giwessid(dev, info, data, ssid); case NL80211_IFTYPE_STATION: - ret = cfg80211_mgd_wext_giwessid(dev, info, data, ssid); - break; + return cfg80211_mgd_wext_giwessid(dev, info, data, ssid); default: - ret = -EOPNOTSUPP; - break; + return -EOPNOTSUPP; } - wiphy_unlock(&rdev->wiphy); - - return ret; } static int cfg80211_wext_siwpmksa(struct net_device *dev, @@ -1528,7 +1446,6 @@ static int cfg80211_wext_siwpmksa(struct net_device *dev, struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_pmksa cfg_pmksa; struct iw_pmksa *pmksa = (struct iw_pmksa *)extra; - int ret; memset(&cfg_pmksa, 0, sizeof(struct cfg80211_pmksa)); @@ -1538,39 +1455,27 @@ static int cfg80211_wext_siwpmksa(struct net_device *dev, cfg_pmksa.bssid = pmksa->bssid.sa_data; cfg_pmksa.pmkid = pmksa->pmkid; - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + switch (pmksa->cmd) { case IW_PMKSA_ADD: - if (!rdev->ops->set_pmksa) { - ret = -EOPNOTSUPP; - break; - } + if (!rdev->ops->set_pmksa) + return -EOPNOTSUPP; - ret = rdev_set_pmksa(rdev, dev, &cfg_pmksa); - break; + return rdev_set_pmksa(rdev, dev, &cfg_pmksa); case IW_PMKSA_REMOVE: - if (!rdev->ops->del_pmksa) { - ret = -EOPNOTSUPP; - break; - } + if (!rdev->ops->del_pmksa) + return -EOPNOTSUPP; - ret = rdev_del_pmksa(rdev, dev, &cfg_pmksa); - break; + return rdev_del_pmksa(rdev, dev, &cfg_pmksa); case IW_PMKSA_FLUSH: - if (!rdev->ops->flush_pmksa) { - ret = -EOPNOTSUPP; - break; - } + if (!rdev->ops->flush_pmksa) + return -EOPNOTSUPP; - ret = rdev_flush_pmksa(rdev, dev); - break; + return rdev_flush_pmksa(rdev, dev); default: - ret = -EOPNOTSUPP; - break; + return -EOPNOTSUPP; } - wiphy_unlock(&rdev->wiphy); - - return ret; } static const iw_handler cfg80211_handlers[] = { diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c index 8edd9ada69d0b..573b6b15a446e 100644 --- a/net/wireless/wext-sme.c +++ b/net/wireless/wext-sme.c @@ -302,8 +302,8 @@ int cfg80211_wext_siwgenie(struct net_device *dev, struct iw_point *data = &wrqu->data; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + int ie_len = data->length; u8 *ie = extra; - int ie_len = data->length, err; if (wdev->iftype != NL80211_IFTYPE_STATION) return -EOPNOTSUPP; @@ -311,39 +311,31 @@ int cfg80211_wext_siwgenie(struct net_device *dev, if (!ie_len) ie = NULL; - wiphy_lock(wdev->wiphy); + guard(wiphy)(wdev->wiphy); /* no change */ - err = 0; if (wdev->wext.ie_len == ie_len && memcmp(wdev->wext.ie, ie, ie_len) == 0) - goto out; + return 0; if (ie_len) { ie = kmemdup(extra, ie_len, GFP_KERNEL); - if (!ie) { - err = -ENOMEM; - goto out; - } - } else + if (!ie) + return -ENOMEM; + } else { ie = NULL; + } kfree(wdev->wext.ie); wdev->wext.ie = ie; wdev->wext.ie_len = ie_len; - if (wdev->conn) { - err = cfg80211_disconnect(rdev, dev, - WLAN_REASON_DEAUTH_LEAVING, false); - if (err) - goto out; - } + if (wdev->conn) + return cfg80211_disconnect(rdev, dev, + WLAN_REASON_DEAUTH_LEAVING, false); /* userspace better not think we'll reconnect */ - err = 0; - out: - wiphy_unlock(wdev->wiphy); - return err; + return 0; } int cfg80211_wext_siwmlme(struct net_device *dev, @@ -353,7 +345,6 @@ int cfg80211_wext_siwmlme(struct net_device *dev, struct wireless_dev *wdev = dev->ieee80211_ptr; struct iw_mlme *mlme = (struct iw_mlme *)extra; struct cfg80211_registered_device *rdev; - int err; if (!wdev) return -EOPNOTSUPP; @@ -366,17 +357,13 @@ int cfg80211_wext_siwmlme(struct net_device *dev, if (mlme->addr.sa_family != ARPHRD_ETHER) return -EINVAL; - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + switch (mlme->cmd) { case IW_MLME_DEAUTH: case IW_MLME_DISASSOC: - err = cfg80211_disconnect(rdev, dev, mlme->reason_code, true); - break; + return cfg80211_disconnect(rdev, dev, mlme->reason_code, true); default: - err = -EOPNOTSUPP; - break; + return -EOPNOTSUPP; } - wiphy_unlock(&rdev->wiphy); - - return err; } From 8e66f6c6738e5b458345cd5f75ef6da035d95599 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 22 Nov 2024 09:42:26 +0100 Subject: [PATCH 0117/1450] wifi: mac80211: use wiphy guard The wiphy guard simplifies some code here, so use it. Reviewed-by: Jeff Johnson Signed-off-by: Johannes Berg Link: https://patch.msgid.link/20241122094225.cea65b2d2fd4.Icc168c4bbeddec98ea096aee9077211a7b88b69e@changeid Signed-off-by: Johannes Berg --- net/mac80211/debugfs.c | 4 ++-- net/mac80211/ethtool.c | 18 ++++++------------ net/mac80211/iface.c | 25 +++++++++---------------- 3 files changed, 17 insertions(+), 30 deletions(-) diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index be2e486907f94..bf0a2902d93c6 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -284,7 +284,8 @@ static ssize_t aql_txq_limit_write(struct file *file, q_limit_low_old = local->aql_txq_limit_low[ac]; q_limit_high_old = local->aql_txq_limit_high[ac]; - wiphy_lock(local->hw.wiphy); + guard(wiphy)(local->hw.wiphy); + local->aql_txq_limit_low[ac] = q_limit_low; local->aql_txq_limit_high[ac] = q_limit_high; @@ -296,7 +297,6 @@ static ssize_t aql_txq_limit_write(struct file *file, sta->airtime[ac].aql_limit_high = q_limit_high; } } - wiphy_unlock(local->hw.wiphy); return count; } diff --git a/net/mac80211/ethtool.c b/net/mac80211/ethtool.c index 99f6174a9d696..069aa05139cd4 100644 --- a/net/mac80211/ethtool.c +++ b/net/mac80211/ethtool.c @@ -19,16 +19,13 @@ static int ieee80211_set_ringparam(struct net_device *dev, struct netlink_ext_ack *extack) { struct ieee80211_local *local = wiphy_priv(dev->ieee80211_ptr->wiphy); - int ret; if (rp->rx_mini_pending != 0 || rp->rx_jumbo_pending != 0) return -EINVAL; - wiphy_lock(local->hw.wiphy); - ret = drv_set_ringparam(local, rp->tx_pending, rp->rx_pending); - wiphy_unlock(local->hw.wiphy); + guard(wiphy)(local->hw.wiphy); - return ret; + return drv_set_ringparam(local, rp->tx_pending, rp->rx_pending); } static void ieee80211_get_ringparam(struct net_device *dev, @@ -40,10 +37,10 @@ static void ieee80211_get_ringparam(struct net_device *dev, memset(rp, 0, sizeof(*rp)); - wiphy_lock(local->hw.wiphy); + guard(wiphy)(local->hw.wiphy); + drv_get_ringparam(local, &rp->tx_pending, &rp->tx_max_pending, &rp->rx_pending, &rp->rx_max_pending); - wiphy_unlock(local->hw.wiphy); } static const char ieee80211_gstrings_sta_stats[][ETH_GSTRING_LEN] = { @@ -109,7 +106,7 @@ static void ieee80211_get_stats(struct net_device *dev, * network device. */ - wiphy_lock(local->hw.wiphy); + guard(wiphy)(local->hw.wiphy); if (sdata->vif.type == NL80211_IFTYPE_STATION) { sta = sta_info_get_bss(sdata, sdata->deflink.u.mgd.bssid); @@ -205,13 +202,10 @@ static void ieee80211_get_stats(struct net_device *dev, else data[i++] = -1LL; - if (WARN_ON(i != STA_STATS_LEN)) { - wiphy_unlock(local->hw.wiphy); + if (WARN_ON(i != STA_STATS_LEN)) return; - } drv_get_et_stats(sdata, stats, &(data[STA_STATS_LEN])); - wiphy_unlock(local->hw.wiphy); } static void ieee80211_get_strings(struct net_device *dev, u32 sset, u8 *data) diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index a8fbedd530f45..32aaf3856ccf3 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -300,7 +300,6 @@ static int ieee80211_change_mac(struct net_device *dev, void *addr) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; - int ret; /* * This happens during unregistration if there's a bond device @@ -310,11 +309,9 @@ static int ieee80211_change_mac(struct net_device *dev, void *addr) if (!dev->ieee80211_ptr->registered) return 0; - wiphy_lock(local->hw.wiphy); - ret = _ieee80211_change_mac(sdata, addr); - wiphy_unlock(local->hw.wiphy); + guard(wiphy)(local->hw.wiphy); - return ret; + return _ieee80211_change_mac(sdata, addr); } static inline int identical_mac_addr_allowed(int type1, int type2) @@ -450,16 +447,13 @@ static int ieee80211_open(struct net_device *dev) if (!is_valid_ether_addr(dev->dev_addr)) return -EADDRNOTAVAIL; - wiphy_lock(sdata->local->hw.wiphy); + guard(wiphy)(sdata->local->hw.wiphy); + err = ieee80211_check_concurrent_iface(sdata, sdata->vif.type); if (err) - goto out; + return err; - err = ieee80211_do_open(&sdata->wdev, true); -out: - wiphy_unlock(sdata->local->hw.wiphy); - - return err; + return ieee80211_do_open(&sdata->wdev, true); } static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_down) @@ -780,11 +774,11 @@ static int ieee80211_stop(struct net_device *dev) ieee80211_stop_mbssid(sdata); } - wiphy_lock(sdata->local->hw.wiphy); + guard(wiphy)(sdata->local->hw.wiphy); + wiphy_work_cancel(sdata->local->hw.wiphy, &sdata->activate_links_work); ieee80211_do_stop(sdata, true); - wiphy_unlock(sdata->local->hw.wiphy); return 0; } @@ -2282,7 +2276,7 @@ void ieee80211_remove_interfaces(struct ieee80211_local *local) */ cfg80211_shutdown_all_interfaces(local->hw.wiphy); - wiphy_lock(local->hw.wiphy); + guard(wiphy)(local->hw.wiphy); WARN(local->open_count, "%s: open count remains %d\n", wiphy_name(local->hw.wiphy), local->open_count); @@ -2312,7 +2306,6 @@ void ieee80211_remove_interfaces(struct ieee80211_local *local) if (!netdev) kfree(sdata); } - wiphy_unlock(local->hw.wiphy); } static int netdev_notify(struct notifier_block *nb, From 13c4f7714c6a1ecf748a2f22099447c14fe6ed8c Mon Sep 17 00:00:00 2001 From: Zichen Xie Date: Fri, 15 Nov 2024 00:38:36 -0600 Subject: [PATCH 0118/1450] wifi: cfg80211: tests: Fix potential NULL dereference in test_cfg80211_parse_colocated_ap() kunit_kzalloc() may return NULL, dereferencing it without NULL check may lead to NULL dereference. Add a NULL check for ies. Fixes: 45d43937a44c ("wifi: cfg80211: add a kunit test for 6 GHz colocated AP parsing") Signed-off-by: Zichen Xie Link: https://patch.msgid.link/20241115063835.5888-1-zichenxie0106@gmail.com Signed-off-by: Johannes Berg --- net/wireless/tests/scan.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/wireless/tests/scan.c b/net/wireless/tests/scan.c index 9f458be716595..79a99cf5e8922 100644 --- a/net/wireless/tests/scan.c +++ b/net/wireless/tests/scan.c @@ -810,6 +810,8 @@ static void test_cfg80211_parse_colocated_ap(struct kunit *test) skb_put_data(input, "123", 3); ies = kunit_kzalloc(test, struct_size(ies, data, input->len), GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, ies); + ies->len = input->len; memcpy(ies->data, input->data, input->len); From 4e3a841c47bbf985782a9f761d57f2f999e1d31b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Jos=C3=A9=20Arboleda?= Date: Mon, 19 Aug 2024 16:45:20 -0500 Subject: [PATCH 0119/1450] wifi: iwlwifi: mvm: Replace spaces for tabs in iwl_mvm_vendor_events_idx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch replaces spaces with tabs in the affected source files to adhere to the Linux kernel coding style guidelines. This change is purely stylistic and do not affect the functionality of the code. Signed-off-by: Juan José Arboleda Link: https://patch.msgid.link/715c74c74b336bed81e92e1336bd1a6ddb7b90ff.1724103043.git.soyjuanarbol@gmail.com Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/vendor-cmd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/vendor-cmd.c b/drivers/net/wireless/intel/iwlwifi/mvm/vendor-cmd.c index 080a1587caa56..0f7fa6032c663 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/vendor-cmd.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/vendor-cmd.c @@ -104,9 +104,9 @@ static const struct wiphy_vendor_command iwl_mvm_vendor_commands[] = { }; enum iwl_mvm_vendor_events_idx { - /* 0x0 - 0x3 are deprecated */ - IWL_MVM_VENDOR_EVENT_IDX_ROAMING_FORBIDDEN = 4, - NUM_IWL_MVM_VENDOR_EVENT_IDX + /* 0x0 - 0x3 are deprecated */ + IWL_MVM_VENDOR_EVENT_IDX_ROAMING_FORBIDDEN = 4, + NUM_IWL_MVM_VENDOR_EVENT_IDX }; static const struct nl80211_vendor_cmd_info From 7a53af85d3bbdbe06cd47b81a6d99a04dc0a3963 Mon Sep 17 00:00:00 2001 From: Rameshkumar Sundaram Date: Mon, 25 Nov 2024 14:02:16 +0530 Subject: [PATCH 0120/1450] wifi: cfg80211: send MLO links tx power info in GET_INTERFACE Currently, TX power is reported on interface/wdev level as part of NL80211_CMD_GET_INTERFACE. With MLO, Multiple links can be part of an interface/wdev and hence its necessary to report the TX power of each link. Add support to send tx power for all valid links of an MLD as part of NL80211_CMD_GET_INTERFACE request. As far as userspace is concerned, there is no behavioral change for Non-ML Interfaces. For ML interfaces, userspace should fetch TX power that is nested inside NL80211_ATTR_MLO_LINKS, similar to how channel info(NL80211_ATTR_WIPHY_FREQ) is fetched. Co-developed-by: Aaradhana Sahu Signed-off-by: Aaradhana Sahu Signed-off-by: Rameshkumar Sundaram Link: https://patch.msgid.link/20241125083217.216095-2-quic_ramess@quicinc.com Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath6kl/cfg80211.c | 1 + .../broadcom/brcm80211/brcmfmac/cfg80211.c | 2 +- .../net/wireless/marvell/mwifiex/cfg80211.c | 2 +- .../wireless/microchip/wilc1000/cfg80211.c | 2 +- .../net/wireless/quantenna/qtnfmac/cfg80211.c | 2 +- .../staging/rtl8723bs/os_dep/ioctl_cfg80211.c | 3 +- include/net/cfg80211.h | 2 +- net/mac80211/cfg.c | 1 + net/wireless/nl80211.c | 13 +++++- net/wireless/rdev-ops.h | 7 +-- net/wireless/trace.h | 44 +++++++++---------- net/wireless/wext-compat.c | 2 +- 12 files changed, 47 insertions(+), 34 deletions(-) diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c index 61b2e3f15f0e9..72ce321f2a77e 100644 --- a/drivers/net/wireless/ath/ath6kl/cfg80211.c +++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c @@ -1441,6 +1441,7 @@ static int ath6kl_cfg80211_set_txpower(struct wiphy *wiphy, static int ath6kl_cfg80211_get_txpower(struct wiphy *wiphy, struct wireless_dev *wdev, + unsigned int link_id, int *dbm) { struct ath6kl *ar = (struct ath6kl *)wiphy_priv(wiphy); diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index 297a7c738c016..689e779fe00fc 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -2676,7 +2676,7 @@ brcmf_cfg80211_set_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, static s32 brcmf_cfg80211_get_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, - s32 *dbm) + unsigned int link_id, s32 *dbm) { struct brcmf_cfg80211_info *cfg = wiphy_to_cfg(wiphy); struct brcmf_cfg80211_vif *vif = wdev_to_vif(wdev); diff --git a/drivers/net/wireless/marvell/mwifiex/cfg80211.c b/drivers/net/wireless/marvell/mwifiex/cfg80211.c index fca3eea7ee842..a099fdaafa45d 100644 --- a/drivers/net/wireless/marvell/mwifiex/cfg80211.c +++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c @@ -410,7 +410,7 @@ mwifiex_cfg80211_set_tx_power(struct wiphy *wiphy, static int mwifiex_cfg80211_get_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, - int *dbm) + unsigned int link_id, int *dbm) { struct mwifiex_adapter *adapter = mwifiex_cfg80211_get_adapter(wiphy); struct mwifiex_private *priv = mwifiex_get_priv(adapter, diff --git a/drivers/net/wireless/microchip/wilc1000/cfg80211.c b/drivers/net/wireless/microchip/wilc1000/cfg80211.c index e96736cc7259b..e7aa0f9919232 100644 --- a/drivers/net/wireless/microchip/wilc1000/cfg80211.c +++ b/drivers/net/wireless/microchip/wilc1000/cfg80211.c @@ -1669,7 +1669,7 @@ static int set_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, } static int get_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, - int *dbm) + unsigned int link_id, int *dbm) { int ret; struct wilc_vif *vif = netdev_priv(wdev->netdev); diff --git a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c index 8b97accf6638f..0b22825283422 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c +++ b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c @@ -881,7 +881,7 @@ static int qtnf_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev, } static int qtnf_get_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, - int *dbm) + unsigned int link_id, int *dbm) { struct qtnf_vif *vif = qtnf_netdev_get_priv(wdev->netdev); int ret; diff --git a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c index c053ee9c1361c..7fcc46a0bb48f 100644 --- a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c +++ b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c @@ -1802,7 +1802,8 @@ static int cfg80211_rtw_set_txpower(struct wiphy *wiphy, } static int cfg80211_rtw_get_txpower(struct wiphy *wiphy, - struct wireless_dev *wdev, int *dbm) + struct wireless_dev *wdev, + unsigned int link_id, int *dbm) { *dbm = (12); diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 63e79a22a214f..0a48f47a77dc3 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4733,7 +4733,7 @@ struct cfg80211_ops { int (*set_tx_power)(struct wiphy *wiphy, struct wireless_dev *wdev, enum nl80211_tx_power_setting type, int mbm); int (*get_tx_power)(struct wiphy *wiphy, struct wireless_dev *wdev, - int *dbm); + unsigned int link_id, int *dbm); void (*rfkill_poll)(struct wiphy *wiphy); diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 61a824ec33da3..b2410a9135560 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3190,6 +3190,7 @@ static int ieee80211_set_tx_power(struct wiphy *wiphy, static int ieee80211_get_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, + unsigned int link_id, int *dbm) { struct ieee80211_local *local = wiphy_priv(wiphy); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 9590f9bd2ec0c..793d910347e3b 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3980,10 +3980,10 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag goto nla_put_failure; } - if (rdev->ops->get_tx_power) { + if (rdev->ops->get_tx_power && !wdev->valid_links) { int dbm, ret; - ret = rdev_get_tx_power(rdev, wdev, &dbm); + ret = rdev_get_tx_power(rdev, wdev, 0, &dbm); if (ret == 0 && nla_put_u32(msg, NL80211_ATTR_WIPHY_TX_POWER_LEVEL, DBM_TO_MBM(dbm))) @@ -4052,6 +4052,15 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag if (ret == 0 && nl80211_send_chandef(msg, &chandef)) goto nla_put_failure; + if (rdev->ops->get_tx_power) { + int dbm, ret; + + ret = rdev_get_tx_power(rdev, wdev, link_id, &dbm); + if (ret == 0 && + nla_put_u32(msg, NL80211_ATTR_WIPHY_TX_POWER_LEVEL, + DBM_TO_MBM(dbm))) + goto nla_put_failure; + } nla_nest_end(msg, link); } diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index adb6105bbb7d1..8f2aa7e76c0a6 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -600,11 +600,12 @@ static inline int rdev_set_tx_power(struct cfg80211_registered_device *rdev, } static inline int rdev_get_tx_power(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev, int *dbm) + struct wireless_dev *wdev, unsigned int link_id, + int *dbm) { int ret; - trace_rdev_get_tx_power(&rdev->wiphy, wdev); - ret = rdev->ops->get_tx_power(&rdev->wiphy, wdev, dbm); + trace_rdev_get_tx_power(&rdev->wiphy, wdev, link_id); + ret = rdev->ops->get_tx_power(&rdev->wiphy, wdev, link_id, dbm); trace_rdev_return_int_int(&rdev->wiphy, ret, *dbm); return ret; } diff --git a/net/wireless/trace.h b/net/wireless/trace.h index d5c9bb614fa6d..a57210c8087c2 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -1690,9 +1690,28 @@ TRACE_EVENT(rdev_set_wiphy_params, WIPHY_PR_ARG, __entry->changed) ); -DEFINE_EVENT(wiphy_wdev_evt, rdev_get_tx_power, - TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev), - TP_ARGS(wiphy, wdev) +DECLARE_EVENT_CLASS(wiphy_wdev_link_evt, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, + unsigned int link_id), + TP_ARGS(wiphy, wdev, link_id), + TP_STRUCT__entry( + WIPHY_ENTRY + WDEV_ENTRY + __field(unsigned int, link_id) + ), + TP_fast_assign( + WIPHY_ASSIGN; + WDEV_ASSIGN; + __entry->link_id = link_id; + ), + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", link_id: %u", + WIPHY_PR_ARG, WDEV_PR_ARG, __entry->link_id) +); + +DEFINE_EVENT(wiphy_wdev_link_evt, rdev_get_tx_power, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, + unsigned int link_id), + TP_ARGS(wiphy, wdev, link_id) ); TRACE_EVENT(rdev_set_tx_power, @@ -2192,25 +2211,6 @@ TRACE_EVENT(rdev_set_noack_map, TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", noack_map: %u", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->noack_map) ); - -DECLARE_EVENT_CLASS(wiphy_wdev_link_evt, - TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, - unsigned int link_id), - TP_ARGS(wiphy, wdev, link_id), - TP_STRUCT__entry( - WIPHY_ENTRY - WDEV_ENTRY - __field(unsigned int, link_id) - ), - TP_fast_assign( - WIPHY_ASSIGN; - WDEV_ASSIGN; - __entry->link_id = link_id; - ), - TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", link_id: %u", - WIPHY_PR_ARG, WDEV_PR_ARG, __entry->link_id) -); - DEFINE_EVENT(wiphy_wdev_link_evt, rdev_get_channel, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, unsigned int link_id), diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 687f93664d1f1..a74b1afc594e5 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -910,7 +910,7 @@ static int cfg80211_wext_giwtxpower(struct net_device *dev, return -EOPNOTSUPP; scoped_guard(wiphy, &rdev->wiphy) { - err = rdev_get_tx_power(rdev, wdev, &val); + err = rdev_get_tx_power(rdev, wdev, 0, &val); } if (err) return err; From 24dab555ad5951824e3fb6b665aaca84ac69dd12 Mon Sep 17 00:00:00 2001 From: Rameshkumar Sundaram Date: Mon, 25 Nov 2024 14:02:17 +0530 Subject: [PATCH 0121/1450] wifi: mac80211: get tx power per link ML interfaces can have multiple affiliated links to it and hence there is a need to report tx power of specified link rather deflink. Add changes to report tx power of requested link from mac80211, also pass link id as an argument in get_tx_power op so that supported drivers can use it to report link's tx power. Co-developed-by: Aaradhana Sahu Signed-off-by: Aaradhana Sahu Signed-off-by: Rameshkumar Sundaram Link: https://patch.msgid.link/20241125083217.216095-3-quic_ramess@quicinc.com Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath11k/mac.c | 1 + drivers/net/wireless/ath/ath9k/main.c | 2 +- drivers/net/wireless/mediatek/mt76/mac80211.c | 2 +- drivers/net/wireless/mediatek/mt76/mt76.h | 2 +- include/net/mac80211.h | 2 +- net/mac80211/cfg.c | 15 +++++++++++---- net/mac80211/driver-ops.h | 7 ++++--- net/mac80211/trace.h | 10 ++++++---- 8 files changed, 26 insertions(+), 15 deletions(-) diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c index e6acbff067496..7e75a9b13ef92 100644 --- a/drivers/net/wireless/ath/ath11k/mac.c +++ b/drivers/net/wireless/ath/ath11k/mac.c @@ -9356,6 +9356,7 @@ static int ath11k_fw_stats_request(struct ath11k *ar, static int ath11k_mac_op_get_txpower(struct ieee80211_hw *hw, struct ieee80211_vif *vif, + unsigned int link_id, int *dbm) { struct ath11k *ar = hw->priv; diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c index b92c89dad8dea..2f137856a8230 100644 --- a/drivers/net/wireless/ath/ath9k/main.c +++ b/drivers/net/wireless/ath/ath9k/main.c @@ -2767,7 +2767,7 @@ void ath9k_fill_chanctx_ops(void) #endif static int ath9k_get_txpower(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - int *dbm) + unsigned int link_id, int *dbm) { struct ath_softc *sc = hw->priv; struct ath_vif *avp = (void *)vif->drv_priv; diff --git a/drivers/net/wireless/mediatek/mt76/mac80211.c b/drivers/net/wireless/mediatek/mt76/mac80211.c index 9d5561f441347..7fbce5e757df5 100644 --- a/drivers/net/wireless/mediatek/mt76/mac80211.c +++ b/drivers/net/wireless/mediatek/mt76/mac80211.c @@ -1596,7 +1596,7 @@ void mt76_wcid_cleanup(struct mt76_dev *dev, struct mt76_wcid *wcid) EXPORT_SYMBOL_GPL(mt76_wcid_cleanup); int mt76_get_txpower(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - int *dbm) + unsigned int link_id, int *dbm) { struct mt76_phy *phy = hw->priv; int n_chains = hweight16(phy->chainmask); diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h index 0b75a45ad2e82..ca2dba3ac65d5 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76.h +++ b/drivers/net/wireless/mediatek/mt76/mt76.h @@ -1431,7 +1431,7 @@ void mt76_sta_pre_rcu_remove(struct ieee80211_hw *hw, struct ieee80211_vif *vif, int mt76_get_min_avg_rssi(struct mt76_dev *dev, bool ext_phy); int mt76_get_txpower(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - int *dbm); + unsigned int link_id, int *dbm); int mt76_init_sar_power(struct ieee80211_hw *hw, const struct cfg80211_sar_specs *sar); int mt76_get_sar_power(struct mt76_phy *phy, diff --git a/include/net/mac80211.h b/include/net/mac80211.h index a97c9f85ae9a5..5ce4dfa3fba55 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -4759,7 +4759,7 @@ struct ieee80211_ops { u32 (*get_expected_throughput)(struct ieee80211_hw *hw, struct ieee80211_sta *sta); int (*get_txpower)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - int *dbm); + unsigned int link_id, int *dbm); int (*tdls_channel_switch)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index b2410a9135560..2fa594fb6c1a6 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3195,15 +3195,22 @@ static int ieee80211_get_tx_power(struct wiphy *wiphy, { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); + struct ieee80211_link_data *link_data; if (local->ops->get_txpower && (sdata->flags & IEEE80211_SDATA_IN_DRIVER)) - return drv_get_txpower(local, sdata, dbm); + return drv_get_txpower(local, sdata, link_id, dbm); - if (local->emulate_chanctx) + if (local->emulate_chanctx) { *dbm = local->hw.conf.power_level; - else - *dbm = sdata->vif.bss_conf.txpower; + } else { + link_data = wiphy_dereference(wiphy, sdata->link[link_id]); + + if (link_data) + *dbm = link_data->conf->txpower; + else + return -ENOLINK; + } /* INT_MIN indicates no power level was set yet */ if (*dbm == INT_MIN) diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index edd1e4d4ad9d2..c64531e0a60e7 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -1273,7 +1273,8 @@ static inline u32 drv_get_expected_throughput(struct ieee80211_local *local, } static inline int drv_get_txpower(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata, int *dbm) + struct ieee80211_sub_if_data *sdata, + unsigned int link_id, int *dbm) { int ret; @@ -1283,8 +1284,8 @@ static inline int drv_get_txpower(struct ieee80211_local *local, if (!local->ops->get_txpower) return -EOPNOTSUPP; - ret = local->ops->get_txpower(&local->hw, &sdata->vif, dbm); - trace_drv_get_txpower(local, sdata, *dbm, ret); + ret = local->ops->get_txpower(&local->hw, &sdata->vif, link_id, dbm); + trace_drv_get_txpower(local, sdata, link_id, *dbm, ret); return ret; } diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 7a4985fc2b16b..dc35fed7e9b09 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -2173,13 +2173,14 @@ DEFINE_EVENT(chanswitch_evt, drv_channel_switch_rx_beacon, TRACE_EVENT(drv_get_txpower, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, - int dbm, int ret), + unsigned int link_id, int dbm, int ret), - TP_ARGS(local, sdata, dbm, ret), + TP_ARGS(local, sdata, link_id, dbm, ret), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY + __field(unsigned int, link_id) __field(int, dbm) __field(int, ret) ), @@ -2187,13 +2188,14 @@ TRACE_EVENT(drv_get_txpower, TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; + __entry->link_id = link_id; __entry->dbm = dbm; __entry->ret = ret; ), TP_printk( - LOCAL_PR_FMT VIF_PR_FMT " dbm:%d ret:%d", - LOCAL_PR_ARG, VIF_PR_ARG, __entry->dbm, __entry->ret + LOCAL_PR_FMT VIF_PR_FMT " link_id:%d dbm:%d ret:%d", + LOCAL_PR_ARG, VIF_PR_ARG, __entry->link_id, __entry->dbm, __entry->ret ) ); From 4f85a3b370e6f1a353cfbbfb5c398590dfeca9d7 Mon Sep 17 00:00:00 2001 From: Dylan Eskew Date: Wed, 13 Nov 2024 06:46:08 -0800 Subject: [PATCH 0122/1450] wifi: mac80211: ethtool: add monitor channel reporting When running ethtool on a monitor interface, the channel wasn't reporting properly. This adds logic to properly report the channel for monitor interfaces in ethtool. Signed-off-by: Dylan Eskew Link: https://patch.msgid.link/20241113144608.334060-1-dylan.eskew@candelatech.com Signed-off-by: Johannes Berg --- net/mac80211/ethtool.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/mac80211/ethtool.c b/net/mac80211/ethtool.c index 069aa05139cd4..42f7ee142ce3f 100644 --- a/net/mac80211/ethtool.c +++ b/net/mac80211/ethtool.c @@ -157,6 +157,10 @@ static void ieee80211_get_stats(struct net_device *dev, chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (chanctx_conf) channel = chanctx_conf->def.chan; + else if (local->open_count > 0 && + local->open_count == local->monitors && + sdata->vif.type == NL80211_IFTYPE_MONITOR) + channel = local->monitor_chanreq.oper.chan; else channel = NULL; rcu_read_unlock(); From b63a95d35f7ff59329257cfd31f990b3b295a242 Mon Sep 17 00:00:00 2001 From: Sathishkumar Muruganandam Date: Tue, 19 Nov 2024 09:57:59 +0530 Subject: [PATCH 0123/1450] wifi: mac80211: add EHT 320 MHz support for mesh Currently, ieee80211_ie_build_he_oper() lacks support for 320 MHz handling (already noted as a TODO). This is because 320 MHz is not included in IEEE 802.11-ax. However, IEEE 802.11-be introduces 320 MHz support and if the chandef indicates a 320 MHz bandwidth and is used directly as it is, it will result in an incorrect HE Operation Information Element. In order to support EHT 320 MHz, HE Operation Element should indicate bandwidth as 160 MHz only. In EHT Operation IE, the correct bandwidth will be present. Devices capable of EHT can parse EHT Information Element and connect in 320 MHz and other HE capable devices can parse HE and can connect in 160 MHz. Add support to downgrade the bandwidth in ieee80211_ie_build_he_oper() during 320 MHz operation and advertise it. Signed-off-by: Sathishkumar Muruganandam Signed-off-by: Aditya Kumar Singh Link: https://patch.msgid.link/20241119-mesh_320mhz_support-v1-1-f9463338d584@quicinc.com Signed-off-by: Johannes Berg --- net/mac80211/util.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/net/mac80211/util.c b/net/mac80211/util.c index a4e1301cc999d..c88ce537aaa7e 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -2762,6 +2762,7 @@ u8 *ieee80211_ie_build_he_oper(u8 *pos, const struct cfg80211_chan_def *chandef) { struct ieee80211_he_operation *he_oper; struct ieee80211_he_6ghz_oper *he_6ghz_op; + struct cfg80211_chan_def he_chandef; u32 he_oper_params; u8 ie_len = 1 + sizeof(struct ieee80211_he_operation); @@ -2793,27 +2794,33 @@ u8 *ieee80211_ie_build_he_oper(u8 *pos, const struct cfg80211_chan_def *chandef) if (chandef->chan->band != NL80211_BAND_6GHZ) goto out; + cfg80211_chandef_create(&he_chandef, chandef->chan, NL80211_CHAN_NO_HT); + he_chandef.center_freq1 = chandef->center_freq1; + he_chandef.center_freq2 = chandef->center_freq2; + he_chandef.width = chandef->width; + /* TODO add VHT operational */ he_6ghz_op = (struct ieee80211_he_6ghz_oper *)pos; he_6ghz_op->minrate = 6; /* 6 Mbps */ he_6ghz_op->primary = - ieee80211_frequency_to_channel(chandef->chan->center_freq); + ieee80211_frequency_to_channel(he_chandef.chan->center_freq); he_6ghz_op->ccfs0 = - ieee80211_frequency_to_channel(chandef->center_freq1); - if (chandef->center_freq2) + ieee80211_frequency_to_channel(he_chandef.center_freq1); + if (he_chandef.center_freq2) he_6ghz_op->ccfs1 = - ieee80211_frequency_to_channel(chandef->center_freq2); + ieee80211_frequency_to_channel(he_chandef.center_freq2); else he_6ghz_op->ccfs1 = 0; - switch (chandef->width) { + switch (he_chandef.width) { case NL80211_CHAN_WIDTH_320: - /* - * TODO: mesh operation is not defined over 6GHz 320 MHz - * channels. + /* Downgrade EHT 320 MHz BW to 160 MHz for HE and set new + * center_freq1 */ - WARN_ON(1); - break; + ieee80211_chandef_downgrade(&he_chandef, NULL); + he_6ghz_op->ccfs0 = + ieee80211_frequency_to_channel(he_chandef.center_freq1); + fallthrough; case NL80211_CHAN_WIDTH_160: /* Convert 160 MHz channel width to new style as interop * workaround. @@ -2821,7 +2828,7 @@ u8 *ieee80211_ie_build_he_oper(u8 *pos, const struct cfg80211_chan_def *chandef) he_6ghz_op->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ; he_6ghz_op->ccfs1 = he_6ghz_op->ccfs0; - if (chandef->chan->center_freq < chandef->center_freq1) + if (he_chandef.chan->center_freq < he_chandef.center_freq1) he_6ghz_op->ccfs0 -= 8; else he_6ghz_op->ccfs0 += 8; From b81e0211e9c70be9eb70924e4e29698bfbbbc03a Mon Sep 17 00:00:00 2001 From: Aditya Kumar Singh Date: Tue, 19 Nov 2024 09:58:00 +0530 Subject: [PATCH 0124/1450] wifi: mac80211_hwsim: add 6 GHz EHT Mesh capabilities To facilitate testing of mesh EHT 320 MHz, add support for advertising this capability. Signed-off-by: Aditya Kumar Singh Link: https://patch.msgid.link/20241119-mesh_320mhz_support-v1-2-f9463338d584@quicinc.com Signed-off-by: Johannes Berg --- drivers/net/wireless/virtual/mac80211_hwsim.c | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/drivers/net/wireless/virtual/mac80211_hwsim.c b/drivers/net/wireless/virtual/mac80211_hwsim.c index 347a15544afed..cf6a331d40427 100644 --- a/drivers/net/wireless/virtual/mac80211_hwsim.c +++ b/drivers/net/wireless/virtual/mac80211_hwsim.c @@ -5048,6 +5048,45 @@ static const struct ieee80211_sband_iftype_data sband_capa_6ghz[] = { .tx_mcs_80p80 = cpu_to_le16(0xfffa), }, }, + .eht_cap = { + .has_eht = true, + .eht_cap_elem = { + .mac_cap_info[0] = IEEE80211_EHT_MAC_CAP0_OM_CONTROL | + IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1, + .phy_cap_info[0] = IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ, + /* Leave all the other PHY capability bytes + * unset, as DCM, beam forming, RU and PPE + * threshold information are not supported + */ + }, + /* For all MCS and bandwidth, set 8 NSS for both Tx and + * Rx + */ + .eht_mcs_nss_supp = { + /* As B1 and B2 are set in the supported + * channel width set field in the HE PHY + * capabilities information field and 320MHz in + * 6GHz is supported include all the following + * MCS/NSS. + */ + .bw._80 = { + .rx_tx_mcs9_max_nss = 0x88, + .rx_tx_mcs11_max_nss = 0x88, + .rx_tx_mcs13_max_nss = 0x88, + }, + .bw._160 = { + .rx_tx_mcs9_max_nss = 0x88, + .rx_tx_mcs11_max_nss = 0x88, + .rx_tx_mcs13_max_nss = 0x88, + }, + .bw._320 = { + .rx_tx_mcs9_max_nss = 0x88, + .rx_tx_mcs11_max_nss = 0x88, + .rx_tx_mcs13_max_nss = 0x88, + }, + }, + /* PPE threshold information is not supported */ + }, }, #endif }; From ef7009decc30eb2515a64253791d61b72229c119 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Thu, 21 Nov 2024 21:40:17 +0000 Subject: [PATCH 0125/1450] selftests/sched_ext: fix build after renames in sched_ext API The selftests are falining to build on current tip of bpf-next and sched_ext [1]. This has broken BPF CI [2] after merge from upstream. Use appropriate function names in the selftests according to the recent changes in the sched_ext API [3]. [1] https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=fc39fb56917bb3cb53e99560ca3612a84456ada2 [2] https://github.com/kernel-patches/bpf/actions/runs/11959327258/job/33340923745 [3] https://lore.kernel.org/all/20241109194853.580310-1-tj@kernel.org/ Signed-off-by: Ihor Solodrai Acked-by: Andrea Righi Acked-by: David Vernet Signed-off-by: Tejun Heo --- .../testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c | 2 +- .../selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c | 4 ++-- tools/testing/selftests/sched_ext/dsp_local_on.bpf.c | 2 +- .../selftests/sched_ext/enq_select_cpu_fails.bpf.c | 2 +- tools/testing/selftests/sched_ext/exit.bpf.c | 4 ++-- tools/testing/selftests/sched_ext/maximal.bpf.c | 4 ++-- tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c | 2 +- .../selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c | 2 +- .../testing/selftests/sched_ext/select_cpu_dispatch.bpf.c | 2 +- .../selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c | 2 +- .../selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c | 4 ++-- tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c | 8 ++++---- 12 files changed, 19 insertions(+), 19 deletions(-) diff --git a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c index 37d9bf6fb7458..6f4c3f5a1c5d9 100644 --- a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c +++ b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c @@ -20,7 +20,7 @@ s32 BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_select_cpu, struct task_struct *p, * If we dispatch to a bogus DSQ that will fall back to the * builtin global DSQ, we fail gracefully. */ - scx_bpf_dispatch_vtime(p, 0xcafef00d, SCX_SLICE_DFL, + scx_bpf_dsq_insert_vtime(p, 0xcafef00d, SCX_SLICE_DFL, p->scx.dsq_vtime, 0); return cpu; } diff --git a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c index dffc97d9cdf14..e4a55027778fd 100644 --- a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c +++ b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c @@ -17,8 +17,8 @@ s32 BPF_STRUCT_OPS(ddsp_vtimelocal_fail_select_cpu, struct task_struct *p, if (cpu >= 0) { /* Shouldn't be allowed to vtime dispatch to a builtin DSQ. */ - scx_bpf_dispatch_vtime(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, - p->scx.dsq_vtime, 0); + scx_bpf_dsq_insert_vtime(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, + p->scx.dsq_vtime, 0); return cpu; } diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c index 6a7db1502c29e..6325bf76f47ee 100644 --- a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c +++ b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c @@ -45,7 +45,7 @@ void BPF_STRUCT_OPS(dsp_local_on_dispatch, s32 cpu, struct task_struct *prev) target = bpf_get_prandom_u32() % nr_cpus; - scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | target, SCX_SLICE_DFL, 0); + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | target, SCX_SLICE_DFL, 0); bpf_task_release(p); } diff --git a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c index 1efb50d61040a..a7cf868d5e311 100644 --- a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c +++ b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c @@ -31,7 +31,7 @@ void BPF_STRUCT_OPS(enq_select_cpu_fails_enqueue, struct task_struct *p, /* Can only call from ops.select_cpu() */ scx_bpf_select_cpu_dfl(p, 0, 0, &found); - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); } SEC(".struct_ops.link") diff --git a/tools/testing/selftests/sched_ext/exit.bpf.c b/tools/testing/selftests/sched_ext/exit.bpf.c index d75d4faf07f6d..4bc36182d3ffc 100644 --- a/tools/testing/selftests/sched_ext/exit.bpf.c +++ b/tools/testing/selftests/sched_ext/exit.bpf.c @@ -33,7 +33,7 @@ void BPF_STRUCT_OPS(exit_enqueue, struct task_struct *p, u64 enq_flags) if (exit_point == EXIT_ENQUEUE) EXIT_CLEANLY(); - scx_bpf_dispatch(p, DSQ_ID, SCX_SLICE_DFL, enq_flags); + scx_bpf_dsq_insert(p, DSQ_ID, SCX_SLICE_DFL, enq_flags); } void BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *p) @@ -41,7 +41,7 @@ void BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *p) if (exit_point == EXIT_DISPATCH) EXIT_CLEANLY(); - scx_bpf_consume(DSQ_ID); + scx_bpf_dsq_move_to_local(DSQ_ID); } void BPF_STRUCT_OPS(exit_enable, struct task_struct *p) diff --git a/tools/testing/selftests/sched_ext/maximal.bpf.c b/tools/testing/selftests/sched_ext/maximal.bpf.c index 4d4cd8d966dba..4c005fa718103 100644 --- a/tools/testing/selftests/sched_ext/maximal.bpf.c +++ b/tools/testing/selftests/sched_ext/maximal.bpf.c @@ -20,7 +20,7 @@ s32 BPF_STRUCT_OPS(maximal_select_cpu, struct task_struct *p, s32 prev_cpu, void BPF_STRUCT_OPS(maximal_enqueue, struct task_struct *p, u64 enq_flags) { - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); } void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags) @@ -28,7 +28,7 @@ void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags) void BPF_STRUCT_OPS(maximal_dispatch, s32 cpu, struct task_struct *prev) { - scx_bpf_consume(SCX_DSQ_GLOBAL); + scx_bpf_dsq_move_to_local(SCX_DSQ_GLOBAL); } void BPF_STRUCT_OPS(maximal_runnable, struct task_struct *p, u64 enq_flags) diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c index f171ac4709706..13d0f5be788d1 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c @@ -30,7 +30,7 @@ void BPF_STRUCT_OPS(select_cpu_dfl_enqueue, struct task_struct *p, } scx_bpf_put_idle_cpumask(idle_mask); - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); } SEC(".struct_ops.link") diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c index 9efdbb7da9288..815f1d5d61ac4 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c @@ -67,7 +67,7 @@ void BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_enqueue, struct task_struct *p, saw_local = true; } - scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, enq_flags); + scx_bpf_dsq_insert(p, dsq_id, SCX_SLICE_DFL, enq_flags); } s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_init_task, diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c index 59bfc4f36167a..4bb99699e9209 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c @@ -29,7 +29,7 @@ s32 BPF_STRUCT_OPS(select_cpu_dispatch_select_cpu, struct task_struct *p, cpu = prev_cpu; dispatch: - scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, 0); + scx_bpf_dsq_insert(p, dsq_id, SCX_SLICE_DFL, 0); return cpu; } diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c index 3bbd5fcdfb18e..2a75de11b2cfd 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c @@ -18,7 +18,7 @@ s32 BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_select_cpu, struct task_struct *p s32 prev_cpu, u64 wake_flags) { /* Dispatching to a random DSQ should fail. */ - scx_bpf_dispatch(p, 0xcafef00d, SCX_SLICE_DFL, 0); + scx_bpf_dsq_insert(p, 0xcafef00d, SCX_SLICE_DFL, 0); return prev_cpu; } diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c index 0fda57fe0ecfa..99d075695c974 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c @@ -18,8 +18,8 @@ s32 BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_select_cpu, struct task_struct *p s32 prev_cpu, u64 wake_flags) { /* Dispatching twice in a row is disallowed. */ - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); return prev_cpu; } diff --git a/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c index e6c67bcf5e6e3..bfcb96cd4954b 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c +++ b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c @@ -2,8 +2,8 @@ /* * A scheduler that validates that enqueue flags are properly stored and * applied at dispatch time when a task is directly dispatched from - * ops.select_cpu(). We validate this by using scx_bpf_dispatch_vtime(), and - * making the test a very basic vtime scheduler. + * ops.select_cpu(). We validate this by using scx_bpf_dsq_insert_vtime(), + * and making the test a very basic vtime scheduler. * * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. * Copyright (c) 2024 David Vernet @@ -47,13 +47,13 @@ s32 BPF_STRUCT_OPS(select_cpu_vtime_select_cpu, struct task_struct *p, cpu = prev_cpu; scx_bpf_test_and_clear_cpu_idle(cpu); ddsp: - scx_bpf_dispatch_vtime(p, VTIME_DSQ, SCX_SLICE_DFL, task_vtime(p), 0); + scx_bpf_dsq_insert_vtime(p, VTIME_DSQ, SCX_SLICE_DFL, task_vtime(p), 0); return cpu; } void BPF_STRUCT_OPS(select_cpu_vtime_dispatch, s32 cpu, struct task_struct *p) { - if (scx_bpf_consume(VTIME_DSQ)) + if (scx_bpf_dsq_move_to_local(VTIME_DSQ)) consumed = true; } From f24d192985cbd6782850fdbb3839039da2f0ee76 Mon Sep 17 00:00:00 2001 From: guanjing Date: Sun, 17 Nov 2024 10:51:29 +0800 Subject: [PATCH 0126/1450] sched_ext: fix application of sizeof to pointer sizeof when applied to a pointer typed expression gives the size of the pointer. The proper fix in this particular case is to code sizeof(*cpuset) instead of sizeof(cpuset). This issue was detected with the help of Coccinelle. Fixes: 22a920209ab6 ("sched_ext: Implement tickless support") Signed-off-by: guanjing Acked-by: Andrea Righi Signed-off-by: Tejun Heo --- tools/sched_ext/scx_central.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c index 21deea320bd78..e938156ed0a0d 100644 --- a/tools/sched_ext/scx_central.c +++ b/tools/sched_ext/scx_central.c @@ -97,7 +97,7 @@ int main(int argc, char **argv) SCX_BUG_ON(!cpuset, "Failed to allocate cpuset"); CPU_ZERO(cpuset); CPU_SET(skel->rodata->central_cpu, cpuset); - SCX_BUG_ON(sched_setaffinity(0, sizeof(cpuset), cpuset), + SCX_BUG_ON(sched_setaffinity(0, sizeof(*cpuset), cpuset), "Failed to affinitize to central CPU %d (max %d)", skel->rodata->central_cpu, skel->rodata->nr_cpu_ids - 1); CPU_FREE(cpuset); From a9ab02ed97c654a9ea09afb3e6294cea3768c388 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 2 Dec 2024 16:22:05 -0800 Subject: [PATCH 0127/1450] netpoll: Use rtnl_dereference() for npinfo pointer access In the __netpoll_setup() function, when accessing the device's npinfo pointer, replace rcu_access_pointer() with rtnl_dereference(). This change is more appropriate, as suggested by Herbert Xu[1]. The function is called with the RTNL mutex held, and the pointer is being dereferenced later, so, dereference earlier and just reuse the pointer for the if/else. The replacement ensures correct pointer access while maintaining the existing locking and RCU semantics of the netpoll subsystem. Link: https://lore.kernel.org/lkml/Zz1cKZYt1e7elibV@gondor.apana.org.au/ [1] Suggested-by: Herbert Xu Signed-off-by: Breno Leitao Reviewed-by: Jacob Keller Acked-by: Herbert Xu Link: https://patch.msgid.link/20241202-netpoll_rcu_herbet_fix-v2-1-2b9d58edc76a@debian.org Signed-off-by: Jakub Kicinski --- net/core/netpoll.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 2e459b9d88eb5..99e5aa9cc992f 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -634,7 +634,8 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) goto out; } - if (!rcu_access_pointer(ndev->npinfo)) { + npinfo = rtnl_dereference(ndev->npinfo); + if (!npinfo) { npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL); if (!npinfo) { err = -ENOMEM; @@ -654,7 +655,6 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) goto free_npinfo; } } else { - npinfo = rtnl_dereference(ndev->npinfo); refcount_inc(&npinfo->refcnt); } From a61b19f4a6586590a9ae6baf2ac4a25a852e547f Mon Sep 17 00:00:00 2001 From: Maksym Kutsevol Date: Mon, 2 Dec 2024 11:55:07 -0800 Subject: [PATCH 0128/1450] netpoll: Make netpoll_send_udp return status instead of void netpoll_send_udp can return if send was successful. It will allow client code to be aware of the send status. Possible return values are the result of __netpoll_send_skb (cast to int) and -ENOMEM. This doesn't cover the case when TX was not successful instantaneously and was scheduled for later, __netpoll__send_skb returns success in that case. Signed-off-by: Maksym Kutsevol Link: https://patch.msgid.link/20241202-netcons-add-udp-send-fail-statistics-to-netconsole-v5-1-70e82239f922@kutsevol.com Signed-off-by: Jakub Kicinski --- include/linux/netpoll.h | 2 +- net/core/netpoll.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index b34301650c479..f91e50a76efd4 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -57,7 +57,7 @@ static inline void netpoll_poll_disable(struct net_device *dev) { return; } static inline void netpoll_poll_enable(struct net_device *dev) { return; } #endif -void netpoll_send_udp(struct netpoll *np, const char *msg, int len); +int netpoll_send_udp(struct netpoll *np, const char *msg, int len); void netpoll_print_options(struct netpoll *np); int netpoll_parse_options(struct netpoll *np, char *opt); int __netpoll_setup(struct netpoll *np, struct net_device *ndev); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 99e5aa9cc992f..6f2647b000b8f 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -390,7 +390,7 @@ netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) } EXPORT_SYMBOL(netpoll_send_skb); -void netpoll_send_udp(struct netpoll *np, const char *msg, int len) +int netpoll_send_udp(struct netpoll *np, const char *msg, int len) { int total_len, ip_len, udp_len; struct sk_buff *skb; @@ -414,7 +414,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) skb = find_skb(np, total_len + np->dev->needed_tailroom, total_len - len); if (!skb) - return; + return -ENOMEM; skb_copy_to_linear_data(skb, msg, len); skb_put(skb, len); @@ -490,7 +490,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) skb->dev = np->dev; - netpoll_send_skb(np, skb); + return (int)netpoll_send_skb(np, skb); } EXPORT_SYMBOL(netpoll_send_udp); From 36de47bfd013f2152860339ca763fcb64c76f9b2 Mon Sep 17 00:00:00 2001 From: Maksym Kutsevol Date: Mon, 2 Dec 2024 11:55:08 -0800 Subject: [PATCH 0129/1450] netcons: Add udp send fail statistics to netconsole Enhance observability of netconsole. Packet sends can fail. Start tracking at least two failure possibilities: ENOMEM and NET_XMIT_DROP for every target. Stats are exposed via an additional attribute in CONFIGFS. The exposed statistics allows easier debugging of cases when netconsole messages were not seen by receivers, eliminating the guesswork if the sender thinks that messages in question were sent out. Stats are not reset on enable/disable/change remote ip/etc, they belong to the netcons target itself. Reported-by: Breno Leitao Closes: https://lore.kernel.org/all/ZsWoUzyK5du9Ffl+@gmail.com/ Signed-off-by: Maksym Kutsevol Link: https://patch.msgid.link/20241202-netcons-add-udp-send-fail-statistics-to-netconsole-v5-2-70e82239f922@kutsevol.com Signed-off-by: Jakub Kicinski --- Documentation/networking/netconsole.rst | 5 ++- drivers/net/netconsole.c | 60 +++++++++++++++++++++++-- 2 files changed, 60 insertions(+), 5 deletions(-) diff --git a/Documentation/networking/netconsole.rst b/Documentation/networking/netconsole.rst index d55c2a22ec7af..94c4680fdf3e7 100644 --- a/Documentation/networking/netconsole.rst +++ b/Documentation/networking/netconsole.rst @@ -124,7 +124,7 @@ To remove a target:: The interface exposes these parameters of a netconsole target to userspace: - ============== ================================= ============ + =============== ================================= ============ enabled Is this target currently enabled? (read-write) extended Extended mode enabled (read-write) release Prepend kernel release to message (read-write) @@ -135,7 +135,8 @@ The interface exposes these parameters of a netconsole target to userspace: remote_ip Remote agent's IP address (read-write) local_mac Local interface's MAC address (read-only) remote_mac Remote agent's MAC address (read-write) - ============== ================================= ============ + transmit_errors Number of packet send errors (read-only) + =============== ================================= ============ The "enabled" attribute is also used to control whether the parameters of a target can be updated or not -- you can modify the parameters of only diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c index 4ea44a2f48f7b..f422a2f666ef2 100644 --- a/drivers/net/netconsole.c +++ b/drivers/net/netconsole.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -90,6 +91,12 @@ static DEFINE_MUTEX(target_cleanup_list_lock); */ static struct console netconsole_ext; +struct netconsole_target_stats { + u64_stats_t xmit_drop_count; + u64_stats_t enomem_count; + struct u64_stats_sync syncp; +}; + /** * struct netconsole_target - Represents a configured netconsole target. * @list: Links this target into the target_list. @@ -97,6 +104,7 @@ static struct console netconsole_ext; * @userdata_group: Links to the userdata configfs hierarchy * @userdata_complete: Cached, formatted string of append * @userdata_length: String length of userdata_complete + * @stats: Packet send stats for the target. Used for debugging. * @enabled: On / off knob to enable / disable target. * Visible from userspace (read-write). * We maintain a strict 1:1 correspondence between this and @@ -124,6 +132,7 @@ struct netconsole_target { char userdata_complete[MAX_USERDATA_ENTRY_LENGTH * MAX_USERDATA_ITEMS]; size_t userdata_length; #endif + struct netconsole_target_stats stats; bool enabled; bool extended; bool release; @@ -262,6 +271,7 @@ static void netconsole_process_cleanups_core(void) * | remote_ip * | local_mac * | remote_mac + * | transmit_errors * | userdata/ * | / * | value @@ -371,6 +381,21 @@ static ssize_t remote_mac_show(struct config_item *item, char *buf) return sysfs_emit(buf, "%pM\n", to_target(item)->np.remote_mac); } +static ssize_t transmit_errors_show(struct config_item *item, char *buf) +{ + struct netconsole_target *nt = to_target(item); + u64 xmit_drop_count, enomem_count; + unsigned int start; + + do { + start = u64_stats_fetch_begin(&nt->stats.syncp); + xmit_drop_count = u64_stats_read(&nt->stats.xmit_drop_count); + enomem_count = u64_stats_read(&nt->stats.enomem_count); + } while (u64_stats_fetch_retry(&nt->stats.syncp, start)); + + return sysfs_emit(buf, "%llu\n", xmit_drop_count + enomem_count); +} + /* * This one is special -- targets created through the configfs interface * are not enabled (and the corresponding netpoll activated) by default. @@ -842,6 +867,7 @@ CONFIGFS_ATTR(, remote_ip); CONFIGFS_ATTR_RO(, local_mac); CONFIGFS_ATTR(, remote_mac); CONFIGFS_ATTR(, release); +CONFIGFS_ATTR_RO(, transmit_errors); static struct configfs_attribute *netconsole_target_attrs[] = { &attr_enabled, @@ -854,6 +880,7 @@ static struct configfs_attribute *netconsole_target_attrs[] = { &attr_remote_ip, &attr_local_mac, &attr_remote_mac, + &attr_transmit_errors, NULL, }; @@ -1058,6 +1085,33 @@ static struct notifier_block netconsole_netdev_notifier = { .notifier_call = netconsole_netdev_event, }; +/** + * send_udp - Wrapper for netpoll_send_udp that counts errors + * @nt: target to send message to + * @msg: message to send + * @len: length of message + * + * Calls netpoll_send_udp and classifies the return value. If an error + * occurred it increments statistics in nt->stats accordingly. + * Only calls netpoll_send_udp if CONFIG_NETCONSOLE_DYNAMIC is disabled. + */ +static void send_udp(struct netconsole_target *nt, const char *msg, int len) +{ + int result = netpoll_send_udp(&nt->np, msg, len); + + if (IS_ENABLED(CONFIG_NETCONSOLE_DYNAMIC)) { + if (result == NET_XMIT_DROP) { + u64_stats_update_begin(&nt->stats.syncp); + u64_stats_inc(&nt->stats.xmit_drop_count); + u64_stats_update_end(&nt->stats.syncp); + } else if (result == -ENOMEM) { + u64_stats_update_begin(&nt->stats.syncp); + u64_stats_inc(&nt->stats.enomem_count); + u64_stats_update_end(&nt->stats.syncp); + } + } +} + static void send_msg_no_fragmentation(struct netconsole_target *nt, const char *msg, int msg_len, @@ -1085,7 +1139,7 @@ static void send_msg_no_fragmentation(struct netconsole_target *nt, MAX_PRINT_CHUNK - msg_len, "%s", userdata); - netpoll_send_udp(&nt->np, buf, msg_len); + send_udp(nt, buf, msg_len); } static void append_release(char *buf) @@ -1178,7 +1232,7 @@ static void send_fragmented_body(struct netconsole_target *nt, char *buf, this_offset += this_chunk; } - netpoll_send_udp(&nt->np, buf, this_header + this_offset); + send_udp(nt, buf, this_header + this_offset); offset += this_offset; } } @@ -1288,7 +1342,7 @@ static void write_msg(struct console *con, const char *msg, unsigned int len) tmp = msg; for (left = len; left;) { frag = min(left, MAX_PRINT_CHUNK); - netpoll_send_udp(&nt->np, tmp, frag); + send_udp(nt, tmp, frag); tmp += frag; left -= frag; } From 4485043a9bf83d1a03a62b460ef9d6eec643b0ad Mon Sep 17 00:00:00 2001 From: Justin Lai Date: Tue, 3 Dec 2024 18:31:46 +0800 Subject: [PATCH 0130/1450] rtase: Add support for RTL907XD-VA PCIe port 1. Add RTL907XD-VA hardware version id. 2. Add the reported speed for RTL907XD-VA. Signed-off-by: Justin Lai Link: https://patch.msgid.link/20241203103146.734516-1-justinlai0215@realtek.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/rtase/rtase.h | 1 + drivers/net/ethernet/realtek/rtase/rtase_main.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/drivers/net/ethernet/realtek/rtase/rtase.h b/drivers/net/ethernet/realtek/rtase/rtase.h index dbc3f92eebc48..2bbfcad613abb 100644 --- a/drivers/net/ethernet/realtek/rtase/rtase.h +++ b/drivers/net/ethernet/realtek/rtase/rtase.h @@ -13,6 +13,7 @@ #define RTASE_HW_VER_906X_7XA 0x00800000 #define RTASE_HW_VER_906X_7XC 0x04000000 #define RTASE_HW_VER_907XD_V1 0x04800000 +#define RTASE_HW_VER_907XD_VA 0x08000000 #define RTASE_RX_DMA_BURST_256 4 #define RTASE_TX_DMA_BURST_UNLIMITED 7 diff --git a/drivers/net/ethernet/realtek/rtase/rtase_main.c b/drivers/net/ethernet/realtek/rtase/rtase_main.c index de7f11232593f..6106aa5333bcc 100644 --- a/drivers/net/ethernet/realtek/rtase/rtase_main.c +++ b/drivers/net/ethernet/realtek/rtase/rtase_main.c @@ -1725,6 +1725,7 @@ static int rtase_get_settings(struct net_device *dev, cmd->base.speed = SPEED_5000; break; case RTASE_HW_VER_907XD_V1: + case RTASE_HW_VER_907XD_VA: cmd->base.speed = SPEED_10000; break; } @@ -1993,6 +1994,7 @@ static int rtase_check_mac_version_valid(struct rtase_private *tp) case RTASE_HW_VER_906X_7XA: case RTASE_HW_VER_906X_7XC: case RTASE_HW_VER_907XD_V1: + case RTASE_HW_VER_907XD_VA: ret = 0; break; } From 17ed1911f9c8d4f9af8e13b2c95103ee06dadc0f Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 3 Dec 2024 15:30:47 +0000 Subject: [PATCH 0131/1450] net: phylink: pass phylink and pcs into phylink_pcs_neg_mode() Move the call to phylink_pcs_neg_mode() in phylink_major_config() after we have selected the appropriate PCS to allow the PCS to be passed in. Add struct phylink and struct phylink_pcs pointers to phylink_pcs_neg_mode() and pass in the appropriate structures. Set pl->pcs_neg_mode before returning, and remove the return value. This will allow the capabilities of the PCS and any PHY to be used when deciding which pcs_neg_mode should be used. Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1tIUrP-006ITh-6u@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/phy/phylink.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 30a654e983523..daee679f33b37 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -1085,7 +1085,8 @@ static void phylink_pcs_an_restart(struct phylink *pl) /** * phylink_pcs_neg_mode() - helper to determine PCS inband mode - * @mode: one of %MLO_AN_FIXED, %MLO_AN_PHY, %MLO_AN_INBAND. + * @pl: a pointer to a &struct phylink returned from phylink_create() + * @pcs: a pointer to &struct phylink_pcs * @interface: interface mode to be used * @advertising: adertisement ethtool link mode mask * @@ -1102,11 +1103,13 @@ static void phylink_pcs_an_restart(struct phylink *pl) * Note: this is for cases where the PCS itself is involved in negotiation * (e.g. Clause 37, SGMII and similar) not Clause 73. */ -static unsigned int phylink_pcs_neg_mode(unsigned int mode, - phy_interface_t interface, - const unsigned long *advertising) +static void phylink_pcs_neg_mode(struct phylink *pl, struct phylink_pcs *pcs, + phy_interface_t interface, + const unsigned long *advertising) { - unsigned int neg_mode; + unsigned int neg_mode, mode; + + mode = pl->cur_link_an_mode; switch (interface) { case PHY_INTERFACE_MODE_SGMII: @@ -1147,7 +1150,7 @@ static unsigned int phylink_pcs_neg_mode(unsigned int mode, break; } - return neg_mode; + pl->pcs_neg_mode = neg_mode; } static void phylink_major_config(struct phylink *pl, bool restart, @@ -1161,10 +1164,6 @@ static void phylink_major_config(struct phylink *pl, bool restart, phylink_dbg(pl, "major config %s\n", phy_modes(state->interface)); - pl->pcs_neg_mode = phylink_pcs_neg_mode(pl->cur_link_an_mode, - state->interface, - state->advertising); - if (pl->mac_ops->mac_select_pcs) { pcs = pl->mac_ops->mac_select_pcs(pl->config, state->interface); if (IS_ERR(pcs)) { @@ -1177,6 +1176,8 @@ static void phylink_major_config(struct phylink *pl, bool restart, pcs_changed = pl->pcs != pcs; } + phylink_pcs_neg_mode(pl, pcs, state->interface, state->advertising); + phylink_pcs_poll_stop(pl); if (pl->mac_ops->mac_prepare) { @@ -1267,9 +1268,8 @@ static int phylink_change_inband_advert(struct phylink *pl) pl->link_config.pause); /* Recompute the PCS neg mode */ - pl->pcs_neg_mode = phylink_pcs_neg_mode(pl->cur_link_an_mode, - pl->link_config.interface, - pl->link_config.advertising); + phylink_pcs_neg_mode(pl, pl->pcs, pl->link_config.interface, + pl->link_config.advertising); neg_mode = pl->cur_link_an_mode; if (pl->pcs->neg_mode) From 1f92ead7e15003f632b5f138e8138095e0997d3d Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 3 Dec 2024 15:30:52 +0000 Subject: [PATCH 0132/1450] net: phylink: split cur_link_an_mode into requested and active There is an interdependence between the current link_an_mode and pcs_neg_mode that some drivers rely upon to know whether inband or PHY mode will be used. In order to support detection of PCS and PHY inband capabilities resulting in automatic selection of inband or PHY mode, we need to cater for this, and support changing the MAC link_an_mode. However, we end up with an inter-dependency between the current link_an_mode and pcs_neg_mode. To solve this, split the current link_an_mode into the requested link_an_mode and active link_an_mode. The requested link_an_mode will always be passed to phylink_pcs_neg_mode(), and the active link_an_mode will be used for everything else, and only updated during phylink_major_config(). This will ensure that phylink_pcs_neg_mode()'s link_an_mode will not depend on the active link_an_mode that will, in a future patch, depend on pcs_neg_mode. Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1tIUrU-006ITn-Ai@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/phy/phylink.c | 60 ++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index daee679f33b37..098021f1ab490 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -56,7 +56,8 @@ struct phylink { struct phy_device *phydev; phy_interface_t link_interface; /* PHY_INTERFACE_xxx */ u8 cfg_link_an_mode; /* MLO_AN_xxx */ - u8 cur_link_an_mode; + u8 req_link_an_mode; /* Requested MLO_AN_xxx mode */ + u8 act_link_an_mode; /* Active MLO_AN_xxx mode */ u8 link_port; /* The current non-phy ethtool port */ __ETHTOOL_DECLARE_LINK_MODE_MASK(supported); @@ -1065,13 +1066,13 @@ static void phylink_mac_config(struct phylink *pl, phylink_dbg(pl, "%s: mode=%s/%s/%s adv=%*pb pause=%02x\n", - __func__, phylink_an_mode_str(pl->cur_link_an_mode), + __func__, phylink_an_mode_str(pl->act_link_an_mode), phy_modes(st.interface), phy_rate_matching_to_str(st.rate_matching), __ETHTOOL_LINK_MODE_MASK_NBITS, st.advertising, st.pause); - pl->mac_ops->mac_config(pl->config, pl->cur_link_an_mode, &st); + pl->mac_ops->mac_config(pl->config, pl->act_link_an_mode, &st); } static void phylink_pcs_an_restart(struct phylink *pl) @@ -1079,7 +1080,7 @@ static void phylink_pcs_an_restart(struct phylink *pl) if (pl->pcs && linkmode_test_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, pl->link_config.advertising) && phy_interface_mode_is_8023z(pl->link_config.interface) && - phylink_autoneg_inband(pl->cur_link_an_mode)) + phylink_autoneg_inband(pl->act_link_an_mode)) pl->pcs->ops->pcs_an_restart(pl->pcs); } @@ -1109,7 +1110,7 @@ static void phylink_pcs_neg_mode(struct phylink *pl, struct phylink_pcs *pcs, { unsigned int neg_mode, mode; - mode = pl->cur_link_an_mode; + mode = pl->req_link_an_mode; switch (interface) { case PHY_INTERFACE_MODE_SGMII: @@ -1151,6 +1152,7 @@ static void phylink_pcs_neg_mode(struct phylink *pl, struct phylink_pcs *pcs, } pl->pcs_neg_mode = neg_mode; + pl->act_link_an_mode = mode; } static void phylink_major_config(struct phylink *pl, bool restart, @@ -1181,7 +1183,7 @@ static void phylink_major_config(struct phylink *pl, bool restart, phylink_pcs_poll_stop(pl); if (pl->mac_ops->mac_prepare) { - err = pl->mac_ops->mac_prepare(pl->config, pl->cur_link_an_mode, + err = pl->mac_ops->mac_prepare(pl->config, pl->act_link_an_mode, state->interface); if (err < 0) { phylink_err(pl, "mac_prepare failed: %pe\n", @@ -1215,7 +1217,7 @@ static void phylink_major_config(struct phylink *pl, bool restart, if (pl->pcs_state == PCS_STATE_STARTING || pcs_changed) phylink_pcs_enable(pl->pcs); - neg_mode = pl->cur_link_an_mode; + neg_mode = pl->act_link_an_mode; if (pl->pcs && pl->pcs->neg_mode) neg_mode = pl->pcs_neg_mode; @@ -1231,7 +1233,7 @@ static void phylink_major_config(struct phylink *pl, bool restart, phylink_pcs_an_restart(pl); if (pl->mac_ops->mac_finish) { - err = pl->mac_ops->mac_finish(pl->config, pl->cur_link_an_mode, + err = pl->mac_ops->mac_finish(pl->config, pl->act_link_an_mode, state->interface); if (err < 0) phylink_err(pl, "mac_finish failed: %pe\n", @@ -1262,7 +1264,7 @@ static int phylink_change_inband_advert(struct phylink *pl) return 0; phylink_dbg(pl, "%s: mode=%s/%s adv=%*pb pause=%02x\n", __func__, - phylink_an_mode_str(pl->cur_link_an_mode), + phylink_an_mode_str(pl->req_link_an_mode), phy_modes(pl->link_config.interface), __ETHTOOL_LINK_MODE_MASK_NBITS, pl->link_config.advertising, pl->link_config.pause); @@ -1271,7 +1273,7 @@ static int phylink_change_inband_advert(struct phylink *pl) phylink_pcs_neg_mode(pl, pl->pcs, pl->link_config.interface, pl->link_config.advertising); - neg_mode = pl->cur_link_an_mode; + neg_mode = pl->act_link_an_mode; if (pl->pcs->neg_mode) neg_mode = pl->pcs_neg_mode; @@ -1336,7 +1338,7 @@ static void phylink_mac_initial_config(struct phylink *pl, bool force_restart) { struct phylink_link_state link_state; - switch (pl->cur_link_an_mode) { + switch (pl->req_link_an_mode) { case MLO_AN_PHY: link_state = pl->phy_state; break; @@ -1410,14 +1412,14 @@ static void phylink_link_up(struct phylink *pl, pl->cur_interface = link_state.interface; - neg_mode = pl->cur_link_an_mode; + neg_mode = pl->act_link_an_mode; if (pl->pcs && pl->pcs->neg_mode) neg_mode = pl->pcs_neg_mode; phylink_pcs_link_up(pl->pcs, neg_mode, pl->cur_interface, speed, duplex); - pl->mac_ops->mac_link_up(pl->config, pl->phydev, pl->cur_link_an_mode, + pl->mac_ops->mac_link_up(pl->config, pl->phydev, pl->act_link_an_mode, pl->cur_interface, speed, duplex, !!(link_state.pause & MLO_PAUSE_TX), rx_pause); @@ -1437,7 +1439,7 @@ static void phylink_link_down(struct phylink *pl) if (ndev) netif_carrier_off(ndev); - pl->mac_ops->mac_link_down(pl->config, pl->cur_link_an_mode, + pl->mac_ops->mac_link_down(pl->config, pl->act_link_an_mode, pl->cur_interface); phylink_info(pl, "Link is Down\n"); } @@ -1463,10 +1465,10 @@ static void phylink_resolve(struct work_struct *w) } else if (pl->link_failed) { link_state.link = false; retrigger = true; - } else if (pl->cur_link_an_mode == MLO_AN_FIXED) { + } else if (pl->act_link_an_mode == MLO_AN_FIXED) { phylink_get_fixed_state(pl, &link_state); mac_config = link_state.link; - } else if (pl->cur_link_an_mode == MLO_AN_PHY) { + } else if (pl->act_link_an_mode == MLO_AN_PHY) { link_state = pl->phy_state; mac_config = link_state.link; } else { @@ -1520,7 +1522,7 @@ static void phylink_resolve(struct work_struct *w) } } - if (pl->cur_link_an_mode != MLO_AN_FIXED) + if (pl->act_link_an_mode != MLO_AN_FIXED) phylink_apply_manual_flow(pl, &link_state); if (mac_config) { @@ -1644,7 +1646,7 @@ int phylink_set_fixed_link(struct phylink *pl, pl->link_config.an_complete = 1; pl->cfg_link_an_mode = MLO_AN_FIXED; - pl->cur_link_an_mode = pl->cfg_link_an_mode; + pl->req_link_an_mode = pl->cfg_link_an_mode; return 0; } @@ -1732,7 +1734,7 @@ struct phylink *phylink_create(struct phylink_config *config, } } - pl->cur_link_an_mode = pl->cfg_link_an_mode; + pl->req_link_an_mode = pl->cfg_link_an_mode; ret = phylink_register_sfp(pl, fwnode); if (ret < 0) { @@ -2189,7 +2191,7 @@ void phylink_start(struct phylink *pl) ASSERT_RTNL(); phylink_info(pl, "configuring for %s/%s link mode\n", - phylink_an_mode_str(pl->cur_link_an_mode), + phylink_an_mode_str(pl->req_link_an_mode), phy_modes(pl->link_config.interface)); /* Always set the carrier off */ @@ -2474,7 +2476,7 @@ int phylink_ethtool_ksettings_get(struct phylink *pl, linkmode_copy(kset->link_modes.supported, pl->supported); - switch (pl->cur_link_an_mode) { + switch (pl->act_link_an_mode) { case MLO_AN_FIXED: /* We are using fixed settings. Report these as the * current link settings - and note that these also @@ -2566,7 +2568,7 @@ int phylink_ethtool_ksettings_set(struct phylink *pl, /* If we have a fixed link, refuse to change link parameters. * If the link parameters match, accept them but do nothing. */ - if (pl->cur_link_an_mode == MLO_AN_FIXED) { + if (pl->req_link_an_mode == MLO_AN_FIXED) { if (s->speed != pl->link_config.speed || s->duplex != pl->link_config.duplex) return -EINVAL; @@ -2582,7 +2584,7 @@ int phylink_ethtool_ksettings_set(struct phylink *pl, * is our default case) but do not allow the advertisement to * be changed. If the advertisement matches, simply return. */ - if (pl->cur_link_an_mode == MLO_AN_FIXED) { + if (pl->req_link_an_mode == MLO_AN_FIXED) { if (!linkmode_equal(config.advertising, pl->link_config.advertising)) return -EINVAL; @@ -2617,7 +2619,7 @@ int phylink_ethtool_ksettings_set(struct phylink *pl, linkmode_copy(support, pl->supported); if (phylink_validate(pl, support, &config)) { phylink_err(pl, "validation of %s/%s with support %*pb failed\n", - phylink_an_mode_str(pl->cur_link_an_mode), + phylink_an_mode_str(pl->req_link_an_mode), phy_modes(config.interface), __ETHTOOL_LINK_MODE_MASK_NBITS, support); return -EINVAL; @@ -2717,7 +2719,7 @@ int phylink_ethtool_set_pauseparam(struct phylink *pl, ASSERT_RTNL(); - if (pl->cur_link_an_mode == MLO_AN_FIXED) + if (pl->req_link_an_mode == MLO_AN_FIXED) return -EOPNOTSUPP; if (!phylink_test(pl->supported, Pause) && @@ -2981,7 +2983,7 @@ static int phylink_mii_read(struct phylink *pl, unsigned int phy_id, struct phylink_link_state state; int val = 0xffff; - switch (pl->cur_link_an_mode) { + switch (pl->act_link_an_mode) { case MLO_AN_FIXED: if (phy_id == 0) { phylink_get_fixed_state(pl, &state); @@ -3006,7 +3008,7 @@ static int phylink_mii_read(struct phylink *pl, unsigned int phy_id, static int phylink_mii_write(struct phylink *pl, unsigned int phy_id, unsigned int reg, unsigned int val) { - switch (pl->cur_link_an_mode) { + switch (pl->act_link_an_mode) { case MLO_AN_FIXED: break; @@ -3196,9 +3198,9 @@ static void phylink_sfp_set_config(struct phylink *pl, u8 mode, changed = true; } - if (pl->cur_link_an_mode != mode || + if (pl->req_link_an_mode != mode || pl->link_config.interface != state->interface) { - pl->cur_link_an_mode = mode; + pl->req_link_an_mode = mode; pl->link_config.interface = state->interface; changed = true; From 4e7d000286fe8e12f2d88032711ffab3ab658b12 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 3 Dec 2024 15:30:57 +0000 Subject: [PATCH 0133/1450] net: phylink: add debug for phylink_major_config() Now that we have a more complexity in phylink_major_config(), augment the debugging so we can see what's going on there. Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1tIUrZ-006ITt-Fa@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/phy/phylink.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 098021f1ab490..fda53dd58285e 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -175,6 +175,24 @@ static const char *phylink_an_mode_str(unsigned int mode) return mode < ARRAY_SIZE(modestr) ? modestr[mode] : "unknown"; } +static const char *phylink_pcs_mode_str(unsigned int mode) +{ + if (!mode) + return "none"; + + if (mode & PHYLINK_PCS_NEG_OUTBAND) + return "outband"; + + if (mode & PHYLINK_PCS_NEG_INBAND) { + if (mode & PHYLINK_PCS_NEG_ENABLED) + return "inband,an-enabled"; + else + return "inband,an-disabled"; + } + + return "unknown"; +} + static unsigned int phylink_interface_signal_rate(phy_interface_t interface) { switch (interface) { @@ -1164,7 +1182,9 @@ static void phylink_major_config(struct phylink *pl, bool restart, unsigned int neg_mode; int err; - phylink_dbg(pl, "major config %s\n", phy_modes(state->interface)); + phylink_dbg(pl, "major config, requested %s/%s\n", + phylink_an_mode_str(pl->req_link_an_mode), + phy_modes(state->interface)); if (pl->mac_ops->mac_select_pcs) { pcs = pl->mac_ops->mac_select_pcs(pl->config, state->interface); @@ -1180,6 +1200,11 @@ static void phylink_major_config(struct phylink *pl, bool restart, phylink_pcs_neg_mode(pl, pcs, state->interface, state->advertising); + phylink_dbg(pl, "major config, active %s/%s/%s\n", + phylink_an_mode_str(pl->act_link_an_mode), + phylink_pcs_mode_str(pl->pcs_neg_mode), + phy_modes(state->interface)); + phylink_pcs_poll_stop(pl); if (pl->mac_ops->mac_prepare) { From b4c7698dd95f253c6958d8c6ac219098009bf28a Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 3 Dec 2024 15:31:02 +0000 Subject: [PATCH 0134/1450] net: phy: add phy_inband_caps() Add a method to query the PHY's in-band capabilities for a PHY interface mode. Where the interface mode does not have in-band capability, or the PHY driver has not been updated to return this information, then phy_inband_caps() should return zero. Otherwise, PHY drivers will return a value consisting of the following flags: LINK_INBAND_DISABLE indicates that the hardware does not support in-band signalling, or can have in-band signalling configured via software to be disabled. LINK_INBAND_ENABLE indicates that the hardware will use in-band signalling, or can have in-band signalling configured via software to be enabled. LINK_INBAND_BYPASS indicates that the hardware has the ability to bypass in-band signalling when enabled after a timeout if the link partner does not respond to its in-band signalling. This reports the PHY capabilities for the particular interface mode, not the current configuration. Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1tIUre-006ITz-KF@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy.c | 21 +++++++++++++++++++++ include/linux/phy.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 0d20b534122b2..f42cd65848418 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -1005,6 +1005,27 @@ static int phy_check_link_status(struct phy_device *phydev) return 0; } +/** + * phy_inband_caps - query which in-band signalling modes are supported + * @phydev: a pointer to a &struct phy_device + * @interface: the interface mode for the PHY + * + * Returns zero if it is unknown what in-band signalling is supported by the + * PHY (e.g. because the PHY driver doesn't implement the method.) Otherwise, + * returns a bit mask of the LINK_INBAND_* values from + * &enum link_inband_signalling to describe which inband modes are supported + * by the PHY for this interface mode. + */ +unsigned int phy_inband_caps(struct phy_device *phydev, + phy_interface_t interface) +{ + if (phydev->drv && phydev->drv->inband_caps) + return phydev->drv->inband_caps(phydev, interface); + + return 0; +} +EXPORT_SYMBOL_GPL(phy_inband_caps); + /** * _phy_start_aneg - start auto-negotiation for this PHY device * @phydev: the phy_device struct diff --git a/include/linux/phy.h b/include/linux/phy.h index 563c462056857..ccb93d892da94 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -817,6 +817,24 @@ struct phy_tdr_config { }; #define PHY_PAIR_ALL -1 +/** + * enum link_inband_signalling - in-band signalling modes that are supported + * + * @LINK_INBAND_DISABLE: in-band signalling can be disabled + * @LINK_INBAND_ENABLE: in-band signalling can be enabled without bypass + * @LINK_INBAND_BYPASS: in-band signalling can be enabled with bypass + * + * The possible and required bits can only be used if the valid bit is set. + * If possible is clear, that means inband signalling can not be used. + * Required is only valid when possible is set, and means that inband + * signalling must be used. + */ +enum link_inband_signalling { + LINK_INBAND_DISABLE = BIT(0), + LINK_INBAND_ENABLE = BIT(1), + LINK_INBAND_BYPASS = BIT(2), +}; + /** * struct phy_plca_cfg - Configuration of the PLCA (Physical Layer Collision * Avoidance) Reconciliation Sublayer. @@ -956,6 +974,14 @@ struct phy_driver { */ int (*get_features)(struct phy_device *phydev); + /** + * @inband_caps: query whether in-band is supported for the given PHY + * interface mode. Returns a bitmask of bits defined by enum + * link_inband_signalling. + */ + unsigned int (*inband_caps)(struct phy_device *phydev, + phy_interface_t interface); + /** * @get_rate_matching: Get the supported type of rate matching for a * particular phy interface. This is used by phy consumers to determine @@ -1818,6 +1844,8 @@ int phy_config_aneg(struct phy_device *phydev); int _phy_start_aneg(struct phy_device *phydev); int phy_start_aneg(struct phy_device *phydev); int phy_aneg_done(struct phy_device *phydev); +unsigned int phy_inband_caps(struct phy_device *phydev, + phy_interface_t interface); int phy_speed_down(struct phy_device *phydev, bool sync); int phy_speed_up(struct phy_device *phydev); bool phy_check_valid(int speed, int duplex, unsigned long *features); From c64c7fa0a774d9da72071a8517e359992baac982 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 3 Dec 2024 15:31:07 +0000 Subject: [PATCH 0135/1450] net: phy: bcm84881: implement phy_inband_caps() method BCM84881 has no support for inband signalling, so this is a trivial implementation that returns no support for inband. Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Acked-by: Florian Fainelli Link: https://patch.msgid.link/E1tIUrj-006IU6-ON@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/phy/bcm84881.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/phy/bcm84881.c b/drivers/net/phy/bcm84881.c index 97da3aee49422..47405bded6779 100644 --- a/drivers/net/phy/bcm84881.c +++ b/drivers/net/phy/bcm84881.c @@ -235,11 +235,21 @@ static int bcm84881_read_status(struct phy_device *phydev) return genphy_c45_read_mdix(phydev); } +/* The Broadcom BCM84881 in the Methode DM7052 is unable to provide a SGMII + * or 802.3z control word, so inband will not work. + */ +static unsigned int bcm84881_inband_caps(struct phy_device *phydev, + phy_interface_t interface) +{ + return LINK_INBAND_DISABLE; +} + static struct phy_driver bcm84881_drivers[] = { { .phy_id = 0xae025150, .phy_id_mask = 0xfffffff0, .name = "Broadcom BCM84881", + .inband_caps = bcm84881_inband_caps, .config_init = bcm84881_config_init, .probe = bcm84881_probe, .get_features = bcm84881_get_features, From 1c86828dff88e28b8ade6bddeee0163a023faf91 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 3 Dec 2024 15:31:12 +0000 Subject: [PATCH 0136/1450] net: phy: marvell: implement phy_inband_caps() method Provide an implementation for phy_inband_caps() for Marvell PHYs used on SFP modules, so that phylink knows the PHYs capabilities. Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1tIUro-006IUC-Rq@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/phy/marvell.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index cd50cd6a7f75c..3075ebc3f964b 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -717,6 +717,20 @@ static int marvell_config_aneg_fiber(struct phy_device *phydev) return genphy_check_and_restart_aneg(phydev, changed); } +static unsigned int m88e1111_inband_caps(struct phy_device *phydev, + phy_interface_t interface) +{ + /* In 1000base-X and SGMII modes, the inband mode can be changed + * through the Fibre page BMCR ANENABLE bit. + */ + if (interface == PHY_INTERFACE_MODE_1000BASEX || + interface == PHY_INTERFACE_MODE_SGMII) + return LINK_INBAND_DISABLE | LINK_INBAND_ENABLE | + LINK_INBAND_BYPASS; + + return 0; +} + static int m88e1111_config_aneg(struct phy_device *phydev) { int extsr = phy_read(phydev, MII_M1111_PHY_EXT_SR); @@ -3677,6 +3691,7 @@ static struct phy_driver marvell_drivers[] = { .name = "Marvell 88E1112", /* PHY_GBIT_FEATURES */ .probe = marvell_probe, + .inband_caps = m88e1111_inband_caps, .config_init = m88e1112_config_init, .config_aneg = marvell_config_aneg, .config_intr = marvell_config_intr, @@ -3698,6 +3713,7 @@ static struct phy_driver marvell_drivers[] = { /* PHY_GBIT_FEATURES */ .flags = PHY_POLL_CABLE_TEST, .probe = marvell_probe, + .inband_caps = m88e1111_inband_caps, .config_init = m88e1111gbe_config_init, .config_aneg = m88e1111_config_aneg, .read_status = marvell_read_status, @@ -3721,6 +3737,7 @@ static struct phy_driver marvell_drivers[] = { .name = "Marvell 88E1111 (Finisar)", /* PHY_GBIT_FEATURES */ .probe = marvell_probe, + .inband_caps = m88e1111_inband_caps, .config_init = m88e1111gbe_config_init, .config_aneg = m88e1111_config_aneg, .read_status = marvell_read_status, From 5d58a890c02770ba8d790b1f3c6e8c0e20514dc2 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 3 Dec 2024 15:31:18 +0000 Subject: [PATCH 0137/1450] net: phy: add phy_config_inband() Add a method to configure the PHY's in-band mode. Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1tIUru-006IUI-08@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy.c | 32 ++++++++++++++++++++++++++++++++ include/linux/phy.h | 6 ++++++ 2 files changed, 38 insertions(+) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index f42cd65848418..0c228aa180198 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -1026,6 +1026,38 @@ unsigned int phy_inband_caps(struct phy_device *phydev, } EXPORT_SYMBOL_GPL(phy_inband_caps); +/** + * phy_config_inband - configure the desired PHY in-band mode + * @phydev: the phy_device struct + * @modes: in-band modes to configure + * + * Description: disables, enables or enables-with-bypass in-band signalling + * between the PHY and host system. + * + * Returns: zero on success, or negative errno value. + */ +int phy_config_inband(struct phy_device *phydev, unsigned int modes) +{ + int err; + + if (!!(modes & LINK_INBAND_DISABLE) + + !!(modes & LINK_INBAND_ENABLE) + + !!(modes & LINK_INBAND_BYPASS) != 1) + return -EINVAL; + + mutex_lock(&phydev->lock); + if (!phydev->drv) + err = -EIO; + else if (!phydev->drv->config_inband) + err = -EOPNOTSUPP; + else + err = phydev->drv->config_inband(phydev, modes); + mutex_unlock(&phydev->lock); + + return err; +} +EXPORT_SYMBOL(phy_config_inband); + /** * _phy_start_aneg - start auto-negotiation for this PHY device * @phydev: the phy_device struct diff --git a/include/linux/phy.h b/include/linux/phy.h index ccb93d892da94..61a1bc81f597f 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -982,6 +982,11 @@ struct phy_driver { unsigned int (*inband_caps)(struct phy_device *phydev, phy_interface_t interface); + /** + * @config_inband: configure in-band mode for the PHY + */ + int (*config_inband)(struct phy_device *phydev, unsigned int modes); + /** * @get_rate_matching: Get the supported type of rate matching for a * particular phy interface. This is used by phy consumers to determine @@ -1846,6 +1851,7 @@ int phy_start_aneg(struct phy_device *phydev); int phy_aneg_done(struct phy_device *phydev); unsigned int phy_inband_caps(struct phy_device *phydev, phy_interface_t interface); +int phy_config_inband(struct phy_device *phydev, unsigned int modes); int phy_speed_down(struct phy_device *phydev, bool sync); int phy_speed_up(struct phy_device *phydev); bool phy_check_valid(int speed, int duplex, unsigned long *features); From a219912e0fec73c346e64ef47013cb2e152f88fc Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 3 Dec 2024 15:31:23 +0000 Subject: [PATCH 0138/1450] net: phy: marvell: implement config_inband() method Implement the config_inband() method for Marvell 88E1112, 88E1111, and Finisar's 88E1111 variant. Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1tIUrz-006IUO-3r@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/phy/marvell.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index 3075ebc3f964b..b885bc0fe6e07 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -731,6 +731,34 @@ static unsigned int m88e1111_inband_caps(struct phy_device *phydev, return 0; } +static int m88e1111_config_inband(struct phy_device *phydev, unsigned int modes) +{ + u16 extsr, bmcr; + int err; + + if (phydev->interface != PHY_INTERFACE_MODE_1000BASEX && + phydev->interface != PHY_INTERFACE_MODE_SGMII) + return -EINVAL; + + if (modes == LINK_INBAND_BYPASS) + extsr = MII_M1111_HWCFG_SERIAL_AN_BYPASS; + else + extsr = 0; + + if (modes == LINK_INBAND_DISABLE) + bmcr = 0; + else + bmcr = BMCR_ANENABLE; + + err = phy_modify(phydev, MII_M1111_PHY_EXT_SR, + MII_M1111_HWCFG_SERIAL_AN_BYPASS, extsr); + if (err < 0) + return extsr; + + return phy_modify_paged(phydev, MII_MARVELL_FIBER_PAGE, MII_BMCR, + BMCR_ANENABLE, bmcr); +} + static int m88e1111_config_aneg(struct phy_device *phydev) { int extsr = phy_read(phydev, MII_M1111_PHY_EXT_SR); @@ -3692,6 +3720,7 @@ static struct phy_driver marvell_drivers[] = { /* PHY_GBIT_FEATURES */ .probe = marvell_probe, .inband_caps = m88e1111_inband_caps, + .config_inband = m88e1111_config_inband, .config_init = m88e1112_config_init, .config_aneg = marvell_config_aneg, .config_intr = marvell_config_intr, @@ -3714,6 +3743,7 @@ static struct phy_driver marvell_drivers[] = { .flags = PHY_POLL_CABLE_TEST, .probe = marvell_probe, .inband_caps = m88e1111_inband_caps, + .config_inband = m88e1111_config_inband, .config_init = m88e1111gbe_config_init, .config_aneg = m88e1111_config_aneg, .read_status = marvell_read_status, @@ -3738,6 +3768,7 @@ static struct phy_driver marvell_drivers[] = { /* PHY_GBIT_FEATURES */ .probe = marvell_probe, .inband_caps = m88e1111_inband_caps, + .config_inband = m88e1111_config_inband, .config_init = m88e1111gbe_config_init, .config_aneg = m88e1111_config_aneg, .read_status = marvell_read_status, From df874f9e52c340cc6f0a0014a97b778f67d46849 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 3 Dec 2024 15:31:28 +0000 Subject: [PATCH 0139/1450] net: phylink: add pcs_inband_caps() method Add a pcs_inband_caps() method to query the PCS for its inband link capabilities, and use this to determine whether link modes used with optical SFPs can be supported. When a PCS does not provide a method, we allow inband negotiation to be either on or off, making this a no-op until the pcs_inband_caps() method is implemented by a PCS driver. Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1tIUs4-006IUU-7K@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/phy/phylink.c | 60 +++++++++++++++++++++++++++++++++++++++ include/linux/phylink.h | 17 +++++++++++ 2 files changed, 77 insertions(+) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index fda53dd58285e..42f3c7ccbf382 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -990,6 +990,15 @@ static void phylink_resolve_an_pause(struct phylink_link_state *state) } } +static unsigned int phylink_pcs_inband_caps(struct phylink_pcs *pcs, + phy_interface_t interface) +{ + if (pcs && pcs->ops->pcs_inband_caps) + return pcs->ops->pcs_inband_caps(pcs, interface); + + return 0; +} + static void phylink_pcs_pre_config(struct phylink_pcs *pcs, phy_interface_t interface) { @@ -1043,6 +1052,24 @@ static void phylink_pcs_link_up(struct phylink_pcs *pcs, unsigned int neg_mode, pcs->ops->pcs_link_up(pcs, neg_mode, interface, speed, duplex); } +/* Query inband for a specific interface mode, asking the MAC for the + * PCS which will be used to handle the interface mode. + */ +static unsigned int phylink_inband_caps(struct phylink *pl, + phy_interface_t interface) +{ + struct phylink_pcs *pcs; + + if (!pl->mac_ops->mac_select_pcs) + return 0; + + pcs = pl->mac_ops->mac_select_pcs(pl->config, interface); + if (!pcs) + return 0; + + return phylink_pcs_inband_caps(pcs, interface); +} + static void phylink_pcs_poll_stop(struct phylink *pl) { if (pl->cfg_link_an_mode == MLO_AN_INBAND) @@ -2532,6 +2559,26 @@ int phylink_ethtool_ksettings_get(struct phylink *pl, } EXPORT_SYMBOL_GPL(phylink_ethtool_ksettings_get); +static bool phylink_validate_pcs_inband_autoneg(struct phylink *pl, + phy_interface_t interface, + unsigned long *adv) +{ + unsigned int inband = phylink_inband_caps(pl, interface); + unsigned int mask; + + /* If the PCS doesn't implement inband support, be permissive. */ + if (!inband) + return true; + + if (linkmode_test_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, adv)) + mask = LINK_INBAND_ENABLE; + else + mask = LINK_INBAND_DISABLE; + + /* Check whether the PCS implements the required mode */ + return !!(inband & mask); +} + /** * phylink_ethtool_ksettings_set() - set the link settings * @pl: a pointer to a &struct phylink returned from phylink_create() @@ -2662,6 +2709,13 @@ int phylink_ethtool_ksettings_set(struct phylink *pl, phylink_is_empty_linkmode(config.advertising)) return -EINVAL; + /* Validate the autonegotiation state. We don't have a PHY in this + * situation, so the PCS is the media-facing entity. + */ + if (!phylink_validate_pcs_inband_autoneg(pl, config.interface, + config.advertising)) + return -EINVAL; + mutex_lock(&pl->state_mutex); pl->link_config.speed = config.speed; pl->link_config.duplex = config.duplex; @@ -3341,6 +3395,12 @@ static int phylink_sfp_config_optical(struct phylink *pl) phylink_dbg(pl, "optical SFP: chosen %s interface\n", phy_modes(interface)); + if (!phylink_validate_pcs_inband_autoneg(pl, interface, + config.advertising)) { + phylink_err(pl, "autoneg setting not compatible with PCS"); + return -EINVAL; + } + config.interface = interface; /* Ignore errors if we're expecting a PHY to attach later */ diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 5c01048860c4c..5462cc6a37dc2 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -419,6 +419,7 @@ struct phylink_pcs { /** * struct phylink_pcs_ops - MAC PCS operations structure. * @pcs_validate: validate the link configuration. + * @pcs_inband_caps: query inband support for interface mode. * @pcs_enable: enable the PCS. * @pcs_disable: disable the PCS. * @pcs_pre_config: pre-mac_config method (for errata) @@ -434,6 +435,8 @@ struct phylink_pcs { struct phylink_pcs_ops { int (*pcs_validate)(struct phylink_pcs *pcs, unsigned long *supported, const struct phylink_link_state *state); + unsigned int (*pcs_inband_caps)(struct phylink_pcs *pcs, + phy_interface_t interface); int (*pcs_enable)(struct phylink_pcs *pcs); void (*pcs_disable)(struct phylink_pcs *pcs); void (*pcs_pre_config)(struct phylink_pcs *pcs, @@ -470,6 +473,20 @@ struct phylink_pcs_ops { int pcs_validate(struct phylink_pcs *pcs, unsigned long *supported, const struct phylink_link_state *state); +/** + * pcs_inband_caps - query PCS in-band capabilities for interface mode. + * @pcs: a pointer to a &struct phylink_pcs. + * @interface: interface mode to be queried + * + * Returns zero if it is unknown what in-band signalling is supported by the + * PHY (e.g. because the PHY driver doesn't implement the method.) Otherwise, + * returns a bit mask of the LINK_INBAND_* values from + * &enum link_inband_signalling to describe which inband modes are supported + * for this interface mode. + */ +unsigned int pcs_inband_caps(struct phylink_pcs *pcs, + phy_interface_t interface); + /** * pcs_enable() - enable the PCS. * @pcs: a pointer to a &struct phylink_pcs. From 513e8fb8fa32035b3325e2e14fb9598f8cb545e9 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 3 Dec 2024 15:31:33 +0000 Subject: [PATCH 0140/1450] net: mvneta: implement pcs_inband_caps() method Report the PCS in-band capabilities to phylink for Marvell NETA interfaces. Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1tIUs9-006IUb-Au@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/mvneta.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 1fb285fa0bdb7..fe6261b815409 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -3960,20 +3960,27 @@ static struct mvneta_port *mvneta_pcs_to_port(struct phylink_pcs *pcs) return container_of(pcs, struct mvneta_port, phylink_pcs); } -static int mvneta_pcs_validate(struct phylink_pcs *pcs, - unsigned long *supported, - const struct phylink_link_state *state) +static unsigned int mvneta_pcs_inband_caps(struct phylink_pcs *pcs, + phy_interface_t interface) { - /* We only support QSGMII, SGMII, 802.3z and RGMII modes. - * When in 802.3z mode, we must have AN enabled: + /* When operating in an 802.3z mode, we must have AN enabled: * "Bit 2 Field InBandAnEn In-band Auto-Negotiation enable. ... * When = 1 (1000BASE-X) this field must be set to 1." + * Therefore, inband is "required". */ - if (phy_interface_mode_is_8023z(state->interface) && - !phylink_test(state->advertising, Autoneg)) - return -EINVAL; + if (phy_interface_mode_is_8023z(interface)) + return LINK_INBAND_ENABLE; - return 0; + /* QSGMII, SGMII and RGMII can be configured to use inband + * signalling of the AN result. Indicate these as "possible". + */ + if (interface == PHY_INTERFACE_MODE_SGMII || + interface == PHY_INTERFACE_MODE_QSGMII || + phy_interface_mode_is_rgmii(interface)) + return LINK_INBAND_DISABLE | LINK_INBAND_ENABLE; + + /* For any other modes, indicate that inband is not supported. */ + return LINK_INBAND_DISABLE; } static void mvneta_pcs_get_state(struct phylink_pcs *pcs, @@ -4071,7 +4078,7 @@ static void mvneta_pcs_an_restart(struct phylink_pcs *pcs) } static const struct phylink_pcs_ops mvneta_phylink_pcs_ops = { - .pcs_validate = mvneta_pcs_validate, + .pcs_inband_caps = mvneta_pcs_inband_caps, .pcs_get_state = mvneta_pcs_get_state, .pcs_config = mvneta_pcs_config, .pcs_an_restart = mvneta_pcs_an_restart, From d4169f0c7665afb8d8adb5e1b1df3db88517d0ad Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 3 Dec 2024 15:31:38 +0000 Subject: [PATCH 0141/1450] net: mvpp2: implement pcs_inband_caps() method Report the PCS in-band capabilities to phylink for Marvell PP2 interfaces. Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1tIUsE-006IUh-E7@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- .../net/ethernet/marvell/mvpp2/mvpp2_main.c | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 571631a303202..f85229a308441 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -6224,19 +6224,26 @@ static const struct phylink_pcs_ops mvpp2_phylink_xlg_pcs_ops = { .pcs_config = mvpp2_xlg_pcs_config, }; -static int mvpp2_gmac_pcs_validate(struct phylink_pcs *pcs, - unsigned long *supported, - const struct phylink_link_state *state) +static unsigned int mvpp2_gmac_pcs_inband_caps(struct phylink_pcs *pcs, + phy_interface_t interface) { - /* When in 802.3z mode, we must have AN enabled: + /* When operating in an 802.3z mode, we must have AN enabled: * Bit 2 Field InBandAnEn In-band Auto-Negotiation enable. ... * When = 1 (1000BASE-X) this field must be set to 1. + * Therefore, inband is "required". */ - if (phy_interface_mode_is_8023z(state->interface) && - !phylink_test(state->advertising, Autoneg)) - return -EINVAL; + if (phy_interface_mode_is_8023z(interface)) + return LINK_INBAND_ENABLE; - return 0; + /* SGMII and RGMII can be configured to use inband signalling of the + * AN result. Indicate these as "possible". + */ + if (interface == PHY_INTERFACE_MODE_SGMII || + phy_interface_mode_is_rgmii(interface)) + return LINK_INBAND_DISABLE | LINK_INBAND_ENABLE; + + /* For any other modes, indicate that inband is not supported. */ + return LINK_INBAND_DISABLE; } static void mvpp2_gmac_pcs_get_state(struct phylink_pcs *pcs, @@ -6343,7 +6350,7 @@ static void mvpp2_gmac_pcs_an_restart(struct phylink_pcs *pcs) } static const struct phylink_pcs_ops mvpp2_phylink_gmac_pcs_ops = { - .pcs_validate = mvpp2_gmac_pcs_validate, + .pcs_inband_caps = mvpp2_gmac_pcs_inband_caps, .pcs_get_state = mvpp2_gmac_pcs_get_state, .pcs_config = mvpp2_gmac_pcs_config, .pcs_an_restart = mvpp2_gmac_pcs_an_restart, From 5fd0f1a02e750e2db4038dee60edea669ce5aab1 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 3 Dec 2024 15:31:43 +0000 Subject: [PATCH 0142/1450] net: phylink: add negotiation of in-band capabilities Support for in-band signalling with Serdes links is uncertain. Some PHYs do not support in-band for e.g. SGMII. Some PCS do not support in-band for 2500Base-X. Some PCS require in-band for Base-X protocols. Simply using what is in DT is insufficient when we have hot-pluggable PHYs e.g. in the form of SFP modules, which may not provide the in-band signalling. In order to address this, we have introduced phy_inband_caps() and pcs_inband_caps() functions to allow phylink to retrieve the capabilities from each end of the PCS/PHY link. This commit adds code to resolve whether in-band will be used in the various scenarios that we have: In-band not being used, PHY present using SGMII or Base-X, PHY not present. We also deal with no capabilties provided. Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1tIUsJ-006IUn-H3@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/phy/phylink.c | 154 +++++++++++++++++++++++++++++++++++--- 1 file changed, 144 insertions(+), 10 deletions(-) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 42f3c7ccbf382..b0881fa9c72ed 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -75,6 +75,7 @@ struct phylink { struct mutex state_mutex; struct phylink_link_state phy_state; + unsigned int phy_ib_mode; struct work_struct resolve; unsigned int pcs_neg_mode; unsigned int pcs_state; @@ -1153,10 +1154,18 @@ static void phylink_pcs_neg_mode(struct phylink *pl, struct phylink_pcs *pcs, phy_interface_t interface, const unsigned long *advertising) { + unsigned int pcs_ib_caps = 0; + unsigned int phy_ib_caps = 0; unsigned int neg_mode, mode; + enum { + INBAND_CISCO_SGMII, + INBAND_BASEX, + } type; mode = pl->req_link_an_mode; + pl->phy_ib_mode = 0; + switch (interface) { case PHY_INTERFACE_MODE_SGMII: case PHY_INTERFACE_MODE_QSGMII: @@ -1168,10 +1177,7 @@ static void phylink_pcs_neg_mode(struct phylink *pl, struct phylink_pcs *pcs, * inband communication. Note: there exist PHYs that run * with SGMII but do not send the inband data. */ - if (!phylink_autoneg_inband(mode)) - neg_mode = PHYLINK_PCS_NEG_OUTBAND; - else - neg_mode = PHYLINK_PCS_NEG_INBAND_ENABLED; + type = INBAND_CISCO_SGMII; break; case PHY_INTERFACE_MODE_1000BASEX: @@ -1182,18 +1188,139 @@ static void phylink_pcs_neg_mode(struct phylink *pl, struct phylink_pcs *pcs, * as well, but drivers may not support this, so may * need to override this. */ - if (!phylink_autoneg_inband(mode)) + type = INBAND_BASEX; + break; + + default: + pl->pcs_neg_mode = PHYLINK_PCS_NEG_NONE; + pl->act_link_an_mode = mode; + return; + } + + if (pcs) + pcs_ib_caps = phylink_pcs_inband_caps(pcs, interface); + + if (pl->phydev) + phy_ib_caps = phy_inband_caps(pl->phydev, interface); + + phylink_dbg(pl, "interface %s inband modes: pcs=%02x phy=%02x\n", + phy_modes(interface), pcs_ib_caps, phy_ib_caps); + + if (!phylink_autoneg_inband(mode)) { + bool pcs_ib_only = false; + bool phy_ib_only = false; + + if (pcs_ib_caps && pcs_ib_caps != LINK_INBAND_DISABLE) { + /* PCS supports reporting in-band capabilities, and + * supports more than disable mode. + */ + if (pcs_ib_caps & LINK_INBAND_DISABLE) + neg_mode = PHYLINK_PCS_NEG_OUTBAND; + else if (pcs_ib_caps & LINK_INBAND_ENABLE) + pcs_ib_only = true; + } + + if (phy_ib_caps && phy_ib_caps != LINK_INBAND_DISABLE) { + /* PHY supports in-band capabilities, and supports + * more than disable mode. + */ + if (phy_ib_caps & LINK_INBAND_DISABLE) + pl->phy_ib_mode = LINK_INBAND_DISABLE; + else if (phy_ib_caps & LINK_INBAND_BYPASS) + pl->phy_ib_mode = LINK_INBAND_BYPASS; + else if (phy_ib_caps & LINK_INBAND_ENABLE) + phy_ib_only = true; + } + + /* If either the PCS or PHY requires inband to be enabled, + * this is an invalid configuration. Provide a diagnostic + * message for this case, but don't try to force the issue. + */ + if (pcs_ib_only || phy_ib_only) + phylink_warn(pl, + "firmware wants %s mode, but %s%s%s requires inband\n", + phylink_an_mode_str(mode), + pcs_ib_only ? "PCS" : "", + pcs_ib_only && phy_ib_only ? " and " : "", + phy_ib_only ? "PHY" : ""); + + neg_mode = PHYLINK_PCS_NEG_OUTBAND; + } else if (type == INBAND_CISCO_SGMII || pl->phydev) { + /* For SGMII modes which are designed to be used with PHYs, or + * Base-X with a PHY, we try to use in-band mode where-ever + * possible. However, there are some PHYs e.g. BCM84881 which + * do not support in-band. + */ + const unsigned int inband_ok = LINK_INBAND_ENABLE | + LINK_INBAND_BYPASS; + const unsigned int outband_ok = LINK_INBAND_DISABLE | + LINK_INBAND_BYPASS; + /* PCS PHY + * D E D E + * 0 0 0 0 no information inband enabled + * 1 0 0 0 pcs doesn't support outband + * 0 1 0 0 pcs required inband enabled + * 1 1 0 0 pcs optional inband enabled + * 0 0 1 0 phy doesn't support outband + * 1 0 1 0 pcs+phy doesn't support outband + * 0 1 1 0 pcs required, phy doesn't support, invalid + * 1 1 1 0 pcs optional, phy doesn't support, outband + * 0 0 0 1 phy required inband enabled + * 1 0 0 1 pcs doesn't support, phy required, invalid + * 0 1 0 1 pcs+phy required inband enabled + * 1 1 0 1 pcs optional, phy required inband enabled + * 0 0 1 1 phy optional inband enabled + * 1 0 1 1 pcs doesn't support, phy optional, outband + * 0 1 1 1 pcs required, phy optional inband enabled + * 1 1 1 1 pcs+phy optional inband enabled + */ + if ((!pcs_ib_caps || pcs_ib_caps & inband_ok) && + (!phy_ib_caps || phy_ib_caps & inband_ok)) { + /* In-band supported or unknown at both ends. Enable + * in-band mode with or without bypass at the PHY. + */ + if (phy_ib_caps & LINK_INBAND_ENABLE) + pl->phy_ib_mode = LINK_INBAND_ENABLE; + else if (phy_ib_caps & LINK_INBAND_BYPASS) + pl->phy_ib_mode = LINK_INBAND_BYPASS; + + neg_mode = PHYLINK_PCS_NEG_INBAND_ENABLED; + } else if ((!pcs_ib_caps || pcs_ib_caps & outband_ok) && + (!phy_ib_caps || phy_ib_caps & outband_ok)) { + /* Either in-band not supported at at least one end. + * In-band bypass at the other end is possible. + */ + if (phy_ib_caps & LINK_INBAND_DISABLE) + pl->phy_ib_mode = LINK_INBAND_DISABLE; + else if (phy_ib_caps & LINK_INBAND_BYPASS) + pl->phy_ib_mode = LINK_INBAND_BYPASS; + neg_mode = PHYLINK_PCS_NEG_OUTBAND; + if (pl->phydev) + mode = MLO_AN_PHY; + } else { + /* invalid */ + phylink_warn(pl, "%s: incompatible in-band capabilities, trying in-band", + phy_modes(interface)); + neg_mode = PHYLINK_PCS_NEG_INBAND_ENABLED; + } + } else { + /* For Base-X without a PHY */ + if (pcs_ib_caps == LINK_INBAND_DISABLE) + /* If the PCS doesn't support inband, then inband must + * be disabled. + */ + neg_mode = PHYLINK_PCS_NEG_INBAND_DISABLED; + else if (pcs_ib_caps == LINK_INBAND_ENABLE) + /* If the PCS requires inband, then inband must always + * be enabled. + */ + neg_mode = PHYLINK_PCS_NEG_INBAND_ENABLED; else if (linkmode_test_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, advertising)) neg_mode = PHYLINK_PCS_NEG_INBAND_ENABLED; else neg_mode = PHYLINK_PCS_NEG_INBAND_DISABLED; - break; - - default: - neg_mode = PHYLINK_PCS_NEG_NONE; - break; } pl->pcs_neg_mode = neg_mode; @@ -1292,6 +1419,13 @@ static void phylink_major_config(struct phylink *pl, bool restart, ERR_PTR(err)); } + if (pl->phydev && pl->phy_ib_mode) { + err = phy_config_inband(pl->phydev, pl->phy_ib_mode); + if (err < 0) + phylink_err(pl, "phy_config_inband: %pe\n", + ERR_PTR(err)); + } + if (pl->sfp_bus) { rate_kbd = phylink_interface_signal_rate(state->interface); if (rate_kbd) From 77ac9a8b2536e0eaca6c6f21070068458bf55981 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 3 Dec 2024 15:31:48 +0000 Subject: [PATCH 0143/1450] net: phylink: remove phylink_phy_no_inband() Remove phylink_phy_no_inband() now that we are handling the lack of inband negotiation by querying the capabilities of the PHY and PCS, and the BCM84881 PHY driver provides us the information necessary to make the decision. Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1tIUsO-006IUt-KN@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/phy/phylink.c | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index b0881fa9c72ed..95fbc363f9a68 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -3391,10 +3391,11 @@ static phy_interface_t phylink_choose_sfp_interface(struct phylink *pl, return interface; } -static void phylink_sfp_set_config(struct phylink *pl, u8 mode, +static void phylink_sfp_set_config(struct phylink *pl, unsigned long *supported, struct phylink_link_state *state) { + u8 mode = MLO_AN_INBAND; bool changed = false; phylink_dbg(pl, "requesting link mode %s/%s with support %*pb\n", @@ -3428,8 +3429,7 @@ static void phylink_sfp_set_config(struct phylink *pl, u8 mode, phylink_mac_initial_config(pl, false); } -static int phylink_sfp_config_phy(struct phylink *pl, u8 mode, - struct phy_device *phy) +static int phylink_sfp_config_phy(struct phylink *pl, struct phy_device *phy) { __ETHTOOL_DECLARE_LINK_MODE_MASK(support); struct phylink_link_state config; @@ -3473,7 +3473,7 @@ static int phylink_sfp_config_phy(struct phylink *pl, u8 mode, pl->link_port = pl->sfp_port; - phylink_sfp_set_config(pl, mode, support, &config); + phylink_sfp_set_config(pl, support, &config); return 0; } @@ -3548,7 +3548,7 @@ static int phylink_sfp_config_optical(struct phylink *pl) pl->link_port = pl->sfp_port; - phylink_sfp_set_config(pl, MLO_AN_INBAND, pl->sfp_support, &config); + phylink_sfp_set_config(pl, pl->sfp_support, &config); return 0; } @@ -3619,19 +3619,9 @@ static void phylink_sfp_link_up(void *upstream) phylink_enable_and_run_resolve(pl, PHYLINK_DISABLE_LINK); } -/* The Broadcom BCM84881 in the Methode DM7052 is unable to provide a SGMII - * or 802.3z control word, so inband will not work. - */ -static bool phylink_phy_no_inband(struct phy_device *phy) -{ - return phy->is_c45 && phy_id_compare(phy->c45_ids.device_ids[1], - 0xae025150, 0xfffffff0); -} - static int phylink_sfp_connect_phy(void *upstream, struct phy_device *phy) { struct phylink *pl = upstream; - u8 mode; /* * This is the new way of dealing with flow control for PHYs, @@ -3642,17 +3632,12 @@ static int phylink_sfp_connect_phy(void *upstream, struct phy_device *phy) */ phy_support_asym_pause(phy); - if (phylink_phy_no_inband(phy)) - mode = MLO_AN_PHY; - else - mode = MLO_AN_INBAND; - /* Set the PHY's host supported interfaces */ phy_interface_and(phy->host_interfaces, phylink_sfp_interfaces, pl->config->supported_interfaces); /* Do the initial configuration */ - return phylink_sfp_config_phy(pl, mode, phy); + return phylink_sfp_config_phy(pl, phy); } static void phylink_sfp_disconnect_phy(void *upstream, From 5204ccbfa22358f95afd031a3f337e6d9a74baea Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 3 Dec 2024 17:36:17 +0000 Subject: [PATCH 0144/1450] inet: add indirect call wrapper for getfrag() calls UDP send path suffers from one indirect call to ip_generic_getfrag() We can use INDIRECT_CALL_1() to avoid it. Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Reviewed-by: Brian Vazquez Link: https://patch.msgid.link/20241203173617.2595451-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/ip_output.c | 13 +++++++++---- net/ipv6/ip6_output.c | 13 ++++++++----- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 0065b1996c947..a59204a8d8505 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1169,7 +1169,10 @@ static int __ip_append_data(struct sock *sk, /* [!] NOTE: copy will be negative if pagedlen>0 * because then the equation reduces to -fraggap. */ - if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) { + if (copy > 0 && + INDIRECT_CALL_1(getfrag, ip_generic_getfrag, + from, data + transhdrlen, offset, + copy, fraggap, skb) < 0) { err = -EFAULT; kfree_skb(skb); goto error; @@ -1213,8 +1216,9 @@ static int __ip_append_data(struct sock *sk, unsigned int off; off = skb->len; - if (getfrag(from, skb_put(skb, copy), - offset, copy, off, skb) < 0) { + if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag, + from, skb_put(skb, copy), + offset, copy, off, skb) < 0) { __skb_trim(skb, off); err = -EFAULT; goto error; @@ -1252,7 +1256,8 @@ static int __ip_append_data(struct sock *sk, get_page(pfrag->page); } copy = min_t(int, copy, pfrag->size - pfrag->offset); - if (getfrag(from, + if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag, + from, page_address(pfrag->page) + pfrag->offset, offset, copy, skb->len, skb) < 0) goto error_efault; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index f7b4608bb316e..3d672dea9f562 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1697,8 +1697,9 @@ static int __ip6_append_data(struct sock *sk, pskb_trim_unique(skb_prev, maxfraglen); } if (copy > 0 && - getfrag(from, data + transhdrlen, offset, - copy, fraggap, skb) < 0) { + INDIRECT_CALL_1(getfrag, ip_generic_getfrag, + from, data + transhdrlen, offset, + copy, fraggap, skb) < 0) { err = -EFAULT; kfree_skb(skb); goto error; @@ -1742,8 +1743,9 @@ static int __ip6_append_data(struct sock *sk, unsigned int off; off = skb->len; - if (getfrag(from, skb_put(skb, copy), - offset, copy, off, skb) < 0) { + if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag, + from, skb_put(skb, copy), + offset, copy, off, skb) < 0) { __skb_trim(skb, off); err = -EFAULT; goto error; @@ -1781,7 +1783,8 @@ static int __ip6_append_data(struct sock *sk, get_page(pfrag->page); } copy = min_t(int, copy, pfrag->size - pfrag->offset); - if (getfrag(from, + if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag, + from, page_address(pfrag->page) + pfrag->offset, offset, copy, skb->len, skb) < 0) goto error_efault; From ac98b3132402e1b892c16f87d766f21ef18dd344 Mon Sep 17 00:00:00 2001 From: Kenjiro Nakayama Date: Wed, 4 Dec 2024 07:28:44 +0900 Subject: [PATCH 0145/1450] selftests/net: call sendmmsg via udpgso_bench.sh Currently, sendmmsg is implemented in udpgso_bench_tx.c, but it is not called by any test script. This patch adds a test for sendmmsg in udpgso_bench.sh. This allows for basic API testing and benchmarking comparisons with GSO. Signed-off-by: Kenjiro Nakayama Reviewed-by: Hangbin Liu Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20241203222843.26983-1-nakayamakenjiro@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/udpgso_bench.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/net/udpgso_bench.sh b/tools/testing/selftests/net/udpgso_bench.sh index 640bc43452faa..88fa1d53ba2b2 100755 --- a/tools/testing/selftests/net/udpgso_bench.sh +++ b/tools/testing/selftests/net/udpgso_bench.sh @@ -92,6 +92,9 @@ run_udp() { echo "udp" run_in_netns ${args} + echo "udp sendmmsg" + run_in_netns ${args} -m + echo "udp gso" run_in_netns ${args} -S 0 From 152d00a913969514967ad3f962b3b1c8983eb2d7 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Tue, 3 Dec 2024 22:33:22 +0100 Subject: [PATCH 0146/1450] r8169: simplify setting hwmon attribute visibility Use new member visible to simplify setting the static visibility. Signed-off-by: Heiner Kallweit Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/dba77e76-be45-4a30-96c7-45e284072ad2@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index cc14cd540f748..6934bdee2a910 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -5332,13 +5332,6 @@ static bool rtl_aspm_is_safe(struct rtl8169_private *tp) return false; } -static umode_t r8169_hwmon_is_visible(const void *drvdata, - enum hwmon_sensor_types type, - u32 attr, int channel) -{ - return 0444; -} - static int r8169_hwmon_read(struct device *dev, enum hwmon_sensor_types type, u32 attr, int channel, long *val) { @@ -5355,7 +5348,7 @@ static int r8169_hwmon_read(struct device *dev, enum hwmon_sensor_types type, } static const struct hwmon_ops r8169_hwmon_ops = { - .is_visible = r8169_hwmon_is_visible, + .visible = 0444, .read = r8169_hwmon_read, }; From 05b6555feeddc077f1eaa14c3e2c409b7ddf917b Mon Sep 17 00:00:00 2001 From: Po-Hao Huang Date: Thu, 28 Nov 2024 13:54:28 +0800 Subject: [PATCH 0147/1450] wifi: rtw89: 8922a: Extend channel info field length for scan Extend the bitfield for duration in channel info to 16 bits. Update the related format in H2C and C2H, then increase firmware format sequence to 3. Signed-off-by: Po-Hao Huang Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241128055433.11851-2-pkshih@realtek.com --- drivers/net/wireless/realtek/rtw89/core.h | 1 + drivers/net/wireless/realtek/rtw89/fw.c | 43 +++++++++++++------ drivers/net/wireless/realtek/rtw89/fw.h | 22 +++++++++- drivers/net/wireless/realtek/rtw89/mac.c | 24 ++++++++--- drivers/net/wireless/realtek/rtw89/rtw8922a.c | 2 +- 5 files changed, 71 insertions(+), 21 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw89/core.h b/drivers/net/wireless/realtek/rtw89/core.h index 409cbdc6b92ae..8c7e8b1c954f4 100644 --- a/drivers/net/wireless/realtek/rtw89/core.h +++ b/drivers/net/wireless/realtek/rtw89/core.h @@ -4456,6 +4456,7 @@ enum rtw89_fw_feature { RTW89_FW_FEATURE_RFK_RXDCK_V0, RTW89_FW_FEATURE_NO_WOW_CPU_IO_RX, RTW89_FW_FEATURE_NOTIFY_AP_INFO, + RTW89_FW_FEATURE_CH_INFO_BE_V0, }; struct rtw89_fw_suit { diff --git a/drivers/net/wireless/realtek/rtw89/fw.c b/drivers/net/wireless/realtek/rtw89/fw.c index cbd759c844e55..a0408fcd6799e 100644 --- a/drivers/net/wireless/realtek/rtw89/fw.c +++ b/drivers/net/wireless/realtek/rtw89/fw.c @@ -729,6 +729,7 @@ static const struct __fw_feat_cfg fw_feat_tbl[] = { __CFG_FW_FEAT(RTL8922A, lt, 0, 35, 31, 0, RFK_PRE_NOTIFY_V0), __CFG_FW_FEAT(RTL8922A, lt, 0, 35, 42, 0, RFK_RXDCK_V0), __CFG_FW_FEAT(RTL8922A, ge, 0, 35, 46, 0, NOTIFY_AP_INFO), + __CFG_FW_FEAT(RTL8922A, lt, 0, 35, 47, 0, CH_INFO_BE_V0), }; static void rtw89_fw_iterate_feature_cfg(struct rtw89_fw_info *fw, @@ -4956,13 +4957,14 @@ int rtw89_fw_h2c_scan_list_offload_be(struct rtw89_dev *rtwdev, int ch_num, struct rtw89_wait_info *wait = &rtwdev->mac.fw_ofld_wait; struct rtw89_h2c_chinfo_elem_be *elem; struct rtw89_mac_chinfo_be *ch_info; - struct rtw89_h2c_chinfo *h2c; + struct rtw89_h2c_chinfo_be *h2c; struct sk_buff *skb; unsigned int cond; + u8 ver = U8_MAX; int skb_len; int ret; - static_assert(sizeof(*elem) == RTW89_MAC_CHINFO_SIZE); + static_assert(sizeof(*elem) == RTW89_MAC_CHINFO_SIZE_BE); skb_len = struct_size(h2c, elem, ch_num); skb = rtw89_fw_h2c_alloc_skb_with_hdr(rtwdev, skb_len); @@ -4971,8 +4973,11 @@ int rtw89_fw_h2c_scan_list_offload_be(struct rtw89_dev *rtwdev, int ch_num, return -ENOMEM; } + if (RTW89_CHK_FW_FEATURE(CH_INFO_BE_V0, &rtwdev->fw)) + ver = 0; + skb_put(skb, sizeof(*h2c)); - h2c = (struct rtw89_h2c_chinfo *)skb->data; + h2c = (struct rtw89_h2c_chinfo_be *)skb->data; h2c->ch_num = ch_num; h2c->elem_size = sizeof(*elem) / 4; /* in unit of 4 bytes */ @@ -4982,8 +4987,7 @@ int rtw89_fw_h2c_scan_list_offload_be(struct rtw89_dev *rtwdev, int ch_num, list_for_each_entry(ch_info, chan_list, list) { elem = (struct rtw89_h2c_chinfo_elem_be *)skb_put(skb, sizeof(*elem)); - elem->w0 = le32_encode_bits(ch_info->period, RTW89_H2C_CHINFO_BE_W0_PERIOD) | - le32_encode_bits(ch_info->dwell_time, RTW89_H2C_CHINFO_BE_W0_DWELL) | + elem->w0 = le32_encode_bits(ch_info->dwell_time, RTW89_H2C_CHINFO_BE_W0_DWELL) | le32_encode_bits(ch_info->central_ch, RTW89_H2C_CHINFO_BE_W0_CENTER_CH) | le32_encode_bits(ch_info->pri_ch, RTW89_H2C_CHINFO_BE_W0_PRI_CH); @@ -5030,6 +5034,12 @@ int rtw89_fw_h2c_scan_list_offload_be(struct rtw89_dev *rtwdev, int ch_num, RTW89_H2C_CHINFO_BE_W6_FW_PROBE0_SHORTSSIDS) | le32_encode_bits(ch_info->fw_probe0_bssids, RTW89_H2C_CHINFO_BE_W6_FW_PROBE0_BSSIDS); + if (ver == 0) + elem->w0 |= + le32_encode_bits(ch_info->period, RTW89_H2C_CHINFO_BE_W0_PERIOD); + else + elem->w7 = le32_encode_bits(ch_info->period, + RTW89_H2C_CHINFO_BE_W7_PERIOD_V1); } rtw89_h2c_pkt_set_hdr(rtwdev, skb, FWCMD_TYPE_H2C, @@ -5173,6 +5183,7 @@ int rtw89_fw_h2c_scan_offload_be(struct rtw89_dev *rtwdev, u8 probe_id[NUM_NL80211_BANDS]; u8 cfg_len = sizeof(*h2c); unsigned int cond; + u8 ver = U8_MAX; void *ptr; int ret; u32 len; @@ -5193,6 +5204,9 @@ int rtw89_fw_h2c_scan_offload_be(struct rtw89_dev *rtwdev, memset(probe_id, RTW89_SCANOFLD_PKT_NONE, sizeof(probe_id)); + if (RTW89_CHK_FW_FEATURE(CH_INFO_BE_V0, &rtwdev->fw)) + ver = 0; + if (!wowlan) { list_for_each_entry(pkt_info, &scan_info->pkt_list[NL80211_BAND_6GHZ], list) { if (pkt_info->wildcard_6ghz) { @@ -5288,9 +5302,7 @@ int rtw89_fw_h2c_scan_offload_be(struct rtw89_dev *rtwdev, le32_encode_bits(RTW89_OFF_CHAN_TIME / 10, RTW89_H2C_SCANOFLD_BE_OPCH_W0_POLICY_VAL); - opch->w1 = le32_encode_bits(RTW89_CHANNEL_TIME, - RTW89_H2C_SCANOFLD_BE_OPCH_W1_DURATION) | - le32_encode_bits(op->band_type, + opch->w1 = le32_encode_bits(op->band_type, RTW89_H2C_SCANOFLD_BE_OPCH_W1_CH_BAND) | le32_encode_bits(op->band_width, RTW89_H2C_SCANOFLD_BE_OPCH_W1_BW) | @@ -5316,6 +5328,13 @@ int rtw89_fw_h2c_scan_offload_be(struct rtw89_dev *rtwdev, RTW89_H2C_SCANOFLD_BE_OPCH_W3_PKT2) | le32_encode_bits(RTW89_SCANOFLD_PKT_NONE, RTW89_H2C_SCANOFLD_BE_OPCH_W3_PKT3); + + if (ver == 0) + opch->w1 |= le32_encode_bits(RTW89_CHANNEL_TIME, + RTW89_H2C_SCANOFLD_BE_OPCH_W1_DURATION); + else + opch->w4 = le32_encode_bits(RTW89_CHANNEL_TIME, + RTW89_H2C_SCANOFLD_BE_OPCH_W4_DURATION_V1); ptr += sizeof(*opch); } @@ -6498,7 +6517,7 @@ int rtw89_pno_scan_add_chan_list_ax(struct rtw89_dev *rtwdev, INIT_LIST_HEAD(&chan_list); for (idx = 0, list_len = 0; - idx < nd_config->n_channels && list_len < RTW89_SCAN_LIST_LIMIT; + idx < nd_config->n_channels && list_len < RTW89_SCAN_LIST_LIMIT_AX; idx++, list_len++) { channel = nd_config->channels[idx]; ch_info = kzalloc(sizeof(*ch_info), GFP_KERNEL); @@ -6549,7 +6568,7 @@ int rtw89_hw_scan_add_chan_list_ax(struct rtw89_dev *rtwdev, INIT_LIST_HEAD(&chan_list); for (idx = rtwdev->scan_info.last_chan_idx, list_len = 0; - idx < req->n_channels && list_len < RTW89_SCAN_LIST_LIMIT; + idx < req->n_channels && list_len < RTW89_SCAN_LIST_LIMIT_AX; idx++, list_len++) { channel = req->channels[idx]; ch_info = kzalloc(sizeof(*ch_info), GFP_KERNEL); @@ -6626,7 +6645,7 @@ int rtw89_pno_scan_add_chan_list_be(struct rtw89_dev *rtwdev, INIT_LIST_HEAD(&chan_list); for (idx = 0, list_len = 0; - idx < nd_config->n_channels && list_len < RTW89_SCAN_LIST_LIMIT; + idx < nd_config->n_channels && list_len < RTW89_SCAN_LIST_LIMIT_BE; idx++, list_len++) { channel = nd_config->channels[idx]; ch_info = kzalloc(sizeof(*ch_info), GFP_KERNEL); @@ -6681,7 +6700,7 @@ int rtw89_hw_scan_add_chan_list_be(struct rtw89_dev *rtwdev, INIT_LIST_HEAD(&chan_list); for (idx = rtwdev->scan_info.last_chan_idx, list_len = 0; - idx < req->n_channels && list_len < RTW89_SCAN_LIST_LIMIT; + idx < req->n_channels && list_len < RTW89_SCAN_LIST_LIMIT_BE; idx++, list_len++) { channel = req->channels[idx]; ch_info = kzalloc(sizeof(*ch_info), GFP_KERNEL); diff --git a/drivers/net/wireless/realtek/rtw89/fw.h b/drivers/net/wireless/realtek/rtw89/fw.h index 9106bcce18518..95681c390bb84 100644 --- a/drivers/net/wireless/realtek/rtw89/fw.h +++ b/drivers/net/wireless/realtek/rtw89/fw.h @@ -310,9 +310,12 @@ struct rtw89_fw_macid_pause_sleep_grp { #define RTW89_SCANOFLD_DEBUG_MASK 0x1F #define RTW89_CHAN_INVALID 0xFF #define RTW89_MAC_CHINFO_SIZE 28 +#define RTW89_MAC_CHINFO_SIZE_BE 32 #define RTW89_SCAN_LIST_GUARD 4 -#define RTW89_SCAN_LIST_LIMIT \ - ((RTW89_H2C_MAX_SIZE / RTW89_MAC_CHINFO_SIZE) - RTW89_SCAN_LIST_GUARD) +#define RTW89_SCAN_LIST_LIMIT(size) \ + ((RTW89_H2C_MAX_SIZE / (size)) - RTW89_SCAN_LIST_GUARD) +#define RTW89_SCAN_LIST_LIMIT_AX RTW89_SCAN_LIST_LIMIT(RTW89_MAC_CHINFO_SIZE) +#define RTW89_SCAN_LIST_LIMIT_BE RTW89_SCAN_LIST_LIMIT(RTW89_MAC_CHINFO_SIZE_BE) #define RTW89_BCN_LOSS_CNT 10 @@ -2647,6 +2650,7 @@ struct rtw89_h2c_chinfo_elem_be { __le32 w4; __le32 w5; __le32 w6; + __le32 w7; } __packed; #define RTW89_H2C_CHINFO_BE_W0_PERIOD GENMASK(7, 0) @@ -2678,6 +2682,7 @@ struct rtw89_h2c_chinfo_elem_be { #define RTW89_H2C_CHINFO_BE_W5_FW_PROBE0_SSIDS GENMASK(31, 16) #define RTW89_H2C_CHINFO_BE_W6_FW_PROBE0_SHORTSSIDS GENMASK(15, 0) #define RTW89_H2C_CHINFO_BE_W6_FW_PROBE0_BSSIDS GENMASK(31, 16) +#define RTW89_H2C_CHINFO_BE_W7_PERIOD_V1 GENMASK(15, 0) struct rtw89_h2c_chinfo { u8 ch_num; @@ -2687,6 +2692,14 @@ struct rtw89_h2c_chinfo { struct rtw89_h2c_chinfo_elem elem[] __counted_by(ch_num); } __packed; +struct rtw89_h2c_chinfo_be { + u8 ch_num; + u8 elem_size; + u8 arg; + u8 rsvd0; + struct rtw89_h2c_chinfo_elem_be elem[] __counted_by(ch_num); +} __packed; + #define RTW89_H2C_CHINFO_ARG_MAC_IDX_MASK BIT(0) #define RTW89_H2C_CHINFO_ARG_APPEND_MASK BIT(1) @@ -2733,6 +2746,7 @@ struct rtw89_h2c_scanofld_be_opch { __le32 w1; __le32 w2; __le32 w3; + __le32 w4; } __packed; #define RTW89_H2C_SCANOFLD_BE_OPCH_W0_MACID GENMASK(15, 0) @@ -2754,6 +2768,7 @@ struct rtw89_h2c_scanofld_be_opch { #define RTW89_H2C_SCANOFLD_BE_OPCH_W3_PKT1 GENMASK(15, 8) #define RTW89_H2C_SCANOFLD_BE_OPCH_W3_PKT2 GENMASK(23, 16) #define RTW89_H2C_SCANOFLD_BE_OPCH_W3_PKT3 GENMASK(31, 24) +#define RTW89_H2C_SCANOFLD_BE_OPCH_W4_DURATION_V1 GENMASK(15, 0) struct rtw89_h2c_scanofld_be { __le32 w0; @@ -3596,6 +3611,7 @@ struct rtw89_c2h_scanofld { __le32 w5; __le32 w6; __le32 w7; + __le32 w8; } __packed; #define RTW89_C2H_SCANOFLD_W2_PRI_CH GENMASK(7, 0) @@ -3610,6 +3626,8 @@ struct rtw89_c2h_scanofld { #define RTW89_C2H_SCANOFLD_W6_EXPECT_PERIOD GENMASK(15, 8) #define RTW89_C2H_SCANOFLD_W6_FW_DEF GENMASK(23, 16) #define RTW89_C2H_SCANOFLD_W7_REPORT_TSF GENMASK(31, 0) +#define RTW89_C2H_SCANOFLD_W8_PERIOD_V1 GENMASK(15, 0) +#define RTW89_C2H_SCANOFLD_W8_EXPECT_PERIOD_V1 GENMASK(31, 16) #define RTW89_GET_MAC_C2H_MCC_RCV_ACK_GROUP(c2h) \ le32_get_bits(*((const __le32 *)(c2h) + 2), GENMASK(1, 0)) diff --git a/drivers/net/wireless/realtek/rtw89/mac.c b/drivers/net/wireless/realtek/rtw89/mac.c index 03fc214402710..95f2beb89fe6a 100644 --- a/drivers/net/wireless/realtek/rtw89/mac.c +++ b/drivers/net/wireless/realtek/rtw89/mac.c @@ -4788,9 +4788,11 @@ rtw89_mac_c2h_scanofld_rsp(struct rtw89_dev *rtwdev, struct sk_buff *skb, struct rtw89_vif_link *rtwvif_link = rtwdev->scan_info.scanning_vif; struct rtw89_vif *rtwvif; struct rtw89_chan new; - u8 reason, status, tx_fail, band, actual_period, expect_period; u32 last_chan = rtwdev->scan_info.last_chan_idx, report_tsf; + u16 actual_period, expect_period; + u8 reason, status, tx_fail, band; u8 mac_idx, sw_def, fw_def; + u8 ver = U8_MAX; u16 chan; int ret; @@ -4799,6 +4801,9 @@ rtw89_mac_c2h_scanofld_rsp(struct rtw89_dev *rtwdev, struct sk_buff *skb, rtwvif = rtwvif_link->rtwvif; + if (RTW89_CHK_FW_FEATURE(CH_INFO_BE_V0, &rtwdev->fw)) + ver = 0; + tx_fail = le32_get_bits(c2h->w5, RTW89_C2H_SCANOFLD_W5_TX_FAIL); status = le32_get_bits(c2h->w2, RTW89_C2H_SCANOFLD_W2_STATUS); chan = le32_get_bits(c2h->w2, RTW89_C2H_SCANOFLD_W2_PRI_CH); @@ -4811,21 +4816,28 @@ rtw89_mac_c2h_scanofld_rsp(struct rtw89_dev *rtwdev, struct sk_buff *skb, if (!(rtwdev->chip->support_bands & BIT(NL80211_BAND_6GHZ))) band = chan > 14 ? RTW89_BAND_5G : RTW89_BAND_2G; - rtw89_debug(rtwdev, RTW89_DBG_HW_SCAN, - "mac_idx[%d] band: %d, chan: %d, reason: %d, status: %d, tx_fail: %d, actual: %d\n", - mac_idx, band, chan, reason, status, tx_fail, actual_period); - if (rtwdev->chip->chip_gen == RTW89_CHIP_BE) { sw_def = le32_get_bits(c2h->w6, RTW89_C2H_SCANOFLD_W6_SW_DEF); - expect_period = le32_get_bits(c2h->w6, RTW89_C2H_SCANOFLD_W6_EXPECT_PERIOD); fw_def = le32_get_bits(c2h->w6, RTW89_C2H_SCANOFLD_W6_FW_DEF); report_tsf = le32_get_bits(c2h->w7, RTW89_C2H_SCANOFLD_W7_REPORT_TSF); + if (ver == 0) { + expect_period = + le32_get_bits(c2h->w6, RTW89_C2H_SCANOFLD_W6_EXPECT_PERIOD); + } else { + actual_period = le32_get_bits(c2h->w8, RTW89_C2H_SCANOFLD_W8_PERIOD_V1); + expect_period = + le32_get_bits(c2h->w8, RTW89_C2H_SCANOFLD_W8_EXPECT_PERIOD_V1); + } rtw89_debug(rtwdev, RTW89_DBG_HW_SCAN, "sw_def: %d, fw_def: %d, tsf: %x, expect: %d\n", sw_def, fw_def, report_tsf, expect_period); } + rtw89_debug(rtwdev, RTW89_DBG_HW_SCAN, + "mac_idx[%d] band: %d, chan: %d, reason: %d, status: %d, tx_fail: %d, actual: %d\n", + mac_idx, band, chan, reason, status, tx_fail, actual_period); + switch (reason) { case RTW89_SCAN_LEAVE_OP_NOTIFY: case RTW89_SCAN_LEAVE_CH_NOTIFY: diff --git a/drivers/net/wireless/realtek/rtw89/rtw8922a.c b/drivers/net/wireless/realtek/rtw89/rtw8922a.c index a5333099668a1..a96b58ce65926 100644 --- a/drivers/net/wireless/realtek/rtw89/rtw8922a.c +++ b/drivers/net/wireless/realtek/rtw89/rtw8922a.c @@ -14,7 +14,7 @@ #include "rtw8922a_rfk.h" #include "util.h" -#define RTW8922A_FW_FORMAT_MAX 2 +#define RTW8922A_FW_FORMAT_MAX 3 #define RTW8922A_FW_BASENAME "rtw89/rtw8922a_fw" #define RTW8922A_MODULE_FIRMWARE \ RTW8922A_FW_BASENAME "-" __stringify(RTW8922A_FW_FORMAT_MAX) ".bin" From d56c261e5214e51e2c6d22149f63555039b5601e Mon Sep 17 00:00:00 2001 From: Po-Hao Huang Date: Thu, 28 Nov 2024 13:54:29 +0800 Subject: [PATCH 0148/1450] wifi: rtw89: 8852b: add beacon filter and CQM support Declare beacon filter and connection monitor for 8852B. This offloads connection monitor mechanism to firmware, and this is required for the MCC feature. Signed-off-by: Po-Hao Huang Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241128055433.11851-3-pkshih@realtek.com --- drivers/net/wireless/realtek/rtw89/fw.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/realtek/rtw89/fw.c b/drivers/net/wireless/realtek/rtw89/fw.c index a0408fcd6799e..3fba37b8013aa 100644 --- a/drivers/net/wireless/realtek/rtw89/fw.c +++ b/drivers/net/wireless/realtek/rtw89/fw.c @@ -709,6 +709,7 @@ static const struct __fw_feat_cfg fw_feat_tbl[] = { __CFG_FW_FEAT(RTL8852B, ge, 0, 29, 26, 0, TX_WAKE), __CFG_FW_FEAT(RTL8852B, ge, 0, 29, 29, 0, CRASH_TRIGGER), __CFG_FW_FEAT(RTL8852B, ge, 0, 29, 29, 0, SCAN_OFFLOAD), + __CFG_FW_FEAT(RTL8852B, ge, 0, 29, 29, 7, BEACON_FILTER), __CFG_FW_FEAT(RTL8852B, lt, 0, 29, 30, 0, NO_WOW_CPU_IO_RX), __CFG_FW_FEAT(RTL8852BT, ge, 0, 29, 74, 0, NO_LPS_PG), __CFG_FW_FEAT(RTL8852BT, ge, 0, 29, 74, 0, TX_WAKE), From 3374c63111b0811134b773e9d4c028bd643bb9c9 Mon Sep 17 00:00:00 2001 From: Po-Hao Huang Date: Thu, 28 Nov 2024 13:54:30 +0800 Subject: [PATCH 0149/1450] wifi: rtw89: 8852bt: add beacon filter and CQM support Declare beacon filter and connection monitor for 8852BT. This offloads connection monitor mechanism to firmware, and this is required for the MCC feature. Signed-off-by: Po-Hao Huang Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241128055433.11851-4-pkshih@realtek.com --- drivers/net/wireless/realtek/rtw89/fw.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/realtek/rtw89/fw.c b/drivers/net/wireless/realtek/rtw89/fw.c index 3fba37b8013aa..c604ea1d39f17 100644 --- a/drivers/net/wireless/realtek/rtw89/fw.c +++ b/drivers/net/wireless/realtek/rtw89/fw.c @@ -715,6 +715,7 @@ static const struct __fw_feat_cfg fw_feat_tbl[] = { __CFG_FW_FEAT(RTL8852BT, ge, 0, 29, 74, 0, TX_WAKE), __CFG_FW_FEAT(RTL8852BT, ge, 0, 29, 90, 0, CRASH_TRIGGER), __CFG_FW_FEAT(RTL8852BT, ge, 0, 29, 91, 0, SCAN_OFFLOAD), + __CFG_FW_FEAT(RTL8852BT, ge, 0, 29, 110, 0, BEACON_FILTER), __CFG_FW_FEAT(RTL8852C, le, 0, 27, 33, 0, NO_DEEP_PS), __CFG_FW_FEAT(RTL8852C, ge, 0, 27, 34, 0, TX_WAKE), __CFG_FW_FEAT(RTL8852C, ge, 0, 27, 36, 0, SCAN_OFFLOAD), From b6853ed2be27ac7f9511867faff3d6dd1139b915 Mon Sep 17 00:00:00 2001 From: Chih-Kang Chang Date: Thu, 28 Nov 2024 13:54:31 +0800 Subject: [PATCH 0150/1450] wifi: rtw89: 8922a: use RSSI from PHY report in RX descriptor The PPDU status of probe response will fail to parse the IE due to being filtered by the to_self check. Therefore, we parse RSSI from PHY report in RX descriptor. Signed-off-by: Chih-Kang Chang Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241128055433.11851-5-pkshih@realtek.com --- drivers/net/wireless/realtek/rtw89/core.c | 17 +++++++++ drivers/net/wireless/realtek/rtw89/core.h | 19 ++++++++++ drivers/net/wireless/realtek/rtw89/mac.c | 1 + drivers/net/wireless/realtek/rtw89/mac.h | 36 +++++++++++++++++++ drivers/net/wireless/realtek/rtw89/mac_be.c | 15 ++++++++ drivers/net/wireless/realtek/rtw89/reg.h | 4 +++ drivers/net/wireless/realtek/rtw89/rtw8851b.c | 4 ++- drivers/net/wireless/realtek/rtw89/rtw8852a.c | 5 ++- drivers/net/wireless/realtek/rtw89/rtw8852b.c | 1 + .../wireless/realtek/rtw89/rtw8852b_common.c | 4 ++- .../net/wireless/realtek/rtw89/rtw8852bt.c | 1 + drivers/net/wireless/realtek/rtw89/rtw8852c.c | 6 +++- drivers/net/wireless/realtek/rtw89/rtw8922a.c | 17 +++++++-- drivers/net/wireless/realtek/rtw89/txrx.h | 3 ++ 14 files changed, 127 insertions(+), 6 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw89/core.c b/drivers/net/wireless/realtek/rtw89/core.c index 6f9b4f0b2748f..29d0ac502bab2 100644 --- a/drivers/net/wireless/realtek/rtw89/core.c +++ b/drivers/net/wireless/realtek/rtw89/core.c @@ -2366,6 +2366,12 @@ static void rtw89_core_update_radiotap(struct rtw89_dev *rtwdev, } } +static void rtw89_core_validate_rx_signal(struct ieee80211_rx_status *rx_status) +{ + if (!rx_status->signal) + rx_status->flag |= RX_FLAG_NO_SIGNAL_VAL; +} + static void rtw89_core_rx_to_mac80211(struct rtw89_dev *rtwdev, struct rtw89_rx_phy_ppdu *phy_ppdu, struct rtw89_rx_desc_info *desc_info, @@ -2382,6 +2388,8 @@ static void rtw89_core_rx_to_mac80211(struct rtw89_dev *rtwdev, rtw89_core_rx_stats(rtwdev, phy_ppdu, desc_info, skb_ppdu); rtw89_core_update_rx_status_by_ppdu(rtwdev, rx_status, phy_ppdu); rtw89_core_update_radiotap(rtwdev, skb_ppdu, rx_status); + rtw89_core_validate_rx_signal(rx_status); + /* In low power mode, it does RX in thread context. */ local_bh_disable(); ieee80211_rx_napi(rtwdev->hw, NULL, skb_ppdu, napi); @@ -2517,6 +2525,7 @@ void rtw89_core_query_rxdesc_v2(struct rtw89_dev *rtwdev, struct rtw89_rx_desc_info *desc_info, u8 *data, u32 data_offset) { + struct rtw89_rxdesc_phy_rpt_v2 *rxd_rpt; struct rtw89_rxdesc_short_v2 *rxd_s; struct rtw89_rxdesc_long_v2 *rxd_l; u16 shift_len, drv_info_len, phy_rtp_len, hdr_cnv_len; @@ -2564,6 +2573,12 @@ void rtw89_core_query_rxdesc_v2(struct rtw89_dev *rtwdev, desc_info->rxd_len = sizeof(struct rtw89_rxdesc_short_v2); desc_info->ready = true; + if (phy_rtp_len == sizeof(*rxd_rpt)) { + rxd_rpt = (struct rtw89_rxdesc_phy_rpt_v2 *)(data + data_offset + + desc_info->rxd_len); + desc_info->rssi = le32_get_bits(rxd_rpt->dword0, BE_RXD_PHY_RSSI); + } + if (!desc_info->long_rxdesc) return; @@ -2706,6 +2721,7 @@ static void rtw89_core_update_rx_status(struct rtw89_dev *rtwdev, rx_status->flag |= RX_FLAG_MACTIME_START; rx_status->mactime = desc_info->free_run_cnt; + rtw89_chip_phy_rpt_to_rssi(rtwdev, desc_info, rx_status); rtw89_core_stats_sta_rx_status(rtwdev, desc_info, rx_status); } @@ -4522,6 +4538,7 @@ int rtw89_core_start(struct rtw89_dev *rtwdev) rtw89_phy_dm_init(rtwdev); rtw89_mac_cfg_ppdu_status_bands(rtwdev, true); + rtw89_mac_cfg_phy_rpt_bands(rtwdev, true); rtw89_mac_update_rts_threshold(rtwdev); rtw89_tas_reset(rtwdev); diff --git a/drivers/net/wireless/realtek/rtw89/core.h b/drivers/net/wireless/realtek/rtw89/core.h index 8c7e8b1c954f4..82844e470d1b8 100644 --- a/drivers/net/wireless/realtek/rtw89/core.h +++ b/drivers/net/wireless/realtek/rtw89/core.h @@ -1084,6 +1084,7 @@ struct rtw89_rx_desc_info { u16 offset; u16 rxd_len; bool ready; + u16 rssi; }; struct rtw89_rxdesc_short { @@ -1126,6 +1127,11 @@ struct rtw89_rxdesc_long_v2 { __le32 dword9; } __packed; +struct rtw89_rxdesc_phy_rpt_v2 { + __le32 dword0; + __le32 dword1; +} __packed; + struct rtw89_tx_desc_info { u16 pkt_size; u8 wp_offset; @@ -3624,6 +3630,9 @@ struct rtw89_chip_ops { struct ieee80211_rx_status *status); void (*convert_rpl_to_rssi)(struct rtw89_dev *rtwdev, struct rtw89_rx_phy_ppdu *phy_ppdu); + void (*phy_rpt_to_rssi)(struct rtw89_dev *rtwdev, + struct rtw89_rx_desc_info *desc_info, + struct ieee80211_rx_status *rx_status); void (*ctrl_nbtg_bt_tx)(struct rtw89_dev *rtwdev, bool en, enum rtw89_phy_idx phy_idx); void (*cfg_txrx_path)(struct rtw89_dev *rtwdev); @@ -6706,6 +6715,16 @@ static inline void rtw89_chip_convert_rpl_to_rssi(struct rtw89_dev *rtwdev, chip->ops->convert_rpl_to_rssi(rtwdev, phy_ppdu); } +static inline void rtw89_chip_phy_rpt_to_rssi(struct rtw89_dev *rtwdev, + struct rtw89_rx_desc_info *desc_info, + struct ieee80211_rx_status *rx_status) +{ + const struct rtw89_chip_info *chip = rtwdev->chip; + + if (chip->ops->phy_rpt_to_rssi) + chip->ops->phy_rpt_to_rssi(rtwdev, desc_info, rx_status); +} + static inline void rtw89_ctrl_nbtg_bt_tx(struct rtw89_dev *rtwdev, bool en, enum rtw89_phy_idx phy_idx) { diff --git a/drivers/net/wireless/realtek/rtw89/mac.c b/drivers/net/wireless/realtek/rtw89/mac.c index 95f2beb89fe6a..bb4f58118e05a 100644 --- a/drivers/net/wireless/realtek/rtw89/mac.c +++ b/drivers/net/wireless/realtek/rtw89/mac.c @@ -6736,6 +6736,7 @@ const struct rtw89_mac_gen_def rtw89_mac_gen_ax = { .typ_fltr_opt = rtw89_mac_typ_fltr_opt_ax, .cfg_ppdu_status = rtw89_mac_cfg_ppdu_status_ax, + .cfg_phy_rpt = NULL, .dle_mix_cfg = dle_mix_cfg_ax, .chk_dle_rdy = chk_dle_rdy_ax, diff --git a/drivers/net/wireless/realtek/rtw89/mac.h b/drivers/net/wireless/realtek/rtw89/mac.h index 81507274a97e2..8edea96d037f6 100644 --- a/drivers/net/wireless/realtek/rtw89/mac.h +++ b/drivers/net/wireless/realtek/rtw89/mac.h @@ -169,6 +169,20 @@ enum rtw89_mac_ax_l0_to_l1_event { MAC_AX_L0_TO_L1_EVENT_MAX = 15, }; +enum rtw89_mac_phy_rpt_size { + MAC_AX_PHY_RPT_SIZE_0 = 0, + MAC_AX_PHY_RPT_SIZE_8 = 1, + MAC_AX_PHY_RPT_SIZE_16 = 2, + MAC_AX_PHY_RPT_SIZE_24 = 3, +}; + +enum rtw89_mac_hdr_cnv_size { + MAC_AX_HDR_CNV_SIZE_0 = 0, + MAC_AX_HDR_CNV_SIZE_32 = 1, + MAC_AX_HDR_CNV_SIZE_64 = 2, + MAC_AX_HDR_CNV_SIZE_96 = 3, +}; + enum rtw89_mac_wow_fw_status { WOWLAN_NOT_READY = 0x00, WOWLAN_SLEEP_READY = 0x01, @@ -968,6 +982,7 @@ struct rtw89_mac_gen_def { enum rtw89_mac_fwd_target fwd_target, u8 mac_idx); int (*cfg_ppdu_status)(struct rtw89_dev *rtwdev, u8 mac_idx, bool enable); + void (*cfg_phy_rpt)(struct rtw89_dev *rtwdev, u8 mac_idx, bool enable); int (*dle_mix_cfg)(struct rtw89_dev *rtwdev, const struct rtw89_dle_mem *cfg); int (*chk_dle_rdy)(struct rtw89_dev *rtwdev, bool wde_or_ple); @@ -1223,6 +1238,27 @@ int rtw89_mac_stop_sch_tx_v2(struct rtw89_dev *rtwdev, u8 mac_idx, int rtw89_mac_resume_sch_tx(struct rtw89_dev *rtwdev, u8 mac_idx, u32 tx_en); int rtw89_mac_resume_sch_tx_v1(struct rtw89_dev *rtwdev, u8 mac_idx, u32 tx_en); int rtw89_mac_resume_sch_tx_v2(struct rtw89_dev *rtwdev, u8 mac_idx, u32 tx_en); +void rtw89_mac_cfg_phy_rpt_be(struct rtw89_dev *rtwdev, u8 mac_idx, bool enable); + +static inline +void rtw89_mac_cfg_phy_rpt(struct rtw89_dev *rtwdev, u8 mac_idx, bool enable) +{ + const struct rtw89_mac_gen_def *mac = rtwdev->chip->mac_def; + + if (mac->cfg_phy_rpt) + mac->cfg_phy_rpt(rtwdev, mac_idx, enable); +} + +static inline +void rtw89_mac_cfg_phy_rpt_bands(struct rtw89_dev *rtwdev, bool enable) +{ + rtw89_mac_cfg_phy_rpt(rtwdev, RTW89_MAC_0, enable); + + if (!rtwdev->dbcc_en) + return; + + rtw89_mac_cfg_phy_rpt(rtwdev, RTW89_MAC_1, enable); +} static inline int rtw89_mac_cfg_ppdu_status(struct rtw89_dev *rtwdev, u8 mac_idx, bool enable) diff --git a/drivers/net/wireless/realtek/rtw89/mac_be.c b/drivers/net/wireless/realtek/rtw89/mac_be.c index f7a396c8a3cd5..2dbdeae904ade 100644 --- a/drivers/net/wireless/realtek/rtw89/mac_be.c +++ b/drivers/net/wireless/realtek/rtw89/mac_be.c @@ -1988,6 +1988,20 @@ int rtw89_mac_resume_sch_tx_v2(struct rtw89_dev *rtwdev, u8 mac_idx, u32 tx_en) } EXPORT_SYMBOL(rtw89_mac_resume_sch_tx_v2); +void rtw89_mac_cfg_phy_rpt_be(struct rtw89_dev *rtwdev, u8 mac_idx, bool enable) +{ + u32 reg, val; + + reg = rtw89_mac_reg_by_idx(rtwdev, R_BE_RCR, mac_idx); + val = enable ? MAC_AX_PHY_RPT_SIZE_8 : MAC_AX_PHY_RPT_SIZE_0; + rtw89_write32_mask(rtwdev, reg, B_BE_PHY_RPT_SZ_MASK, val); + rtw89_write32_mask(rtwdev, reg, B_BE_HDR_CNV_SZ_MASK, MAC_AX_HDR_CNV_SIZE_0); + + reg = rtw89_mac_reg_by_idx(rtwdev, R_BE_DRV_INFO_OPTION, mac_idx); + rtw89_write32_mask(rtwdev, reg, B_BE_DRV_INFO_PHYRPT_EN, enable); +} +EXPORT_SYMBOL(rtw89_mac_cfg_phy_rpt_be); + static int rtw89_mac_cfg_ppdu_status_be(struct rtw89_dev *rtwdev, u8 mac_idx, bool enable) { @@ -2583,6 +2597,7 @@ const struct rtw89_mac_gen_def rtw89_mac_gen_be = { .typ_fltr_opt = rtw89_mac_typ_fltr_opt_be, .cfg_ppdu_status = rtw89_mac_cfg_ppdu_status_be, + .cfg_phy_rpt = rtw89_mac_cfg_phy_rpt_be, .dle_mix_cfg = dle_mix_cfg_be, .chk_dle_rdy = chk_dle_rdy_be, diff --git a/drivers/net/wireless/realtek/rtw89/reg.h b/drivers/net/wireless/realtek/rtw89/reg.h index 18ec7c0252fb8..10d0efa7a58ef 100644 --- a/drivers/net/wireless/realtek/rtw89/reg.h +++ b/drivers/net/wireless/realtek/rtw89/reg.h @@ -7447,6 +7447,10 @@ #define B_BE_CSIPRT_HESU_AID_EN BIT(25) #define B_BE_CSIPRT_VHTSU_AID_EN BIT(24) +#define R_BE_DRV_INFO_OPTION 0x11470 +#define R_BE_DRV_INFO_OPTION_C1 0x15470 +#define B_BE_DRV_INFO_PHYRPT_EN BIT(0) + #define R_BE_RX_ERR_ISR 0x114F4 #define R_BE_RX_ERR_ISR_C1 0x154F4 #define B_BE_RX_ERR_TRIG_ACT_TO BIT(9) diff --git a/drivers/net/wireless/realtek/rtw89/rtw8851b.c b/drivers/net/wireless/realtek/rtw89/rtw8851b.c index 1ed4e64cbd2c8..c56f70267882a 100644 --- a/drivers/net/wireless/realtek/rtw89/rtw8851b.c +++ b/drivers/net/wireless/realtek/rtw89/rtw8851b.c @@ -2298,7 +2298,8 @@ static void rtw8851b_query_ppdu(struct rtw89_dev *rtwdev, u8 path; u8 *rx_power = phy_ppdu->rssi; - status->signal = RTW89_RSSI_RAW_TO_DBM(rx_power[RF_PATH_A]); + if (!status->signal) + status->signal = RTW89_RSSI_RAW_TO_DBM(rx_power[RF_PATH_A]); for (path = 0; path < rtwdev->chip->rf_path_num; path++) { status->chains |= BIT(path); @@ -2391,6 +2392,7 @@ static const struct rtw89_chip_ops rtw8851b_chip_ops = { .ctrl_btg_bt_rx = rtw8851b_ctrl_btg_bt_rx, .query_ppdu = rtw8851b_query_ppdu, .convert_rpl_to_rssi = NULL, + .phy_rpt_to_rssi = NULL, .ctrl_nbtg_bt_tx = rtw8851b_ctrl_nbtg_bt_tx, .cfg_txrx_path = rtw8851b_bb_cfg_txrx_path, .set_txpwr_ul_tb_offset = rtw8851b_set_txpwr_ul_tb_offset, diff --git a/drivers/net/wireless/realtek/rtw89/rtw8852a.c b/drivers/net/wireless/realtek/rtw89/rtw8852a.c index a7105a288bc47..9bd2842c27d50 100644 --- a/drivers/net/wireless/realtek/rtw89/rtw8852a.c +++ b/drivers/net/wireless/realtek/rtw89/rtw8852a.c @@ -2068,7 +2068,9 @@ static void rtw8852a_query_ppdu(struct rtw89_dev *rtwdev, u8 path; u8 *rx_power = phy_ppdu->rssi; - status->signal = RTW89_RSSI_RAW_TO_DBM(max(rx_power[RF_PATH_A], rx_power[RF_PATH_B])); + if (!status->signal) + status->signal = RTW89_RSSI_RAW_TO_DBM(max(rx_power[RF_PATH_A], + rx_power[RF_PATH_B])); for (path = 0; path < rtwdev->chip->rf_path_num; path++) { status->chains |= BIT(path); status->chain_signal[path] = RTW89_RSSI_RAW_TO_DBM(rx_power[path]); @@ -2116,6 +2118,7 @@ static const struct rtw89_chip_ops rtw8852a_chip_ops = { .ctrl_btg_bt_rx = rtw8852a_ctrl_btg_bt_rx, .query_ppdu = rtw8852a_query_ppdu, .convert_rpl_to_rssi = NULL, + .phy_rpt_to_rssi = NULL, .ctrl_nbtg_bt_tx = rtw8852a_ctrl_nbtg_bt_tx, .cfg_txrx_path = NULL, .set_txpwr_ul_tb_offset = rtw8852a_set_txpwr_ul_tb_offset, diff --git a/drivers/net/wireless/realtek/rtw89/rtw8852b.c b/drivers/net/wireless/realtek/rtw89/rtw8852b.c index ebc853a905dd2..dfb2bf61b0b83 100644 --- a/drivers/net/wireless/realtek/rtw89/rtw8852b.c +++ b/drivers/net/wireless/realtek/rtw89/rtw8852b.c @@ -745,6 +745,7 @@ static const struct rtw89_chip_ops rtw8852b_chip_ops = { .ctrl_btg_bt_rx = rtw8852bx_ctrl_btg_bt_rx, .query_ppdu = rtw8852bx_query_ppdu, .convert_rpl_to_rssi = rtw8852bx_convert_rpl_to_rssi, + .phy_rpt_to_rssi = NULL, .ctrl_nbtg_bt_tx = rtw8852bx_ctrl_nbtg_bt_tx, .cfg_txrx_path = rtw8852bx_bb_cfg_txrx_path, .set_txpwr_ul_tb_offset = rtw8852bx_set_txpwr_ul_tb_offset, diff --git a/drivers/net/wireless/realtek/rtw89/rtw8852b_common.c b/drivers/net/wireless/realtek/rtw89/rtw8852b_common.c index 012739d97f71a..0e094ce9c9b08 100644 --- a/drivers/net/wireless/realtek/rtw89/rtw8852b_common.c +++ b/drivers/net/wireless/realtek/rtw89/rtw8852b_common.c @@ -1950,7 +1950,9 @@ static void __rtw8852bx_query_ppdu(struct rtw89_dev *rtwdev, u8 path; u8 *rx_power = phy_ppdu->rssi; - status->signal = RTW89_RSSI_RAW_TO_DBM(max(rx_power[RF_PATH_A], rx_power[RF_PATH_B])); + if (!status->signal) + status->signal = RTW89_RSSI_RAW_TO_DBM(max(rx_power[RF_PATH_A], + rx_power[RF_PATH_B])); for (path = 0; path < rtwdev->chip->rf_path_num; path++) { status->chains |= BIT(path); status->chain_signal[path] = RTW89_RSSI_RAW_TO_DBM(rx_power[path]); diff --git a/drivers/net/wireless/realtek/rtw89/rtw8852bt.c b/drivers/net/wireless/realtek/rtw89/rtw8852bt.c index cd1385ff80030..bde3e1fb7ca62 100644 --- a/drivers/net/wireless/realtek/rtw89/rtw8852bt.c +++ b/drivers/net/wireless/realtek/rtw89/rtw8852bt.c @@ -679,6 +679,7 @@ static const struct rtw89_chip_ops rtw8852bt_chip_ops = { .ctrl_btg_bt_rx = rtw8852bx_ctrl_btg_bt_rx, .query_ppdu = rtw8852bx_query_ppdu, .convert_rpl_to_rssi = rtw8852bx_convert_rpl_to_rssi, + .phy_rpt_to_rssi = NULL, .ctrl_nbtg_bt_tx = rtw8852bx_ctrl_nbtg_bt_tx, .cfg_txrx_path = rtw8852bx_bb_cfg_txrx_path, .set_txpwr_ul_tb_offset = rtw8852bx_set_txpwr_ul_tb_offset, diff --git a/drivers/net/wireless/realtek/rtw89/rtw8852c.c b/drivers/net/wireless/realtek/rtw89/rtw8852c.c index c7d39499ca75a..bc84b15e7826d 100644 --- a/drivers/net/wireless/realtek/rtw89/rtw8852c.c +++ b/drivers/net/wireless/realtek/rtw89/rtw8852c.c @@ -2807,7 +2807,10 @@ static void rtw8852c_query_ppdu(struct rtw89_dev *rtwdev, u8 path; u8 *rx_power = phy_ppdu->rssi; - status->signal = RTW89_RSSI_RAW_TO_DBM(max(rx_power[RF_PATH_A], rx_power[RF_PATH_B])); + if (!status->signal) + status->signal = RTW89_RSSI_RAW_TO_DBM(max(rx_power[RF_PATH_A], + rx_power[RF_PATH_B])); + for (path = 0; path < rtwdev->chip->rf_path_num; path++) { status->chains |= BIT(path); status->chain_signal[path] = RTW89_RSSI_RAW_TO_DBM(rx_power[path]); @@ -2907,6 +2910,7 @@ static const struct rtw89_chip_ops rtw8852c_chip_ops = { .ctrl_btg_bt_rx = rtw8852c_ctrl_btg_bt_rx, .query_ppdu = rtw8852c_query_ppdu, .convert_rpl_to_rssi = NULL, + .phy_rpt_to_rssi = NULL, .ctrl_nbtg_bt_tx = rtw8852c_ctrl_nbtg_bt_tx, .cfg_txrx_path = rtw8852c_bb_cfg_txrx_path, .set_txpwr_ul_tb_offset = rtw8852c_set_txpwr_ul_tb_offset, diff --git a/drivers/net/wireless/realtek/rtw89/rtw8922a.c b/drivers/net/wireless/realtek/rtw89/rtw8922a.c index a96b58ce65926..f04cb3b113722 100644 --- a/drivers/net/wireless/realtek/rtw89/rtw8922a.c +++ b/drivers/net/wireless/realtek/rtw89/rtw8922a.c @@ -2565,8 +2565,10 @@ static void rtw8922a_query_ppdu(struct rtw89_dev *rtwdev, u8 path; u8 *rx_power = phy_ppdu->rssi; - status->signal = - RTW89_RSSI_RAW_TO_DBM(max(rx_power[RF_PATH_A], rx_power[RF_PATH_B])); + if (!status->signal) + status->signal = RTW89_RSSI_RAW_TO_DBM(max(rx_power[RF_PATH_A], + rx_power[RF_PATH_B])); + for (path = 0; path < rtwdev->chip->rf_path_num; path++) { status->chains |= BIT(path); status->chain_signal[path] = RTW89_RSSI_RAW_TO_DBM(rx_power[path]); @@ -2607,6 +2609,16 @@ static void rtw8922a_convert_rpl_to_rssi(struct rtw89_dev *rtwdev, phy_ppdu->rssi_avg = phy_ppdu->rpl_avg; } +static void rtw8922a_phy_rpt_to_rssi(struct rtw89_dev *rtwdev, + struct rtw89_rx_desc_info *desc_info, + struct ieee80211_rx_status *rx_status) +{ + if (desc_info->rssi <= 0x1 || (desc_info->rssi >> 2) > MAX_RSSI) + return; + + rx_status->signal = (desc_info->rssi >> 2) - MAX_RSSI; +} + static int rtw8922a_mac_enable_bb_rf(struct rtw89_dev *rtwdev) { rtw89_write8_set(rtwdev, R_BE_FEN_RST_ENABLE, @@ -2665,6 +2677,7 @@ static const struct rtw89_chip_ops rtw8922a_chip_ops = { .ctrl_btg_bt_rx = rtw8922a_ctrl_btg_bt_rx, .query_ppdu = rtw8922a_query_ppdu, .convert_rpl_to_rssi = rtw8922a_convert_rpl_to_rssi, + .phy_rpt_to_rssi = rtw8922a_phy_rpt_to_rssi, .ctrl_nbtg_bt_tx = rtw8922a_ctrl_nbtg_bt_tx, .cfg_txrx_path = rtw8922a_bb_cfg_txrx_path, .set_txpwr_ul_tb_offset = NULL, diff --git a/drivers/net/wireless/realtek/rtw89/txrx.h b/drivers/net/wireless/realtek/rtw89/txrx.h index b2e47829983fe..70fe7cebc9d53 100644 --- a/drivers/net/wireless/realtek/rtw89/txrx.h +++ b/drivers/net/wireless/realtek/rtw89/txrx.h @@ -560,6 +560,9 @@ struct rtw89_phy_sts_iehdr { #define BE_RXD_HDR_OFFSET_MASK GENMASK(20, 16) #define BE_RXD_WL_HD_IV_LEN_MASK GENMASK(26, 21) +/* BE RXD - PHY RPT dword0 */ +#define BE_RXD_PHY_RSSI GENMASK(11, 0) + struct rtw89_phy_sts_ie00 { __le32 w0; __le32 w1; From 148cd1e184cf3f19c17f8596c8f3f1a3b447edd6 Mon Sep 17 00:00:00 2001 From: Zong-Zhe Yang Date: Thu, 28 Nov 2024 13:54:32 +0800 Subject: [PATCH 0151/1450] wifi: rtw89: refine link handling for link_sta_rc_update The original handling will iterate all active links under the given sta and apply the changes to each. Now, stack tweaks ops from sta_rc_update to link_sta_rc_update, which means targeting a given link. Then, our link iteration looks redundant. So, refine it to apply the changes to the link directly. Signed-off-by: Zong-Zhe Yang Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241128055433.11851-6-pkshih@realtek.com --- drivers/net/wireless/realtek/rtw89/mac80211.c | 9 +++++++-- drivers/net/wireless/realtek/rtw89/phy.c | 15 ++++++--------- drivers/net/wireless/realtek/rtw89/phy.h | 3 +++ 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw89/mac80211.c b/drivers/net/wireless/realtek/rtw89/mac80211.c index bf7a674bce284..5eac0b524060d 100644 --- a/drivers/net/wireless/realtek/rtw89/mac80211.c +++ b/drivers/net/wireless/realtek/rtw89/mac80211.c @@ -1307,10 +1307,15 @@ static void rtw89_ops_sta_rc_update(struct ieee80211_hw *hw, struct ieee80211_link_sta *link_sta, u32 changed) { - struct ieee80211_sta *sta = link_sta->sta; + struct rtw89_sta *rtwsta = sta_to_rtwsta(link_sta->sta); struct rtw89_dev *rtwdev = hw->priv; + struct rtw89_sta_link *rtwsta_link; + + rtwsta_link = rtwsta->links[link_sta->link_id]; + if (unlikely(!rtwsta_link)) + return; - rtw89_phy_ra_update_sta(rtwdev, sta, changed); + rtw89_phy_ra_update_sta_link(rtwdev, rtwsta_link, changed); } static int rtw89_ops_add_chanctx(struct ieee80211_hw *hw, diff --git a/drivers/net/wireless/realtek/rtw89/phy.c b/drivers/net/wireless/realtek/rtw89/phy.c index be2f5338c3a0b..e88ed9ec57c5b 100644 --- a/drivers/net/wireless/realtek/rtw89/phy.c +++ b/drivers/net/wireless/realtek/rtw89/phy.c @@ -467,11 +467,11 @@ static void rtw89_phy_ra_sta_update(struct rtw89_dev *rtwdev, ra->csi_mode = csi_mode; } -static void __rtw89_phy_ra_update_sta(struct rtw89_dev *rtwdev, - struct rtw89_vif_link *rtwvif_link, - struct rtw89_sta_link *rtwsta_link, - u32 changed) +void rtw89_phy_ra_update_sta_link(struct rtw89_dev *rtwdev, + struct rtw89_sta_link *rtwsta_link, + u32 changed) { + struct rtw89_vif_link *rtwvif_link = rtwsta_link->rtwvif_link; struct ieee80211_vif *vif = rtwvif_link_to_vif(rtwvif_link); struct rtw89_ra_info *ra = &rtwsta_link->ra; struct ieee80211_link_sta *link_sta; @@ -504,14 +504,11 @@ void rtw89_phy_ra_update_sta(struct rtw89_dev *rtwdev, struct ieee80211_sta *sta u32 changed) { struct rtw89_sta *rtwsta = sta_to_rtwsta(sta); - struct rtw89_vif_link *rtwvif_link; struct rtw89_sta_link *rtwsta_link; unsigned int link_id; - rtw89_sta_for_each_link(rtwsta, rtwsta_link, link_id) { - rtwvif_link = rtwsta_link->rtwvif_link; - __rtw89_phy_ra_update_sta(rtwdev, rtwvif_link, rtwsta_link, changed); - } + rtw89_sta_for_each_link(rtwsta, rtwsta_link, link_id) + rtw89_phy_ra_update_sta_link(rtwdev, rtwsta_link, changed); } static bool __check_rate_pattern(struct rtw89_phy_rate_pattern *next, diff --git a/drivers/net/wireless/realtek/rtw89/phy.h b/drivers/net/wireless/realtek/rtw89/phy.h index f4ef7f5fb0814..08a026ac9d386 100644 --- a/drivers/net/wireless/realtek/rtw89/phy.h +++ b/drivers/net/wireless/realtek/rtw89/phy.h @@ -926,6 +926,9 @@ void rtw89_phy_ra_assoc(struct rtw89_dev *rtwdev, struct rtw89_sta_link *rtwsta_ void rtw89_phy_ra_update(struct rtw89_dev *rtwdev); void rtw89_phy_ra_update_sta(struct rtw89_dev *rtwdev, struct ieee80211_sta *sta, u32 changed); +void rtw89_phy_ra_update_sta_link(struct rtw89_dev *rtwdev, + struct rtw89_sta_link *rtwsta_link, + u32 changed); void rtw89_phy_rate_pattern_vif(struct rtw89_dev *rtwdev, struct ieee80211_vif *vif, const struct cfg80211_bitrate_mask *mask); From 7b98caea39676561f22db58752551161bb36462b Mon Sep 17 00:00:00 2001 From: Chih-Kang Chang Date: Thu, 28 Nov 2024 13:54:33 +0800 Subject: [PATCH 0152/1450] wifi: rtw89: add crystal_cap check to avoid setting as overflow value In the original flow, the crystal_cap might be calculated as a negative value and set as an overflow value. Therefore, we added a check to limit the calculated crystal_cap value. Additionally, we shrank the crystal_cap adjustment according to specific CFO. Signed-off-by: Chih-Kang Chang Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241128055433.11851-7-pkshih@realtek.com --- drivers/net/wireless/realtek/rtw89/phy.c | 11 ++++++----- drivers/net/wireless/realtek/rtw89/phy.h | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw89/phy.c b/drivers/net/wireless/realtek/rtw89/phy.c index e88ed9ec57c5b..8d36bf9627323 100644 --- a/drivers/net/wireless/realtek/rtw89/phy.c +++ b/drivers/net/wireless/realtek/rtw89/phy.c @@ -4266,7 +4266,6 @@ static void rtw89_phy_cfo_set_crystal_cap(struct rtw89_dev *rtwdev, if (!force && cfo->crystal_cap == crystal_cap) return; - crystal_cap = clamp_t(u8, crystal_cap, 0, 127); if (chip->chip_id == RTL8852A || chip->chip_id == RTL8851B) { rtw89_phy_cfo_set_xcap_reg(rtwdev, true, crystal_cap); rtw89_phy_cfo_set_xcap_reg(rtwdev, false, crystal_cap); @@ -4389,7 +4388,7 @@ static void rtw89_phy_cfo_crystal_cap_adjust(struct rtw89_dev *rtwdev, s32 curr_cfo) { struct rtw89_cfo_tracking_info *cfo = &rtwdev->cfo_tracking; - s8 crystal_cap = cfo->crystal_cap; + int crystal_cap = cfo->crystal_cap; s32 cfo_abs = abs(curr_cfo); int sign; @@ -4410,15 +4409,17 @@ static void rtw89_phy_cfo_crystal_cap_adjust(struct rtw89_dev *rtwdev, } sign = curr_cfo > 0 ? 1 : -1; if (cfo_abs > CFO_TRK_STOP_TH_4) - crystal_cap += 7 * sign; + crystal_cap += 3 * sign; else if (cfo_abs > CFO_TRK_STOP_TH_3) - crystal_cap += 5 * sign; - else if (cfo_abs > CFO_TRK_STOP_TH_2) crystal_cap += 3 * sign; + else if (cfo_abs > CFO_TRK_STOP_TH_2) + crystal_cap += 1 * sign; else if (cfo_abs > CFO_TRK_STOP_TH_1) crystal_cap += 1 * sign; else return; + + crystal_cap = clamp(crystal_cap, 0, 127); rtw89_phy_cfo_set_crystal_cap(rtwdev, (u8)crystal_cap, false); rtw89_debug(rtwdev, RTW89_DBG_CFO, "X_cap{Curr,Default}={0x%x,0x%x}\n", diff --git a/drivers/net/wireless/realtek/rtw89/phy.h b/drivers/net/wireless/realtek/rtw89/phy.h index 08a026ac9d386..e6d06f0a6c093 100644 --- a/drivers/net/wireless/realtek/rtw89/phy.h +++ b/drivers/net/wireless/realtek/rtw89/phy.h @@ -57,7 +57,7 @@ #define CFO_TRK_STOP_TH_4 (30 << 2) #define CFO_TRK_STOP_TH_3 (20 << 2) #define CFO_TRK_STOP_TH_2 (10 << 2) -#define CFO_TRK_STOP_TH_1 (00 << 2) +#define CFO_TRK_STOP_TH_1 (03 << 2) #define CFO_TRK_STOP_TH (2 << 2) #define CFO_SW_COMP_FINE_TUNE (2 << 2) #define CFO_PERIOD_CNT 15 From e05feab22fd7dabcd6d272c4e2401ec1acdfdb9b Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Tue, 3 Dec 2024 15:45:37 +0200 Subject: [PATCH 0153/1450] RDMA/mlx5: Enforce same type port association for multiport RoCE Different core device types such as PFs and VFs shouldn't be affiliated together since they have different capabilities, fix that by enforcing type check before doing the affiliation. Fixes: 32f69e4be269 ("{net, IB}/mlx5: Manage port association for multiport RoCE") Reviewed-by: Mark Bloch Signed-off-by: Patrisious Haddad Link: https://patch.msgid.link/88699500f690dff1c1852c1ddb71f8a1cc8b956e.1733233480.git.leonro@nvidia.com Reviewed-by: Mateusz Polchlopek Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 6 ++++-- include/linux/mlx5/driver.h | 6 ++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index bc7930d0c5647..c2314797afc9b 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3639,7 +3639,8 @@ static int mlx5_ib_init_multiport_master(struct mlx5_ib_dev *dev) list_for_each_entry(mpi, &mlx5_ib_unaffiliated_port_list, list) { if (dev->sys_image_guid == mpi->sys_image_guid && - (mlx5_core_native_port_num(mpi->mdev) - 1) == i) { + (mlx5_core_native_port_num(mpi->mdev) - 1) == i && + mlx5_core_same_coredev_type(dev->mdev, mpi->mdev)) { bound = mlx5_ib_bind_slave_port(dev, mpi); } @@ -4785,7 +4786,8 @@ static int mlx5r_mp_probe(struct auxiliary_device *adev, mutex_lock(&mlx5_ib_multiport_mutex); list_for_each_entry(dev, &mlx5_ib_dev_list, ib_dev_list) { - if (dev->sys_image_guid == mpi->sys_image_guid) + if (dev->sys_image_guid == mpi->sys_image_guid && + mlx5_core_same_coredev_type(dev->mdev, mpi->mdev)) bound = mlx5_ib_bind_slave_port(dev, mpi); if (bound) { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index fc7e6153b73d9..4f9e6f6dbaaba 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1202,6 +1202,12 @@ static inline bool mlx5_core_is_vf(const struct mlx5_core_dev *dev) return dev->coredev_type == MLX5_COREDEV_VF; } +static inline bool mlx5_core_same_coredev_type(const struct mlx5_core_dev *dev1, + const struct mlx5_core_dev *dev2) +{ + return dev1->coredev_type == dev2->coredev_type; +} + static inline bool mlx5_core_is_ecpf(const struct mlx5_core_dev *dev) { return dev->caps.embedded_cpu; From 79d330fbdffd8cee06d8bdf38d82cb62d8363a27 Mon Sep 17 00:00:00 2001 From: Kashyap Desai Date: Wed, 4 Dec 2024 13:24:12 +0530 Subject: [PATCH 0154/1450] RDMA/bnxt_re: Fix max SGEs for the Work Request Gen P7 supports up to 13 SGEs for now. WQE software structure can hold only 6 now. Since the max send sge is reported as 13, the stack can give requests up to 13 SGEs. This is causing traffic failures and system crashes. Use the define for max SGE supported for variable size. This will work for both static and variable WQEs. Fixes: 227f51743b61 ("RDMA/bnxt_re: Fix the max WQE size for static WQE support") Signed-off-by: Kashyap Desai Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241204075416.478431-2-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_fp.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h index ef3424c813456..19e279871f107 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -114,7 +114,6 @@ struct bnxt_qplib_sge { u32 size; }; -#define BNXT_QPLIB_QP_MAX_SGL 6 struct bnxt_qplib_swq { u64 wr_id; int next_idx; @@ -154,7 +153,7 @@ struct bnxt_qplib_swqe { #define BNXT_QPLIB_SWQE_FLAGS_UC_FENCE BIT(2) #define BNXT_QPLIB_SWQE_FLAGS_SOLICIT_EVENT BIT(3) #define BNXT_QPLIB_SWQE_FLAGS_INLINE BIT(4) - struct bnxt_qplib_sge sg_list[BNXT_QPLIB_QP_MAX_SGL]; + struct bnxt_qplib_sge sg_list[BNXT_VAR_MAX_SGE]; int num_sge; /* Max inline data is 96 bytes */ u32 inline_len; From 5effcacc8a8f3eb2a9f069d7e81a9ac793598dfb Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Wed, 4 Dec 2024 13:24:13 +0530 Subject: [PATCH 0155/1450] RDMA/bnxt_re: Avoid initializing the software queue for user queues Software Queues to hold the WRs needs to be created for only kernel queues. Avoid allocating the unnecessary memory for user Queues. Fixes: 1ac5a4047975 ("RDMA/bnxt_re: Add bnxt_re RoCE driver") Fixes: 159fb4ceacd7 ("RDMA/bnxt_re: introduce a function to allocate swq") Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241204075416.478431-3-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 42 +++++++++++++----------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 9af8aaadc99a8..72f35070f671c 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -659,13 +659,6 @@ int bnxt_qplib_create_srq(struct bnxt_qplib_res *res, rc = bnxt_qplib_alloc_init_hwq(&srq->hwq, &hwq_attr); if (rc) return rc; - - srq->swq = kcalloc(srq->hwq.max_elements, sizeof(*srq->swq), - GFP_KERNEL); - if (!srq->swq) { - rc = -ENOMEM; - goto fail; - } srq->dbinfo.flags = 0; bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, CMDQ_BASE_OPCODE_CREATE_SRQ, @@ -694,9 +687,17 @@ int bnxt_qplib_create_srq(struct bnxt_qplib_res *res, spin_lock_init(&srq->lock); srq->start_idx = 0; srq->last_idx = srq->hwq.max_elements - 1; - for (idx = 0; idx < srq->hwq.max_elements; idx++) - srq->swq[idx].next_idx = idx + 1; - srq->swq[srq->last_idx].next_idx = -1; + if (!srq->hwq.is_user) { + srq->swq = kcalloc(srq->hwq.max_elements, sizeof(*srq->swq), + GFP_KERNEL); + if (!srq->swq) { + rc = -ENOMEM; + goto fail; + } + for (idx = 0; idx < srq->hwq.max_elements; idx++) + srq->swq[idx].next_idx = idx + 1; + srq->swq[srq->last_idx].next_idx = -1; + } srq->id = le32_to_cpu(resp.xid); srq->dbinfo.hwq = &srq->hwq; @@ -1042,13 +1043,14 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) if (rc) return rc; - rc = bnxt_qplib_alloc_init_swq(sq); - if (rc) - goto fail_sq; - - if (psn_sz) - bnxt_qplib_init_psn_ptr(qp, psn_sz); + if (!sq->hwq.is_user) { + rc = bnxt_qplib_alloc_init_swq(sq); + if (rc) + goto fail_sq; + if (psn_sz) + bnxt_qplib_init_psn_ptr(qp, psn_sz); + } req.sq_size = cpu_to_le32(bnxt_qplib_set_sq_size(sq, qp->wqe_mode)); pbl = &sq->hwq.pbl[PBL_LVL_0]; req.sq_pbl = cpu_to_le64(pbl->pg_map_arr[0]); @@ -1074,9 +1076,11 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) rc = bnxt_qplib_alloc_init_hwq(&rq->hwq, &hwq_attr); if (rc) goto sq_swq; - rc = bnxt_qplib_alloc_init_swq(rq); - if (rc) - goto fail_rq; + if (!rq->hwq.is_user) { + rc = bnxt_qplib_alloc_init_swq(rq); + if (rc) + goto fail_rq; + } req.rq_size = cpu_to_le32(rq->max_wqe); pbl = &rq->hwq.pbl[PBL_LVL_0]; From 064c22408a73b9e945139b64614c534cbbefb591 Mon Sep 17 00:00:00 2001 From: Kashyap Desai Date: Wed, 4 Dec 2024 13:24:14 +0530 Subject: [PATCH 0156/1450] RDMA/bnxt_re: Avoid sending the modify QP workaround for latest adapters The workaround to modify the UD QP from RTS to RTS is required only for older adapters. Issuing this for latest adapters can caus some unexpected behavior. Fix it Fixes: 1801d87b3598 ("RDMA/bnxt_re: Support new 5760X P7 devices") Signed-off-by: Kashyap Desai Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241204075416.478431-4-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 82023394e3300..5428a1408ceee 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -2824,7 +2824,8 @@ static int bnxt_re_post_send_shadow_qp(struct bnxt_re_dev *rdev, wr = wr->next; } bnxt_qplib_post_send_db(&qp->qplib_qp); - bnxt_ud_qp_hw_stall_workaround(qp); + if (!bnxt_qplib_is_chip_gen_p5_p7(qp->rdev->chip_ctx)) + bnxt_ud_qp_hw_stall_workaround(qp); spin_unlock_irqrestore(&qp->sq_lock, flags); return rc; } @@ -2936,7 +2937,8 @@ int bnxt_re_post_send(struct ib_qp *ib_qp, const struct ib_send_wr *wr, wr = wr->next; } bnxt_qplib_post_send_db(&qp->qplib_qp); - bnxt_ud_qp_hw_stall_workaround(qp); + if (!bnxt_qplib_is_chip_gen_p5_p7(qp->rdev->chip_ctx)) + bnxt_ud_qp_hw_stall_workaround(qp); spin_unlock_irqrestore(&qp->sq_lock, flags); return rc; From d507d29bfde3fee6a74d098a9ac640b8fc1a549b Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Wed, 4 Dec 2024 13:24:16 +0530 Subject: [PATCH 0157/1450] RDMA/bnxt_re: Don't fail destroy QP and cleanup debugfs earlier Change bnxt_re_destroy_qp to always return 0 and don't fail in case of error during destroy. In addition, delete debugfs QP to earlier stage. Fixes: d7d54769c042 ("RDMA/bnxt_re: Add debugfs hook in the driver") Reviewed-by: Kashyap Desai Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241204075416.478431-6-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 5428a1408ceee..215074c0860b8 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -967,13 +967,13 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata) unsigned int flags; int rc; + bnxt_re_debug_rem_qpinfo(rdev, qp); + bnxt_qplib_flush_cqn_wq(&qp->qplib_qp); rc = bnxt_qplib_destroy_qp(&rdev->qplib_res, &qp->qplib_qp); - if (rc) { + if (rc) ibdev_err(&rdev->ibdev, "Failed to destroy HW QP"); - return rc; - } if (rdma_is_kernel_res(&qp->ib_qp.res)) { flags = bnxt_re_lock_cqs(qp); @@ -983,11 +983,8 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata) bnxt_qplib_free_qp_res(&rdev->qplib_res, &qp->qplib_qp); - if (ib_qp->qp_type == IB_QPT_GSI && rdev->gsi_ctx.gsi_sqp) { - rc = bnxt_re_destroy_gsi_sqp(qp); - if (rc) - return rc; - } + if (ib_qp->qp_type == IB_QPT_GSI && rdev->gsi_ctx.gsi_sqp) + bnxt_re_destroy_gsi_sqp(qp); mutex_lock(&rdev->qp_lock); list_del(&qp->list); @@ -998,8 +995,6 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata) else if (qp->qplib_qp.type == CMDQ_CREATE_QP_TYPE_UD) atomic_dec(&rdev->stats.res.ud_qp_count); - bnxt_re_debug_rem_qpinfo(rdev, qp); - ib_umem_release(qp->rumem); ib_umem_release(qp->sumem); From ffa794846bf777a06407d94ef69b9b1c5ac5a6c6 Mon Sep 17 00:00:00 2001 From: Christian Hopps Date: Thu, 14 Nov 2024 02:06:58 -0500 Subject: [PATCH 0158/1450] xfrm: config: add CONFIG_XFRM_IPTFS Add new Kconfig option to enable IP-TFS (RFC9347) functionality. Signed-off-by: Christian Hopps Tested-by: Antony Antony Signed-off-by: Steffen Klassert --- net/xfrm/Kconfig | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index d7b16f2c23e96..f0157702718ff 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -135,6 +135,22 @@ config NET_KEY_MIGRATE If unsure, say N. +config XFRM_IPTFS + tristate "IPsec IP-TFS/AGGFRAG (RFC 9347) encapsulation support" + depends on XFRM + help + Information on the IP-TFS/AGGFRAG encapsulation can be found + in RFC 9347. This feature supports demand driven (i.e., + non-constant send rate) IP-TFS to take advantage of the + AGGFRAG ESP payload encapsulation. This payload type + supports aggregation and fragmentation of the inner IP + packet stream which in turn yields higher small-packet + bandwidth as well as reducing MTU/PMTU issues. Congestion + control is unimplementated as the send rate is demand driven + rather than constant. + + If unsure, say N. + config XFRM_ESPINTCP bool From 64e844505bc08cde3f346f193cbbbab0096fef54 Mon Sep 17 00:00:00 2001 From: Christian Hopps Date: Thu, 14 Nov 2024 02:06:59 -0500 Subject: [PATCH 0159/1450] include: uapi: protocol number and packet structs for AGGFRAG in ESP Add the RFC assigned IP protocol number for AGGFRAG. Add the on-wire basic and congestion-control IP-TFS packet headers. Signed-off-by: Christian Hopps Tested-by: Antony Antony Signed-off-by: Steffen Klassert --- include/uapi/linux/in.h | 2 ++ include/uapi/linux/ip.h | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index 5d32d53508d99..ced0fc3c3aa53 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -79,6 +79,8 @@ enum { #define IPPROTO_MPLS IPPROTO_MPLS IPPROTO_ETHERNET = 143, /* Ethernet-within-IPv6 Encapsulation */ #define IPPROTO_ETHERNET IPPROTO_ETHERNET + IPPROTO_AGGFRAG = 144, /* AGGFRAG in ESP (RFC 9347) */ +#define IPPROTO_AGGFRAG IPPROTO_AGGFRAG IPPROTO_RAW = 255, /* Raw IP packets */ #define IPPROTO_RAW IPPROTO_RAW IPPROTO_SMC = 256, /* Shared Memory Communications */ diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h index 283dec7e36451..5bd7ce934d746 100644 --- a/include/uapi/linux/ip.h +++ b/include/uapi/linux/ip.h @@ -137,6 +137,22 @@ struct ip_beet_phdr { __u8 reserved; }; +struct ip_iptfs_hdr { + __u8 subtype; /* 0*: basic, 1: CC */ + __u8 flags; + __be16 block_offset; +}; + +struct ip_iptfs_cc_hdr { + __u8 subtype; /* 0: basic, 1*: CC */ + __u8 flags; + __be16 block_offset; + __be32 loss_rate; + __be64 rtt_adelay_xdelay; + __be32 tval; + __be32 techo; +}; + /* index values for the variables in ipv4_devconf */ enum { From f69eb4f65c58f5a081dbafb76011dad73757420c Mon Sep 17 00:00:00 2001 From: Christian Hopps Date: Thu, 14 Nov 2024 02:07:00 -0500 Subject: [PATCH 0160/1450] xfrm: netlink: add config (netlink) options Add netlink options for configuring IP-TFS SAs. Signed-off-by: Christian Hopps Tested-by: Antony Antony Signed-off-by: Steffen Klassert --- include/uapi/linux/xfrm.h | 9 ++++++- net/xfrm/xfrm_compat.c | 10 ++++++-- net/xfrm/xfrm_user.c | 52 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 68 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index d73a97e3030a8..a23495c0e0a10 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -158,7 +158,8 @@ enum { #define XFRM_MODE_ROUTEOPTIMIZATION 2 #define XFRM_MODE_IN_TRIGGER 3 #define XFRM_MODE_BEET 4 -#define XFRM_MODE_MAX 5 +#define XFRM_MODE_IPTFS 5 +#define XFRM_MODE_MAX 6 /* Netlink configuration messages. */ enum { @@ -323,6 +324,12 @@ enum xfrm_attr_type_t { XFRMA_SA_DIR, /* __u8 */ XFRMA_NAT_KEEPALIVE_INTERVAL, /* __u32 in seconds for NAT keepalive */ XFRMA_SA_PCPU, /* __u32 */ + XFRMA_IPTFS_DROP_TIME, /* __u32 in: usec to wait for next seq */ + XFRMA_IPTFS_REORDER_WINDOW, /* __u16 in: reorder window size (pkts) */ + XFRMA_IPTFS_DONT_FRAG, /* out: don't use fragmentation */ + XFRMA_IPTFS_INIT_DELAY, /* __u32 out: initial packet wait delay (usec) */ + XFRMA_IPTFS_MAX_QSIZE, /* __u32 out: max ingress queue size (octets) */ + XFRMA_IPTFS_PKT_SIZE, /* __u32 out: size of outer packet, 0 for PMTU */ __XFRMA_MAX #define XFRMA_OUTPUT_MARK XFRMA_SET_MARK /* Compatibility */ diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index 5b9ee63e30b69..b8d2e6930041a 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -284,9 +284,15 @@ static int xfrm_xlate64_attr(struct sk_buff *dst, const struct nlattr *src) case XFRMA_SA_DIR: case XFRMA_NAT_KEEPALIVE_INTERVAL: case XFRMA_SA_PCPU: + case XFRMA_IPTFS_DROP_TIME: + case XFRMA_IPTFS_REORDER_WINDOW: + case XFRMA_IPTFS_DONT_FRAG: + case XFRMA_IPTFS_INIT_DELAY: + case XFRMA_IPTFS_MAX_QSIZE: + case XFRMA_IPTFS_PKT_SIZE: return xfrm_nla_cpy(dst, src, nla_len(src)); default: - BUILD_BUG_ON(XFRMA_MAX != XFRMA_SA_PCPU); + BUILD_BUG_ON(XFRMA_MAX != XFRMA_IPTFS_PKT_SIZE); pr_warn_once("unsupported nla_type %d\n", src->nla_type); return -EOPNOTSUPP; } @@ -441,7 +447,7 @@ static int xfrm_xlate32_attr(void *dst, const struct nlattr *nla, int err; if (type > XFRMA_MAX) { - BUILD_BUG_ON(XFRMA_MAX != XFRMA_SA_PCPU); + BUILD_BUG_ON(XFRMA_MAX != XFRMA_IPTFS_PKT_SIZE); NL_SET_ERR_MSG(extack, "Bad attribute"); return -EOPNOTSUPP; } diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index b2876e09328b6..749ec56101ac6 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -301,6 +301,16 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, NL_SET_ERR_MSG(extack, "TFC padding can only be used in tunnel mode"); goto out; } + if ((attrs[XFRMA_IPTFS_DROP_TIME] || + attrs[XFRMA_IPTFS_REORDER_WINDOW] || + attrs[XFRMA_IPTFS_DONT_FRAG] || + attrs[XFRMA_IPTFS_INIT_DELAY] || + attrs[XFRMA_IPTFS_MAX_QSIZE] || + attrs[XFRMA_IPTFS_PKT_SIZE]) && + p->mode != XFRM_MODE_IPTFS) { + NL_SET_ERR_MSG(extack, "IP-TFS options can only be used in IP-TFS mode"); + goto out; + } break; case IPPROTO_COMP: @@ -421,6 +431,18 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, goto out; } + if (attrs[XFRMA_IPTFS_DROP_TIME]) { + NL_SET_ERR_MSG(extack, "IP-TFS drop time should not be set for output SA"); + err = -EINVAL; + goto out; + } + + if (attrs[XFRMA_IPTFS_REORDER_WINDOW]) { + NL_SET_ERR_MSG(extack, "IP-TFS reorder window should not be set for output SA"); + err = -EINVAL; + goto out; + } + if (attrs[XFRMA_REPLAY_VAL]) { struct xfrm_replay_state *replay; @@ -458,6 +480,30 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, } } + + if (attrs[XFRMA_IPTFS_DONT_FRAG]) { + NL_SET_ERR_MSG(extack, "IP-TFS don't fragment should not be set for input SA"); + err = -EINVAL; + goto out; + } + + if (attrs[XFRMA_IPTFS_INIT_DELAY]) { + NL_SET_ERR_MSG(extack, "IP-TFS initial delay should not be set for input SA"); + err = -EINVAL; + goto out; + } + + if (attrs[XFRMA_IPTFS_MAX_QSIZE]) { + NL_SET_ERR_MSG(extack, "IP-TFS max queue size should not be set for input SA"); + err = -EINVAL; + goto out; + } + + if (attrs[XFRMA_IPTFS_PKT_SIZE]) { + NL_SET_ERR_MSG(extack, "IP-TFS packet size should not be set for input SA"); + err = -EINVAL; + goto out; + } } if (!sa_dir && attrs[XFRMA_SA_PCPU]) { @@ -3220,6 +3266,12 @@ const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_SA_DIR] = NLA_POLICY_RANGE(NLA_U8, XFRM_SA_DIR_IN, XFRM_SA_DIR_OUT), [XFRMA_NAT_KEEPALIVE_INTERVAL] = { .type = NLA_U32 }, [XFRMA_SA_PCPU] = { .type = NLA_U32 }, + [XFRMA_IPTFS_DROP_TIME] = { .type = NLA_U32 }, + [XFRMA_IPTFS_REORDER_WINDOW] = { .type = NLA_U16 }, + [XFRMA_IPTFS_DONT_FRAG] = { .type = NLA_FLAG }, + [XFRMA_IPTFS_INIT_DELAY] = { .type = NLA_U32 }, + [XFRMA_IPTFS_MAX_QSIZE] = { .type = NLA_U32 }, + [XFRMA_IPTFS_PKT_SIZE] = { .type = NLA_U32 }, }; EXPORT_SYMBOL_GPL(xfrma_policy); From 7ac64f4598b4daa3f955f82759760666e047bdf8 Mon Sep 17 00:00:00 2001 From: Christian Hopps Date: Thu, 14 Nov 2024 02:07:01 -0500 Subject: [PATCH 0161/1450] xfrm: add mode_cbs module functionality Add a set of callbacks xfrm_mode_cbs to xfrm_state. These callbacks enable the addition of new xfrm modes, such as IP-TFS to be defined in modules. Signed-off-by: Christian Hopps Tested-by: Antony Antony Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 43 +++++++++++++++++++++++++ net/xfrm/xfrm_device.c | 3 +- net/xfrm/xfrm_input.c | 18 +++++++++-- net/xfrm/xfrm_output.c | 2 ++ net/xfrm/xfrm_policy.c | 18 +++++++---- net/xfrm/xfrm_state.c | 72 ++++++++++++++++++++++++++++++++++++++++++ net/xfrm/xfrm_user.c | 13 ++++++++ 7 files changed, 159 insertions(+), 10 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 32c09e85a64ce..1ebc09cde6278 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -213,6 +213,7 @@ struct xfrm_state { u16 family; xfrm_address_t saddr; int header_len; + int enc_hdr_len; int trailer_len; u32 extra_flags; struct xfrm_mark smark; @@ -303,6 +304,9 @@ struct xfrm_state { * interpreted by xfrm_type methods. */ void *data; u8 dir; + + const struct xfrm_mode_cbs *mode_cbs; + void *mode_data; }; static inline struct net *xs_net(struct xfrm_state *x) @@ -460,6 +464,45 @@ struct xfrm_type_offload { int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned short family); void xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family); +/** + * struct xfrm_mode_cbs - XFRM mode callbacks + * @owner: module owner or NULL + * @init_state: Add/init mode specific state in `xfrm_state *x` + * @clone_state: Copy mode specific values from `orig` to new state `x` + * @destroy_state: Cleanup mode specific state from `xfrm_state *x` + * @user_init: Process mode specific netlink attributes from user + * @copy_to_user: Add netlink attributes to `attrs` based on state in `x` + * @sa_len: Return space required to store mode specific netlink attributes + * @get_inner_mtu: Return avail payload space after removing encap overhead + * @input: Process received packet from SA using mode + * @output: Output given packet using mode + * @prepare_output: Add mode specific encapsulation to packet in skb. On return + * `transport_header` should point at ESP header, `network_header` should + * point at outer IP header and `mac_header` should opint at the + * protocol/nexthdr field of the outer IP. + * + * One should examine and understand the specific uses of these callbacks in + * xfrm for further detail on how and when these functions are called. RTSL. + */ +struct xfrm_mode_cbs { + struct module *owner; + int (*init_state)(struct xfrm_state *x); + int (*clone_state)(struct xfrm_state *x, struct xfrm_state *orig); + void (*destroy_state)(struct xfrm_state *x); + int (*user_init)(struct net *net, struct xfrm_state *x, + struct nlattr **attrs, + struct netlink_ext_ack *extack); + int (*copy_to_user)(struct xfrm_state *x, struct sk_buff *skb); + unsigned int (*sa_len)(const struct xfrm_state *x); + u32 (*get_inner_mtu)(struct xfrm_state *x, int outer_mtu); + int (*input)(struct xfrm_state *x, struct sk_buff *skb); + int (*output)(struct net *net, struct sock *sk, struct sk_buff *skb); + int (*prepare_output)(struct xfrm_state *x, struct sk_buff *skb); +}; + +int xfrm_register_mode_cbs(u8 mode, const struct xfrm_mode_cbs *mode_cbs); +void xfrm_unregister_mode_cbs(u8 mode); + static inline int xfrm_af2proto(unsigned int family) { switch(family) { diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index b33c4591e09a4..1fe1b07d879d2 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -42,7 +42,8 @@ static void __xfrm_mode_tunnel_prep(struct xfrm_state *x, struct sk_buff *skb, skb->transport_header = skb->network_header + hsize; skb_reset_mac_len(skb); - pskb_pull(skb, skb->mac_len + x->props.header_len); + pskb_pull(skb, + skb->mac_len + x->props.header_len - x->props.enc_hdr_len); } static void __xfrm_mode_beet_prep(struct xfrm_state *x, struct sk_buff *skb, diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 841a60a6fbfea..2c4ae61e7e3a0 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -446,6 +446,9 @@ static int xfrm_inner_mode_input(struct xfrm_state *x, WARN_ON_ONCE(1); break; default: + if (x->mode_cbs && x->mode_cbs->input) + return x->mode_cbs->input(x, skb); + WARN_ON_ONCE(1); break; } @@ -453,6 +456,10 @@ static int xfrm_inner_mode_input(struct xfrm_state *x, return -EOPNOTSUPP; } +/* NOTE: encap_type - In addition to the normal (non-negative) values for + * encap_type, a negative value of -1 or -2 can be used to resume/restart this + * function after a previous invocation early terminated for async operation. + */ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { const struct xfrm_state_afinfo *afinfo; @@ -489,6 +496,10 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) family = x->props.family; + /* An encap_type of -2 indicates reconstructed inner packet */ + if (encap_type == -2) + goto resume_decapped; + /* An encap_type of -1 indicates async resumption. */ if (encap_type == -1) { async = 1; @@ -679,11 +690,14 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) XFRM_MODE_SKB_CB(skb)->protocol = nexthdr; - if (xfrm_inner_mode_input(x, skb)) { + err = xfrm_inner_mode_input(x, skb); + if (err == -EINPROGRESS) + return 0; + else if (err) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR); goto drop; } - +resume_decapped: if (x->outer_mode.flags & XFRM_MODE_FLAG_TUNNEL) { decaps = 1; break; diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index e5722c95b8bb3..ef81359e40383 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -472,6 +472,8 @@ static int xfrm_outer_mode_output(struct xfrm_state *x, struct sk_buff *skb) WARN_ON_ONCE(1); break; default: + if (x->mode_cbs && x->mode_cbs->prepare_output) + return x->mode_cbs->prepare_output(x, skb); WARN_ON_ONCE(1); break; } diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 4408c11c0835f..c04014ee623fb 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2748,13 +2748,17 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, dst1->input = dst_discard; - rcu_read_lock(); - afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family); - if (likely(afinfo)) - dst1->output = afinfo->output; - else - dst1->output = dst_discard_out; - rcu_read_unlock(); + if (xfrm[i]->mode_cbs && xfrm[i]->mode_cbs->output) { + dst1->output = xfrm[i]->mode_cbs->output; + } else { + rcu_read_lock(); + afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family); + if (likely(afinfo)) + dst1->output = afinfo->output; + else + dst1->output = dst_discard_out; + rcu_read_unlock(); + } xdst_prev = xdst; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 67ca7ac955a37..cf68ba891729f 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -515,6 +515,60 @@ static const struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family) return NULL; } +static const struct xfrm_mode_cbs __rcu *xfrm_mode_cbs_map[XFRM_MODE_MAX]; +static DEFINE_SPINLOCK(xfrm_mode_cbs_map_lock); + +int xfrm_register_mode_cbs(u8 mode, const struct xfrm_mode_cbs *mode_cbs) +{ + if (mode >= XFRM_MODE_MAX) + return -EINVAL; + + spin_lock_bh(&xfrm_mode_cbs_map_lock); + rcu_assign_pointer(xfrm_mode_cbs_map[mode], mode_cbs); + spin_unlock_bh(&xfrm_mode_cbs_map_lock); + + return 0; +} +EXPORT_SYMBOL(xfrm_register_mode_cbs); + +void xfrm_unregister_mode_cbs(u8 mode) +{ + if (mode >= XFRM_MODE_MAX) + return; + + spin_lock_bh(&xfrm_mode_cbs_map_lock); + RCU_INIT_POINTER(xfrm_mode_cbs_map[mode], NULL); + spin_unlock_bh(&xfrm_mode_cbs_map_lock); + synchronize_rcu(); +} +EXPORT_SYMBOL(xfrm_unregister_mode_cbs); + +static const struct xfrm_mode_cbs *xfrm_get_mode_cbs(u8 mode) +{ + const struct xfrm_mode_cbs *cbs; + bool try_load = true; + + if (mode >= XFRM_MODE_MAX) + return NULL; + +retry: + rcu_read_lock(); + + cbs = rcu_dereference(xfrm_mode_cbs_map[mode]); + if (cbs && !try_module_get(cbs->owner)) + cbs = NULL; + + rcu_read_unlock(); + + if (mode == XFRM_MODE_IPTFS && !cbs && try_load) { + request_module("xfrm-iptfs"); + try_load = false; + goto retry; + } + + return cbs; +} + void xfrm_state_free(struct xfrm_state *x) { kmem_cache_free(xfrm_state_cache, x); @@ -523,6 +577,8 @@ EXPORT_SYMBOL(xfrm_state_free); static void ___xfrm_state_destroy(struct xfrm_state *x) { + if (x->mode_cbs && x->mode_cbs->destroy_state) + x->mode_cbs->destroy_state(x); hrtimer_cancel(&x->mtimer); del_timer_sync(&x->rtimer); kfree(x->aead); @@ -682,6 +738,7 @@ struct xfrm_state *xfrm_state_alloc(struct net *net) x->replay_maxdiff = 0; x->pcpu_num = UINT_MAX; spin_lock_init(&x->lock); + x->mode_data = NULL; } return x; } @@ -1945,6 +2002,12 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, x->new_mapping_sport = 0; x->dir = orig->dir; + x->mode_cbs = orig->mode_cbs; + if (x->mode_cbs && x->mode_cbs->clone_state) { + if (x->mode_cbs->clone_state(x, orig)) + goto error; + } + return x; error: @@ -2986,6 +3049,9 @@ u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) case XFRM_MODE_TUNNEL: break; default: + if (x->mode_cbs && x->mode_cbs->get_inner_mtu) + return x->mode_cbs->get_inner_mtu(x, mtu); + WARN_ON_ONCE(1); break; } @@ -3086,6 +3152,12 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload, } } + x->mode_cbs = xfrm_get_mode_cbs(x->props.mode); + if (x->mode_cbs) { + if (x->mode_cbs->init_state) + err = x->mode_cbs->init_state(x); + module_put(x->mode_cbs->owner); + } error: return err; } diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 749ec56101ac6..71b452fff8dbd 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -932,6 +932,12 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, goto error; } + if (x->mode_cbs && x->mode_cbs->user_init) { + err = x->mode_cbs->user_init(net, x, attrs, extack); + if (err) + goto error; + } + return x; error: @@ -1347,6 +1353,10 @@ static int copy_to_user_state_extra(struct xfrm_state *x, if (ret) goto out; } + if (x->mode_cbs && x->mode_cbs->copy_to_user) + ret = x->mode_cbs->copy_to_user(x, skb); + if (ret) + goto out; if (x->mapping_maxage) { ret = nla_put_u32(skb, XFRMA_MTIMER_THRESH, x->mapping_maxage); if (ret) @@ -3606,6 +3616,9 @@ static inline unsigned int xfrm_sa_len(struct xfrm_state *x) if (x->nat_keepalive_interval) l += nla_total_size(sizeof(x->nat_keepalive_interval)); + if (x->mode_cbs && x->mode_cbs->sa_len) + l += x->mode_cbs->sa_len(x); + return l; } From d1716d5a44c37e5743bf6ea4e5cdbdab37727f27 Mon Sep 17 00:00:00 2001 From: Christian Hopps Date: Thu, 14 Nov 2024 02:07:02 -0500 Subject: [PATCH 0162/1450] xfrm: add generic iptfs defines and functionality Define `XFRM_MODE_IPTFS` and `IPSEC_MODE_IPTFS` constants, and add these to switch case and conditionals adjacent with the existing TUNNEL modes. Signed-off-by: Christian Hopps Tested-by: Antony Antony Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 + include/uapi/linux/ipsec.h | 3 ++- include/uapi/linux/snmp.h | 2 ++ net/ipv4/esp4.c | 3 ++- net/ipv6/esp6.c | 3 ++- net/netfilter/nft_xfrm.c | 3 ++- net/xfrm/xfrm_device.c | 1 + net/xfrm/xfrm_output.c | 4 ++++ net/xfrm/xfrm_policy.c | 8 ++++++-- net/xfrm/xfrm_proc.c | 2 ++ net/xfrm/xfrm_state.c | 12 ++++++++++++ net/xfrm/xfrm_user.c | 12 ++++++++++++ 12 files changed, 48 insertions(+), 6 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 1ebc09cde6278..4b0677e481900 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -38,6 +38,7 @@ #define XFRM_PROTO_COMP 108 #define XFRM_PROTO_IPIP 4 #define XFRM_PROTO_IPV6 41 +#define XFRM_PROTO_IPTFS IPPROTO_AGGFRAG #define XFRM_PROTO_ROUTING IPPROTO_ROUTING #define XFRM_PROTO_DSTOPTS IPPROTO_DSTOPTS diff --git a/include/uapi/linux/ipsec.h b/include/uapi/linux/ipsec.h index 50d8ee1791e2a..696b790f43462 100644 --- a/include/uapi/linux/ipsec.h +++ b/include/uapi/linux/ipsec.h @@ -14,7 +14,8 @@ enum { IPSEC_MODE_ANY = 0, /* We do not support this for SA */ IPSEC_MODE_TRANSPORT = 1, IPSEC_MODE_TUNNEL = 2, - IPSEC_MODE_BEET = 3 + IPSEC_MODE_BEET = 3, + IPSEC_MODE_IPTFS = 4 }; enum { diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index adf5fd78dd503..5a2553511190a 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -339,6 +339,8 @@ enum LINUX_MIB_XFRMACQUIREERROR, /* XfrmAcquireError */ LINUX_MIB_XFRMOUTSTATEDIRERROR, /* XfrmOutStateDirError */ LINUX_MIB_XFRMINSTATEDIRERROR, /* XfrmInStateDirError */ + LINUX_MIB_XFRMINIPTFSERROR, /* XfrmInIptfsError */ + LINUX_MIB_XFRMOUTNOQSPACE, /* XfrmOutNoQueueSpace */ __LINUX_MIB_XFRMMAX }; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index f3281312eb5eb..b0fbf804bbbae 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -816,7 +816,8 @@ int esp_input_done2(struct sk_buff *skb, int err) } skb_pull_rcsum(skb, hlen); - if (x->props.mode == XFRM_MODE_TUNNEL) + if (x->props.mode == XFRM_MODE_TUNNEL || + x->props.mode == XFRM_MODE_IPTFS) skb_reset_transport_header(skb); else skb_set_transport_header(skb, -ihl); diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index b2400c226a325..5f3d0cc1555a9 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -859,7 +859,8 @@ int esp6_input_done2(struct sk_buff *skb, int err) skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); skb_pull_rcsum(skb, hlen); - if (x->props.mode == XFRM_MODE_TUNNEL) + if (x->props.mode == XFRM_MODE_TUNNEL || + x->props.mode == XFRM_MODE_IPTFS) skb_reset_transport_header(skb); else skb_set_transport_header(skb, -hdr_len); diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c index 8a07b46cc8fb7..3210cfc966ab2 100644 --- a/net/netfilter/nft_xfrm.c +++ b/net/netfilter/nft_xfrm.c @@ -112,7 +112,8 @@ static bool xfrm_state_addr_ok(enum nft_xfrm_keys k, u8 family, u8 mode) return true; } - return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL; + return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL || + mode == XFRM_MODE_IPTFS; } static void nft_xfrm_state_get_key(const struct nft_xfrm *priv, diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 1fe1b07d879d2..d1fa94e52ceae 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -69,6 +69,7 @@ static void __xfrm_mode_beet_prep(struct xfrm_state *x, struct sk_buff *skb, static void xfrm_outer_mode_prep(struct xfrm_state *x, struct sk_buff *skb) { switch (x->outer_mode.encap) { + case XFRM_MODE_IPTFS: case XFRM_MODE_TUNNEL: if (x->outer_mode.family == AF_INET) return __xfrm_mode_tunnel_prep(x, skb, diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index ef81359e40383..b5025cf6136e9 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -677,6 +677,10 @@ static void xfrm_get_inner_ipproto(struct sk_buff *skb, struct xfrm_state *x) return; } + if (x->outer_mode.encap == XFRM_MODE_IPTFS) { + xo->inner_ipproto = IPPROTO_AGGFRAG; + return; + } /* non-Tunnel Mode */ if (!skb->encapsulation) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index c04014ee623fb..9e510021ee91a 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2497,6 +2497,7 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl, struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i]; if (tmpl->mode == XFRM_MODE_TUNNEL || + tmpl->mode == XFRM_MODE_IPTFS || tmpl->mode == XFRM_MODE_BEET) { remote = &tmpl->id.daddr; local = &tmpl->saddr; @@ -3294,7 +3295,8 @@ struct dst_entry *xfrm_lookup_with_ifid(struct net *net, ok: xfrm_pols_put(pols, drop_pols); if (dst && dst->xfrm && - dst->xfrm->props.mode == XFRM_MODE_TUNNEL) + (dst->xfrm->props.mode == XFRM_MODE_TUNNEL || + dst->xfrm->props.mode == XFRM_MODE_IPTFS)) dst->flags |= DST_XFRM_TUNNEL; return dst; @@ -4523,6 +4525,7 @@ static int migrate_tmpl_match(const struct xfrm_migrate *m, const struct xfrm_tm switch (t->mode) { case XFRM_MODE_TUNNEL: case XFRM_MODE_BEET: + case XFRM_MODE_IPTFS: if (xfrm_addr_equal(&t->id.daddr, &m->old_daddr, m->old_family) && xfrm_addr_equal(&t->saddr, &m->old_saddr, @@ -4565,7 +4568,8 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol, continue; n++; if (pol->xfrm_vec[i].mode != XFRM_MODE_TUNNEL && - pol->xfrm_vec[i].mode != XFRM_MODE_BEET) + pol->xfrm_vec[i].mode != XFRM_MODE_BEET && + pol->xfrm_vec[i].mode != XFRM_MODE_IPTFS) continue; /* update endpoints */ memcpy(&pol->xfrm_vec[i].id.daddr, &mp->new_daddr, diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c index eeb984be03a79..8e07dd614b0bb 100644 --- a/net/xfrm/xfrm_proc.c +++ b/net/xfrm/xfrm_proc.c @@ -43,6 +43,8 @@ static const struct snmp_mib xfrm_mib_list[] = { SNMP_MIB_ITEM("XfrmAcquireError", LINUX_MIB_XFRMACQUIREERROR), SNMP_MIB_ITEM("XfrmOutStateDirError", LINUX_MIB_XFRMOUTSTATEDIRERROR), SNMP_MIB_ITEM("XfrmInStateDirError", LINUX_MIB_XFRMINSTATEDIRERROR), + SNMP_MIB_ITEM("XfrmInIptfsError", LINUX_MIB_XFRMINIPTFSERROR), + SNMP_MIB_ITEM("XfrmOutNoQueueSpace", LINUX_MIB_XFRMOUTNOQSPACE), SNMP_MIB_SENTINEL }; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index cf68ba891729f..34067cb8a4795 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -467,6 +467,11 @@ static const struct xfrm_mode xfrm4_mode_map[XFRM_MODE_MAX] = { .flags = XFRM_MODE_FLAG_TUNNEL, .family = AF_INET, }, + [XFRM_MODE_IPTFS] = { + .encap = XFRM_MODE_IPTFS, + .flags = XFRM_MODE_FLAG_TUNNEL, + .family = AF_INET, + }, }; static const struct xfrm_mode xfrm6_mode_map[XFRM_MODE_MAX] = { @@ -488,6 +493,11 @@ static const struct xfrm_mode xfrm6_mode_map[XFRM_MODE_MAX] = { .flags = XFRM_MODE_FLAG_TUNNEL, .family = AF_INET6, }, + [XFRM_MODE_IPTFS] = { + .encap = XFRM_MODE_IPTFS, + .flags = XFRM_MODE_FLAG_TUNNEL, + .family = AF_INET6, + }, }; static const struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family) @@ -2334,6 +2344,7 @@ static int __xfrm6_state_sort_cmp(const void *p) #endif case XFRM_MODE_TUNNEL: case XFRM_MODE_BEET: + case XFRM_MODE_IPTFS: return 4; } return 5; @@ -2360,6 +2371,7 @@ static int __xfrm6_tmpl_sort_cmp(const void *p) #endif case XFRM_MODE_TUNNEL: case XFRM_MODE_BEET: + case XFRM_MODE_IPTFS: return 3; } return 4; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 71b452fff8dbd..08c6d6f0179fb 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -383,6 +383,16 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, case XFRM_MODE_ROUTEOPTIMIZATION: case XFRM_MODE_BEET: break; + case XFRM_MODE_IPTFS: + if (p->id.proto != IPPROTO_ESP) { + NL_SET_ERR_MSG(extack, "IP-TFS mode only supported with ESP"); + goto out; + } + if (sa_dir == 0) { + NL_SET_ERR_MSG(extack, "IP-TFS mode requires in or out direction attribute"); + goto out; + } + break; default: NL_SET_ERR_MSG(extack, "Unsupported mode"); @@ -2014,6 +2024,8 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family, return -EINVAL; } break; + case XFRM_MODE_IPTFS: + break; default: if (ut[i].family != prev_family) { NL_SET_ERR_MSG(extack, "Mode in template doesn't support a family change"); From 4b3faf610cc63bfac972711635eafbca5e7d7117 Mon Sep 17 00:00:00 2001 From: Christian Hopps Date: Thu, 14 Nov 2024 02:07:03 -0500 Subject: [PATCH 0163/1450] xfrm: iptfs: add new iptfs xfrm mode impl Add a new xfrm mode implementing AggFrag/IP-TFS from RFC9347. This utilizes the new xfrm_mode_cbs to implement demand-driven IP-TFS functionality. This functionality can be used to increase bandwidth utilization through small packet aggregation, as well as help solve PMTU issues through it's efficient use of fragmentation. Link: https://www.rfc-editor.org/rfc/rfc9347.txt Multiple commits follow to build the functionality into xfrm_iptfs.c Signed-off-by: Christian Hopps Tested-by: Antony Antony Signed-off-by: Steffen Klassert --- net/xfrm/Makefile | 1 + net/xfrm/xfrm_iptfs.c | 216 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 217 insertions(+) create mode 100644 net/xfrm/xfrm_iptfs.c diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile index 512e0b2f85142..5a1787587cb35 100644 --- a/net/xfrm/Makefile +++ b/net/xfrm/Makefile @@ -21,5 +21,6 @@ obj-$(CONFIG_XFRM_USER) += xfrm_user.o obj-$(CONFIG_XFRM_USER_COMPAT) += xfrm_compat.o obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o +obj-$(CONFIG_XFRM_IPTFS) += xfrm_iptfs.o obj-$(CONFIG_XFRM_ESPINTCP) += espintcp.o obj-$(CONFIG_DEBUG_INFO_BTF) += xfrm_state_bpf.o diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c new file mode 100644 index 0000000000000..e7cb8734fc0fd --- /dev/null +++ b/net/xfrm/xfrm_iptfs.c @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: GPL-2.0 +/* xfrm_iptfs: IPTFS encapsulation support + * + * April 21 2022, Christian Hopps + * + * Copyright (c) 2022, LabN Consulting, L.L.C. + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "xfrm_inout.h" + +/** + * struct xfrm_iptfs_config - configuration for the IPTFS tunnel. + * @pkt_size: size of the outer IP packet. 0 to use interface and MTU discovery, + * otherwise the user specified value. + */ +struct xfrm_iptfs_config { + u32 pkt_size; /* outer_packet_size or 0 */ +}; + +/** + * struct xfrm_iptfs_data - mode specific xfrm state. + * @cfg: IPTFS tunnel config. + * @x: owning SA (xfrm_state). + * @payload_mtu: max payload size. + */ +struct xfrm_iptfs_data { + struct xfrm_iptfs_config cfg; + + /* Ingress User Input */ + struct xfrm_state *x; /* owning state */ + u32 payload_mtu; /* max payload size */ +}; + +/* ========================== */ +/* State Management Functions */ +/* ========================== */ + +/** + * iptfs_get_inner_mtu() - return inner MTU with no fragmentation. + * @x: xfrm state. + * @outer_mtu: the outer mtu + */ +static u32 iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu) +{ + struct crypto_aead *aead; + u32 blksize; + + aead = x->data; + blksize = ALIGN(crypto_aead_blocksize(aead), 4); + return ((outer_mtu - x->props.header_len - crypto_aead_authsize(aead)) & + ~(blksize - 1)) - 2; +} + +/** + * iptfs_user_init() - initialize the SA with IPTFS options from netlink. + * @net: the net data + * @x: xfrm state + * @attrs: netlink attributes + * @extack: extack return data + * + * Return: 0 on success or a negative error code on failure + */ +static int iptfs_user_init(struct net *net, struct xfrm_state *x, + struct nlattr **attrs, + struct netlink_ext_ack *extack) +{ + struct xfrm_iptfs_data *xtfs = x->mode_data; + struct xfrm_iptfs_config *xc; + + xc = &xtfs->cfg; + + if (attrs[XFRMA_IPTFS_PKT_SIZE]) { + xc->pkt_size = nla_get_u32(attrs[XFRMA_IPTFS_PKT_SIZE]); + if (!xc->pkt_size) { + xtfs->payload_mtu = 0; + } else if (xc->pkt_size > x->props.header_len) { + xtfs->payload_mtu = xc->pkt_size - x->props.header_len; + } else { + NL_SET_ERR_MSG(extack, + "Packet size must be 0 or greater than IPTFS/ESP header length"); + return -EINVAL; + } + } + return 0; +} + +static unsigned int iptfs_sa_len(const struct xfrm_state *x) +{ + struct xfrm_iptfs_data *xtfs = x->mode_data; + struct xfrm_iptfs_config *xc = &xtfs->cfg; + unsigned int l = 0; + + if (x->dir == XFRM_SA_DIR_OUT) + l += nla_total_size(sizeof(xc->pkt_size)); + + return l; +} + +static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb) +{ + struct xfrm_iptfs_data *xtfs = x->mode_data; + struct xfrm_iptfs_config *xc = &xtfs->cfg; + int ret = 0; + + if (x->dir == XFRM_SA_DIR_OUT) + ret = nla_put_u32(skb, XFRMA_IPTFS_PKT_SIZE, xc->pkt_size); + + return ret; +} + +static void __iptfs_init_state(struct xfrm_state *x, + struct xfrm_iptfs_data *xtfs) +{ + /* Modify type (esp) adjustment values */ + + if (x->props.family == AF_INET) + x->props.header_len += sizeof(struct iphdr) + sizeof(struct ip_iptfs_hdr); + else if (x->props.family == AF_INET6) + x->props.header_len += sizeof(struct ipv6hdr) + sizeof(struct ip_iptfs_hdr); + x->props.enc_hdr_len = sizeof(struct ip_iptfs_hdr); + + /* Always keep a module reference when x->mode_data is set */ + __module_get(x->mode_cbs->owner); + + x->mode_data = xtfs; + xtfs->x = x; +} + +static int iptfs_clone_state(struct xfrm_state *x, struct xfrm_state *orig) +{ + struct xfrm_iptfs_data *xtfs; + + xtfs = kmemdup(orig->mode_data, sizeof(*xtfs), GFP_KERNEL); + if (!xtfs) + return -ENOMEM; + + x->mode_data = xtfs; + xtfs->x = x; + + return 0; +} + +static int iptfs_init_state(struct xfrm_state *x) +{ + struct xfrm_iptfs_data *xtfs; + + if (x->mode_data) { + /* We have arrived here from xfrm_state_clone() */ + xtfs = x->mode_data; + } else { + xtfs = kzalloc(sizeof(*xtfs), GFP_KERNEL); + if (!xtfs) + return -ENOMEM; + } + + __iptfs_init_state(x, xtfs); + + return 0; +} + +static void iptfs_destroy_state(struct xfrm_state *x) +{ + struct xfrm_iptfs_data *xtfs = x->mode_data; + + if (!xtfs) + return; + + kfree_sensitive(xtfs); + + module_put(x->mode_cbs->owner); +} + +static const struct xfrm_mode_cbs iptfs_mode_cbs = { + .owner = THIS_MODULE, + .init_state = iptfs_init_state, + .clone_state = iptfs_clone_state, + .destroy_state = iptfs_destroy_state, + .user_init = iptfs_user_init, + .copy_to_user = iptfs_copy_to_user, + .sa_len = iptfs_sa_len, + .get_inner_mtu = iptfs_get_inner_mtu, +}; + +static int __init xfrm_iptfs_init(void) +{ + int err; + + pr_info("xfrm_iptfs: IPsec IP-TFS tunnel mode module\n"); + + err = xfrm_register_mode_cbs(XFRM_MODE_IPTFS, &iptfs_mode_cbs); + if (err < 0) + pr_info("%s: can't register IP-TFS\n", __func__); + + return err; +} + +static void __exit xfrm_iptfs_fini(void) +{ + xfrm_unregister_mode_cbs(XFRM_MODE_IPTFS); +} + +module_init(xfrm_iptfs_init); +module_exit(xfrm_iptfs_fini); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IP-TFS support for xfrm ipsec tunnels"); From 0e4fbf013fa566f274ce9b4ce698c75b1f998c52 Mon Sep 17 00:00:00 2001 From: Christian Hopps Date: Thu, 14 Nov 2024 02:07:04 -0500 Subject: [PATCH 0164/1450] xfrm: iptfs: add user packet (tunnel ingress) handling Add tunnel packet output functionality. This is code handles the ingress to the tunnel. Signed-off-by: Christian Hopps Tested-by: Antony Antony Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_iptfs.c | 563 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 560 insertions(+), 3 deletions(-) diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c index e7cb8734fc0fd..c4cff005ea9a7 100644 --- a/net/xfrm/xfrm_iptfs.c +++ b/net/xfrm/xfrm_iptfs.c @@ -19,29 +19,541 @@ #include "xfrm_inout.h" +/* ------------------------------------------------ */ +/* IPTFS default SA values (tunnel ingress/dir-out) */ +/* ------------------------------------------------ */ + +/** + * define IPTFS_DEFAULT_INIT_DELAY_USECS - default initial output delay + * + * The initial output delay is the amount of time prior to servicing the output + * queue after queueing the first packet on said queue. This applies anytime the + * output queue was previously empty. + * + * Default 0. + */ +#define IPTFS_DEFAULT_INIT_DELAY_USECS 0 + +/** + * define IPTFS_DEFAULT_MAX_QUEUE_SIZE - default max output queue size. + * + * The default IPTFS max output queue size in octets. The output queue is where + * received packets destined for output over an IPTFS tunnel are stored prior to + * being output in aggregated/fragmented form over the IPTFS tunnel. + * + * Default 1M. + */ +#define IPTFS_DEFAULT_MAX_QUEUE_SIZE (1024 * 10240) + +#define NSECS_IN_USEC 1000 + +#define IPTFS_HRTIMER_MODE HRTIMER_MODE_REL_SOFT + /** * struct xfrm_iptfs_config - configuration for the IPTFS tunnel. * @pkt_size: size of the outer IP packet. 0 to use interface and MTU discovery, * otherwise the user specified value. + * @max_queue_size: The maximum number of octets allowed to be queued to be sent + * over the IPTFS SA. The queue size is measured as the size of all the + * packets enqueued. */ struct xfrm_iptfs_config { u32 pkt_size; /* outer_packet_size or 0 */ + u32 max_queue_size; /* octets */ }; /** * struct xfrm_iptfs_data - mode specific xfrm state. * @cfg: IPTFS tunnel config. * @x: owning SA (xfrm_state). + * @queue: queued user packets to send. + * @queue_size: number of octets on queue (sum of packet sizes). + * @ecn_queue_size: octets above with ECN mark. + * @init_delay_ns: nanoseconds to wait to send initial IPTFS packet. + * @iptfs_timer: output timer. * @payload_mtu: max payload size. */ struct xfrm_iptfs_data { struct xfrm_iptfs_config cfg; /* Ingress User Input */ - struct xfrm_state *x; /* owning state */ + struct xfrm_state *x; /* owning state */ + struct sk_buff_head queue; /* output queue */ + + u32 queue_size; /* octets */ + u32 ecn_queue_size; /* octets above which ECN mark */ + u64 init_delay_ns; /* nanoseconds */ + struct hrtimer iptfs_timer; /* output timer */ u32 payload_mtu; /* max payload size */ }; +static u32 iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu); +static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me); + +/* ================================= */ +/* IPTFS Sending (ingress) Functions */ +/* ================================= */ + +/* ------------------------- */ +/* Enqueue to send functions */ +/* ------------------------- */ + +/** + * iptfs_enqueue() - enqueue packet if ok to send. + * @xtfs: xtfs state + * @skb: the packet + * + * Return: true if packet enqueued. + */ +static bool iptfs_enqueue(struct xfrm_iptfs_data *xtfs, struct sk_buff *skb) +{ + u64 newsz = xtfs->queue_size + skb->len; + struct iphdr *iph; + + assert_spin_locked(&xtfs->x->lock); + + if (newsz > xtfs->cfg.max_queue_size) + return false; + + /* Set ECN CE if we are above our ECN queue threshold */ + if (newsz > xtfs->ecn_queue_size) { + iph = ip_hdr(skb); + if (iph->version == 4) + IP_ECN_set_ce(iph); + else if (iph->version == 6) + IP6_ECN_set_ce(skb, ipv6_hdr(skb)); + } + + __skb_queue_tail(&xtfs->queue, skb); + xtfs->queue_size += skb->len; + return true; +} + +static int iptfs_get_cur_pmtu(struct xfrm_state *x, struct xfrm_iptfs_data *xtfs, + struct sk_buff *skb) +{ + struct xfrm_dst *xdst = (struct xfrm_dst *)skb_dst(skb); + u32 payload_mtu = xtfs->payload_mtu; + u32 pmtu = iptfs_get_inner_mtu(x, xdst->child_mtu_cached); + + if (payload_mtu && payload_mtu < pmtu) + pmtu = payload_mtu; + + return pmtu; +} + +static int iptfs_is_too_big(struct sock *sk, struct sk_buff *skb, u32 pmtu) +{ + if (skb->len <= pmtu) + return 0; + + /* We only send ICMP too big if the user has configured us as + * dont-fragment. + */ + if (skb->dev) + XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMOUTERROR); + + if (sk) + xfrm_local_error(skb, pmtu); + else if (ip_hdr(skb)->version == 4) + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(pmtu)); + else + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, pmtu); + + return 1; +} + +/* IPv4/IPv6 packet ingress to IPTFS tunnel, arrange to send in IPTFS payload + * (i.e., aggregating or fragmenting as appropriate). + * This is set in dst->output for an SA. + */ +static int iptfs_output_collect(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct xfrm_state *x = dst->xfrm; + struct xfrm_iptfs_data *xtfs = x->mode_data; + struct sk_buff *segs, *nskb; + u32 pmtu = 0; + bool ok = true; + bool was_gso; + + /* We have hooked into dst_entry->output which means we have skipped the + * protocol specific netfilter (see xfrm4_output, xfrm6_output). + * when our timer runs we will end up calling xfrm_output directly on + * the encapsulated traffic. + * + * For both cases this is the NF_INET_POST_ROUTING hook which allows + * changing the skb->dst entry which then may not be xfrm based anymore + * in which case a REROUTED flag is set. and dst_output is called. + * + * For IPv6 we are also skipping fragmentation handling for local + * sockets, which may or may not be good depending on our tunnel DF + * setting. Normally with fragmentation supported we want to skip this + * fragmentation. + */ + + pmtu = iptfs_get_cur_pmtu(x, xtfs, skb); + + /* Break apart GSO skbs. If the queue is nearing full then we want the + * accounting and queuing to be based on the individual packets not on the + * aggregate GSO buffer. + */ + was_gso = skb_is_gso(skb); + if (!was_gso) { + segs = skb; + } else { + segs = skb_gso_segment(skb, 0); + if (IS_ERR_OR_NULL(segs)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); + kfree_skb(skb); + if (IS_ERR(segs)) + return PTR_ERR(segs); + return -EINVAL; + } + consume_skb(skb); + skb = NULL; + } + + /* We can be running on multiple cores and from the network softirq or + * from user context depending on where the packet is coming from. + */ + spin_lock_bh(&x->lock); + + skb_list_walk_safe(segs, skb, nskb) { + skb_mark_not_on_list(skb); + + /* Once we drop due to no queue space we continue to drop the + * rest of the packets from that GRO. + */ + if (!ok) { +nospace: + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOQSPACE); + kfree_skb_reason(skb, SKB_DROP_REASON_FULL_RING); + continue; + } + + /* Fragmenting handled in following commits. */ + if (iptfs_is_too_big(sk, skb, pmtu)) { + kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG); + continue; + } + + /* Enqueue to send in tunnel */ + ok = iptfs_enqueue(xtfs, skb); + if (!ok) + goto nospace; + } + + /* Start a delay timer if we don't have one yet */ + if (!hrtimer_is_queued(&xtfs->iptfs_timer)) + hrtimer_start(&xtfs->iptfs_timer, xtfs->init_delay_ns, IPTFS_HRTIMER_MODE); + + spin_unlock_bh(&x->lock); + return 0; +} + +/* -------------------------- */ +/* Dequeue and send functions */ +/* -------------------------- */ + +static void iptfs_output_prepare_skb(struct sk_buff *skb, u32 blkoff) +{ + struct ip_iptfs_hdr *h; + size_t hsz = sizeof(*h); + + /* now reset values to be pointing at the rest of the packets */ + h = skb_push(skb, hsz); + memset(h, 0, hsz); + if (blkoff) + h->block_offset = htons(blkoff); + + /* network_header current points at the inner IP packet + * move it to the iptfs header + */ + skb->transport_header = skb->network_header; + skb->network_header -= hsz; + + IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE; +} + +static struct sk_buff **iptfs_rehome_fraglist(struct sk_buff **nextp, struct sk_buff *child) +{ + u32 fllen = 0; + + /* It might be possible to account for a frag list in addition to page + * fragment if it's a valid state to be in. The page fragments size + * should be kept as data_len so only the frag_list size is removed, + * this must be done above as well. + */ + *nextp = skb_shinfo(child)->frag_list; + while (*nextp) { + fllen += (*nextp)->len; + nextp = &(*nextp)->next; + } + skb_frag_list_init(child); + child->len -= fllen; + child->data_len -= fllen; + + return nextp; +} + +static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list) +{ + struct xfrm_iptfs_data *xtfs = x->mode_data; + struct sk_buff *skb, *skb2, **nextp; + struct skb_shared_info *shi; + + while ((skb = __skb_dequeue(list))) { + u32 mtu = iptfs_get_cur_pmtu(x, xtfs, skb); + int remaining; + + /* protocol comes to us cleared sometimes */ + skb->protocol = x->outer_mode.family == AF_INET ? htons(ETH_P_IP) : + htons(ETH_P_IPV6); + + if (skb->len > mtu) { + /* We handle this case before enqueueing so we are only + * here b/c MTU changed after we enqueued before we + * dequeued, just drop these. + */ + XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTERROR); + + kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG); + continue; + } + + /* If we don't have a cksum in the packet we need to add one + * before encapsulation. + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + if (skb_checksum_help(skb)) { + XFRM_INC_STATS(dev_net(skb_dst(skb)->dev), LINUX_MIB_XFRMOUTERROR); + kfree_skb(skb); + continue; + } + } + + /* Consider the buffer Tx'd and no longer owned */ + skb_orphan(skb); + + /* Convert first inner packet into an outer IPTFS packet */ + iptfs_output_prepare_skb(skb, 0); + + /* The space remaining to send more inner packet data is `mtu` - + * (skb->len - sizeof iptfs header). This is b/c the `mtu` value + * has the basic IPTFS header len accounted for, and we added + * that header to the skb so it is a part of skb->len, thus we + * subtract it from the skb length. + */ + remaining = mtu - (skb->len - sizeof(struct ip_iptfs_hdr)); + + /* Re-home (un-nest) nested fragment lists. We need to do this + * b/c we will simply be appending any following aggregated + * inner packets to the frag list. + */ + shi = skb_shinfo(skb); + nextp = &shi->frag_list; + while (*nextp) { + if (skb_has_frag_list(*nextp)) + nextp = iptfs_rehome_fraglist(&(*nextp)->next, *nextp); + else + nextp = &(*nextp)->next; + } + + /* See if we have enough space to simply append. + * + * NOTE: Maybe do not append if we will be mis-aligned, + * SW-based endpoints will probably have to copy in this + * case. + */ + while ((skb2 = skb_peek(list))) { + if (skb2->len > remaining) + break; + + __skb_unlink(skb2, list); + + /* Consider the buffer Tx'd and no longer owned */ + skb_orphan(skb); + + /* If we don't have a cksum in the packet we need to add + * one before encapsulation. + */ + if (skb2->ip_summed == CHECKSUM_PARTIAL) { + if (skb_checksum_help(skb2)) { + XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTERROR); + kfree_skb(skb2); + continue; + } + } + + /* Do accounting */ + skb->data_len += skb2->len; + skb->len += skb2->len; + remaining -= skb2->len; + + /* Append to the frag_list */ + *nextp = skb2; + nextp = &skb2->next; + if (skb_has_frag_list(skb2)) + nextp = iptfs_rehome_fraglist(nextp, skb2); + skb->truesize += skb2->truesize; + } + + xfrm_output(NULL, skb); + } +} + +static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me) +{ + struct sk_buff_head list; + struct xfrm_iptfs_data *xtfs; + struct xfrm_state *x; + + xtfs = container_of(me, typeof(*xtfs), iptfs_timer); + x = xtfs->x; + + /* Process all the queued packets + * + * softirq execution order: timer > tasklet > hrtimer + * + * Network rx will have run before us giving one last chance to queue + * ingress packets for us to process and transmit. + */ + + spin_lock(&x->lock); + __skb_queue_head_init(&list); + skb_queue_splice_init(&xtfs->queue, &list); + xtfs->queue_size = 0; + spin_unlock(&x->lock); + + /* After the above unlock, packets can begin queuing again, and the + * timer can be set again, from another CPU either in softirq or user + * context (not from this one since we are running at softirq level + * already). + */ + + iptfs_output_queued(x, &list); + + return HRTIMER_NORESTART; +} + +/** + * iptfs_encap_add_ipv4() - add outer encaps + * @x: xfrm state + * @skb: the packet + * + * This was originally taken from xfrm4_tunnel_encap_add. The reason for the + * copy is that IP-TFS/AGGFRAG can have different functionality for how to set + * the TOS/DSCP bits. Sets the protocol to a different value and doesn't do + * anything with inner headers as they aren't pointing into a normal IP + * singleton inner packet. + * + * Return: 0 on success or a negative error code on failure + */ +static int iptfs_encap_add_ipv4(struct xfrm_state *x, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct iphdr *top_iph; + + skb_reset_inner_network_header(skb); + skb_reset_inner_transport_header(skb); + + skb_set_network_header(skb, -(x->props.header_len - x->props.enc_hdr_len)); + skb->mac_header = skb->network_header + offsetof(struct iphdr, protocol); + skb->transport_header = skb->network_header + sizeof(*top_iph); + + top_iph = ip_hdr(skb); + top_iph->ihl = 5; + top_iph->version = 4; + top_iph->protocol = IPPROTO_AGGFRAG; + + /* As we have 0, fractional, 1 or N inner packets there's no obviously + * correct DSCP mapping to inherit. ECN should be cleared per RFC9347 + * 3.1. + */ + top_iph->tos = 0; + + top_iph->frag_off = htons(IP_DF); + top_iph->ttl = ip4_dst_hoplimit(xfrm_dst_child(dst)); + top_iph->saddr = x->props.saddr.a4; + top_iph->daddr = x->id.daddr.a4; + ip_select_ident(dev_net(dst->dev), skb, NULL); + + return 0; +} + +#if IS_ENABLED(CONFIG_IPV6) +/** + * iptfs_encap_add_ipv6() - add outer encaps + * @x: xfrm state + * @skb: the packet + * + * This was originally taken from xfrm6_tunnel_encap_add. The reason for the + * copy is that IP-TFS/AGGFRAG can have different functionality for how to set + * the flow label and TOS/DSCP bits. It also sets the protocol to a different + * value and doesn't do anything with inner headers as they aren't pointing into + * a normal IP singleton inner packet. + * + * Return: 0 on success or a negative error code on failure + */ +static int iptfs_encap_add_ipv6(struct xfrm_state *x, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct ipv6hdr *top_iph; + int dsfield; + + skb_reset_inner_network_header(skb); + skb_reset_inner_transport_header(skb); + + skb_set_network_header(skb, -x->props.header_len + x->props.enc_hdr_len); + skb->mac_header = skb->network_header + offsetof(struct ipv6hdr, nexthdr); + skb->transport_header = skb->network_header + sizeof(*top_iph); + + top_iph = ipv6_hdr(skb); + top_iph->version = 6; + top_iph->priority = 0; + memset(top_iph->flow_lbl, 0, sizeof(top_iph->flow_lbl)); + top_iph->nexthdr = IPPROTO_AGGFRAG; + + /* As we have 0, fractional, 1 or N inner packets there's no obviously + * correct DSCP mapping to inherit. ECN should be cleared per RFC9347 + * 3.1. + */ + dsfield = 0; + ipv6_change_dsfield(top_iph, 0, dsfield); + + top_iph->hop_limit = ip6_dst_hoplimit(xfrm_dst_child(dst)); + top_iph->saddr = *(struct in6_addr *)&x->props.saddr; + top_iph->daddr = *(struct in6_addr *)&x->id.daddr; + + return 0; +} +#endif + +/** + * iptfs_prepare_output() - prepare the skb for output + * @x: xfrm state + * @skb: the packet + * + * Return: Error value, if 0 then skb values should be as follows: + * - transport_header should point at ESP header + * - network_header should point at Outer IP header + * - mac_header should point at protocol/nexthdr of the outer IP + */ +static int iptfs_prepare_output(struct xfrm_state *x, struct sk_buff *skb) +{ + if (x->outer_mode.family == AF_INET) + return iptfs_encap_add_ipv4(x, skb); + if (x->outer_mode.family == AF_INET6) { +#if IS_ENABLED(CONFIG_IPV6) + return iptfs_encap_add_ipv6(x, skb); +#else + return -EAFNOSUPPORT; +#endif + } + return -EOPNOTSUPP; +} + /* ========================== */ /* State Management Functions */ /* ========================== */ @@ -77,8 +589,11 @@ static int iptfs_user_init(struct net *net, struct xfrm_state *x, { struct xfrm_iptfs_data *xtfs = x->mode_data; struct xfrm_iptfs_config *xc; + u64 q; xc = &xtfs->cfg; + xc->max_queue_size = IPTFS_DEFAULT_MAX_QUEUE_SIZE; + xtfs->init_delay_ns = IPTFS_DEFAULT_INIT_DELAY_USECS * NSECS_IN_USEC; if (attrs[XFRMA_IPTFS_PKT_SIZE]) { xc->pkt_size = nla_get_u32(attrs[XFRMA_IPTFS_PKT_SIZE]); @@ -92,6 +607,16 @@ static int iptfs_user_init(struct net *net, struct xfrm_state *x, return -EINVAL; } } + if (attrs[XFRMA_IPTFS_MAX_QSIZE]) + xc->max_queue_size = nla_get_u32(attrs[XFRMA_IPTFS_MAX_QSIZE]); + if (attrs[XFRMA_IPTFS_INIT_DELAY]) + xtfs->init_delay_ns = + (u64)nla_get_u32(attrs[XFRMA_IPTFS_INIT_DELAY]) * NSECS_IN_USEC; + + q = (u64)xc->max_queue_size * 95; + do_div(q, 100); + xtfs->ecn_queue_size = (u32)q; + return 0; } @@ -101,8 +626,11 @@ static unsigned int iptfs_sa_len(const struct xfrm_state *x) struct xfrm_iptfs_config *xc = &xtfs->cfg; unsigned int l = 0; - if (x->dir == XFRM_SA_DIR_OUT) + if (x->dir == XFRM_SA_DIR_OUT) { + l += nla_total_size(sizeof(u32)); /* init delay usec */ + l += nla_total_size(sizeof(xc->max_queue_size)); l += nla_total_size(sizeof(xc->pkt_size)); + } return l; } @@ -112,9 +640,21 @@ static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb) struct xfrm_iptfs_data *xtfs = x->mode_data; struct xfrm_iptfs_config *xc = &xtfs->cfg; int ret = 0; + u64 q; + + if (x->dir == XFRM_SA_DIR_OUT) { + q = xtfs->init_delay_ns; + do_div(q, NSECS_IN_USEC); + ret = nla_put_u32(skb, XFRMA_IPTFS_INIT_DELAY, q); + if (ret) + return ret; + + ret = nla_put_u32(skb, XFRMA_IPTFS_MAX_QSIZE, xc->max_queue_size); + if (ret) + return ret; - if (x->dir == XFRM_SA_DIR_OUT) ret = nla_put_u32(skb, XFRMA_IPTFS_PKT_SIZE, xc->pkt_size); + } return ret; } @@ -122,6 +662,10 @@ static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb) static void __iptfs_init_state(struct xfrm_state *x, struct xfrm_iptfs_data *xtfs) { + __skb_queue_head_init(&xtfs->queue); + hrtimer_init(&xtfs->iptfs_timer, CLOCK_MONOTONIC, IPTFS_HRTIMER_MODE); + xtfs->iptfs_timer.function = iptfs_delay_timer; + /* Modify type (esp) adjustment values */ if (x->props.family == AF_INET) @@ -172,10 +716,21 @@ static int iptfs_init_state(struct xfrm_state *x) static void iptfs_destroy_state(struct xfrm_state *x) { struct xfrm_iptfs_data *xtfs = x->mode_data; + struct sk_buff_head list; + struct sk_buff *skb; if (!xtfs) return; + spin_lock_bh(&xtfs->x->lock); + hrtimer_cancel(&xtfs->iptfs_timer); + __skb_queue_head_init(&list); + skb_queue_splice_init(&xtfs->queue, &list); + spin_unlock_bh(&xtfs->x->lock); + + while ((skb = __skb_dequeue(&list))) + kfree_skb(skb); + kfree_sensitive(xtfs); module_put(x->mode_cbs->owner); @@ -190,6 +745,8 @@ static const struct xfrm_mode_cbs iptfs_mode_cbs = { .copy_to_user = iptfs_copy_to_user, .sa_len = iptfs_sa_len, .get_inner_mtu = iptfs_get_inner_mtu, + .output = iptfs_output_collect, + .prepare_output = iptfs_prepare_output, }; static int __init xfrm_iptfs_init(void) From b96ba312e21c9b7ac1526829b9640ddc06695c0b Mon Sep 17 00:00:00 2001 From: Christian Hopps Date: Thu, 14 Nov 2024 02:07:05 -0500 Subject: [PATCH 0165/1450] xfrm: iptfs: share page fragments of inner packets When possible rather than appending secondary (aggregated) inner packets to the fragment list, share their page fragments with the outer IPTFS packet. This allows for more efficient packet transmission. Signed-off-by: Christian Hopps Tested-by: Antony Antony Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_iptfs.c | 85 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 77 insertions(+), 8 deletions(-) diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c index c4cff005ea9a7..7bf18f472fedf 100644 --- a/net/xfrm/xfrm_iptfs.c +++ b/net/xfrm/xfrm_iptfs.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -90,6 +91,23 @@ struct xfrm_iptfs_data { static u32 iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu); static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me); +/* ======================= */ +/* IPTFS SK_BUFF Functions */ +/* ======================= */ + +/** + * iptfs_skb_head_to_frag() - initialize a skb_frag_t based on skb head data + * @skb: skb with the head data + * @frag: frag to initialize + */ +static void iptfs_skb_head_to_frag(const struct sk_buff *skb, skb_frag_t *frag) +{ + struct page *page = virt_to_head_page(skb->data); + unsigned char *addr = (unsigned char *)page_address(page); + + skb_frag_fill_page_desc(frag, page, skb->data - addr, skb_headlen(skb)); +} + /* ================================= */ /* IPTFS Sending (ingress) Functions */ /* ================================= */ @@ -297,14 +315,44 @@ static struct sk_buff **iptfs_rehome_fraglist(struct sk_buff **nextp, struct sk_ return nextp; } +static void iptfs_consume_frags(struct sk_buff *to, struct sk_buff *from) +{ + struct skb_shared_info *fromi = skb_shinfo(from); + struct skb_shared_info *toi = skb_shinfo(to); + unsigned int new_truesize; + + /* If we have data in a head page, grab it */ + if (!skb_headlen(from)) { + new_truesize = SKB_TRUESIZE(skb_end_offset(from)); + } else { + iptfs_skb_head_to_frag(from, &toi->frags[toi->nr_frags]); + skb_frag_ref(to, toi->nr_frags++); + new_truesize = SKB_DATA_ALIGN(sizeof(struct sk_buff)); + } + + /* Move any other page fragments rather than copy */ + memcpy(&toi->frags[toi->nr_frags], fromi->frags, + sizeof(fromi->frags[0]) * fromi->nr_frags); + toi->nr_frags += fromi->nr_frags; + fromi->nr_frags = 0; + from->data_len = 0; + from->len = 0; + to->truesize += from->truesize - new_truesize; + from->truesize = new_truesize; + + /* We are done with this SKB */ + consume_skb(from); +} + static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list) { struct xfrm_iptfs_data *xtfs = x->mode_data; struct sk_buff *skb, *skb2, **nextp; - struct skb_shared_info *shi; + struct skb_shared_info *shi, *shi2; while ((skb = __skb_dequeue(list))) { u32 mtu = iptfs_get_cur_pmtu(x, xtfs, skb); + bool share_ok = true; int remaining; /* protocol comes to us cleared sometimes */ @@ -349,7 +397,7 @@ static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list) /* Re-home (un-nest) nested fragment lists. We need to do this * b/c we will simply be appending any following aggregated - * inner packets to the frag list. + * inner packets using the frag list. */ shi = skb_shinfo(skb); nextp = &shi->frag_list; @@ -360,6 +408,9 @@ static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list) nextp = &(*nextp)->next; } + if (shi->frag_list || skb_cloned(skb) || skb_shared(skb)) + share_ok = false; + /* See if we have enough space to simply append. * * NOTE: Maybe do not append if we will be mis-aligned, @@ -386,17 +437,35 @@ static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list) } } + /* skb->pp_recycle is passed to __skb_flag_unref for all + * frag pages so we can only share pages with skb's who + * match ourselves. + */ + shi2 = skb_shinfo(skb2); + if (share_ok && + (shi2->frag_list || + (!skb2->head_frag && skb_headlen(skb)) || + skb->pp_recycle != skb2->pp_recycle || + skb_zcopy(skb2) || + (shi->nr_frags + shi2->nr_frags + 1 > MAX_SKB_FRAGS))) + share_ok = false; + /* Do accounting */ skb->data_len += skb2->len; skb->len += skb2->len; remaining -= skb2->len; - /* Append to the frag_list */ - *nextp = skb2; - nextp = &skb2->next; - if (skb_has_frag_list(skb2)) - nextp = iptfs_rehome_fraglist(nextp, skb2); - skb->truesize += skb2->truesize; + if (share_ok) { + iptfs_consume_frags(skb, skb2); + } else { + /* Append to the frag_list */ + *nextp = skb2; + nextp = &skb2->next; + if (skb_has_frag_list(skb2)) + nextp = iptfs_rehome_fraglist(nextp, + skb2); + skb->truesize += skb2->truesize; + } } xfrm_output(NULL, skb); From 8579d342ea2b3c1c672858de180152ccf9cb0ee1 Mon Sep 17 00:00:00 2001 From: Christian Hopps Date: Thu, 14 Nov 2024 02:07:06 -0500 Subject: [PATCH 0166/1450] xfrm: iptfs: add fragmenting of larger than MTU user packets Add support for tunneling user (inner) packets that are larger than the tunnel's path MTU (outer) using IP-TFS fragmentation. Signed-off-by: Christian Hopps Tested-by: Antony Antony Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_iptfs.c | 343 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 315 insertions(+), 28 deletions(-) diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c index 7bf18f472fedf..b7d706a006eb8 100644 --- a/net/xfrm/xfrm_iptfs.c +++ b/net/xfrm/xfrm_iptfs.c @@ -46,6 +46,22 @@ */ #define IPTFS_DEFAULT_MAX_QUEUE_SIZE (1024 * 10240) +/* Assumed: skb->head is cache aligned. + * + * L2 Header resv: Arrange for cacheline to start at skb->data - 16 to keep the + * to-be-pushed L2 header in the same cacheline as resulting `skb->data` (i.e., + * the L3 header). If cacheline size is > 64 then skb->data + pushed L2 will all + * be in a single cacheline if we simply reserve 64 bytes. + * + * L3 Header resv: For L3+L2 headers (i.e., skb->data points at the IPTFS payload) + * we want `skb->data` to be cacheline aligned and all pushed L2L3 headers will + * be in their own cacheline[s]. 128 works for cachelins up to 128 bytes, for + * any larger cacheline sizes the pushed headers will simply share the cacheline + * with the start of the IPTFS payload (skb->data). + */ +#define XFRM_IPTFS_MIN_L3HEADROOM 128 +#define XFRM_IPTFS_MIN_L2HEADROOM (L1_CACHE_BYTES > 64 ? 64 : 64 + 16) + #define NSECS_IN_USEC 1000 #define IPTFS_HRTIMER_MODE HRTIMER_MODE_REL_SOFT @@ -57,10 +73,12 @@ * @max_queue_size: The maximum number of octets allowed to be queued to be sent * over the IPTFS SA. The queue size is measured as the size of all the * packets enqueued. + * @dont_frag: true to inhibit fragmenting across IPTFS outer packets. */ struct xfrm_iptfs_config { u32 pkt_size; /* outer_packet_size or 0 */ u32 max_queue_size; /* octets */ + u8 dont_frag : 1; }; /** @@ -88,13 +106,72 @@ struct xfrm_iptfs_data { u32 payload_mtu; /* max payload size */ }; -static u32 iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu); +static u32 __iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu); static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me); /* ======================= */ /* IPTFS SK_BUFF Functions */ /* ======================= */ +/** + * iptfs_alloc_skb() - Allocate a new `skb`. + * @tpl: the skb to copy required meta-data from. + * @len: the linear length of the head data, zero is fine. + * @l3resv: true if skb reserve needs to support pushing L3 headers + * + * A new `skb` is allocated and required meta-data is copied from `tpl`, the + * head data is sized to `len` + reserved space set according to the @l3resv + * boolean. + * + * When @l3resv is false, resv is XFRM_IPTFS_MIN_L2HEADROOM which arranges for + * `skb->data - 16` which is a good guess for good cache alignment (placing the + * to be pushed L2 header at the start of a cacheline. + * + * Otherwise, @l3resv is true and resv is set to the correct reserved space for + * dst->dev plus the calculated L3 overhead for the xfrm dst or + * XFRM_IPTFS_MIN_L3HEADROOM whichever is larger. This is then cache aligned so + * that all the headers will commonly fall in a cacheline when possible. + * + * l3resv=true is used on tunnel ingress (tx), because we need to reserve for + * the new IPTFS packet (i.e., L2+L3 headers). On tunnel egress (rx) the data + * being copied into the skb includes the user L3 headers already so we only + * need to reserve for L2. + * + * Return: the new skb or NULL. + */ +static struct sk_buff *iptfs_alloc_skb(struct sk_buff *tpl, u32 len, bool l3resv) +{ + struct sk_buff *skb; + u32 resv; + + if (!l3resv) { + resv = XFRM_IPTFS_MIN_L2HEADROOM; + } else { + struct dst_entry *dst = skb_dst(tpl); + + resv = LL_RESERVED_SPACE(dst->dev) + dst->header_len; + resv = max(resv, XFRM_IPTFS_MIN_L3HEADROOM); + resv = L1_CACHE_ALIGN(resv); + } + + skb = alloc_skb(len + resv, GFP_ATOMIC | __GFP_NOWARN); + if (!skb) + return NULL; + + skb_reserve(skb, resv); + + if (!l3resv) { + /* xfrm_input resume needs dev and xfrm ext from tunnel pkt */ + skb->dev = tpl->dev; + __skb_ext_copy(skb, tpl); + } + + /* dropped by xfrm_input, used by xfrm_output */ + skb_dst_copy(skb, tpl); + + return skb; +} + /** * iptfs_skb_head_to_frag() - initialize a skb_frag_t based on skb head data * @skb: skb with the head data @@ -152,7 +229,7 @@ static int iptfs_get_cur_pmtu(struct xfrm_state *x, struct xfrm_iptfs_data *xtfs { struct xfrm_dst *xdst = (struct xfrm_dst *)skb_dst(skb); u32 payload_mtu = xtfs->payload_mtu; - u32 pmtu = iptfs_get_inner_mtu(x, xdst->child_mtu_cached); + u32 pmtu = __iptfs_get_inner_mtu(x, xdst->child_mtu_cached); if (payload_mtu && payload_mtu < pmtu) pmtu = payload_mtu; @@ -210,7 +287,8 @@ static int iptfs_output_collect(struct net *net, struct sock *sk, struct sk_buff * fragmentation. */ - pmtu = iptfs_get_cur_pmtu(x, xtfs, skb); + if (xtfs->cfg.dont_frag) + pmtu = iptfs_get_cur_pmtu(x, xtfs, skb); /* Break apart GSO skbs. If the queue is nearing full then we want the * accounting and queuing to be based on the individual packets not on the @@ -250,8 +328,10 @@ static int iptfs_output_collect(struct net *net, struct sock *sk, struct sk_buff continue; } - /* Fragmenting handled in following commits. */ - if (iptfs_is_too_big(sk, skb, pmtu)) { + /* If the user indicated no iptfs fragmenting check before + * enqueue. + */ + if (xtfs->cfg.dont_frag && iptfs_is_too_big(sk, skb, pmtu)) { kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG); continue; } @@ -294,6 +374,181 @@ static void iptfs_output_prepare_skb(struct sk_buff *skb, u32 blkoff) IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE; } +/** + * iptfs_copy_create_frag() - create an inner fragment skb. + * @st: The source packet data. + * @offset: offset in @st of the new fragment data. + * @copy_len: the amount of data to copy from @st. + * + * Create a new skb holding a single IPTFS inner packet fragment. @copy_len must + * not be greater than the max fragment size. + * + * Return: the new fragment skb or an ERR_PTR(). + */ +static struct sk_buff *iptfs_copy_create_frag(struct skb_seq_state *st, u32 offset, u32 copy_len) +{ + struct sk_buff *src = st->root_skb; + struct sk_buff *skb; + int err; + + skb = iptfs_alloc_skb(src, copy_len, true); + if (!skb) + return ERR_PTR(-ENOMEM); + + /* Now copy `copy_len` data from src */ + err = skb_copy_seq_read(st, offset, skb_put(skb, copy_len), copy_len); + if (err) { + kfree_skb(skb); + return ERR_PTR(err); + } + + return skb; +} + +/** + * iptfs_copy_create_frags() - create and send N-1 fragments of a larger skb. + * @skbp: the source packet skb (IN), skb holding the last fragment in + * the fragment stream (OUT). + * @xtfs: IPTFS SA state. + * @mtu: the max IPTFS fragment size. + * + * This function is responsible for fragmenting a larger inner packet into a + * sequence of IPTFS payload packets. The last fragment is returned rather than + * being sent so that the caller can append more inner packets (aggregation) if + * there is room. + * + * Return: 0 on success or a negative error code on failure + */ +static int iptfs_copy_create_frags(struct sk_buff **skbp, struct xfrm_iptfs_data *xtfs, u32 mtu) +{ + struct skb_seq_state skbseq; + struct list_head sublist; + struct sk_buff *skb = *skbp; + struct sk_buff *nskb = *skbp; + u32 copy_len, offset; + u32 to_copy = skb->len - mtu; + int err = 0; + + INIT_LIST_HEAD(&sublist); + + skb_prepare_seq_read(skb, 0, skb->len, &skbseq); + + /* A trimmed `skb` will be sent as the first fragment, later. */ + offset = mtu; + to_copy = skb->len - offset; + while (to_copy) { + /* Send all but last fragment to allow agg. append */ + list_add_tail(&nskb->list, &sublist); + + /* FUTURE: if the packet has an odd/non-aligning length we could + * send less data in the penultimate fragment so that the last + * fragment then ends on an aligned boundary. + */ + copy_len = min(to_copy, mtu); + nskb = iptfs_copy_create_frag(&skbseq, offset, copy_len); + if (IS_ERR(nskb)) { + XFRM_INC_STATS(xs_net(xtfs->x), LINUX_MIB_XFRMOUTERROR); + skb_abort_seq_read(&skbseq); + err = PTR_ERR(nskb); + nskb = NULL; + break; + } + iptfs_output_prepare_skb(nskb, to_copy); + offset += copy_len; + to_copy -= copy_len; + } + skb_abort_seq_read(&skbseq); + + /* return last fragment that will be unsent (or NULL) */ + *skbp = nskb; + + /* trim the original skb to MTU */ + if (!err) + err = pskb_trim(skb, mtu); + + if (err) { + /* Free all frags. Don't bother sending a partial packet we will + * never complete. + */ + kfree_skb(nskb); + list_for_each_entry_safe(skb, nskb, &sublist, list) { + skb_list_del_init(skb); + kfree_skb(skb); + } + return err; + } + + /* prepare the initial fragment with an iptfs header */ + iptfs_output_prepare_skb(skb, 0); + + /* Send all but last fragment, if we fail to send a fragment then free + * the rest -- no point in sending a packet that can't be reassembled. + */ + list_for_each_entry_safe(skb, nskb, &sublist, list) { + skb_list_del_init(skb); + if (!err) + err = xfrm_output(NULL, skb); + else + kfree_skb(skb); + } + if (err) + kfree_skb(*skbp); + return err; +} + +/** + * iptfs_first_skb() - handle the first dequeued inner packet for output + * @skbp: the source packet skb (IN), skb holding the last fragment in + * the fragment stream (OUT). + * @xtfs: IPTFS SA state. + * @mtu: the max IPTFS fragment size. + * + * This function is responsible for fragmenting a larger inner packet into a + * sequence of IPTFS payload packets. + * + * The last fragment is returned rather than being sent so that the caller can + * append more inner packets (aggregation) if there is room. + * + * Return: 0 on success or a negative error code on failure + */ +static int iptfs_first_skb(struct sk_buff **skbp, struct xfrm_iptfs_data *xtfs, u32 mtu) +{ + struct sk_buff *skb = *skbp; + int err; + + /* Classic ESP skips the don't fragment ICMP error if DF is clear on + * the inner packet or ignore_df is set. Otherwise it will send an ICMP + * or local error if the inner packet won't fit it's MTU. + * + * With IPTFS we do not care about the inner packet DF bit. If the + * tunnel is configured to "don't fragment" we error back if things + * don't fit in our max packet size. Otherwise we iptfs-fragment as + * normal. + */ + + /* The opportunity for HW offload has ended */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + err = skb_checksum_help(skb); + if (err) + return err; + } + + /* We've split gso up before queuing */ + + /* Consider the buffer Tx'd and no longer owned */ + skb_orphan(skb); + + /* Simple case -- it fits. `mtu` accounted for all the overhead + * including the basic IPTFS header. + */ + if (skb->len <= mtu) { + iptfs_output_prepare_skb(skb, 0); + return 0; + } + + return iptfs_copy_create_frags(skbp, xtfs, mtu); +} + static struct sk_buff **iptfs_rehome_fraglist(struct sk_buff **nextp, struct sk_buff *child) { u32 fllen = 0; @@ -350,6 +605,15 @@ static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list) struct sk_buff *skb, *skb2, **nextp; struct skb_shared_info *shi, *shi2; + /* If we are fragmenting due to a large inner packet we will output all + * the outer IPTFS packets required to contain the fragments of the + * single large inner packet. These outer packets need to be sent + * consecutively (ESP seq-wise). Since this output function is always + * running from a timer we do not need a lock to provide this guarantee. + * We will output our packets consecutively before the timer is allowed + * to run again on some other CPU. + */ + while ((skb = __skb_dequeue(list))) { u32 mtu = iptfs_get_cur_pmtu(x, xtfs, skb); bool share_ok = true; @@ -359,7 +623,7 @@ static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list) skb->protocol = x->outer_mode.family == AF_INET ? htons(ETH_P_IP) : htons(ETH_P_IPV6); - if (skb->len > mtu) { + if (skb->len > mtu && xtfs->cfg.dont_frag) { /* We handle this case before enqueueing so we are only * here b/c MTU changed after we enqueued before we * dequeued, just drop these. @@ -370,28 +634,22 @@ static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list) continue; } - /* If we don't have a cksum in the packet we need to add one - * before encapsulation. + /* Convert first inner packet into an outer IPTFS packet, + * dealing with any fragmentation into multiple outer packets + * if necessary. */ - if (skb->ip_summed == CHECKSUM_PARTIAL) { - if (skb_checksum_help(skb)) { - XFRM_INC_STATS(dev_net(skb_dst(skb)->dev), LINUX_MIB_XFRMOUTERROR); - kfree_skb(skb); - continue; - } - } - - /* Consider the buffer Tx'd and no longer owned */ - skb_orphan(skb); - - /* Convert first inner packet into an outer IPTFS packet */ - iptfs_output_prepare_skb(skb, 0); + if (iptfs_first_skb(&skb, xtfs, mtu)) + continue; - /* The space remaining to send more inner packet data is `mtu` - - * (skb->len - sizeof iptfs header). This is b/c the `mtu` value - * has the basic IPTFS header len accounted for, and we added - * that header to the skb so it is a part of skb->len, thus we - * subtract it from the skb length. + /* If fragmentation was required the returned skb is the last + * IPTFS fragment in the chain, and it's IPTFS header blkoff has + * been set just past the end of the fragment data. + * + * In either case the space remaining to send more inner packet + * data is `mtu` - (skb->len - sizeof iptfs header). This is b/c + * the `mtu` value has the basic IPTFS header len accounted for, + * and we added that header to the skb so it is a part of + * skb->len, thus we subtract it from the skb length. */ remaining = mtu - (skb->len - sizeof(struct ip_iptfs_hdr)); @@ -628,11 +886,13 @@ static int iptfs_prepare_output(struct xfrm_state *x, struct sk_buff *skb) /* ========================== */ /** - * iptfs_get_inner_mtu() - return inner MTU with no fragmentation. + * __iptfs_get_inner_mtu() - return inner MTU with no fragmentation. * @x: xfrm state. * @outer_mtu: the outer mtu + * + * Return: Correct MTU taking in to account the encap overhead. */ -static u32 iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu) +static u32 __iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu) { struct crypto_aead *aead; u32 blksize; @@ -643,6 +903,23 @@ static u32 iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu) ~(blksize - 1)) - 2; } +/** + * iptfs_get_inner_mtu() - return the inner MTU for an IPTFS xfrm. + * @x: xfrm state. + * @outer_mtu: Outer MTU for the encapsulated packet. + * + * Return: Correct MTU taking in to account the encap overhead. + */ +static u32 iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu) +{ + struct xfrm_iptfs_data *xtfs = x->mode_data; + + /* If not dont-frag we have no MTU */ + if (!xtfs->cfg.dont_frag) + return x->outer_mode.family == AF_INET ? IP_MAX_MTU : IP6_MAX_MTU; + return __iptfs_get_inner_mtu(x, outer_mtu); +} + /** * iptfs_user_init() - initialize the SA with IPTFS options from netlink. * @net: the net data @@ -664,6 +941,8 @@ static int iptfs_user_init(struct net *net, struct xfrm_state *x, xc->max_queue_size = IPTFS_DEFAULT_MAX_QUEUE_SIZE; xtfs->init_delay_ns = IPTFS_DEFAULT_INIT_DELAY_USECS * NSECS_IN_USEC; + if (attrs[XFRMA_IPTFS_DONT_FRAG]) + xc->dont_frag = true; if (attrs[XFRMA_IPTFS_PKT_SIZE]) { xc->pkt_size = nla_get_u32(attrs[XFRMA_IPTFS_PKT_SIZE]); if (!xc->pkt_size) { @@ -696,6 +975,8 @@ static unsigned int iptfs_sa_len(const struct xfrm_state *x) unsigned int l = 0; if (x->dir == XFRM_SA_DIR_OUT) { + if (xc->dont_frag) + l += nla_total_size(0); /* dont-frag flag */ l += nla_total_size(sizeof(u32)); /* init delay usec */ l += nla_total_size(sizeof(xc->max_queue_size)); l += nla_total_size(sizeof(xc->pkt_size)); @@ -712,6 +993,12 @@ static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb) u64 q; if (x->dir == XFRM_SA_DIR_OUT) { + if (xc->dont_frag) { + ret = nla_put_flag(skb, XFRMA_IPTFS_DONT_FRAG); + if (ret) + return ret; + } + q = xtfs->init_delay_ns; do_div(q, NSECS_IN_USEC); ret = nla_put_u32(skb, XFRMA_IPTFS_INIT_DELAY, q); From 6c82d2433671819a550227bf65bfb6043e3d3305 Mon Sep 17 00:00:00 2001 From: Christian Hopps Date: Thu, 14 Nov 2024 02:07:07 -0500 Subject: [PATCH 0167/1450] xfrm: iptfs: add basic receive packet (tunnel egress) handling Add handling of packets received from the tunnel. This implements tunnel egress functionality. Signed-off-by: Christian Hopps Tested-by: Antony Antony Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_iptfs.c | 276 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 276 insertions(+) diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c index b7d706a006eb8..085964fe3251e 100644 --- a/net/xfrm/xfrm_iptfs.c +++ b/net/xfrm/xfrm_iptfs.c @@ -20,6 +20,10 @@ #include "xfrm_inout.h" +/* IPTFS encap (header) values. */ +#define IPTFS_SUBTYPE_BASIC 0 +#define IPTFS_SUBTYPE_CC 1 + /* ------------------------------------------------ */ /* IPTFS default SA values (tunnel ingress/dir-out) */ /* ------------------------------------------------ */ @@ -185,6 +189,277 @@ static void iptfs_skb_head_to_frag(const struct sk_buff *skb, skb_frag_t *frag) skb_frag_fill_page_desc(frag, page, skb->data - addr, skb_headlen(skb)); } +/* ================================== */ +/* IPTFS Receiving (egress) Functions */ +/* ================================== */ + +/** + * iptfs_pskb_extract_seq() - Create and load data into a new sk_buff. + * @skblen: the total data size for `skb`. + * @st: The source for the rest of the data to copy into `skb`. + * @off: The offset into @st to copy data from. + * @len: The length of data to copy from @st into `skb`. This must be <= + * @skblen. + * + * Create a new sk_buff `skb` with @skblen of packet data space. If non-zero, + * copy @rlen bytes of @runt into `skb`. Then using seq functions copy @len + * bytes from @st into `skb` starting from @off. + * + * It is an error for @len to be greater than the amount of data left in @st. + * + * Return: The newly allocated sk_buff `skb` or NULL if an error occurs. + */ +static struct sk_buff * +iptfs_pskb_extract_seq(u32 skblen, struct skb_seq_state *st, u32 off, int len) +{ + struct sk_buff *skb = iptfs_alloc_skb(st->root_skb, skblen, false); + + if (!skb) + return NULL; + if (skb_copy_seq_read(st, off, skb_put(skb, len), len)) { + XFRM_INC_STATS(dev_net(st->root_skb->dev), LINUX_MIB_XFRMINERROR); + kfree_skb(skb); + return NULL; + } + return skb; +} + +/** + * iptfs_complete_inner_skb() - finish preparing the inner packet for gro recv. + * @x: xfrm state + * @skb: the inner packet + * + * Finish the standard xfrm processing on the inner packet prior to sending back + * through gro_cells_receive. We do this separately b/c we are building a list + * of packets in the hopes that one day a list will be taken by + * xfrm_input. + */ +static void iptfs_complete_inner_skb(struct xfrm_state *x, struct sk_buff *skb) +{ + skb_reset_network_header(skb); + + /* The packet is going back through gro_cells_receive no need to + * set this. + */ + skb_reset_transport_header(skb); + + /* Packet already has checksum value set. */ + skb->ip_summed = CHECKSUM_NONE; + + /* Our skb will contain the header data copied when this outer packet + * which contained the start of this inner packet. This is true + * when we allocate a new skb as well as when we reuse the existing skb. + */ + if (ip_hdr(skb)->version == 0x4) { + struct iphdr *iph = ip_hdr(skb); + + if (x->props.flags & XFRM_STATE_DECAP_DSCP) + ipv4_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, iph); + if (!(x->props.flags & XFRM_STATE_NOECN)) + if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos)) + IP_ECN_set_ce(iph); + + skb->protocol = htons(ETH_P_IP); + } else { + struct ipv6hdr *iph = ipv6_hdr(skb); + + if (x->props.flags & XFRM_STATE_DECAP_DSCP) + ipv6_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, iph); + if (!(x->props.flags & XFRM_STATE_NOECN)) + if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos)) + IP6_ECN_set_ce(skb, iph); + + skb->protocol = htons(ETH_P_IPV6); + } +} + +static bool __input_process_payload(struct xfrm_state *x, u32 data, + struct skb_seq_state *skbseq, + struct list_head *sublist) +{ + u8 hbytes[sizeof(struct ipv6hdr)]; + struct sk_buff *first_skb, *next, *skb; + const unsigned char *old_mac; + struct iphdr *iph; + struct net *net; + u32 remaining, iplen, iphlen, tail; + + net = xs_net(x); + skb = skbseq->root_skb; + first_skb = NULL; + + /* Save the old mac header if set */ + old_mac = skb_mac_header_was_set(skb) ? skb_mac_header(skb) : NULL; + + /* New packets */ + + tail = skb->len; + while (data < tail) { + __be16 protocol = 0; + + /* Gather information on the next data block. + * `data` points to the start of the data block. + */ + remaining = tail - data; + + /* try and copy enough bytes to read length from ipv4/ipv6 */ + iphlen = min_t(u32, remaining, 6); + if (skb_copy_seq_read(skbseq, data, hbytes, iphlen)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); + goto done; + } + + iph = (struct iphdr *)hbytes; + if (iph->version == 0x4) { + /* must have at least tot_len field present */ + if (remaining < 4) + break; + + iplen = be16_to_cpu(iph->tot_len); + iphlen = iph->ihl << 2; + protocol = cpu_to_be16(ETH_P_IP); + XFRM_MODE_SKB_CB(skbseq->root_skb)->tos = iph->tos; + } else if (iph->version == 0x6) { + /* must have at least payload_len field present */ + if (remaining < 6) + break; + + iplen = be16_to_cpu(((struct ipv6hdr *)hbytes)->payload_len); + iplen += sizeof(struct ipv6hdr); + iphlen = sizeof(struct ipv6hdr); + protocol = cpu_to_be16(ETH_P_IPV6); + XFRM_MODE_SKB_CB(skbseq->root_skb)->tos = + ipv6_get_dsfield((struct ipv6hdr *)iph); + } else if (iph->version == 0x0) { + /* pad */ + break; + } else { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); + goto done; + } + + if (unlikely(skbseq->stepped_offset)) { + /* We need to reset our seq read, it can't backup at + * this point. + */ + struct sk_buff *save = skbseq->root_skb; + + skb_abort_seq_read(skbseq); + skb_prepare_seq_read(save, data, tail, skbseq); + } + + if (!first_skb) + first_skb = skb; + + /* Fragment handling in following commits */ + if (iplen > remaining) + break; + + skb = iptfs_pskb_extract_seq(iplen, skbseq, data, iplen); + if (!skb) { + /* skip to next packet or done */ + data += iplen; + continue; + } + + skb->protocol = protocol; + if (old_mac) { + /* rebuild the mac header */ + skb_set_mac_header(skb, -first_skb->mac_len); + memcpy(skb_mac_header(skb), old_mac, first_skb->mac_len); + eth_hdr(skb)->h_proto = skb->protocol; + } + + data += iplen; + iptfs_complete_inner_skb(x, skb); + list_add_tail(&skb->list, sublist); + } + + /* Send the packets! */ + list_for_each_entry_safe(skb, next, sublist, list) { + skb_list_del_init(skb); + if (xfrm_input(skb, 0, 0, -2)) + kfree_skb(skb); + } + +done: + return false; +} + +/** + * iptfs_input() - handle receipt of iptfs payload + * @x: xfrm state + * @skb: the packet + * + * Process the IPTFS payload in `skb` and consume it afterwards. + * + * Returns 0. + */ +static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) +{ + struct ip_iptfs_cc_hdr iptcch; + struct skb_seq_state skbseq; + struct list_head sublist; /* rename this it's just a list */ + struct ip_iptfs_hdr *ipth; + struct net *net; + u32 remaining, data; + bool consumed = false; + + net = xs_net(x); + + /* Large enough to hold both types of header */ + ipth = (struct ip_iptfs_hdr *)&iptcch; + + skb_prepare_seq_read(skb, 0, skb->len, &skbseq); + + /* Get the IPTFS header and validate it */ + + if (skb_copy_seq_read(&skbseq, 0, ipth, sizeof(*ipth))) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); + goto done; + } + data = sizeof(*ipth); + + /* Set data past the basic header */ + if (ipth->subtype == IPTFS_SUBTYPE_CC) { + /* Copy the rest of the CC header */ + remaining = sizeof(iptcch) - sizeof(*ipth); + if (skb_copy_seq_read(&skbseq, data, ipth + 1, remaining)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); + goto done; + } + data += remaining; + } else if (ipth->subtype != IPTFS_SUBTYPE_BASIC) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); + goto done; + } + + if (ipth->flags != 0) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); + goto done; + } + + INIT_LIST_HEAD(&sublist); + + /* Fragment handling in following commits */ + data += ntohs(ipth->block_offset); + + /* New packets */ + consumed = __input_process_payload(x, data, &skbseq, &sublist); +done: + skb = skbseq.root_skb; + skb_abort_seq_read(&skbseq); + + if (!consumed) + kfree_skb(skb); + + /* We always have dealt with the input SKB, either we are re-using it, + * or we have freed it. Return EINPROGRESS so that xfrm_input stops + * processing it. + */ + return -EINPROGRESS; +} + /* ================================= */ /* IPTFS Sending (ingress) Functions */ /* ================================= */ @@ -1101,6 +1376,7 @@ static const struct xfrm_mode_cbs iptfs_mode_cbs = { .copy_to_user = iptfs_copy_to_user, .sa_len = iptfs_sa_len, .get_inner_mtu = iptfs_get_inner_mtu, + .input = iptfs_input, .output = iptfs_output_collect, .prepare_output = iptfs_prepare_output, }; From 07569476544681816335099929ff3494dfbf6b05 Mon Sep 17 00:00:00 2001 From: Christian Hopps Date: Thu, 14 Nov 2024 02:07:08 -0500 Subject: [PATCH 0168/1450] xfrm: iptfs: handle received fragmented inner packets Add support for handling receipt of partial inner packets that have been fragmented across multiple outer IP-TFS tunnel packets. Signed-off-by: Christian Hopps Tested-by: Antony Antony Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_iptfs.c | 480 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 461 insertions(+), 19 deletions(-) diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c index 085964fe3251e..4af1f7b5818e6 100644 --- a/net/xfrm/xfrm_iptfs.c +++ b/net/xfrm/xfrm_iptfs.c @@ -24,6 +24,21 @@ #define IPTFS_SUBTYPE_BASIC 0 #define IPTFS_SUBTYPE_CC 1 +/* ----------------------------------------------- */ +/* IP-TFS default SA values (tunnel egress/dir-in) */ +/* ----------------------------------------------- */ + +/** + * define IPTFS_DEFAULT_DROP_TIME_USECS - default drop time + * + * The default IPTFS drop time in microseconds. The drop time is the amount of + * time before a missing out-of-order IPTFS tunnel packet is considered lost. + * See also the reorder window. + * + * Default 1s. + */ +#define IPTFS_DEFAULT_DROP_TIME_USECS 1000000 + /* ------------------------------------------------ */ /* IPTFS default SA values (tunnel ingress/dir-out) */ /* ------------------------------------------------ */ @@ -95,6 +110,13 @@ struct xfrm_iptfs_config { * @init_delay_ns: nanoseconds to wait to send initial IPTFS packet. * @iptfs_timer: output timer. * @payload_mtu: max payload size. + * @drop_lock: lock to protect reorder queue. + * @drop_timer: timer for considering next packet lost. + * @drop_time_ns: timer intervan in nanoseconds. + * @ra_newskb: new pkt being reassembled. + * @ra_wantseq: expected next sequence for reassembly. + * @ra_runt: last pkt bytes from very end of last skb. + * @ra_runtlen: size of ra_runt. */ struct xfrm_iptfs_data { struct xfrm_iptfs_config cfg; @@ -108,10 +130,33 @@ struct xfrm_iptfs_data { u64 init_delay_ns; /* nanoseconds */ struct hrtimer iptfs_timer; /* output timer */ u32 payload_mtu; /* max payload size */ + + /* Tunnel egress */ + spinlock_t drop_lock; + struct hrtimer drop_timer; + u64 drop_time_ns; + + /* Tunnel egress reassembly */ + struct sk_buff *ra_newskb; /* new pkt being reassembled */ + u64 ra_wantseq; /* expected next sequence */ + u8 ra_runt[6]; /* last pkt bytes from last skb */ + u8 ra_runtlen; /* count of ra_runt */ }; static u32 __iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu); static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me); +static enum hrtimer_restart iptfs_drop_timer(struct hrtimer *me); + +/* ================= */ +/* Utility Functions */ +/* ================= */ + +static u64 __esp_seq(struct sk_buff *skb) +{ + u64 seq = ntohl(XFRM_SKB_CB(skb)->seq.input.low); + + return seq | (u64)ntohl(XFRM_SKB_CB(skb)->seq.input.hi) << 32; +} /* ======================= */ /* IPTFS SK_BUFF Functions */ @@ -224,6 +269,63 @@ iptfs_pskb_extract_seq(u32 skblen, struct skb_seq_state *st, u32 off, int len) return skb; } +/** + * iptfs_input_save_runt() - save data in xtfs runt space. + * @xtfs: xtfs state + * @seq: the current sequence + * @buf: packet data + * @len: length of packet data + * + * Save the small (`len`) start of a fragmented packet in `buf` in the xtfs data + * runt space. + */ +static void iptfs_input_save_runt(struct xfrm_iptfs_data *xtfs, u64 seq, + u8 *buf, int len) +{ + memcpy(xtfs->ra_runt, buf, len); + + xtfs->ra_runtlen = len; + xtfs->ra_wantseq = seq + 1; +} + +/** + * __iptfs_iphlen() - return the v4/v6 header length using packet data. + * @data: pointer at octet with version nibble + * + * The version data has been checked to be valid (i.e., either 4 or 6). + * + * Return: the IP header size based on the IP version. + */ +static u32 __iptfs_iphlen(u8 *data) +{ + struct iphdr *iph = (struct iphdr *)data; + + if (iph->version == 0x4) + return sizeof(*iph); + return sizeof(struct ipv6hdr); +} + +/** + * __iptfs_iplen() - return the v4/v6 length using packet data. + * @data: pointer to ip (v4/v6) packet header + * + * Grab the IPv4 or IPv6 length value in the start of the inner packet header + * pointed to by `data`. Assumes data len is enough for the length field only. + * + * The version data has been checked to be valid (i.e., either 4 or 6). + * + * Return: the length value. + */ +static u32 __iptfs_iplen(u8 *data) +{ + struct iphdr *iph = (struct iphdr *)data; + + if (iph->version == 0x4) + return ntohs(iph->tot_len); + return ntohs(((struct ipv6hdr *)iph)->payload_len) + + sizeof(struct ipv6hdr); +} + /** * iptfs_complete_inner_skb() - finish preparing the inner packet for gro recv. * @x: xfrm state @@ -273,6 +375,227 @@ static void iptfs_complete_inner_skb(struct xfrm_state *x, struct sk_buff *skb) } } +static void __iptfs_reassem_done(struct xfrm_iptfs_data *xtfs, bool free) +{ + assert_spin_locked(&xtfs->drop_lock); + + /* We don't care if it works locking takes care of things */ + hrtimer_try_to_cancel(&xtfs->drop_timer); + if (free) + kfree_skb(xtfs->ra_newskb); + xtfs->ra_newskb = NULL; +} + +/** + * iptfs_reassem_abort() - In-progress packet is aborted free the state. + * @xtfs: xtfs state + */ +static void iptfs_reassem_abort(struct xfrm_iptfs_data *xtfs) +{ + __iptfs_reassem_done(xtfs, true); +} + +/** + * iptfs_reassem_done() - In-progress packet is complete, clear the state. + * @xtfs: xtfs state + */ +static void iptfs_reassem_done(struct xfrm_iptfs_data *xtfs) +{ + __iptfs_reassem_done(xtfs, false); +} + +/** + * iptfs_reassem_cont() - Continue the reassembly of an inner packets. + * @xtfs: xtfs state + * @seq: sequence of current packet + * @st: seq read stat for current packet + * @skb: current packet + * @data: offset into sequential packet data + * @blkoff: packet blkoff value + * @list: list of skbs to enqueue completed packet on + * + * Process an IPTFS payload that has a non-zero `blkoff` or when we are + * expecting the continuation b/c we have a runt or in-progress packet. + * + * Return: the new data offset to continue processing from. + */ +static u32 iptfs_reassem_cont(struct xfrm_iptfs_data *xtfs, u64 seq, + struct skb_seq_state *st, struct sk_buff *skb, + u32 data, u32 blkoff, struct list_head *list) +{ + struct sk_buff *newskb = xtfs->ra_newskb; + u32 remaining = skb->len - data; + u32 runtlen = xtfs->ra_runtlen; + u32 copylen, fraglen, ipremain, iphlen, iphremain, rrem; + + /* Handle packet fragment we aren't expecting */ + if (!runtlen && !xtfs->ra_newskb) + return data + min(blkoff, remaining); + + /* Important to remember that input to this function is an ordered + * packet stream (unless the user disabled the reorder window). Thus if + * we are waiting for, and expecting the next packet so we can continue + * assembly, a newer sequence number indicates older ones are not coming + * (or if they do should be ignored). Technically we can receive older + * ones when the reorder window is disabled; however, the user should + * have disabled fragmentation in this case, and regardless we don't + * deal with it. + * + * blkoff could be zero if the stream is messed up (or it's an all pad + * insertion) be careful to handle that case in each of the below + */ + + /* Too old case: This can happen when the reorder window is disabled so + * ordering isn't actually guaranteed. + */ + if (seq < xtfs->ra_wantseq) + return data + remaining; + + /* Too new case: We missed what we wanted cleanup. */ + if (seq > xtfs->ra_wantseq) { + XFRM_INC_STATS(xs_net(xtfs->x), LINUX_MIB_XFRMINIPTFSERROR); + goto abandon; + } + + if (blkoff == 0) { + if ((*skb->data & 0xF0) != 0) { + XFRM_INC_STATS(xs_net(xtfs->x), + LINUX_MIB_XFRMINIPTFSERROR); + goto abandon; + } + /* Handle all pad case, advance expected sequence number. + * (RFC 9347 S2.2.3) + */ + xtfs->ra_wantseq++; + /* will end parsing */ + return data + remaining; + } + + if (runtlen) { + /* Regardless of what happens we're done with the runt */ + xtfs->ra_runtlen = 0; + + /* The start of this inner packet was at the very end of the last + * iptfs payload which didn't include enough for the ip header + * length field. We must have *at least* that now. + */ + rrem = sizeof(xtfs->ra_runt) - runtlen; + if (remaining < rrem || blkoff < rrem) { + XFRM_INC_STATS(xs_net(xtfs->x), + LINUX_MIB_XFRMINIPTFSERROR); + goto abandon; + } + + /* fill in the runt data */ + if (skb_copy_seq_read(st, data, &xtfs->ra_runt[runtlen], + rrem)) { + XFRM_INC_STATS(xs_net(xtfs->x), + LINUX_MIB_XFRMINBUFFERERROR); + goto abandon; + } + + /* We have enough data to get the ip length value now, + * allocate an in progress skb + */ + ipremain = __iptfs_iplen(xtfs->ra_runt); + if (ipremain < sizeof(xtfs->ra_runt)) { + /* length has to be at least runtsize large */ + XFRM_INC_STATS(xs_net(xtfs->x), + LINUX_MIB_XFRMINIPTFSERROR); + goto abandon; + } + + /* For the runt case we don't attempt sharing currently. NOTE: + * Currently, this IPTFS implementation will not create runts. + */ + + newskb = iptfs_alloc_skb(skb, ipremain, false); + if (!newskb) { + XFRM_INC_STATS(xs_net(xtfs->x), LINUX_MIB_XFRMINERROR); + goto abandon; + } + xtfs->ra_newskb = newskb; + + /* Copy the runt data into the buffer, but leave data + * pointers the same as normal non-runt case. The extra `rrem` + * recopied bytes are basically cacheline free. Allows using + * same logic below to complete. + */ + memcpy(skb_put(newskb, runtlen), xtfs->ra_runt, + sizeof(xtfs->ra_runt)); + } + + /* Continue reassembling the packet */ + ipremain = __iptfs_iplen(newskb->data); + iphlen = __iptfs_iphlen(newskb->data); + + ipremain -= newskb->len; + if (blkoff < ipremain) { + /* Corrupt data, we don't have enough to complete the packet */ + XFRM_INC_STATS(xs_net(xtfs->x), LINUX_MIB_XFRMINIPTFSERROR); + goto abandon; + } + + /* We want the IP header in linear space */ + if (newskb->len < iphlen) { + iphremain = iphlen - newskb->len; + if (blkoff < iphremain) { + XFRM_INC_STATS(xs_net(xtfs->x), + LINUX_MIB_XFRMINIPTFSERROR); + goto abandon; + } + fraglen = min(blkoff, remaining); + copylen = min(fraglen, iphremain); + if (skb_copy_seq_read(st, data, skb_put(newskb, copylen), + copylen)) { + XFRM_INC_STATS(xs_net(xtfs->x), + LINUX_MIB_XFRMINBUFFERERROR); + goto abandon; + } + /* this is a silly condition that might occur anyway */ + if (copylen < iphremain) { + xtfs->ra_wantseq++; + return data + fraglen; + } + /* update data and things derived from it */ + data += copylen; + blkoff -= copylen; + remaining -= copylen; + ipremain -= copylen; + } + + fraglen = min(blkoff, remaining); + copylen = min(fraglen, ipremain); + + /* copy fragment data into newskb */ + if (skb_copy_seq_read(st, data, skb_put(newskb, copylen), copylen)) { + XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMINBUFFERERROR); + goto abandon; + } + + if (copylen < ipremain) { + xtfs->ra_wantseq++; + } else { + /* We are done with packet reassembly! */ + iptfs_reassem_done(xtfs); + iptfs_complete_inner_skb(xtfs->x, newskb); + list_add_tail(&newskb->list, list); + } + + /* will continue on to new data block or end */ + return data + fraglen; + +abandon: + if (xtfs->ra_newskb) { + iptfs_reassem_abort(xtfs); + } else { + xtfs->ra_runtlen = 0; + xtfs->ra_wantseq = 0; + } + /* skip past fragment, maybe to end */ + return data + min(blkoff, remaining); +} + static bool __input_process_payload(struct xfrm_state *x, u32 data, struct skb_seq_state *skbseq, struct list_head *sublist) @@ -280,14 +603,20 @@ static bool __input_process_payload(struct xfrm_state *x, u32 data, u8 hbytes[sizeof(struct ipv6hdr)]; struct sk_buff *first_skb, *next, *skb; const unsigned char *old_mac; + struct xfrm_iptfs_data *xtfs; struct iphdr *iph; struct net *net; u32 remaining, iplen, iphlen, tail; + u32 capturelen; + u64 seq; + xtfs = x->mode_data; net = xs_net(x); skb = skbseq->root_skb; first_skb = NULL; + seq = __esp_seq(skb); + /* Save the old mac header if set */ old_mac = skb_mac_header_was_set(skb) ? skb_mac_header(skb) : NULL; @@ -312,8 +641,13 @@ static bool __input_process_payload(struct xfrm_state *x, u32 data, iph = (struct iphdr *)hbytes; if (iph->version == 0x4) { /* must have at least tot_len field present */ - if (remaining < 4) + if (remaining < 4) { + /* save the bytes we have, advance data and exit */ + iptfs_input_save_runt(xtfs, seq, hbytes, + remaining); + data += remaining; break; + } iplen = be16_to_cpu(iph->tot_len); iphlen = iph->ihl << 2; @@ -321,8 +655,13 @@ static bool __input_process_payload(struct xfrm_state *x, u32 data, XFRM_MODE_SKB_CB(skbseq->root_skb)->tos = iph->tos; } else if (iph->version == 0x6) { /* must have at least payload_len field present */ - if (remaining < 6) + if (remaining < 6) { + /* save the bytes we have, advance data and exit */ + iptfs_input_save_runt(xtfs, seq, hbytes, + remaining); + data += remaining; break; + } iplen = be16_to_cpu(((struct ipv6hdr *)hbytes)->payload_len); iplen += sizeof(struct ipv6hdr); @@ -332,6 +671,7 @@ static bool __input_process_payload(struct xfrm_state *x, u32 data, ipv6_get_dsfield((struct ipv6hdr *)iph); } else if (iph->version == 0x0) { /* pad */ + data = tail; break; } else { XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); @@ -351,14 +691,11 @@ static bool __input_process_payload(struct xfrm_state *x, u32 data, if (!first_skb) first_skb = skb; - /* Fragment handling in following commits */ - if (iplen > remaining) - break; - - skb = iptfs_pskb_extract_seq(iplen, skbseq, data, iplen); + capturelen = min(iplen, remaining); + skb = iptfs_pskb_extract_seq(iplen, skbseq, data, capturelen); if (!skb) { /* skip to next packet or done */ - data += iplen; + data += capturelen; continue; } @@ -370,18 +707,40 @@ static bool __input_process_payload(struct xfrm_state *x, u32 data, eth_hdr(skb)->h_proto = skb->protocol; } - data += iplen; + data += capturelen; + + if (skb->len < iplen) { + /* Start reassembly */ + spin_lock(&xtfs->drop_lock); + + xtfs->ra_newskb = skb; + xtfs->ra_wantseq = seq + 1; + if (!hrtimer_is_queued(&xtfs->drop_timer)) { + /* softirq blocked lest the timer fire and interrupt us */ + hrtimer_start(&xtfs->drop_timer, + xtfs->drop_time_ns, + IPTFS_HRTIMER_MODE); + } + + spin_unlock(&xtfs->drop_lock); + + break; + } + iptfs_complete_inner_skb(x, skb); list_add_tail(&skb->list, sublist); } + if (data != tail) + /* this should not happen from the above code */ + XFRM_INC_STATS(net, LINUX_MIB_XFRMINIPTFSERROR); + /* Send the packets! */ list_for_each_entry_safe(skb, next, sublist, list) { skb_list_del_init(skb); if (xfrm_input(skb, 0, 0, -2)) kfree_skb(skb); } - done: return false; } @@ -400,13 +759,18 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) struct ip_iptfs_cc_hdr iptcch; struct skb_seq_state skbseq; struct list_head sublist; /* rename this it's just a list */ + struct xfrm_iptfs_data *xtfs; struct ip_iptfs_hdr *ipth; struct net *net; - u32 remaining, data; + u32 blkoff, data, remaining; bool consumed = false; + u64 seq; + xtfs = x->mode_data; net = xs_net(x); + seq = __esp_seq(skb); + /* Large enough to hold both types of header */ ipth = (struct ip_iptfs_hdr *)&iptcch; @@ -441,17 +805,30 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) INIT_LIST_HEAD(&sublist); - /* Fragment handling in following commits */ - data += ntohs(ipth->block_offset); + /* Handle fragment at start of payload, and/or waiting reassembly. */ + + blkoff = ntohs(ipth->block_offset); + /* check before locking i.e., maybe */ + if (blkoff || xtfs->ra_runtlen || xtfs->ra_newskb) { + spin_lock(&xtfs->drop_lock); + + /* check again after lock */ + if (blkoff || xtfs->ra_runtlen || xtfs->ra_newskb) { + data = iptfs_reassem_cont(xtfs, seq, &skbseq, skb, data, + blkoff, &sublist); + } + + spin_unlock(&xtfs->drop_lock); + } /* New packets */ consumed = __input_process_payload(x, data, &skbseq, &sublist); done: - skb = skbseq.root_skb; - skb_abort_seq_read(&skbseq); - - if (!consumed) + if (!consumed) { + skb = skbseq.root_skb; + skb_abort_seq_read(&skbseq); kfree_skb(skb); + } /* We always have dealt with the input SKB, either we are re-using it, * or we have freed it. Return EINPROGRESS so that xfrm_input stops @@ -460,6 +837,47 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) return -EINPROGRESS; } +/** + * iptfs_drop_timer() - Handle drop timer expiry. + * @me: the timer + * + * This is similar to our input function. + * + * The drop timer is set when we start an in progress reassembly, and also when + * we save a future packet in the window saved array. + * + * NOTE packets in the save window are always newer WRT drop times as + * they get further in the future. i.e. for: + * + * if slots (S0, S1, ... Sn) and `Dn` is the drop time for slot `Sn`, + * then D(n-1) <= D(n). + * + * So, regardless of why the timer is firing we can always discard any inprogress + * fragment; either it's the reassembly timer, or slot 0 is going to be + * dropped as S0 must have the most recent drop time, and slot 0 holds the + * continuation fragment of the in progress packet. + * + * Returns HRTIMER_NORESTART. + */ +static enum hrtimer_restart iptfs_drop_timer(struct hrtimer *me) +{ + struct xfrm_iptfs_data *xtfs; + struct sk_buff *skb; + + xtfs = container_of(me, typeof(*xtfs), drop_timer); + + /* Drop any in progress packet */ + spin_lock(&xtfs->drop_lock); + skb = xtfs->ra_newskb; + xtfs->ra_newskb = NULL; + spin_unlock(&xtfs->drop_lock); + + if (skb) + kfree_skb_reason(skb, SKB_DROP_REASON_FRAG_REASM_TIMEOUT); + + return HRTIMER_NORESTART; +} + /* ================================= */ /* IPTFS Sending (ingress) Functions */ /* ================================= */ @@ -1214,6 +1632,7 @@ static int iptfs_user_init(struct net *net, struct xfrm_state *x, xc = &xtfs->cfg; xc->max_queue_size = IPTFS_DEFAULT_MAX_QUEUE_SIZE; + xtfs->drop_time_ns = IPTFS_DEFAULT_DROP_TIME_USECS * NSECS_IN_USEC; xtfs->init_delay_ns = IPTFS_DEFAULT_INIT_DELAY_USECS * NSECS_IN_USEC; if (attrs[XFRMA_IPTFS_DONT_FRAG]) @@ -1232,6 +1651,10 @@ static int iptfs_user_init(struct net *net, struct xfrm_state *x, } if (attrs[XFRMA_IPTFS_MAX_QSIZE]) xc->max_queue_size = nla_get_u32(attrs[XFRMA_IPTFS_MAX_QSIZE]); + if (attrs[XFRMA_IPTFS_DROP_TIME]) + xtfs->drop_time_ns = + (u64)nla_get_u32(attrs[XFRMA_IPTFS_DROP_TIME]) * + NSECS_IN_USEC; if (attrs[XFRMA_IPTFS_INIT_DELAY]) xtfs->init_delay_ns = (u64)nla_get_u32(attrs[XFRMA_IPTFS_INIT_DELAY]) * NSECS_IN_USEC; @@ -1249,7 +1672,9 @@ static unsigned int iptfs_sa_len(const struct xfrm_state *x) struct xfrm_iptfs_config *xc = &xtfs->cfg; unsigned int l = 0; - if (x->dir == XFRM_SA_DIR_OUT) { + if (x->dir == XFRM_SA_DIR_IN) { + l += nla_total_size(sizeof(u32)); /* drop time usec */ + } else { if (xc->dont_frag) l += nla_total_size(0); /* dont-frag flag */ l += nla_total_size(sizeof(u32)); /* init delay usec */ @@ -1267,7 +1692,11 @@ static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb) int ret = 0; u64 q; - if (x->dir == XFRM_SA_DIR_OUT) { + if (x->dir == XFRM_SA_DIR_IN) { + q = xtfs->drop_time_ns; + do_div(q, NSECS_IN_USEC); + ret = nla_put_u32(skb, XFRMA_IPTFS_DROP_TIME, q); + } else { if (xc->dont_frag) { ret = nla_put_flag(skb, XFRMA_IPTFS_DONT_FRAG); if (ret) @@ -1297,6 +1726,10 @@ static void __iptfs_init_state(struct xfrm_state *x, hrtimer_init(&xtfs->iptfs_timer, CLOCK_MONOTONIC, IPTFS_HRTIMER_MODE); xtfs->iptfs_timer.function = iptfs_delay_timer; + spin_lock_init(&xtfs->drop_lock); + hrtimer_init(&xtfs->drop_timer, CLOCK_MONOTONIC, IPTFS_HRTIMER_MODE); + xtfs->drop_timer.function = iptfs_drop_timer; + /* Modify type (esp) adjustment values */ if (x->props.family == AF_INET) @@ -1323,6 +1756,8 @@ static int iptfs_clone_state(struct xfrm_state *x, struct xfrm_state *orig) x->mode_data = xtfs; xtfs->x = x; + xtfs->ra_newskb = NULL; + return 0; } @@ -1362,6 +1797,13 @@ static void iptfs_destroy_state(struct xfrm_state *x) while ((skb = __skb_dequeue(&list))) kfree_skb(skb); + spin_lock_bh(&xtfs->drop_lock); + hrtimer_cancel(&xtfs->drop_timer); + spin_unlock_bh(&xtfs->drop_lock); + + if (xtfs->ra_newskb) + kfree_skb(xtfs->ra_newskb); + kfree_sensitive(xtfs); module_put(x->mode_cbs->owner); From 3f3339885fb343b7b42d7c34717108ce07da24ae Mon Sep 17 00:00:00 2001 From: Christian Hopps Date: Thu, 14 Nov 2024 02:07:09 -0500 Subject: [PATCH 0169/1450] xfrm: iptfs: add reusing received skb for the tunnel egress packet Add an optimization of re-using the tunnel outer skb re-transmission of the inner packet to avoid skb allocation and copy. Signed-off-by: Christian Hopps Tested-by: Antony Antony Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_iptfs.c | 123 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 108 insertions(+), 15 deletions(-) diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c index 4af1f7b5818e6..8538fb02ae8aa 100644 --- a/net/xfrm/xfrm_iptfs.c +++ b/net/xfrm/xfrm_iptfs.c @@ -601,12 +601,12 @@ static bool __input_process_payload(struct xfrm_state *x, u32 data, struct list_head *sublist) { u8 hbytes[sizeof(struct ipv6hdr)]; - struct sk_buff *first_skb, *next, *skb; + struct sk_buff *defer, *first_skb, *next, *skb; const unsigned char *old_mac; struct xfrm_iptfs_data *xtfs; struct iphdr *iph; struct net *net; - u32 remaining, iplen, iphlen, tail; + u32 first_iplen, iphlen, iplen, remaining, tail; u32 capturelen; u64 seq; @@ -614,6 +614,7 @@ static bool __input_process_payload(struct xfrm_state *x, u32 data, net = xs_net(x); skb = skbseq->root_skb; first_skb = NULL; + defer = NULL; seq = __esp_seq(skb); @@ -688,23 +689,92 @@ static bool __input_process_payload(struct xfrm_state *x, u32 data, skb_prepare_seq_read(save, data, tail, skbseq); } - if (!first_skb) + if (first_skb) { + skb = NULL; + } else { first_skb = skb; + first_iplen = iplen; + + /* We are going to skip over `data` bytes to reach the + * start of the IP header of `iphlen` len for `iplen` + * inner packet. + */ + + if (skb_has_frag_list(skb)) { + defer = skb; + skb = NULL; + } else if (data + iphlen <= skb_headlen(skb) && + /* make sure our header is 32-bit aligned? */ + /* ((uintptr_t)(skb->data + data) & 0x3) == 0 && */ + skb_tailroom(skb) + tail - data >= iplen) { + /* Reuse the received skb. + * + * We have enough headlen to pull past any + * initial fragment data, leaving at least the + * IP header in the linear buffer space. + * + * For linear buffer space we only require that + * linear buffer space is large enough to + * eventually hold the entire reassembled + * packet (by including tailroom in the check). + * + * For non-linear tailroom is 0 and so we only + * re-use if the entire packet is present + * already. + * + * NOTE: there are many more options for + * sharing, KISS for now. Also, this can produce + * skb's with the IP header unaligned to 32 + * bits. If that ends up being a problem then a + * check should be added to the conditional + * above that the header lies on a 32-bit + * boundary as well. + */ + skb_pull(skb, data); + + /* our range just changed */ + data = 0; + tail = skb->len; + remaining = skb->len; + + skb->protocol = protocol; + skb_mac_header_rebuild(skb); + if (skb->mac_len) + eth_hdr(skb)->h_proto = skb->protocol; + + /* all pointers could be changed now reset walk */ + skb_abort_seq_read(skbseq); + skb_prepare_seq_read(skb, data, tail, skbseq); + } else { + /* We couldn't reuse the input skb so allocate a + * new one. + */ + defer = skb; + skb = NULL; + } + + /* Don't trim `first_skb` until the end as we are + * walking that data now. + */ + } capturelen = min(iplen, remaining); - skb = iptfs_pskb_extract_seq(iplen, skbseq, data, capturelen); if (!skb) { - /* skip to next packet or done */ - data += capturelen; - continue; - } + skb = iptfs_pskb_extract_seq(iplen, skbseq, data, + capturelen); + if (!skb) { + /* skip to next packet or done */ + data += capturelen; + continue; + } - skb->protocol = protocol; - if (old_mac) { - /* rebuild the mac header */ - skb_set_mac_header(skb, -first_skb->mac_len); - memcpy(skb_mac_header(skb), old_mac, first_skb->mac_len); - eth_hdr(skb)->h_proto = skb->protocol; + skb->protocol = protocol; + if (old_mac) { + /* rebuild the mac header */ + skb_set_mac_header(skb, -first_skb->mac_len); + memcpy(skb_mac_header(skb), old_mac, first_skb->mac_len); + eth_hdr(skb)->h_proto = skb->protocol; + } } data += capturelen; @@ -735,6 +805,16 @@ static bool __input_process_payload(struct xfrm_state *x, u32 data, /* this should not happen from the above code */ XFRM_INC_STATS(net, LINUX_MIB_XFRMINIPTFSERROR); + if (first_skb && first_iplen && !defer && first_skb != xtfs->ra_newskb) { + /* first_skb is queued b/c !defer and not partial */ + if (pskb_trim(first_skb, first_iplen)) { + /* error trimming */ + list_del(&first_skb->list); + defer = first_skb; + } + first_skb->ip_summed = CHECKSUM_NONE; + } + /* Send the packets! */ list_for_each_entry_safe(skb, next, sublist, list) { skb_list_del_init(skb); @@ -742,7 +822,20 @@ static bool __input_process_payload(struct xfrm_state *x, u32 data, kfree_skb(skb); } done: - return false; + skb = skbseq->root_skb; + skb_abort_seq_read(skbseq); + + if (defer) { + consume_skb(defer); + } else if (!first_skb) { + /* skb is the original passed in skb, but we didn't get far + * enough to process it as the first_skb, if we had it would + * either be save in ra_newskb, trimmed and sent on as an skb or + * placed in defer to be freed. + */ + kfree_skb(skb); + } + return true; } /** From 5f2b6a9095743a6bf1f34c43c4fe78fa8bdf5ad7 Mon Sep 17 00:00:00 2001 From: Christian Hopps Date: Thu, 14 Nov 2024 02:07:10 -0500 Subject: [PATCH 0170/1450] xfrm: iptfs: add skb-fragment sharing code Avoid copying the inner packet data by sharing the skb data fragments from the output packet skb into new inner packet skb. Signed-off-by: Christian Hopps Tested-by: Antony Antony Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_iptfs.c | 296 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 290 insertions(+), 6 deletions(-) diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c index 8538fb02ae8aa..1258158e57baa 100644 --- a/net/xfrm/xfrm_iptfs.c +++ b/net/xfrm/xfrm_iptfs.c @@ -81,6 +81,9 @@ #define XFRM_IPTFS_MIN_L3HEADROOM 128 #define XFRM_IPTFS_MIN_L2HEADROOM (L1_CACHE_BYTES > 64 ? 64 : 64 + 16) +/* Min to try to share outer iptfs skb data vs copying into new skb */ +#define IPTFS_PKT_SHARE_MIN 129 + #define NSECS_IN_USEC 1000 #define IPTFS_HRTIMER_MODE HRTIMER_MODE_REL_SOFT @@ -234,10 +237,254 @@ static void iptfs_skb_head_to_frag(const struct sk_buff *skb, skb_frag_t *frag) skb_frag_fill_page_desc(frag, page, skb->data - addr, skb_headlen(skb)); } +/** + * struct iptfs_skb_frag_walk - use to track a walk through fragments + * @fragi: current fragment index + * @past: length of data in fragments before @fragi + * @total: length of data in all fragments + * @nr_frags: number of fragments present in array + * @initial_offset: the value passed in to skb_prepare_frag_walk() + * @frags: the page fragments inc. room for head page + * @pp_recycle: copy of skb->pp_recycle + */ +struct iptfs_skb_frag_walk { + u32 fragi; + u32 past; + u32 total; + u32 nr_frags; + u32 initial_offset; + skb_frag_t frags[MAX_SKB_FRAGS + 1]; + bool pp_recycle; +}; + +/** + * iptfs_skb_prepare_frag_walk() - initialize a frag walk over an skb. + * @skb: the skb to walk. + * @initial_offset: start the walk @initial_offset into the skb. + * @walk: the walk to initialize + * + * Future calls to skb_add_frags() will expect the @offset value to be at + * least @initial_offset large. + */ +static void iptfs_skb_prepare_frag_walk(struct sk_buff *skb, u32 initial_offset, + struct iptfs_skb_frag_walk *walk) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + skb_frag_t *frag, *from; + u32 i; + + walk->initial_offset = initial_offset; + walk->fragi = 0; + walk->past = 0; + walk->total = 0; + walk->nr_frags = 0; + walk->pp_recycle = skb->pp_recycle; + + if (skb->head_frag) { + if (initial_offset >= skb_headlen(skb)) { + initial_offset -= skb_headlen(skb); + } else { + frag = &walk->frags[walk->nr_frags++]; + iptfs_skb_head_to_frag(skb, frag); + frag->offset += initial_offset; + frag->len -= initial_offset; + walk->total += frag->len; + initial_offset = 0; + } + } else { + initial_offset -= skb_headlen(skb); + } + + for (i = 0; i < shinfo->nr_frags; i++) { + from = &shinfo->frags[i]; + if (initial_offset >= from->len) { + initial_offset -= from->len; + continue; + } + frag = &walk->frags[walk->nr_frags++]; + *frag = *from; + if (initial_offset) { + frag->offset += initial_offset; + frag->len -= initial_offset; + initial_offset = 0; + } + walk->total += frag->len; + } +} + +static u32 iptfs_skb_reset_frag_walk(struct iptfs_skb_frag_walk *walk, + u32 offset) +{ + /* Adjust offset to refer to internal walk values */ + offset -= walk->initial_offset; + + /* Get to the correct fragment for offset */ + while (offset < walk->past) { + walk->past -= walk->frags[--walk->fragi].len; + if (offset >= walk->past) + break; + } + while (offset >= walk->past + walk->frags[walk->fragi].len) + walk->past += walk->frags[walk->fragi++].len; + + /* offset now relative to this current frag */ + offset -= walk->past; + return offset; +} + +/** + * iptfs_skb_can_add_frags() - check if ok to add frags from walk to skb + * @skb: skb to check for adding frags to + * @walk: the walk that will be used as source for frags. + * @offset: offset from beginning of original skb to start from. + * @len: amount of data to add frag references to in @skb. + * + * Return: true if ok to add frags. + */ +static bool iptfs_skb_can_add_frags(const struct sk_buff *skb, + struct iptfs_skb_frag_walk *walk, + u32 offset, u32 len) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + u32 fragi, nr_frags, fraglen; + + if (skb_has_frag_list(skb) || skb->pp_recycle != walk->pp_recycle) + return false; + + /* Make offset relative to current frag after setting that */ + offset = iptfs_skb_reset_frag_walk(walk, offset); + + /* Verify we have array space for the fragments we need to add */ + fragi = walk->fragi; + nr_frags = shinfo->nr_frags; + while (len && fragi < walk->nr_frags) { + skb_frag_t *frag = &walk->frags[fragi]; + + fraglen = frag->len; + if (offset) { + fraglen -= offset; + offset = 0; + } + if (++nr_frags > MAX_SKB_FRAGS) + return false; + if (len <= fraglen) + return true; + len -= fraglen; + fragi++; + } + /* We may not copy all @len but what we have will fit. */ + return true; +} + +/** + * iptfs_skb_add_frags() - add a range of fragment references into an skb + * @skb: skb to add references into + * @walk: the walk to add referenced fragments from. + * @offset: offset from beginning of original skb to start from. + * @len: amount of data to add frag references to in @skb. + * + * iptfs_skb_can_add_frags() should be called before this function to verify + * that the destination @skb is compatible with the walk and has space in the + * array for the to be added frag references. + * + * Return: The number of bytes not added to @skb b/c we reached the end of the + * walk before adding all of @len. + */ +static int iptfs_skb_add_frags(struct sk_buff *skb, + struct iptfs_skb_frag_walk *walk, u32 offset, + u32 len) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + u32 fraglen; + + if (!walk->nr_frags || offset >= walk->total + walk->initial_offset) + return len; + + /* make offset relative to current frag after setting that */ + offset = iptfs_skb_reset_frag_walk(walk, offset); + + while (len && walk->fragi < walk->nr_frags) { + skb_frag_t *frag = &walk->frags[walk->fragi]; + skb_frag_t *tofrag = &shinfo->frags[shinfo->nr_frags]; + + *tofrag = *frag; + if (offset) { + tofrag->offset += offset; + tofrag->len -= offset; + offset = 0; + } + __skb_frag_ref(tofrag); + shinfo->nr_frags++; + + /* see if we are done */ + fraglen = tofrag->len; + if (len < fraglen) { + tofrag->len = len; + skb->len += len; + skb->data_len += len; + return 0; + } + /* advance to next source fragment */ + len -= fraglen; /* careful, use dst bv_len */ + skb->len += fraglen; /* careful, " " " */ + skb->data_len += fraglen; /* careful, " " " */ + walk->past += frag->len; /* careful, use src bv_len */ + walk->fragi++; + } + return len; +} + /* ================================== */ /* IPTFS Receiving (egress) Functions */ /* ================================== */ +/** + * iptfs_pskb_add_frags() - Create and add frags into a new sk_buff. + * @tpl: template to create new skb from. + * @walk: The source for fragments to add. + * @off: The offset into @walk to add frags from, also used with @st and + * @copy_len. + * @len: The length of data to add covering frags from @walk into @skb. + * This must be <= @skblen. + * @st: The sequence state to copy from into the new head skb. + * @copy_len: Copy @copy_len bytes from @st at offset @off into the new skb + * linear space. + * + * Create a new sk_buff `skb` using the template @tpl. Copy @copy_len bytes from + * @st into the new skb linear space, and then add shared fragments from the + * frag walk for the remaining @len of data (i.e., @len - @copy_len bytes). + * + * Return: The newly allocated sk_buff `skb` or NULL if an error occurs. + */ +static struct sk_buff * +iptfs_pskb_add_frags(struct sk_buff *tpl, struct iptfs_skb_frag_walk *walk, + u32 off, u32 len, struct skb_seq_state *st, u32 copy_len) +{ + struct sk_buff *skb; + + skb = iptfs_alloc_skb(tpl, copy_len, false); + if (!skb) + return NULL; + + /* this should not normally be happening */ + if (!iptfs_skb_can_add_frags(skb, walk, off + copy_len, + len - copy_len)) { + kfree_skb(skb); + return NULL; + } + + if (copy_len && + skb_copy_seq_read(st, off, skb_put(skb, copy_len), copy_len)) { + XFRM_INC_STATS(dev_net(st->root_skb->dev), + LINUX_MIB_XFRMINERROR); + kfree_skb(skb); + return NULL; + } + + iptfs_skb_add_frags(skb, walk, off + copy_len, len - copy_len); + return skb; +} + /** * iptfs_pskb_extract_seq() - Create and load data into a new sk_buff. * @skblen: the total data size for `skb`. @@ -423,6 +670,8 @@ static u32 iptfs_reassem_cont(struct xfrm_iptfs_data *xtfs, u64 seq, struct skb_seq_state *st, struct sk_buff *skb, u32 data, u32 blkoff, struct list_head *list) { + struct iptfs_skb_frag_walk _fragwalk; + struct iptfs_skb_frag_walk *fragwalk = NULL; struct sk_buff *newskb = xtfs->ra_newskb; u32 remaining = skb->len - data; u32 runtlen = xtfs->ra_runtlen; @@ -567,10 +816,26 @@ static u32 iptfs_reassem_cont(struct xfrm_iptfs_data *xtfs, u64 seq, fraglen = min(blkoff, remaining); copylen = min(fraglen, ipremain); - /* copy fragment data into newskb */ - if (skb_copy_seq_read(st, data, skb_put(newskb, copylen), copylen)) { - XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMINBUFFERERROR); - goto abandon; + /* If we may have the opportunity to share prepare a fragwalk. */ + if (!skb_has_frag_list(skb) && !skb_has_frag_list(newskb) && + (skb->head_frag || skb->len == skb->data_len) && + skb->pp_recycle == newskb->pp_recycle) { + fragwalk = &_fragwalk; + iptfs_skb_prepare_frag_walk(skb, data, fragwalk); + } + + /* Try share then copy. */ + if (fragwalk && + iptfs_skb_can_add_frags(newskb, fragwalk, data, copylen)) { + iptfs_skb_add_frags(newskb, fragwalk, data, copylen); + } else { + /* copy fragment data into newskb */ + if (skb_copy_seq_read(st, data, skb_put(newskb, copylen), + copylen)) { + XFRM_INC_STATS(xs_net(xtfs->x), + LINUX_MIB_XFRMINBUFFERERROR); + goto abandon; + } } if (copylen < ipremain) { @@ -601,6 +866,8 @@ static bool __input_process_payload(struct xfrm_state *x, u32 data, struct list_head *sublist) { u8 hbytes[sizeof(struct ipv6hdr)]; + struct iptfs_skb_frag_walk _fragwalk; + struct iptfs_skb_frag_walk *fragwalk = NULL; struct sk_buff *defer, *first_skb, *next, *skb; const unsigned char *old_mac; struct xfrm_iptfs_data *xtfs; @@ -694,6 +961,7 @@ static bool __input_process_payload(struct xfrm_state *x, u32 data, } else { first_skb = skb; first_iplen = iplen; + fragwalk = NULL; /* We are going to skip over `data` bytes to reach the * start of the IP header of `iphlen` len for `iplen` @@ -745,6 +1013,13 @@ static bool __input_process_payload(struct xfrm_state *x, u32 data, /* all pointers could be changed now reset walk */ skb_abort_seq_read(skbseq); skb_prepare_seq_read(skb, data, tail, skbseq); + } else if (skb->head_frag && + /* We have the IP header right now */ + remaining >= iphlen) { + fragwalk = &_fragwalk; + iptfs_skb_prepare_frag_walk(skb, data, fragwalk); + defer = skb; + skb = NULL; } else { /* We couldn't reuse the input skb so allocate a * new one. @@ -760,8 +1035,17 @@ static bool __input_process_payload(struct xfrm_state *x, u32 data, capturelen = min(iplen, remaining); if (!skb) { - skb = iptfs_pskb_extract_seq(iplen, skbseq, data, - capturelen); + if (!fragwalk || + /* Large enough to be worth sharing */ + iplen < IPTFS_PKT_SHARE_MIN || + /* Have IP header + some data to share. */ + capturelen <= iphlen || + /* Try creating skb and adding frags */ + !(skb = iptfs_pskb_add_frags(first_skb, fragwalk, + data, capturelen, + skbseq, iphlen))) { + skb = iptfs_pskb_extract_seq(iplen, skbseq, data, capturelen); + } if (!skb) { /* skip to next packet or done */ data += capturelen; From 6be02e3e4f376fea468846c8562655ca5ee18204 Mon Sep 17 00:00:00 2001 From: Christian Hopps Date: Thu, 14 Nov 2024 02:07:11 -0500 Subject: [PATCH 0171/1450] xfrm: iptfs: handle reordering of received packets Handle the receipt of the outer tunnel packets out-of-order. Pointers to the out-of-order packets are saved in a window (array) awaiting needed prior packets. When the required prior packets are received the now in-order packets are then passed on to the regular packet receive code. A timer is used to consider missing earlier packet as lost so the algorithm will advance. Signed-off-by: Christian Hopps Tested-by: Antony Antony Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_iptfs.c | 497 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 484 insertions(+), 13 deletions(-) diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c index 1258158e57baa..3ca7d2a04ea63 100644 --- a/net/xfrm/xfrm_iptfs.c +++ b/net/xfrm/xfrm_iptfs.c @@ -39,6 +39,17 @@ */ #define IPTFS_DEFAULT_DROP_TIME_USECS 1000000 +/** + * define IPTFS_DEFAULT_REORDER_WINDOW - default reorder window size + * + * The default IPTFS reorder window size. The reorder window size dictates the + * maximum number of IPTFS tunnel packets in a sequence that may arrive out of + * order. + * + * Default 3. (tcp folks suggested) + */ +#define IPTFS_DEFAULT_REORDER_WINDOW 3 + /* ------------------------------------------------ */ /* IPTFS default SA values (tunnel ingress/dir-out) */ /* ------------------------------------------------ */ @@ -95,14 +106,22 @@ * @max_queue_size: The maximum number of octets allowed to be queued to be sent * over the IPTFS SA. The queue size is measured as the size of all the * packets enqueued. + * @reorder_win_size: the number slots in the reorder window, thus the number of + * packets that may arrive out of order. * @dont_frag: true to inhibit fragmenting across IPTFS outer packets. */ struct xfrm_iptfs_config { u32 pkt_size; /* outer_packet_size or 0 */ u32 max_queue_size; /* octets */ + u16 reorder_win_size; u8 dont_frag : 1; }; +struct skb_wseq { + struct sk_buff *skb; + u64 drop_time; +}; + /** * struct xfrm_iptfs_data - mode specific xfrm state. * @cfg: IPTFS tunnel config. @@ -113,6 +132,10 @@ struct xfrm_iptfs_config { * @init_delay_ns: nanoseconds to wait to send initial IPTFS packet. * @iptfs_timer: output timer. * @payload_mtu: max payload size. + * @w_seq_set: true after first seq received. + * @w_wantseq: waiting for this seq number as next to process (in order). + * @w_saved: the saved buf array (reorder window). + * @w_savedlen: the saved len (not size). * @drop_lock: lock to protect reorder queue. * @drop_timer: timer for considering next packet lost. * @drop_time_ns: timer intervan in nanoseconds. @@ -134,12 +157,16 @@ struct xfrm_iptfs_data { struct hrtimer iptfs_timer; /* output timer */ u32 payload_mtu; /* max payload size */ - /* Tunnel egress */ + /* Tunnel input reordering */ + bool w_seq_set; /* true after first seq received */ + u64 w_wantseq; /* expected next sequence */ + struct skb_wseq *w_saved; /* the saved buf array */ + u32 w_savedlen; /* the saved len (not size) */ spinlock_t drop_lock; struct hrtimer drop_timer; u64 drop_time_ns; - /* Tunnel egress reassembly */ + /* Tunnel input reassembly */ struct sk_buff *ra_newskb; /* new pkt being reassembled */ u64 ra_wantseq; /* expected next sequence */ u8 ra_runt[6]; /* last pkt bytes from last skb */ @@ -1123,15 +1150,13 @@ static bool __input_process_payload(struct xfrm_state *x, u32 data, } /** - * iptfs_input() - handle receipt of iptfs payload + * iptfs_input_ordered() - handle next in order IPTFS payload. * @x: xfrm state - * @skb: the packet + * @skb: current packet * * Process the IPTFS payload in `skb` and consume it afterwards. - * - * Returns 0. */ -static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) +static void iptfs_input_ordered(struct xfrm_state *x, struct sk_buff *skb) { struct ip_iptfs_cc_hdr iptcch; struct skb_seq_state skbseq; @@ -1206,12 +1231,355 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) skb_abort_seq_read(&skbseq); kfree_skb(skb); } +} - /* We always have dealt with the input SKB, either we are re-using it, - * or we have freed it. Return EINPROGRESS so that xfrm_input stops - * processing it. +/* ------------------------------- */ +/* Input (Egress) Re-ordering Code */ +/* ------------------------------- */ + +static void __vec_shift(struct xfrm_iptfs_data *xtfs, u32 shift) +{ + u32 savedlen = xtfs->w_savedlen; + + if (shift > savedlen) + shift = savedlen; + if (shift != savedlen) + memcpy(xtfs->w_saved, xtfs->w_saved + shift, + (savedlen - shift) * sizeof(*xtfs->w_saved)); + memset(xtfs->w_saved + savedlen - shift, 0, + shift * sizeof(*xtfs->w_saved)); + xtfs->w_savedlen -= shift; +} + +static void __reorder_past(struct xfrm_iptfs_data *xtfs, struct sk_buff *inskb, + struct list_head *freelist) +{ + list_add_tail(&inskb->list, freelist); +} + +static u32 __reorder_drop(struct xfrm_iptfs_data *xtfs, struct list_head *list) + +{ + struct skb_wseq *s, *se; + const u32 savedlen = xtfs->w_savedlen; + time64_t now = ktime_get_raw_fast_ns(); + u32 count = 0; + u32 scount = 0; + + if (xtfs->w_saved[0].drop_time > now) + goto set_timer; + + ++xtfs->w_wantseq; + + /* Keep flushing packets until we reach a drop time greater than now. */ + s = xtfs->w_saved; + se = s + savedlen; + do { + /* Walking past empty slots until we reach a packet */ + for (; s < se && !s->skb; s++) { + if (s->drop_time > now) + goto outerdone; + } + /* Sending packets until we hit another empty slot. */ + for (; s < se && s->skb; scount++, s++) + list_add_tail(&s->skb->list, list); + } while (s < se); +outerdone: + + count = s - xtfs->w_saved; + if (count) { + xtfs->w_wantseq += count; + + /* Shift handled slots plus final empty slot into slot 0. */ + __vec_shift(xtfs, count); + } + + if (xtfs->w_savedlen) { +set_timer: + /* Drifting is OK */ + hrtimer_start(&xtfs->drop_timer, + xtfs->w_saved[0].drop_time - now, + IPTFS_HRTIMER_MODE); + } + return scount; +} + +static void __reorder_this(struct xfrm_iptfs_data *xtfs, struct sk_buff *inskb, + struct list_head *list) +{ + struct skb_wseq *s, *se; + const u32 savedlen = xtfs->w_savedlen; + u32 count = 0; + + /* Got what we wanted. */ + list_add_tail(&inskb->list, list); + ++xtfs->w_wantseq; + if (!savedlen) + return; + + /* Flush remaining consecutive packets. */ + + /* Keep sending until we hit another missed pkt. */ + for (s = xtfs->w_saved, se = s + savedlen; s < se && s->skb; s++) + list_add_tail(&s->skb->list, list); + count = s - xtfs->w_saved; + if (count) + xtfs->w_wantseq += count; + + /* Shift handled slots plus final empty slot into slot 0. */ + __vec_shift(xtfs, count + 1); +} + +/* Set the slot's drop time and all the empty slots below it until reaching a + * filled slot which will already be set. + */ +static void iptfs_set_window_drop_times(struct xfrm_iptfs_data *xtfs, int index) +{ + const u32 savedlen = xtfs->w_savedlen; + struct skb_wseq *s = xtfs->w_saved; + time64_t drop_time; + + assert_spin_locked(&xtfs->drop_lock); + + if (savedlen > index + 1) { + /* we are below another, our drop time and the timer are already set */ + return; + } + /* we are the most future so get a new drop time. */ + drop_time = ktime_get_raw_fast_ns(); + drop_time += xtfs->drop_time_ns; + + /* Walk back through the array setting drop times as we go */ + s[index].drop_time = drop_time; + while (index-- > 0 && !s[index].skb) + s[index].drop_time = drop_time; + + /* If we walked all the way back, schedule the drop timer if needed */ + if (index == -1 && !hrtimer_is_queued(&xtfs->drop_timer)) + hrtimer_start(&xtfs->drop_timer, xtfs->drop_time_ns, + IPTFS_HRTIMER_MODE); +} + +static void __reorder_future_fits(struct xfrm_iptfs_data *xtfs, + struct sk_buff *inskb, + struct list_head *freelist) +{ + const u64 inseq = __esp_seq(inskb); + const u64 wantseq = xtfs->w_wantseq; + const u64 distance = inseq - wantseq; + const u32 savedlen = xtfs->w_savedlen; + const u32 index = distance - 1; + + /* Handle future sequence number received which fits in the window. + * + * We know we don't have the seq we want so we won't be able to flush + * anything. */ - return -EINPROGRESS; + + /* slot count is 4, saved size is 3 savedlen is 2 + * + * "window boundary" is based on the fixed window size + * distance is also slot number + * index is an array index (i.e., - 1 of slot) + * : : - implicit NULL after array len + * + * +--------- used length (savedlen == 2) + * | +----- array size (nslots - 1 == 3) + * | | + window boundary (nslots == 4) + * V V | V + * | + * 0 1 2 3 | slot number + * --- 0 1 2 | array index + * [-] [b] : :| array + * + * "2" "3" "4" *5*| seq numbers + * + * We receive seq number 5 + * distance == 3 [inseq(5) - w_wantseq(2)] + * index == 2 [distance(6) - 1] + */ + + if (xtfs->w_saved[index].skb) { + /* a dup of a future */ + list_add_tail(&inskb->list, freelist); + return; + } + + xtfs->w_saved[index].skb = inskb; + xtfs->w_savedlen = max(savedlen, index + 1); + iptfs_set_window_drop_times(xtfs, index); +} + +static void __reorder_future_shifts(struct xfrm_iptfs_data *xtfs, + struct sk_buff *inskb, + struct list_head *list) +{ + const u32 nslots = xtfs->cfg.reorder_win_size + 1; + const u64 inseq = __esp_seq(inskb); + u32 savedlen = xtfs->w_savedlen; + u64 wantseq = xtfs->w_wantseq; + struct skb_wseq *wnext; + struct sk_buff *slot0; + u32 beyond, shifting, slot; + u64 distance; + + /* Handle future sequence number received. + * + * IMPORTANT: we are at least advancing w_wantseq (i.e., wantseq) by 1 + * b/c we are beyond the window boundary. + * + * We know we don't have the wantseq so that counts as a drop. + */ + + /* example: slot count is 4, array size is 3 savedlen is 2, slot 0 is + * the missing sequence number. + * + * the final slot at savedlen (index savedlen - 1) is always occupied. + * + * beyond is "beyond array size" not savedlen. + * + * +--------- array length (savedlen == 2) + * | +----- array size (nslots - 1 == 3) + * | | +- window boundary (nslots == 4) + * V V | + * | + * 0 1 2 3 | slot number + * --- 0 1 2 | array index + * [b] [c] : :| array + * | + * "2" "3" "4" "5"|*6* seq numbers + * + * We receive seq number 6 + * distance == 4 [inseq(6) - w_wantseq(2)] + * newslot == distance + * index == 3 [distance(4) - 1] + * beyond == 1 [newslot(4) - lastslot((nslots(4) - 1))] + * shifting == 1 [min(savedlen(2), beyond(1)] + * slot0_skb == [b], and should match w_wantseq + * + * +--- window boundary (nslots == 4) + * 0 1 2 3 | 4 slot number + * --- 0 1 2 | 3 array index + * [b] : : : :| array + * "2" "3" "4" "5" *6* seq numbers + * + * We receive seq number 6 + * distance == 4 [inseq(6) - w_wantseq(2)] + * newslot == distance + * index == 3 [distance(4) - 1] + * beyond == 1 [newslot(4) - lastslot((nslots(4) - 1))] + * shifting == 1 [min(savedlen(1), beyond(1)] + * slot0_skb == [b] and should match w_wantseq + * + * +-- window boundary (nslots == 4) + * 0 1 2 3 | 4 5 6 slot number + * --- 0 1 2 | 3 4 5 array index + * [-] [c] : :| array + * "2" "3" "4" "5" "6" "7" *8* seq numbers + * + * savedlen = 2, beyond = 3 + * iter 1: slot0 == NULL, missed++, lastdrop = 2 (2+1-1), slot0 = [-] + * iter 2: slot0 == NULL, missed++, lastdrop = 3 (2+2-1), slot0 = [c] + * 2 < 3, extra = 1 (3-2), missed += extra, lastdrop = 4 (2+2+1-1) + * + * We receive seq number 8 + * distance == 6 [inseq(8) - w_wantseq(2)] + * newslot == distance + * index == 5 [distance(6) - 1] + * beyond == 3 [newslot(6) - lastslot((nslots(4) - 1))] + * shifting == 2 [min(savedlen(2), beyond(3)] + * + * slot0_skb == NULL changed from [b] when "savedlen < beyond" is true. + */ + + /* Now send any packets that are being shifted out of saved, and account + * for missing packets that are exiting the window as we shift it. + */ + + distance = inseq - wantseq; + beyond = distance - (nslots - 1); + + /* If savedlen > beyond we are shifting some, else all. */ + shifting = min(savedlen, beyond); + + /* slot0 is the buf that just shifted out and into slot0 */ + slot0 = NULL; + wnext = xtfs->w_saved; + for (slot = 1; slot <= shifting; slot++, wnext++) { + /* handle what was in slot0 before we occupy it */ + if (slot0) + list_add_tail(&slot0->list, list); + slot0 = wnext->skb; + wnext->skb = NULL; + } + + /* slot0 is now either NULL (in which case it's what we now are waiting + * for, or a buf in which case we need to handle it like we received it; + * however, we may be advancing past that buffer as well.. + */ + + /* Handle case where we need to shift more than we had saved, slot0 will + * be NULL iff savedlen is 0, otherwise slot0 will always be + * non-NULL b/c we shifted the final element, which is always set if + * there is any saved, into slot0. + */ + if (savedlen < beyond) { + if (savedlen != 0) + list_add_tail(&slot0->list, list); + slot0 = NULL; + /* slot0 has had an empty slot pushed into it */ + } + + /* Remove the entries */ + __vec_shift(xtfs, beyond); + + /* Advance want seq */ + xtfs->w_wantseq += beyond; + + /* Process drops here when implementing congestion control */ + + /* We've shifted. plug the packet in at the end. */ + xtfs->w_savedlen = nslots - 1; + xtfs->w_saved[xtfs->w_savedlen - 1].skb = inskb; + iptfs_set_window_drop_times(xtfs, xtfs->w_savedlen - 1); + + /* if we don't have a slot0 then we must wait for it */ + if (!slot0) + return; + + /* If slot0, seq must match new want seq */ + + /* slot0 is valid, treat like we received expected. */ + __reorder_this(xtfs, slot0, list); +} + +/* Receive a new packet into the reorder window. Return a list of ordered + * packets from the window. + */ +static void iptfs_input_reorder(struct xfrm_iptfs_data *xtfs, + struct sk_buff *inskb, struct list_head *list, + struct list_head *freelist) +{ + const u32 nslots = xtfs->cfg.reorder_win_size + 1; + u64 inseq = __esp_seq(inskb); + u64 wantseq; + + assert_spin_locked(&xtfs->drop_lock); + + if (unlikely(!xtfs->w_seq_set)) { + xtfs->w_seq_set = true; + xtfs->w_wantseq = inseq; + } + wantseq = xtfs->w_wantseq; + + if (likely(inseq == wantseq)) + __reorder_this(xtfs, inskb, list); + else if (inseq < wantseq) + __reorder_past(xtfs, inskb, freelist); + else if ((inseq - wantseq) < nslots) + __reorder_future_fits(xtfs, inskb, freelist); + else + __reorder_future_shifts(xtfs, inskb, list); } /** @@ -1238,23 +1606,92 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) */ static enum hrtimer_restart iptfs_drop_timer(struct hrtimer *me) { + struct sk_buff *skb, *next; + struct list_head list; struct xfrm_iptfs_data *xtfs; - struct sk_buff *skb; + struct xfrm_state *x; + u32 count; xtfs = container_of(me, typeof(*xtfs), drop_timer); + x = xtfs->x; + + INIT_LIST_HEAD(&list); - /* Drop any in progress packet */ spin_lock(&xtfs->drop_lock); + + /* Drop any in progress packet */ skb = xtfs->ra_newskb; xtfs->ra_newskb = NULL; + + /* Now drop as many packets as we should from the reordering window + * saved array + */ + count = xtfs->w_savedlen ? __reorder_drop(xtfs, &list) : 0; + spin_unlock(&xtfs->drop_lock); if (skb) kfree_skb_reason(skb, SKB_DROP_REASON_FRAG_REASM_TIMEOUT); + if (count) { + list_for_each_entry_safe(skb, next, &list, list) { + skb_list_del_init(skb); + iptfs_input_ordered(x, skb); + } + } + return HRTIMER_NORESTART; } +/** + * iptfs_input() - handle receipt of iptfs payload + * @x: xfrm state + * @skb: the packet + * + * We have an IPTFS payload order it if needed, then process newly in order + * packets. + * + * Return: -EINPROGRESS to inform xfrm_input to stop processing the skb. + */ +static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) +{ + struct list_head freelist, list; + struct xfrm_iptfs_data *xtfs = x->mode_data; + struct sk_buff *next; + + /* Fast path for no reorder window. */ + if (xtfs->cfg.reorder_win_size == 0) { + iptfs_input_ordered(x, skb); + goto done; + } + + /* Fetch list of in-order packets from the reordering window as well as + * a list of buffers we need to now free. + */ + INIT_LIST_HEAD(&list); + INIT_LIST_HEAD(&freelist); + + spin_lock(&xtfs->drop_lock); + iptfs_input_reorder(xtfs, skb, &list, &freelist); + spin_unlock(&xtfs->drop_lock); + + list_for_each_entry_safe(skb, next, &list, list) { + skb_list_del_init(skb); + iptfs_input_ordered(x, skb); + } + + list_for_each_entry_safe(skb, next, &freelist, list) { + skb_list_del_init(skb); + kfree_skb(skb); + } +done: + /* We always have dealt with the input SKB, either we are re-using it, + * or we have freed it. Return EINPROGRESS so that xfrm_input stops + * processing it. + */ + return -EINPROGRESS; +} + /* ================================= */ /* IPTFS Sending (ingress) Functions */ /* ================================= */ @@ -2009,11 +2446,24 @@ static int iptfs_user_init(struct net *net, struct xfrm_state *x, xc = &xtfs->cfg; xc->max_queue_size = IPTFS_DEFAULT_MAX_QUEUE_SIZE; + xc->reorder_win_size = IPTFS_DEFAULT_REORDER_WINDOW; xtfs->drop_time_ns = IPTFS_DEFAULT_DROP_TIME_USECS * NSECS_IN_USEC; xtfs->init_delay_ns = IPTFS_DEFAULT_INIT_DELAY_USECS * NSECS_IN_USEC; if (attrs[XFRMA_IPTFS_DONT_FRAG]) xc->dont_frag = true; + if (attrs[XFRMA_IPTFS_REORDER_WINDOW]) + xc->reorder_win_size = + nla_get_u16(attrs[XFRMA_IPTFS_REORDER_WINDOW]); + /* saved array is for saving 1..N seq nums from wantseq */ + if (xc->reorder_win_size) { + xtfs->w_saved = kcalloc(xc->reorder_win_size, + sizeof(*xtfs->w_saved), GFP_KERNEL); + if (!xtfs->w_saved) { + NL_SET_ERR_MSG(extack, "Cannot alloc reorder window"); + return -ENOMEM; + } + } if (attrs[XFRMA_IPTFS_PKT_SIZE]) { xc->pkt_size = nla_get_u32(attrs[XFRMA_IPTFS_PKT_SIZE]); if (!xc->pkt_size) { @@ -2051,6 +2501,7 @@ static unsigned int iptfs_sa_len(const struct xfrm_state *x) if (x->dir == XFRM_SA_DIR_IN) { l += nla_total_size(sizeof(u32)); /* drop time usec */ + l += nla_total_size(sizeof(xc->reorder_win_size)); } else { if (xc->dont_frag) l += nla_total_size(0); /* dont-frag flag */ @@ -2073,6 +2524,11 @@ static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb) q = xtfs->drop_time_ns; do_div(q, NSECS_IN_USEC); ret = nla_put_u32(skb, XFRMA_IPTFS_DROP_TIME, q); + if (ret) + return ret; + + ret = nla_put_u16(skb, XFRMA_IPTFS_REORDER_WINDOW, + xc->reorder_win_size); } else { if (xc->dont_frag) { ret = nla_put_flag(skb, XFRMA_IPTFS_DONT_FRAG); @@ -2134,6 +2590,14 @@ static int iptfs_clone_state(struct xfrm_state *x, struct xfrm_state *orig) xtfs->x = x; xtfs->ra_newskb = NULL; + if (xtfs->cfg.reorder_win_size) { + xtfs->w_saved = kcalloc(xtfs->cfg.reorder_win_size, + sizeof(*xtfs->w_saved), GFP_KERNEL); + if (!xtfs->w_saved) { + kfree_sensitive(xtfs); + return -ENOMEM; + } + } return 0; } @@ -2160,6 +2624,7 @@ static void iptfs_destroy_state(struct xfrm_state *x) { struct xfrm_iptfs_data *xtfs = x->mode_data; struct sk_buff_head list; + struct skb_wseq *s, *se; struct sk_buff *skb; if (!xtfs) @@ -2181,6 +2646,12 @@ static void iptfs_destroy_state(struct xfrm_state *x) if (xtfs->ra_newskb) kfree_skb(xtfs->ra_newskb); + for (s = xtfs->w_saved, se = s + xtfs->w_savedlen; s < se; s++) { + if (s->skb) + kfree_skb(s->skb); + } + + kfree_sensitive(xtfs->w_saved); kfree_sensitive(xtfs); module_put(x->mode_cbs->owner); From ed58b186c7737bf0db1ebf57207b30fe740e1d07 Mon Sep 17 00:00:00 2001 From: Christian Hopps Date: Thu, 14 Nov 2024 02:07:12 -0500 Subject: [PATCH 0172/1450] xfrm: iptfs: add tracepoint functionality Add tracepoints to the IP-TFS code. Signed-off-by: Christian Hopps Tested-by: Antony Antony Signed-off-by: Steffen Klassert --- net/xfrm/trace_iptfs.h | 218 +++++++++++++++++++++++++++++++++++++++++ net/xfrm/xfrm_iptfs.c | 71 +++++++++++++- 2 files changed, 288 insertions(+), 1 deletion(-) create mode 100644 net/xfrm/trace_iptfs.h diff --git a/net/xfrm/trace_iptfs.h b/net/xfrm/trace_iptfs.h new file mode 100644 index 0000000000000..74391ba244453 --- /dev/null +++ b/net/xfrm/trace_iptfs.h @@ -0,0 +1,218 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* xfrm_trace_iptfs.h + * + * August 12 2023, Christian Hopps + * + * Copyright (c) 2023, LabN Consulting, L.L.C. + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM iptfs + +#if !defined(_TRACE_IPTFS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_IPTFS_H + +#include +#include +#include +#include + +struct xfrm_iptfs_data; + +TRACE_EVENT(iptfs_egress_recv, + TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, u16 blkoff), + TP_ARGS(skb, xtfs, blkoff), + TP_STRUCT__entry(__field(struct sk_buff *, skb) + __field(void *, head) + __field(void *, head_pg_addr) + __field(void *, pg0addr) + __field(u32, skb_len) + __field(u32, data_len) + __field(u32, headroom) + __field(u32, tailroom) + __field(u32, tail) + __field(u32, end) + __field(u32, pg0off) + __field(u8, head_frag) + __field(u8, frag_list) + __field(u8, nr_frags) + __field(u16, blkoff)), + TP_fast_assign(__entry->skb = skb; + __entry->head = skb->head; + __entry->skb_len = skb->len; + __entry->data_len = skb->data_len; + __entry->headroom = skb_headroom(skb); + __entry->tailroom = skb_tailroom(skb); + __entry->tail = (u32)skb->tail; + __entry->end = (u32)skb->end; + __entry->head_frag = skb->head_frag; + __entry->frag_list = (bool)skb_shinfo(skb)->frag_list; + __entry->nr_frags = skb_shinfo(skb)->nr_frags; + __entry->blkoff = blkoff; + __entry->head_pg_addr = page_address(virt_to_head_page(skb->head)); + __entry->pg0addr = (__entry->nr_frags + ? page_address(netmem_to_page(skb_shinfo(skb)->frags[0].netmem)) + : NULL); + __entry->pg0off = (__entry->nr_frags + ? skb_shinfo(skb)->frags[0].offset + : 0); + ), + TP_printk("EGRESS: skb=%p len=%u data_len=%u headroom=%u head_frag=%u frag_list=%u nr_frags=%u blkoff=%u\n\t\ttailroom=%u tail=%u end=%u head=%p hdpgaddr=%p pg0->addr=%p pg0->data=%p pg0->off=%u", + __entry->skb, __entry->skb_len, __entry->data_len, __entry->headroom, + __entry->head_frag, __entry->frag_list, __entry->nr_frags, __entry->blkoff, + __entry->tailroom, __entry->tail, __entry->end, __entry->head, + __entry->head_pg_addr, __entry->pg0addr, __entry->pg0addr + __entry->pg0off, + __entry->pg0off) + ) + +DECLARE_EVENT_CLASS(iptfs_ingress_preq_event, + TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, + u32 pmtu, u8 was_gso), + TP_ARGS(skb, xtfs, pmtu, was_gso), + TP_STRUCT__entry(__field(struct sk_buff *, skb) + __field(u32, skb_len) + __field(u32, data_len) + __field(u32, pmtu) + __field(u32, queue_size) + __field(u32, proto_seq) + __field(u8, proto) + __field(u8, was_gso) + ), + TP_fast_assign(__entry->skb = skb; + __entry->skb_len = skb->len; + __entry->data_len = skb->data_len; + __entry->queue_size = + xtfs->cfg.max_queue_size - xtfs->queue_size; + __entry->proto = __trace_ip_proto(ip_hdr(skb)); + __entry->proto_seq = __trace_ip_proto_seq(ip_hdr(skb)); + __entry->pmtu = pmtu; + __entry->was_gso = was_gso; + ), + TP_printk("INGRPREQ: skb=%p len=%u data_len=%u qsize=%u proto=%u proto_seq=%u pmtu=%u was_gso=%u", + __entry->skb, __entry->skb_len, __entry->data_len, + __entry->queue_size, __entry->proto, __entry->proto_seq, + __entry->pmtu, __entry->was_gso)); + +DEFINE_EVENT(iptfs_ingress_preq_event, iptfs_enqueue, + TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, u32 pmtu, u8 was_gso), + TP_ARGS(skb, xtfs, pmtu, was_gso)); + +DEFINE_EVENT(iptfs_ingress_preq_event, iptfs_no_queue_space, + TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, u32 pmtu, u8 was_gso), + TP_ARGS(skb, xtfs, pmtu, was_gso)); + +DEFINE_EVENT(iptfs_ingress_preq_event, iptfs_too_big, + TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, u32 pmtu, u8 was_gso), + TP_ARGS(skb, xtfs, pmtu, was_gso)); + +DECLARE_EVENT_CLASS(iptfs_ingress_postq_event, + TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff, struct iphdr *iph), + TP_ARGS(skb, mtu, blkoff, iph), + TP_STRUCT__entry(__field(struct sk_buff *, skb) + __field(u32, skb_len) + __field(u32, data_len) + __field(u32, mtu) + __field(u32, proto_seq) + __field(u16, blkoff) + __field(u8, proto)), + TP_fast_assign(__entry->skb = skb; + __entry->skb_len = skb->len; + __entry->data_len = skb->data_len; + __entry->mtu = mtu; + __entry->blkoff = blkoff; + __entry->proto = iph ? __trace_ip_proto(iph) : 0; + __entry->proto_seq = iph ? __trace_ip_proto_seq(iph) : 0; + ), + TP_printk("INGRPSTQ: skb=%p len=%u data_len=%u mtu=%u blkoff=%u proto=%u proto_seq=%u", + __entry->skb, __entry->skb_len, __entry->data_len, __entry->mtu, + __entry->blkoff, __entry->proto, __entry->proto_seq)); + +DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_dequeue, + TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff, + struct iphdr *iph), + TP_ARGS(skb, mtu, blkoff, iph)); + +DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_fragmenting, + TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff, + struct iphdr *iph), + TP_ARGS(skb, mtu, blkoff, iph)); + +DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_final_fragment, + TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff, + struct iphdr *iph), + TP_ARGS(skb, mtu, blkoff, iph)); + +DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_toobig, + TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff, + struct iphdr *iph), + TP_ARGS(skb, mtu, blkoff, iph)); + +TRACE_EVENT(iptfs_ingress_nth_peek, + TP_PROTO(struct sk_buff *skb, u32 remaining), + TP_ARGS(skb, remaining), + TP_STRUCT__entry(__field(struct sk_buff *, skb) + __field(u32, skb_len) + __field(u32, remaining)), + TP_fast_assign(__entry->skb = skb; + __entry->skb_len = skb->len; + __entry->remaining = remaining; + ), + TP_printk("INGRPSTQ: NTHPEEK: skb=%p len=%u remaining=%u", + __entry->skb, __entry->skb_len, __entry->remaining)); + +TRACE_EVENT(iptfs_ingress_nth_add, TP_PROTO(struct sk_buff *skb, u8 share_ok), + TP_ARGS(skb, share_ok), + TP_STRUCT__entry(__field(struct sk_buff *, skb) + __field(u32, skb_len) + __field(u32, data_len) + __field(u8, share_ok) + __field(u8, head_frag) + __field(u8, pp_recycle) + __field(u8, cloned) + __field(u8, shared) + __field(u8, nr_frags) + __field(u8, frag_list) + ), + TP_fast_assign(__entry->skb = skb; + __entry->skb_len = skb->len; + __entry->data_len = skb->data_len; + __entry->share_ok = share_ok; + __entry->head_frag = skb->head_frag; + __entry->pp_recycle = skb->pp_recycle; + __entry->cloned = skb_cloned(skb); + __entry->shared = skb_shared(skb); + __entry->nr_frags = skb_shinfo(skb)->nr_frags; + __entry->frag_list = (bool)skb_shinfo(skb)->frag_list; + ), + TP_printk("INGRPSTQ: NTHADD: skb=%p len=%u data_len=%u share_ok=%u head_frag=%u pp_recycle=%u cloned=%u shared=%u nr_frags=%u frag_list=%u", + __entry->skb, __entry->skb_len, __entry->data_len, __entry->share_ok, + __entry->head_frag, __entry->pp_recycle, __entry->cloned, __entry->shared, + __entry->nr_frags, __entry->frag_list)); + +DECLARE_EVENT_CLASS(iptfs_timer_event, + TP_PROTO(struct xfrm_iptfs_data *xtfs, u64 time_val), + TP_ARGS(xtfs, time_val), + TP_STRUCT__entry(__field(u64, time_val) + __field(u64, set_time)), + TP_fast_assign(__entry->time_val = time_val; + __entry->set_time = xtfs->iptfs_settime; + ), + TP_printk("TIMER: set_time=%llu time_val=%llu", + __entry->set_time, __entry->time_val)); + +DEFINE_EVENT(iptfs_timer_event, iptfs_timer_start, + TP_PROTO(struct xfrm_iptfs_data *xtfs, u64 time_val), + TP_ARGS(xtfs, time_val)); + +DEFINE_EVENT(iptfs_timer_event, iptfs_timer_expire, + TP_PROTO(struct xfrm_iptfs_data *xtfs, u64 time_val), + TP_ARGS(xtfs, time_val)); + +#endif /* _TRACE_IPTFS_H */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ../../net/xfrm +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace_iptfs +#include diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c index 3ca7d2a04ea63..755f1eea8bfa1 100644 --- a/net/xfrm/xfrm_iptfs.c +++ b/net/xfrm/xfrm_iptfs.c @@ -19,6 +19,7 @@ #include #include "xfrm_inout.h" +#include "trace_iptfs.h" /* IPTFS encap (header) values. */ #define IPTFS_SUBTYPE_BASIC 0 @@ -131,6 +132,7 @@ struct skb_wseq { * @ecn_queue_size: octets above with ECN mark. * @init_delay_ns: nanoseconds to wait to send initial IPTFS packet. * @iptfs_timer: output timer. + * @iptfs_settime: time the output timer was set. * @payload_mtu: max payload size. * @w_seq_set: true after first seq received. * @w_wantseq: waiting for this seq number as next to process (in order). @@ -155,6 +157,7 @@ struct xfrm_iptfs_data { u32 ecn_queue_size; /* octets above which ECN mark */ u64 init_delay_ns; /* nanoseconds */ struct hrtimer iptfs_timer; /* output timer */ + time64_t iptfs_settime; /* time timer was set */ u32 payload_mtu; /* max payload size */ /* Tunnel input reordering */ @@ -181,6 +184,41 @@ static enum hrtimer_restart iptfs_drop_timer(struct hrtimer *me); /* Utility Functions */ /* ================= */ +#ifdef TRACEPOINTS_ENABLED +static u32 __trace_ip_proto(struct iphdr *iph) +{ + if (iph->version == 4) + return iph->protocol; + return ((struct ipv6hdr *)iph)->nexthdr; +} + +static u32 __trace_ip_proto_seq(struct iphdr *iph) +{ + void *nexthdr; + u32 protocol = 0; + + if (iph->version == 4) { + nexthdr = (void *)(iph + 1); + protocol = iph->protocol; + } else if (iph->version == 6) { + nexthdr = (void *)(((struct ipv6hdr *)(iph)) + 1); + protocol = ((struct ipv6hdr *)(iph))->nexthdr; + } + switch (protocol) { + case IPPROTO_ICMP: + return ntohs(((struct icmphdr *)nexthdr)->un.echo.sequence); + case IPPROTO_ICMPV6: + return ntohs(((struct icmp6hdr *)nexthdr)->icmp6_sequence); + case IPPROTO_TCP: + return ntohl(((struct tcphdr *)nexthdr)->seq); + case IPPROTO_UDP: + return ntohs(((struct udphdr *)nexthdr)->source); + default: + return 0; + } +} +#endif /*TRACEPOINTS_ENABLED*/ + static u64 __esp_seq(struct sk_buff *skb) { u64 seq = ntohl(XFRM_SKB_CB(skb)->seq.input.low); @@ -461,6 +499,13 @@ static int iptfs_skb_add_frags(struct sk_buff *skb, return len; } +/* ================================== */ +/* IPTFS Trace Event Definitions */ +/* ================================== */ + +#define CREATE_TRACE_POINTS +#include "trace_iptfs.h" + /* ================================== */ /* IPTFS Receiving (egress) Functions */ /* ================================== */ @@ -1186,6 +1231,8 @@ static void iptfs_input_ordered(struct xfrm_state *x, struct sk_buff *skb) } data = sizeof(*ipth); + trace_iptfs_egress_recv(skb, xtfs, be16_to_cpu(ipth->block_offset)); + /* Set data past the basic header */ if (ipth->subtype == IPTFS_SUBTYPE_CC) { /* Copy the rest of the CC header */ @@ -1830,6 +1877,7 @@ static int iptfs_output_collect(struct net *net, struct sock *sk, struct sk_buff */ if (!ok) { nospace: + trace_iptfs_no_queue_space(skb, xtfs, pmtu, was_gso); XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOQSPACE); kfree_skb_reason(skb, SKB_DROP_REASON_FULL_RING); continue; @@ -1839,6 +1887,7 @@ static int iptfs_output_collect(struct net *net, struct sock *sk, struct sk_buff * enqueue. */ if (xtfs->cfg.dont_frag && iptfs_is_too_big(sk, skb, pmtu)) { + trace_iptfs_too_big(skb, xtfs, pmtu, was_gso); kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG); continue; } @@ -1847,11 +1896,16 @@ static int iptfs_output_collect(struct net *net, struct sock *sk, struct sk_buff ok = iptfs_enqueue(xtfs, skb); if (!ok) goto nospace; + + trace_iptfs_enqueue(skb, xtfs, pmtu, was_gso); } /* Start a delay timer if we don't have one yet */ - if (!hrtimer_is_queued(&xtfs->iptfs_timer)) + if (!hrtimer_is_queued(&xtfs->iptfs_timer)) { hrtimer_start(&xtfs->iptfs_timer, xtfs->init_delay_ns, IPTFS_HRTIMER_MODE); + xtfs->iptfs_settime = ktime_get_raw_fast_ns(); + trace_iptfs_timer_start(xtfs, xtfs->init_delay_ns); + } spin_unlock_bh(&x->lock); return 0; @@ -1934,6 +1988,7 @@ static int iptfs_copy_create_frags(struct sk_buff **skbp, struct xfrm_iptfs_data struct sk_buff *nskb = *skbp; u32 copy_len, offset; u32 to_copy = skb->len - mtu; + u32 blkoff = 0; int err = 0; INIT_LIST_HEAD(&sublist); @@ -1945,6 +2000,7 @@ static int iptfs_copy_create_frags(struct sk_buff **skbp, struct xfrm_iptfs_data to_copy = skb->len - offset; while (to_copy) { /* Send all but last fragment to allow agg. append */ + trace_iptfs_first_fragmenting(nskb, mtu, to_copy, NULL); list_add_tail(&nskb->list, &sublist); /* FUTURE: if the packet has an odd/non-aligning length we could @@ -1963,11 +2019,14 @@ static int iptfs_copy_create_frags(struct sk_buff **skbp, struct xfrm_iptfs_data iptfs_output_prepare_skb(nskb, to_copy); offset += copy_len; to_copy -= copy_len; + blkoff = to_copy; } skb_abort_seq_read(&skbseq); /* return last fragment that will be unsent (or NULL) */ *skbp = nskb; + if (nskb) + trace_iptfs_first_final_fragment(nskb, mtu, blkoff, NULL); /* trim the original skb to MTU */ if (!err) @@ -2042,6 +2101,8 @@ static int iptfs_first_skb(struct sk_buff **skbp, struct xfrm_iptfs_data *xtfs, /* We've split gso up before queuing */ + trace_iptfs_first_dequeue(skb, mtu, 0, ip_hdr(skb)); + /* Consider the buffer Tx'd and no longer owned */ skb_orphan(skb); @@ -2137,6 +2198,7 @@ static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list) */ XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTERROR); + trace_iptfs_first_toobig(skb, mtu, 0, ip_hdr(skb)); kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG); continue; } @@ -2183,6 +2245,7 @@ static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list) * case. */ while ((skb2 = skb_peek(list))) { + trace_iptfs_ingress_nth_peek(skb2, remaining); if (skb2->len > remaining) break; @@ -2220,6 +2283,8 @@ static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list) skb->len += skb2->len; remaining -= skb2->len; + trace_iptfs_ingress_nth_add(skb2, share_ok); + if (share_ok) { iptfs_consume_frags(skb, skb2); } else { @@ -2242,6 +2307,7 @@ static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me) struct sk_buff_head list; struct xfrm_iptfs_data *xtfs; struct xfrm_state *x; + time64_t settime; xtfs = container_of(me, typeof(*xtfs), iptfs_timer); x = xtfs->x; @@ -2258,6 +2324,7 @@ static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me) __skb_queue_head_init(&list); skb_queue_splice_init(&xtfs->queue, &list); xtfs->queue_size = 0; + settime = xtfs->iptfs_settime; spin_unlock(&x->lock); /* After the above unlock, packets can begin queuing again, and the @@ -2266,6 +2333,8 @@ static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me) * already). */ + trace_iptfs_timer_expire(xtfs, (unsigned long long)(ktime_get_raw_fast_ns() - settime)); + iptfs_output_queued(x, &list); return HRTIMER_NORESTART; From f9a5b34f9251cf530fecf08ef039be64ead8c459 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Thu, 5 Dec 2024 00:09:21 +0200 Subject: [PATCH 0173/1450] net/mlx5: ifc: Reorganize mlx5_ifc_flow_table_context_bits The nested union at the end is not in the same style as the rest of the code, so un-nest it to make the style uniformly applied again. Signed-off-by: Cosmin Ratiu Reviewed-by: Saeed Mahameed Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241204220931.254964-2-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 4fbbcf35498bb..f3650f989e686 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -6324,6 +6324,20 @@ struct mlx5_ifc_modify_other_hca_cap_in_bits { struct mlx5_ifc_other_hca_cap_bits other_capability; }; +struct mlx5_ifc_sw_owner_icm_root_params_bits { + u8 sw_owner_icm_root_1[0x40]; + + u8 sw_owner_icm_root_0[0x40]; +}; + +struct mlx5_ifc_rtc_params_bits { + u8 rtc_id_0[0x20]; + + u8 rtc_id_1[0x20]; + + u8 reserved_at_40[0x40]; +}; + struct mlx5_ifc_flow_table_context_bits { u8 reformat_en[0x1]; u8 decap_en[0x1]; @@ -6342,20 +6356,10 @@ struct mlx5_ifc_flow_table_context_bits { u8 lag_master_next_table_id[0x18]; u8 reserved_at_60[0x60]; - union { - struct { - u8 sw_owner_icm_root_1[0x40]; - - u8 sw_owner_icm_root_0[0x40]; - } sws; - struct { - u8 rtc_id_0[0x20]; - u8 rtc_id_1[0x20]; - - u8 reserved_at_100[0x40]; - - } hws; + union { + struct mlx5_ifc_sw_owner_icm_root_params_bits sws; + struct mlx5_ifc_rtc_params_bits hws; }; }; From e799ac9dd3c485a7cda3586f2a12784b030b9df0 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Thu, 5 Dec 2024 00:09:22 +0200 Subject: [PATCH 0174/1450] net/mlx5: Add ConnectX-8 device to ifc In preparation for ConnectX-8 SWS support, add enum for the new device type. Signed-off-by: Yevgeny Kliteynik Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241204220931.254964-3-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index f3650f989e686..bd9b1833408eb 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1590,6 +1590,7 @@ enum { MLX5_STEERING_FORMAT_CONNECTX_5 = 0, MLX5_STEERING_FORMAT_CONNECTX_6DX = 1, MLX5_STEERING_FORMAT_CONNECTX_7 = 2, + MLX5_STEERING_FORMAT_CONNECTX_8 = 3, }; struct mlx5_ifc_cmd_hca_cap_bits { From 03713108e0cccf325bb71941edd9ed6122142907 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Thu, 5 Dec 2024 00:09:23 +0200 Subject: [PATCH 0175/1450] net/mlx5: Add support for new scheduling elements Introduce new scheduling elements in the E-Switch QoS hierarchy to enhance traffic management capabilities. This patch adds support for: - Rate Limit scheduling elements: Enables bandwidth limitation across multiple nodes without a shared ancestor, providing a mechanism for more granular control of bandwidth allocation. - Traffic Class Transmit Scheduling Arbiter (TSAR): Introduces the infrastructure for creating Traffic Class TSARs, allowing hierarchical arbitration based on traffic classes. - Traffic Class Arbiter TSAR: Adds support for a TSAR capable of managing arbitration between multiple traffic classes, enabling improved bandwidth prioritization and traffic management. No functional changes are introduced in this patch. Signed-off-by: Carolina Jubran Reviewed-by: Cosmin Ratiu Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241204220931.254964-4-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/rl.c | 4 ++++ include/linux/mlx5/mlx5_ifc.h | 14 +++++++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/rl.c b/drivers/net/ethernet/mellanox/mlx5/core/rl.c index e393391966e0f..39a209b9b684b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/rl.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/rl.c @@ -56,6 +56,8 @@ bool mlx5_qos_tsar_type_supported(struct mlx5_core_dev *dev, int type, u8 hierar return cap & TSAR_TYPE_CAP_MASK_ROUND_ROBIN; case TSAR_ELEMENT_TSAR_TYPE_ETS: return cap & TSAR_TYPE_CAP_MASK_ETS; + case TSAR_ELEMENT_TSAR_TYPE_TC_ARB: + return cap & TSAR_TYPE_CAP_MASK_TC_ARB; } return false; @@ -87,6 +89,8 @@ bool mlx5_qos_element_type_supported(struct mlx5_core_dev *dev, int type, u8 hie return cap & ELEMENT_TYPE_CAP_MASK_PARA_VPORT_TC; case SCHEDULING_CONTEXT_ELEMENT_TYPE_QUEUE_GROUP: return cap & ELEMENT_TYPE_CAP_MASK_QUEUE_GROUP; + case SCHEDULING_CONTEXT_ELEMENT_TYPE_RATE_LIMIT: + return cap & ELEMENT_TYPE_CAP_MASK_RATE_LIMIT; } return false; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index bd9b1833408eb..8b202521b774e 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1103,7 +1103,8 @@ struct mlx5_ifc_qos_cap_bits { u8 packet_pacing_min_rate[0x20]; - u8 reserved_at_80[0x10]; + u8 reserved_at_80[0xb]; + u8 log_esw_max_rate_limit[0x5]; u8 packet_pacing_rate_table_size[0x10]; u8 esw_element_type[0x10]; @@ -4104,6 +4105,7 @@ enum { SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT_TC = 0x2, SCHEDULING_CONTEXT_ELEMENT_TYPE_PARA_VPORT_TC = 0x3, SCHEDULING_CONTEXT_ELEMENT_TYPE_QUEUE_GROUP = 0x4, + SCHEDULING_CONTEXT_ELEMENT_TYPE_RATE_LIMIT = 0x5, }; enum { @@ -4112,22 +4114,26 @@ enum { ELEMENT_TYPE_CAP_MASK_VPORT_TC = 1 << 2, ELEMENT_TYPE_CAP_MASK_PARA_VPORT_TC = 1 << 3, ELEMENT_TYPE_CAP_MASK_QUEUE_GROUP = 1 << 4, + ELEMENT_TYPE_CAP_MASK_RATE_LIMIT = 1 << 5, }; enum { TSAR_ELEMENT_TSAR_TYPE_DWRR = 0x0, TSAR_ELEMENT_TSAR_TYPE_ROUND_ROBIN = 0x1, TSAR_ELEMENT_TSAR_TYPE_ETS = 0x2, + TSAR_ELEMENT_TSAR_TYPE_TC_ARB = 0x3, }; enum { TSAR_TYPE_CAP_MASK_DWRR = 1 << 0, TSAR_TYPE_CAP_MASK_ROUND_ROBIN = 1 << 1, TSAR_TYPE_CAP_MASK_ETS = 1 << 2, + TSAR_TYPE_CAP_MASK_TC_ARB = 1 << 3, }; struct mlx5_ifc_tsar_element_bits { - u8 reserved_at_0[0x8]; + u8 traffic_class[0x4]; + u8 reserved_at_4[0x4]; u8 tsar_type[0x8]; u8 reserved_at_10[0x10]; }; @@ -4164,7 +4170,9 @@ struct mlx5_ifc_scheduling_context_bits { u8 max_average_bw[0x20]; - u8 reserved_at_e0[0x120]; + u8 max_bw_obj_id[0x20]; + + u8 reserved_at_100[0x100]; }; struct mlx5_ifc_rqtc_bits { From f09ed834a946f9c77088d53af4d4806974728d7b Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Thu, 5 Dec 2024 00:09:24 +0200 Subject: [PATCH 0176/1450] net/mlx5: qos: Add ifc support for cross-esw scheduling This adds the capability bit and the vport element fields related to cross-esw scheduling. Signed-off-by: Cosmin Ratiu Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241204220931.254964-5-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 8b202521b774e..5451ff1d43560 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1095,7 +1095,9 @@ struct mlx5_ifc_qos_cap_bits { u8 log_esw_max_sched_depth[0x4]; u8 reserved_at_10[0x10]; - u8 reserved_at_20[0xb]; + u8 reserved_at_20[0x9]; + u8 esw_cross_esw_sched[0x1]; + u8 reserved_at_2a[0x1]; u8 log_max_qos_nic_queue_group[0x5]; u8 reserved_at_30[0x10]; @@ -4139,13 +4141,16 @@ struct mlx5_ifc_tsar_element_bits { }; struct mlx5_ifc_vport_element_bits { - u8 reserved_at_0[0x10]; + u8 reserved_at_0[0x4]; + u8 eswitch_owner_vhca_id_valid[0x1]; + u8 eswitch_owner_vhca_id[0xb]; u8 vport_number[0x10]; }; struct mlx5_ifc_vport_tc_element_bits { u8 traffic_class[0x4]; - u8 reserved_at_4[0xc]; + u8 eswitch_owner_vhca_id_valid[0x1]; + u8 eswitch_owner_vhca_id[0xb]; u8 vport_number[0x10]; }; From d8e4771f99c0400a1873235704b28bb803c83d17 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 23 Oct 2024 11:40:56 +0300 Subject: [PATCH 0177/1450] mtd: rawnand: fix double free in atmel_pmecc_create_user() The "user" pointer was converted from being allocated with kzalloc() to being allocated by devm_kzalloc(). Calling kfree(user) will lead to a double free. Fixes: 6d734f1bfc33 ("mtd: rawnand: atmel: Fix possible memory leak") Signed-off-by: Dan Carpenter Signed-off-by: Miquel Raynal --- drivers/mtd/nand/raw/atmel/pmecc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/mtd/nand/raw/atmel/pmecc.c b/drivers/mtd/nand/raw/atmel/pmecc.c index a22aab4ed4e8a..3c7dee1be21df 100644 --- a/drivers/mtd/nand/raw/atmel/pmecc.c +++ b/drivers/mtd/nand/raw/atmel/pmecc.c @@ -380,10 +380,8 @@ atmel_pmecc_create_user(struct atmel_pmecc *pmecc, user->delta = user->dmu + req->ecc.strength + 1; gf_tables = atmel_pmecc_get_gf_tables(req); - if (IS_ERR(gf_tables)) { - kfree(user); + if (IS_ERR(gf_tables)) return ERR_CAST(gf_tables); - } user->gf_tables = gf_tables; From 9b458e8be0d13e81ed03fffa23f8f9b528bbd786 Mon Sep 17 00:00:00 2001 From: Zichen Xie Date: Wed, 23 Oct 2024 16:13:10 -0500 Subject: [PATCH 0178/1450] mtd: diskonchip: Cast an operand to prevent potential overflow There may be a potential integer overflow issue in inftl_partscan(). parts[0].size is defined as "uint64_t" while mtd->erasesize and ip->firstUnit are defined as 32-bit unsigned integer. The result of the calculation will be limited to 32 bits without correct casting. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Zichen Xie Cc: stable@vger.kernel.org Signed-off-by: Miquel Raynal --- drivers/mtd/nand/raw/diskonchip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/nand/raw/diskonchip.c b/drivers/mtd/nand/raw/diskonchip.c index 8db7fc4245711..70d6c2250f32c 100644 --- a/drivers/mtd/nand/raw/diskonchip.c +++ b/drivers/mtd/nand/raw/diskonchip.c @@ -1098,7 +1098,7 @@ static inline int __init inftl_partscan(struct mtd_info *mtd, struct mtd_partiti (i == 0) && (ip->firstUnit > 0)) { parts[0].name = " DiskOnChip IPL / Media Header partition"; parts[0].offset = 0; - parts[0].size = mtd->erasesize * ip->firstUnit; + parts[0].size = (uint64_t)mtd->erasesize * ip->firstUnit; numparts = 1; } From b086a46dae48829e11c0c02580e30d920b76743c Mon Sep 17 00:00:00 2001 From: Maciej Andrzejewski Date: Mon, 2 Dec 2024 13:51:07 +0100 Subject: [PATCH 0179/1450] mtd: rawnand: arasan: Fix double assertion of chip-select When two chip-selects are configured in the device tree, and the second is a non-native GPIO, both the GPIO-based chip-select and the first native chip-select may be asserted simultaneously. This double assertion causes incorrect read and write operations. The issue occurs because when nfc->ncs <= 2, nfc->spare_cs is always initialized to 0 due to static initialization. Consequently, when the second chip-select (GPIO-based) is selected in anfc_assert_cs(), it is detected by anfc_is_gpio_cs(), and nfc->native_cs is assigned the value 0. This results in both the GPIO-based chip-select being asserted and the NAND controller register receiving 0, erroneously selecting the native chip-select. This patch resolves the issue, as confirmed by oscilloscope testing with configurations involving two or more chip-selects in the device tree. Fixes: acbd3d0945f9 ("mtd: rawnand: arasan: Leverage additional GPIO CS") Cc: stable@vger.kernel.org Signed-off-by: Maciej Andrzejewski Signed-off-by: Miquel Raynal --- drivers/mtd/nand/raw/arasan-nand-controller.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mtd/nand/raw/arasan-nand-controller.c b/drivers/mtd/nand/raw/arasan-nand-controller.c index db42aa0c7b6b5..26b506107a1a3 100644 --- a/drivers/mtd/nand/raw/arasan-nand-controller.c +++ b/drivers/mtd/nand/raw/arasan-nand-controller.c @@ -1409,8 +1409,8 @@ static int anfc_parse_cs(struct arasan_nfc *nfc) * case, the "not" chosen CS is assigned to nfc->spare_cs and selected * whenever a GPIO CS must be asserted. */ - if (nfc->cs_array && nfc->ncs > 2) { - if (!nfc->cs_array[0] && !nfc->cs_array[1]) { + if (nfc->cs_array) { + if (nfc->ncs > 2 && !nfc->cs_array[0] && !nfc->cs_array[1]) { dev_err(nfc->dev, "Assign a single native CS when using GPIOs\n"); return -EINVAL; From 11e6831fd81468cf48155b9b3c11295c391da723 Mon Sep 17 00:00:00 2001 From: Maciej Andrzejewski Date: Mon, 2 Dec 2024 19:58:36 +0100 Subject: [PATCH 0180/1450] mtd: rawnand: arasan: Fix missing de-registration of NAND The NAND chip-selects are registered for the Arasan driver during initialization but are not de-registered when the driver is unloaded. As a result, if the driver is loaded again, the chip-selects remain registered and busy, making them unavailable for use. Fixes: 197b88fecc50 ("mtd: rawnand: arasan: Add new Arasan NAND controller") Cc: stable@vger.kernel.org Signed-off-by: Maciej Andrzejewski ICEYE Signed-off-by: Miquel Raynal --- drivers/mtd/nand/raw/arasan-nand-controller.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/mtd/nand/raw/arasan-nand-controller.c b/drivers/mtd/nand/raw/arasan-nand-controller.c index 26b506107a1a3..865754737f5fc 100644 --- a/drivers/mtd/nand/raw/arasan-nand-controller.c +++ b/drivers/mtd/nand/raw/arasan-nand-controller.c @@ -1478,8 +1478,15 @@ static int anfc_probe(struct platform_device *pdev) static void anfc_remove(struct platform_device *pdev) { + int i; struct arasan_nfc *nfc = platform_get_drvdata(pdev); + for (i = 0; i < nfc->ncs; i++) { + if (nfc->cs_array[i]) { + gpiod_put(nfc->cs_array[i]); + } + } + anfc_chips_cleanup(nfc); } From 140054a25f85036ec847e722c76cc1bfaf3f0d96 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Tue, 3 Dec 2024 15:33:17 +0200 Subject: [PATCH 0181/1450] mtd: rawnand: omap2: Fix build warnings with W=1 Add kernel-doc for functions to get rid of below warnings when built with W=1. drivers/mtd/nand/raw/omap2.c:260: warning: Function parameter or struct member 'chip' not described in 'omap_nand_data_in_pref' drivers/mtd/nand/raw/omap2.c:260: warning: Function parameter or struct member 'buf' not described in 'omap_nand_data_in_pref' drivers/mtd/nand/raw/omap2.c:260: warning: Function parameter or struct member 'len' not described in 'omap_nand_data_in_pref' drivers/mtd/nand/raw/omap2.c:260: warning: Function parameter or struct member 'force_8bit' not described in 'omap_nand_data_in_pref' drivers/mtd/nand/raw/omap2.c:304: warning: Function parameter or struct member 'chip' not described in 'omap_nand_data_out_pref' drivers/mtd/nand/raw/omap2.c:304: warning: Function parameter or struct member 'buf' not described in 'omap_nand_data_out_pref' drivers/mtd/nand/raw/omap2.c:304: warning: Function parameter or struct member 'len' not described in 'omap_nand_data_out_pref' drivers/mtd/nand/raw/omap2.c:304: warning: Function parameter or struct member 'force_8bit' not described in 'omap_nand_data_out_pref' drivers/mtd/nand/raw/omap2.c:446: warning: Function parameter or struct member 'chip' not described in 'omap_nand_data_in_dma_pref' drivers/mtd/nand/raw/omap2.c:446: warning: Function parameter or struct member 'buf' not described in 'omap_nand_data_in_dma_pref' drivers/mtd/nand/raw/omap2.c:446: warning: Function parameter or struct member 'len' not described in 'omap_nand_data_in_dma_pref' drivers/mtd/nand/raw/omap2.c:446: warning: Function parameter or struct member 'force_8bit' not described in 'omap_nand_data_in_dma_pref' drivers/mtd/nand/raw/omap2.c:467: warning: Function parameter or struct member 'chip' not described in 'omap_nand_data_out_dma_pref' drivers/mtd/nand/raw/omap2.c:467: warning: Function parameter or struct member 'buf' not described in 'omap_nand_data_out_dma_pref' drivers/mtd/nand/raw/omap2.c:467: warning: Function parameter or struct member 'len' not described in 'omap_nand_data_out_dma_pref' drivers/mtd/nand/raw/omap2.c:467: warning: Function parameter or struct member 'force_8bit' not described in 'omap_nand_data_out_dma_pref' Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202412031716.JfNIh1Uu-lkp@intel.com/ Signed-off-by: Roger Quadros Signed-off-by: Miquel Raynal --- drivers/mtd/nand/raw/omap2.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/mtd/nand/raw/omap2.c b/drivers/mtd/nand/raw/omap2.c index d9141f3c0dd16..b8af3a3533fc5 100644 --- a/drivers/mtd/nand/raw/omap2.c +++ b/drivers/mtd/nand/raw/omap2.c @@ -254,6 +254,10 @@ static int omap_prefetch_reset(int cs, struct omap_nand_info *info) /** * omap_nand_data_in_pref - NAND data in using prefetch engine + * @chip: NAND chip + * @buf: output buffer where NAND data is placed into + * @len: length of transfer + * @force_8bit: force 8-bit transfers */ static void omap_nand_data_in_pref(struct nand_chip *chip, void *buf, unsigned int len, bool force_8bit) @@ -297,6 +301,10 @@ static void omap_nand_data_in_pref(struct nand_chip *chip, void *buf, /** * omap_nand_data_out_pref - NAND data out using Write Posting engine + * @chip: NAND chip + * @buf: input buffer that is sent to NAND + * @len: length of transfer + * @force_8bit: force 8-bit transfers */ static void omap_nand_data_out_pref(struct nand_chip *chip, const void *buf, unsigned int len, @@ -440,6 +448,10 @@ static inline int omap_nand_dma_transfer(struct nand_chip *chip, /** * omap_nand_data_in_dma_pref - NAND data in using DMA and Prefetch + * @chip: NAND chip + * @buf: output buffer where NAND data is placed into + * @len: length of transfer + * @force_8bit: force 8-bit transfers */ static void omap_nand_data_in_dma_pref(struct nand_chip *chip, void *buf, unsigned int len, bool force_8bit) @@ -460,6 +472,10 @@ static void omap_nand_data_in_dma_pref(struct nand_chip *chip, void *buf, /** * omap_nand_data_out_dma_pref - NAND data out using DMA and write posting + * @chip: NAND chip + * @buf: input buffer that is sent to NAND + * @len: length of transfer + * @force_8bit: force 8-bit transfers */ static void omap_nand_data_out_dma_pref(struct nand_chip *chip, const void *buf, unsigned int len, From 0600cf40e9b36fe17f9c9f04d4f9cef249eaa5e7 Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Tue, 3 Dec 2024 13:49:42 +0100 Subject: [PATCH 0182/1450] include: net: add static inline dst_dev_overhead() to dst.h Add static inline dst_dev_overhead() function to include/net/dst.h. This helper function is used by ioam6_iptunnel, rpl_iptunnel and seg6_iptunnel to get the dev's overhead based on a cache entry (dst_entry). If the cache is empty, the default and generic value skb->mac_len is returned. Otherwise, LL_RESERVED_SPACE() over dst's dev is returned. Signed-off-by: Justin Iurman Cc: Alexander Lobakin Cc: Vadim Fedorenko Signed-off-by: Paolo Abeni --- include/net/dst.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/net/dst.h b/include/net/dst.h index 0f303cc602520..08647c99d79c9 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -440,6 +440,15 @@ static inline void dst_set_expires(struct dst_entry *dst, int timeout) dst->expires = expires; } +static inline unsigned int dst_dev_overhead(struct dst_entry *dst, + struct sk_buff *skb) +{ + if (likely(dst)) + return LL_RESERVED_SPACE(dst->dev); + + return skb->mac_len; +} + INDIRECT_CALLABLE_DECLARE(int ip6_output(struct net *, struct sock *, struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int ip_output(struct net *, struct sock *, From dce525185bc92864e5a318040285ee070563fe34 Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Tue, 3 Dec 2024 13:49:43 +0100 Subject: [PATCH 0183/1450] net: ipv6: ioam6_iptunnel: mitigate 2-realloc issue This patch mitigates the two-reallocations issue with ioam6_iptunnel by providing the dst_entry (in the cache) to the first call to skb_cow_head(). As a result, the very first iteration may still trigger two reallocations (i.e., empty cache), while next iterations would only trigger a single reallocation. Performance tests before/after applying this patch, which clearly shows the improvement: - inline mode: - before: https://ibb.co/LhQ8V63 - after: https://ibb.co/x5YT2bS - encap mode: - before: https://ibb.co/3Cjm5m0 - after: https://ibb.co/TwpsxTC - encap mode with tunsrc: - before: https://ibb.co/Gpy9QPg - after: https://ibb.co/PW1bZFT This patch also fixes an incorrect behavior: after the insertion, the second call to skb_cow_head() makes sure that the dev has enough headroom in the skb for layer 2 and stuff. In that case, the "old" dst_entry was used, which is now fixed. After discussing with Paolo, it appears that both patches can be merged into a single one -this one- (for the sake of readability) and target net-next. Signed-off-by: Justin Iurman Signed-off-by: Paolo Abeni --- net/ipv6/ioam6_iptunnel.c | 73 ++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 36 deletions(-) diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c index 9d8422e350f8d..28e5a89dc2557 100644 --- a/net/ipv6/ioam6_iptunnel.c +++ b/net/ipv6/ioam6_iptunnel.c @@ -253,14 +253,15 @@ static int ioam6_do_fill(struct net *net, struct sk_buff *skb) } static int ioam6_do_inline(struct net *net, struct sk_buff *skb, - struct ioam6_lwt_encap *tuninfo) + struct ioam6_lwt_encap *tuninfo, + struct dst_entry *cache_dst) { struct ipv6hdr *oldhdr, *hdr; int hdrlen, err; hdrlen = (tuninfo->eh.hdrlen + 1) << 3; - err = skb_cow_head(skb, hdrlen + skb->mac_len); + err = skb_cow_head(skb, hdrlen + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) return err; @@ -291,7 +292,8 @@ static int ioam6_do_encap(struct net *net, struct sk_buff *skb, struct ioam6_lwt_encap *tuninfo, bool has_tunsrc, struct in6_addr *tunsrc, - struct in6_addr *tundst) + struct in6_addr *tundst, + struct dst_entry *cache_dst) { struct dst_entry *dst = skb_dst(skb); struct ipv6hdr *hdr, *inner_hdr; @@ -300,7 +302,7 @@ static int ioam6_do_encap(struct net *net, struct sk_buff *skb, hdrlen = (tuninfo->eh.hdrlen + 1) << 3; len = sizeof(*hdr) + hdrlen; - err = skb_cow_head(skb, len + skb->mac_len); + err = skb_cow_head(skb, len + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) return err; @@ -334,7 +336,7 @@ static int ioam6_do_encap(struct net *net, struct sk_buff *skb, static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb); + struct dst_entry *dst = skb_dst(skb), *cache_dst; struct in6_addr orig_daddr; struct ioam6_lwt *ilwt; int err = -EINVAL; @@ -352,6 +354,10 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) orig_daddr = ipv6_hdr(skb)->daddr; + local_bh_disable(); + cache_dst = dst_cache_get(&ilwt->cache); + local_bh_enable(); + switch (ilwt->mode) { case IOAM6_IPTUNNEL_MODE_INLINE: do_inline: @@ -359,7 +365,7 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP) goto out; - err = ioam6_do_inline(net, skb, &ilwt->tuninfo); + err = ioam6_do_inline(net, skb, &ilwt->tuninfo, cache_dst); if (unlikely(err)) goto drop; @@ -369,7 +375,7 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) /* Encapsulation (ip6ip6) */ err = ioam6_do_encap(net, skb, &ilwt->tuninfo, ilwt->has_tunsrc, &ilwt->tunsrc, - &ilwt->tundst); + &ilwt->tundst, cache_dst); if (unlikely(err)) goto drop; @@ -387,41 +393,36 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) goto drop; } - err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); - if (unlikely(err)) - goto drop; + if (unlikely(!cache_dst)) { + struct ipv6hdr *hdr = ipv6_hdr(skb); + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + fl6.daddr = hdr->daddr; + fl6.saddr = hdr->saddr; + fl6.flowlabel = ip6_flowinfo(hdr); + fl6.flowi6_mark = skb->mark; + fl6.flowi6_proto = hdr->nexthdr; + + cache_dst = ip6_route_output(net, NULL, &fl6); + if (cache_dst->error) { + err = cache_dst->error; + dst_release(cache_dst); + goto drop; + } - if (!ipv6_addr_equal(&orig_daddr, &ipv6_hdr(skb)->daddr)) { local_bh_disable(); - dst = dst_cache_get(&ilwt->cache); + dst_cache_set_ip6(&ilwt->cache, cache_dst, &fl6.saddr); local_bh_enable(); - if (unlikely(!dst)) { - struct ipv6hdr *hdr = ipv6_hdr(skb); - struct flowi6 fl6; - - memset(&fl6, 0, sizeof(fl6)); - fl6.daddr = hdr->daddr; - fl6.saddr = hdr->saddr; - fl6.flowlabel = ip6_flowinfo(hdr); - fl6.flowi6_mark = skb->mark; - fl6.flowi6_proto = hdr->nexthdr; - - dst = ip6_route_output(net, NULL, &fl6); - if (dst->error) { - err = dst->error; - dst_release(dst); - goto drop; - } - - local_bh_disable(); - dst_cache_set_ip6(&ilwt->cache, dst, &fl6.saddr); - local_bh_enable(); - } + err = skb_cow_head(skb, LL_RESERVED_SPACE(cache_dst->dev)); + if (unlikely(err)) + goto drop; + } + if (!ipv6_addr_equal(&orig_daddr, &ipv6_hdr(skb)->daddr)) { skb_dst_drop(skb); - skb_dst_set(skb, dst); - + skb_dst_set(skb, cache_dst); return dst_output(net, sk, skb); } out: From 40475b63761abb6f8fdef960d03228a08662c9c4 Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Tue, 3 Dec 2024 13:49:44 +0100 Subject: [PATCH 0184/1450] net: ipv6: seg6_iptunnel: mitigate 2-realloc issue This patch mitigates the two-reallocations issue with seg6_iptunnel by providing the dst_entry (in the cache) to the first call to skb_cow_head(). As a result, the very first iteration would still trigger two reallocations (i.e., empty cache), while next iterations would only trigger a single reallocation. Performance tests before/after applying this patch, which clearly shows the improvement: - before: https://ibb.co/3Cg4sNH - after: https://ibb.co/8rQ350r Signed-off-by: Justin Iurman Cc: David Lebrun Signed-off-by: Paolo Abeni --- net/ipv6/seg6_iptunnel.c | 85 ++++++++++++++++++++++++---------------- 1 file changed, 52 insertions(+), 33 deletions(-) diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index 098632adc9b5a..4bf937bfc2633 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -124,8 +124,8 @@ static __be32 seg6_make_flowlabel(struct net *net, struct sk_buff *skb, return flowlabel; } -/* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */ -int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) +static int __seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, + int proto, struct dst_entry *cache_dst) { struct dst_entry *dst = skb_dst(skb); struct net *net = dev_net(dst->dev); @@ -137,7 +137,7 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) hdrlen = (osrh->hdrlen + 1) << 3; tot_len = hdrlen + sizeof(*hdr); - err = skb_cow_head(skb, tot_len + skb->mac_len); + err = skb_cow_head(skb, tot_len + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) return err; @@ -197,11 +197,18 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) return 0; } + +/* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */ +int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) +{ + return __seg6_do_srh_encap(skb, osrh, proto, NULL); +} EXPORT_SYMBOL_GPL(seg6_do_srh_encap); /* encapsulate an IPv6 packet within an outer IPv6 header with reduced SRH */ static int seg6_do_srh_encap_red(struct sk_buff *skb, - struct ipv6_sr_hdr *osrh, int proto) + struct ipv6_sr_hdr *osrh, int proto, + struct dst_entry *cache_dst) { __u8 first_seg = osrh->first_segment; struct dst_entry *dst = skb_dst(skb); @@ -230,7 +237,7 @@ static int seg6_do_srh_encap_red(struct sk_buff *skb, tot_len = red_hdrlen + sizeof(struct ipv6hdr); - err = skb_cow_head(skb, tot_len + skb->mac_len); + err = skb_cow_head(skb, tot_len + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) return err; @@ -317,8 +324,8 @@ static int seg6_do_srh_encap_red(struct sk_buff *skb, return 0; } -/* insert an SRH within an IPv6 packet, just after the IPv6 header */ -int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) +static int __seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, + struct dst_entry *cache_dst) { struct ipv6hdr *hdr, *oldhdr; struct ipv6_sr_hdr *isrh; @@ -326,7 +333,7 @@ int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) hdrlen = (osrh->hdrlen + 1) << 3; - err = skb_cow_head(skb, hdrlen + skb->mac_len); + err = skb_cow_head(skb, hdrlen + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) return err; @@ -369,9 +376,8 @@ int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) return 0; } -EXPORT_SYMBOL_GPL(seg6_do_srh_inline); -static int seg6_do_srh(struct sk_buff *skb) +static int seg6_do_srh(struct sk_buff *skb, struct dst_entry *cache_dst) { struct dst_entry *dst = skb_dst(skb); struct seg6_iptunnel_encap *tinfo; @@ -384,7 +390,7 @@ static int seg6_do_srh(struct sk_buff *skb) if (skb->protocol != htons(ETH_P_IPV6)) return -EINVAL; - err = seg6_do_srh_inline(skb, tinfo->srh); + err = __seg6_do_srh_inline(skb, tinfo->srh, cache_dst); if (err) return err; break; @@ -402,9 +408,11 @@ static int seg6_do_srh(struct sk_buff *skb) return -EINVAL; if (tinfo->mode == SEG6_IPTUN_MODE_ENCAP) - err = seg6_do_srh_encap(skb, tinfo->srh, proto); + err = __seg6_do_srh_encap(skb, tinfo->srh, + proto, cache_dst); else - err = seg6_do_srh_encap_red(skb, tinfo->srh, proto); + err = seg6_do_srh_encap_red(skb, tinfo->srh, + proto, cache_dst); if (err) return err; @@ -425,11 +433,13 @@ static int seg6_do_srh(struct sk_buff *skb) skb_push(skb, skb->mac_len); if (tinfo->mode == SEG6_IPTUN_MODE_L2ENCAP) - err = seg6_do_srh_encap(skb, tinfo->srh, - IPPROTO_ETHERNET); + err = __seg6_do_srh_encap(skb, tinfo->srh, + IPPROTO_ETHERNET, + cache_dst); else err = seg6_do_srh_encap_red(skb, tinfo->srh, - IPPROTO_ETHERNET); + IPPROTO_ETHERNET, + cache_dst); if (err) return err; @@ -444,6 +454,13 @@ static int seg6_do_srh(struct sk_buff *skb) return 0; } +/* insert an SRH within an IPv6 packet, just after the IPv6 header */ +int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) +{ + return __seg6_do_srh_inline(skb, osrh, NULL); +} +EXPORT_SYMBOL_GPL(seg6_do_srh_inline); + static int seg6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { @@ -458,31 +475,33 @@ static int seg6_input_core(struct net *net, struct sock *sk, struct seg6_lwt *slwt; int err; - err = seg6_do_srh(skb); - if (unlikely(err)) - goto drop; - slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate); local_bh_disable(); dst = dst_cache_get(&slwt->cache); + local_bh_enable(); + + err = seg6_do_srh(skb, dst); + if (unlikely(err)) + goto drop; if (!dst) { ip6_route_input(skb); dst = skb_dst(skb); if (!dst->error) { + local_bh_disable(); dst_cache_set_ip6(&slwt->cache, dst, &ipv6_hdr(skb)->saddr); + local_bh_enable(); } + + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + if (unlikely(err)) + goto drop; } else { skb_dst_drop(skb); skb_dst_set(skb, dst); } - local_bh_enable(); - - err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); - if (unlikely(err)) - goto drop; if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, @@ -528,16 +547,16 @@ static int seg6_output_core(struct net *net, struct sock *sk, struct seg6_lwt *slwt; int err; - err = seg6_do_srh(skb); - if (unlikely(err)) - goto drop; - slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate); local_bh_disable(); dst = dst_cache_get(&slwt->cache); local_bh_enable(); + err = seg6_do_srh(skb, dst); + if (unlikely(err)) + goto drop; + if (unlikely(!dst)) { struct ipv6hdr *hdr = ipv6_hdr(skb); struct flowi6 fl6; @@ -559,15 +578,15 @@ static int seg6_output_core(struct net *net, struct sock *sk, local_bh_disable(); dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr); local_bh_enable(); + + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + if (unlikely(err)) + goto drop; } skb_dst_drop(skb); skb_dst_set(skb, dst); - err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); - if (unlikely(err)) - goto drop; - if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); From 985ec6f5e6235242191370628acb73d7a9f0c0ea Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Tue, 3 Dec 2024 13:49:45 +0100 Subject: [PATCH 0185/1450] net: ipv6: rpl_iptunnel: mitigate 2-realloc issue This patch mitigates the two-reallocations issue with rpl_iptunnel by providing the dst_entry (in the cache) to the first call to skb_cow_head(). As a result, the very first iteration would still trigger two reallocations (i.e., empty cache), while next iterations would only trigger a single reallocation. Performance tests before/after applying this patch, which clearly shows there is no impact (it even shows improvement): - before: https://ibb.co/nQJhqwc - after: https://ibb.co/4ZvW6wV Signed-off-by: Justin Iurman Cc: Alexander Aring Signed-off-by: Paolo Abeni --- net/ipv6/rpl_iptunnel.c | 46 ++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c index db3c19a42e1ca..7ba22d2f2bfef 100644 --- a/net/ipv6/rpl_iptunnel.c +++ b/net/ipv6/rpl_iptunnel.c @@ -125,7 +125,8 @@ static void rpl_destroy_state(struct lwtunnel_state *lwt) } static int rpl_do_srh_inline(struct sk_buff *skb, const struct rpl_lwt *rlwt, - const struct ipv6_rpl_sr_hdr *srh) + const struct ipv6_rpl_sr_hdr *srh, + struct dst_entry *cache_dst) { struct ipv6_rpl_sr_hdr *isrh, *csrh; const struct ipv6hdr *oldhdr; @@ -153,7 +154,7 @@ static int rpl_do_srh_inline(struct sk_buff *skb, const struct rpl_lwt *rlwt, hdrlen = ((csrh->hdrlen + 1) << 3); - err = skb_cow_head(skb, hdrlen + skb->mac_len); + err = skb_cow_head(skb, hdrlen + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) { kfree(buf); return err; @@ -186,7 +187,8 @@ static int rpl_do_srh_inline(struct sk_buff *skb, const struct rpl_lwt *rlwt, return 0; } -static int rpl_do_srh(struct sk_buff *skb, const struct rpl_lwt *rlwt) +static int rpl_do_srh(struct sk_buff *skb, const struct rpl_lwt *rlwt, + struct dst_entry *cache_dst) { struct dst_entry *dst = skb_dst(skb); struct rpl_iptunnel_encap *tinfo; @@ -196,7 +198,7 @@ static int rpl_do_srh(struct sk_buff *skb, const struct rpl_lwt *rlwt) tinfo = rpl_encap_lwtunnel(dst->lwtstate); - return rpl_do_srh_inline(skb, rlwt, tinfo->srh); + return rpl_do_srh_inline(skb, rlwt, tinfo->srh, cache_dst); } static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) @@ -208,14 +210,14 @@ static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate); - err = rpl_do_srh(skb, rlwt); - if (unlikely(err)) - goto drop; - local_bh_disable(); dst = dst_cache_get(&rlwt->cache); local_bh_enable(); + err = rpl_do_srh(skb, rlwt, dst); + if (unlikely(err)) + goto drop; + if (unlikely(!dst)) { struct ipv6hdr *hdr = ipv6_hdr(skb); struct flowi6 fl6; @@ -237,15 +239,15 @@ static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) local_bh_disable(); dst_cache_set_ip6(&rlwt->cache, dst, &fl6.saddr); local_bh_enable(); + + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + if (unlikely(err)) + goto drop; } skb_dst_drop(skb); skb_dst_set(skb, dst); - err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); - if (unlikely(err)) - goto drop; - return dst_output(net, sk, skb); drop: @@ -262,29 +264,31 @@ static int rpl_input(struct sk_buff *skb) rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate); - err = rpl_do_srh(skb, rlwt); - if (unlikely(err)) - goto drop; - local_bh_disable(); dst = dst_cache_get(&rlwt->cache); + local_bh_enable(); + + err = rpl_do_srh(skb, rlwt, dst); + if (unlikely(err)) + goto drop; if (!dst) { ip6_route_input(skb); dst = skb_dst(skb); if (!dst->error) { + local_bh_disable(); dst_cache_set_ip6(&rlwt->cache, dst, &ipv6_hdr(skb)->saddr); + local_bh_enable(); } + + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + if (unlikely(err)) + goto drop; } else { skb_dst_drop(skb); skb_dst_set(skb, dst); } - local_bh_enable(); - - err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); - if (unlikely(err)) - goto drop; return dst_input(skb); From 4f776d81bf927a4f25d5e32a4d0df08ee509dd6c Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V (Arm)" Date: Thu, 28 Nov 2024 20:55:43 +0530 Subject: [PATCH 0186/1450] arm64: dts: fvp: Update PCIe bus-range property These days, the Fixed Virtual Platforms(FVP) Base RevC model supports more PCI devices. Update the max bus number so that Linux can enumerate them correctly. Without this, the kernel throws the below error while booting with the default hierarchy | pci_bus 0000:01: busn_res: [bus 01] end is updated to 01 | pci_bus 0000:02: busn_res: can not insert [bus 02-01] under | [bus 00-01] (conflicts with (null) [bus 00-01]) | pci_bus 0000:02: busn_res: [bus 02-01] end is updated to 02 | pci_bus 0000:02: busn_res: can not insert [bus 02] under | [bus 00-01] (conflicts with (null) [bus 00-01]) | pci_bus 0000:03: busn_res: can not insert [bus 03-01] under | [bus 00-01] (conflicts with (null) [bus 00-01]) | pci_bus 0000:03: busn_res: [bus 03-01] end is updated to 03 | pci_bus 0000:03: busn_res: can not insert [bus 03] under | [bus 00-01] (conflicts with (null) [bus 00-01]) | pci_bus 0000:04: busn_res: can not insert [bus 04-01] under | [bus 00-01] (conflicts with (null) [bus 00-01]) | pci_bus 0000:04: busn_res: [bus 04-01] end is updated to 04 | pci_bus 0000:04: busn_res: can not insert [bus 04] under | [bus 00-01] (conflicts with (null) [bus 00-01]) | pci 0000:00:01.0: BAR 14: assigned [mem 0x50000000-0x500fffff] | pci-host-generic 40000000.pci: ECAM at [mem 0x40000000-0x4fffffff] | for [bus 00-01] The change is using 0xff as max bus number because the ECAM window is 256MB in size. Below is the lspci output with and without the change: without fix =========== | 00:00.0 Host bridge: ARM Device 00ba (rev 01) | 00:01.0 PCI bridge: ARM Device 0def | 00:02.0 PCI bridge: ARM Device 0def | 00:03.0 PCI bridge: ARM Device 0def | 00:04.0 PCI bridge: ARM Device 0def | 00:1e.0 Unassigned class [ff00]: ARM Device ff80 | 00:1e.1 Unassigned class [ff00]: ARM Device ff80 | 00:1f.0 SATA controller: Device 0abc:aced (rev 01) | 01:00.0 SATA controller: Device 0abc:aced (rev 01) with fix ======== | 00:00.0 Host bridge: ARM Device 00ba (rev 01) | 00:01.0 PCI bridge: ARM Device 0def | 00:02.0 PCI bridge: ARM Device 0def | 00:03.0 PCI bridge: ARM Device 0def | 00:04.0 PCI bridge: ARM Device 0def | 00:1e.0 Unassigned class [ff00]: ARM Device ff80 | 00:1e.1 Unassigned class [ff00]: ARM Device ff80 | 00:1f.0 SATA controller: Device 0abc:aced (rev 01) | 01:00.0 SATA controller: Device 0abc:aced (rev 01) | 02:00.0 Unassigned class [ff00]: ARM Device ff80 | 02:00.4 Unassigned class [ff00]: ARM Device ff80 | 03:00.0 PCI bridge: ARM Device 0def | 04:00.0 PCI bridge: ARM Device 0def | 04:01.0 PCI bridge: ARM Device 0def | 04:02.0 PCI bridge: ARM Device 0def | 05:00.0 SATA controller: Device 0abc:aced (rev 01) | 06:00.0 Unassigned class [ff00]: ARM Device ff80 | 06:00.7 Unassigned class [ff00]: ARM Device ff80 | 07:00.0 Unassigned class [ff00]: ARM Device ff80 | 07:00.3 Unassigned class [ff00]: ARM Device ff80 | 08:00.0 Unassigned class [ff00]: ARM Device ff80 | 08:00.1 Unassigned class [ff00]: ARM Device ff80 Cc: Sudeep Holla Cc: Lorenzo Pieralisi Cc: Rob Herring Cc: Krzysztof Kozlowski Cc: Conor Dooley Reviewed-by: Liviu Dudau Signed-off-by: Aneesh Kumar K.V (Arm) Message-Id: <20241128152543.1821878-1-aneesh.kumar@kernel.org> Signed-off-by: Sudeep Holla --- arch/arm64/boot/dts/arm/fvp-base-revc.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/arm/fvp-base-revc.dts b/arch/arm64/boot/dts/arm/fvp-base-revc.dts index 19973ab4ea6b5..9e10d7a6b5a2c 100644 --- a/arch/arm64/boot/dts/arm/fvp-base-revc.dts +++ b/arch/arm64/boot/dts/arm/fvp-base-revc.dts @@ -233,7 +233,7 @@ #interrupt-cells = <0x1>; compatible = "pci-host-ecam-generic"; device_type = "pci"; - bus-range = <0x0 0x1>; + bus-range = <0x0 0xff>; reg = <0x0 0x40000000 0x0 0x10000000>; ranges = <0x2000000 0x0 0x50000000 0x0 0x50000000 0x0 0x10000000>; interrupt-map = <0 0 0 1 &gic 0 0 GIC_SPI 168 IRQ_TYPE_LEVEL_HIGH>, From 48808b55b07c3cea64805267a5547f03e6452a9f Mon Sep 17 00:00:00 2001 From: Valentina Fernandez Date: Mon, 18 Nov 2024 15:53:54 +0000 Subject: [PATCH 0187/1450] firmware: microchip: fix UL_IAP lock check in mpfs_auto_update_state() To verify that Auto Update is possible, the mpfs_auto_update_state() function performs a "Query Security Service Request" to the system controller. Previously, the check was performed on the first element of the response message, which was accessed using a 32-bit pointer. This caused the bitwise operation to reference incorrect data, as the response should be inspected at the byte level. Fixed this by casting the response to a u8 * pointer, ensuring the check correctly inspects the appropriate byte of the response message. Additionally, rename "UL_Auto Update" to "UL_IAP" to match the PolarFire Family System Services User Guide. Signed-off-by: Valentina Fernandez Signed-off-by: Conor Dooley --- drivers/firmware/microchip/mpfs-auto-update.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/firmware/microchip/mpfs-auto-update.c b/drivers/firmware/microchip/mpfs-auto-update.c index 38a03698cec95..e194f7acb2a9b 100644 --- a/drivers/firmware/microchip/mpfs-auto-update.c +++ b/drivers/firmware/microchip/mpfs-auto-update.c @@ -402,10 +402,10 @@ static int mpfs_auto_update_available(struct mpfs_auto_update_priv *priv) return -EIO; /* - * Bit 5 of byte 1 is "UL_Auto Update" & if it is set, Auto Update is + * Bit 5 of byte 1 is "UL_IAP" & if it is set, Auto Update is * not possible. */ - if (response_msg[1] & AUTO_UPDATE_FEATURE_ENABLED) + if ((((u8 *)response_msg)[1] & AUTO_UPDATE_FEATURE_ENABLED)) return -EPERM; return 0; From 523d3cc4b6d1ae18bfa516345d48332d455181e6 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 4 Dec 2024 07:55:42 -0800 Subject: [PATCH 0188/1450] ynl: support enum-cnt-name attribute in legacy definitions This is similar to existing attr-cnt-name in the attributes to allow changing the name of the 'count' enum entry. Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20241204155549.641348-2-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- Documentation/netlink/genetlink-c.yaml | 3 +++ Documentation/netlink/genetlink-legacy.yaml | 3 +++ Documentation/userspace-api/netlink/c-code-gen.rst | 4 +++- tools/net/ynl/ynl-gen-c.py | 8 ++++++-- 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/Documentation/netlink/genetlink-c.yaml b/Documentation/netlink/genetlink-c.yaml index 4f803eaac6d8b..9660ffb1ed6ac 100644 --- a/Documentation/netlink/genetlink-c.yaml +++ b/Documentation/netlink/genetlink-c.yaml @@ -106,6 +106,9 @@ properties: name-prefix: description: For enum the prefix of the values, optional. type: string + enum-cnt-name: + description: Name of the render-max counter enum entry. + type: string # End genetlink-c attribute-sets: diff --git a/Documentation/netlink/genetlink-legacy.yaml b/Documentation/netlink/genetlink-legacy.yaml index 8db0e22fa72c7..16380e12cabe7 100644 --- a/Documentation/netlink/genetlink-legacy.yaml +++ b/Documentation/netlink/genetlink-legacy.yaml @@ -117,6 +117,9 @@ properties: name-prefix: description: For enum the prefix of the values, optional. type: string + enum-cnt-name: + description: Name of the render-max counter enum entry. + type: string # End genetlink-c # Start genetlink-legacy members: diff --git a/Documentation/userspace-api/netlink/c-code-gen.rst b/Documentation/userspace-api/netlink/c-code-gen.rst index 89de42c133502..46415e6d646d2 100644 --- a/Documentation/userspace-api/netlink/c-code-gen.rst +++ b/Documentation/userspace-api/netlink/c-code-gen.rst @@ -56,7 +56,9 @@ If ``name-prefix`` is specified it replaces the ``$family-$enum`` portion of the entry name. Boolean ``render-max`` controls creation of the max values -(which are enabled by default for attribute enums). +(which are enabled by default for attribute enums). These max +values are named ``__$pfx-MAX`` and ``$pfx-MAX``. The name +of the first value can be overridden via ``enum-cnt-name`` property. Attributes ========== diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py index d8201c4b15209..bfe95826ae3e4 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/ynl-gen-c.py @@ -801,6 +801,7 @@ def __init__(self, family, yaml): self.user_type = 'int' self.value_pfx = yaml.get('name-prefix', f"{family.ident_name}-{yaml['name']}-") + self.enum_cnt_name = yaml.get('enum-cnt-name', None) super().__init__(family, yaml) @@ -2472,9 +2473,12 @@ def render_uapi(family, cw): max_val = f' = {enum.get_mask()},' cw.p(max_name + max_val) else: + cnt_name = enum.enum_cnt_name max_name = c_upper(name_pfx + 'max') - cw.p('__' + max_name + ',') - cw.p(max_name + ' = (__' + max_name + ' - 1)') + if not cnt_name: + cnt_name = '__' + name_pfx + 'max' + cw.p(c_upper(cnt_name) + ',') + cw.p(max_name + ' = (' + c_upper(cnt_name) + ' - 1)') cw.block_end(line=';') cw.nl() elif const['type'] == 'const': From 8c843ecde4e49e11063ad942675246ec685ea19a Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 4 Dec 2024 07:55:43 -0800 Subject: [PATCH 0189/1450] ynl: skip rendering attributes with header property in uapi mode To allow omitting some of the attributes in the final generated file. Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20241204155549.641348-3-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- tools/net/ynl/ynl-gen-c.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py index bfe95826ae3e4..79829ce39139b 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/ynl-gen-c.py @@ -801,6 +801,7 @@ def __init__(self, family, yaml): self.user_type = 'int' self.value_pfx = yaml.get('name-prefix', f"{family.ident_name}-{yaml['name']}-") + self.header = yaml.get('header', None) self.enum_cnt_name = yaml.get('enum-cnt-name', None) super().__init__(family, yaml) @@ -2441,6 +2442,9 @@ def render_uapi(family, cw): if const['type'] == 'enum' or const['type'] == 'flags': enum = family.consts[const['name']] + if enum.header: + continue + if enum.has_doc(): if enum.has_entry_doc(): cw.p('/**') From 56881d07f0b4cb97f3c460dc3908eee91fc51a17 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 4 Dec 2024 07:55:44 -0800 Subject: [PATCH 0190/1450] ynl: support directional specs in ynl-gen-c.py The intent is to generate ethtool uapi headers. For now, some of the things are hard-coded: - _MSG_{USER,KERNEL}_MAX - the split between USER and KERNEL messages Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20241204155549.641348-4-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- tools/net/ynl/ynl-gen-c.py | 118 +++++++++++++++++++++++++++---------- 1 file changed, 87 insertions(+), 31 deletions(-) diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py index 79829ce39139b..2bf4d992e54a7 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/ynl-gen-c.py @@ -2419,6 +2419,87 @@ def uapi_enum_start(family, cw, obj, ckey='', enum_name='enum-name'): cw.block_start(line=start_line) +def render_uapi_unified(family, cw, max_by_define, separate_ntf): + max_name = c_upper(family.get('cmd-max-name', f"{family.op_prefix}MAX")) + cnt_name = c_upper(family.get('cmd-cnt-name', f"__{family.op_prefix}MAX")) + max_value = f"({cnt_name} - 1)" + + uapi_enum_start(family, cw, family['operations'], 'enum-name') + val = 0 + for op in family.msgs.values(): + if separate_ntf and ('notify' in op or 'event' in op): + continue + + suffix = ',' + if op.value != val: + suffix = f" = {op.value}," + val = op.value + cw.p(op.enum_name + suffix) + val += 1 + cw.nl() + cw.p(cnt_name + ('' if max_by_define else ',')) + if not max_by_define: + cw.p(f"{max_name} = {max_value}") + cw.block_end(line=';') + if max_by_define: + cw.p(f"#define {max_name} {max_value}") + cw.nl() + + +def render_uapi_directional(family, cw, max_by_define): + max_name = f"{family.op_prefix}USER_MAX" + cnt_name = f"__{family.op_prefix}USER_CNT" + max_value = f"({cnt_name} - 1)" + + cw.block_start(line='enum') + cw.p(c_upper(f'{family.name}_MSG_USER_NONE = 0,')) + val = 0 + for op in family.msgs.values(): + if 'do' in op and 'event' not in op: + suffix = ',' + if op.value and op.value != val: + suffix = f" = {op.value}," + val = op.value + cw.p(op.enum_name + suffix) + val += 1 + cw.nl() + cw.p(cnt_name + ('' if max_by_define else ',')) + if not max_by_define: + cw.p(f"{max_name} = {max_value}") + cw.block_end(line=';') + if max_by_define: + cw.p(f"#define {max_name} {max_value}") + cw.nl() + + max_name = f"{family.op_prefix}KERNEL_MAX" + cnt_name = f"__{family.op_prefix}KERNEL_CNT" + max_value = f"({cnt_name} - 1)" + + cw.block_start(line='enum') + cw.p(c_upper(f'{family.name}_MSG_KERNEL_NONE = 0,')) + val = 0 + for op in family.msgs.values(): + if ('do' in op and 'reply' in op['do']) or 'notify' in op or 'event' in op: + enum_name = op.enum_name + if 'event' not in op and 'notify' not in op: + enum_name = f'{enum_name}_REPLY' + + suffix = ',' + if op.value and op.value != val: + suffix = f" = {op.value}," + val = op.value + cw.p(enum_name + suffix) + val += 1 + cw.nl() + cw.p(cnt_name + ('' if max_by_define else ',')) + if not max_by_define: + cw.p(f"{max_name} = {max_value}") + cw.block_end(line=';') + if max_by_define: + cw.p(f"#define {max_name} {max_value}") + cw.nl() + + def render_uapi(family, cw): hdr_prot = f"_UAPI_LINUX_{c_upper(family.uapi_header_name)}_H" hdr_prot = hdr_prot.replace('/', '_') @@ -2523,30 +2604,12 @@ def render_uapi(family, cw): # Commands separate_ntf = 'async-prefix' in family['operations'] - max_name = c_upper(family.get('cmd-max-name', f"{family.op_prefix}MAX")) - cnt_name = c_upper(family.get('cmd-cnt-name', f"__{family.op_prefix}MAX")) - max_value = f"({cnt_name} - 1)" - - uapi_enum_start(family, cw, family['operations'], 'enum-name') - val = 0 - for op in family.msgs.values(): - if separate_ntf and ('notify' in op or 'event' in op): - continue - - suffix = ',' - if op.value != val: - suffix = f" = {op.value}," - val = op.value - cw.p(op.enum_name + suffix) - val += 1 - cw.nl() - cw.p(cnt_name + ('' if max_by_define else ',')) - if not max_by_define: - cw.p(f"{max_name} = {max_value}") - cw.block_end(line=';') - if max_by_define: - cw.p(f"#define {max_name} {max_value}") - cw.nl() + if family.msg_id_model == 'unified': + render_uapi_unified(family, cw, max_by_define, separate_ntf) + elif family.msg_id_model == 'directional': + render_uapi_directional(family, cw, max_by_define) + else: + raise Exception(f'Unsupported message enum-model {family.msg_id_model}') if separate_ntf: uapi_enum_start(family, cw, family['operations'], enum_name='async-enum') @@ -2670,13 +2733,6 @@ def main(): os.sys.exit(1) return - supported_models = ['unified'] - if args.mode in ['user', 'kernel']: - supported_models += ['directional'] - if parsed.msg_id_model not in supported_models: - print(f'Message enum-model {parsed.msg_id_model} not supported for {args.mode} generation') - os.sys.exit(1) - cw = CodeWriter(BaseNlLib(), args.out_file, overwrite=(not args.cmp_out)) _, spec_kernel = find_kernel_root(args.spec) From 0187e602c03c876d69cf955dd438bc7fea8c8fd3 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 4 Dec 2024 07:55:45 -0800 Subject: [PATCH 0191/1450] ynl: add missing pieces to ethtool spec to better match uapi header - __ETHTOOL_UDP_TUNNEL_TYPE_CNT and render max - skip rendering stringset (empty enum) - skip rendering c33-pse-ext-state (defined in ethtool.h) - rename header flags to ethtool-flag- - add attr-cnt-name to each attribute to use XXX_CNT instead of XXX_MAX - add unspec 0 entry to each attribute - carry some doc entries from the existing header - tcp-header-split Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20241204155549.641348-5-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/ethtool.yaml | 358 ++++++++++++++++++++++- 1 file changed, 346 insertions(+), 12 deletions(-) diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 93369f0eb8162..c7634e957d9c4 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -5,6 +5,7 @@ name: ethtool protocol: genetlink-legacy doc: Partial family for Ethtool Netlink. +uapi-header: linux/ethtool_netlink_generated.h definitions: - @@ -12,43 +13,99 @@ definitions: enum-name: type: enum entries: [ vxlan, geneve, vxlan-gpe ] + enum-cnt-name: __ethtool-udp-tunnel-type-cnt + render-max: true - name: stringset type: enum entries: [] + header: linux/ethtool.h # skip rendering, no actual definition - name: header-flags type: flags - entries: [ compact-bitsets, omit-reply, stats ] + name-prefix: ethtool-flag- + doc: common ethtool header flags + entries: + - + name: compact-bitsets + doc: use compact bitsets in reply + - + name: omit-reply + doc: provide optional reply for SET or ACT requests + - + name: stats + doc: request statistics, if supported by the driver - name: module-fw-flash-status type: enum - entries: [ started, in_progress, completed, error ] + doc: plug-in module firmware flashing status + header: linux/ethtool.h + entries: + - + name: started + doc: The firmware flashing process has started. + - + name: in_progress + doc: The firmware flashing process is in progress. + - + name: completed + doc: The firmware flashing process was completed successfully. + - + name: error + doc: The firmware flashing process was stopped due to an error. - name: c33-pse-ext-state - enum-name: + doc: "groups of PSE extended states functions. IEEE 802.3-2022 33.2.4.4 Variables" type: enum name-prefix: ethtool-c33-pse-ext-state- + header: linux/ethtool.h entries: - - none - - error-condition - - mr-mps-valid - - mr-pse-enable - - option-detect-ted - - option-vport-lim - - ovld-detected - - power-not-available - - short-detected + - + name: none + doc: none + - + name: error-condition + doc: Group of error_condition states + - + name: mr-mps-valid + doc: Group of mr_mps_valid states + - + name: mr-pse-enable + doc: Group of mr_pse_enable states + - + name: option-detect-ted + doc: Group of option_detect_ted states + - + name: option-vport-lim + doc: Group of option_vport_lim states + - + name: ovld-detected + doc: Group of ovld_detected states + - + name: power-not-available + doc: Group of power_not_available states + - + name: short-detected + doc: Group of short_detected states - name: phy-upstream-type enum-name: type: enum entries: [ mac, phy ] + - + name: tcp-data-split + type: enum + entries: [ unknown, disabled, enabled ] attribute-sets: - name: header + attr-cnt-name: __ethtool-a-header-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: dev-index type: u32 @@ -65,7 +122,12 @@ attribute-sets: - name: bitset-bit + attr-cnt-name: __ethtool-a-bitset-bit-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: index type: u32 @@ -77,7 +139,12 @@ attribute-sets: type: flag - name: bitset-bits + attr-cnt-name: __ethtool-a-bitset-bits-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: bit type: nest @@ -85,7 +152,12 @@ attribute-sets: nested-attributes: bitset-bit - name: bitset + attr-cnt-name: __ethtool-a-bitset-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: nomask type: flag @@ -104,7 +176,12 @@ attribute-sets: type: binary - name: string + attr-cnt-name: __ethtool-a-string-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: index type: u32 @@ -113,7 +190,16 @@ attribute-sets: type: string - name: strings + attr-cnt-name: __ethtool-a-strings-cnt attributes: + - + name: unspec + type: unused + value: 0 + - + name: unspec + type: unused + value: 0 - name: string type: nest @@ -121,7 +207,12 @@ attribute-sets: nested-attributes: string - name: stringset + attr-cnt-name: __ethtool-a-stringset-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: id type: u32 @@ -135,7 +226,12 @@ attribute-sets: nested-attributes: strings - name: stringsets + attr-cnt-name: __ethtool-a-stringsets-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: stringset type: nest @@ -143,7 +239,12 @@ attribute-sets: nested-attributes: stringset - name: strset + attr-cnt-name: __ethtool-a-strset-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: header type: nest @@ -158,7 +259,12 @@ attribute-sets: - name: privflags + attr-cnt-name: __ethtool-a-privflags-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: header type: nest @@ -170,7 +276,12 @@ attribute-sets: - name: rings + attr-cnt-name: __ethtool-a-rings-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: header type: nest @@ -205,6 +316,7 @@ attribute-sets: - name: tcp-data-split type: u8 + enum: tcp-data-split - name: cqe-size type: u32 @@ -223,31 +335,48 @@ attribute-sets: - name: mm-stat + attr-cnt-name: __ethtool-a-mm-stat-cnt + doc: MAC Merge (802.3) attributes: + - + name: unspec + type: unused + value: 0 - name: pad type: pad - name: reassembly-errors + doc: aMACMergeFrameAssErrorCount type: u64 - name: smd-errors + doc: aMACMergeFrameSmdErrorCount type: u64 - name: reassembly-ok + doc: aMACMergeFrameAssOkCount type: u64 - name: rx-frag-count + doc: aMACMergeFragCountRx type: u64 - name: tx-frag-count + doc: aMACMergeFragCountTx type: u64 - name: hold-count + doc: aMACMergeHoldCount type: u64 - name: mm + attr-cnt-name: __ethtool-a-mm-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: header type: nest @@ -285,7 +414,12 @@ attribute-sets: nested-attributes: mm-stat - name: linkinfo + attr-cnt-name: __ethtool-a-linkinfo-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: header type: nest @@ -307,7 +441,12 @@ attribute-sets: type: u8 - name: linkmodes + attr-cnt-name: __ethtool-a-linkmodes-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: header type: nest @@ -343,7 +482,12 @@ attribute-sets: type: u8 - name: linkstate + attr-cnt-name: __ethtool-a-linkstate-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: header type: nest @@ -368,7 +512,12 @@ attribute-sets: type: u32 - name: debug + attr-cnt-name: __ethtool-a-debug-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: header type: nest @@ -379,7 +528,12 @@ attribute-sets: nested-attributes: bitset - name: wol + attr-cnt-name: __ethtool-a-wol-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: header type: nest @@ -393,7 +547,12 @@ attribute-sets: type: binary - name: features + attr-cnt-name: __ethtool-a-features-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: header type: nest @@ -416,7 +575,12 @@ attribute-sets: nested-attributes: bitset - name: channels + attr-cnt-name: __ethtool-a-channels-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: header type: nest @@ -448,7 +612,12 @@ attribute-sets: - name: irq-moderation + attr-cnt-name: __ethtool-a-irq-moderation-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: usec type: u32 @@ -460,7 +629,12 @@ attribute-sets: type: u32 - name: profile + attr-cnt-name: __ethtool-a-profile-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: irq-moderation type: nest @@ -468,7 +642,12 @@ attribute-sets: nested-attributes: irq-moderation - name: coalesce + attr-cnt-name: __ethtool-a-coalesce-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: header type: nest @@ -565,7 +744,12 @@ attribute-sets: - name: pause-stat + attr-cnt-name: __ethtool-a-pause-stat-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: pad type: pad @@ -577,7 +761,12 @@ attribute-sets: type: u64 - name: pause + attr-cnt-name: __ethtool-a-pause-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: header type: nest @@ -600,7 +789,12 @@ attribute-sets: type: u32 - name: eee + attr-cnt-name: __ethtool-a-eee-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: header type: nest @@ -627,7 +821,12 @@ attribute-sets: type: u32 - name: ts-stat + attr-cnt-name: __ethtool-a-ts-stat-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: tx-pkts type: uint @@ -639,7 +838,12 @@ attribute-sets: type: uint - name: tsinfo + attr-cnt-name: __ethtool-a-tsinfo-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: header type: nest @@ -665,19 +869,32 @@ attribute-sets: nested-attributes: ts-stat - name: cable-result + attr-cnt-name: __ethtool-a-cable-result-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: pair + doc: ETHTOOL_A_CABLE_PAIR type: u8 - name: code + doc: ETHTOOL_A_CABLE_RESULT_CODE type: u8 - name: src + doc: ETHTOOL_A_CABLE_INF_SRC type: u32 - name: cable-fault-length + attr-cnt-name: __ethtool-a-cable-fault-length-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: pair type: u8 @@ -689,7 +906,12 @@ attribute-sets: type: u32 - name: cable-nest + attr-cnt-name: __ethtool-a-cable-nest-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: result type: nest @@ -700,20 +922,31 @@ attribute-sets: nested-attributes: cable-fault-length - name: cable-test + attr-cnt-name: __ethtool-a-cable-test-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: header type: nest nested-attributes: header - name: cable-test-ntf + attr-cnt-name: __ethtool-a-cable-test-ntf-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: header type: nest nested-attributes: header - name: status + doc: _STARTED/_COMPLETE type: u8 - name: nest @@ -721,7 +954,12 @@ attribute-sets: nested-attributes: cable-nest - name: cable-test-tdr-cfg + attr-cnt-name: __ethtool-a-cable-test-tdr-cfg-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: first type: u32 @@ -736,7 +974,12 @@ attribute-sets: type: u8 - name: cable-test-tdr-ntf + attr-cnt-name: __ethtool-a-cable-test-tdr-ntf-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: header type: nest @@ -750,7 +993,12 @@ attribute-sets: nested-attributes: cable-nest - name: cable-test-tdr + attr-cnt-name: __ethtool-a-cable-test-tdr-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: header type: nest @@ -761,7 +1009,12 @@ attribute-sets: nested-attributes: cable-test-tdr-cfg - name: tunnel-udp-entry + attr-cnt-name: __ethtool-a-tunnel-udp-entry-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: port type: u16 @@ -772,7 +1025,12 @@ attribute-sets: enum: udp-tunnel-type - name: tunnel-udp-table + attr-cnt-name: __ethtool-a-tunnel-udp-table-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: size type: u32 @@ -787,14 +1045,24 @@ attribute-sets: nested-attributes: tunnel-udp-entry - name: tunnel-udp + attr-cnt-name: __ethtool-a-tunnel-udp-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: table type: nest nested-attributes: tunnel-udp-table - name: tunnel-info + attr-cnt-name: __ethtool-a-tunnel-info-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: header type: nest @@ -805,7 +1073,12 @@ attribute-sets: nested-attributes: tunnel-udp - name: fec-stat + attr-cnt-name: __ethtool-a-fec-stat-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: pad type: pad @@ -823,7 +1096,12 @@ attribute-sets: sub-type: u64 - name: fec + attr-cnt-name: __ethtool-a-fec-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: header type: nest @@ -844,7 +1122,12 @@ attribute-sets: nested-attributes: fec-stat - name: module-eeprom + attr-cnt-name: __ethtool-a-module-eeprom-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: header type: nest @@ -869,7 +1152,12 @@ attribute-sets: type: binary - name: stats-grp + attr-cnt-name: __ethtool-a-stats-grp-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: pad type: pad @@ -912,7 +1200,12 @@ attribute-sets: name: hist-val - name: stats + attr-cnt-name: __ethtool-a-stats-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: pad type: pad @@ -933,7 +1226,12 @@ attribute-sets: type: u32 - name: phc-vclocks + attr-cnt-name: __ethtool-a-phc-vclocks-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: header type: nest @@ -947,7 +1245,12 @@ attribute-sets: sub-type: s32 - name: module + attr-cnt-name: __ethtool-a-module-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: header type: nest @@ -960,7 +1263,13 @@ attribute-sets: type: u8 - name: c33-pse-pw-limit + attr-cnt-name: __ethtool-a-c33-pse-pw-limit-cnt + attr-max-name: __ethtool-a-c33-pse-pw-limit-max attributes: + - + name: unspec + type: unused + value: 0 - name: min type: u32 @@ -969,7 +1278,12 @@ attribute-sets: type: u32 - name: pse + attr-cnt-name: __ethtool-a-pse-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: header type: nest @@ -1027,7 +1341,12 @@ attribute-sets: nested-attributes: c33-pse-pw-limit - name: rss + attr-cnt-name: __ethtool-a-rss-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: header type: nest @@ -1053,7 +1372,12 @@ attribute-sets: type: u32 - name: plca + attr-cnt-name: __ethtool-a-plca-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: header type: nest @@ -1084,7 +1408,12 @@ attribute-sets: type: u32 - name: module-fw-flash + attr-cnt-name: __ethtool-a-module-fw-flash-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: header type: nest @@ -1110,7 +1439,12 @@ attribute-sets: type: uint - name: phy + attr-cnt-name: __ethtool-a-phy-cnt attributes: + - + name: unspec + type: unused + value: 0 - name: header type: nest From 001b0b59efbbdf54126c2ae512009d4a7c9f9f88 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 4 Dec 2024 07:55:46 -0800 Subject: [PATCH 0192/1450] ynl: include uapi header after all dependencies Essentially reverse the order of headers for userspace generated files. Before (make -C tools/net/ynl/; cat tools/net/ynl/ethtool-user.h): #include #include #include #include After: #include #include While at it, make sure we track which headers we've already included and include the headers only once. Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20241204155549.641348-6-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- tools/net/ynl/ynl-gen-c.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py index 2bf4d992e54a7..8098bcbb6f404 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/ynl-gen-c.py @@ -2782,12 +2782,17 @@ def main(): else: cw.p(f'#include "{hdr_file}"') cw.p('#include "ynl.h"') - headers = [parsed.uapi_header] + headers = [] for definition in parsed['definitions']: if 'header' in definition: headers.append(definition['header']) + if args.mode == 'user': + headers.append(parsed.uapi_header) + seen_header = [] for one in headers: - cw.p(f"#include <{one}>") + if one not in seen_header: + cw.p(f"#include <{one}>") + seen_header.append(one) cw.nl() if args.mode == "user": From 49922401c2190713c5cc03902dc68c3ecd3f13e8 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 4 Dec 2024 07:55:47 -0800 Subject: [PATCH 0193/1450] ethtool: separate definitions that are gonna be generated Reshuffle definitions that are gonna be generated into ethtool_netlink_generated.h and match ynl spec order. This should make it easier to compare the output of the ynl-gen-c to the existing uapi header. No functional changes. Things that are still remaining to be manually defined: - ETHTOOL_FLAG_ALL - probably no good way to add to spec? - some of the cable test bits (not sure whether it's possible to move to spec) - some of the stats definitions (no way currently to move to spec) Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20241204155549.641348-7-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- MAINTAINERS | 2 +- include/uapi/linux/ethtool_netlink.h | 893 +---------------- .../uapi/linux/ethtool_netlink_generated.h | 899 ++++++++++++++++++ 3 files changed, 901 insertions(+), 893 deletions(-) create mode 100644 include/uapi/linux/ethtool_netlink_generated.h diff --git a/MAINTAINERS b/MAINTAINERS index 686109008d8e0..79756f2100e00 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16280,7 +16280,7 @@ F: include/linux/inetdevice.h F: include/linux/netdev* F: include/linux/platform_data/wiznet.h F: include/uapi/linux/cn_proc.h -F: include/uapi/linux/ethtool_netlink.h +F: include/uapi/linux/ethtool_netlink* F: include/uapi/linux/if_* F: include/uapi/linux/net_shaper.h F: include/uapi/linux/netdev* diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 283305f6b0635..9c909ce733a50 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -10,545 +10,12 @@ #define _UAPI_LINUX_ETHTOOL_NETLINK_H_ #include - -/* message types - userspace to kernel */ -enum { - ETHTOOL_MSG_USER_NONE, - ETHTOOL_MSG_STRSET_GET, - ETHTOOL_MSG_LINKINFO_GET, - ETHTOOL_MSG_LINKINFO_SET, - ETHTOOL_MSG_LINKMODES_GET, - ETHTOOL_MSG_LINKMODES_SET, - ETHTOOL_MSG_LINKSTATE_GET, - ETHTOOL_MSG_DEBUG_GET, - ETHTOOL_MSG_DEBUG_SET, - ETHTOOL_MSG_WOL_GET, - ETHTOOL_MSG_WOL_SET, - ETHTOOL_MSG_FEATURES_GET, - ETHTOOL_MSG_FEATURES_SET, - ETHTOOL_MSG_PRIVFLAGS_GET, - ETHTOOL_MSG_PRIVFLAGS_SET, - ETHTOOL_MSG_RINGS_GET, - ETHTOOL_MSG_RINGS_SET, - ETHTOOL_MSG_CHANNELS_GET, - ETHTOOL_MSG_CHANNELS_SET, - ETHTOOL_MSG_COALESCE_GET, - ETHTOOL_MSG_COALESCE_SET, - ETHTOOL_MSG_PAUSE_GET, - ETHTOOL_MSG_PAUSE_SET, - ETHTOOL_MSG_EEE_GET, - ETHTOOL_MSG_EEE_SET, - ETHTOOL_MSG_TSINFO_GET, - ETHTOOL_MSG_CABLE_TEST_ACT, - ETHTOOL_MSG_CABLE_TEST_TDR_ACT, - ETHTOOL_MSG_TUNNEL_INFO_GET, - ETHTOOL_MSG_FEC_GET, - ETHTOOL_MSG_FEC_SET, - ETHTOOL_MSG_MODULE_EEPROM_GET, - ETHTOOL_MSG_STATS_GET, - ETHTOOL_MSG_PHC_VCLOCKS_GET, - ETHTOOL_MSG_MODULE_GET, - ETHTOOL_MSG_MODULE_SET, - ETHTOOL_MSG_PSE_GET, - ETHTOOL_MSG_PSE_SET, - ETHTOOL_MSG_RSS_GET, - ETHTOOL_MSG_PLCA_GET_CFG, - ETHTOOL_MSG_PLCA_SET_CFG, - ETHTOOL_MSG_PLCA_GET_STATUS, - ETHTOOL_MSG_MM_GET, - ETHTOOL_MSG_MM_SET, - ETHTOOL_MSG_MODULE_FW_FLASH_ACT, - ETHTOOL_MSG_PHY_GET, - - /* add new constants above here */ - __ETHTOOL_MSG_USER_CNT, - ETHTOOL_MSG_USER_MAX = __ETHTOOL_MSG_USER_CNT - 1 -}; - -/* message types - kernel to userspace */ -enum { - ETHTOOL_MSG_KERNEL_NONE, - ETHTOOL_MSG_STRSET_GET_REPLY, - ETHTOOL_MSG_LINKINFO_GET_REPLY, - ETHTOOL_MSG_LINKINFO_NTF, - ETHTOOL_MSG_LINKMODES_GET_REPLY, - ETHTOOL_MSG_LINKMODES_NTF, - ETHTOOL_MSG_LINKSTATE_GET_REPLY, - ETHTOOL_MSG_DEBUG_GET_REPLY, - ETHTOOL_MSG_DEBUG_NTF, - ETHTOOL_MSG_WOL_GET_REPLY, - ETHTOOL_MSG_WOL_NTF, - ETHTOOL_MSG_FEATURES_GET_REPLY, - ETHTOOL_MSG_FEATURES_SET_REPLY, - ETHTOOL_MSG_FEATURES_NTF, - ETHTOOL_MSG_PRIVFLAGS_GET_REPLY, - ETHTOOL_MSG_PRIVFLAGS_NTF, - ETHTOOL_MSG_RINGS_GET_REPLY, - ETHTOOL_MSG_RINGS_NTF, - ETHTOOL_MSG_CHANNELS_GET_REPLY, - ETHTOOL_MSG_CHANNELS_NTF, - ETHTOOL_MSG_COALESCE_GET_REPLY, - ETHTOOL_MSG_COALESCE_NTF, - ETHTOOL_MSG_PAUSE_GET_REPLY, - ETHTOOL_MSG_PAUSE_NTF, - ETHTOOL_MSG_EEE_GET_REPLY, - ETHTOOL_MSG_EEE_NTF, - ETHTOOL_MSG_TSINFO_GET_REPLY, - ETHTOOL_MSG_CABLE_TEST_NTF, - ETHTOOL_MSG_CABLE_TEST_TDR_NTF, - ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY, - ETHTOOL_MSG_FEC_GET_REPLY, - ETHTOOL_MSG_FEC_NTF, - ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY, - ETHTOOL_MSG_STATS_GET_REPLY, - ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY, - ETHTOOL_MSG_MODULE_GET_REPLY, - ETHTOOL_MSG_MODULE_NTF, - ETHTOOL_MSG_PSE_GET_REPLY, - ETHTOOL_MSG_RSS_GET_REPLY, - ETHTOOL_MSG_PLCA_GET_CFG_REPLY, - ETHTOOL_MSG_PLCA_GET_STATUS_REPLY, - ETHTOOL_MSG_PLCA_NTF, - ETHTOOL_MSG_MM_GET_REPLY, - ETHTOOL_MSG_MM_NTF, - ETHTOOL_MSG_MODULE_FW_FLASH_NTF, - ETHTOOL_MSG_PHY_GET_REPLY, - ETHTOOL_MSG_PHY_NTF, - - /* add new constants above here */ - __ETHTOOL_MSG_KERNEL_CNT, - ETHTOOL_MSG_KERNEL_MAX = __ETHTOOL_MSG_KERNEL_CNT - 1 -}; - -/* request header */ - -enum ethtool_header_flags { - ETHTOOL_FLAG_COMPACT_BITSETS = 1 << 0, /* use compact bitsets in reply */ - ETHTOOL_FLAG_OMIT_REPLY = 1 << 1, /* provide optional reply for SET or ACT requests */ - ETHTOOL_FLAG_STATS = 1 << 2, /* request statistics, if supported by the driver */ -}; +#include #define ETHTOOL_FLAG_ALL (ETHTOOL_FLAG_COMPACT_BITSETS | \ ETHTOOL_FLAG_OMIT_REPLY | \ ETHTOOL_FLAG_STATS) -enum { - ETHTOOL_A_HEADER_UNSPEC, - ETHTOOL_A_HEADER_DEV_INDEX, /* u32 */ - ETHTOOL_A_HEADER_DEV_NAME, /* string */ - ETHTOOL_A_HEADER_FLAGS, /* u32 - ETHTOOL_FLAG_* */ - ETHTOOL_A_HEADER_PHY_INDEX, /* u32 */ - - /* add new constants above here */ - __ETHTOOL_A_HEADER_CNT, - ETHTOOL_A_HEADER_MAX = __ETHTOOL_A_HEADER_CNT - 1 -}; - -/* bit sets */ - -enum { - ETHTOOL_A_BITSET_BIT_UNSPEC, - ETHTOOL_A_BITSET_BIT_INDEX, /* u32 */ - ETHTOOL_A_BITSET_BIT_NAME, /* string */ - ETHTOOL_A_BITSET_BIT_VALUE, /* flag */ - - /* add new constants above here */ - __ETHTOOL_A_BITSET_BIT_CNT, - ETHTOOL_A_BITSET_BIT_MAX = __ETHTOOL_A_BITSET_BIT_CNT - 1 -}; - -enum { - ETHTOOL_A_BITSET_BITS_UNSPEC, - ETHTOOL_A_BITSET_BITS_BIT, /* nest - _A_BITSET_BIT_* */ - - /* add new constants above here */ - __ETHTOOL_A_BITSET_BITS_CNT, - ETHTOOL_A_BITSET_BITS_MAX = __ETHTOOL_A_BITSET_BITS_CNT - 1 -}; - -enum { - ETHTOOL_A_BITSET_UNSPEC, - ETHTOOL_A_BITSET_NOMASK, /* flag */ - ETHTOOL_A_BITSET_SIZE, /* u32 */ - ETHTOOL_A_BITSET_BITS, /* nest - _A_BITSET_BITS_* */ - ETHTOOL_A_BITSET_VALUE, /* binary */ - ETHTOOL_A_BITSET_MASK, /* binary */ - - /* add new constants above here */ - __ETHTOOL_A_BITSET_CNT, - ETHTOOL_A_BITSET_MAX = __ETHTOOL_A_BITSET_CNT - 1 -}; - -/* string sets */ - -enum { - ETHTOOL_A_STRING_UNSPEC, - ETHTOOL_A_STRING_INDEX, /* u32 */ - ETHTOOL_A_STRING_VALUE, /* string */ - - /* add new constants above here */ - __ETHTOOL_A_STRING_CNT, - ETHTOOL_A_STRING_MAX = __ETHTOOL_A_STRING_CNT - 1 -}; - -enum { - ETHTOOL_A_STRINGS_UNSPEC, - ETHTOOL_A_STRINGS_STRING, /* nest - _A_STRINGS_* */ - - /* add new constants above here */ - __ETHTOOL_A_STRINGS_CNT, - ETHTOOL_A_STRINGS_MAX = __ETHTOOL_A_STRINGS_CNT - 1 -}; - -enum { - ETHTOOL_A_STRINGSET_UNSPEC, - ETHTOOL_A_STRINGSET_ID, /* u32 */ - ETHTOOL_A_STRINGSET_COUNT, /* u32 */ - ETHTOOL_A_STRINGSET_STRINGS, /* nest - _A_STRINGS_* */ - - /* add new constants above here */ - __ETHTOOL_A_STRINGSET_CNT, - ETHTOOL_A_STRINGSET_MAX = __ETHTOOL_A_STRINGSET_CNT - 1 -}; - -enum { - ETHTOOL_A_STRINGSETS_UNSPEC, - ETHTOOL_A_STRINGSETS_STRINGSET, /* nest - _A_STRINGSET_* */ - - /* add new constants above here */ - __ETHTOOL_A_STRINGSETS_CNT, - ETHTOOL_A_STRINGSETS_MAX = __ETHTOOL_A_STRINGSETS_CNT - 1 -}; - -/* STRSET */ - -enum { - ETHTOOL_A_STRSET_UNSPEC, - ETHTOOL_A_STRSET_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_STRSET_STRINGSETS, /* nest - _A_STRINGSETS_* */ - ETHTOOL_A_STRSET_COUNTS_ONLY, /* flag */ - - /* add new constants above here */ - __ETHTOOL_A_STRSET_CNT, - ETHTOOL_A_STRSET_MAX = __ETHTOOL_A_STRSET_CNT - 1 -}; - -/* LINKINFO */ - -enum { - ETHTOOL_A_LINKINFO_UNSPEC, - ETHTOOL_A_LINKINFO_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_LINKINFO_PORT, /* u8 */ - ETHTOOL_A_LINKINFO_PHYADDR, /* u8 */ - ETHTOOL_A_LINKINFO_TP_MDIX, /* u8 */ - ETHTOOL_A_LINKINFO_TP_MDIX_CTRL, /* u8 */ - ETHTOOL_A_LINKINFO_TRANSCEIVER, /* u8 */ - - /* add new constants above here */ - __ETHTOOL_A_LINKINFO_CNT, - ETHTOOL_A_LINKINFO_MAX = __ETHTOOL_A_LINKINFO_CNT - 1 -}; - -/* LINKMODES */ - -enum { - ETHTOOL_A_LINKMODES_UNSPEC, - ETHTOOL_A_LINKMODES_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_LINKMODES_AUTONEG, /* u8 */ - ETHTOOL_A_LINKMODES_OURS, /* bitset */ - ETHTOOL_A_LINKMODES_PEER, /* bitset */ - ETHTOOL_A_LINKMODES_SPEED, /* u32 */ - ETHTOOL_A_LINKMODES_DUPLEX, /* u8 */ - ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG, /* u8 */ - ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE, /* u8 */ - ETHTOOL_A_LINKMODES_LANES, /* u32 */ - ETHTOOL_A_LINKMODES_RATE_MATCHING, /* u8 */ - - /* add new constants above here */ - __ETHTOOL_A_LINKMODES_CNT, - ETHTOOL_A_LINKMODES_MAX = __ETHTOOL_A_LINKMODES_CNT - 1 -}; - -/* LINKSTATE */ - -enum { - ETHTOOL_A_LINKSTATE_UNSPEC, - ETHTOOL_A_LINKSTATE_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_LINKSTATE_LINK, /* u8 */ - ETHTOOL_A_LINKSTATE_SQI, /* u32 */ - ETHTOOL_A_LINKSTATE_SQI_MAX, /* u32 */ - ETHTOOL_A_LINKSTATE_EXT_STATE, /* u8 */ - ETHTOOL_A_LINKSTATE_EXT_SUBSTATE, /* u8 */ - ETHTOOL_A_LINKSTATE_EXT_DOWN_CNT, /* u32 */ - - /* add new constants above here */ - __ETHTOOL_A_LINKSTATE_CNT, - ETHTOOL_A_LINKSTATE_MAX = __ETHTOOL_A_LINKSTATE_CNT - 1 -}; - -/* DEBUG */ - -enum { - ETHTOOL_A_DEBUG_UNSPEC, - ETHTOOL_A_DEBUG_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_DEBUG_MSGMASK, /* bitset */ - - /* add new constants above here */ - __ETHTOOL_A_DEBUG_CNT, - ETHTOOL_A_DEBUG_MAX = __ETHTOOL_A_DEBUG_CNT - 1 -}; - -/* WOL */ - -enum { - ETHTOOL_A_WOL_UNSPEC, - ETHTOOL_A_WOL_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_WOL_MODES, /* bitset */ - ETHTOOL_A_WOL_SOPASS, /* binary */ - - /* add new constants above here */ - __ETHTOOL_A_WOL_CNT, - ETHTOOL_A_WOL_MAX = __ETHTOOL_A_WOL_CNT - 1 -}; - -/* FEATURES */ - -enum { - ETHTOOL_A_FEATURES_UNSPEC, - ETHTOOL_A_FEATURES_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_FEATURES_HW, /* bitset */ - ETHTOOL_A_FEATURES_WANTED, /* bitset */ - ETHTOOL_A_FEATURES_ACTIVE, /* bitset */ - ETHTOOL_A_FEATURES_NOCHANGE, /* bitset */ - - /* add new constants above here */ - __ETHTOOL_A_FEATURES_CNT, - ETHTOOL_A_FEATURES_MAX = __ETHTOOL_A_FEATURES_CNT - 1 -}; - -/* PRIVFLAGS */ - -enum { - ETHTOOL_A_PRIVFLAGS_UNSPEC, - ETHTOOL_A_PRIVFLAGS_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_PRIVFLAGS_FLAGS, /* bitset */ - - /* add new constants above here */ - __ETHTOOL_A_PRIVFLAGS_CNT, - ETHTOOL_A_PRIVFLAGS_MAX = __ETHTOOL_A_PRIVFLAGS_CNT - 1 -}; - -/* RINGS */ - -enum { - ETHTOOL_TCP_DATA_SPLIT_UNKNOWN = 0, - ETHTOOL_TCP_DATA_SPLIT_DISABLED, - ETHTOOL_TCP_DATA_SPLIT_ENABLED, -}; - -enum { - ETHTOOL_A_RINGS_UNSPEC, - ETHTOOL_A_RINGS_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_RINGS_RX_MAX, /* u32 */ - ETHTOOL_A_RINGS_RX_MINI_MAX, /* u32 */ - ETHTOOL_A_RINGS_RX_JUMBO_MAX, /* u32 */ - ETHTOOL_A_RINGS_TX_MAX, /* u32 */ - ETHTOOL_A_RINGS_RX, /* u32 */ - ETHTOOL_A_RINGS_RX_MINI, /* u32 */ - ETHTOOL_A_RINGS_RX_JUMBO, /* u32 */ - ETHTOOL_A_RINGS_TX, /* u32 */ - ETHTOOL_A_RINGS_RX_BUF_LEN, /* u32 */ - ETHTOOL_A_RINGS_TCP_DATA_SPLIT, /* u8 */ - ETHTOOL_A_RINGS_CQE_SIZE, /* u32 */ - ETHTOOL_A_RINGS_TX_PUSH, /* u8 */ - ETHTOOL_A_RINGS_RX_PUSH, /* u8 */ - ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN, /* u32 */ - ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX, /* u32 */ - - /* add new constants above here */ - __ETHTOOL_A_RINGS_CNT, - ETHTOOL_A_RINGS_MAX = (__ETHTOOL_A_RINGS_CNT - 1) -}; - -/* CHANNELS */ - -enum { - ETHTOOL_A_CHANNELS_UNSPEC, - ETHTOOL_A_CHANNELS_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_CHANNELS_RX_MAX, /* u32 */ - ETHTOOL_A_CHANNELS_TX_MAX, /* u32 */ - ETHTOOL_A_CHANNELS_OTHER_MAX, /* u32 */ - ETHTOOL_A_CHANNELS_COMBINED_MAX, /* u32 */ - ETHTOOL_A_CHANNELS_RX_COUNT, /* u32 */ - ETHTOOL_A_CHANNELS_TX_COUNT, /* u32 */ - ETHTOOL_A_CHANNELS_OTHER_COUNT, /* u32 */ - ETHTOOL_A_CHANNELS_COMBINED_COUNT, /* u32 */ - - /* add new constants above here */ - __ETHTOOL_A_CHANNELS_CNT, - ETHTOOL_A_CHANNELS_MAX = (__ETHTOOL_A_CHANNELS_CNT - 1) -}; - -/* COALESCE */ - -enum { - ETHTOOL_A_COALESCE_UNSPEC, - ETHTOOL_A_COALESCE_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_COALESCE_RX_USECS, /* u32 */ - ETHTOOL_A_COALESCE_RX_MAX_FRAMES, /* u32 */ - ETHTOOL_A_COALESCE_RX_USECS_IRQ, /* u32 */ - ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ, /* u32 */ - ETHTOOL_A_COALESCE_TX_USECS, /* u32 */ - ETHTOOL_A_COALESCE_TX_MAX_FRAMES, /* u32 */ - ETHTOOL_A_COALESCE_TX_USECS_IRQ, /* u32 */ - ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ, /* u32 */ - ETHTOOL_A_COALESCE_STATS_BLOCK_USECS, /* u32 */ - ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX, /* u8 */ - ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX, /* u8 */ - ETHTOOL_A_COALESCE_PKT_RATE_LOW, /* u32 */ - ETHTOOL_A_COALESCE_RX_USECS_LOW, /* u32 */ - ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW, /* u32 */ - ETHTOOL_A_COALESCE_TX_USECS_LOW, /* u32 */ - ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW, /* u32 */ - ETHTOOL_A_COALESCE_PKT_RATE_HIGH, /* u32 */ - ETHTOOL_A_COALESCE_RX_USECS_HIGH, /* u32 */ - ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH, /* u32 */ - ETHTOOL_A_COALESCE_TX_USECS_HIGH, /* u32 */ - ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH, /* u32 */ - ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL, /* u32 */ - ETHTOOL_A_COALESCE_USE_CQE_MODE_TX, /* u8 */ - ETHTOOL_A_COALESCE_USE_CQE_MODE_RX, /* u8 */ - ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES, /* u32 */ - ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES, /* u32 */ - ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS, /* u32 */ - /* nest - _A_PROFILE_IRQ_MODERATION */ - ETHTOOL_A_COALESCE_RX_PROFILE, - /* nest - _A_PROFILE_IRQ_MODERATION */ - ETHTOOL_A_COALESCE_TX_PROFILE, - - /* add new constants above here */ - __ETHTOOL_A_COALESCE_CNT, - ETHTOOL_A_COALESCE_MAX = (__ETHTOOL_A_COALESCE_CNT - 1) -}; - -enum { - ETHTOOL_A_PROFILE_UNSPEC, - /* nest, _A_IRQ_MODERATION_* */ - ETHTOOL_A_PROFILE_IRQ_MODERATION, - __ETHTOOL_A_PROFILE_CNT, - ETHTOOL_A_PROFILE_MAX = (__ETHTOOL_A_PROFILE_CNT - 1) -}; - -enum { - ETHTOOL_A_IRQ_MODERATION_UNSPEC, - ETHTOOL_A_IRQ_MODERATION_USEC, /* u32 */ - ETHTOOL_A_IRQ_MODERATION_PKTS, /* u32 */ - ETHTOOL_A_IRQ_MODERATION_COMPS, /* u32 */ - - __ETHTOOL_A_IRQ_MODERATION_CNT, - ETHTOOL_A_IRQ_MODERATION_MAX = (__ETHTOOL_A_IRQ_MODERATION_CNT - 1) -}; - -/* PAUSE */ - -enum { - ETHTOOL_A_PAUSE_UNSPEC, - ETHTOOL_A_PAUSE_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_PAUSE_AUTONEG, /* u8 */ - ETHTOOL_A_PAUSE_RX, /* u8 */ - ETHTOOL_A_PAUSE_TX, /* u8 */ - ETHTOOL_A_PAUSE_STATS, /* nest - _PAUSE_STAT_* */ - ETHTOOL_A_PAUSE_STATS_SRC, /* u32 */ - - /* add new constants above here */ - __ETHTOOL_A_PAUSE_CNT, - ETHTOOL_A_PAUSE_MAX = (__ETHTOOL_A_PAUSE_CNT - 1) -}; - -enum { - ETHTOOL_A_PAUSE_STAT_UNSPEC, - ETHTOOL_A_PAUSE_STAT_PAD, - - ETHTOOL_A_PAUSE_STAT_TX_FRAMES, - ETHTOOL_A_PAUSE_STAT_RX_FRAMES, - - /* add new constants above here - * adjust ETHTOOL_PAUSE_STAT_CNT if adding non-stats! - */ - __ETHTOOL_A_PAUSE_STAT_CNT, - ETHTOOL_A_PAUSE_STAT_MAX = (__ETHTOOL_A_PAUSE_STAT_CNT - 1) -}; - -/* EEE */ - -enum { - ETHTOOL_A_EEE_UNSPEC, - ETHTOOL_A_EEE_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_EEE_MODES_OURS, /* bitset */ - ETHTOOL_A_EEE_MODES_PEER, /* bitset */ - ETHTOOL_A_EEE_ACTIVE, /* u8 */ - ETHTOOL_A_EEE_ENABLED, /* u8 */ - ETHTOOL_A_EEE_TX_LPI_ENABLED, /* u8 */ - ETHTOOL_A_EEE_TX_LPI_TIMER, /* u32 */ - - /* add new constants above here */ - __ETHTOOL_A_EEE_CNT, - ETHTOOL_A_EEE_MAX = (__ETHTOOL_A_EEE_CNT - 1) -}; - -/* TSINFO */ - -enum { - ETHTOOL_A_TSINFO_UNSPEC, - ETHTOOL_A_TSINFO_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_TSINFO_TIMESTAMPING, /* bitset */ - ETHTOOL_A_TSINFO_TX_TYPES, /* bitset */ - ETHTOOL_A_TSINFO_RX_FILTERS, /* bitset */ - ETHTOOL_A_TSINFO_PHC_INDEX, /* u32 */ - ETHTOOL_A_TSINFO_STATS, /* nest - _A_TSINFO_STAT */ - - /* add new constants above here */ - __ETHTOOL_A_TSINFO_CNT, - ETHTOOL_A_TSINFO_MAX = (__ETHTOOL_A_TSINFO_CNT - 1) -}; - -enum { - ETHTOOL_A_TS_STAT_UNSPEC, - - ETHTOOL_A_TS_STAT_TX_PKTS, /* uint */ - ETHTOOL_A_TS_STAT_TX_LOST, /* uint */ - ETHTOOL_A_TS_STAT_TX_ERR, /* uint */ - - /* add new constants above here */ - __ETHTOOL_A_TS_STAT_CNT, - ETHTOOL_A_TS_STAT_MAX = (__ETHTOOL_A_TS_STAT_CNT - 1) - -}; - -/* PHC VCLOCKS */ - -enum { - ETHTOOL_A_PHC_VCLOCKS_UNSPEC, - ETHTOOL_A_PHC_VCLOCKS_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_PHC_VCLOCKS_NUM, /* u32 */ - ETHTOOL_A_PHC_VCLOCKS_INDEX, /* array, s32 */ - - /* add new constants above here */ - __ETHTOOL_A_PHC_VCLOCKS_CNT, - ETHTOOL_A_PHC_VCLOCKS_MAX = (__ETHTOOL_A_PHC_VCLOCKS_CNT - 1) -}; - -/* CABLE TEST */ - -enum { - ETHTOOL_A_CABLE_TEST_UNSPEC, - ETHTOOL_A_CABLE_TEST_HEADER, /* nest - _A_HEADER_* */ - - /* add new constants above here */ - __ETHTOOL_A_CABLE_TEST_CNT, - ETHTOOL_A_CABLE_TEST_MAX = __ETHTOOL_A_CABLE_TEST_CNT - 1 -}; - /* CABLE TEST NOTIFY */ enum { ETHTOOL_A_CABLE_RESULT_CODE_UNSPEC, @@ -582,74 +49,12 @@ enum { ETHTOOL_A_CABLE_INF_SRC_ALCD, }; -enum { - ETHTOOL_A_CABLE_RESULT_UNSPEC, - ETHTOOL_A_CABLE_RESULT_PAIR, /* u8 ETHTOOL_A_CABLE_PAIR_ */ - ETHTOOL_A_CABLE_RESULT_CODE, /* u8 ETHTOOL_A_CABLE_RESULT_CODE_ */ - ETHTOOL_A_CABLE_RESULT_SRC, /* u32 ETHTOOL_A_CABLE_INF_SRC_ */ - - __ETHTOOL_A_CABLE_RESULT_CNT, - ETHTOOL_A_CABLE_RESULT_MAX = (__ETHTOOL_A_CABLE_RESULT_CNT - 1) -}; - -enum { - ETHTOOL_A_CABLE_FAULT_LENGTH_UNSPEC, - ETHTOOL_A_CABLE_FAULT_LENGTH_PAIR, /* u8 ETHTOOL_A_CABLE_PAIR_ */ - ETHTOOL_A_CABLE_FAULT_LENGTH_CM, /* u32 */ - ETHTOOL_A_CABLE_FAULT_LENGTH_SRC, /* u32 ETHTOOL_A_CABLE_INF_SRC_ */ - - __ETHTOOL_A_CABLE_FAULT_LENGTH_CNT, - ETHTOOL_A_CABLE_FAULT_LENGTH_MAX = (__ETHTOOL_A_CABLE_FAULT_LENGTH_CNT - 1) -}; - enum { ETHTOOL_A_CABLE_TEST_NTF_STATUS_UNSPEC, ETHTOOL_A_CABLE_TEST_NTF_STATUS_STARTED, ETHTOOL_A_CABLE_TEST_NTF_STATUS_COMPLETED }; -enum { - ETHTOOL_A_CABLE_NEST_UNSPEC, - ETHTOOL_A_CABLE_NEST_RESULT, /* nest - ETHTOOL_A_CABLE_RESULT_ */ - ETHTOOL_A_CABLE_NEST_FAULT_LENGTH, /* nest - ETHTOOL_A_CABLE_FAULT_LENGTH_ */ - __ETHTOOL_A_CABLE_NEST_CNT, - ETHTOOL_A_CABLE_NEST_MAX = (__ETHTOOL_A_CABLE_NEST_CNT - 1) -}; - -enum { - ETHTOOL_A_CABLE_TEST_NTF_UNSPEC, - ETHTOOL_A_CABLE_TEST_NTF_HEADER, /* nest - ETHTOOL_A_HEADER_* */ - ETHTOOL_A_CABLE_TEST_NTF_STATUS, /* u8 - _STARTED/_COMPLETE */ - ETHTOOL_A_CABLE_TEST_NTF_NEST, /* nest - of results: */ - - __ETHTOOL_A_CABLE_TEST_NTF_CNT, - ETHTOOL_A_CABLE_TEST_NTF_MAX = (__ETHTOOL_A_CABLE_TEST_NTF_CNT - 1) -}; - -/* CABLE TEST TDR */ - -enum { - ETHTOOL_A_CABLE_TEST_TDR_CFG_UNSPEC, - ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST, /* u32 */ - ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST, /* u32 */ - ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP, /* u32 */ - ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR, /* u8 */ - - /* add new constants above here */ - __ETHTOOL_A_CABLE_TEST_TDR_CFG_CNT, - ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX = __ETHTOOL_A_CABLE_TEST_TDR_CFG_CNT - 1 -}; - -enum { - ETHTOOL_A_CABLE_TEST_TDR_UNSPEC, - ETHTOOL_A_CABLE_TEST_TDR_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_CABLE_TEST_TDR_CFG, /* nest - *_TDR_CFG_* */ - - /* add new constants above here */ - __ETHTOOL_A_CABLE_TEST_TDR_CNT, - ETHTOOL_A_CABLE_TEST_TDR_MAX = __ETHTOOL_A_CABLE_TEST_TDR_CNT - 1 -}; - /* CABLE TEST TDR NOTIFY */ enum { @@ -689,132 +94,6 @@ enum { ETHTOOL_A_CABLE_TDR_NEST_MAX = (__ETHTOOL_A_CABLE_TDR_NEST_CNT - 1) }; -enum { - ETHTOOL_A_CABLE_TEST_TDR_NTF_UNSPEC, - ETHTOOL_A_CABLE_TEST_TDR_NTF_HEADER, /* nest - ETHTOOL_A_HEADER_* */ - ETHTOOL_A_CABLE_TEST_TDR_NTF_STATUS, /* u8 - _STARTED/_COMPLETE */ - ETHTOOL_A_CABLE_TEST_TDR_NTF_NEST, /* nest - of results: */ - - /* add new constants above here */ - __ETHTOOL_A_CABLE_TEST_TDR_NTF_CNT, - ETHTOOL_A_CABLE_TEST_TDR_NTF_MAX = __ETHTOOL_A_CABLE_TEST_TDR_NTF_CNT - 1 -}; - -/* TUNNEL INFO */ - -enum { - ETHTOOL_UDP_TUNNEL_TYPE_VXLAN, - ETHTOOL_UDP_TUNNEL_TYPE_GENEVE, - ETHTOOL_UDP_TUNNEL_TYPE_VXLAN_GPE, - - __ETHTOOL_UDP_TUNNEL_TYPE_CNT -}; - -enum { - ETHTOOL_A_TUNNEL_UDP_ENTRY_UNSPEC, - - ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT, /* be16 */ - ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE, /* u32 */ - - /* add new constants above here */ - __ETHTOOL_A_TUNNEL_UDP_ENTRY_CNT, - ETHTOOL_A_TUNNEL_UDP_ENTRY_MAX = (__ETHTOOL_A_TUNNEL_UDP_ENTRY_CNT - 1) -}; - -enum { - ETHTOOL_A_TUNNEL_UDP_TABLE_UNSPEC, - - ETHTOOL_A_TUNNEL_UDP_TABLE_SIZE, /* u32 */ - ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES, /* bitset */ - ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY, /* nest - _UDP_ENTRY_* */ - - /* add new constants above here */ - __ETHTOOL_A_TUNNEL_UDP_TABLE_CNT, - ETHTOOL_A_TUNNEL_UDP_TABLE_MAX = (__ETHTOOL_A_TUNNEL_UDP_TABLE_CNT - 1) -}; - -enum { - ETHTOOL_A_TUNNEL_UDP_UNSPEC, - - ETHTOOL_A_TUNNEL_UDP_TABLE, /* nest - _UDP_TABLE_* */ - - /* add new constants above here */ - __ETHTOOL_A_TUNNEL_UDP_CNT, - ETHTOOL_A_TUNNEL_UDP_MAX = (__ETHTOOL_A_TUNNEL_UDP_CNT - 1) -}; - -enum { - ETHTOOL_A_TUNNEL_INFO_UNSPEC, - ETHTOOL_A_TUNNEL_INFO_HEADER, /* nest - _A_HEADER_* */ - - ETHTOOL_A_TUNNEL_INFO_UDP_PORTS, /* nest - _UDP_TABLE */ - - /* add new constants above here */ - __ETHTOOL_A_TUNNEL_INFO_CNT, - ETHTOOL_A_TUNNEL_INFO_MAX = (__ETHTOOL_A_TUNNEL_INFO_CNT - 1) -}; - -/* FEC */ - -enum { - ETHTOOL_A_FEC_UNSPEC, - ETHTOOL_A_FEC_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_FEC_MODES, /* bitset */ - ETHTOOL_A_FEC_AUTO, /* u8 */ - ETHTOOL_A_FEC_ACTIVE, /* u32 */ - ETHTOOL_A_FEC_STATS, /* nest - _A_FEC_STAT */ - - __ETHTOOL_A_FEC_CNT, - ETHTOOL_A_FEC_MAX = (__ETHTOOL_A_FEC_CNT - 1) -}; - -enum { - ETHTOOL_A_FEC_STAT_UNSPEC, - ETHTOOL_A_FEC_STAT_PAD, - - ETHTOOL_A_FEC_STAT_CORRECTED, /* array, u64 */ - ETHTOOL_A_FEC_STAT_UNCORR, /* array, u64 */ - ETHTOOL_A_FEC_STAT_CORR_BITS, /* array, u64 */ - - /* add new constants above here */ - __ETHTOOL_A_FEC_STAT_CNT, - ETHTOOL_A_FEC_STAT_MAX = (__ETHTOOL_A_FEC_STAT_CNT - 1) -}; - -/* MODULE EEPROM */ - -enum { - ETHTOOL_A_MODULE_EEPROM_UNSPEC, - ETHTOOL_A_MODULE_EEPROM_HEADER, /* nest - _A_HEADER_* */ - - ETHTOOL_A_MODULE_EEPROM_OFFSET, /* u32 */ - ETHTOOL_A_MODULE_EEPROM_LENGTH, /* u32 */ - ETHTOOL_A_MODULE_EEPROM_PAGE, /* u8 */ - ETHTOOL_A_MODULE_EEPROM_BANK, /* u8 */ - ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS, /* u8 */ - ETHTOOL_A_MODULE_EEPROM_DATA, /* binary */ - - __ETHTOOL_A_MODULE_EEPROM_CNT, - ETHTOOL_A_MODULE_EEPROM_MAX = (__ETHTOOL_A_MODULE_EEPROM_CNT - 1) -}; - -/* STATS */ - -enum { - ETHTOOL_A_STATS_UNSPEC, - ETHTOOL_A_STATS_PAD, - ETHTOOL_A_STATS_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_STATS_GROUPS, /* bitset */ - - ETHTOOL_A_STATS_GRP, /* nest - _A_STATS_GRP_* */ - - ETHTOOL_A_STATS_SRC, /* u32 */ - - /* add new constants above here */ - __ETHTOOL_A_STATS_CNT, - ETHTOOL_A_STATS_MAX = (__ETHTOOL_A_STATS_CNT - 1) -}; - enum { ETHTOOL_STATS_ETH_PHY, ETHTOOL_STATS_ETH_MAC, @@ -825,27 +104,6 @@ enum { __ETHTOOL_STATS_CNT }; -enum { - ETHTOOL_A_STATS_GRP_UNSPEC, - ETHTOOL_A_STATS_GRP_PAD, - - ETHTOOL_A_STATS_GRP_ID, /* u32 */ - ETHTOOL_A_STATS_GRP_SS_ID, /* u32 */ - - ETHTOOL_A_STATS_GRP_STAT, /* nest */ - - ETHTOOL_A_STATS_GRP_HIST_RX, /* nest */ - ETHTOOL_A_STATS_GRP_HIST_TX, /* nest */ - - ETHTOOL_A_STATS_GRP_HIST_BKT_LOW, /* u32 */ - ETHTOOL_A_STATS_GRP_HIST_BKT_HI, /* u32 */ - ETHTOOL_A_STATS_GRP_HIST_VAL, /* u64 */ - - /* add new constants above here */ - __ETHTOOL_A_STATS_GRP_CNT, - ETHTOOL_A_STATS_GRP_MAX = (__ETHTOOL_A_STATS_GRP_CNT - 1) -}; - enum { /* 30.3.2.1.5 aSymbolErrorDuringCarrier */ ETHTOOL_A_STATS_ETH_PHY_5_SYM_ERR, @@ -935,155 +193,6 @@ enum { ETHTOOL_A_STATS_RMON_MAX = (__ETHTOOL_A_STATS_RMON_CNT - 1) }; -/* MODULE */ - -enum { - ETHTOOL_A_MODULE_UNSPEC, - ETHTOOL_A_MODULE_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_MODULE_POWER_MODE_POLICY, /* u8 */ - ETHTOOL_A_MODULE_POWER_MODE, /* u8 */ - - /* add new constants above here */ - __ETHTOOL_A_MODULE_CNT, - ETHTOOL_A_MODULE_MAX = (__ETHTOOL_A_MODULE_CNT - 1) -}; - -/* Power Sourcing Equipment */ -enum { - ETHTOOL_A_C33_PSE_PW_LIMIT_UNSPEC, - ETHTOOL_A_C33_PSE_PW_LIMIT_MIN, /* u32 */ - ETHTOOL_A_C33_PSE_PW_LIMIT_MAX, /* u32 */ -}; - -enum { - ETHTOOL_A_PSE_UNSPEC, - ETHTOOL_A_PSE_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_PODL_PSE_ADMIN_STATE, /* u32 */ - ETHTOOL_A_PODL_PSE_ADMIN_CONTROL, /* u32 */ - ETHTOOL_A_PODL_PSE_PW_D_STATUS, /* u32 */ - ETHTOOL_A_C33_PSE_ADMIN_STATE, /* u32 */ - ETHTOOL_A_C33_PSE_ADMIN_CONTROL, /* u32 */ - ETHTOOL_A_C33_PSE_PW_D_STATUS, /* u32 */ - ETHTOOL_A_C33_PSE_PW_CLASS, /* u32 */ - ETHTOOL_A_C33_PSE_ACTUAL_PW, /* u32 */ - ETHTOOL_A_C33_PSE_EXT_STATE, /* u32 */ - ETHTOOL_A_C33_PSE_EXT_SUBSTATE, /* u32 */ - ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT, /* u32 */ - ETHTOOL_A_C33_PSE_PW_LIMIT_RANGES, /* nest - _C33_PSE_PW_LIMIT_* */ - - /* add new constants above here */ - __ETHTOOL_A_PSE_CNT, - ETHTOOL_A_PSE_MAX = (__ETHTOOL_A_PSE_CNT - 1) -}; - -enum { - ETHTOOL_A_RSS_UNSPEC, - ETHTOOL_A_RSS_HEADER, - ETHTOOL_A_RSS_CONTEXT, /* u32 */ - ETHTOOL_A_RSS_HFUNC, /* u32 */ - ETHTOOL_A_RSS_INDIR, /* binary */ - ETHTOOL_A_RSS_HKEY, /* binary */ - ETHTOOL_A_RSS_INPUT_XFRM, /* u32 */ - ETHTOOL_A_RSS_START_CONTEXT, /* u32 */ - - __ETHTOOL_A_RSS_CNT, - ETHTOOL_A_RSS_MAX = (__ETHTOOL_A_RSS_CNT - 1), -}; - -/* PLCA */ - -enum { - ETHTOOL_A_PLCA_UNSPEC, - ETHTOOL_A_PLCA_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_PLCA_VERSION, /* u16 */ - ETHTOOL_A_PLCA_ENABLED, /* u8 */ - ETHTOOL_A_PLCA_STATUS, /* u8 */ - ETHTOOL_A_PLCA_NODE_CNT, /* u32 */ - ETHTOOL_A_PLCA_NODE_ID, /* u32 */ - ETHTOOL_A_PLCA_TO_TMR, /* u32 */ - ETHTOOL_A_PLCA_BURST_CNT, /* u32 */ - ETHTOOL_A_PLCA_BURST_TMR, /* u32 */ - - /* add new constants above here */ - __ETHTOOL_A_PLCA_CNT, - ETHTOOL_A_PLCA_MAX = (__ETHTOOL_A_PLCA_CNT - 1) -}; - -/* MAC Merge (802.3) */ - -enum { - ETHTOOL_A_MM_STAT_UNSPEC, - ETHTOOL_A_MM_STAT_PAD, - - /* aMACMergeFrameAssErrorCount */ - ETHTOOL_A_MM_STAT_REASSEMBLY_ERRORS, /* u64 */ - /* aMACMergeFrameSmdErrorCount */ - ETHTOOL_A_MM_STAT_SMD_ERRORS, /* u64 */ - /* aMACMergeFrameAssOkCount */ - ETHTOOL_A_MM_STAT_REASSEMBLY_OK, /* u64 */ - /* aMACMergeFragCountRx */ - ETHTOOL_A_MM_STAT_RX_FRAG_COUNT, /* u64 */ - /* aMACMergeFragCountTx */ - ETHTOOL_A_MM_STAT_TX_FRAG_COUNT, /* u64 */ - /* aMACMergeHoldCount */ - ETHTOOL_A_MM_STAT_HOLD_COUNT, /* u64 */ - - /* add new constants above here */ - __ETHTOOL_A_MM_STAT_CNT, - ETHTOOL_A_MM_STAT_MAX = (__ETHTOOL_A_MM_STAT_CNT - 1) -}; - -enum { - ETHTOOL_A_MM_UNSPEC, - ETHTOOL_A_MM_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_MM_PMAC_ENABLED, /* u8 */ - ETHTOOL_A_MM_TX_ENABLED, /* u8 */ - ETHTOOL_A_MM_TX_ACTIVE, /* u8 */ - ETHTOOL_A_MM_TX_MIN_FRAG_SIZE, /* u32 */ - ETHTOOL_A_MM_RX_MIN_FRAG_SIZE, /* u32 */ - ETHTOOL_A_MM_VERIFY_ENABLED, /* u8 */ - ETHTOOL_A_MM_VERIFY_STATUS, /* u8 */ - ETHTOOL_A_MM_VERIFY_TIME, /* u32 */ - ETHTOOL_A_MM_MAX_VERIFY_TIME, /* u32 */ - ETHTOOL_A_MM_STATS, /* nest - _A_MM_STAT_* */ - - /* add new constants above here */ - __ETHTOOL_A_MM_CNT, - ETHTOOL_A_MM_MAX = (__ETHTOOL_A_MM_CNT - 1) -}; - -/* MODULE_FW_FLASH */ - -enum { - ETHTOOL_A_MODULE_FW_FLASH_UNSPEC, - ETHTOOL_A_MODULE_FW_FLASH_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_MODULE_FW_FLASH_FILE_NAME, /* string */ - ETHTOOL_A_MODULE_FW_FLASH_PASSWORD, /* u32 */ - ETHTOOL_A_MODULE_FW_FLASH_STATUS, /* u32 */ - ETHTOOL_A_MODULE_FW_FLASH_STATUS_MSG, /* string */ - ETHTOOL_A_MODULE_FW_FLASH_DONE, /* uint */ - ETHTOOL_A_MODULE_FW_FLASH_TOTAL, /* uint */ - - /* add new constants above here */ - __ETHTOOL_A_MODULE_FW_FLASH_CNT, - ETHTOOL_A_MODULE_FW_FLASH_MAX = (__ETHTOOL_A_MODULE_FW_FLASH_CNT - 1) -}; - -enum { - ETHTOOL_A_PHY_UNSPEC, - ETHTOOL_A_PHY_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_PHY_INDEX, /* u32 */ - ETHTOOL_A_PHY_DRVNAME, /* string */ - ETHTOOL_A_PHY_NAME, /* string */ - ETHTOOL_A_PHY_UPSTREAM_TYPE, /* u32 */ - ETHTOOL_A_PHY_UPSTREAM_INDEX, /* u32 */ - ETHTOOL_A_PHY_UPSTREAM_SFP_NAME, /* string */ - ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME, /* string */ - - /* add new constants above here */ - __ETHTOOL_A_PHY_CNT, - ETHTOOL_A_PHY_MAX = (__ETHTOOL_A_PHY_CNT - 1) -}; /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h new file mode 100644 index 0000000000000..4b4bf17d1a88e --- /dev/null +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -0,0 +1,899 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +#ifndef _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H +#define _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H + +/* TUNNEL INFO */ + +enum { + ETHTOOL_UDP_TUNNEL_TYPE_VXLAN, + ETHTOOL_UDP_TUNNEL_TYPE_GENEVE, + ETHTOOL_UDP_TUNNEL_TYPE_VXLAN_GPE, + + __ETHTOOL_UDP_TUNNEL_TYPE_CNT +}; + +/* request header */ + +enum ethtool_header_flags { + ETHTOOL_FLAG_COMPACT_BITSETS = 1 << 0, /* use compact bitsets in reply */ + ETHTOOL_FLAG_OMIT_REPLY = 1 << 1, /* provide optional reply for SET or ACT requests */ + ETHTOOL_FLAG_STATS = 1 << 2, /* request statistics, if supported by the driver */ +}; + +enum { + ETHTOOL_TCP_DATA_SPLIT_UNKNOWN = 0, + ETHTOOL_TCP_DATA_SPLIT_DISABLED, + ETHTOOL_TCP_DATA_SPLIT_ENABLED, +}; + +enum { + ETHTOOL_A_HEADER_UNSPEC, + ETHTOOL_A_HEADER_DEV_INDEX, /* u32 */ + ETHTOOL_A_HEADER_DEV_NAME, /* string */ + ETHTOOL_A_HEADER_FLAGS, /* u32 - ETHTOOL_FLAG_* */ + ETHTOOL_A_HEADER_PHY_INDEX, /* u32 */ + + /* add new constants above here */ + __ETHTOOL_A_HEADER_CNT, + ETHTOOL_A_HEADER_MAX = __ETHTOOL_A_HEADER_CNT - 1 +}; + +/* bit sets */ + +enum { + ETHTOOL_A_BITSET_BIT_UNSPEC, + ETHTOOL_A_BITSET_BIT_INDEX, /* u32 */ + ETHTOOL_A_BITSET_BIT_NAME, /* string */ + ETHTOOL_A_BITSET_BIT_VALUE, /* flag */ + + /* add new constants above here */ + __ETHTOOL_A_BITSET_BIT_CNT, + ETHTOOL_A_BITSET_BIT_MAX = __ETHTOOL_A_BITSET_BIT_CNT - 1 +}; + +enum { + ETHTOOL_A_BITSET_BITS_UNSPEC, + ETHTOOL_A_BITSET_BITS_BIT, /* nest - _A_BITSET_BIT_* */ + + /* add new constants above here */ + __ETHTOOL_A_BITSET_BITS_CNT, + ETHTOOL_A_BITSET_BITS_MAX = __ETHTOOL_A_BITSET_BITS_CNT - 1 +}; + +enum { + ETHTOOL_A_BITSET_UNSPEC, + ETHTOOL_A_BITSET_NOMASK, /* flag */ + ETHTOOL_A_BITSET_SIZE, /* u32 */ + ETHTOOL_A_BITSET_BITS, /* nest - _A_BITSET_BITS_* */ + ETHTOOL_A_BITSET_VALUE, /* binary */ + ETHTOOL_A_BITSET_MASK, /* binary */ + + /* add new constants above here */ + __ETHTOOL_A_BITSET_CNT, + ETHTOOL_A_BITSET_MAX = __ETHTOOL_A_BITSET_CNT - 1 +}; + +/* string sets */ + +enum { + ETHTOOL_A_STRING_UNSPEC, + ETHTOOL_A_STRING_INDEX, /* u32 */ + ETHTOOL_A_STRING_VALUE, /* string */ + + /* add new constants above here */ + __ETHTOOL_A_STRING_CNT, + ETHTOOL_A_STRING_MAX = __ETHTOOL_A_STRING_CNT - 1 +}; + +enum { + ETHTOOL_A_STRINGS_UNSPEC, + ETHTOOL_A_STRINGS_STRING, /* nest - _A_STRINGS_* */ + + /* add new constants above here */ + __ETHTOOL_A_STRINGS_CNT, + ETHTOOL_A_STRINGS_MAX = __ETHTOOL_A_STRINGS_CNT - 1 +}; + +enum { + ETHTOOL_A_STRINGSET_UNSPEC, + ETHTOOL_A_STRINGSET_ID, /* u32 */ + ETHTOOL_A_STRINGSET_COUNT, /* u32 */ + ETHTOOL_A_STRINGSET_STRINGS, /* nest - _A_STRINGS_* */ + + /* add new constants above here */ + __ETHTOOL_A_STRINGSET_CNT, + ETHTOOL_A_STRINGSET_MAX = __ETHTOOL_A_STRINGSET_CNT - 1 +}; + +enum { + ETHTOOL_A_STRINGSETS_UNSPEC, + ETHTOOL_A_STRINGSETS_STRINGSET, /* nest - _A_STRINGSET_* */ + + /* add new constants above here */ + __ETHTOOL_A_STRINGSETS_CNT, + ETHTOOL_A_STRINGSETS_MAX = __ETHTOOL_A_STRINGSETS_CNT - 1 +}; + +/* STRSET */ + +enum { + ETHTOOL_A_STRSET_UNSPEC, + ETHTOOL_A_STRSET_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_STRSET_STRINGSETS, /* nest - _A_STRINGSETS_* */ + ETHTOOL_A_STRSET_COUNTS_ONLY, /* flag */ + + /* add new constants above here */ + __ETHTOOL_A_STRSET_CNT, + ETHTOOL_A_STRSET_MAX = __ETHTOOL_A_STRSET_CNT - 1 +}; + +/* PRIVFLAGS */ + +enum { + ETHTOOL_A_PRIVFLAGS_UNSPEC, + ETHTOOL_A_PRIVFLAGS_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_PRIVFLAGS_FLAGS, /* bitset */ + + /* add new constants above here */ + __ETHTOOL_A_PRIVFLAGS_CNT, + ETHTOOL_A_PRIVFLAGS_MAX = __ETHTOOL_A_PRIVFLAGS_CNT - 1 +}; + +/* RINGS */ + +enum { + ETHTOOL_A_RINGS_UNSPEC, + ETHTOOL_A_RINGS_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_RINGS_RX_MAX, /* u32 */ + ETHTOOL_A_RINGS_RX_MINI_MAX, /* u32 */ + ETHTOOL_A_RINGS_RX_JUMBO_MAX, /* u32 */ + ETHTOOL_A_RINGS_TX_MAX, /* u32 */ + ETHTOOL_A_RINGS_RX, /* u32 */ + ETHTOOL_A_RINGS_RX_MINI, /* u32 */ + ETHTOOL_A_RINGS_RX_JUMBO, /* u32 */ + ETHTOOL_A_RINGS_TX, /* u32 */ + ETHTOOL_A_RINGS_RX_BUF_LEN, /* u32 */ + ETHTOOL_A_RINGS_TCP_DATA_SPLIT, /* u8 */ + ETHTOOL_A_RINGS_CQE_SIZE, /* u32 */ + ETHTOOL_A_RINGS_TX_PUSH, /* u8 */ + ETHTOOL_A_RINGS_RX_PUSH, /* u8 */ + ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN, /* u32 */ + ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX, /* u32 */ + + /* add new constants above here */ + __ETHTOOL_A_RINGS_CNT, + ETHTOOL_A_RINGS_MAX = (__ETHTOOL_A_RINGS_CNT - 1) +}; + +/* MAC Merge (802.3) */ + +enum { + ETHTOOL_A_MM_STAT_UNSPEC, + ETHTOOL_A_MM_STAT_PAD, + + /* aMACMergeFrameAssErrorCount */ + ETHTOOL_A_MM_STAT_REASSEMBLY_ERRORS, /* u64 */ + /* aMACMergeFrameSmdErrorCount */ + ETHTOOL_A_MM_STAT_SMD_ERRORS, /* u64 */ + /* aMACMergeFrameAssOkCount */ + ETHTOOL_A_MM_STAT_REASSEMBLY_OK, /* u64 */ + /* aMACMergeFragCountRx */ + ETHTOOL_A_MM_STAT_RX_FRAG_COUNT, /* u64 */ + /* aMACMergeFragCountTx */ + ETHTOOL_A_MM_STAT_TX_FRAG_COUNT, /* u64 */ + /* aMACMergeHoldCount */ + ETHTOOL_A_MM_STAT_HOLD_COUNT, /* u64 */ + + /* add new constants above here */ + __ETHTOOL_A_MM_STAT_CNT, + ETHTOOL_A_MM_STAT_MAX = (__ETHTOOL_A_MM_STAT_CNT - 1) +}; + +enum { + ETHTOOL_A_MM_UNSPEC, + ETHTOOL_A_MM_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_MM_PMAC_ENABLED, /* u8 */ + ETHTOOL_A_MM_TX_ENABLED, /* u8 */ + ETHTOOL_A_MM_TX_ACTIVE, /* u8 */ + ETHTOOL_A_MM_TX_MIN_FRAG_SIZE, /* u32 */ + ETHTOOL_A_MM_RX_MIN_FRAG_SIZE, /* u32 */ + ETHTOOL_A_MM_VERIFY_ENABLED, /* u8 */ + ETHTOOL_A_MM_VERIFY_STATUS, /* u8 */ + ETHTOOL_A_MM_VERIFY_TIME, /* u32 */ + ETHTOOL_A_MM_MAX_VERIFY_TIME, /* u32 */ + ETHTOOL_A_MM_STATS, /* nest - _A_MM_STAT_* */ + + /* add new constants above here */ + __ETHTOOL_A_MM_CNT, + ETHTOOL_A_MM_MAX = (__ETHTOOL_A_MM_CNT - 1) +}; + +/* LINKINFO */ + +enum { + ETHTOOL_A_LINKINFO_UNSPEC, + ETHTOOL_A_LINKINFO_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_LINKINFO_PORT, /* u8 */ + ETHTOOL_A_LINKINFO_PHYADDR, /* u8 */ + ETHTOOL_A_LINKINFO_TP_MDIX, /* u8 */ + ETHTOOL_A_LINKINFO_TP_MDIX_CTRL, /* u8 */ + ETHTOOL_A_LINKINFO_TRANSCEIVER, /* u8 */ + + /* add new constants above here */ + __ETHTOOL_A_LINKINFO_CNT, + ETHTOOL_A_LINKINFO_MAX = __ETHTOOL_A_LINKINFO_CNT - 1 +}; + +/* LINKMODES */ + +enum { + ETHTOOL_A_LINKMODES_UNSPEC, + ETHTOOL_A_LINKMODES_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_LINKMODES_AUTONEG, /* u8 */ + ETHTOOL_A_LINKMODES_OURS, /* bitset */ + ETHTOOL_A_LINKMODES_PEER, /* bitset */ + ETHTOOL_A_LINKMODES_SPEED, /* u32 */ + ETHTOOL_A_LINKMODES_DUPLEX, /* u8 */ + ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG, /* u8 */ + ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE, /* u8 */ + ETHTOOL_A_LINKMODES_LANES, /* u32 */ + ETHTOOL_A_LINKMODES_RATE_MATCHING, /* u8 */ + + /* add new constants above here */ + __ETHTOOL_A_LINKMODES_CNT, + ETHTOOL_A_LINKMODES_MAX = __ETHTOOL_A_LINKMODES_CNT - 1 +}; + +/* LINKSTATE */ + +enum { + ETHTOOL_A_LINKSTATE_UNSPEC, + ETHTOOL_A_LINKSTATE_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_LINKSTATE_LINK, /* u8 */ + ETHTOOL_A_LINKSTATE_SQI, /* u32 */ + ETHTOOL_A_LINKSTATE_SQI_MAX, /* u32 */ + ETHTOOL_A_LINKSTATE_EXT_STATE, /* u8 */ + ETHTOOL_A_LINKSTATE_EXT_SUBSTATE, /* u8 */ + ETHTOOL_A_LINKSTATE_EXT_DOWN_CNT, /* u32 */ + + /* add new constants above here */ + __ETHTOOL_A_LINKSTATE_CNT, + ETHTOOL_A_LINKSTATE_MAX = __ETHTOOL_A_LINKSTATE_CNT - 1 +}; + +/* DEBUG */ + +enum { + ETHTOOL_A_DEBUG_UNSPEC, + ETHTOOL_A_DEBUG_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_DEBUG_MSGMASK, /* bitset */ + + /* add new constants above here */ + __ETHTOOL_A_DEBUG_CNT, + ETHTOOL_A_DEBUG_MAX = __ETHTOOL_A_DEBUG_CNT - 1 +}; + +/* WOL */ + +enum { + ETHTOOL_A_WOL_UNSPEC, + ETHTOOL_A_WOL_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_WOL_MODES, /* bitset */ + ETHTOOL_A_WOL_SOPASS, /* binary */ + + /* add new constants above here */ + __ETHTOOL_A_WOL_CNT, + ETHTOOL_A_WOL_MAX = __ETHTOOL_A_WOL_CNT - 1 +}; + +/* FEATURES */ + +enum { + ETHTOOL_A_FEATURES_UNSPEC, + ETHTOOL_A_FEATURES_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_FEATURES_HW, /* bitset */ + ETHTOOL_A_FEATURES_WANTED, /* bitset */ + ETHTOOL_A_FEATURES_ACTIVE, /* bitset */ + ETHTOOL_A_FEATURES_NOCHANGE, /* bitset */ + + /* add new constants above here */ + __ETHTOOL_A_FEATURES_CNT, + ETHTOOL_A_FEATURES_MAX = __ETHTOOL_A_FEATURES_CNT - 1 +}; + +/* CHANNELS */ + +enum { + ETHTOOL_A_CHANNELS_UNSPEC, + ETHTOOL_A_CHANNELS_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_CHANNELS_RX_MAX, /* u32 */ + ETHTOOL_A_CHANNELS_TX_MAX, /* u32 */ + ETHTOOL_A_CHANNELS_OTHER_MAX, /* u32 */ + ETHTOOL_A_CHANNELS_COMBINED_MAX, /* u32 */ + ETHTOOL_A_CHANNELS_RX_COUNT, /* u32 */ + ETHTOOL_A_CHANNELS_TX_COUNT, /* u32 */ + ETHTOOL_A_CHANNELS_OTHER_COUNT, /* u32 */ + ETHTOOL_A_CHANNELS_COMBINED_COUNT, /* u32 */ + + /* add new constants above here */ + __ETHTOOL_A_CHANNELS_CNT, + ETHTOOL_A_CHANNELS_MAX = (__ETHTOOL_A_CHANNELS_CNT - 1) +}; + +enum { + ETHTOOL_A_IRQ_MODERATION_UNSPEC, + ETHTOOL_A_IRQ_MODERATION_USEC, /* u32 */ + ETHTOOL_A_IRQ_MODERATION_PKTS, /* u32 */ + ETHTOOL_A_IRQ_MODERATION_COMPS, /* u32 */ + + __ETHTOOL_A_IRQ_MODERATION_CNT, + ETHTOOL_A_IRQ_MODERATION_MAX = (__ETHTOOL_A_IRQ_MODERATION_CNT - 1) +}; + +enum { + ETHTOOL_A_PROFILE_UNSPEC, + /* nest, _A_IRQ_MODERATION_* */ + ETHTOOL_A_PROFILE_IRQ_MODERATION, + __ETHTOOL_A_PROFILE_CNT, + ETHTOOL_A_PROFILE_MAX = (__ETHTOOL_A_PROFILE_CNT - 1) +}; + +/* COALESCE */ + +enum { + ETHTOOL_A_COALESCE_UNSPEC, + ETHTOOL_A_COALESCE_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_COALESCE_RX_USECS, /* u32 */ + ETHTOOL_A_COALESCE_RX_MAX_FRAMES, /* u32 */ + ETHTOOL_A_COALESCE_RX_USECS_IRQ, /* u32 */ + ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ, /* u32 */ + ETHTOOL_A_COALESCE_TX_USECS, /* u32 */ + ETHTOOL_A_COALESCE_TX_MAX_FRAMES, /* u32 */ + ETHTOOL_A_COALESCE_TX_USECS_IRQ, /* u32 */ + ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ, /* u32 */ + ETHTOOL_A_COALESCE_STATS_BLOCK_USECS, /* u32 */ + ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX, /* u8 */ + ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX, /* u8 */ + ETHTOOL_A_COALESCE_PKT_RATE_LOW, /* u32 */ + ETHTOOL_A_COALESCE_RX_USECS_LOW, /* u32 */ + ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW, /* u32 */ + ETHTOOL_A_COALESCE_TX_USECS_LOW, /* u32 */ + ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW, /* u32 */ + ETHTOOL_A_COALESCE_PKT_RATE_HIGH, /* u32 */ + ETHTOOL_A_COALESCE_RX_USECS_HIGH, /* u32 */ + ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH, /* u32 */ + ETHTOOL_A_COALESCE_TX_USECS_HIGH, /* u32 */ + ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH, /* u32 */ + ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL, /* u32 */ + ETHTOOL_A_COALESCE_USE_CQE_MODE_TX, /* u8 */ + ETHTOOL_A_COALESCE_USE_CQE_MODE_RX, /* u8 */ + ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES, /* u32 */ + ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES, /* u32 */ + ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS, /* u32 */ + /* nest - _A_PROFILE_IRQ_MODERATION */ + ETHTOOL_A_COALESCE_RX_PROFILE, + /* nest - _A_PROFILE_IRQ_MODERATION */ + ETHTOOL_A_COALESCE_TX_PROFILE, + + /* add new constants above here */ + __ETHTOOL_A_COALESCE_CNT, + ETHTOOL_A_COALESCE_MAX = (__ETHTOOL_A_COALESCE_CNT - 1) +}; + +/* PAUSE */ + +enum { + ETHTOOL_A_PAUSE_STAT_UNSPEC, + ETHTOOL_A_PAUSE_STAT_PAD, + + ETHTOOL_A_PAUSE_STAT_TX_FRAMES, + ETHTOOL_A_PAUSE_STAT_RX_FRAMES, + + /* add new constants above here + * adjust ETHTOOL_PAUSE_STAT_CNT if adding non-stats! + */ + __ETHTOOL_A_PAUSE_STAT_CNT, + ETHTOOL_A_PAUSE_STAT_MAX = (__ETHTOOL_A_PAUSE_STAT_CNT - 1) +}; + +enum { + ETHTOOL_A_PAUSE_UNSPEC, + ETHTOOL_A_PAUSE_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_PAUSE_AUTONEG, /* u8 */ + ETHTOOL_A_PAUSE_RX, /* u8 */ + ETHTOOL_A_PAUSE_TX, /* u8 */ + ETHTOOL_A_PAUSE_STATS, /* nest - _PAUSE_STAT_* */ + ETHTOOL_A_PAUSE_STATS_SRC, /* u32 */ + + /* add new constants above here */ + __ETHTOOL_A_PAUSE_CNT, + ETHTOOL_A_PAUSE_MAX = (__ETHTOOL_A_PAUSE_CNT - 1) +}; + +/* EEE */ + +enum { + ETHTOOL_A_EEE_UNSPEC, + ETHTOOL_A_EEE_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_EEE_MODES_OURS, /* bitset */ + ETHTOOL_A_EEE_MODES_PEER, /* bitset */ + ETHTOOL_A_EEE_ACTIVE, /* u8 */ + ETHTOOL_A_EEE_ENABLED, /* u8 */ + ETHTOOL_A_EEE_TX_LPI_ENABLED, /* u8 */ + ETHTOOL_A_EEE_TX_LPI_TIMER, /* u32 */ + + /* add new constants above here */ + __ETHTOOL_A_EEE_CNT, + ETHTOOL_A_EEE_MAX = (__ETHTOOL_A_EEE_CNT - 1) +}; + +/* TSINFO */ + +enum { + ETHTOOL_A_TS_STAT_UNSPEC, + + ETHTOOL_A_TS_STAT_TX_PKTS, /* uint */ + ETHTOOL_A_TS_STAT_TX_LOST, /* uint */ + ETHTOOL_A_TS_STAT_TX_ERR, /* uint */ + + /* add new constants above here */ + __ETHTOOL_A_TS_STAT_CNT, + ETHTOOL_A_TS_STAT_MAX = (__ETHTOOL_A_TS_STAT_CNT - 1) + +}; + +enum { + ETHTOOL_A_TSINFO_UNSPEC, + ETHTOOL_A_TSINFO_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_TSINFO_TIMESTAMPING, /* bitset */ + ETHTOOL_A_TSINFO_TX_TYPES, /* bitset */ + ETHTOOL_A_TSINFO_RX_FILTERS, /* bitset */ + ETHTOOL_A_TSINFO_PHC_INDEX, /* u32 */ + ETHTOOL_A_TSINFO_STATS, /* nest - _A_TSINFO_STAT */ + + /* add new constants above here */ + __ETHTOOL_A_TSINFO_CNT, + ETHTOOL_A_TSINFO_MAX = (__ETHTOOL_A_TSINFO_CNT - 1) +}; + +enum { + ETHTOOL_A_CABLE_RESULT_UNSPEC, + ETHTOOL_A_CABLE_RESULT_PAIR, /* u8 ETHTOOL_A_CABLE_PAIR_ */ + ETHTOOL_A_CABLE_RESULT_CODE, /* u8 ETHTOOL_A_CABLE_RESULT_CODE_ */ + ETHTOOL_A_CABLE_RESULT_SRC, /* u32 ETHTOOL_A_CABLE_INF_SRC_ */ + + __ETHTOOL_A_CABLE_RESULT_CNT, + ETHTOOL_A_CABLE_RESULT_MAX = (__ETHTOOL_A_CABLE_RESULT_CNT - 1) +}; + +enum { + ETHTOOL_A_CABLE_FAULT_LENGTH_UNSPEC, + ETHTOOL_A_CABLE_FAULT_LENGTH_PAIR, /* u8 ETHTOOL_A_CABLE_PAIR_ */ + ETHTOOL_A_CABLE_FAULT_LENGTH_CM, /* u32 */ + ETHTOOL_A_CABLE_FAULT_LENGTH_SRC, /* u32 ETHTOOL_A_CABLE_INF_SRC_ */ + + __ETHTOOL_A_CABLE_FAULT_LENGTH_CNT, + ETHTOOL_A_CABLE_FAULT_LENGTH_MAX = (__ETHTOOL_A_CABLE_FAULT_LENGTH_CNT - 1) +}; + +enum { + ETHTOOL_A_CABLE_NEST_UNSPEC, + ETHTOOL_A_CABLE_NEST_RESULT, /* nest - ETHTOOL_A_CABLE_RESULT_ */ + ETHTOOL_A_CABLE_NEST_FAULT_LENGTH, /* nest - ETHTOOL_A_CABLE_FAULT_LENGTH_ */ + __ETHTOOL_A_CABLE_NEST_CNT, + ETHTOOL_A_CABLE_NEST_MAX = (__ETHTOOL_A_CABLE_NEST_CNT - 1) +}; + +/* CABLE TEST */ + +enum { + ETHTOOL_A_CABLE_TEST_UNSPEC, + ETHTOOL_A_CABLE_TEST_HEADER, /* nest - _A_HEADER_* */ + + /* add new constants above here */ + __ETHTOOL_A_CABLE_TEST_CNT, + ETHTOOL_A_CABLE_TEST_MAX = __ETHTOOL_A_CABLE_TEST_CNT - 1 +}; + +enum { + ETHTOOL_A_CABLE_TEST_NTF_UNSPEC, + ETHTOOL_A_CABLE_TEST_NTF_HEADER, /* nest - ETHTOOL_A_HEADER_* */ + ETHTOOL_A_CABLE_TEST_NTF_STATUS, /* u8 - _STARTED/_COMPLETE */ + ETHTOOL_A_CABLE_TEST_NTF_NEST, /* nest - of results: */ + + __ETHTOOL_A_CABLE_TEST_NTF_CNT, + ETHTOOL_A_CABLE_TEST_NTF_MAX = (__ETHTOOL_A_CABLE_TEST_NTF_CNT - 1) +}; + +/* CABLE TEST TDR */ + +enum { + ETHTOOL_A_CABLE_TEST_TDR_CFG_UNSPEC, + ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST, /* u32 */ + ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST, /* u32 */ + ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP, /* u32 */ + ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR, /* u8 */ + + /* add new constants above here */ + __ETHTOOL_A_CABLE_TEST_TDR_CFG_CNT, + ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX = __ETHTOOL_A_CABLE_TEST_TDR_CFG_CNT - 1 +}; + +enum { + ETHTOOL_A_CABLE_TEST_TDR_NTF_UNSPEC, + ETHTOOL_A_CABLE_TEST_TDR_NTF_HEADER, /* nest - ETHTOOL_A_HEADER_* */ + ETHTOOL_A_CABLE_TEST_TDR_NTF_STATUS, /* u8 - _STARTED/_COMPLETE */ + ETHTOOL_A_CABLE_TEST_TDR_NTF_NEST, /* nest - of results: */ + + /* add new constants above here */ + __ETHTOOL_A_CABLE_TEST_TDR_NTF_CNT, + ETHTOOL_A_CABLE_TEST_TDR_NTF_MAX = __ETHTOOL_A_CABLE_TEST_TDR_NTF_CNT - 1 +}; + +enum { + ETHTOOL_A_CABLE_TEST_TDR_UNSPEC, + ETHTOOL_A_CABLE_TEST_TDR_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_CABLE_TEST_TDR_CFG, /* nest - *_TDR_CFG_* */ + + /* add new constants above here */ + __ETHTOOL_A_CABLE_TEST_TDR_CNT, + ETHTOOL_A_CABLE_TEST_TDR_MAX = __ETHTOOL_A_CABLE_TEST_TDR_CNT - 1 +}; + +enum { + ETHTOOL_A_TUNNEL_UDP_ENTRY_UNSPEC, + + ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT, /* be16 */ + ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE, /* u32 */ + + /* add new constants above here */ + __ETHTOOL_A_TUNNEL_UDP_ENTRY_CNT, + ETHTOOL_A_TUNNEL_UDP_ENTRY_MAX = (__ETHTOOL_A_TUNNEL_UDP_ENTRY_CNT - 1) +}; + +enum { + ETHTOOL_A_TUNNEL_UDP_TABLE_UNSPEC, + + ETHTOOL_A_TUNNEL_UDP_TABLE_SIZE, /* u32 */ + ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES, /* bitset */ + ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY, /* nest - _UDP_ENTRY_* */ + + /* add new constants above here */ + __ETHTOOL_A_TUNNEL_UDP_TABLE_CNT, + ETHTOOL_A_TUNNEL_UDP_TABLE_MAX = (__ETHTOOL_A_TUNNEL_UDP_TABLE_CNT - 1) +}; + +enum { + ETHTOOL_A_TUNNEL_UDP_UNSPEC, + + ETHTOOL_A_TUNNEL_UDP_TABLE, /* nest - _UDP_TABLE_* */ + + /* add new constants above here */ + __ETHTOOL_A_TUNNEL_UDP_CNT, + ETHTOOL_A_TUNNEL_UDP_MAX = (__ETHTOOL_A_TUNNEL_UDP_CNT - 1) +}; + +enum { + ETHTOOL_A_TUNNEL_INFO_UNSPEC, + ETHTOOL_A_TUNNEL_INFO_HEADER, /* nest - _A_HEADER_* */ + + ETHTOOL_A_TUNNEL_INFO_UDP_PORTS, /* nest - _UDP_TABLE */ + + /* add new constants above here */ + __ETHTOOL_A_TUNNEL_INFO_CNT, + ETHTOOL_A_TUNNEL_INFO_MAX = (__ETHTOOL_A_TUNNEL_INFO_CNT - 1) +}; + +/* FEC */ + +enum { + ETHTOOL_A_FEC_STAT_UNSPEC, + ETHTOOL_A_FEC_STAT_PAD, + + ETHTOOL_A_FEC_STAT_CORRECTED, /* array, u64 */ + ETHTOOL_A_FEC_STAT_UNCORR, /* array, u64 */ + ETHTOOL_A_FEC_STAT_CORR_BITS, /* array, u64 */ + + /* add new constants above here */ + __ETHTOOL_A_FEC_STAT_CNT, + ETHTOOL_A_FEC_STAT_MAX = (__ETHTOOL_A_FEC_STAT_CNT - 1) +}; + +enum { + ETHTOOL_A_FEC_UNSPEC, + ETHTOOL_A_FEC_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_FEC_MODES, /* bitset */ + ETHTOOL_A_FEC_AUTO, /* u8 */ + ETHTOOL_A_FEC_ACTIVE, /* u32 */ + ETHTOOL_A_FEC_STATS, /* nest - _A_FEC_STAT */ + + __ETHTOOL_A_FEC_CNT, + ETHTOOL_A_FEC_MAX = (__ETHTOOL_A_FEC_CNT - 1) +}; + +/* MODULE EEPROM */ + +enum { + ETHTOOL_A_MODULE_EEPROM_UNSPEC, + ETHTOOL_A_MODULE_EEPROM_HEADER, /* nest - _A_HEADER_* */ + + ETHTOOL_A_MODULE_EEPROM_OFFSET, /* u32 */ + ETHTOOL_A_MODULE_EEPROM_LENGTH, /* u32 */ + ETHTOOL_A_MODULE_EEPROM_PAGE, /* u8 */ + ETHTOOL_A_MODULE_EEPROM_BANK, /* u8 */ + ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS, /* u8 */ + ETHTOOL_A_MODULE_EEPROM_DATA, /* binary */ + + __ETHTOOL_A_MODULE_EEPROM_CNT, + ETHTOOL_A_MODULE_EEPROM_MAX = (__ETHTOOL_A_MODULE_EEPROM_CNT - 1) +}; + + +enum { + ETHTOOL_A_STATS_GRP_UNSPEC, + ETHTOOL_A_STATS_GRP_PAD, + + ETHTOOL_A_STATS_GRP_ID, /* u32 */ + ETHTOOL_A_STATS_GRP_SS_ID, /* u32 */ + + ETHTOOL_A_STATS_GRP_STAT, /* nest */ + + ETHTOOL_A_STATS_GRP_HIST_RX, /* nest */ + ETHTOOL_A_STATS_GRP_HIST_TX, /* nest */ + + ETHTOOL_A_STATS_GRP_HIST_BKT_LOW, /* u32 */ + ETHTOOL_A_STATS_GRP_HIST_BKT_HI, /* u32 */ + ETHTOOL_A_STATS_GRP_HIST_VAL, /* u64 */ + + /* add new constants above here */ + __ETHTOOL_A_STATS_GRP_CNT, + ETHTOOL_A_STATS_GRP_MAX = (__ETHTOOL_A_STATS_GRP_CNT - 1) +}; + +/* STATS */ + +enum { + ETHTOOL_A_STATS_UNSPEC, + ETHTOOL_A_STATS_PAD, + ETHTOOL_A_STATS_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_STATS_GROUPS, /* bitset */ + + ETHTOOL_A_STATS_GRP, /* nest - _A_STATS_GRP_* */ + + ETHTOOL_A_STATS_SRC, /* u32 */ + + /* add new constants above here */ + __ETHTOOL_A_STATS_CNT, + ETHTOOL_A_STATS_MAX = (__ETHTOOL_A_STATS_CNT - 1) +}; + +/* PHC VCLOCKS */ + +enum { + ETHTOOL_A_PHC_VCLOCKS_UNSPEC, + ETHTOOL_A_PHC_VCLOCKS_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_PHC_VCLOCKS_NUM, /* u32 */ + ETHTOOL_A_PHC_VCLOCKS_INDEX, /* array, s32 */ + + /* add new constants above here */ + __ETHTOOL_A_PHC_VCLOCKS_CNT, + ETHTOOL_A_PHC_VCLOCKS_MAX = (__ETHTOOL_A_PHC_VCLOCKS_CNT - 1) +}; + +/* MODULE */ + +enum { + ETHTOOL_A_MODULE_UNSPEC, + ETHTOOL_A_MODULE_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_MODULE_POWER_MODE_POLICY, /* u8 */ + ETHTOOL_A_MODULE_POWER_MODE, /* u8 */ + + /* add new constants above here */ + __ETHTOOL_A_MODULE_CNT, + ETHTOOL_A_MODULE_MAX = (__ETHTOOL_A_MODULE_CNT - 1) +}; + +/* Power Sourcing Equipment */ +enum { + ETHTOOL_A_C33_PSE_PW_LIMIT_UNSPEC, + ETHTOOL_A_C33_PSE_PW_LIMIT_MIN, /* u32 */ + ETHTOOL_A_C33_PSE_PW_LIMIT_MAX, /* u32 */ +}; + +enum { + ETHTOOL_A_PSE_UNSPEC, + ETHTOOL_A_PSE_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_PODL_PSE_ADMIN_STATE, /* u32 */ + ETHTOOL_A_PODL_PSE_ADMIN_CONTROL, /* u32 */ + ETHTOOL_A_PODL_PSE_PW_D_STATUS, /* u32 */ + ETHTOOL_A_C33_PSE_ADMIN_STATE, /* u32 */ + ETHTOOL_A_C33_PSE_ADMIN_CONTROL, /* u32 */ + ETHTOOL_A_C33_PSE_PW_D_STATUS, /* u32 */ + ETHTOOL_A_C33_PSE_PW_CLASS, /* u32 */ + ETHTOOL_A_C33_PSE_ACTUAL_PW, /* u32 */ + ETHTOOL_A_C33_PSE_EXT_STATE, /* u32 */ + ETHTOOL_A_C33_PSE_EXT_SUBSTATE, /* u32 */ + ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT, /* u32 */ + ETHTOOL_A_C33_PSE_PW_LIMIT_RANGES, /* nest - _C33_PSE_PW_LIMIT_* */ + + /* add new constants above here */ + __ETHTOOL_A_PSE_CNT, + ETHTOOL_A_PSE_MAX = (__ETHTOOL_A_PSE_CNT - 1) +}; + +enum { + ETHTOOL_A_RSS_UNSPEC, + ETHTOOL_A_RSS_HEADER, + ETHTOOL_A_RSS_CONTEXT, /* u32 */ + ETHTOOL_A_RSS_HFUNC, /* u32 */ + ETHTOOL_A_RSS_INDIR, /* binary */ + ETHTOOL_A_RSS_HKEY, /* binary */ + ETHTOOL_A_RSS_INPUT_XFRM, /* u32 */ + ETHTOOL_A_RSS_START_CONTEXT, /* u32 */ + + __ETHTOOL_A_RSS_CNT, + ETHTOOL_A_RSS_MAX = (__ETHTOOL_A_RSS_CNT - 1), +}; + +/* PLCA */ + +enum { + ETHTOOL_A_PLCA_UNSPEC, + ETHTOOL_A_PLCA_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_PLCA_VERSION, /* u16 */ + ETHTOOL_A_PLCA_ENABLED, /* u8 */ + ETHTOOL_A_PLCA_STATUS, /* u8 */ + ETHTOOL_A_PLCA_NODE_CNT, /* u32 */ + ETHTOOL_A_PLCA_NODE_ID, /* u32 */ + ETHTOOL_A_PLCA_TO_TMR, /* u32 */ + ETHTOOL_A_PLCA_BURST_CNT, /* u32 */ + ETHTOOL_A_PLCA_BURST_TMR, /* u32 */ + + /* add new constants above here */ + __ETHTOOL_A_PLCA_CNT, + ETHTOOL_A_PLCA_MAX = (__ETHTOOL_A_PLCA_CNT - 1) +}; + +/* MODULE_FW_FLASH */ + +enum { + ETHTOOL_A_MODULE_FW_FLASH_UNSPEC, + ETHTOOL_A_MODULE_FW_FLASH_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_MODULE_FW_FLASH_FILE_NAME, /* string */ + ETHTOOL_A_MODULE_FW_FLASH_PASSWORD, /* u32 */ + ETHTOOL_A_MODULE_FW_FLASH_STATUS, /* u32 */ + ETHTOOL_A_MODULE_FW_FLASH_STATUS_MSG, /* string */ + ETHTOOL_A_MODULE_FW_FLASH_DONE, /* uint */ + ETHTOOL_A_MODULE_FW_FLASH_TOTAL, /* uint */ + + /* add new constants above here */ + __ETHTOOL_A_MODULE_FW_FLASH_CNT, + ETHTOOL_A_MODULE_FW_FLASH_MAX = (__ETHTOOL_A_MODULE_FW_FLASH_CNT - 1) +}; + +enum { + ETHTOOL_A_PHY_UNSPEC, + ETHTOOL_A_PHY_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_PHY_INDEX, /* u32 */ + ETHTOOL_A_PHY_DRVNAME, /* string */ + ETHTOOL_A_PHY_NAME, /* string */ + ETHTOOL_A_PHY_UPSTREAM_TYPE, /* u32 */ + ETHTOOL_A_PHY_UPSTREAM_INDEX, /* u32 */ + ETHTOOL_A_PHY_UPSTREAM_SFP_NAME, /* string */ + ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME, /* string */ + + /* add new constants above here */ + __ETHTOOL_A_PHY_CNT, + ETHTOOL_A_PHY_MAX = (__ETHTOOL_A_PHY_CNT - 1) +}; + +/* message types - userspace to kernel */ +enum { + ETHTOOL_MSG_USER_NONE, + ETHTOOL_MSG_STRSET_GET, + ETHTOOL_MSG_LINKINFO_GET, + ETHTOOL_MSG_LINKINFO_SET, + ETHTOOL_MSG_LINKMODES_GET, + ETHTOOL_MSG_LINKMODES_SET, + ETHTOOL_MSG_LINKSTATE_GET, + ETHTOOL_MSG_DEBUG_GET, + ETHTOOL_MSG_DEBUG_SET, + ETHTOOL_MSG_WOL_GET, + ETHTOOL_MSG_WOL_SET, + ETHTOOL_MSG_FEATURES_GET, + ETHTOOL_MSG_FEATURES_SET, + ETHTOOL_MSG_PRIVFLAGS_GET, + ETHTOOL_MSG_PRIVFLAGS_SET, + ETHTOOL_MSG_RINGS_GET, + ETHTOOL_MSG_RINGS_SET, + ETHTOOL_MSG_CHANNELS_GET, + ETHTOOL_MSG_CHANNELS_SET, + ETHTOOL_MSG_COALESCE_GET, + ETHTOOL_MSG_COALESCE_SET, + ETHTOOL_MSG_PAUSE_GET, + ETHTOOL_MSG_PAUSE_SET, + ETHTOOL_MSG_EEE_GET, + ETHTOOL_MSG_EEE_SET, + ETHTOOL_MSG_TSINFO_GET, + ETHTOOL_MSG_CABLE_TEST_ACT, + ETHTOOL_MSG_CABLE_TEST_TDR_ACT, + ETHTOOL_MSG_TUNNEL_INFO_GET, + ETHTOOL_MSG_FEC_GET, + ETHTOOL_MSG_FEC_SET, + ETHTOOL_MSG_MODULE_EEPROM_GET, + ETHTOOL_MSG_STATS_GET, + ETHTOOL_MSG_PHC_VCLOCKS_GET, + ETHTOOL_MSG_MODULE_GET, + ETHTOOL_MSG_MODULE_SET, + ETHTOOL_MSG_PSE_GET, + ETHTOOL_MSG_PSE_SET, + ETHTOOL_MSG_RSS_GET, + ETHTOOL_MSG_PLCA_GET_CFG, + ETHTOOL_MSG_PLCA_SET_CFG, + ETHTOOL_MSG_PLCA_GET_STATUS, + ETHTOOL_MSG_MM_GET, + ETHTOOL_MSG_MM_SET, + ETHTOOL_MSG_MODULE_FW_FLASH_ACT, + ETHTOOL_MSG_PHY_GET, + + /* add new constants above here */ + __ETHTOOL_MSG_USER_CNT, + ETHTOOL_MSG_USER_MAX = __ETHTOOL_MSG_USER_CNT - 1 +}; + +/* message types - kernel to userspace */ +enum { + ETHTOOL_MSG_KERNEL_NONE, + ETHTOOL_MSG_STRSET_GET_REPLY, + ETHTOOL_MSG_LINKINFO_GET_REPLY, + ETHTOOL_MSG_LINKINFO_NTF, + ETHTOOL_MSG_LINKMODES_GET_REPLY, + ETHTOOL_MSG_LINKMODES_NTF, + ETHTOOL_MSG_LINKSTATE_GET_REPLY, + ETHTOOL_MSG_DEBUG_GET_REPLY, + ETHTOOL_MSG_DEBUG_NTF, + ETHTOOL_MSG_WOL_GET_REPLY, + ETHTOOL_MSG_WOL_NTF, + ETHTOOL_MSG_FEATURES_GET_REPLY, + ETHTOOL_MSG_FEATURES_SET_REPLY, + ETHTOOL_MSG_FEATURES_NTF, + ETHTOOL_MSG_PRIVFLAGS_GET_REPLY, + ETHTOOL_MSG_PRIVFLAGS_NTF, + ETHTOOL_MSG_RINGS_GET_REPLY, + ETHTOOL_MSG_RINGS_NTF, + ETHTOOL_MSG_CHANNELS_GET_REPLY, + ETHTOOL_MSG_CHANNELS_NTF, + ETHTOOL_MSG_COALESCE_GET_REPLY, + ETHTOOL_MSG_COALESCE_NTF, + ETHTOOL_MSG_PAUSE_GET_REPLY, + ETHTOOL_MSG_PAUSE_NTF, + ETHTOOL_MSG_EEE_GET_REPLY, + ETHTOOL_MSG_EEE_NTF, + ETHTOOL_MSG_TSINFO_GET_REPLY, + ETHTOOL_MSG_CABLE_TEST_NTF, + ETHTOOL_MSG_CABLE_TEST_TDR_NTF, + ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY, + ETHTOOL_MSG_FEC_GET_REPLY, + ETHTOOL_MSG_FEC_NTF, + ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY, + ETHTOOL_MSG_STATS_GET_REPLY, + ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY, + ETHTOOL_MSG_MODULE_GET_REPLY, + ETHTOOL_MSG_MODULE_NTF, + ETHTOOL_MSG_PSE_GET_REPLY, + ETHTOOL_MSG_RSS_GET_REPLY, + ETHTOOL_MSG_PLCA_GET_CFG_REPLY, + ETHTOOL_MSG_PLCA_GET_STATUS_REPLY, + ETHTOOL_MSG_PLCA_NTF, + ETHTOOL_MSG_MM_GET_REPLY, + ETHTOOL_MSG_MM_NTF, + ETHTOOL_MSG_MODULE_FW_FLASH_NTF, + ETHTOOL_MSG_PHY_GET_REPLY, + ETHTOOL_MSG_PHY_NTF, + + /* add new constants above here */ + __ETHTOOL_MSG_KERNEL_CNT, + ETHTOOL_MSG_KERNEL_MAX = __ETHTOOL_MSG_KERNEL_CNT - 1 +}; + +#endif /* _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H */ From dd7cde36de15b071b5f9163d21d7c9142089b424 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 4 Dec 2024 07:55:48 -0800 Subject: [PATCH 0194/1450] ethtool: remove the comments that are not gonna be generated Cleanup the header manually to make it easier to review the changes that ynl generator brings in. No functional changes. Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20241204155549.641348-8-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- .../uapi/linux/ethtool_netlink_generated.h | 678 +++++++----------- 1 file changed, 274 insertions(+), 404 deletions(-) diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 4b4bf17d1a88e..35a24d490efef 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -2,8 +2,6 @@ #ifndef _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H #define _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H -/* TUNNEL INFO */ - enum { ETHTOOL_UDP_TUNNEL_TYPE_VXLAN, ETHTOOL_UDP_TUNNEL_TYPE_GENEVE, @@ -12,8 +10,6 @@ enum { __ETHTOOL_UDP_TUNNEL_TYPE_CNT }; -/* request header */ - enum ethtool_header_flags { ETHTOOL_FLAG_COMPACT_BITSETS = 1 << 0, /* use compact bitsets in reply */ ETHTOOL_FLAG_OMIT_REPLY = 1 << 1, /* provide optional reply for SET or ACT requests */ @@ -28,303 +24,250 @@ enum { enum { ETHTOOL_A_HEADER_UNSPEC, - ETHTOOL_A_HEADER_DEV_INDEX, /* u32 */ - ETHTOOL_A_HEADER_DEV_NAME, /* string */ - ETHTOOL_A_HEADER_FLAGS, /* u32 - ETHTOOL_FLAG_* */ - ETHTOOL_A_HEADER_PHY_INDEX, /* u32 */ + ETHTOOL_A_HEADER_DEV_INDEX, + ETHTOOL_A_HEADER_DEV_NAME, + ETHTOOL_A_HEADER_FLAGS, + ETHTOOL_A_HEADER_PHY_INDEX, - /* add new constants above here */ __ETHTOOL_A_HEADER_CNT, ETHTOOL_A_HEADER_MAX = __ETHTOOL_A_HEADER_CNT - 1 }; -/* bit sets */ - enum { ETHTOOL_A_BITSET_BIT_UNSPEC, - ETHTOOL_A_BITSET_BIT_INDEX, /* u32 */ - ETHTOOL_A_BITSET_BIT_NAME, /* string */ - ETHTOOL_A_BITSET_BIT_VALUE, /* flag */ + ETHTOOL_A_BITSET_BIT_INDEX, + ETHTOOL_A_BITSET_BIT_NAME, + ETHTOOL_A_BITSET_BIT_VALUE, - /* add new constants above here */ __ETHTOOL_A_BITSET_BIT_CNT, ETHTOOL_A_BITSET_BIT_MAX = __ETHTOOL_A_BITSET_BIT_CNT - 1 }; enum { ETHTOOL_A_BITSET_BITS_UNSPEC, - ETHTOOL_A_BITSET_BITS_BIT, /* nest - _A_BITSET_BIT_* */ + ETHTOOL_A_BITSET_BITS_BIT, - /* add new constants above here */ __ETHTOOL_A_BITSET_BITS_CNT, ETHTOOL_A_BITSET_BITS_MAX = __ETHTOOL_A_BITSET_BITS_CNT - 1 }; enum { ETHTOOL_A_BITSET_UNSPEC, - ETHTOOL_A_BITSET_NOMASK, /* flag */ - ETHTOOL_A_BITSET_SIZE, /* u32 */ - ETHTOOL_A_BITSET_BITS, /* nest - _A_BITSET_BITS_* */ - ETHTOOL_A_BITSET_VALUE, /* binary */ - ETHTOOL_A_BITSET_MASK, /* binary */ + ETHTOOL_A_BITSET_NOMASK, + ETHTOOL_A_BITSET_SIZE, + ETHTOOL_A_BITSET_BITS, + ETHTOOL_A_BITSET_VALUE, + ETHTOOL_A_BITSET_MASK, - /* add new constants above here */ __ETHTOOL_A_BITSET_CNT, ETHTOOL_A_BITSET_MAX = __ETHTOOL_A_BITSET_CNT - 1 }; -/* string sets */ - enum { ETHTOOL_A_STRING_UNSPEC, - ETHTOOL_A_STRING_INDEX, /* u32 */ - ETHTOOL_A_STRING_VALUE, /* string */ + ETHTOOL_A_STRING_INDEX, + ETHTOOL_A_STRING_VALUE, - /* add new constants above here */ __ETHTOOL_A_STRING_CNT, ETHTOOL_A_STRING_MAX = __ETHTOOL_A_STRING_CNT - 1 }; enum { ETHTOOL_A_STRINGS_UNSPEC, - ETHTOOL_A_STRINGS_STRING, /* nest - _A_STRINGS_* */ + ETHTOOL_A_STRINGS_STRING, - /* add new constants above here */ __ETHTOOL_A_STRINGS_CNT, ETHTOOL_A_STRINGS_MAX = __ETHTOOL_A_STRINGS_CNT - 1 }; enum { ETHTOOL_A_STRINGSET_UNSPEC, - ETHTOOL_A_STRINGSET_ID, /* u32 */ - ETHTOOL_A_STRINGSET_COUNT, /* u32 */ - ETHTOOL_A_STRINGSET_STRINGS, /* nest - _A_STRINGS_* */ + ETHTOOL_A_STRINGSET_ID, + ETHTOOL_A_STRINGSET_COUNT, + ETHTOOL_A_STRINGSET_STRINGS, - /* add new constants above here */ __ETHTOOL_A_STRINGSET_CNT, ETHTOOL_A_STRINGSET_MAX = __ETHTOOL_A_STRINGSET_CNT - 1 }; enum { ETHTOOL_A_STRINGSETS_UNSPEC, - ETHTOOL_A_STRINGSETS_STRINGSET, /* nest - _A_STRINGSET_* */ + ETHTOOL_A_STRINGSETS_STRINGSET, - /* add new constants above here */ __ETHTOOL_A_STRINGSETS_CNT, ETHTOOL_A_STRINGSETS_MAX = __ETHTOOL_A_STRINGSETS_CNT - 1 }; -/* STRSET */ - enum { ETHTOOL_A_STRSET_UNSPEC, - ETHTOOL_A_STRSET_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_STRSET_STRINGSETS, /* nest - _A_STRINGSETS_* */ - ETHTOOL_A_STRSET_COUNTS_ONLY, /* flag */ + ETHTOOL_A_STRSET_HEADER, + ETHTOOL_A_STRSET_STRINGSETS, + ETHTOOL_A_STRSET_COUNTS_ONLY, - /* add new constants above here */ __ETHTOOL_A_STRSET_CNT, ETHTOOL_A_STRSET_MAX = __ETHTOOL_A_STRSET_CNT - 1 }; -/* PRIVFLAGS */ - enum { ETHTOOL_A_PRIVFLAGS_UNSPEC, - ETHTOOL_A_PRIVFLAGS_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_PRIVFLAGS_FLAGS, /* bitset */ + ETHTOOL_A_PRIVFLAGS_HEADER, + ETHTOOL_A_PRIVFLAGS_FLAGS, - /* add new constants above here */ __ETHTOOL_A_PRIVFLAGS_CNT, ETHTOOL_A_PRIVFLAGS_MAX = __ETHTOOL_A_PRIVFLAGS_CNT - 1 }; -/* RINGS */ - enum { ETHTOOL_A_RINGS_UNSPEC, - ETHTOOL_A_RINGS_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_RINGS_RX_MAX, /* u32 */ - ETHTOOL_A_RINGS_RX_MINI_MAX, /* u32 */ - ETHTOOL_A_RINGS_RX_JUMBO_MAX, /* u32 */ - ETHTOOL_A_RINGS_TX_MAX, /* u32 */ - ETHTOOL_A_RINGS_RX, /* u32 */ - ETHTOOL_A_RINGS_RX_MINI, /* u32 */ - ETHTOOL_A_RINGS_RX_JUMBO, /* u32 */ - ETHTOOL_A_RINGS_TX, /* u32 */ - ETHTOOL_A_RINGS_RX_BUF_LEN, /* u32 */ - ETHTOOL_A_RINGS_TCP_DATA_SPLIT, /* u8 */ - ETHTOOL_A_RINGS_CQE_SIZE, /* u32 */ - ETHTOOL_A_RINGS_TX_PUSH, /* u8 */ - ETHTOOL_A_RINGS_RX_PUSH, /* u8 */ - ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN, /* u32 */ - ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX, /* u32 */ - - /* add new constants above here */ + ETHTOOL_A_RINGS_HEADER, + ETHTOOL_A_RINGS_RX_MAX, + ETHTOOL_A_RINGS_RX_MINI_MAX, + ETHTOOL_A_RINGS_RX_JUMBO_MAX, + ETHTOOL_A_RINGS_TX_MAX, + ETHTOOL_A_RINGS_RX, + ETHTOOL_A_RINGS_RX_MINI, + ETHTOOL_A_RINGS_RX_JUMBO, + ETHTOOL_A_RINGS_TX, + ETHTOOL_A_RINGS_RX_BUF_LEN, + ETHTOOL_A_RINGS_TCP_DATA_SPLIT, + ETHTOOL_A_RINGS_CQE_SIZE, + ETHTOOL_A_RINGS_TX_PUSH, + ETHTOOL_A_RINGS_RX_PUSH, + ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN, + ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX, + __ETHTOOL_A_RINGS_CNT, ETHTOOL_A_RINGS_MAX = (__ETHTOOL_A_RINGS_CNT - 1) }; -/* MAC Merge (802.3) */ - enum { ETHTOOL_A_MM_STAT_UNSPEC, ETHTOOL_A_MM_STAT_PAD, + ETHTOOL_A_MM_STAT_REASSEMBLY_ERRORS, + ETHTOOL_A_MM_STAT_SMD_ERRORS, + ETHTOOL_A_MM_STAT_REASSEMBLY_OK, + ETHTOOL_A_MM_STAT_RX_FRAG_COUNT, + ETHTOOL_A_MM_STAT_TX_FRAG_COUNT, + ETHTOOL_A_MM_STAT_HOLD_COUNT, - /* aMACMergeFrameAssErrorCount */ - ETHTOOL_A_MM_STAT_REASSEMBLY_ERRORS, /* u64 */ - /* aMACMergeFrameSmdErrorCount */ - ETHTOOL_A_MM_STAT_SMD_ERRORS, /* u64 */ - /* aMACMergeFrameAssOkCount */ - ETHTOOL_A_MM_STAT_REASSEMBLY_OK, /* u64 */ - /* aMACMergeFragCountRx */ - ETHTOOL_A_MM_STAT_RX_FRAG_COUNT, /* u64 */ - /* aMACMergeFragCountTx */ - ETHTOOL_A_MM_STAT_TX_FRAG_COUNT, /* u64 */ - /* aMACMergeHoldCount */ - ETHTOOL_A_MM_STAT_HOLD_COUNT, /* u64 */ - - /* add new constants above here */ __ETHTOOL_A_MM_STAT_CNT, ETHTOOL_A_MM_STAT_MAX = (__ETHTOOL_A_MM_STAT_CNT - 1) }; enum { ETHTOOL_A_MM_UNSPEC, - ETHTOOL_A_MM_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_MM_PMAC_ENABLED, /* u8 */ - ETHTOOL_A_MM_TX_ENABLED, /* u8 */ - ETHTOOL_A_MM_TX_ACTIVE, /* u8 */ - ETHTOOL_A_MM_TX_MIN_FRAG_SIZE, /* u32 */ - ETHTOOL_A_MM_RX_MIN_FRAG_SIZE, /* u32 */ - ETHTOOL_A_MM_VERIFY_ENABLED, /* u8 */ - ETHTOOL_A_MM_VERIFY_STATUS, /* u8 */ - ETHTOOL_A_MM_VERIFY_TIME, /* u32 */ - ETHTOOL_A_MM_MAX_VERIFY_TIME, /* u32 */ - ETHTOOL_A_MM_STATS, /* nest - _A_MM_STAT_* */ - - /* add new constants above here */ + ETHTOOL_A_MM_HEADER, + ETHTOOL_A_MM_PMAC_ENABLED, + ETHTOOL_A_MM_TX_ENABLED, + ETHTOOL_A_MM_TX_ACTIVE, + ETHTOOL_A_MM_TX_MIN_FRAG_SIZE, + ETHTOOL_A_MM_RX_MIN_FRAG_SIZE, + ETHTOOL_A_MM_VERIFY_ENABLED, + ETHTOOL_A_MM_VERIFY_STATUS, + ETHTOOL_A_MM_VERIFY_TIME, + ETHTOOL_A_MM_MAX_VERIFY_TIME, + ETHTOOL_A_MM_STATS, + __ETHTOOL_A_MM_CNT, ETHTOOL_A_MM_MAX = (__ETHTOOL_A_MM_CNT - 1) }; -/* LINKINFO */ - enum { ETHTOOL_A_LINKINFO_UNSPEC, - ETHTOOL_A_LINKINFO_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_LINKINFO_PORT, /* u8 */ - ETHTOOL_A_LINKINFO_PHYADDR, /* u8 */ - ETHTOOL_A_LINKINFO_TP_MDIX, /* u8 */ - ETHTOOL_A_LINKINFO_TP_MDIX_CTRL, /* u8 */ - ETHTOOL_A_LINKINFO_TRANSCEIVER, /* u8 */ - - /* add new constants above here */ + ETHTOOL_A_LINKINFO_HEADER, + ETHTOOL_A_LINKINFO_PORT, + ETHTOOL_A_LINKINFO_PHYADDR, + ETHTOOL_A_LINKINFO_TP_MDIX, + ETHTOOL_A_LINKINFO_TP_MDIX_CTRL, + ETHTOOL_A_LINKINFO_TRANSCEIVER, + __ETHTOOL_A_LINKINFO_CNT, ETHTOOL_A_LINKINFO_MAX = __ETHTOOL_A_LINKINFO_CNT - 1 }; -/* LINKMODES */ - enum { ETHTOOL_A_LINKMODES_UNSPEC, - ETHTOOL_A_LINKMODES_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_LINKMODES_AUTONEG, /* u8 */ - ETHTOOL_A_LINKMODES_OURS, /* bitset */ - ETHTOOL_A_LINKMODES_PEER, /* bitset */ - ETHTOOL_A_LINKMODES_SPEED, /* u32 */ - ETHTOOL_A_LINKMODES_DUPLEX, /* u8 */ - ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG, /* u8 */ - ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE, /* u8 */ - ETHTOOL_A_LINKMODES_LANES, /* u32 */ - ETHTOOL_A_LINKMODES_RATE_MATCHING, /* u8 */ - - /* add new constants above here */ + ETHTOOL_A_LINKMODES_HEADER, + ETHTOOL_A_LINKMODES_AUTONEG, + ETHTOOL_A_LINKMODES_OURS, + ETHTOOL_A_LINKMODES_PEER, + ETHTOOL_A_LINKMODES_SPEED, + ETHTOOL_A_LINKMODES_DUPLEX, + ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG, + ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE, + ETHTOOL_A_LINKMODES_LANES, + ETHTOOL_A_LINKMODES_RATE_MATCHING, + __ETHTOOL_A_LINKMODES_CNT, ETHTOOL_A_LINKMODES_MAX = __ETHTOOL_A_LINKMODES_CNT - 1 }; -/* LINKSTATE */ - enum { ETHTOOL_A_LINKSTATE_UNSPEC, - ETHTOOL_A_LINKSTATE_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_LINKSTATE_LINK, /* u8 */ - ETHTOOL_A_LINKSTATE_SQI, /* u32 */ - ETHTOOL_A_LINKSTATE_SQI_MAX, /* u32 */ - ETHTOOL_A_LINKSTATE_EXT_STATE, /* u8 */ - ETHTOOL_A_LINKSTATE_EXT_SUBSTATE, /* u8 */ - ETHTOOL_A_LINKSTATE_EXT_DOWN_CNT, /* u32 */ - - /* add new constants above here */ + ETHTOOL_A_LINKSTATE_HEADER, + ETHTOOL_A_LINKSTATE_LINK, + ETHTOOL_A_LINKSTATE_SQI, + ETHTOOL_A_LINKSTATE_SQI_MAX, + ETHTOOL_A_LINKSTATE_EXT_STATE, + ETHTOOL_A_LINKSTATE_EXT_SUBSTATE, + ETHTOOL_A_LINKSTATE_EXT_DOWN_CNT, + __ETHTOOL_A_LINKSTATE_CNT, ETHTOOL_A_LINKSTATE_MAX = __ETHTOOL_A_LINKSTATE_CNT - 1 }; -/* DEBUG */ - enum { ETHTOOL_A_DEBUG_UNSPEC, - ETHTOOL_A_DEBUG_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_DEBUG_MSGMASK, /* bitset */ + ETHTOOL_A_DEBUG_HEADER, + ETHTOOL_A_DEBUG_MSGMASK, - /* add new constants above here */ __ETHTOOL_A_DEBUG_CNT, ETHTOOL_A_DEBUG_MAX = __ETHTOOL_A_DEBUG_CNT - 1 }; -/* WOL */ - enum { ETHTOOL_A_WOL_UNSPEC, - ETHTOOL_A_WOL_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_WOL_MODES, /* bitset */ - ETHTOOL_A_WOL_SOPASS, /* binary */ + ETHTOOL_A_WOL_HEADER, + ETHTOOL_A_WOL_MODES, + ETHTOOL_A_WOL_SOPASS, - /* add new constants above here */ __ETHTOOL_A_WOL_CNT, ETHTOOL_A_WOL_MAX = __ETHTOOL_A_WOL_CNT - 1 }; -/* FEATURES */ - enum { ETHTOOL_A_FEATURES_UNSPEC, - ETHTOOL_A_FEATURES_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_FEATURES_HW, /* bitset */ - ETHTOOL_A_FEATURES_WANTED, /* bitset */ - ETHTOOL_A_FEATURES_ACTIVE, /* bitset */ - ETHTOOL_A_FEATURES_NOCHANGE, /* bitset */ + ETHTOOL_A_FEATURES_HEADER, + ETHTOOL_A_FEATURES_HW, + ETHTOOL_A_FEATURES_WANTED, + ETHTOOL_A_FEATURES_ACTIVE, + ETHTOOL_A_FEATURES_NOCHANGE, - /* add new constants above here */ __ETHTOOL_A_FEATURES_CNT, ETHTOOL_A_FEATURES_MAX = __ETHTOOL_A_FEATURES_CNT - 1 }; -/* CHANNELS */ - enum { ETHTOOL_A_CHANNELS_UNSPEC, - ETHTOOL_A_CHANNELS_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_CHANNELS_RX_MAX, /* u32 */ - ETHTOOL_A_CHANNELS_TX_MAX, /* u32 */ - ETHTOOL_A_CHANNELS_OTHER_MAX, /* u32 */ - ETHTOOL_A_CHANNELS_COMBINED_MAX, /* u32 */ - ETHTOOL_A_CHANNELS_RX_COUNT, /* u32 */ - ETHTOOL_A_CHANNELS_TX_COUNT, /* u32 */ - ETHTOOL_A_CHANNELS_OTHER_COUNT, /* u32 */ - ETHTOOL_A_CHANNELS_COMBINED_COUNT, /* u32 */ - - /* add new constants above here */ + ETHTOOL_A_CHANNELS_HEADER, + ETHTOOL_A_CHANNELS_RX_MAX, + ETHTOOL_A_CHANNELS_TX_MAX, + ETHTOOL_A_CHANNELS_OTHER_MAX, + ETHTOOL_A_CHANNELS_COMBINED_MAX, + ETHTOOL_A_CHANNELS_RX_COUNT, + ETHTOOL_A_CHANNELS_TX_COUNT, + ETHTOOL_A_CHANNELS_OTHER_COUNT, + ETHTOOL_A_CHANNELS_COMBINED_COUNT, + __ETHTOOL_A_CHANNELS_CNT, ETHTOOL_A_CHANNELS_MAX = (__ETHTOOL_A_CHANNELS_CNT - 1) }; enum { ETHTOOL_A_IRQ_MODERATION_UNSPEC, - ETHTOOL_A_IRQ_MODERATION_USEC, /* u32 */ - ETHTOOL_A_IRQ_MODERATION_PKTS, /* u32 */ - ETHTOOL_A_IRQ_MODERATION_COMPS, /* u32 */ + ETHTOOL_A_IRQ_MODERATION_USEC, + ETHTOOL_A_IRQ_MODERATION_PKTS, + ETHTOOL_A_IRQ_MODERATION_COMPS, __ETHTOOL_A_IRQ_MODERATION_CNT, ETHTOOL_A_IRQ_MODERATION_MAX = (__ETHTOOL_A_IRQ_MODERATION_CNT - 1) @@ -332,111 +275,91 @@ enum { enum { ETHTOOL_A_PROFILE_UNSPEC, - /* nest, _A_IRQ_MODERATION_* */ ETHTOOL_A_PROFILE_IRQ_MODERATION, __ETHTOOL_A_PROFILE_CNT, ETHTOOL_A_PROFILE_MAX = (__ETHTOOL_A_PROFILE_CNT - 1) }; -/* COALESCE */ - enum { ETHTOOL_A_COALESCE_UNSPEC, - ETHTOOL_A_COALESCE_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_COALESCE_RX_USECS, /* u32 */ - ETHTOOL_A_COALESCE_RX_MAX_FRAMES, /* u32 */ - ETHTOOL_A_COALESCE_RX_USECS_IRQ, /* u32 */ - ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ, /* u32 */ - ETHTOOL_A_COALESCE_TX_USECS, /* u32 */ - ETHTOOL_A_COALESCE_TX_MAX_FRAMES, /* u32 */ - ETHTOOL_A_COALESCE_TX_USECS_IRQ, /* u32 */ - ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ, /* u32 */ - ETHTOOL_A_COALESCE_STATS_BLOCK_USECS, /* u32 */ - ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX, /* u8 */ - ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX, /* u8 */ - ETHTOOL_A_COALESCE_PKT_RATE_LOW, /* u32 */ - ETHTOOL_A_COALESCE_RX_USECS_LOW, /* u32 */ - ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW, /* u32 */ - ETHTOOL_A_COALESCE_TX_USECS_LOW, /* u32 */ - ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW, /* u32 */ - ETHTOOL_A_COALESCE_PKT_RATE_HIGH, /* u32 */ - ETHTOOL_A_COALESCE_RX_USECS_HIGH, /* u32 */ - ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH, /* u32 */ - ETHTOOL_A_COALESCE_TX_USECS_HIGH, /* u32 */ - ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH, /* u32 */ - ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL, /* u32 */ - ETHTOOL_A_COALESCE_USE_CQE_MODE_TX, /* u8 */ - ETHTOOL_A_COALESCE_USE_CQE_MODE_RX, /* u8 */ - ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES, /* u32 */ - ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES, /* u32 */ - ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS, /* u32 */ - /* nest - _A_PROFILE_IRQ_MODERATION */ + ETHTOOL_A_COALESCE_HEADER, + ETHTOOL_A_COALESCE_RX_USECS, + ETHTOOL_A_COALESCE_RX_MAX_FRAMES, + ETHTOOL_A_COALESCE_RX_USECS_IRQ, + ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ, + ETHTOOL_A_COALESCE_TX_USECS, + ETHTOOL_A_COALESCE_TX_MAX_FRAMES, + ETHTOOL_A_COALESCE_TX_USECS_IRQ, + ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ, + ETHTOOL_A_COALESCE_STATS_BLOCK_USECS, + ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX, + ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX, + ETHTOOL_A_COALESCE_PKT_RATE_LOW, + ETHTOOL_A_COALESCE_RX_USECS_LOW, + ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW, + ETHTOOL_A_COALESCE_TX_USECS_LOW, + ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW, + ETHTOOL_A_COALESCE_PKT_RATE_HIGH, + ETHTOOL_A_COALESCE_RX_USECS_HIGH, + ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH, + ETHTOOL_A_COALESCE_TX_USECS_HIGH, + ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH, + ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL, + ETHTOOL_A_COALESCE_USE_CQE_MODE_TX, + ETHTOOL_A_COALESCE_USE_CQE_MODE_RX, + ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES, + ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES, + ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS, ETHTOOL_A_COALESCE_RX_PROFILE, - /* nest - _A_PROFILE_IRQ_MODERATION */ ETHTOOL_A_COALESCE_TX_PROFILE, - /* add new constants above here */ __ETHTOOL_A_COALESCE_CNT, ETHTOOL_A_COALESCE_MAX = (__ETHTOOL_A_COALESCE_CNT - 1) }; -/* PAUSE */ - enum { ETHTOOL_A_PAUSE_STAT_UNSPEC, ETHTOOL_A_PAUSE_STAT_PAD, - ETHTOOL_A_PAUSE_STAT_TX_FRAMES, ETHTOOL_A_PAUSE_STAT_RX_FRAMES, - /* add new constants above here - * adjust ETHTOOL_PAUSE_STAT_CNT if adding non-stats! - */ __ETHTOOL_A_PAUSE_STAT_CNT, ETHTOOL_A_PAUSE_STAT_MAX = (__ETHTOOL_A_PAUSE_STAT_CNT - 1) }; enum { ETHTOOL_A_PAUSE_UNSPEC, - ETHTOOL_A_PAUSE_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_PAUSE_AUTONEG, /* u8 */ - ETHTOOL_A_PAUSE_RX, /* u8 */ - ETHTOOL_A_PAUSE_TX, /* u8 */ - ETHTOOL_A_PAUSE_STATS, /* nest - _PAUSE_STAT_* */ - ETHTOOL_A_PAUSE_STATS_SRC, /* u32 */ - - /* add new constants above here */ + ETHTOOL_A_PAUSE_HEADER, + ETHTOOL_A_PAUSE_AUTONEG, + ETHTOOL_A_PAUSE_RX, + ETHTOOL_A_PAUSE_TX, + ETHTOOL_A_PAUSE_STATS, + ETHTOOL_A_PAUSE_STATS_SRC, + __ETHTOOL_A_PAUSE_CNT, ETHTOOL_A_PAUSE_MAX = (__ETHTOOL_A_PAUSE_CNT - 1) }; -/* EEE */ - enum { ETHTOOL_A_EEE_UNSPEC, - ETHTOOL_A_EEE_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_EEE_MODES_OURS, /* bitset */ - ETHTOOL_A_EEE_MODES_PEER, /* bitset */ - ETHTOOL_A_EEE_ACTIVE, /* u8 */ - ETHTOOL_A_EEE_ENABLED, /* u8 */ - ETHTOOL_A_EEE_TX_LPI_ENABLED, /* u8 */ - ETHTOOL_A_EEE_TX_LPI_TIMER, /* u32 */ - - /* add new constants above here */ + ETHTOOL_A_EEE_HEADER, + ETHTOOL_A_EEE_MODES_OURS, + ETHTOOL_A_EEE_MODES_PEER, + ETHTOOL_A_EEE_ACTIVE, + ETHTOOL_A_EEE_ENABLED, + ETHTOOL_A_EEE_TX_LPI_ENABLED, + ETHTOOL_A_EEE_TX_LPI_TIMER, + __ETHTOOL_A_EEE_CNT, ETHTOOL_A_EEE_MAX = (__ETHTOOL_A_EEE_CNT - 1) }; -/* TSINFO */ - enum { ETHTOOL_A_TS_STAT_UNSPEC, + ETHTOOL_A_TS_STAT_TX_PKTS, + ETHTOOL_A_TS_STAT_TX_LOST, + ETHTOOL_A_TS_STAT_TX_ERR, - ETHTOOL_A_TS_STAT_TX_PKTS, /* uint */ - ETHTOOL_A_TS_STAT_TX_LOST, /* uint */ - ETHTOOL_A_TS_STAT_TX_ERR, /* uint */ - - /* add new constants above here */ __ETHTOOL_A_TS_STAT_CNT, ETHTOOL_A_TS_STAT_MAX = (__ETHTOOL_A_TS_STAT_CNT - 1) @@ -444,23 +367,22 @@ enum { enum { ETHTOOL_A_TSINFO_UNSPEC, - ETHTOOL_A_TSINFO_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_TSINFO_TIMESTAMPING, /* bitset */ - ETHTOOL_A_TSINFO_TX_TYPES, /* bitset */ - ETHTOOL_A_TSINFO_RX_FILTERS, /* bitset */ - ETHTOOL_A_TSINFO_PHC_INDEX, /* u32 */ - ETHTOOL_A_TSINFO_STATS, /* nest - _A_TSINFO_STAT */ - - /* add new constants above here */ + ETHTOOL_A_TSINFO_HEADER, + ETHTOOL_A_TSINFO_TIMESTAMPING, + ETHTOOL_A_TSINFO_TX_TYPES, + ETHTOOL_A_TSINFO_RX_FILTERS, + ETHTOOL_A_TSINFO_PHC_INDEX, + ETHTOOL_A_TSINFO_STATS, + __ETHTOOL_A_TSINFO_CNT, ETHTOOL_A_TSINFO_MAX = (__ETHTOOL_A_TSINFO_CNT - 1) }; enum { ETHTOOL_A_CABLE_RESULT_UNSPEC, - ETHTOOL_A_CABLE_RESULT_PAIR, /* u8 ETHTOOL_A_CABLE_PAIR_ */ - ETHTOOL_A_CABLE_RESULT_CODE, /* u8 ETHTOOL_A_CABLE_RESULT_CODE_ */ - ETHTOOL_A_CABLE_RESULT_SRC, /* u32 ETHTOOL_A_CABLE_INF_SRC_ */ + ETHTOOL_A_CABLE_RESULT_PAIR, + ETHTOOL_A_CABLE_RESULT_CODE, + ETHTOOL_A_CABLE_RESULT_SRC, __ETHTOOL_A_CABLE_RESULT_CNT, ETHTOOL_A_CABLE_RESULT_MAX = (__ETHTOOL_A_CABLE_RESULT_CNT - 1) @@ -468,9 +390,9 @@ enum { enum { ETHTOOL_A_CABLE_FAULT_LENGTH_UNSPEC, - ETHTOOL_A_CABLE_FAULT_LENGTH_PAIR, /* u8 ETHTOOL_A_CABLE_PAIR_ */ - ETHTOOL_A_CABLE_FAULT_LENGTH_CM, /* u32 */ - ETHTOOL_A_CABLE_FAULT_LENGTH_SRC, /* u32 ETHTOOL_A_CABLE_INF_SRC_ */ + ETHTOOL_A_CABLE_FAULT_LENGTH_PAIR, + ETHTOOL_A_CABLE_FAULT_LENGTH_CM, + ETHTOOL_A_CABLE_FAULT_LENGTH_SRC, __ETHTOOL_A_CABLE_FAULT_LENGTH_CNT, ETHTOOL_A_CABLE_FAULT_LENGTH_MAX = (__ETHTOOL_A_CABLE_FAULT_LENGTH_CNT - 1) @@ -478,245 +400,204 @@ enum { enum { ETHTOOL_A_CABLE_NEST_UNSPEC, - ETHTOOL_A_CABLE_NEST_RESULT, /* nest - ETHTOOL_A_CABLE_RESULT_ */ - ETHTOOL_A_CABLE_NEST_FAULT_LENGTH, /* nest - ETHTOOL_A_CABLE_FAULT_LENGTH_ */ + ETHTOOL_A_CABLE_NEST_RESULT, + ETHTOOL_A_CABLE_NEST_FAULT_LENGTH, + __ETHTOOL_A_CABLE_NEST_CNT, ETHTOOL_A_CABLE_NEST_MAX = (__ETHTOOL_A_CABLE_NEST_CNT - 1) }; -/* CABLE TEST */ - enum { ETHTOOL_A_CABLE_TEST_UNSPEC, - ETHTOOL_A_CABLE_TEST_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_CABLE_TEST_HEADER, - /* add new constants above here */ __ETHTOOL_A_CABLE_TEST_CNT, ETHTOOL_A_CABLE_TEST_MAX = __ETHTOOL_A_CABLE_TEST_CNT - 1 }; enum { ETHTOOL_A_CABLE_TEST_NTF_UNSPEC, - ETHTOOL_A_CABLE_TEST_NTF_HEADER, /* nest - ETHTOOL_A_HEADER_* */ - ETHTOOL_A_CABLE_TEST_NTF_STATUS, /* u8 - _STARTED/_COMPLETE */ - ETHTOOL_A_CABLE_TEST_NTF_NEST, /* nest - of results: */ + ETHTOOL_A_CABLE_TEST_NTF_HEADER, + ETHTOOL_A_CABLE_TEST_NTF_STATUS, + ETHTOOL_A_CABLE_TEST_NTF_NEST, __ETHTOOL_A_CABLE_TEST_NTF_CNT, ETHTOOL_A_CABLE_TEST_NTF_MAX = (__ETHTOOL_A_CABLE_TEST_NTF_CNT - 1) }; -/* CABLE TEST TDR */ - enum { ETHTOOL_A_CABLE_TEST_TDR_CFG_UNSPEC, - ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST, /* u32 */ - ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST, /* u32 */ - ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP, /* u32 */ - ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR, /* u8 */ + ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST, + ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST, + ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP, + ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR, - /* add new constants above here */ __ETHTOOL_A_CABLE_TEST_TDR_CFG_CNT, ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX = __ETHTOOL_A_CABLE_TEST_TDR_CFG_CNT - 1 }; enum { ETHTOOL_A_CABLE_TEST_TDR_NTF_UNSPEC, - ETHTOOL_A_CABLE_TEST_TDR_NTF_HEADER, /* nest - ETHTOOL_A_HEADER_* */ - ETHTOOL_A_CABLE_TEST_TDR_NTF_STATUS, /* u8 - _STARTED/_COMPLETE */ - ETHTOOL_A_CABLE_TEST_TDR_NTF_NEST, /* nest - of results: */ + ETHTOOL_A_CABLE_TEST_TDR_NTF_HEADER, + ETHTOOL_A_CABLE_TEST_TDR_NTF_STATUS, + ETHTOOL_A_CABLE_TEST_TDR_NTF_NEST, - /* add new constants above here */ __ETHTOOL_A_CABLE_TEST_TDR_NTF_CNT, ETHTOOL_A_CABLE_TEST_TDR_NTF_MAX = __ETHTOOL_A_CABLE_TEST_TDR_NTF_CNT - 1 }; enum { ETHTOOL_A_CABLE_TEST_TDR_UNSPEC, - ETHTOOL_A_CABLE_TEST_TDR_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_CABLE_TEST_TDR_CFG, /* nest - *_TDR_CFG_* */ + ETHTOOL_A_CABLE_TEST_TDR_HEADER, + ETHTOOL_A_CABLE_TEST_TDR_CFG, - /* add new constants above here */ __ETHTOOL_A_CABLE_TEST_TDR_CNT, ETHTOOL_A_CABLE_TEST_TDR_MAX = __ETHTOOL_A_CABLE_TEST_TDR_CNT - 1 }; enum { ETHTOOL_A_TUNNEL_UDP_ENTRY_UNSPEC, + ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT, + ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE, - ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT, /* be16 */ - ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE, /* u32 */ - - /* add new constants above here */ __ETHTOOL_A_TUNNEL_UDP_ENTRY_CNT, ETHTOOL_A_TUNNEL_UDP_ENTRY_MAX = (__ETHTOOL_A_TUNNEL_UDP_ENTRY_CNT - 1) }; enum { ETHTOOL_A_TUNNEL_UDP_TABLE_UNSPEC, + ETHTOOL_A_TUNNEL_UDP_TABLE_SIZE, + ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES, + ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY, - ETHTOOL_A_TUNNEL_UDP_TABLE_SIZE, /* u32 */ - ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES, /* bitset */ - ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY, /* nest - _UDP_ENTRY_* */ - - /* add new constants above here */ __ETHTOOL_A_TUNNEL_UDP_TABLE_CNT, ETHTOOL_A_TUNNEL_UDP_TABLE_MAX = (__ETHTOOL_A_TUNNEL_UDP_TABLE_CNT - 1) }; enum { ETHTOOL_A_TUNNEL_UDP_UNSPEC, + ETHTOOL_A_TUNNEL_UDP_TABLE, - ETHTOOL_A_TUNNEL_UDP_TABLE, /* nest - _UDP_TABLE_* */ - - /* add new constants above here */ __ETHTOOL_A_TUNNEL_UDP_CNT, ETHTOOL_A_TUNNEL_UDP_MAX = (__ETHTOOL_A_TUNNEL_UDP_CNT - 1) }; enum { ETHTOOL_A_TUNNEL_INFO_UNSPEC, - ETHTOOL_A_TUNNEL_INFO_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_TUNNEL_INFO_HEADER, + ETHTOOL_A_TUNNEL_INFO_UDP_PORTS, - ETHTOOL_A_TUNNEL_INFO_UDP_PORTS, /* nest - _UDP_TABLE */ - - /* add new constants above here */ __ETHTOOL_A_TUNNEL_INFO_CNT, ETHTOOL_A_TUNNEL_INFO_MAX = (__ETHTOOL_A_TUNNEL_INFO_CNT - 1) }; -/* FEC */ - enum { ETHTOOL_A_FEC_STAT_UNSPEC, ETHTOOL_A_FEC_STAT_PAD, + ETHTOOL_A_FEC_STAT_CORRECTED, + ETHTOOL_A_FEC_STAT_UNCORR, + ETHTOOL_A_FEC_STAT_CORR_BITS, - ETHTOOL_A_FEC_STAT_CORRECTED, /* array, u64 */ - ETHTOOL_A_FEC_STAT_UNCORR, /* array, u64 */ - ETHTOOL_A_FEC_STAT_CORR_BITS, /* array, u64 */ - - /* add new constants above here */ __ETHTOOL_A_FEC_STAT_CNT, ETHTOOL_A_FEC_STAT_MAX = (__ETHTOOL_A_FEC_STAT_CNT - 1) }; enum { ETHTOOL_A_FEC_UNSPEC, - ETHTOOL_A_FEC_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_FEC_MODES, /* bitset */ - ETHTOOL_A_FEC_AUTO, /* u8 */ - ETHTOOL_A_FEC_ACTIVE, /* u32 */ - ETHTOOL_A_FEC_STATS, /* nest - _A_FEC_STAT */ + ETHTOOL_A_FEC_HEADER, + ETHTOOL_A_FEC_MODES, + ETHTOOL_A_FEC_AUTO, + ETHTOOL_A_FEC_ACTIVE, + ETHTOOL_A_FEC_STATS, __ETHTOOL_A_FEC_CNT, ETHTOOL_A_FEC_MAX = (__ETHTOOL_A_FEC_CNT - 1) }; -/* MODULE EEPROM */ - enum { ETHTOOL_A_MODULE_EEPROM_UNSPEC, - ETHTOOL_A_MODULE_EEPROM_HEADER, /* nest - _A_HEADER_* */ - - ETHTOOL_A_MODULE_EEPROM_OFFSET, /* u32 */ - ETHTOOL_A_MODULE_EEPROM_LENGTH, /* u32 */ - ETHTOOL_A_MODULE_EEPROM_PAGE, /* u8 */ - ETHTOOL_A_MODULE_EEPROM_BANK, /* u8 */ - ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS, /* u8 */ - ETHTOOL_A_MODULE_EEPROM_DATA, /* binary */ + ETHTOOL_A_MODULE_EEPROM_HEADER, + ETHTOOL_A_MODULE_EEPROM_OFFSET, + ETHTOOL_A_MODULE_EEPROM_LENGTH, + ETHTOOL_A_MODULE_EEPROM_PAGE, + ETHTOOL_A_MODULE_EEPROM_BANK, + ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS, + ETHTOOL_A_MODULE_EEPROM_DATA, __ETHTOOL_A_MODULE_EEPROM_CNT, ETHTOOL_A_MODULE_EEPROM_MAX = (__ETHTOOL_A_MODULE_EEPROM_CNT - 1) }; - enum { ETHTOOL_A_STATS_GRP_UNSPEC, ETHTOOL_A_STATS_GRP_PAD, + ETHTOOL_A_STATS_GRP_ID, + ETHTOOL_A_STATS_GRP_SS_ID, + ETHTOOL_A_STATS_GRP_STAT, + ETHTOOL_A_STATS_GRP_HIST_RX, + ETHTOOL_A_STATS_GRP_HIST_TX, + ETHTOOL_A_STATS_GRP_HIST_BKT_LOW, + ETHTOOL_A_STATS_GRP_HIST_BKT_HI, + ETHTOOL_A_STATS_GRP_HIST_VAL, - ETHTOOL_A_STATS_GRP_ID, /* u32 */ - ETHTOOL_A_STATS_GRP_SS_ID, /* u32 */ - - ETHTOOL_A_STATS_GRP_STAT, /* nest */ - - ETHTOOL_A_STATS_GRP_HIST_RX, /* nest */ - ETHTOOL_A_STATS_GRP_HIST_TX, /* nest */ - - ETHTOOL_A_STATS_GRP_HIST_BKT_LOW, /* u32 */ - ETHTOOL_A_STATS_GRP_HIST_BKT_HI, /* u32 */ - ETHTOOL_A_STATS_GRP_HIST_VAL, /* u64 */ - - /* add new constants above here */ __ETHTOOL_A_STATS_GRP_CNT, ETHTOOL_A_STATS_GRP_MAX = (__ETHTOOL_A_STATS_GRP_CNT - 1) }; -/* STATS */ - enum { ETHTOOL_A_STATS_UNSPEC, ETHTOOL_A_STATS_PAD, - ETHTOOL_A_STATS_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_STATS_GROUPS, /* bitset */ + ETHTOOL_A_STATS_HEADER, + ETHTOOL_A_STATS_GROUPS, + ETHTOOL_A_STATS_GRP, + ETHTOOL_A_STATS_SRC, - ETHTOOL_A_STATS_GRP, /* nest - _A_STATS_GRP_* */ - - ETHTOOL_A_STATS_SRC, /* u32 */ - - /* add new constants above here */ __ETHTOOL_A_STATS_CNT, ETHTOOL_A_STATS_MAX = (__ETHTOOL_A_STATS_CNT - 1) }; -/* PHC VCLOCKS */ - enum { ETHTOOL_A_PHC_VCLOCKS_UNSPEC, - ETHTOOL_A_PHC_VCLOCKS_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_PHC_VCLOCKS_NUM, /* u32 */ - ETHTOOL_A_PHC_VCLOCKS_INDEX, /* array, s32 */ + ETHTOOL_A_PHC_VCLOCKS_HEADER, + ETHTOOL_A_PHC_VCLOCKS_NUM, + ETHTOOL_A_PHC_VCLOCKS_INDEX, - /* add new constants above here */ __ETHTOOL_A_PHC_VCLOCKS_CNT, ETHTOOL_A_PHC_VCLOCKS_MAX = (__ETHTOOL_A_PHC_VCLOCKS_CNT - 1) }; -/* MODULE */ - enum { ETHTOOL_A_MODULE_UNSPEC, - ETHTOOL_A_MODULE_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_MODULE_POWER_MODE_POLICY, /* u8 */ - ETHTOOL_A_MODULE_POWER_MODE, /* u8 */ + ETHTOOL_A_MODULE_HEADER, + ETHTOOL_A_MODULE_POWER_MODE_POLICY, + ETHTOOL_A_MODULE_POWER_MODE, - /* add new constants above here */ __ETHTOOL_A_MODULE_CNT, ETHTOOL_A_MODULE_MAX = (__ETHTOOL_A_MODULE_CNT - 1) }; -/* Power Sourcing Equipment */ enum { ETHTOOL_A_C33_PSE_PW_LIMIT_UNSPEC, - ETHTOOL_A_C33_PSE_PW_LIMIT_MIN, /* u32 */ - ETHTOOL_A_C33_PSE_PW_LIMIT_MAX, /* u32 */ + ETHTOOL_A_C33_PSE_PW_LIMIT_MIN, + ETHTOOL_A_C33_PSE_PW_LIMIT_MAX, }; enum { ETHTOOL_A_PSE_UNSPEC, - ETHTOOL_A_PSE_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_PODL_PSE_ADMIN_STATE, /* u32 */ - ETHTOOL_A_PODL_PSE_ADMIN_CONTROL, /* u32 */ - ETHTOOL_A_PODL_PSE_PW_D_STATUS, /* u32 */ - ETHTOOL_A_C33_PSE_ADMIN_STATE, /* u32 */ - ETHTOOL_A_C33_PSE_ADMIN_CONTROL, /* u32 */ - ETHTOOL_A_C33_PSE_PW_D_STATUS, /* u32 */ - ETHTOOL_A_C33_PSE_PW_CLASS, /* u32 */ - ETHTOOL_A_C33_PSE_ACTUAL_PW, /* u32 */ - ETHTOOL_A_C33_PSE_EXT_STATE, /* u32 */ - ETHTOOL_A_C33_PSE_EXT_SUBSTATE, /* u32 */ - ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT, /* u32 */ - ETHTOOL_A_C33_PSE_PW_LIMIT_RANGES, /* nest - _C33_PSE_PW_LIMIT_* */ - - /* add new constants above here */ + ETHTOOL_A_PSE_HEADER, + ETHTOOL_A_PODL_PSE_ADMIN_STATE, + ETHTOOL_A_PODL_PSE_ADMIN_CONTROL, + ETHTOOL_A_PODL_PSE_PW_D_STATUS, + ETHTOOL_A_C33_PSE_ADMIN_STATE, + ETHTOOL_A_C33_PSE_ADMIN_CONTROL, + ETHTOOL_A_C33_PSE_PW_D_STATUS, + ETHTOOL_A_C33_PSE_PW_CLASS, + ETHTOOL_A_C33_PSE_ACTUAL_PW, + ETHTOOL_A_C33_PSE_EXT_STATE, + ETHTOOL_A_C33_PSE_EXT_SUBSTATE, + ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT, + ETHTOOL_A_C33_PSE_PW_LIMIT_RANGES, + __ETHTOOL_A_PSE_CNT, ETHTOOL_A_PSE_MAX = (__ETHTOOL_A_PSE_CNT - 1) }; @@ -724,70 +605,62 @@ enum { enum { ETHTOOL_A_RSS_UNSPEC, ETHTOOL_A_RSS_HEADER, - ETHTOOL_A_RSS_CONTEXT, /* u32 */ - ETHTOOL_A_RSS_HFUNC, /* u32 */ - ETHTOOL_A_RSS_INDIR, /* binary */ - ETHTOOL_A_RSS_HKEY, /* binary */ - ETHTOOL_A_RSS_INPUT_XFRM, /* u32 */ - ETHTOOL_A_RSS_START_CONTEXT, /* u32 */ + ETHTOOL_A_RSS_CONTEXT, + ETHTOOL_A_RSS_HFUNC, + ETHTOOL_A_RSS_INDIR, + ETHTOOL_A_RSS_HKEY, + ETHTOOL_A_RSS_INPUT_XFRM, + ETHTOOL_A_RSS_START_CONTEXT, __ETHTOOL_A_RSS_CNT, ETHTOOL_A_RSS_MAX = (__ETHTOOL_A_RSS_CNT - 1), }; -/* PLCA */ - enum { ETHTOOL_A_PLCA_UNSPEC, - ETHTOOL_A_PLCA_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_PLCA_VERSION, /* u16 */ - ETHTOOL_A_PLCA_ENABLED, /* u8 */ - ETHTOOL_A_PLCA_STATUS, /* u8 */ - ETHTOOL_A_PLCA_NODE_CNT, /* u32 */ - ETHTOOL_A_PLCA_NODE_ID, /* u32 */ - ETHTOOL_A_PLCA_TO_TMR, /* u32 */ - ETHTOOL_A_PLCA_BURST_CNT, /* u32 */ - ETHTOOL_A_PLCA_BURST_TMR, /* u32 */ - - /* add new constants above here */ + ETHTOOL_A_PLCA_HEADER, + ETHTOOL_A_PLCA_VERSION, + ETHTOOL_A_PLCA_ENABLED, + ETHTOOL_A_PLCA_STATUS, + ETHTOOL_A_PLCA_NODE_CNT, + ETHTOOL_A_PLCA_NODE_ID, + ETHTOOL_A_PLCA_TO_TMR, + ETHTOOL_A_PLCA_BURST_CNT, + ETHTOOL_A_PLCA_BURST_TMR, + __ETHTOOL_A_PLCA_CNT, ETHTOOL_A_PLCA_MAX = (__ETHTOOL_A_PLCA_CNT - 1) }; -/* MODULE_FW_FLASH */ - enum { ETHTOOL_A_MODULE_FW_FLASH_UNSPEC, - ETHTOOL_A_MODULE_FW_FLASH_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_MODULE_FW_FLASH_FILE_NAME, /* string */ - ETHTOOL_A_MODULE_FW_FLASH_PASSWORD, /* u32 */ - ETHTOOL_A_MODULE_FW_FLASH_STATUS, /* u32 */ - ETHTOOL_A_MODULE_FW_FLASH_STATUS_MSG, /* string */ - ETHTOOL_A_MODULE_FW_FLASH_DONE, /* uint */ - ETHTOOL_A_MODULE_FW_FLASH_TOTAL, /* uint */ - - /* add new constants above here */ + ETHTOOL_A_MODULE_FW_FLASH_HEADER, + ETHTOOL_A_MODULE_FW_FLASH_FILE_NAME, + ETHTOOL_A_MODULE_FW_FLASH_PASSWORD, + ETHTOOL_A_MODULE_FW_FLASH_STATUS, + ETHTOOL_A_MODULE_FW_FLASH_STATUS_MSG, + ETHTOOL_A_MODULE_FW_FLASH_DONE, + ETHTOOL_A_MODULE_FW_FLASH_TOTAL, + __ETHTOOL_A_MODULE_FW_FLASH_CNT, ETHTOOL_A_MODULE_FW_FLASH_MAX = (__ETHTOOL_A_MODULE_FW_FLASH_CNT - 1) }; enum { ETHTOOL_A_PHY_UNSPEC, - ETHTOOL_A_PHY_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_PHY_INDEX, /* u32 */ - ETHTOOL_A_PHY_DRVNAME, /* string */ - ETHTOOL_A_PHY_NAME, /* string */ - ETHTOOL_A_PHY_UPSTREAM_TYPE, /* u32 */ - ETHTOOL_A_PHY_UPSTREAM_INDEX, /* u32 */ - ETHTOOL_A_PHY_UPSTREAM_SFP_NAME, /* string */ - ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME, /* string */ - - /* add new constants above here */ + ETHTOOL_A_PHY_HEADER, + ETHTOOL_A_PHY_INDEX, + ETHTOOL_A_PHY_DRVNAME, + ETHTOOL_A_PHY_NAME, + ETHTOOL_A_PHY_UPSTREAM_TYPE, + ETHTOOL_A_PHY_UPSTREAM_INDEX, + ETHTOOL_A_PHY_UPSTREAM_SFP_NAME, + ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME, + __ETHTOOL_A_PHY_CNT, ETHTOOL_A_PHY_MAX = (__ETHTOOL_A_PHY_CNT - 1) }; -/* message types - userspace to kernel */ enum { ETHTOOL_MSG_USER_NONE, ETHTOOL_MSG_STRSET_GET, @@ -836,12 +709,10 @@ enum { ETHTOOL_MSG_MODULE_FW_FLASH_ACT, ETHTOOL_MSG_PHY_GET, - /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, ETHTOOL_MSG_USER_MAX = __ETHTOOL_MSG_USER_CNT - 1 }; -/* message types - kernel to userspace */ enum { ETHTOOL_MSG_KERNEL_NONE, ETHTOOL_MSG_STRSET_GET_REPLY, @@ -891,7 +762,6 @@ enum { ETHTOOL_MSG_PHY_GET_REPLY, ETHTOOL_MSG_PHY_NTF, - /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, ETHTOOL_MSG_KERNEL_MAX = __ETHTOOL_MSG_KERNEL_CNT - 1 }; From 8d0580c6ebdd27879c83483f53bc71e2e470f6fe Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 4 Dec 2024 07:55:49 -0800 Subject: [PATCH 0195/1450] ethtool: regenerate uapi header from the spec No functional changes. Mostly the following formatting: - extra docs - extra enums - XXX_MAX = __XXX_CNT - 1 -> XXX_MAX = (__XXX_CNT - 1) - newlines Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20241204155549.641348-9-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- .../uapi/linux/ethtool_netlink_generated.h | 89 ++++++++++++------- 1 file changed, 56 insertions(+), 33 deletions(-) diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 35a24d490efef..b58f352fe4f2a 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -1,23 +1,43 @@ /* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/ethtool.yaml */ +/* YNL-GEN uapi header */ + #ifndef _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H #define _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H +#define ETHTOOL_FAMILY_NAME "ethtool" +#define ETHTOOL_FAMILY_VERSION 1 + enum { ETHTOOL_UDP_TUNNEL_TYPE_VXLAN, ETHTOOL_UDP_TUNNEL_TYPE_GENEVE, ETHTOOL_UDP_TUNNEL_TYPE_VXLAN_GPE, - __ETHTOOL_UDP_TUNNEL_TYPE_CNT + /* private: */ + __ETHTOOL_UDP_TUNNEL_TYPE_CNT, + ETHTOOL_UDP_TUNNEL_TYPE_MAX = (__ETHTOOL_UDP_TUNNEL_TYPE_CNT - 1) }; +/** + * enum ethtool_header_flags - common ethtool header flags + * @ETHTOOL_FLAG_COMPACT_BITSETS: use compact bitsets in reply + * @ETHTOOL_FLAG_OMIT_REPLY: provide optional reply for SET or ACT requests + * @ETHTOOL_FLAG_STATS: request statistics, if supported by the driver + */ enum ethtool_header_flags { - ETHTOOL_FLAG_COMPACT_BITSETS = 1 << 0, /* use compact bitsets in reply */ - ETHTOOL_FLAG_OMIT_REPLY = 1 << 1, /* provide optional reply for SET or ACT requests */ - ETHTOOL_FLAG_STATS = 1 << 2, /* request statistics, if supported by the driver */ + ETHTOOL_FLAG_COMPACT_BITSETS = 1, + ETHTOOL_FLAG_OMIT_REPLY = 2, + ETHTOOL_FLAG_STATS = 4, }; enum { - ETHTOOL_TCP_DATA_SPLIT_UNKNOWN = 0, + ETHTOOL_PHY_UPSTREAM_TYPE_MAC, + ETHTOOL_PHY_UPSTREAM_TYPE_PHY, +}; + +enum ethtool_tcp_data_split { + ETHTOOL_TCP_DATA_SPLIT_UNKNOWN, ETHTOOL_TCP_DATA_SPLIT_DISABLED, ETHTOOL_TCP_DATA_SPLIT_ENABLED, }; @@ -30,7 +50,7 @@ enum { ETHTOOL_A_HEADER_PHY_INDEX, __ETHTOOL_A_HEADER_CNT, - ETHTOOL_A_HEADER_MAX = __ETHTOOL_A_HEADER_CNT - 1 + ETHTOOL_A_HEADER_MAX = (__ETHTOOL_A_HEADER_CNT - 1) }; enum { @@ -40,7 +60,7 @@ enum { ETHTOOL_A_BITSET_BIT_VALUE, __ETHTOOL_A_BITSET_BIT_CNT, - ETHTOOL_A_BITSET_BIT_MAX = __ETHTOOL_A_BITSET_BIT_CNT - 1 + ETHTOOL_A_BITSET_BIT_MAX = (__ETHTOOL_A_BITSET_BIT_CNT - 1) }; enum { @@ -48,7 +68,7 @@ enum { ETHTOOL_A_BITSET_BITS_BIT, __ETHTOOL_A_BITSET_BITS_CNT, - ETHTOOL_A_BITSET_BITS_MAX = __ETHTOOL_A_BITSET_BITS_CNT - 1 + ETHTOOL_A_BITSET_BITS_MAX = (__ETHTOOL_A_BITSET_BITS_CNT - 1) }; enum { @@ -60,7 +80,7 @@ enum { ETHTOOL_A_BITSET_MASK, __ETHTOOL_A_BITSET_CNT, - ETHTOOL_A_BITSET_MAX = __ETHTOOL_A_BITSET_CNT - 1 + ETHTOOL_A_BITSET_MAX = (__ETHTOOL_A_BITSET_CNT - 1) }; enum { @@ -69,7 +89,7 @@ enum { ETHTOOL_A_STRING_VALUE, __ETHTOOL_A_STRING_CNT, - ETHTOOL_A_STRING_MAX = __ETHTOOL_A_STRING_CNT - 1 + ETHTOOL_A_STRING_MAX = (__ETHTOOL_A_STRING_CNT - 1) }; enum { @@ -77,7 +97,7 @@ enum { ETHTOOL_A_STRINGS_STRING, __ETHTOOL_A_STRINGS_CNT, - ETHTOOL_A_STRINGS_MAX = __ETHTOOL_A_STRINGS_CNT - 1 + ETHTOOL_A_STRINGS_MAX = (__ETHTOOL_A_STRINGS_CNT - 1) }; enum { @@ -87,7 +107,7 @@ enum { ETHTOOL_A_STRINGSET_STRINGS, __ETHTOOL_A_STRINGSET_CNT, - ETHTOOL_A_STRINGSET_MAX = __ETHTOOL_A_STRINGSET_CNT - 1 + ETHTOOL_A_STRINGSET_MAX = (__ETHTOOL_A_STRINGSET_CNT - 1) }; enum { @@ -95,7 +115,7 @@ enum { ETHTOOL_A_STRINGSETS_STRINGSET, __ETHTOOL_A_STRINGSETS_CNT, - ETHTOOL_A_STRINGSETS_MAX = __ETHTOOL_A_STRINGSETS_CNT - 1 + ETHTOOL_A_STRINGSETS_MAX = (__ETHTOOL_A_STRINGSETS_CNT - 1) }; enum { @@ -105,7 +125,7 @@ enum { ETHTOOL_A_STRSET_COUNTS_ONLY, __ETHTOOL_A_STRSET_CNT, - ETHTOOL_A_STRSET_MAX = __ETHTOOL_A_STRSET_CNT - 1 + ETHTOOL_A_STRSET_MAX = (__ETHTOOL_A_STRSET_CNT - 1) }; enum { @@ -114,7 +134,7 @@ enum { ETHTOOL_A_PRIVFLAGS_FLAGS, __ETHTOOL_A_PRIVFLAGS_CNT, - ETHTOOL_A_PRIVFLAGS_MAX = __ETHTOOL_A_PRIVFLAGS_CNT - 1 + ETHTOOL_A_PRIVFLAGS_MAX = (__ETHTOOL_A_PRIVFLAGS_CNT - 1) }; enum { @@ -182,7 +202,7 @@ enum { ETHTOOL_A_LINKINFO_TRANSCEIVER, __ETHTOOL_A_LINKINFO_CNT, - ETHTOOL_A_LINKINFO_MAX = __ETHTOOL_A_LINKINFO_CNT - 1 + ETHTOOL_A_LINKINFO_MAX = (__ETHTOOL_A_LINKINFO_CNT - 1) }; enum { @@ -199,7 +219,7 @@ enum { ETHTOOL_A_LINKMODES_RATE_MATCHING, __ETHTOOL_A_LINKMODES_CNT, - ETHTOOL_A_LINKMODES_MAX = __ETHTOOL_A_LINKMODES_CNT - 1 + ETHTOOL_A_LINKMODES_MAX = (__ETHTOOL_A_LINKMODES_CNT - 1) }; enum { @@ -213,7 +233,7 @@ enum { ETHTOOL_A_LINKSTATE_EXT_DOWN_CNT, __ETHTOOL_A_LINKSTATE_CNT, - ETHTOOL_A_LINKSTATE_MAX = __ETHTOOL_A_LINKSTATE_CNT - 1 + ETHTOOL_A_LINKSTATE_MAX = (__ETHTOOL_A_LINKSTATE_CNT - 1) }; enum { @@ -222,7 +242,7 @@ enum { ETHTOOL_A_DEBUG_MSGMASK, __ETHTOOL_A_DEBUG_CNT, - ETHTOOL_A_DEBUG_MAX = __ETHTOOL_A_DEBUG_CNT - 1 + ETHTOOL_A_DEBUG_MAX = (__ETHTOOL_A_DEBUG_CNT - 1) }; enum { @@ -232,7 +252,7 @@ enum { ETHTOOL_A_WOL_SOPASS, __ETHTOOL_A_WOL_CNT, - ETHTOOL_A_WOL_MAX = __ETHTOOL_A_WOL_CNT - 1 + ETHTOOL_A_WOL_MAX = (__ETHTOOL_A_WOL_CNT - 1) }; enum { @@ -244,7 +264,7 @@ enum { ETHTOOL_A_FEATURES_NOCHANGE, __ETHTOOL_A_FEATURES_CNT, - ETHTOOL_A_FEATURES_MAX = __ETHTOOL_A_FEATURES_CNT - 1 + ETHTOOL_A_FEATURES_MAX = (__ETHTOOL_A_FEATURES_CNT - 1) }; enum { @@ -276,6 +296,7 @@ enum { enum { ETHTOOL_A_PROFILE_UNSPEC, ETHTOOL_A_PROFILE_IRQ_MODERATION, + __ETHTOOL_A_PROFILE_CNT, ETHTOOL_A_PROFILE_MAX = (__ETHTOOL_A_PROFILE_CNT - 1) }; @@ -362,7 +383,6 @@ enum { __ETHTOOL_A_TS_STAT_CNT, ETHTOOL_A_TS_STAT_MAX = (__ETHTOOL_A_TS_STAT_CNT - 1) - }; enum { @@ -412,7 +432,7 @@ enum { ETHTOOL_A_CABLE_TEST_HEADER, __ETHTOOL_A_CABLE_TEST_CNT, - ETHTOOL_A_CABLE_TEST_MAX = __ETHTOOL_A_CABLE_TEST_CNT - 1 + ETHTOOL_A_CABLE_TEST_MAX = (__ETHTOOL_A_CABLE_TEST_CNT - 1) }; enum { @@ -433,7 +453,7 @@ enum { ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR, __ETHTOOL_A_CABLE_TEST_TDR_CFG_CNT, - ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX = __ETHTOOL_A_CABLE_TEST_TDR_CFG_CNT - 1 + ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX = (__ETHTOOL_A_CABLE_TEST_TDR_CFG_CNT - 1) }; enum { @@ -443,7 +463,7 @@ enum { ETHTOOL_A_CABLE_TEST_TDR_NTF_NEST, __ETHTOOL_A_CABLE_TEST_TDR_NTF_CNT, - ETHTOOL_A_CABLE_TEST_TDR_NTF_MAX = __ETHTOOL_A_CABLE_TEST_TDR_NTF_CNT - 1 + ETHTOOL_A_CABLE_TEST_TDR_NTF_MAX = (__ETHTOOL_A_CABLE_TEST_TDR_NTF_CNT - 1) }; enum { @@ -452,7 +472,7 @@ enum { ETHTOOL_A_CABLE_TEST_TDR_CFG, __ETHTOOL_A_CABLE_TEST_TDR_CNT, - ETHTOOL_A_CABLE_TEST_TDR_MAX = __ETHTOOL_A_CABLE_TEST_TDR_CNT - 1 + ETHTOOL_A_CABLE_TEST_TDR_MAX = (__ETHTOOL_A_CABLE_TEST_TDR_CNT - 1) }; enum { @@ -580,6 +600,9 @@ enum { ETHTOOL_A_C33_PSE_PW_LIMIT_UNSPEC, ETHTOOL_A_C33_PSE_PW_LIMIT_MIN, ETHTOOL_A_C33_PSE_PW_LIMIT_MAX, + + __ETHTOOL_A_C33_PSE_PW_LIMIT_CNT, + __ETHTOOL_A_C33_PSE_PW_LIMIT_MAX = (__ETHTOOL_A_C33_PSE_PW_LIMIT_CNT - 1) }; enum { @@ -613,7 +636,7 @@ enum { ETHTOOL_A_RSS_START_CONTEXT, __ETHTOOL_A_RSS_CNT, - ETHTOOL_A_RSS_MAX = (__ETHTOOL_A_RSS_CNT - 1), + ETHTOOL_A_RSS_MAX = (__ETHTOOL_A_RSS_CNT - 1) }; enum { @@ -662,8 +685,8 @@ enum { }; enum { - ETHTOOL_MSG_USER_NONE, - ETHTOOL_MSG_STRSET_GET, + ETHTOOL_MSG_USER_NONE = 0, + ETHTOOL_MSG_STRSET_GET = 1, ETHTOOL_MSG_LINKINFO_GET, ETHTOOL_MSG_LINKINFO_SET, ETHTOOL_MSG_LINKMODES_GET, @@ -710,12 +733,12 @@ enum { ETHTOOL_MSG_PHY_GET, __ETHTOOL_MSG_USER_CNT, - ETHTOOL_MSG_USER_MAX = __ETHTOOL_MSG_USER_CNT - 1 + ETHTOOL_MSG_USER_MAX = (__ETHTOOL_MSG_USER_CNT - 1) }; enum { - ETHTOOL_MSG_KERNEL_NONE, - ETHTOOL_MSG_STRSET_GET_REPLY, + ETHTOOL_MSG_KERNEL_NONE = 0, + ETHTOOL_MSG_STRSET_GET_REPLY = 1, ETHTOOL_MSG_LINKINFO_GET_REPLY, ETHTOOL_MSG_LINKINFO_NTF, ETHTOOL_MSG_LINKMODES_GET_REPLY, @@ -763,7 +786,7 @@ enum { ETHTOOL_MSG_PHY_NTF, __ETHTOOL_MSG_KERNEL_CNT, - ETHTOOL_MSG_KERNEL_MAX = __ETHTOOL_MSG_KERNEL_CNT - 1 + ETHTOOL_MSG_KERNEL_MAX = (__ETHTOOL_MSG_KERNEL_CNT - 1) }; #endif /* _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H */ From a5686ae820fa7ab03226a3b0ff529720b7bac599 Mon Sep 17 00:00:00 2001 From: Karthikeyan Periyasamy Date: Wed, 4 Dec 2024 18:32:10 +0200 Subject: [PATCH 0196/1450] wifi: ath12k: move ATH12K_FLAG_REGISTERED handling to ath12k_mac_register() When hardware device group abstraction is introduced, a group abstraction is registered to mac80211 rather than a particular single device. So we cannot set the device registered when the QMI firmware ready event is received, only after all the devices in group have received the event. To do that set and unset ATH12K_FLAG_REGISTERED flag inside ath12k_mac_register() and ath12k_mac_unregister() respectively. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.3.1-00173-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Karthikeyan Periyasamy Signed-off-by: Harshitha Prem Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241204163216.433795-2-kvalo@kernel.org Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/mac.c | 4 ++++ drivers/net/wireless/ath/ath12k/qmi.c | 4 +--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index 129607ac6c1a1..1180070278dac 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -10839,6 +10839,8 @@ int ath12k_mac_register(struct ath12k_base *ab) goto err; } + set_bit(ATH12K_FLAG_REGISTERED, &ab->dev_flags); + return 0; err: @@ -10858,6 +10860,8 @@ void ath12k_mac_unregister(struct ath12k_base *ab) struct ath12k_hw *ah; int i; + clear_bit(ATH12K_FLAG_REGISTERED, &ab->dev_flags); + for (i = ath12k_get_num_hw(ab) - 1; i >= 0; i--) { ah = ath12k_ab_to_ah(ab, i); if (!ah) diff --git a/drivers/net/wireless/ath/ath12k/qmi.c b/drivers/net/wireless/ath/ath12k/qmi.c index 77d8ee14bf33e..20382b751829e 100644 --- a/drivers/net/wireless/ath/ath12k/qmi.c +++ b/drivers/net/wireless/ath/ath12k/qmi.c @@ -3349,11 +3349,9 @@ static void ath12k_qmi_driver_event_work(struct work_struct *work) &ab->dev_flags); clear_bit(ATH12K_FLAG_RECOVERY, &ab->dev_flags); ret = ath12k_core_qmi_firmware_ready(ab); - if (!ret) { + if (!ret) set_bit(ATH12K_FLAG_QMI_FW_READY_COMPLETE, &ab->dev_flags); - set_bit(ATH12K_FLAG_REGISTERED, &ab->dev_flags); - } break; default: From 46d16f7e1d1413ad7ff99c1334d8874623717745 Mon Sep 17 00:00:00 2001 From: Aditya Kumar Singh Date: Wed, 4 Dec 2024 18:32:11 +0200 Subject: [PATCH 0197/1450] wifi: ath12k: rename mlo_capable_flags to single_chip_mlo_supp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At present, the mlo_capable_flags in ath12k_base is used to indicate whether the chip supports inter (QCN9274) or intra (WCN7850) chip MLO. However, it’s possible that the chip supports neither, especially with older firmware versions. Additionally, if intra chip MLO is not supported, inter chip MLO will also be non-functional. Therefore, having two separate flags for this is unnecessary. Therefore, rename this flag to single_chip_mlo_supp. At the same time convert it into a bool data type. Also, get rid of the enums defined earlier. For the QCN9274 family of chipsets, this will be set only when firmware advertises the support during the QMI exchange. For the WCN7850 family of chipsets, since the event is not supported, assumption is made that single chip MLO is supported. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.3.1-00173-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Aditya Kumar Singh Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241204163216.433795-3-kvalo@kernel.org Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/core.c | 2 +- drivers/net/wireless/ath/ath12k/core.h | 23 ++--------------------- drivers/net/wireless/ath/ath12k/qmi.c | 13 +++++-------- 3 files changed, 8 insertions(+), 30 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/core.c b/drivers/net/wireless/ath/ath12k/core.c index 4da147f7bfac6..568c9b6e2c1ce 100644 --- a/drivers/net/wireless/ath/ath12k/core.c +++ b/drivers/net/wireless/ath/ath12k/core.c @@ -1325,7 +1325,7 @@ struct ath12k_base *ath12k_core_alloc(struct device *dev, size_t priv_size, ab->dev = dev; ab->hif.bus = bus; ab->qmi.num_radios = U8_MAX; - ab->mlo_capable_flags = ATH12K_INTRA_DEVICE_MLO_SUPPORT; + ab->single_chip_mlo_supp = false; /* Device index used to identify the devices in a group. * diff --git a/drivers/net/wireless/ath/ath12k/core.h b/drivers/net/wireless/ath/ath12k/core.h index 9ddced140056c..d93ba844f61da 100644 --- a/drivers/net/wireless/ath/ath12k/core.h +++ b/drivers/net/wireless/ath/ath12k/core.h @@ -816,21 +816,6 @@ struct ath12k_soc_dp_stats { struct ath12k_soc_dp_tx_err_stats tx_err; }; -/** - * enum ath12k_link_capable_flags - link capable flags - * - * Single/Multi link capability information - * - * @ATH12K_INTRA_DEVICE_MLO_SUPPORT: SLO/MLO form between the radio, where all - * the links (radios) present within a device. - * @ATH12K_INTER_DEVICE_MLO_SUPPORT: SLO/MLO form between the radio, where all - * the links (radios) present across the devices. - */ -enum ath12k_link_capable_flags { - ATH12K_INTRA_DEVICE_MLO_SUPPORT = BIT(0), - ATH12K_INTER_DEVICE_MLO_SUPPORT = BIT(1), -}; - /* Master structure to hold the hw data which may be used in core module */ struct ath12k_base { enum ath12k_hw_rev hw_rev; @@ -996,12 +981,8 @@ struct ath12k_base { const struct hal_rx_ops *hal_rx_ops; - /* mlo_capable_flags denotes the single/multi link operation - * capabilities of the Device. - * - * See enum ath12k_link_capable_flags - */ - u8 mlo_capable_flags; + /* Denotes the whether MLO is possible within the chip */ + bool single_chip_mlo_supp; struct completion restart_completed; diff --git a/drivers/net/wireless/ath/ath12k/qmi.c b/drivers/net/wireless/ath/ath12k/qmi.c index 20382b751829e..efcf2dfac4ac9 100644 --- a/drivers/net/wireless/ath/ath12k/qmi.c +++ b/drivers/net/wireless/ath/ath12k/qmi.c @@ -2023,14 +2023,14 @@ static void ath12k_host_cap_parse_mlo(struct ath12k_base *ab, u8 hw_link_id = 0; int i; - if (!(ab->mlo_capable_flags & ATH12K_INTRA_DEVICE_MLO_SUPPORT)) { + if (!ab->single_chip_mlo_supp) { ath12k_dbg(ab, ATH12K_DBG_QMI, "intra device MLO is disabled hence skip QMI MLO cap"); return; } if (!ab->qmi.num_radios || ab->qmi.num_radios == U8_MAX) { - ab->mlo_capable_flags = 0; + ab->single_chip_mlo_supp = false; ath12k_dbg(ab, ATH12K_DBG_QMI, "skip QMI MLO cap due to invalid num_radio %d\n", @@ -2176,12 +2176,9 @@ static void ath12k_qmi_phy_cap_send(struct ath12k_base *ab) goto out; } - if (resp.single_chip_mlo_support_valid) { - if (resp.single_chip_mlo_support) - ab->mlo_capable_flags |= ATH12K_INTRA_DEVICE_MLO_SUPPORT; - else - ab->mlo_capable_flags &= ~ATH12K_INTRA_DEVICE_MLO_SUPPORT; - } + if (resp.single_chip_mlo_support_valid && + resp.single_chip_mlo_support) + ab->single_chip_mlo_supp = true; if (!resp.num_phy_valid) { ret = -ENODATA; From 6f245ea0ec6c29b90c8fa4fdf6e178c646125d7e Mon Sep 17 00:00:00 2001 From: Karthikeyan Periyasamy Date: Wed, 4 Dec 2024 18:32:12 +0200 Subject: [PATCH 0198/1450] wifi: ath12k: introduce device group abstraction Currently, single device is probed, and once firmware is ready, the device is registered to mac80211. For Multi-Link Operation, different bands of different devices or same device are part of a single wiphy and for this, hardware device group abstraction is needed. Hardware device group abstraction - when there are multiple devices (with single radio or dual radio) that are connected by any means of interface for communicating between them, then these devices can be combined together as a single group using a group id to form a group abstraction and register to mac80211. The grouping information of multiple devices would be based on device tree during device probe (will be implemented in future patches). If no such information is available, then a single device will be part of group abstraction and registered to mac80211, else multiple devices advertised in device tree are combined and then registered to mac80211. For device group abstraction, a base structure ath12k_hw_group (ag) and the helpers are implemented. These helpers are used during device probe and mapping the group to the devices involved. An illustration of how multiple devices might be combined together in future based on group id: +------------------------------------------------------------------------+ | +-------------------------------------+ +-------------------+ | | | +-----------+ | | +-----------+ | | +-----------+ | | | | | ar (2GHz) | | | | ar (5GHz) | | | | ar (6GHz) | | | | | +-----------+ | | +-----------+ | | +-----------+ | | | | ath12k_base (ab) | | ath12k_base (ab) | | | | (Dual band device) | | | | | +-------------------------------------+ +-------------------+ | | ath12k_hw_group (ag) based on group id | +------------------------------------------------------------------------+ In the above representation, two devices are combined into single group based on group id. Add base code changes where single device would be part of a group with an invalid group id forming an group abstraction. Multi device grouping will be introduced in future. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.3.1-00173-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Karthikeyan Periyasamy Co-developed-by: Harshitha Prem Signed-off-by: Harshitha Prem Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241204163216.433795-4-kvalo@kernel.org Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/core.c | 231 +++++++++++++++++++++++-- drivers/net/wireless/ath/ath12k/core.h | 17 ++ drivers/net/wireless/ath/ath12k/pci.c | 1 + 3 files changed, 236 insertions(+), 13 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/core.c b/drivers/net/wireless/ath/ath12k/core.c index 568c9b6e2c1ce..41e3454b60f50 100644 --- a/drivers/net/wireless/ath/ath12k/core.c +++ b/drivers/net/wireless/ath/ath12k/core.c @@ -22,6 +22,11 @@ unsigned int ath12k_debug_mask; module_param_named(debug_mask, ath12k_debug_mask, uint, 0644); MODULE_PARM_DESC(debug_mask, "Debugging mask"); +/* protected with ath12k_hw_group_mutex */ +static struct list_head ath12k_hw_group_list = LIST_HEAD_INIT(ath12k_hw_group_list); + +static DEFINE_MUTEX(ath12k_hw_group_mutex); + static int ath12k_core_rfkill_config(struct ath12k_base *ab) { struct ath12k *ar; @@ -1244,27 +1249,112 @@ static void ath12k_core_panic_notifier_unregister(struct ath12k_base *ab) &ab->panic_nb); } -int ath12k_core_init(struct ath12k_base *ab) +static inline +bool ath12k_core_hw_group_create_ready(struct ath12k_hw_group *ag) { - int ret; + lockdep_assert_held(&ag->mutex); - ret = ath12k_core_soc_create(ab); - if (ret) { - ath12k_err(ab, "failed to create soc core: %d\n", ret); - return ret; + return (ag->num_probed == ag->num_devices); +} + +static struct ath12k_hw_group *ath12k_core_hw_group_alloc(u8 id, u8 max_devices) +{ + struct ath12k_hw_group *ag; + + lockdep_assert_held(&ath12k_hw_group_mutex); + + ag = kzalloc(sizeof(*ag), GFP_KERNEL); + if (!ag) + return NULL; + + ag->id = id; + ag->num_devices = max_devices; + list_add(&ag->list, &ath12k_hw_group_list); + mutex_init(&ag->mutex); + + return ag; +} + +static void ath12k_core_hw_group_free(struct ath12k_hw_group *ag) +{ + mutex_lock(&ath12k_hw_group_mutex); + + list_del(&ag->list); + kfree(ag); + + mutex_unlock(&ath12k_hw_group_mutex); +} + +static struct ath12k_hw_group *ath12k_core_hw_group_assign(struct ath12k_base *ab) +{ + u32 group_id = ATH12K_INVALID_GROUP_ID; + struct ath12k_hw_group *ag; + + lockdep_assert_held(&ath12k_hw_group_mutex); + + /* The grouping of multiple devices will be done based on device tree file. + * TODO: device tree file parsing to know about the devices involved in group. + * + * The platforms that do not have any valid group information would have each + * device to be part of its own invalid group. + * + * Currently, we are not parsing any device tree information and hence, grouping + * of multiple devices is not involved. Thus, single device is added to device + * group. + */ + ag = ath12k_core_hw_group_alloc(group_id, 1); + if (!ag) { + ath12k_warn(ab, "unable to create new hw group\n"); + return NULL; } - ret = ath12k_core_panic_notifier_register(ab); - if (ret) - ath12k_warn(ab, "failed to register panic handler: %d\n", ret); + ath12k_dbg(ab, ATH12K_DBG_BOOT, "single device added to hardware group\n"); - return 0; + ab->device_id = ag->num_probed++; + ag->ab[ab->device_id] = ab; + ab->ag = ag; + + return ag; } -void ath12k_core_deinit(struct ath12k_base *ab) +void ath12k_core_hw_group_unassign(struct ath12k_base *ab) { - ath12k_core_panic_notifier_unregister(ab); + struct ath12k_hw_group *ag = ab->ag; + u8 device_id = ab->device_id; + int num_probed; + + if (!ag) + return; + + mutex_lock(&ag->mutex); + + if (WARN_ON(device_id >= ag->num_devices)) { + mutex_unlock(&ag->mutex); + return; + } + + if (WARN_ON(ag->ab[device_id] != ab)) { + mutex_unlock(&ag->mutex); + return; + } + + ag->ab[device_id] = NULL; + ab->ag = NULL; + ab->device_id = ATH12K_INVALID_DEVICE_ID; + + if (ag->num_probed) + ag->num_probed--; + num_probed = ag->num_probed; + + mutex_unlock(&ag->mutex); + + if (!num_probed) + ath12k_core_hw_group_free(ag); +} + +static void ath12k_core_device_cleanup(struct ath12k_base *ab) +{ mutex_lock(&ab->core_lock); ath12k_hif_irq_disable(ab); @@ -1274,8 +1364,123 @@ void ath12k_core_deinit(struct ath12k_base *ab) ath12k_core_stop(ab); mutex_unlock(&ab->core_lock); +} - ath12k_core_soc_destroy(ab); +static void ath12k_core_hw_group_destroy(struct ath12k_hw_group *ag) +{ + struct ath12k_base *ab; + int i; + + if (WARN_ON(!ag)) + return; + + for (i = 0; i < ag->num_devices; i++) { + ab = ag->ab[i]; + if (!ab) + continue; + + ath12k_core_soc_destroy(ab); + } +} + +static void ath12k_core_hw_group_cleanup(struct ath12k_hw_group *ag) +{ + struct ath12k_base *ab; + int i; + + if (!ag) + return; + + mutex_lock(&ag->mutex); + + for (i = 0; i < ag->num_devices; i++) { + ab = ag->ab[i]; + if (!ab) + continue; + + ath12k_core_device_cleanup(ab); + } + + mutex_unlock(&ag->mutex); +} + +static int ath12k_core_hw_group_create(struct ath12k_hw_group *ag) +{ + struct ath12k_base *ab; + int i, ret; + + lockdep_assert_held(&ag->mutex); + + for (i = 0; i < ag->num_devices; i++) { + ab = ag->ab[i]; + if (!ab) + continue; + + mutex_lock(&ab->core_lock); + + ret = ath12k_core_soc_create(ab); + if (ret) { + mutex_unlock(&ab->core_lock); + ath12k_err(ab, "failed to create soc core: %d\n", ret); + return ret; + } + + mutex_unlock(&ab->core_lock); + } + + return 0; +} + +int ath12k_core_init(struct ath12k_base *ab) +{ + struct ath12k_hw_group *ag; + int ret; + + ret = ath12k_core_panic_notifier_register(ab); + if (ret) + ath12k_warn(ab, "failed to register panic handler: %d\n", ret); + + mutex_lock(&ath12k_hw_group_mutex); + + ag = ath12k_core_hw_group_assign(ab); + if (!ag) { + mutex_unlock(&ath12k_hw_group_mutex); + ath12k_warn(ab, "unable to get hw group\n"); + return -ENODEV; + } + + mutex_unlock(&ath12k_hw_group_mutex); + + mutex_lock(&ag->mutex); + + ath12k_dbg(ab, ATH12K_DBG_BOOT, "num devices %d num probed %d\n", + ag->num_devices, ag->num_probed); + + if (ath12k_core_hw_group_create_ready(ag)) { + ret = ath12k_core_hw_group_create(ag); + if (ret) { + mutex_unlock(&ag->mutex); + ath12k_warn(ab, "unable to create hw group\n"); + goto err; + } + } + + mutex_unlock(&ag->mutex); + + return 0; + +err: + ath12k_core_hw_group_destroy(ab->ag); + ath12k_core_hw_group_unassign(ab); + return ret; +} + +void ath12k_core_deinit(struct ath12k_base *ab) +{ + ath12k_core_panic_notifier_unregister(ab); + ath12k_core_hw_group_cleanup(ab->ag); + ath12k_core_hw_group_destroy(ab->ag); + ath12k_core_hw_group_unassign(ab); } void ath12k_core_free(struct ath12k_base *ab) diff --git a/drivers/net/wireless/ath/ath12k/core.h b/drivers/net/wireless/ath/ath12k/core.h index d93ba844f61da..dca4b9a3538f4 100644 --- a/drivers/net/wireless/ath/ath12k/core.h +++ b/drivers/net/wireless/ath/ath12k/core.h @@ -816,6 +816,20 @@ struct ath12k_soc_dp_stats { struct ath12k_soc_dp_tx_err_stats tx_err; }; +/* Holds info on the group of devices that are registered as a single + * wiphy, protected with struct ath12k_hw_group::mutex. + */ +struct ath12k_hw_group { + struct list_head list; + u8 id; + u8 num_devices; + u8 num_probed; + struct ath12k_base *ab[ATH12K_MAX_SOCS]; + + /* protects access to this struct */ + struct mutex mutex; +}; + /* Master structure to hold the hw data which may be used in core module */ struct ath12k_base { enum ath12k_hw_rev hw_rev; @@ -1005,6 +1019,8 @@ struct ath12k_base { struct notifier_block panic_nb; + struct ath12k_hw_group *ag; + /* must be last */ u8 drv_priv[] __aligned(sizeof(void *)); }; @@ -1035,6 +1051,7 @@ int ath12k_core_resume_early(struct ath12k_base *ab); int ath12k_core_resume(struct ath12k_base *ab); int ath12k_core_suspend(struct ath12k_base *ab); int ath12k_core_suspend_late(struct ath12k_base *ab); +void ath12k_core_hw_group_unassign(struct ath12k_base *ab); const struct firmware *ath12k_core_firmware_request(struct ath12k_base *ab, const char *filename); diff --git a/drivers/net/wireless/ath/ath12k/pci.c b/drivers/net/wireless/ath/ath12k/pci.c index 8dbc7377ae7ca..06cff3849ab8d 100644 --- a/drivers/net/wireless/ath/ath12k/pci.c +++ b/drivers/net/wireless/ath/ath12k/pci.c @@ -1725,6 +1725,7 @@ static void ath12k_pci_remove(struct pci_dev *pdev) if (test_bit(ATH12K_FLAG_QMI_FAIL, &ab->dev_flags)) { ath12k_pci_power_down(ab, false); ath12k_qmi_deinit_service(ab); + ath12k_core_hw_group_unassign(ab); goto qmi_fail; } From ee146e11b4d9183e01d8b7e4963941730ed4af6d Mon Sep 17 00:00:00 2001 From: Karthikeyan Periyasamy Date: Wed, 4 Dec 2024 18:32:13 +0200 Subject: [PATCH 0199/1450] wifi: ath12k: refactor core start based on hardware group Currently, mac allocate/register and core_pdev_create are initiated immediately when QMI firmware ready event is received for a particular device. With hardware device group abstraction, QMI firmware ready event can be received simultaneously for different devices in the group and so, it should not be registered immediately rather it has to be deferred until all devices in the group has received QMI firmware ready. To handle this, refactor the code of core start to have registering within ath12k_core_hw_group_start() and unregistering in ath12k_core_hw_group_stop(). Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.3.1-00173-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Karthikeyan Periyasamy Co-developed-by: Harshitha Prem Signed-off-by: Harshitha Prem Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241204163216.433795-5-kvalo@kernel.org Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/core.c | 195 ++++++++++++++++++------- drivers/net/wireless/ath/ath12k/core.h | 22 +++ drivers/net/wireless/ath/ath12k/qmi.c | 4 +- 3 files changed, 170 insertions(+), 51 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/core.c b/drivers/net/wireless/ath/ath12k/core.c index 41e3454b60f50..dea2c53bcc079 100644 --- a/drivers/net/wireless/ath/ath12k/core.c +++ b/drivers/net/wireless/ath/ath12k/core.c @@ -604,6 +604,8 @@ u32 ath12k_core_get_max_num_tids(struct ath12k_base *ab) static void ath12k_core_stop(struct ath12k_base *ab) { + ath12k_core_stopped(ab); + if (!test_bit(ATH12K_FLAG_CRASH_FLUSH, &ab->dev_flags)) ath12k_qmi_firmware_stop(ab); @@ -743,6 +745,8 @@ static int ath12k_core_start(struct ath12k_base *ab, { int ret; + lockdep_assert_held(&ab->core_lock); + ret = ath12k_wmi_attach(ab); if (ret) { ath12k_err(ab, "failed to attach wmi: %d\n", ret); @@ -836,6 +840,10 @@ static int ath12k_core_start(struct ath12k_base *ab, /* ACPI is optional so continue in case of an error */ ath12k_dbg(ab, ATH12K_DBG_BOOT, "acpi failed: %d\n", ret); + if (!test_bit(ATH12K_FLAG_RECOVERY, &ab->dev_flags)) + /* Indicate the core start in the appropriate group */ + ath12k_core_started(ab); + return 0; err_reo_cleanup: @@ -847,6 +855,96 @@ static int ath12k_core_start(struct ath12k_base *ab, return ret; } +static void ath12k_core_device_cleanup(struct ath12k_base *ab) +{ + mutex_lock(&ab->core_lock); + + ath12k_hif_irq_disable(ab); + ath12k_core_pdev_destroy(ab); + ath12k_mac_unregister(ab); + ath12k_mac_destroy(ab); + + mutex_unlock(&ab->core_lock); +} + +static void ath12k_core_hw_group_stop(struct ath12k_hw_group *ag) +{ + struct ath12k_base *ab; + int i; + + lockdep_assert_held(&ag->mutex); + + for (i = ag->num_devices - 1; i >= 0; i--) { + ab = ag->ab[i]; + if (!ab) + continue; + ath12k_core_device_cleanup(ab); + } +} + +static int ath12k_core_hw_group_start(struct ath12k_hw_group *ag) +{ + struct ath12k_base *ab; + int ret, i; + + lockdep_assert_held(&ag->mutex); + + for (i = 0; i < ag->num_devices; i++) { + ab = ag->ab[i]; + if (!ab) + continue; + + mutex_lock(&ab->core_lock); + + /* Check if already registered or not, since same flow + * execute for HW restart case. + */ + if (test_bit(ATH12K_FLAG_REGISTERED, &ab->dev_flags)) + goto core_pdev_create; + + ret = ath12k_mac_allocate(ab); + if (ret) { + ath12k_err(ab, "failed to create new hw device with mac80211 :%d\n", + ret); + mutex_unlock(&ab->core_lock); + return ret; + } + + ret = ath12k_mac_register(ab); + if (ret) { + ath12k_err(ab, "failed to register radio with mac80211: %d\n", + ret); + mutex_unlock(&ab->core_lock); + goto err; + } + +core_pdev_create: + ret = ath12k_core_pdev_create(ab); + if (ret) { + ath12k_err(ab, "failed to create pdev core %d\n", ret); + mutex_unlock(&ab->core_lock); + goto err; + } + + ath12k_hif_irq_enable(ab); + + ret = ath12k_core_rfkill_config(ab); + if (ret && ret != -EOPNOTSUPP) { + mutex_unlock(&ab->core_lock); + goto err; + } + + mutex_unlock(&ab->core_lock); + } + + return 0; + +err: + ath12k_core_hw_group_stop(ag); + + return ret; +} + static int ath12k_core_start_firmware(struct ath12k_base *ab, enum ath12k_firmware_mode mode) { @@ -864,9 +962,18 @@ static int ath12k_core_start_firmware(struct ath12k_base *ab, return ret; } +static inline +bool ath12k_core_hw_group_start_ready(struct ath12k_hw_group *ag) +{ + lockdep_assert_held(&ag->mutex); + + return (ag->num_started == ag->num_devices); +} + int ath12k_core_qmi_firmware_ready(struct ath12k_base *ab) { - int ret; + struct ath12k_hw_group *ag = ath12k_ab_to_ag(ab); + int ret, i; ret = ath12k_core_start_firmware(ab, ATH12K_FIRMWARE_MODE_NORMAL); if (ret) { @@ -886,59 +993,50 @@ int ath12k_core_qmi_firmware_ready(struct ath12k_base *ab) goto err_firmware_stop; } + mutex_lock(&ag->mutex); mutex_lock(&ab->core_lock); + ret = ath12k_core_start(ab, ATH12K_FIRMWARE_MODE_NORMAL); if (ret) { ath12k_err(ab, "failed to start core: %d\n", ret); goto err_dp_free; } - ret = ath12k_mac_allocate(ab); - if (ret) { - ath12k_err(ab, "failed to create new hw device with mac80211 :%d\n", - ret); - goto err_core_stop; - } - - ret = ath12k_mac_register(ab); - if (ret) { - ath12k_err(ab, "failed register the radio with mac80211: %d\n", ret); - goto err_mac_destroy; - } - - ret = ath12k_core_pdev_create(ab); - if (ret) { - ath12k_err(ab, "failed to create pdev core: %d\n", ret); - goto err_mac_unregister; - } - - ath12k_hif_irq_enable(ab); + mutex_unlock(&ab->core_lock); - ret = ath12k_core_rfkill_config(ab); - if (ret && ret != -EOPNOTSUPP) { - ath12k_err(ab, "failed to config rfkill: %d\n", ret); - goto err_hif_irq_disable; + if (ath12k_core_hw_group_start_ready(ag)) { + ret = ath12k_core_hw_group_start(ag); + if (ret) { + ath12k_warn(ab, "unable to start hw group\n"); + goto err_core_stop; + } + ath12k_dbg(ab, ATH12K_DBG_BOOT, "group %d started\n", ag->id); } - mutex_unlock(&ab->core_lock); + mutex_unlock(&ag->mutex); return 0; -err_hif_irq_disable: - ath12k_hif_irq_disable(ab); - ath12k_core_pdev_destroy(ab); -err_mac_unregister: - ath12k_mac_unregister(ab); -err_mac_destroy: - ath12k_mac_destroy(ab); err_core_stop: - ath12k_core_stop(ab); + for (i = ag->num_devices - 1; i >= 0; i--) { + ab = ag->ab[i]; + if (!ab) + continue; + + mutex_lock(&ab->core_lock); + ath12k_core_stop(ab); + mutex_unlock(&ab->core_lock); + } + goto exit; + err_dp_free: ath12k_dp_free(ab); mutex_unlock(&ab->core_lock); err_firmware_stop: ath12k_qmi_firmware_stop(ab); +exit: + mutex_unlock(&ag->mutex); return ret; } @@ -1135,6 +1233,14 @@ static void ath12k_core_restart(struct work_struct *work) } if (ab->is_reset) { + if (!test_bit(ATH12K_FLAG_REGISTERED, &ab->dev_flags)) { + atomic_dec(&ab->reset_count); + complete(&ab->reset_complete); + ab->is_reset = false; + atomic_set(&ab->fail_cont_count, 0); + ath12k_dbg(ab, ATH12K_DBG_BOOT, "reset success\n"); + } + for (i = 0; i < ath12k_get_num_hw(ab); i++) { ah = ath12k_ab_to_ah(ab, i); ieee80211_restart_hw(ah->hw); @@ -1319,7 +1425,7 @@ static struct ath12k_hw_group *ath12k_core_hw_group_assign(struct ath12k_base *a void ath12k_core_hw_group_unassign(struct ath12k_base *ab) { - struct ath12k_hw_group *ag = ab->ag; + struct ath12k_hw_group *ag = ath12k_ab_to_ag(ab); u8 device_id = ab->device_id; int num_probed; @@ -1353,19 +1459,6 @@ void ath12k_core_hw_group_unassign(struct ath12k_base *ab) ath12k_core_hw_group_free(ag); } -static void ath12k_core_device_cleanup(struct ath12k_base *ab) -{ - mutex_lock(&ab->core_lock); - - ath12k_hif_irq_disable(ab); - ath12k_core_pdev_destroy(ab); - ath12k_mac_unregister(ab); - ath12k_mac_destroy(ab); - ath12k_core_stop(ab); - - mutex_unlock(&ab->core_lock); -} - static void ath12k_core_hw_group_destroy(struct ath12k_hw_group *ag) { struct ath12k_base *ab; @@ -1393,12 +1486,16 @@ static void ath12k_core_hw_group_cleanup(struct ath12k_hw_group *ag) mutex_lock(&ag->mutex); + ath12k_core_hw_group_stop(ag); + for (i = 0; i < ag->num_devices; i++) { ab = ag->ab[i]; if (!ab) continue; - ath12k_core_device_cleanup(ab); + mutex_lock(&ab->core_lock); + ath12k_core_stop(ab); + mutex_unlock(&ab->core_lock); } mutex_unlock(&ag->mutex); diff --git a/drivers/net/wireless/ath/ath12k/core.h b/drivers/net/wireless/ath/ath12k/core.h index dca4b9a3538f4..b8b1ee8d33028 100644 --- a/drivers/net/wireless/ath/ath12k/core.h +++ b/drivers/net/wireless/ath/ath12k/core.h @@ -824,6 +824,8 @@ struct ath12k_hw_group { u8 id; u8 num_devices; u8 num_probed; + u8 num_started; + unsigned long flags; struct ath12k_base *ab[ATH12K_MAX_SOCS]; /* protects access to this struct */ @@ -1175,4 +1177,24 @@ static inline int ath12k_get_num_hw(struct ath12k_base *ab) { return ab->num_hw; } + +static inline struct ath12k_hw_group *ath12k_ab_to_ag(struct ath12k_base *ab) +{ + return ab->ag; +} + +static inline void ath12k_core_started(struct ath12k_base *ab) +{ + lockdep_assert_held(&ab->ag->mutex); + + ab->ag->num_started++; +} + +static inline void ath12k_core_stopped(struct ath12k_base *ab) +{ + lockdep_assert_held(&ab->ag->mutex); + + ab->ag->num_started--; +} + #endif /* _CORE_H_ */ diff --git a/drivers/net/wireless/ath/ath12k/qmi.c b/drivers/net/wireless/ath/ath12k/qmi.c index efcf2dfac4ac9..8b4d500fe426a 100644 --- a/drivers/net/wireless/ath/ath12k/qmi.c +++ b/drivers/net/wireless/ath/ath12k/qmi.c @@ -3321,7 +3321,6 @@ static void ath12k_qmi_driver_event_work(struct work_struct *work) break; case ATH12K_QMI_EVENT_SERVER_EXIT: set_bit(ATH12K_FLAG_CRASH_FLUSH, &ab->dev_flags); - set_bit(ATH12K_FLAG_RECOVERY, &ab->dev_flags); break; case ATH12K_QMI_EVENT_REQUEST_MEM: ret = ath12k_qmi_event_mem_request(qmi); @@ -3338,13 +3337,14 @@ static void ath12k_qmi_driver_event_work(struct work_struct *work) if (test_bit(ATH12K_FLAG_QMI_FW_READY_COMPLETE, &ab->dev_flags)) { if (ab->is_reset) ath12k_hal_dump_srng_stats(ab); + + set_bit(ATH12K_FLAG_RECOVERY, &ab->dev_flags); queue_work(ab->workqueue, &ab->restart_work); break; } clear_bit(ATH12K_FLAG_CRASH_FLUSH, &ab->dev_flags); - clear_bit(ATH12K_FLAG_RECOVERY, &ab->dev_flags); ret = ath12k_core_qmi_firmware_ready(ab); if (!ret) set_bit(ATH12K_FLAG_QMI_FW_READY_COMPLETE, From a343d97f27f514015e6d5e789672cf4ab4111720 Mon Sep 17 00:00:00 2001 From: Karthikeyan Periyasamy Date: Wed, 4 Dec 2024 18:32:14 +0200 Subject: [PATCH 0200/1450] wifi: ath12k: move struct ath12k_hw from per device to group Currently, hardware abstractions (ah) of different radio bands are tightly coupled to a single device (ab). But, with hardware device group abstraction (ag), multiple radios across different devices in a group can form different combinations of hardware abstractions (ah) within the group. Hence, the mapping between ah to ab can be removed and instead it can be mapped with struct ath12k_hw_group (ag). Current mapping between struct ath12k_hw (ah), struct ath12k_base (ab) and struct ath12k_hw_group (ag): +------------------------------------------------+ | +-------------------------------------+ | | | +---------------+ +---------------+ | | | | |ath12k_hw (ah) | |ath12k_hw (ah) | | | | | +---------------+ +---------------+ | | | | | | | | +-----------+ | +-----------+ | | | | | ar (2GHz) | | | ar (5GHz) | | | | | +-----------+ | +-----------+ | | | | Dual band device-1 (ab) | | | +-------------------------------------+ | | ath12k_hw_group (ag) based on group id | +------------------------------------------------+ After hardware device group abstraction moving ah array out of ab to ag: +----------------------------------------------+ | +---------------+ +---------------+ | | |ath12k_hw (ah) | |ath12k_hw (ah) | | | +---------------+ +---------------+ | | +-------------------------------------+ | | | +-----------+ +-----------+ | | | | | ar (2GHz) | | ar (5GHz) | | | | | +-----------+ +-----------+ | | | | Dual band device-1 (ab) | | | +-------------------------------------+ | | ath12k_hw_group (ag) based on group id | +----------------------------------------------+ This decoupling of struct ath12k_hw (ah) from struct ath12k_base (ab) and mapping it to struct ath12k_hw_group (ag) will help in forming different combinations of multi-link devices. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.3.1-00173-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Karthikeyan Periyasamy Signed-off-by: Harshitha Prem Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241204163216.433795-6-kvalo@kernel.org Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/core.c | 49 +++++++------- drivers/net/wireless/ath/ath12k/core.h | 29 +++++---- drivers/net/wireless/ath/ath12k/dp.c | 19 ++---- drivers/net/wireless/ath/ath12k/dp.h | 2 +- drivers/net/wireless/ath/ath12k/mac.c | 89 ++++++++++++++++++-------- drivers/net/wireless/ath/ath12k/mac.h | 9 +-- 6 files changed, 115 insertions(+), 82 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/core.c b/drivers/net/wireless/ath/ath12k/core.c index dea2c53bcc079..bbfa57d097af4 100644 --- a/drivers/net/wireless/ath/ath12k/core.c +++ b/drivers/net/wireless/ath/ath12k/core.c @@ -861,8 +861,6 @@ static void ath12k_core_device_cleanup(struct ath12k_base *ab) ath12k_hif_irq_disable(ab); ath12k_core_pdev_destroy(ab); - ath12k_mac_unregister(ab); - ath12k_mac_destroy(ab); mutex_unlock(&ab->core_lock); } @@ -874,12 +872,18 @@ static void ath12k_core_hw_group_stop(struct ath12k_hw_group *ag) lockdep_assert_held(&ag->mutex); + clear_bit(ATH12K_GROUP_FLAG_REGISTERED, &ag->flags); + + ath12k_mac_unregister(ag); + for (i = ag->num_devices - 1; i >= 0; i--) { ab = ag->ab[i]; if (!ab) continue; ath12k_core_device_cleanup(ab); } + + ath12k_mac_destroy(ag); } static int ath12k_core_hw_group_start(struct ath12k_hw_group *ag) @@ -889,6 +893,20 @@ static int ath12k_core_hw_group_start(struct ath12k_hw_group *ag) lockdep_assert_held(&ag->mutex); + if (test_bit(ATH12K_GROUP_FLAG_REGISTERED, &ag->flags)) + goto core_pdev_create; + + ret = ath12k_mac_allocate(ag); + if (WARN_ON(ret)) + return ret; + + ret = ath12k_mac_register(ag); + if (WARN_ON(ret)) + goto err_mac_destroy; + + set_bit(ATH12K_GROUP_FLAG_REGISTERED, &ag->flags); + +core_pdev_create: for (i = 0; i < ag->num_devices; i++) { ab = ag->ab[i]; if (!ab) @@ -896,29 +914,6 @@ static int ath12k_core_hw_group_start(struct ath12k_hw_group *ag) mutex_lock(&ab->core_lock); - /* Check if already registered or not, since same flow - * execute for HW restart case. - */ - if (test_bit(ATH12K_FLAG_REGISTERED, &ab->dev_flags)) - goto core_pdev_create; - - ret = ath12k_mac_allocate(ab); - if (ret) { - ath12k_err(ab, "failed to create new hw device with mac80211 :%d\n", - ret); - mutex_unlock(&ab->core_lock); - return ret; - } - - ret = ath12k_mac_register(ab); - if (ret) { - ath12k_err(ab, "failed to register radio with mac80211: %d\n", - ret); - mutex_unlock(&ab->core_lock); - goto err; - } - -core_pdev_create: ret = ath12k_core_pdev_create(ab); if (ret) { ath12k_err(ab, "failed to create pdev core %d\n", ret); @@ -941,6 +936,10 @@ static int ath12k_core_hw_group_start(struct ath12k_hw_group *ag) err: ath12k_core_hw_group_stop(ag); + return ret; + +err_mac_destroy: + ath12k_mac_destroy(ag); return ret; } diff --git a/drivers/net/wireless/ath/ath12k/core.h b/drivers/net/wireless/ath/ath12k/core.h index b8b1ee8d33028..64252d6491cd8 100644 --- a/drivers/net/wireless/ath/ath12k/core.h +++ b/drivers/net/wireless/ath/ath12k/core.h @@ -64,6 +64,7 @@ #define ATH12K_RECOVER_START_TIMEOUT_HZ (20 * HZ) #define ATH12K_MAX_SOCS 3 +#define ATH12K_GROUP_MAX_RADIO (ATH12K_MAX_SOCS * MAX_RADIOS) #define ATH12K_INVALID_GROUP_ID 0xFF #define ATH12K_INVALID_DEVICE_ID 0xFF @@ -216,6 +217,10 @@ enum ath12k_scan_state { ATH12K_SCAN_ABORTING, }; +enum ath12k_hw_group_flags { + ATH12K_GROUP_FLAG_REGISTERED, +}; + enum ath12k_dev_flags { ATH12K_CAC_RUNNING, ATH12K_FLAG_CRASH_FLUSH, @@ -830,6 +835,15 @@ struct ath12k_hw_group { /* protects access to this struct */ struct mutex mutex; + + /* Holds information of wiphy (hw) registration. + * + * In Multi/Single Link Operation case, all pdevs are registered as + * a single wiphy. In other (legacy/Non-MLO) cases, each pdev is + * registered as separate wiphys. + */ + struct ath12k_hw *ah[ATH12K_GROUP_MAX_RADIO]; + u8 num_hw; }; /* Master structure to hold the hw data which may be used in core module */ @@ -895,15 +909,6 @@ struct ath12k_base { struct ath12k_pdev __rcu *pdevs_active[MAX_RADIOS]; - /* Holds information of wiphy (hw) registration. - * - * In Multi/Single Link Operation case, all pdevs are registered as - * a single wiphy. In other (legacy/Non-MLO) cases, each pdev is - * registered as separate wiphys. - */ - struct ath12k_hw *ah[MAX_RADIOS]; - u8 num_hw; - struct ath12k_wmi_hal_reg_capabilities_ext_arg hal_reg_cap[MAX_RADIOS]; unsigned long long free_vdev_map; unsigned long long free_vdev_stats_id_map; @@ -1164,18 +1169,18 @@ static inline struct ieee80211_hw *ath12k_ar_to_hw(struct ath12k *ar) static inline struct ath12k_hw *ath12k_ab_to_ah(struct ath12k_base *ab, int idx) { - return ab->ah[idx]; + return ab->ag->ah[idx]; } static inline void ath12k_ab_set_ah(struct ath12k_base *ab, int idx, struct ath12k_hw *ah) { - ab->ah[idx] = ah; + ab->ag->ah[idx] = ah; } static inline int ath12k_get_num_hw(struct ath12k_base *ab) { - return ab->num_hw; + return ab->ag->num_hw; } static inline struct ath12k_hw_group *ath12k_ab_to_ag(struct ath12k_base *ab) diff --git a/drivers/net/wireless/ath/ath12k/dp.c b/drivers/net/wireless/ath/ath12k/dp.c index ce823b1c175f5..68abe9d4ab45c 100644 --- a/drivers/net/wireless/ath/ath12k/dp.c +++ b/drivers/net/wireless/ath/ath12k/dp.c @@ -991,21 +991,14 @@ void ath12k_dp_pdev_free(struct ath12k_base *ab) ath12k_dp_rx_pdev_free(ab, i); } -void ath12k_dp_pdev_pre_alloc(struct ath12k_base *ab) +void ath12k_dp_pdev_pre_alloc(struct ath12k *ar) { - struct ath12k *ar; - struct ath12k_pdev_dp *dp; - int i; + struct ath12k_pdev_dp *dp = &ar->dp; - for (i = 0; i < ab->num_radios; i++) { - ar = ab->pdevs[i].ar; - dp = &ar->dp; - dp->mac_id = i; - atomic_set(&dp->num_tx_pending, 0); - init_waitqueue_head(&dp->tx_empty_waitq); - - /* TODO: Add any RXDMA setup required per pdev */ - } + dp->mac_id = ar->pdev_idx; + atomic_set(&dp->num_tx_pending, 0); + init_waitqueue_head(&dp->tx_empty_waitq); + /* TODO: Add any RXDMA setup required per pdev */ } bool ath12k_dp_wmask_compaction_rx_tlv_supported(struct ath12k_base *ab) diff --git a/drivers/net/wireless/ath/ath12k/dp.h b/drivers/net/wireless/ath/ath12k/dp.h index a120b7a8477d8..021cd9e8ee1d3 100644 --- a/drivers/net/wireless/ath/ath12k/dp.h +++ b/drivers/net/wireless/ath/ath12k/dp.h @@ -1806,7 +1806,7 @@ void ath12k_dp_free(struct ath12k_base *ab); int ath12k_dp_alloc(struct ath12k_base *ab); void ath12k_dp_cc_config(struct ath12k_base *ab); int ath12k_dp_pdev_alloc(struct ath12k_base *ab); -void ath12k_dp_pdev_pre_alloc(struct ath12k_base *ab); +void ath12k_dp_pdev_pre_alloc(struct ath12k *ar); void ath12k_dp_pdev_free(struct ath12k_base *ab); int ath12k_dp_tx_htt_srng_setup(struct ath12k_base *ab, u32 ring_id, int mac_id, enum hal_ring_type ring_type); diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index 1180070278dac..1cf724a530b51 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -10818,19 +10818,13 @@ static void ath12k_mac_setup(struct ath12k *ar) skb_queue_head_init(&ar->wmi_mgmt_tx_queue); } -int ath12k_mac_register(struct ath12k_base *ab) +int ath12k_mac_register(struct ath12k_hw_group *ag) { + struct ath12k_base *ab = ag->ab[0]; struct ath12k_hw *ah; int i; int ret; - if (test_bit(ATH12K_FLAG_REGISTERED, &ab->dev_flags)) - return 0; - - /* Initialize channel counters frequency value in hertz */ - ab->cc_freq_hz = 320000; - ab->free_vdev_map = (1LL << (ab->num_radios * TARGET_NUM_VDEVS)) - 1; - for (i = 0; i < ath12k_get_num_hw(ab); i++) { ah = ath12k_ab_to_ah(ab, i); @@ -10855,8 +10849,9 @@ int ath12k_mac_register(struct ath12k_base *ab) return ret; } -void ath12k_mac_unregister(struct ath12k_base *ab) +void ath12k_mac_unregister(struct ath12k_hw_group *ag) { + struct ath12k_base *ab = ag->ab[0]; struct ath12k_hw *ah; int i; @@ -10876,12 +10871,13 @@ static void ath12k_mac_hw_destroy(struct ath12k_hw *ah) ieee80211_free_hw(ah->hw); } -static struct ath12k_hw *ath12k_mac_hw_allocate(struct ath12k_base *ab, +static struct ath12k_hw *ath12k_mac_hw_allocate(struct ath12k_hw_group *ag, struct ath12k_pdev_map *pdev_map, u8 num_pdev_map) { struct ieee80211_hw *hw; struct ath12k *ar; + struct ath12k_base *ab; struct ath12k_pdev *pdev; struct ath12k_hw *ah; int i; @@ -10913,23 +10909,30 @@ static struct ath12k_hw *ath12k_mac_hw_allocate(struct ath12k_base *ab, pdev->ar = ar; ath12k_mac_setup(ar); + ath12k_dp_pdev_pre_alloc(ar); } return ah; } -void ath12k_mac_destroy(struct ath12k_base *ab) +void ath12k_mac_destroy(struct ath12k_hw_group *ag) { struct ath12k_pdev *pdev; + struct ath12k_base *ab = ag->ab[0]; + int i, j; struct ath12k_hw *ah; - int i; - for (i = 0; i < ab->num_radios; i++) { - pdev = &ab->pdevs[i]; - if (!pdev->ar) + for (i = 0; i < ag->num_devices; i++) { + ab = ag->ab[i]; + if (!ab) continue; - pdev->ar = NULL; + for (j = 0; j < ab->num_radios; j++) { + pdev = &ab->pdevs[j]; + if (!pdev->ar) + continue; + pdev->ar = NULL; + } } for (i = 0; i < ath12k_get_num_hw(ab); i++) { @@ -10942,26 +10945,59 @@ void ath12k_mac_destroy(struct ath12k_base *ab) } } -int ath12k_mac_allocate(struct ath12k_base *ab) +static void ath12k_mac_set_device_defaults(struct ath12k_base *ab) +{ + /* Initialize channel counters frequency value in hertz */ + ab->cc_freq_hz = 320000; + ab->free_vdev_map = (1LL << (ab->num_radios * TARGET_NUM_VDEVS)) - 1; +} + +int ath12k_mac_allocate(struct ath12k_hw_group *ag) { + struct ath12k_pdev_map pdev_map[ATH12K_GROUP_MAX_RADIO]; + int mac_id, device_id, total_radio, num_hw; + struct ath12k_base *ab; struct ath12k_hw *ah; - struct ath12k_pdev_map pdev_map[MAX_RADIOS]; int ret, i, j; u8 radio_per_hw; - if (test_bit(ATH12K_FLAG_REGISTERED, &ab->dev_flags)) - return 0; + total_radio = 0; + for (i = 0; i < ag->num_devices; i++) + total_radio += ag->ab[i]->num_radios; - ab->num_hw = ab->num_radios; + /* All pdev get combined and register as single wiphy based on + * hardware group which participate in multi-link operation else + * each pdev get register separately. + * + * Currently, registering as single pdevs. + */ radio_per_hw = 1; + num_hw = total_radio / radio_per_hw; - for (i = 0; i < ath12k_get_num_hw(ab); i++) { + if (WARN_ON(num_hw >= ATH12K_GROUP_MAX_RADIO)) + return -ENOSPC; + + ag->num_hw = 0; + device_id = 0; + mac_id = 0; + for (i = 0; i < num_hw; i++) { for (j = 0; j < radio_per_hw; j++) { + ab = ag->ab[device_id]; pdev_map[j].ab = ab; - pdev_map[j].pdev_idx = (i * radio_per_hw) + j; + pdev_map[j].pdev_idx = mac_id; + mac_id++; + + /* If mac_id falls beyond the current device MACs then + * move to next device + */ + if (mac_id >= ab->num_radios) { + mac_id = 0; + device_id++; + ath12k_mac_set_device_defaults(ab); + } } - ah = ath12k_mac_hw_allocate(ab, pdev_map, radio_per_hw); + ah = ath12k_mac_hw_allocate(ag, pdev_map, radio_per_hw); if (!ah) { ath12k_warn(ab, "failed to allocate mac80211 hw device for hw_idx %d\n", i); @@ -10971,11 +11007,10 @@ int ath12k_mac_allocate(struct ath12k_base *ab) ah->dev = ab->dev; - ath12k_ab_set_ah(ab, i, ah); + ag->ah[i] = ah; + ag->num_hw++; } - ath12k_dp_pdev_pre_alloc(ab); - return 0; err: diff --git a/drivers/net/wireless/ath/ath12k/mac.h b/drivers/net/wireless/ath/ath12k/mac.h index abdc9a6c07402..ccfc215d83ffe 100644 --- a/drivers/net/wireless/ath/ath12k/mac.h +++ b/drivers/net/wireless/ath/ath12k/mac.h @@ -14,6 +14,7 @@ struct ath12k; struct ath12k_base; struct ath12k_hw; +struct ath12k_hw_group; struct ath12k_pdev_map; struct ath12k_generic_iter { @@ -60,10 +61,10 @@ enum ath12k_supported_bw { extern const struct htt_rx_ring_tlv_filter ath12k_mac_mon_status_filter_default; -void ath12k_mac_destroy(struct ath12k_base *ab); -void ath12k_mac_unregister(struct ath12k_base *ab); -int ath12k_mac_register(struct ath12k_base *ab); -int ath12k_mac_allocate(struct ath12k_base *ab); +void ath12k_mac_destroy(struct ath12k_hw_group *ag); +void ath12k_mac_unregister(struct ath12k_hw_group *ag); +int ath12k_mac_register(struct ath12k_hw_group *ag); +int ath12k_mac_allocate(struct ath12k_hw_group *ag); int ath12k_mac_hw_ratecode_to_legacy_rate(u8 hw_rc, u8 preamble, u8 *rateidx, u16 *rate); u8 ath12k_mac_bitrate_to_idx(const struct ieee80211_supported_band *sband, From d302ac65ac938516487f57ae20f11e9cf6327606 Mon Sep 17 00:00:00 2001 From: Karthikeyan Periyasamy Date: Wed, 4 Dec 2024 18:32:15 +0200 Subject: [PATCH 0201/1450] wifi: ath12k: send QMI host capability after device group is ready QMI host capability has the information regarding MLO parameters such as device_id, MLO capability, group id and the information of each devices involved in the group and sent immediately on QMI server arrive event. Currently, only one device is involved in the group and hence, single device information is sent as part of MLO capability of host. But, in future when multi device group abstraction is introduced, host should send all the device information involved in the group as part of QMI MLO host capability rather than single device. Hence, sending QMI host capability immediately on server arrive of a device might not be ideal for multi device group abstraction as the details of other devices in the group would not be available. Hence, once QMI server arrive event is received, request for QMI PHY capabilities of device, and defer the host capability send for that device. After QMI PHY capability is received for all the devices in the group trigger the host capability event for the deferred devices in the group. Hence, add changes to defer the QMI host capability event until the device group is ready and then resume the QMI exchange for all the device with host capabilities. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.3.1-00173-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Karthikeyan Periyasamy Signed-off-by: Harshitha Prem Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241204163216.433795-7-kvalo@kernel.org Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/core.c | 21 ++++++ drivers/net/wireless/ath/ath12k/qmi.c | 98 ++++++++++++++++++++++++-- drivers/net/wireless/ath/ath12k/qmi.h | 20 ++++++ 3 files changed, 134 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/core.c b/drivers/net/wireless/ath/ath12k/core.c index bbfa57d097af4..18b29515c6ae4 100644 --- a/drivers/net/wireless/ath/ath12k/core.c +++ b/drivers/net/wireless/ath/ath12k/core.c @@ -969,6 +969,25 @@ bool ath12k_core_hw_group_start_ready(struct ath12k_hw_group *ag) return (ag->num_started == ag->num_devices); } +static void ath12k_core_trigger_partner(struct ath12k_base *ab) +{ + struct ath12k_hw_group *ag = ab->ag; + struct ath12k_base *partner_ab; + bool found = false; + int i; + + for (i = 0; i < ag->num_devices; i++) { + partner_ab = ag->ab[i]; + if (!partner_ab) + continue; + + if (found) + ath12k_qmi_trigger_host_cap(partner_ab); + + found = (partner_ab == ab); + } +} + int ath12k_core_qmi_firmware_ready(struct ath12k_base *ab) { struct ath12k_hw_group *ag = ath12k_ab_to_ag(ab); @@ -1010,6 +1029,8 @@ int ath12k_core_qmi_firmware_ready(struct ath12k_base *ab) goto err_core_stop; } ath12k_dbg(ab, ATH12K_DBG_BOOT, "group %d started\n", ag->id); + } else { + ath12k_core_trigger_partner(ab); } mutex_unlock(&ag->mutex); diff --git a/drivers/net/wireless/ath/ath12k/qmi.c b/drivers/net/wireless/ath/ath12k/qmi.c index 8b4d500fe426a..2591d132a3fc7 100644 --- a/drivers/net/wireless/ath/ath12k/qmi.c +++ b/drivers/net/wireless/ath/ath12k/qmi.c @@ -3088,11 +3088,69 @@ ath12k_qmi_driver_event_post(struct ath12k_qmi *qmi, return 0; } +void ath12k_qmi_trigger_host_cap(struct ath12k_base *ab) +{ + struct ath12k_qmi *qmi = &ab->qmi; + + spin_lock(&qmi->event_lock); + + if (ath12k_qmi_get_event_block(qmi)) + ath12k_qmi_set_event_block(qmi, false); + + spin_unlock(&qmi->event_lock); + + ath12k_dbg(ab, ATH12K_DBG_QMI, "trigger host cap for device id %d\n", + ab->device_id); + + ath12k_qmi_driver_event_post(qmi, ATH12K_QMI_EVENT_HOST_CAP, NULL); +} + +static bool ath12k_qmi_hw_group_host_cap_ready(struct ath12k_hw_group *ag) +{ + struct ath12k_base *ab; + int i; + + for (i = 0; i < ag->num_devices; i++) { + ab = ag->ab[i]; + + if (!(ab && ab->qmi.num_radios != U8_MAX)) + return false; + } + + return true; +} + +static struct ath12k_base *ath12k_qmi_hw_group_find_blocked(struct ath12k_hw_group *ag) +{ + struct ath12k_base *ab; + int i; + + lockdep_assert_held(&ag->mutex); + + for (i = 0; i < ag->num_devices; i++) { + ab = ag->ab[i]; + if (!ab) + continue; + + spin_lock(&ab->qmi.event_lock); + + if (ath12k_qmi_get_event_block(&ab->qmi)) { + spin_unlock(&ab->qmi.event_lock); + return ab; + } + + spin_unlock(&ab->qmi.event_lock); + } + + return NULL; +} + /* clang stack usage explodes if this is inlined */ static noinline_for_stack int ath12k_qmi_event_server_arrive(struct ath12k_qmi *qmi) { - struct ath12k_base *ab = qmi->ab; + struct ath12k_base *ab = qmi->ab, *block_ab; + struct ath12k_hw_group *ag = ab->ag; int ret; ath12k_qmi_phy_cap_send(ab); @@ -3103,12 +3161,22 @@ int ath12k_qmi_event_server_arrive(struct ath12k_qmi *qmi) return ret; } - ret = ath12k_qmi_host_cap_send(ab); - if (ret < 0) { - ath12k_warn(ab, "qmi failed to send host cap QMI:%d\n", ret); - return ret; + spin_lock(&qmi->event_lock); + + ath12k_qmi_set_event_block(qmi, true); + + spin_unlock(&qmi->event_lock); + + mutex_lock(&ag->mutex); + + if (ath12k_qmi_hw_group_host_cap_ready(ag)) { + block_ab = ath12k_qmi_hw_group_find_blocked(ag); + if (block_ab) + ath12k_qmi_trigger_host_cap(block_ab); } + mutex_unlock(&ag->mutex); + return ret; } @@ -3295,6 +3363,21 @@ static const struct qmi_ops ath12k_qmi_ops = { .del_server = ath12k_qmi_ops_del_server, }; +static int ath12k_qmi_event_host_cap(struct ath12k_qmi *qmi) +{ + struct ath12k_base *ab = qmi->ab; + int ret; + + ret = ath12k_qmi_host_cap_send(ab); + if (ret < 0) { + ath12k_warn(ab, "failed to send qmi host cap for device id %d: %d\n", + ab->device_id, ret); + return ret; + } + + return ret; +} + static void ath12k_qmi_driver_event_work(struct work_struct *work) { struct ath12k_qmi *qmi = container_of(work, struct ath12k_qmi, @@ -3351,6 +3434,11 @@ static void ath12k_qmi_driver_event_work(struct work_struct *work) &ab->dev_flags); break; + case ATH12K_QMI_EVENT_HOST_CAP: + ret = ath12k_qmi_event_host_cap(qmi); + if (ret < 0) + set_bit(ATH12K_FLAG_QMI_FAIL, &ab->dev_flags); + break; default: ath12k_warn(ab, "invalid event type: %d", event->type); break; diff --git a/drivers/net/wireless/ath/ath12k/qmi.h b/drivers/net/wireless/ath/ath12k/qmi.h index 0dfcbd8cb59b6..98f6009ab21e3 100644 --- a/drivers/net/wireless/ath/ath12k/qmi.h +++ b/drivers/net/wireless/ath/ath12k/qmi.h @@ -68,6 +68,7 @@ enum ath12k_qmi_event_type { ATH12K_QMI_EVENT_FORCE_FW_ASSERT, ATH12K_QMI_EVENT_POWER_UP, ATH12K_QMI_EVENT_POWER_DOWN, + ATH12K_QMI_EVENT_HOST_CAP, ATH12K_QMI_EVENT_MAX, }; @@ -142,6 +143,10 @@ struct ath12k_qmi { u32 target_mem_mode; bool target_mem_delayed; u8 cal_done; + + /* protected with struct ath12k_qmi::event_lock */ + bool block_event; + u8 num_radios; struct target_info target; struct m3_mem_region m3_mem; @@ -594,11 +599,26 @@ struct qmi_wlanfw_wlan_ini_resp_msg_v01 { struct qmi_response_type_v01 resp; }; +static inline void ath12k_qmi_set_event_block(struct ath12k_qmi *qmi, bool block) +{ + lockdep_assert_held(&qmi->event_lock); + + qmi->block_event = block; +} + +static inline bool ath12k_qmi_get_event_block(struct ath12k_qmi *qmi) +{ + lockdep_assert_held(&qmi->event_lock); + + return qmi->block_event; +} + int ath12k_qmi_firmware_start(struct ath12k_base *ab, u32 mode); void ath12k_qmi_firmware_stop(struct ath12k_base *ab); void ath12k_qmi_deinit_service(struct ath12k_base *ab); int ath12k_qmi_init_service(struct ath12k_base *ab); void ath12k_qmi_free_resource(struct ath12k_base *ab); +void ath12k_qmi_trigger_host_cap(struct ath12k_base *ab); #endif From da8656797ae10b524a7a0c3d5eeb6237fa3ddd70 Mon Sep 17 00:00:00 2001 From: Karthikeyan Periyasamy Date: Wed, 4 Dec 2024 18:32:16 +0200 Subject: [PATCH 0202/1450] wifi: ath12k: introduce mlo_capable flag for device group Currently, during mac80211 allocate and register single device is considered for the registration. But, in future, during multi device group abstraction is introduced, all the devices has to be combined together as a single abstraction and then hardware should be allocated. All the devices in the group should be combined together only if it supports inter device mlo capability. The decision of whether to combine the devices or not can be based on the mlo capability flag in ath12k_hw_group. By default, mlo_capable flag in the group would be set as false. During QMI PHY capability exchange, only when we have more than one chip in the group or if one chip, then that chip supports inter MLO, then mlo_capable flag in the group will be enabled. Add changes to introduce mlo_capable flag for device group and refactor ath12k_mac_hw_allocate() api based on device group (ag) rather than device (ab). Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.3.1-00173-QCAHKSWPL_SILICONZ-1 Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Signed-off-by: Karthikeyan Periyasamy Signed-off-by: Harshitha Prem Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241204163216.433795-8-kvalo@kernel.org Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/core.c | 17 +++++++++++++++++ drivers/net/wireless/ath/ath12k/core.h | 3 +++ drivers/net/wireless/ath/ath12k/mac.c | 8 +++++--- drivers/net/wireless/ath/ath12k/qmi.c | 6 ++++-- 4 files changed, 29 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/core.c b/drivers/net/wireless/ath/ath12k/core.c index 18b29515c6ae4..49d1ac15cb7a2 100644 --- a/drivers/net/wireless/ath/ath12k/core.c +++ b/drivers/net/wireless/ath/ath12k/core.c @@ -1439,6 +1439,7 @@ static struct ath12k_hw_group *ath12k_core_hw_group_assign(struct ath12k_base *a ab->device_id = ag->num_probed++; ag->ab[ab->device_id] = ab; ab->ag = ag; + ag->mlo_capable = false; return ag; } @@ -1548,6 +1549,22 @@ static int ath12k_core_hw_group_create(struct ath12k_hw_group *ag) return 0; } +void ath12k_core_hw_group_set_mlo_capable(struct ath12k_hw_group *ag) +{ + lockdep_assert_held(&ag->mutex); + + /* If more than one devices are grouped, then inter MLO + * functionality can work still independent of whether internally + * each device supports single_chip_mlo or not. + * Only when there is one device, then it depends whether the + * device can support intra chip MLO or not + */ + if (ag->num_devices > 1) + ag->mlo_capable = true; + else + ag->mlo_capable = ag->ab[0]->single_chip_mlo_supp; +} + int ath12k_core_init(struct ath12k_base *ab) { struct ath12k_hw_group *ag; diff --git a/drivers/net/wireless/ath/ath12k/core.h b/drivers/net/wireless/ath/ath12k/core.h index 64252d6491cd8..458e3d0071a83 100644 --- a/drivers/net/wireless/ath/ath12k/core.h +++ b/drivers/net/wireless/ath/ath12k/core.h @@ -844,6 +844,7 @@ struct ath12k_hw_group { */ struct ath12k_hw *ah[ATH12K_GROUP_MAX_RADIO]; u8 num_hw; + bool mlo_capable; }; /* Master structure to hold the hw data which may be used in core module */ @@ -1066,6 +1067,8 @@ u32 ath12k_core_get_max_station_per_radio(struct ath12k_base *ab); u32 ath12k_core_get_max_peers_per_radio(struct ath12k_base *ab); u32 ath12k_core_get_max_num_tids(struct ath12k_base *ab); +void ath12k_core_hw_group_set_mlo_capable(struct ath12k_hw_group *ag); + static inline const char *ath12k_scan_state_str(enum ath12k_scan_state state) { switch (state) { diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index 1cf724a530b51..c4eab4c1c10e0 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -10968,10 +10968,12 @@ int ath12k_mac_allocate(struct ath12k_hw_group *ag) /* All pdev get combined and register as single wiphy based on * hardware group which participate in multi-link operation else * each pdev get register separately. - * - * Currently, registering as single pdevs. */ - radio_per_hw = 1; + if (ag->mlo_capable) + radio_per_hw = total_radio; + else + radio_per_hw = 1; + num_hw = total_radio / radio_per_hw; if (WARN_ON(num_hw >= ATH12K_GROUP_MAX_RADIO)) diff --git a/drivers/net/wireless/ath/ath12k/qmi.c b/drivers/net/wireless/ath/ath12k/qmi.c index 2591d132a3fc7..ba3cd23424654 100644 --- a/drivers/net/wireless/ath/ath12k/qmi.c +++ b/drivers/net/wireless/ath/ath12k/qmi.c @@ -2023,9 +2023,9 @@ static void ath12k_host_cap_parse_mlo(struct ath12k_base *ab, u8 hw_link_id = 0; int i; - if (!ab->single_chip_mlo_supp) { + if (!ab->ag->mlo_capable) { ath12k_dbg(ab, ATH12K_DBG_QMI, - "intra device MLO is disabled hence skip QMI MLO cap"); + "MLO is disabled hence skip QMI MLO cap"); return; } @@ -3170,6 +3170,8 @@ int ath12k_qmi_event_server_arrive(struct ath12k_qmi *qmi) mutex_lock(&ag->mutex); if (ath12k_qmi_hw_group_host_cap_ready(ag)) { + ath12k_core_hw_group_set_mlo_capable(ag); + block_ab = ath12k_qmi_hw_group_find_blocked(ag); if (block_ab) ath12k_qmi_trigger_host_cap(block_ab); From 0bee36d1a51366fa57b731f8975f26f92943b43e Mon Sep 17 00:00:00 2001 From: Song Yoong Siang Date: Thu, 5 Dec 2024 12:42:58 +0800 Subject: [PATCH 0203/1450] selftests/bpf: Actuate tx_metadata_len in xdp_hw_metadata set XDP_UMEM_TX_METADATA_LEN flag to reserve tx_metadata_len bytes of per-chunk metadata. Fixes: d5e726d9143c ("xsk: Require XDP_UMEM_TX_METADATA_LEN to actuate tx_metadata_len") Signed-off-by: Song Yoong Siang Signed-off-by: Martin KaFai Lau Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20241205044258.3155799-1-yoong.siang.song@intel.com --- tools/testing/selftests/bpf/xdp_hw_metadata.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c index 6f9956eed797f..ad6c08dfd6c8c 100644 --- a/tools/testing/selftests/bpf/xdp_hw_metadata.c +++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c @@ -79,7 +79,7 @@ static int open_xsk(int ifindex, struct xsk *xsk, __u32 queue_id) .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE, - .flags = XSK_UMEM__DEFAULT_FLAGS, + .flags = XDP_UMEM_TX_METADATA_LEN, .tx_metadata_len = sizeof(struct xsk_tx_metadata), }; __u32 idx = 0; From 2309132fc5d9d87deb15bda3497326aded6bfe4a Mon Sep 17 00:00:00 2001 From: Song Yoong Siang Date: Thu, 5 Dec 2024 13:19:36 +0800 Subject: [PATCH 0204/1450] selftests/bpf: Enable Tx hwtstamp in xdp_hw_metadata Currently, user needs to manually enable transmit hardware timestamp feature of certain Ethernet drivers, e.g. stmmac and igc drivers, through following command after running the xdp_hw_metadata app. sudo hwstamp_ctl -i eth0 -t 1 To simplify the step test of xdp_hw_metadata, set tx_type to HWTSTAMP_TX_ON to enable hardware timestamping for all outgoing packets, so that user no longer need to execute hwstamp_ctl command. Signed-off-by: Song Yoong Siang Signed-off-by: Martin KaFai Lau Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20241205051936.3156307-1-yoong.siang.song@intel.com --- tools/testing/selftests/bpf/xdp_hw_metadata.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c index ad6c08dfd6c8c..e38675d9b1189 100644 --- a/tools/testing/selftests/bpf/xdp_hw_metadata.c +++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c @@ -551,6 +551,7 @@ static void hwtstamp_enable(const char *ifname) { struct hwtstamp_config cfg = { .rx_filter = HWTSTAMP_FILTER_ALL, + .tx_type = HWTSTAMP_TX_ON, }; hwtstamp_ioctl(SIOCGHWTSTAMP, ifname, &saved_hwtstamp_cfg); From 5765c7f6e3173eb894889a29963a497aeb721c5e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Dec 2024 17:19:50 +0000 Subject: [PATCH 0205/1450] net_sched: sch_fq: add three drop_reason MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add three new drop_reason, more precise than generic QDISC_DROP: "tc -s qd" show aggregate counters, it might be more useful to use drop_reason infrastructure for bug hunting. 1) SKB_DROP_REASON_FQ_BAND_LIMIT Whenever a packet is added while its band limit is hit. Corresponding value in "tc -s qd" is bandX_drops XXXX 2) SKB_DROP_REASON_FQ_HORIZON_LIMIT Whenever a packet has a timestamp too far in the future. Corresponding value in "tc -s qd" is horizon_drops XXXX 3) SKB_DROP_REASON_FQ_FLOW_LIMIT Whenever a flow has reached its limit. Corresponding value in "tc -s qd" is flows_plimit XXXX Tested: tc qd replace dev eth1 root fq flow_limit 10 limit 100000 perf record -a -e skb:kfree_skb sleep 1; perf script udp_stream 12329 [004] 216.929492: skb:kfree_skb: skbaddr=0xffff888eabe17e00 rx_sk=(nil) protocol=34525 location=__dev_queue_xmit+0x9d9 reason: FQ_FLOW_LIMIT udp_stream 12385 [006] 216.929593: skb:kfree_skb: skbaddr=0xffff888ef8827f00 rx_sk=(nil) protocol=34525 location=__dev_queue_xmit+0x9d9 reason: FQ_FLOW_LIMIT udp_stream 12389 [005] 216.929871: skb:kfree_skb: skbaddr=0xffff888ecb9ba500 rx_sk=(nil) protocol=34525 location=__dev_queue_xmit+0x9d9 reason: FQ_FLOW_LIMIT udp_stream 12316 [009] 216.930398: skb:kfree_skb: skbaddr=0xffff888eca286b00 rx_sk=(nil) protocol=34525 location=__dev_queue_xmit+0x9d9 reason: FQ_FLOW_LIMIT udp_stream 12400 [008] 216.930490: skb:kfree_skb: skbaddr=0xffff888eabf93d00 rx_sk=(nil) protocol=34525 location=__dev_queue_xmit+0x9d9 reason: FQ_FLOW_LIMIT tc qd replace dev eth1 root fq flow_limit 100 limit 10000 perf record -a -e skb:kfree_skb sleep 1; perf script udp_stream 18074 [001] 1058.318040: skb:kfree_skb: skbaddr=0xffffa23c881fc000 rx_sk=(nil) protocol=34525 location=__dev_queue_xmit+0x9d9 reason: FQ_BAND_LIMIT udp_stream 18126 [005] 1058.320651: skb:kfree_skb: skbaddr=0xffffa23c6aad4000 rx_sk=(nil) protocol=34525 location=__dev_queue_xmit+0x9d9 reason: FQ_BAND_LIMIT udp_stream 18118 [006] 1058.321065: skb:kfree_skb: skbaddr=0xffffa23df0d48a00 rx_sk=(nil) protocol=34525 location=__dev_queue_xmit+0x9d9 reason: FQ_BAND_LIMIT udp_stream 18074 [001] 1058.321126: skb:kfree_skb: skbaddr=0xffffa23c881ffa00 rx_sk=(nil) protocol=34525 location=__dev_queue_xmit+0x9d9 reason: FQ_BAND_LIMIT udp_stream 15815 [003] 1058.321224: skb:kfree_skb: skbaddr=0xffffa23c9835db00 rx_sk=(nil) protocol=34525 location=__dev_queue_xmit+0x9d9 reason: FQ_BAND_LIMIT tc -s -d qd sh dev eth1 qdisc fq 8023: root refcnt 257 limit 10000p flow_limit 100p buckets 1024 orphan_mask 1023 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 weights 589824 196608 65536 quantum 18Kb initial_quantum 92120b low_rate_threshold 550Kbit refill_delay 40ms timer_slack 10us horizon 10s horizon_drop Sent 492439603330 bytes 336953991 pkt (dropped 61724094, overlimits 0 requeues 4463) backlog 14611228b 9995p requeues 4463 flows 2965 (inactive 1151 throttled 0) band0_pkts 0 band1_pkts 9993 band2_pkts 0 gc 6347 highprio 0 fastpath 30 throttled 5 latency 2.32us flows_plimit 7403693 band1_drops 54320401 Signed-off-by: Eric Dumazet Reviewed-by: Victor Nogueira Reviewed-by: Toke Høiland-Jørgensen Acked-by: Jamal Hadi Salim Link: https://patch.msgid.link/20241204171950.89829-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 18 ++++++++++++++++++ include/net/sch_generic.h | 8 ++++++++ net/sched/sch_fq.c | 14 ++++++++++---- 3 files changed, 36 insertions(+), 4 deletions(-) diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 6c5a1ea209a22..c29282fabae6c 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -58,6 +58,9 @@ FN(TC_EGRESS) \ FN(SECURITY_HOOK) \ FN(QDISC_DROP) \ + FN(FQ_BAND_LIMIT) \ + FN(FQ_HORIZON_LIMIT) \ + FN(FQ_FLOW_LIMIT) \ FN(CPU_BACKLOG) \ FN(XDP) \ FN(TC_INGRESS) \ @@ -311,6 +314,21 @@ enum skb_drop_reason { * failed to enqueue to current qdisc) */ SKB_DROP_REASON_QDISC_DROP, + /** + * @SKB_DROP_REASON_FQ_BAND_LIMIT: dropped by fq qdisc when per band + * limit is reached. + */ + SKB_DROP_REASON_FQ_BAND_LIMIT, + /** + * @SKB_DROP_REASON_FQ_HORIZON_LIMIT: dropped by fq qdisc when packet + * timestamp is too far in the future. + */ + SKB_DROP_REASON_FQ_HORIZON_LIMIT, + /** + * @SKB_DROP_REASON_FQ_FLOW_LIMIT: dropped by fq qdisc when a flow + * exceeds its limits. + */ + SKB_DROP_REASON_FQ_FLOW_LIMIT, /** * @SKB_DROP_REASON_CPU_BACKLOG: failed to enqueue the skb to the per CPU * backlog queue. This can be caused by backlog queue full (see diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 5d74fa7e694cc..8074322dd636a 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -1245,6 +1245,14 @@ static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch, return NET_XMIT_DROP; } +static inline int qdisc_drop_reason(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free, + enum skb_drop_reason reason) +{ + tcf_set_drop_reason(skb, reason); + return qdisc_drop(skb, sch, to_free); +} + static inline int qdisc_drop_all(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index a5e87f9ea9861..2ca5332cfcc5c 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -537,6 +537,8 @@ static bool fq_packet_beyond_horizon(const struct sk_buff *skb, return unlikely((s64)skb->tstamp > (s64)(now + q->horizon)); } +#define FQDR(reason) SKB_DROP_REASON_FQ_##reason + static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { @@ -548,7 +550,8 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, band = fq_prio2band(q->prio2band, skb->priority & TC_PRIO_MAX); if (unlikely(q->band_pkt_count[band] >= sch->limit)) { q->stat_band_drops[band]++; - return qdisc_drop(skb, sch, to_free); + return qdisc_drop_reason(skb, sch, to_free, + FQDR(BAND_LIMIT)); } now = ktime_get_ns(); @@ -558,8 +561,9 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* Check if packet timestamp is too far in the future. */ if (fq_packet_beyond_horizon(skb, q, now)) { if (q->horizon_drop) { - q->stat_horizon_drops++; - return qdisc_drop(skb, sch, to_free); + q->stat_horizon_drops++; + return qdisc_drop_reason(skb, sch, to_free, + FQDR(HORIZON_LIMIT)); } q->stat_horizon_caps++; skb->tstamp = now + q->horizon; @@ -572,7 +576,8 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (f != &q->internal) { if (unlikely(f->qlen >= q->flow_plimit)) { q->stat_flows_plimit++; - return qdisc_drop(skb, sch, to_free); + return qdisc_drop_reason(skb, sch, to_free, + FQDR(FLOW_LIMIT)); } if (fq_flow_is_detached(f)) { @@ -597,6 +602,7 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, return NET_XMIT_SUCCESS; } +#undef FQDR static void fq_check_throttled(struct fq_sched_data *q, u64 now) { From 10685681bafce6febb39770f3387621bf5d67d0b Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Tue, 3 Dec 2024 19:05:19 -0800 Subject: [PATCH 0206/1450] net_sched: sch_sfq: don't allow 1 packet limit The current implementation does not work correctly with a limit of 1. iproute2 actually checks for this and this patch adds the check in kernel as well. This fixes the following syzkaller reported crash: UBSAN: array-index-out-of-bounds in net/sched/sch_sfq.c:210:6 index 65535 is out of range for type 'struct sfq_head[128]' CPU: 0 PID: 2569 Comm: syz-executor101 Not tainted 5.10.0-smp-DEV #1 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Call Trace: __dump_stack lib/dump_stack.c:79 [inline] dump_stack+0x125/0x19f lib/dump_stack.c:120 ubsan_epilogue lib/ubsan.c:148 [inline] __ubsan_handle_out_of_bounds+0xed/0x120 lib/ubsan.c:347 sfq_link net/sched/sch_sfq.c:210 [inline] sfq_dec+0x528/0x600 net/sched/sch_sfq.c:238 sfq_dequeue+0x39b/0x9d0 net/sched/sch_sfq.c:500 sfq_reset+0x13/0x50 net/sched/sch_sfq.c:525 qdisc_reset+0xfe/0x510 net/sched/sch_generic.c:1026 tbf_reset+0x3d/0x100 net/sched/sch_tbf.c:319 qdisc_reset+0xfe/0x510 net/sched/sch_generic.c:1026 dev_reset_queue+0x8c/0x140 net/sched/sch_generic.c:1296 netdev_for_each_tx_queue include/linux/netdevice.h:2350 [inline] dev_deactivate_many+0x6dc/0xc20 net/sched/sch_generic.c:1362 __dev_close_many+0x214/0x350 net/core/dev.c:1468 dev_close_many+0x207/0x510 net/core/dev.c:1506 unregister_netdevice_many+0x40f/0x16b0 net/core/dev.c:10738 unregister_netdevice_queue+0x2be/0x310 net/core/dev.c:10695 unregister_netdevice include/linux/netdevice.h:2893 [inline] __tun_detach+0x6b6/0x1600 drivers/net/tun.c:689 tun_detach drivers/net/tun.c:705 [inline] tun_chr_close+0x104/0x1b0 drivers/net/tun.c:3640 __fput+0x203/0x840 fs/file_table.c:280 task_work_run+0x129/0x1b0 kernel/task_work.c:185 exit_task_work include/linux/task_work.h:33 [inline] do_exit+0x5ce/0x2200 kernel/exit.c:931 do_group_exit+0x144/0x310 kernel/exit.c:1046 __do_sys_exit_group kernel/exit.c:1057 [inline] __se_sys_exit_group kernel/exit.c:1055 [inline] __x64_sys_exit_group+0x3b/0x40 kernel/exit.c:1055 do_syscall_64+0x6c/0xd0 entry_SYSCALL_64_after_hwframe+0x61/0xcb RIP: 0033:0x7fe5e7b52479 Code: Unable to access opcode bytes at RIP 0x7fe5e7b5244f. RSP: 002b:00007ffd3c800398 EFLAGS: 00000246 ORIG_RAX: 00000000000000e7 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fe5e7b52479 RDX: 000000000000003c RSI: 00000000000000e7 RDI: 0000000000000000 RBP: 00007fe5e7bcd2d0 R08: ffffffffffffffb8 R09: 0000000000000014 R10: 0000000000000000 R11: 0000000000000246 R12: 00007fe5e7bcd2d0 R13: 0000000000000000 R14: 00007fe5e7bcdd20 R15: 00007fe5e7b24270 The crash can be also be reproduced with the following (with a tc recompiled to allow for sfq limits of 1): tc qdisc add dev dummy0 handle 1: root tbf rate 1Kbit burst 100b lat 1s ../iproute2-6.9.0/tc/tc qdisc add dev dummy0 handle 2: parent 1:10 sfq limit 1 ifconfig dummy0 up ping -I dummy0 -f -c2 -W0.1 8.8.8.8 sleep 1 Scenario that triggers the crash: * the first packet is sent and queued in TBF and SFQ; qdisc qlen is 1 * TBF dequeues: it peeks from SFQ which moves the packet to the gso_skb list and keeps qdisc qlen set to 1. TBF is out of tokens so it schedules itself for later. * the second packet is sent and TBF tries to queues it to SFQ. qdisc qlen is now 2 and because the SFQ limit is 1 the packet is dropped by SFQ. At this point qlen is 1, and all of the SFQ slots are empty, however q->tail is not NULL. At this point, assuming no more packets are queued, when sch_dequeue runs again it will decrement the qlen for the current empty slot causing an underflow and the subsequent out of bounds access. Reported-by: syzbot Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Octavian Purdila Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241204030520.2084663-2-tavip@google.com Signed-off-by: Jakub Kicinski --- net/sched/sch_sfq.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index a4b8296a2fa1c..65d5b59da5830 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -652,6 +652,10 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt, if (!p) return -ENOMEM; } + if (ctl->limit == 1) { + NL_SET_ERR_MSG_MOD(extack, "invalid limit"); + return -EINVAL; + } sch_tree_lock(sch); if (ctl->quantum) q->quantum = ctl->quantum; From 1e7e1f0e8be147ae98fe88ec82150c97265965a6 Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Tue, 3 Dec 2024 19:05:20 -0800 Subject: [PATCH 0207/1450] selftests/tc-testing: sfq: test that kernel rejects limit of 1 Add test to check that the kernel rejects a configuration with the limit set to 1. Signed-off-by: Octavian Purdila Link: https://patch.msgid.link/20241204030520.2084663-3-tavip@google.com Signed-off-by: Jakub Kicinski --- .../tc-testing/scripts/sfq_rejects_limit_1.py | 21 +++++++++++++++++++ .../tc-testing/tc-tests/qdiscs/sfq.json | 20 ++++++++++++++++++ 2 files changed, 41 insertions(+) create mode 100755 tools/testing/selftests/tc-testing/scripts/sfq_rejects_limit_1.py diff --git a/tools/testing/selftests/tc-testing/scripts/sfq_rejects_limit_1.py b/tools/testing/selftests/tc-testing/scripts/sfq_rejects_limit_1.py new file mode 100755 index 0000000000000..0f44a61994957 --- /dev/null +++ b/tools/testing/selftests/tc-testing/scripts/sfq_rejects_limit_1.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +# +# Script that checks that SFQ rejects a limit of 1 at the kernel +# level. We can't use iproute2's tc because it does not accept a limit +# of 1. + +import sys +import os + +from pyroute2 import IPRoute +from pyroute2.netlink.exceptions import NetlinkError + +ip = IPRoute() +ifidx = ip.link_lookup(ifname=sys.argv[1]) + +try: + ip.tc('add', 'sfq', ifidx, limit=1) + sys.exit(1) +except NetlinkError: + sys.exit(0) diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/sfq.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/sfq.json index 16d51936b3853..50e8d72781cbd 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/sfq.json +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/sfq.json @@ -208,5 +208,25 @@ "teardown": [ "$TC qdisc del dev $DUMMY handle 1: root" ] + }, + { + "id": "4d6f", + "name": "Check that limit of 1 is rejected", + "category": [ + "qdisc", + "sfq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + ], + "cmdUnderTest": "./scripts/sfq_rejects_limit_1.py $DUMMY", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $DUMMY", + "matchPattern": "sfq", + "matchCount": "0", + "teardown": [ + ] } ] From ca5c94949facce1f67a4a9a9528a27f635ff3e78 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 3 Dec 2024 18:37:24 +0100 Subject: [PATCH 0208/1450] xsk: align &xdp_buff_xsk harder MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the series "XSk buff on a diet" by Maciej, the greatest pow-2 which &xdp_buff_xsk can be divided got reduced from 16 to 8 on x86_64. Also, sizeof(xdp_buff_xsk) now is 120 bytes, which, taking the previous sentence into account, leads to that it leaves 8 bytes at the end of cacheline, which means an array of buffs will have its elements messed between the cachelines chaotically. Use __aligned_largest for this struct. This alignment is usually 16 bytes, which makes it fill two full cachelines and align an array nicely. ___cacheline_aligned may be excessive here, especially on arches with 128-256 byte CLs, as well as 32-bit arches (76 -> 96 bytes on MIPS32R2), while not doing better than _largest. Signed-off-by: Alexander Lobakin Reviewed-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20241203173733.3181246-2-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/net/xsk_buff_pool.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index bb03cee716b31..7637799b6c192 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -29,7 +29,7 @@ struct xdp_buff_xsk { dma_addr_t frame_dma; struct xsk_buff_pool *pool; struct list_head list_node; -}; +} __aligned_largest; #define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb)) #define XSK_TX_COMPL_FITS(t) BUILD_BUG_ON(sizeof(struct xsk_tx_metadata_compl) > sizeof(t)) From 7cd1107f48e2a246c6a628c2381e1b8aafa4675a Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 3 Dec 2024 18:37:25 +0100 Subject: [PATCH 0209/1450] bpf, xdp: constify some bpf_prog * function arguments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In lots of places, bpf_prog pointer is used only for tracing or other stuff that doesn't modify the structure itself. Same for net_device. Address at least some of them and add `const` attributes there. The object code didn't change, but that may prevent unwanted data modifications and also allow more helpers to have const arguments. Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Alexander Lobakin Signed-off-by: Jakub Kicinski --- include/linux/bpf.h | 12 ++++++------ include/linux/filter.h | 9 +++++---- include/linux/netdevice.h | 6 +++--- include/linux/skbuff.h | 2 +- kernel/bpf/devmap.c | 8 ++++---- net/core/dev.c | 10 +++++----- net/core/filter.c | 29 ++++++++++++++++------------- net/core/skbuff.c | 2 +- 8 files changed, 41 insertions(+), 37 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index eaee2a819f4c1..ec3acb16359e5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2591,10 +2591,10 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress); int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, - struct bpf_prog *xdp_prog); + const struct bpf_prog *xdp_prog); int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, - struct bpf_prog *xdp_prog, struct bpf_map *map, - bool exclude_ingress); + const struct bpf_prog *xdp_prog, + struct bpf_map *map, bool exclude_ingress); void __cpu_map_flush(struct list_head *flush_list); int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf, @@ -2864,15 +2864,15 @@ struct sk_buff; static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { return 0; } static inline int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, - struct bpf_prog *xdp_prog, struct bpf_map *map, - bool exclude_ingress) + const struct bpf_prog *xdp_prog, + struct bpf_map *map, bool exclude_ingress) { return 0; } diff --git a/include/linux/filter.h b/include/linux/filter.h index 3a21947f2fd44..9a5d23ae38557 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1179,17 +1179,18 @@ static inline int xdp_ok_fwd_dev(const struct net_device *fwd, * This does not appear to be a real limitation for existing software. */ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, - struct xdp_buff *xdp, struct bpf_prog *prog); + struct xdp_buff *xdp, const struct bpf_prog *prog); int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *prog); + const struct bpf_prog *prog); int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp, struct xdp_frame *xdpf, - struct bpf_prog *prog); + const struct bpf_prog *prog); void xdp_do_flush(void); -void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, u32 act); +void bpf_warn_invalid_xdp_action(const struct net_device *dev, + const struct bpf_prog *prog, u32 act); #ifdef CONFIG_INET struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ecc686409161e..ecca21387a683 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3958,9 +3958,9 @@ static inline void dev_consume_skb_any(struct sk_buff *skb) } u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog); -void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog); -int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb); + const struct bpf_prog *xdp_prog); +void generic_xdp_tx(struct sk_buff *skb, const struct bpf_prog *xdp_prog); +int do_xdp_generic(const struct bpf_prog *xdp_prog, struct sk_buff **pskb); int netif_rx(struct sk_buff *skb); int __netif_rx(struct sk_buff *skb); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 58009fa661026..95452d1a07fcf 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3627,7 +3627,7 @@ static inline netmem_ref skb_frag_netmem(const skb_frag_t *frag) int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, unsigned int headroom); int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, - struct bpf_prog *prog); + const struct bpf_prog *prog); /** * skb_frag_address - gets the address of the data contained in a paged fragment diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 7878be18e9d26..effde52bc857f 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -678,7 +678,7 @@ int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, } int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { int err; @@ -701,7 +701,7 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst, struct sk_buff *skb, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { struct sk_buff *nskb; int err; @@ -720,8 +720,8 @@ static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst, } int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, - struct bpf_prog *xdp_prog, struct bpf_map *map, - bool exclude_ingress) + const struct bpf_prog *xdp_prog, + struct bpf_map *map, bool exclude_ingress) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dst, *last_dst = NULL; diff --git a/net/core/dev.c b/net/core/dev.c index 45a8c3dd4a648..40a2332e3fa02 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4931,7 +4931,7 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) } u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { void *orig_data, *orig_data_end, *hard_start; struct netdev_rx_queue *rxqueue; @@ -5033,7 +5033,7 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, } static int -netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog) +netif_skb_check_for_xdp(struct sk_buff **pskb, const struct bpf_prog *prog) { struct sk_buff *skb = *pskb; int err, hroom, troom; @@ -5057,7 +5057,7 @@ netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog) static u32 netif_receive_generic_xdp(struct sk_buff **pskb, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { struct sk_buff *skb = *pskb; u32 mac_len, act = XDP_DROP; @@ -5110,7 +5110,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff **pskb, * and DDOS attacks will be more effective. In-driver-XDP use dedicated TX * queues, so they do not have this starvation issue. */ -void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) +void generic_xdp_tx(struct sk_buff *skb, const struct bpf_prog *xdp_prog) { struct net_device *dev = skb->dev; struct netdev_queue *txq; @@ -5135,7 +5135,7 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key); -int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb) +int do_xdp_generic(const struct bpf_prog *xdp_prog, struct sk_buff **pskb) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; diff --git a/net/core/filter.c b/net/core/filter.c index 6625b3f563a4a..fac245065b0a4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4348,9 +4348,9 @@ u32 xdp_master_redirect(struct xdp_buff *xdp) EXPORT_SYMBOL_GPL(xdp_master_redirect); static inline int __xdp_do_redirect_xsk(struct bpf_redirect_info *ri, - struct net_device *dev, + const struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; @@ -4371,10 +4371,10 @@ static inline int __xdp_do_redirect_xsk(struct bpf_redirect_info *ri, return err; } -static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri, - struct net_device *dev, - struct xdp_frame *xdpf, - struct bpf_prog *xdp_prog) +static __always_inline int +__xdp_do_redirect_frame(struct bpf_redirect_info *ri, struct net_device *dev, + struct xdp_frame *xdpf, + const struct bpf_prog *xdp_prog) { enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; @@ -4443,7 +4443,7 @@ static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri, } int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; @@ -4457,7 +4457,8 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, EXPORT_SYMBOL_GPL(xdp_do_redirect); int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp, - struct xdp_frame *xdpf, struct bpf_prog *xdp_prog) + struct xdp_frame *xdpf, + const struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; @@ -4472,9 +4473,9 @@ EXPORT_SYMBOL_GPL(xdp_do_redirect_frame); static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog, void *fwd, - enum bpf_map_type map_type, u32 map_id, - u32 flags) + const struct bpf_prog *xdp_prog, + void *fwd, enum bpf_map_type map_type, + u32 map_id, u32 flags) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); struct bpf_map *map; @@ -4528,7 +4529,8 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, } int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, - struct xdp_buff *xdp, struct bpf_prog *xdp_prog) + struct xdp_buff *xdp, + const struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; @@ -9075,7 +9077,8 @@ static bool xdp_is_valid_access(int off, int size, return __is_valid_xdp_access(off, size); } -void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, u32 act) +void bpf_warn_invalid_xdp_action(const struct net_device *dev, + const struct bpf_prog *prog, u32 act) { const u32 act_max = XDP_REDIRECT; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6841e61a6bd0b..a441613a1e6c1 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1009,7 +1009,7 @@ int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, EXPORT_SYMBOL(skb_pp_cow_data); int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, - struct bpf_prog *prog) + const struct bpf_prog *prog) { if (!prog->aux->xdp_has_frags) return -EINVAL; From dcf3827cde8621d2317a7f98e069adbdc2112982 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 3 Dec 2024 18:37:26 +0100 Subject: [PATCH 0210/1450] xdp, xsk: constify read-only arguments of some static inline helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lots of read-only helpers for &xdp_buff and &xdp_frame, such as getting the frame length, skb_shared_info etc., don't have their arguments marked with `const` for no reason. Add the missing annotations to leave less place for mistakes and more for optimization. Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Alexander Lobakin Link: https://patch.msgid.link/20241203173733.3181246-4-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/net/xdp.h | 29 +++++++++++++++++------------ include/net/xdp_sock_drv.h | 11 ++++++----- include/net/xsk_buff_pool.h | 2 +- 3 files changed, 24 insertions(+), 18 deletions(-) diff --git a/include/net/xdp.h b/include/net/xdp.h index e6770dd40c917..197808df1ee12 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -88,7 +88,7 @@ struct xdp_buff { u32 flags; /* supported values defined in xdp_buff_flags */ }; -static __always_inline bool xdp_buff_has_frags(struct xdp_buff *xdp) +static __always_inline bool xdp_buff_has_frags(const struct xdp_buff *xdp) { return !!(xdp->flags & XDP_FLAGS_HAS_FRAGS); } @@ -103,7 +103,8 @@ static __always_inline void xdp_buff_clear_frags_flag(struct xdp_buff *xdp) xdp->flags &= ~XDP_FLAGS_HAS_FRAGS; } -static __always_inline bool xdp_buff_is_frag_pfmemalloc(struct xdp_buff *xdp) +static __always_inline bool +xdp_buff_is_frag_pfmemalloc(const struct xdp_buff *xdp) { return !!(xdp->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); } @@ -144,15 +145,16 @@ xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start, SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) static inline struct skb_shared_info * -xdp_get_shared_info_from_buff(struct xdp_buff *xdp) +xdp_get_shared_info_from_buff(const struct xdp_buff *xdp) { return (struct skb_shared_info *)xdp_data_hard_end(xdp); } -static __always_inline unsigned int xdp_get_buff_len(struct xdp_buff *xdp) +static __always_inline unsigned int +xdp_get_buff_len(const struct xdp_buff *xdp) { unsigned int len = xdp->data_end - xdp->data; - struct skb_shared_info *sinfo; + const struct skb_shared_info *sinfo; if (likely(!xdp_buff_has_frags(xdp))) goto out; @@ -177,12 +179,13 @@ struct xdp_frame { u32 flags; /* supported values defined in xdp_buff_flags */ }; -static __always_inline bool xdp_frame_has_frags(struct xdp_frame *frame) +static __always_inline bool xdp_frame_has_frags(const struct xdp_frame *frame) { return !!(frame->flags & XDP_FLAGS_HAS_FRAGS); } -static __always_inline bool xdp_frame_is_frag_pfmemalloc(struct xdp_frame *frame) +static __always_inline bool +xdp_frame_is_frag_pfmemalloc(const struct xdp_frame *frame) { return !!(frame->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); } @@ -201,7 +204,7 @@ static __always_inline void xdp_frame_bulk_init(struct xdp_frame_bulk *bq) } static inline struct skb_shared_info * -xdp_get_shared_info_from_frame(struct xdp_frame *frame) +xdp_get_shared_info_from_frame(const struct xdp_frame *frame) { void *data_hard_start = frame->data - frame->headroom - sizeof(*frame); @@ -249,7 +252,8 @@ int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp); struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf); static inline -void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp) +void xdp_convert_frame_to_buff(const struct xdp_frame *frame, + struct xdp_buff *xdp) { xdp->data_hard_start = frame->data - frame->headroom - sizeof(*frame); xdp->data = frame->data; @@ -260,7 +264,7 @@ void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp) } static inline -int xdp_update_frame_from_buff(struct xdp_buff *xdp, +int xdp_update_frame_from_buff(const struct xdp_buff *xdp, struct xdp_frame *xdp_frame) { int metasize, headroom; @@ -317,9 +321,10 @@ void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq); void xdp_return_frame_bulk(struct xdp_frame *xdpf, struct xdp_frame_bulk *bq); -static __always_inline unsigned int xdp_get_frame_len(struct xdp_frame *xdpf) +static __always_inline unsigned int +xdp_get_frame_len(const struct xdp_frame *xdpf) { - struct skb_shared_info *sinfo; + const struct skb_shared_info *sinfo; unsigned int len = xdpf->len; if (likely(!xdp_frame_has_frags(xdpf))) diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 40085afd91607..f3175a5d28f79 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -101,7 +101,7 @@ static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) return xp_alloc(pool); } -static inline bool xsk_is_eop_desc(struct xdp_desc *desc) +static inline bool xsk_is_eop_desc(const struct xdp_desc *desc) { return !xp_mb_desc(desc); } @@ -143,7 +143,7 @@ static inline void xsk_buff_add_frag(struct xdp_buff *xdp) list_add_tail(&frag->list_node, &frag->pool->xskb_list); } -static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) +static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first) { struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); struct xdp_buff *ret = NULL; @@ -200,7 +200,8 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) XDP_TXMD_FLAGS_CHECKSUM | \ 0) -static inline bool xsk_buff_valid_tx_metadata(struct xsk_tx_metadata *meta) +static inline bool +xsk_buff_valid_tx_metadata(const struct xsk_tx_metadata *meta) { return !(meta->flags & ~XDP_TXMD_FLAGS_VALID); } @@ -337,7 +338,7 @@ static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) return NULL; } -static inline bool xsk_is_eop_desc(struct xdp_desc *desc) +static inline bool xsk_is_eop_desc(const struct xdp_desc *desc) { return false; } @@ -360,7 +361,7 @@ static inline void xsk_buff_add_frag(struct xdp_buff *xdp) { } -static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) +static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first) { return NULL; } diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 7637799b6c192..50779406bc2d9 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -183,7 +183,7 @@ static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool, !(pool->dma_pages[addr >> PAGE_SHIFT] & XSK_NEXT_PG_CONTIG_MASK); } -static inline bool xp_mb_desc(struct xdp_desc *desc) +static inline bool xp_mb_desc(const struct xdp_desc *desc) { return desc->options & XDP_PKT_CONTD; } From f65966fe0178c06065d354c22fb456fc4370b527 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 3 Dec 2024 18:37:27 +0100 Subject: [PATCH 0211/1450] xdp: allow attaching already registered memory model to xdp_rxq_info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit One may need to register memory model separately from xdp_rxq_info. One simple example may be XDP test run code, but in general, it might be useful when memory model registering is managed by one layer and then XDP RxQ info by a different one. Allow such scenarios by adding a simple helper which "attaches" already registered memory model to the desired xdp_rxq_info. As this is mostly needed for Page Pool, add a special function to do that for a &page_pool pointer. Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Alexander Lobakin Link: https://patch.msgid.link/20241203173733.3181246-5-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/net/xdp.h | 32 +++++++++++++++++++++++++++ net/core/xdp.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) diff --git a/include/net/xdp.h b/include/net/xdp.h index 197808df1ee12..1253fe21ede79 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -356,6 +356,38 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq); int xdp_reg_mem_model(struct xdp_mem_info *mem, enum xdp_mem_type type, void *allocator); void xdp_unreg_mem_model(struct xdp_mem_info *mem); +int xdp_reg_page_pool(struct page_pool *pool); +void xdp_unreg_page_pool(const struct page_pool *pool); +void xdp_rxq_info_attach_page_pool(struct xdp_rxq_info *xdp_rxq, + const struct page_pool *pool); + +/** + * xdp_rxq_info_attach_mem_model - attach registered mem info to RxQ info + * @xdp_rxq: XDP RxQ info to attach the memory info to + * @mem: already registered memory info + * + * If the driver registers its memory providers manually, it must use this + * function instead of xdp_rxq_info_reg_mem_model(). + */ +static inline void +xdp_rxq_info_attach_mem_model(struct xdp_rxq_info *xdp_rxq, + const struct xdp_mem_info *mem) +{ + xdp_rxq->mem = *mem; +} + +/** + * xdp_rxq_info_detach_mem_model - detach registered mem info from RxQ info + * @xdp_rxq: XDP RxQ info to detach the memory info from + * + * If the driver registers its memory providers manually and then attaches it + * via xdp_rxq_info_attach_mem_model(), it must call this function before + * xdp_rxq_info_unreg(). + */ +static inline void xdp_rxq_info_detach_mem_model(struct xdp_rxq_info *xdp_rxq) +{ + xdp_rxq->mem = (struct xdp_mem_info){ }; +} /* Drivers not supporting XDP metadata can use this helper, which * rejects any room expansion for metadata as a result. diff --git a/net/core/xdp.c b/net/core/xdp.c index bcc5551c6424b..885a2a664bce9 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -365,6 +365,62 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); +/** + * xdp_reg_page_pool - register &page_pool as a memory provider for XDP + * @pool: &page_pool to register + * + * Can be used to register pools manually without connecting to any XDP RxQ + * info, so that the XDP layer will be aware of them. Then, they can be + * attached to an RxQ info manually via xdp_rxq_info_attach_page_pool(). + * + * Return: %0 on success, -errno on error. + */ +int xdp_reg_page_pool(struct page_pool *pool) +{ + struct xdp_mem_info mem; + + return xdp_reg_mem_model(&mem, MEM_TYPE_PAGE_POOL, pool); +} +EXPORT_SYMBOL_GPL(xdp_reg_page_pool); + +/** + * xdp_unreg_page_pool - unregister &page_pool from the memory providers list + * @pool: &page_pool to unregister + * + * A shorthand for manual unregistering page pools. If the pool was previously + * attached to an RxQ info, it must be detached first. + */ +void xdp_unreg_page_pool(const struct page_pool *pool) +{ + struct xdp_mem_info mem = { + .type = MEM_TYPE_PAGE_POOL, + .id = pool->xdp_mem_id, + }; + + xdp_unreg_mem_model(&mem); +} +EXPORT_SYMBOL_GPL(xdp_unreg_page_pool); + +/** + * xdp_rxq_info_attach_page_pool - attach registered pool to RxQ info + * @xdp_rxq: XDP RxQ info to attach the pool to + * @pool: pool to attach + * + * If the pool was registered manually, this function must be called instead + * of xdp_rxq_info_reg_mem_model() to connect it to the RxQ info. + */ +void xdp_rxq_info_attach_page_pool(struct xdp_rxq_info *xdp_rxq, + const struct page_pool *pool) +{ + struct xdp_mem_info mem = { + .type = MEM_TYPE_PAGE_POOL, + .id = pool->xdp_mem_id, + }; + + xdp_rxq_info_attach_mem_model(xdp_rxq, &mem); +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_attach_page_pool); + /* XDP RX runs under NAPI protection, and in different delivery error * scenarios (e.g. queue full), it is possible to return the xdp_frame * while still leveraging this protection. The @napi_direct boolean From 9e25dd9d65d27aa94220831fe6453d935988801c Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 3 Dec 2024 18:37:28 +0100 Subject: [PATCH 0212/1450] xsk: allow attaching XSk pool via xdp_rxq_info_reg_mem_model() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When you register an XSk pool as XDP Rxq info memory model, you then need to manually attach it after the registration. Let the user combine both actions into one by just passing a pointer to the pool directly to xdp_rxq_info_reg_mem_model(), which will take care of calling xsk_pool_set_rxq_info(). This looks similar to how a &page_pool gets registered and reduce repeating driver code. Acked-by: Maciej Fijalkowski Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Alexander Lobakin Link: https://patch.msgid.link/20241203173733.3181246-6-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- net/core/xdp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/core/xdp.c b/net/core/xdp.c index 885a2a664bce9..de1e9cb787181 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -358,6 +358,9 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, if (IS_ERR(xdp_alloc)) return PTR_ERR(xdp_alloc); + if (type == MEM_TYPE_XSK_BUFF_POOL && allocator) + xsk_pool_set_rxq_info(allocator, xdp_rxq); + if (trace_mem_connect_enabled() && xdp_alloc) trace_mem_connect(xdp_alloc, xdp_rxq); return 0; From e77d9aee951341119be16a991fcfc76d1154d22a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 3 Dec 2024 18:37:29 +0100 Subject: [PATCH 0213/1450] xdp: register system page pool as an XDP memory model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To make the system page pool usable as a source for allocating XDP frames, we need to register it with xdp_reg_mem_model(), so that page return works correctly. This is done in preparation for using the system page_pool to convert XDP_PASS XSk frames to skbs; for the same reason, make the per-cpu variable non-static so we can access it from other source files as well (but w/o exporting). Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexander Lobakin Link: https://patch.msgid.link/20241203173733.3181246-7-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + net/core/dev.c | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ecca21387a683..d1a8d98b132c6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3322,6 +3322,7 @@ struct softnet_data { }; DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); +DECLARE_PER_CPU(struct page_pool *, system_page_pool); #ifndef CONFIG_PREEMPT_RT static inline int dev_recursion_level(void) diff --git a/net/core/dev.c b/net/core/dev.c index 40a2332e3fa02..c7f3dea3e0eb9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -460,7 +460,7 @@ EXPORT_PER_CPU_SYMBOL(softnet_data); * PP consumers must pay attention to run APIs in the appropriate context * (e.g. NAPI context). */ -static DEFINE_PER_CPU(struct page_pool *, system_page_pool); +DEFINE_PER_CPU(struct page_pool *, system_page_pool); #ifdef CONFIG_LOCKDEP /* @@ -12152,11 +12152,18 @@ static int net_page_pool_create(int cpuid) .nid = cpu_to_mem(cpuid), }; struct page_pool *pp_ptr; + int err; pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid); if (IS_ERR(pp_ptr)) return -ENOMEM; + err = xdp_reg_page_pool(pp_ptr); + if (err) { + page_pool_destroy(pp_ptr); + return err; + } + per_cpu(system_page_pool, cpuid) = pp_ptr; #endif return 0; @@ -12290,6 +12297,7 @@ static int __init net_dev_init(void) if (!pp_ptr) continue; + xdp_unreg_page_pool(pp_ptr); page_pool_destroy(pp_ptr); per_cpu(system_page_pool, i) = NULL; } From 9bd9f72a74344b54cfb6fcabf1173e6c6e5c6952 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 3 Dec 2024 18:37:30 +0100 Subject: [PATCH 0214/1450] netmem: add a couple of page helper wrappers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the following netmem counterparts: * virt_to_netmem() -- simple page_to_netmem(virt_to_page()) wrapper; * netmem_is_pfmemalloc() -- page_is_pfmemalloc() for page-backed netmems, false otherwise; and the following "unsafe" versions: * __netmem_to_page() * __netmem_get_pp() * __netmem_address() They do the same as their non-underscored buddies, but assume the netmem is always page-backed. When working with header &page_pools, you don't need to check whether netmem belongs to the host memory and you can never get NULL instead of &page. Checks for the LSB, clearing the LSB, branches take cycles and increase object code size, sometimes significantly. When you're sure your PP is always host, you can avoid this by using the underscored counterparts. Signed-off-by: Alexander Lobakin Reviewed-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20241203173733.3181246-8-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/net/netmem.h | 78 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 76 insertions(+), 2 deletions(-) diff --git a/include/net/netmem.h b/include/net/netmem.h index 8a6e20be4b9d3..1b58faa4f20f9 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -72,6 +72,22 @@ static inline bool netmem_is_net_iov(const netmem_ref netmem) return (__force unsigned long)netmem & NET_IOV; } +/** + * __netmem_to_page - unsafely get pointer to the &page backing @netmem + * @netmem: netmem reference to convert + * + * Unsafe version of netmem_to_page(). When @netmem is always page-backed, + * e.g. when it's a header buffer, performs faster and generates smaller + * object code (no check for the LSB, no WARN). When @netmem points to IOV, + * provokes undefined behaviour. + * + * Return: pointer to the &page (garbage if @netmem is not page-backed). + */ +static inline struct page *__netmem_to_page(netmem_ref netmem) +{ + return (__force struct page *)netmem; +} + /* This conversion fails (returns NULL) if the netmem_ref is not struct page * backed. */ @@ -80,7 +96,7 @@ static inline struct page *netmem_to_page(netmem_ref netmem) if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) return NULL; - return (__force struct page *)netmem; + return __netmem_to_page(netmem); } static inline struct net_iov *netmem_to_net_iov(netmem_ref netmem) @@ -103,6 +119,17 @@ static inline netmem_ref page_to_netmem(struct page *page) return (__force netmem_ref)page; } +/** + * virt_to_netmem - convert virtual memory pointer to a netmem reference + * @data: host memory pointer to convert + * + * Return: netmem reference to the &page backing this virtual address. + */ +static inline netmem_ref virt_to_netmem(const void *data) +{ + return page_to_netmem(virt_to_page(data)); +} + static inline int netmem_ref_count(netmem_ref netmem) { /* The non-pp refcount of net_iov is always 1. On net_iov, we only @@ -127,6 +154,22 @@ static inline struct net_iov *__netmem_clear_lsb(netmem_ref netmem) return (struct net_iov *)((__force unsigned long)netmem & ~NET_IOV); } +/** + * __netmem_get_pp - unsafely get pointer to the &page_pool backing @netmem + * @netmem: netmem reference to get the pointer from + * + * Unsafe version of netmem_get_pp(). When @netmem is always page-backed, + * e.g. when it's a header buffer, performs faster and generates smaller + * object code (avoids clearing the LSB). When @netmem points to IOV, + * provokes invalid memory access. + * + * Return: pointer to the &page_pool (garbage if @netmem is not page-backed). + */ +static inline struct page_pool *__netmem_get_pp(netmem_ref netmem) +{ + return __netmem_to_page(netmem)->pp; +} + static inline struct page_pool *netmem_get_pp(netmem_ref netmem) { return __netmem_clear_lsb(netmem)->pp; @@ -158,12 +201,43 @@ static inline netmem_ref netmem_compound_head(netmem_ref netmem) return page_to_netmem(compound_head(netmem_to_page(netmem))); } +/** + * __netmem_address - unsafely get pointer to the memory backing @netmem + * @netmem: netmem reference to get the pointer for + * + * Unsafe version of netmem_address(). When @netmem is always page-backed, + * e.g. when it's a header buffer, performs faster and generates smaller + * object code (no check for the LSB). When @netmem points to IOV, provokes + * undefined behaviour. + * + * Return: pointer to the memory (garbage if @netmem is not page-backed). + */ +static inline void *__netmem_address(netmem_ref netmem) +{ + return page_address(__netmem_to_page(netmem)); +} + static inline void *netmem_address(netmem_ref netmem) { if (netmem_is_net_iov(netmem)) return NULL; - return page_address(netmem_to_page(netmem)); + return __netmem_address(netmem); +} + +/** + * netmem_is_pfmemalloc - check if @netmem was allocated under memory pressure + * @netmem: netmem reference to check + * + * Return: true if @netmem is page-backed and the page was allocated under + * memory pressure, false otherwise. + */ +static inline bool netmem_is_pfmemalloc(netmem_ref netmem) +{ + if (netmem_is_net_iov(netmem)) + return false; + + return page_is_pfmemalloc(netmem_to_page(netmem)); } static inline unsigned long netmem_get_dma_addr(netmem_ref netmem) From 024bfd2e9d80d7131f1178eb2235030b96f7ef0e Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 3 Dec 2024 18:37:31 +0100 Subject: [PATCH 0215/1450] page_pool: make page_pool_put_page_bulk() handle array of netmems MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, page_pool_put_page_bulk() indeed takes an array of pointers to the data, not pages, despite the name. As one side effect, when you're freeing frags from &skb_shared_info, xdp_return_frame_bulk() converts page pointers to virtual addresses and then page_pool_put_page_bulk() converts them back. Moreover, data pointers assume every frag is placed in the host memory, making this function non-universal. Make page_pool_put_page_bulk() handle array of netmems. Pass frag netmems directly and use virt_to_netmem() when freeing xdpf->data, so that the PP core will then get the compound netmem and take care of the rest. Signed-off-by: Alexander Lobakin Reviewed-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20241203173733.3181246-9-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/net/page_pool/types.h | 8 ++++---- include/net/xdp.h | 2 +- net/core/page_pool.c | 30 +++++++++++++++--------------- net/core/xdp.c | 6 +++--- 4 files changed, 23 insertions(+), 23 deletions(-) diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index c022c410abe39..1ea16b0e9c793 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -259,8 +259,8 @@ void page_pool_disable_direct_recycling(struct page_pool *pool); void page_pool_destroy(struct page_pool *pool); void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), const struct xdp_mem_info *mem); -void page_pool_put_page_bulk(struct page_pool *pool, void **data, - int count); +void page_pool_put_netmem_bulk(struct page_pool *pool, netmem_ref *data, + u32 count); #else static inline void page_pool_destroy(struct page_pool *pool) { @@ -272,8 +272,8 @@ static inline void page_pool_use_xdp_mem(struct page_pool *pool, { } -static inline void page_pool_put_page_bulk(struct page_pool *pool, void **data, - int count) +static inline void page_pool_put_netmem_bulk(struct page_pool *pool, + netmem_ref *data, u32 count) { } #endif diff --git a/include/net/xdp.h b/include/net/xdp.h index 1253fe21ede79..f4020b29122fa 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -194,7 +194,7 @@ xdp_frame_is_frag_pfmemalloc(const struct xdp_frame *frame) struct xdp_frame_bulk { int count; void *xa; - void *q[XDP_BULK_QUEUE_SIZE]; + netmem_ref q[XDP_BULK_QUEUE_SIZE]; }; static __always_inline void xdp_frame_bulk_init(struct xdp_frame_bulk *bq) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index f89cf93f6eb45..4c85b77cfdac9 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -840,22 +840,22 @@ void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page, EXPORT_SYMBOL(page_pool_put_unrefed_page); /** - * page_pool_put_page_bulk() - release references on multiple pages + * page_pool_put_netmem_bulk() - release references on multiple netmems * @pool: pool from which pages were allocated - * @data: array holding page pointers - * @count: number of pages in @data + * @data: array holding netmem references + * @count: number of entries in @data * - * Tries to refill a number of pages into the ptr_ring cache holding ptr_ring - * producer lock. If the ptr_ring is full, page_pool_put_page_bulk() - * will release leftover pages to the page allocator. - * page_pool_put_page_bulk() is suitable to be run inside the driver NAPI tx + * Tries to refill a number of netmems into the ptr_ring cache holding ptr_ring + * producer lock. If the ptr_ring is full, page_pool_put_netmem_bulk() + * will release leftover netmems to the memory provider. + * page_pool_put_netmem_bulk() is suitable to be run inside the driver NAPI tx * completion loop for the XDP_REDIRECT use case. * * Please note the caller must not use data area after running - * page_pool_put_page_bulk(), as this function overwrites it. + * page_pool_put_netmem_bulk(), as this function overwrites it. */ -void page_pool_put_page_bulk(struct page_pool *pool, void **data, - int count) +void page_pool_put_netmem_bulk(struct page_pool *pool, netmem_ref *data, + u32 count) { int i, bulk_len = 0; bool allow_direct; @@ -864,7 +864,7 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data, allow_direct = page_pool_napi_local(pool); for (i = 0; i < count; i++) { - netmem_ref netmem = page_to_netmem(virt_to_head_page(data[i])); + netmem_ref netmem = netmem_compound_head(data[i]); /* It is not the last user for the page frag case */ if (!page_pool_is_last_ref(netmem)) @@ -873,7 +873,7 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data, netmem = __page_pool_put_page(pool, netmem, -1, allow_direct); /* Approved for bulk recycling in ptr_ring cache */ if (netmem) - data[bulk_len++] = (__force void *)netmem; + data[bulk_len++] = netmem; } if (!bulk_len) @@ -882,7 +882,7 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data, /* Bulk producer into ptr_ring page_pool cache */ in_softirq = page_pool_producer_lock(pool); for (i = 0; i < bulk_len; i++) { - if (__ptr_ring_produce(&pool->ring, data[i])) { + if (__ptr_ring_produce(&pool->ring, (__force void *)data[i])) { /* ring full */ recycle_stat_inc(pool, ring_full); break; @@ -899,9 +899,9 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data, * since put_page() with refcnt == 1 can be an expensive operation */ for (; i < bulk_len; i++) - page_pool_return_page(pool, (__force netmem_ref)data[i]); + page_pool_return_page(pool, data[i]); } -EXPORT_SYMBOL(page_pool_put_page_bulk); +EXPORT_SYMBOL(page_pool_put_netmem_bulk); static netmem_ref page_pool_drain_frag(struct page_pool *pool, netmem_ref netmem) diff --git a/net/core/xdp.c b/net/core/xdp.c index de1e9cb787181..938ad15c98577 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -518,7 +518,7 @@ void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq) if (unlikely(!xa || !bq->count)) return; - page_pool_put_page_bulk(xa->page_pool, bq->q, bq->count); + page_pool_put_netmem_bulk(xa->page_pool, bq->q, bq->count); /* bq->xa is not cleared to save lookup, if mem.id same in next bulk */ bq->count = 0; } @@ -559,12 +559,12 @@ void xdp_return_frame_bulk(struct xdp_frame *xdpf, for (i = 0; i < sinfo->nr_frags; i++) { skb_frag_t *frag = &sinfo->frags[i]; - bq->q[bq->count++] = skb_frag_address(frag); + bq->q[bq->count++] = skb_frag_netmem(frag); if (bq->count == XDP_BULK_QUEUE_SIZE) xdp_flush_frame_bulk(bq); } } - bq->q[bq->count++] = xdpf->data; + bq->q[bq->count++] = virt_to_netmem(xdpf->data); } EXPORT_SYMBOL_GPL(xdp_return_frame_bulk); From 3e42bb998c6d574cbd683bd2d4ba1a2abf5aa044 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Tue, 3 Dec 2024 13:43:12 +0100 Subject: [PATCH 0216/1450] net: freescale: ucc_geth: Drop support for the "interface" DT property In april 2007, ucc_geth was converted to phylib with : commit 728de4c927a3 ("ucc_geth: migrate ucc_geth to phylib"). In that commit, the device-tree property "interface", that could be used to retrieve the PHY interface mode was deprecated. DTS files that still used that property were converted along the way, in the following commit, also dating from april 2007 : commit 0fd8c47cccb1 ("[POWERPC] Replace undocumented interface properties in dts files") 17 years later, there's no users of that property left and I hope it's safe to say we can remove support from that in the ucc_geth driver, making the probe() function a bit simpler. Should there be any users that have a DT that was generated when 2.6.21 was cutting-edge, print an error message with hints on how to convert the devicetree if the 'interface' property is found. With that property gone, we can greatly simplify the parsing of the phy-interface-mode from the devicetree by using of_get_phy_mode(), allowing the removal of the open-coded parsing in the driver. Reviewed-by: Andrew Lunn Signed-off-by: Maxime Chevallier Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/ucc_geth.c | 63 +++++------------------ 1 file changed, 12 insertions(+), 51 deletions(-) diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c index 6663c17680892..b023a1a1dc5cd 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.c +++ b/drivers/net/ethernet/freescale/ucc_geth.c @@ -3469,32 +3469,6 @@ static int ucc_geth_resume(struct platform_device *ofdev) #define ucc_geth_resume NULL #endif -static phy_interface_t to_phy_interface(const char *phy_connection_type) -{ - if (strcasecmp(phy_connection_type, "mii") == 0) - return PHY_INTERFACE_MODE_MII; - if (strcasecmp(phy_connection_type, "gmii") == 0) - return PHY_INTERFACE_MODE_GMII; - if (strcasecmp(phy_connection_type, "tbi") == 0) - return PHY_INTERFACE_MODE_TBI; - if (strcasecmp(phy_connection_type, "rmii") == 0) - return PHY_INTERFACE_MODE_RMII; - if (strcasecmp(phy_connection_type, "rgmii") == 0) - return PHY_INTERFACE_MODE_RGMII; - if (strcasecmp(phy_connection_type, "rgmii-id") == 0) - return PHY_INTERFACE_MODE_RGMII_ID; - if (strcasecmp(phy_connection_type, "rgmii-txid") == 0) - return PHY_INTERFACE_MODE_RGMII_TXID; - if (strcasecmp(phy_connection_type, "rgmii-rxid") == 0) - return PHY_INTERFACE_MODE_RGMII_RXID; - if (strcasecmp(phy_connection_type, "rtbi") == 0) - return PHY_INTERFACE_MODE_RTBI; - if (strcasecmp(phy_connection_type, "sgmii") == 0) - return PHY_INTERFACE_MODE_SGMII; - - return PHY_INTERFACE_MODE_MII; -} - static int ucc_geth_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) { struct ucc_geth_private *ugeth = netdev_priv(dev); @@ -3564,19 +3538,6 @@ static int ucc_geth_probe(struct platform_device* ofdev) int err, ucc_num, max_speed = 0; const unsigned int *prop; phy_interface_t phy_interface; - static const int enet_to_speed[] = { - SPEED_10, SPEED_10, SPEED_10, - SPEED_100, SPEED_100, SPEED_100, - SPEED_1000, SPEED_1000, SPEED_1000, SPEED_1000, - }; - static const phy_interface_t enet_to_phy_interface[] = { - PHY_INTERFACE_MODE_MII, PHY_INTERFACE_MODE_RMII, - PHY_INTERFACE_MODE_RGMII, PHY_INTERFACE_MODE_MII, - PHY_INTERFACE_MODE_RMII, PHY_INTERFACE_MODE_RGMII, - PHY_INTERFACE_MODE_GMII, PHY_INTERFACE_MODE_RGMII, - PHY_INTERFACE_MODE_TBI, PHY_INTERFACE_MODE_RTBI, - PHY_INTERFACE_MODE_SGMII, - }; ugeth_vdbg("%s: IN", __func__); @@ -3627,18 +3588,18 @@ static int ucc_geth_probe(struct platform_device* ofdev) /* Find the TBI PHY node. If it's not there, we don't support SGMII */ ug_info->tbi_node = of_parse_phandle(np, "tbi-handle", 0); - /* get the phy interface type, or default to MII */ - prop = of_get_property(np, "phy-connection-type", NULL); - if (!prop) { - /* handle interface property present in old trees */ - prop = of_get_property(ug_info->phy_node, "interface", NULL); - if (prop != NULL) { - phy_interface = enet_to_phy_interface[*prop]; - max_speed = enet_to_speed[*prop]; - } else - phy_interface = PHY_INTERFACE_MODE_MII; - } else { - phy_interface = to_phy_interface((const char *)prop); + prop = of_get_property(ug_info->phy_node, "interface", NULL); + if (prop) { + dev_err(&ofdev->dev, + "Device-tree property 'interface' is no longer supported. Please use 'phy-connection-type' instead."); + err = -EINVAL; + goto err_deregister_fixed_link; + } + + err = of_get_phy_mode(np, &phy_interface); + if (err) { + dev_err(&ofdev->dev, "Invalid phy-connection-type"); + goto err_deregister_fixed_link; } /* get speed, or derive from PHY interface */ From 1e59fd163100c2e21a65004c96f81b458e86b457 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Tue, 3 Dec 2024 13:43:13 +0100 Subject: [PATCH 0217/1450] net: freescale: ucc_geth: split adjust_link for phylink conversion Preparing the phylink conversion, split the adjust_link callbaclk, by clearly separating the mac configuration, link_up and link_down phases. Reviewed-by: Andrew Lunn Signed-off-by: Maxime Chevallier Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/ucc_geth.c | 180 +++++++++++----------- 1 file changed, 93 insertions(+), 87 deletions(-) diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c index b023a1a1dc5cd..bf9f5901b405d 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.c +++ b/drivers/net/ethernet/freescale/ucc_geth.c @@ -1548,105 +1548,111 @@ static void ugeth_activate(struct ucc_geth_private *ugeth) __netdev_watchdog_up(ugeth->ndev); } -/* Called every time the controller might need to be made - * aware of new link state. The PHY code conveys this - * information through variables in the ugeth structure, and this - * function converts those variables into the appropriate - * register values, and can bring down the device if needed. - */ - -static void adjust_link(struct net_device *dev) +static void ugeth_link_up(struct ucc_geth_private *ugeth, + struct phy_device *phy, + phy_interface_t interface, int speed, int duplex) { - struct ucc_geth_private *ugeth = netdev_priv(dev); - struct ucc_geth __iomem *ug_regs; - struct ucc_fast __iomem *uf_regs; - struct phy_device *phydev = ugeth->phydev; + struct ucc_geth __iomem *ug_regs = ugeth->ug_regs; + struct ucc_fast __iomem *uf_regs = ugeth->uccf->uf_regs; + u32 tempval = in_be32(&ug_regs->maccfg2); + u32 upsmr = in_be32(&uf_regs->upsmr); int new_state = 0; - ug_regs = ugeth->ug_regs; - uf_regs = ugeth->uccf->uf_regs; - - if (phydev->link) { - u32 tempval = in_be32(&ug_regs->maccfg2); - u32 upsmr = in_be32(&uf_regs->upsmr); - /* Now we make sure that we can be in full duplex mode. - * If not, we operate in half-duplex mode. */ - if (phydev->duplex != ugeth->oldduplex) { - new_state = 1; - if (!(phydev->duplex)) - tempval &= ~(MACCFG2_FDX); - else - tempval |= MACCFG2_FDX; - ugeth->oldduplex = phydev->duplex; - } + /* Now we make sure that we can be in full duplex mode. + * If not, we operate in half-duplex mode. + */ + if (duplex != ugeth->oldduplex) { + new_state = 1; + if (duplex == DUPLEX_HALF) + tempval &= ~(MACCFG2_FDX); + else + tempval |= MACCFG2_FDX; + ugeth->oldduplex = duplex; + } - if (phydev->speed != ugeth->oldspeed) { - new_state = 1; - switch (phydev->speed) { - case SPEED_1000: - tempval = ((tempval & - ~(MACCFG2_INTERFACE_MODE_MASK)) | - MACCFG2_INTERFACE_MODE_BYTE); - break; - case SPEED_100: - case SPEED_10: - tempval = ((tempval & - ~(MACCFG2_INTERFACE_MODE_MASK)) | - MACCFG2_INTERFACE_MODE_NIBBLE); - /* if reduced mode, re-set UPSMR.R10M */ - if ((ugeth->phy_interface == PHY_INTERFACE_MODE_RMII) || - (ugeth->phy_interface == PHY_INTERFACE_MODE_RGMII) || - (ugeth->phy_interface == PHY_INTERFACE_MODE_RGMII_ID) || - (ugeth->phy_interface == PHY_INTERFACE_MODE_RGMII_RXID) || - (ugeth->phy_interface == PHY_INTERFACE_MODE_RGMII_TXID) || - (ugeth->phy_interface == PHY_INTERFACE_MODE_RTBI)) { - if (phydev->speed == SPEED_10) - upsmr |= UCC_GETH_UPSMR_R10M; - else - upsmr &= ~UCC_GETH_UPSMR_R10M; - } - break; - default: - if (netif_msg_link(ugeth)) - pr_warn( - "%s: Ack! Speed (%d) is not 10/100/1000!", - dev->name, phydev->speed); - break; + if (speed != ugeth->oldspeed) { + new_state = 1; + switch (speed) { + case SPEED_1000: + tempval = ((tempval & + ~(MACCFG2_INTERFACE_MODE_MASK)) | + MACCFG2_INTERFACE_MODE_BYTE); + break; + case SPEED_100: + case SPEED_10: + tempval = ((tempval & + ~(MACCFG2_INTERFACE_MODE_MASK)) | + MACCFG2_INTERFACE_MODE_NIBBLE); + /* if reduced mode, re-set UPSMR.R10M */ + if (interface == PHY_INTERFACE_MODE_RMII || + phy_interface_mode_is_rgmii(interface) || + interface == PHY_INTERFACE_MODE_RTBI) { + if (speed == SPEED_10) + upsmr |= UCC_GETH_UPSMR_R10M; + else + upsmr &= ~UCC_GETH_UPSMR_R10M; } - ugeth->oldspeed = phydev->speed; + break; + default: + if (netif_msg_link(ugeth)) + pr_warn("%s: Speed (%d) is not 10/100/1000!", + netdev_name(ugeth->ndev), speed); + break; } + ugeth->oldspeed = speed; + } - if (!ugeth->oldlink) { - new_state = 1; - ugeth->oldlink = 1; - } + if (!ugeth->oldlink) { + new_state = 1; + ugeth->oldlink = 1; + } - if (new_state) { - /* - * To change the MAC configuration we need to disable - * the controller. To do so, we have to either grab - * ugeth->lock, which is a bad idea since 'graceful - * stop' commands might take quite a while, or we can - * quiesce driver's activity. - */ - ugeth_quiesce(ugeth); - ugeth_disable(ugeth, COMM_DIR_RX_AND_TX); + if (new_state) { + /* + * To change the MAC configuration we need to disable + * the controller. To do so, we have to either grab + * ugeth->lock, which is a bad idea since 'graceful + * stop' commands might take quite a while, or we can + * quiesce driver's activity. + */ + ugeth_quiesce(ugeth); + ugeth_disable(ugeth, COMM_DIR_RX_AND_TX); - out_be32(&ug_regs->maccfg2, tempval); - out_be32(&uf_regs->upsmr, upsmr); + out_be32(&ug_regs->maccfg2, tempval); + out_be32(&uf_regs->upsmr, upsmr); - ugeth_enable(ugeth, COMM_DIR_RX_AND_TX); - ugeth_activate(ugeth); - } - } else if (ugeth->oldlink) { - new_state = 1; - ugeth->oldlink = 0; - ugeth->oldspeed = 0; - ugeth->oldduplex = -1; + ugeth_enable(ugeth, COMM_DIR_RX_AND_TX); + ugeth_activate(ugeth); } - if (new_state && netif_msg_link(ugeth)) - phy_print_status(phydev); + if (netif_msg_link(ugeth)) + phy_print_status(phy); +} + +static void ugeth_link_down(struct ucc_geth_private *ugeth) +{ + ugeth->oldlink = 0; + ugeth->oldspeed = 0; + ugeth->oldduplex = -1; +} + +/* Called every time the controller might need to be made + * aware of new link state. The PHY code conveys this + * information through variables in the ugeth structure, and this + * function converts those variables into the appropriate + * register values, and can bring down the device if needed. + */ + +static void adjust_link(struct net_device *dev) +{ + struct ucc_geth_private *ugeth = netdev_priv(dev); + struct phy_device *phydev = ugeth->phydev; + + if (phydev->link) + ugeth_link_up(ugeth, phydev, phydev->interface, + phydev->speed, phydev->duplex); + else + ugeth_link_down(ugeth); } /* Initialize TBI PHY interface for communicating with the From 43068024cc2a2abdf39c73d3c1ed63a77207ae31 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Tue, 3 Dec 2024 13:43:14 +0100 Subject: [PATCH 0218/1450] net: freescale: ucc_geth: Use netdev->phydev to access the PHY As this driver pre-dates phylib, it uses a private pointer to get a reference to the attached phy_device. Drop that pointer and use the netdev's pointer instead. Reviewed-by: Andrew Lunn Signed-off-by: Maxime Chevallier Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/ucc_geth.c | 27 ++++++++----------- drivers/net/ethernet/freescale/ucc_geth.h | 1 - .../net/ethernet/freescale/ucc_geth_ethtool.c | 17 ++++++------ 3 files changed, 20 insertions(+), 25 deletions(-) diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c index bf9f5901b405d..cc5f9ca42a780 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.c +++ b/drivers/net/ethernet/freescale/ucc_geth.c @@ -1646,7 +1646,7 @@ static void ugeth_link_down(struct ucc_geth_private *ugeth) static void adjust_link(struct net_device *dev) { struct ucc_geth_private *ugeth = netdev_priv(dev); - struct phy_device *phydev = ugeth->phydev; + struct phy_device *phydev = dev->phydev; if (phydev->link) ugeth_link_up(ugeth, phydev, phydev->interface, @@ -1727,8 +1727,6 @@ static int init_phy(struct net_device *dev) phy_set_max_speed(phydev, priv->max_speed); - priv->phydev = phydev; - return 0; } @@ -2001,7 +1999,7 @@ static void ucc_geth_set_multi(struct net_device *dev) static void ucc_geth_stop(struct ucc_geth_private *ugeth) { struct ucc_geth __iomem *ug_regs = ugeth->ug_regs; - struct phy_device *phydev = ugeth->phydev; + struct phy_device *phydev = ugeth->ndev->phydev; ugeth_vdbg("%s: IN", __func__); @@ -3316,13 +3314,13 @@ static int ucc_geth_open(struct net_device *dev) goto err; } - phy_start(ugeth->phydev); + phy_start(dev->phydev); napi_enable(&ugeth->napi); netdev_reset_queue(dev); netif_start_queue(dev); device_set_wakeup_capable(&dev->dev, - qe_alive_during_sleep() || ugeth->phydev->irq); + qe_alive_during_sleep() || dev->phydev->irq); device_set_wakeup_enable(&dev->dev, ugeth->wol_en); return err; @@ -3343,8 +3341,7 @@ static int ucc_geth_close(struct net_device *dev) cancel_work_sync(&ugeth->timeout_work); ucc_geth_stop(ugeth); - phy_disconnect(ugeth->phydev); - ugeth->phydev = NULL; + phy_disconnect(dev->phydev); free_irq(ugeth->ug_info->uf_info.irq, ugeth->ndev); @@ -3378,7 +3375,7 @@ static void ucc_geth_timeout_work(struct work_struct *work) ucc_geth_stop(ugeth); ucc_geth_init_mac(ugeth); /* Must start PHY here */ - phy_start(ugeth->phydev); + phy_start(dev->phydev); netif_tx_start_all_queues(dev); } @@ -3421,7 +3418,7 @@ static int ucc_geth_suspend(struct platform_device *ofdev, pm_message_t state) setbits32(&ugeth->ug_regs->maccfg2, MACCFG2_MPE); ucc_fast_enable(ugeth->uccf, COMM_DIR_RX_AND_TX); } else if (!(ugeth->wol_en & WAKE_PHY)) { - phy_stop(ugeth->phydev); + phy_stop(ndev->phydev); } return 0; @@ -3461,8 +3458,8 @@ static int ucc_geth_resume(struct platform_device *ofdev) ugeth->oldspeed = 0; ugeth->oldduplex = -1; - phy_stop(ugeth->phydev); - phy_start(ugeth->phydev); + phy_stop(ndev->phydev); + phy_start(ndev->phydev); napi_enable(&ugeth->napi); netif_device_attach(ndev); @@ -3477,15 +3474,13 @@ static int ucc_geth_resume(struct platform_device *ofdev) static int ucc_geth_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) { - struct ucc_geth_private *ugeth = netdev_priv(dev); - if (!netif_running(dev)) return -EINVAL; - if (!ugeth->phydev) + if (!dev->phydev) return -ENODEV; - return phy_mii_ioctl(ugeth->phydev, rq, cmd); + return phy_mii_ioctl(dev->phydev, rq, cmd); } static const struct net_device_ops ucc_geth_netdev_ops = { diff --git a/drivers/net/ethernet/freescale/ucc_geth.h b/drivers/net/ethernet/freescale/ucc_geth.h index 4294ed096ebbc..c08a56b7c9fe3 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.h +++ b/drivers/net/ethernet/freescale/ucc_geth.h @@ -1210,7 +1210,6 @@ struct ucc_geth_private { u16 skb_dirtytx[NUM_TX_QUEUES]; struct ugeth_mii_info *mii_info; - struct phy_device *phydev; phy_interface_t phy_interface; int max_speed; uint32_t msg_enable; diff --git a/drivers/net/ethernet/freescale/ucc_geth_ethtool.c b/drivers/net/ethernet/freescale/ucc_geth_ethtool.c index 699f346faf5cd..fb5254d7d1ba3 100644 --- a/drivers/net/ethernet/freescale/ucc_geth_ethtool.c +++ b/drivers/net/ethernet/freescale/ucc_geth_ethtool.c @@ -103,8 +103,7 @@ static const char rx_fw_stat_gstrings[][ETH_GSTRING_LEN] = { static int uec_get_ksettings(struct net_device *netdev, struct ethtool_link_ksettings *cmd) { - struct ucc_geth_private *ugeth = netdev_priv(netdev); - struct phy_device *phydev = ugeth->phydev; + struct phy_device *phydev = netdev->phydev; if (!phydev) return -ENODEV; @@ -118,8 +117,7 @@ static int uec_set_ksettings(struct net_device *netdev, const struct ethtool_link_ksettings *cmd) { - struct ucc_geth_private *ugeth = netdev_priv(netdev); - struct phy_device *phydev = ugeth->phydev; + struct phy_device *phydev = netdev->phydev; if (!phydev) return -ENODEV; @@ -132,8 +130,10 @@ uec_get_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *pause) { struct ucc_geth_private *ugeth = netdev_priv(netdev); + struct phy_device *phydev = netdev->phydev; - pause->autoneg = ugeth->phydev->autoneg; + if (phydev) + pause->autoneg = phydev->autoneg; if (ugeth->ug_info->receiveFlowControl) pause->rx_pause = 1; @@ -146,12 +146,13 @@ uec_set_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *pause) { struct ucc_geth_private *ugeth = netdev_priv(netdev); + struct phy_device *phydev = netdev->phydev; int ret = 0; ugeth->ug_info->receiveFlowControl = pause->rx_pause; ugeth->ug_info->transmitFlowControl = pause->tx_pause; - if (ugeth->phydev->autoneg) { + if (phydev && phydev->autoneg) { if (netif_running(netdev)) { /* FIXME: automatically restart */ netdev_info(netdev, "Please re-open the interface\n"); @@ -343,7 +344,7 @@ uec_get_drvinfo(struct net_device *netdev, static void uec_get_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) { struct ucc_geth_private *ugeth = netdev_priv(netdev); - struct phy_device *phydev = ugeth->phydev; + struct phy_device *phydev = netdev->phydev; if (phydev && phydev->irq) wol->supported |= WAKE_PHY; @@ -356,7 +357,7 @@ static void uec_get_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) static int uec_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) { struct ucc_geth_private *ugeth = netdev_priv(netdev); - struct phy_device *phydev = ugeth->phydev; + struct phy_device *phydev = netdev->phydev; if (wol->wolopts & ~(WAKE_PHY | WAKE_MAGIC)) return -EINVAL; From d2adc441a19a592ce104e2b257ad9d002eaec53f Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Tue, 3 Dec 2024 13:43:15 +0100 Subject: [PATCH 0219/1450] net: freescale: ucc_geth: Fix WOL configuration The get/set_wol ethtool ops rely on querying the PHY for its WoL capabilities, checking for the presence of a PHY and a PHY interrupts isn't enough. Address that by cleaning up the WoL configuration sequence. Signed-off-by: Maxime Chevallier Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/ucc_geth.c | 4 +-- drivers/net/ethernet/freescale/ucc_geth.h | 1 + .../net/ethernet/freescale/ucc_geth_ethtool.c | 36 +++++++++++++++---- 3 files changed, 32 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c index cc5f9ca42a780..587bcbc079dae 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.c +++ b/drivers/net/ethernet/freescale/ucc_geth.c @@ -3413,11 +3413,11 @@ static int ucc_geth_suspend(struct platform_device *ofdev, pm_message_t state) */ ugeth_disable(ugeth, COMM_DIR_RX_AND_TX); - if (ugeth->wol_en & WAKE_MAGIC) { + if (ugeth->wol_en & WAKE_MAGIC && !ugeth->phy_wol_en) { setbits32(ugeth->uccf->p_uccm, UCC_GETH_UCCE_MPD); setbits32(&ugeth->ug_regs->maccfg2, MACCFG2_MPE); ucc_fast_enable(ugeth->uccf, COMM_DIR_RX_AND_TX); - } else if (!(ugeth->wol_en & WAKE_PHY)) { + } else if (!ugeth->phy_wol_en) { phy_stop(ndev->phydev); } diff --git a/drivers/net/ethernet/freescale/ucc_geth.h b/drivers/net/ethernet/freescale/ucc_geth.h index c08a56b7c9fe3..e08cfc8d8904b 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.h +++ b/drivers/net/ethernet/freescale/ucc_geth.h @@ -1217,6 +1217,7 @@ struct ucc_geth_private { int oldduplex; int oldlink; int wol_en; + u32 phy_wol_en; struct device_node *node; }; diff --git a/drivers/net/ethernet/freescale/ucc_geth_ethtool.c b/drivers/net/ethernet/freescale/ucc_geth_ethtool.c index fb5254d7d1ba3..89b323ef81450 100644 --- a/drivers/net/ethernet/freescale/ucc_geth_ethtool.c +++ b/drivers/net/ethernet/freescale/ucc_geth_ethtool.c @@ -346,26 +346,48 @@ static void uec_get_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) struct ucc_geth_private *ugeth = netdev_priv(netdev); struct phy_device *phydev = netdev->phydev; - if (phydev && phydev->irq) - wol->supported |= WAKE_PHY; + wol->supported = 0; + wol->wolopts = 0; + + if (phydev) + phy_ethtool_get_wol(phydev, wol); + if (qe_alive_during_sleep()) wol->supported |= WAKE_MAGIC; - wol->wolopts = ugeth->wol_en; + wol->wolopts |= ugeth->wol_en; } static int uec_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) { struct ucc_geth_private *ugeth = netdev_priv(netdev); struct phy_device *phydev = netdev->phydev; + int ret = 0; - if (wol->wolopts & ~(WAKE_PHY | WAKE_MAGIC)) - return -EINVAL; - else if (wol->wolopts & WAKE_PHY && (!phydev || !phydev->irq)) + if (phydev) { + ret = phy_ethtool_set_wol(phydev, wol); + if (ret == -EOPNOTSUPP) { + ugeth->phy_wol_en = 0; + } else if (ret) { + return ret; + } else { + ugeth->phy_wol_en = wol->wolopts; + goto out; + } + } + + /* If the PHY isn't handling the WoL and the MAC is asked to more than + * WAKE_MAGIC, error-out + */ + if (!ugeth->phy_wol_en && + wol->wolopts & ~WAKE_MAGIC) return -EINVAL; - else if (wol->wolopts & WAKE_MAGIC && !qe_alive_during_sleep()) + + if (wol->wolopts & WAKE_MAGIC && + !qe_alive_during_sleep()) return -EINVAL; +out: ugeth->wol_en = wol->wolopts; device_set_wakeup_enable(&netdev->dev, ugeth->wol_en); From 420d56e4de5247b78fa1e1d5084f246e547e95a9 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Tue, 3 Dec 2024 13:43:16 +0100 Subject: [PATCH 0220/1450] net: freescale: ucc_geth: Use the correct type to store WoL opts The WoL opts are represented through a bitmask stored in a u32. As this mask is copied as-is in the driver, make sure we use the exact same type to store them internally. Signed-off-by: Maxime Chevallier Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/ucc_geth.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/ucc_geth.h b/drivers/net/ethernet/freescale/ucc_geth.h index e08cfc8d8904b..60fd804a616a3 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.h +++ b/drivers/net/ethernet/freescale/ucc_geth.h @@ -1216,7 +1216,7 @@ struct ucc_geth_private { int oldspeed; int oldduplex; int oldlink; - int wol_en; + u32 wol_en; u32 phy_wol_en; struct device_node *node; From 270ec339126a09564fab67209da8330a20fe446f Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Tue, 3 Dec 2024 13:43:17 +0100 Subject: [PATCH 0221/1450] net: freescale: ucc_geth: Simplify frame length check The frame length check is configured when the phy interface is setup. However, it's configured according to an internal flag that is always false. So, just make so that we disable the relevant bit in the MACCFG2 register upon accessing it for other MAC configuration operations. Reviewed-by: Andrew Lunn Signed-off-by: Maxime Chevallier Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/ucc_geth.c | 21 +++------------------ drivers/net/ethernet/freescale/ucc_geth.h | 1 - 2 files changed, 3 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c index 587bcbc079dae..566f53e24d286 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.c +++ b/drivers/net/ethernet/freescale/ucc_geth.c @@ -1205,22 +1205,6 @@ static int init_mac_station_addr_regs(u8 address_byte_0, return 0; } -static int init_check_frame_length_mode(int length_check, - u32 __iomem *maccfg2_register) -{ - u32 value = 0; - - value = in_be32(maccfg2_register); - - if (length_check) - value |= MACCFG2_LC; - else - value &= ~MACCFG2_LC; - - out_be32(maccfg2_register, value); - return 0; -} - static int init_preamble_length(u8 preamble_length, u32 __iomem *maccfg2_register) { @@ -1304,6 +1288,9 @@ static int adjust_enet_interface(struct ucc_geth_private *ugeth) /* Set MACCFG2 */ maccfg2 = in_be32(&ug_regs->maccfg2); + + /* Disable frame length check */ + maccfg2 &= ~MACCFG2_LC; maccfg2 &= ~MACCFG2_INTERFACE_MODE_MASK; if ((ugeth->max_speed == SPEED_10) || (ugeth->max_speed == SPEED_100)) @@ -1365,8 +1352,6 @@ static int adjust_enet_interface(struct ucc_geth_private *ugeth) put_device(&tbiphy->mdio.dev); } - init_check_frame_length_mode(ug_info->lengthCheckRx, &ug_regs->maccfg2); - ret_val = init_preamble_length(ug_info->prel, &ug_regs->maccfg2); if (ret_val != 0) { if (netif_msg_probe(ugeth)) diff --git a/drivers/net/ethernet/freescale/ucc_geth.h b/drivers/net/ethernet/freescale/ucc_geth.h index 60fd804a616a3..2365b61c743ac 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.h +++ b/drivers/net/ethernet/freescale/ucc_geth.h @@ -1088,7 +1088,6 @@ struct ucc_geth_info { u8 miminumInterFrameGapEnforcement; u8 backToBackInterFrameGap; int ipAddressAlignment; - int lengthCheckRx; u32 mblinterval; u16 nortsrbytetime; u8 fracsiz; From dba25f75383fd8c2fe6f0390aa7f7a3b0dd72a63 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Tue, 3 Dec 2024 13:43:18 +0100 Subject: [PATCH 0222/1450] net: freescale: ucc_geth: Hardcode the preamble length to 7 bytes The preamble length can be configured in ucc_geth, however it just ends-up always being configured to 7 bytes, as nothing ever changes the default value of 7. Make that value the default value when the MACCFG2 register gets initialized, and remove the code to configure that value altogether. Reviewed-by: Andrew Lunn Signed-off-by: Maxime Chevallier Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/ucc_geth.c | 21 --------------------- drivers/net/ethernet/freescale/ucc_geth.h | 4 ++-- 2 files changed, 2 insertions(+), 23 deletions(-) diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c index 566f53e24d286..81aefe291d807 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.c +++ b/drivers/net/ethernet/freescale/ucc_geth.c @@ -132,7 +132,6 @@ static const struct ucc_geth_info ugeth_primary_info = { .transmitFlowControl = 1, .maxGroupAddrInHash = 4, .maxIndAddrInHash = 4, - .prel = 7, .maxFrameLength = 1518+16, /* Add extra bytes for VLANs etc. */ .minFrameLength = 64, .maxD1Length = 1520+16, /* Add extra bytes for VLANs etc. */ @@ -1205,18 +1204,6 @@ static int init_mac_station_addr_regs(u8 address_byte_0, return 0; } -static int init_preamble_length(u8 preamble_length, - u32 __iomem *maccfg2_register) -{ - if ((preamble_length < 3) || (preamble_length > 7)) - return -EINVAL; - - clrsetbits_be32(maccfg2_register, MACCFG2_PREL_MASK, - preamble_length << MACCFG2_PREL_SHIFT); - - return 0; -} - static int init_rx_parameters(int reject_broadcast, int receive_short_frames, int promiscuous, u32 __iomem *upsmr_register) @@ -1276,7 +1263,6 @@ static int adjust_enet_interface(struct ucc_geth_private *ugeth) struct ucc_geth_info *ug_info; struct ucc_geth __iomem *ug_regs; struct ucc_fast __iomem *uf_regs; - int ret_val; u32 upsmr, maccfg2; u16 value; @@ -1352,13 +1338,6 @@ static int adjust_enet_interface(struct ucc_geth_private *ugeth) put_device(&tbiphy->mdio.dev); } - ret_val = init_preamble_length(ug_info->prel, &ug_regs->maccfg2); - if (ret_val != 0) { - if (netif_msg_probe(ugeth)) - pr_err("Preamble length must be between 3 and 7 inclusive\n"); - return ret_val; - } - return 0; } diff --git a/drivers/net/ethernet/freescale/ucc_geth.h b/drivers/net/ethernet/freescale/ucc_geth.h index 2365b61c743ac..dfb7273270931 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.h +++ b/drivers/net/ethernet/freescale/ucc_geth.h @@ -921,7 +921,8 @@ struct ucc_geth_hardware_statistics { #define UCC_GETH_UPSMR_INIT UCC_GETH_UPSMR_RES1 #define UCC_GETH_MACCFG1_INIT 0 -#define UCC_GETH_MACCFG2_INIT (MACCFG2_RESERVED_1) +#define UCC_GETH_MACCFG2_INIT (MACCFG2_RESERVED_1 | \ + (7 << MACCFG2_PREL_SHIFT)) /* Ethernet Address Type. */ enum enet_addr_type { @@ -1113,7 +1114,6 @@ struct ucc_geth_info { int transmitFlowControl; u8 maxGroupAddrInHash; u8 maxIndAddrInHash; - u8 prel; u16 maxFrameLength; u16 minFrameLength; u16 maxD1Length; From efc52055b756a231b2c4c6fdec4369c8903afa1e Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Tue, 3 Dec 2024 13:43:19 +0100 Subject: [PATCH 0223/1450] net: freescale: ucc_geth: Move the serdes configuration around The uec_configure_serdes() function deals with serialized linkmodes settings. It's used during the link bringup sequence. It is planned to be used during the phylink conversion for mac configuration, but it needs to me moved around in the process. To make the phylink port clearer, this commit moves the function without any feature change. Signed-off-by: Maxime Chevallier Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/ucc_geth.c | 93 +++++++++++------------ 1 file changed, 46 insertions(+), 47 deletions(-) diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c index 81aefe291d807..f6dd36dc03fea 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.c +++ b/drivers/net/ethernet/freescale/ucc_geth.c @@ -1512,6 +1512,52 @@ static void ugeth_activate(struct ucc_geth_private *ugeth) __netdev_watchdog_up(ugeth->ndev); } +/* Initialize TBI PHY interface for communicating with the + * SERDES lynx PHY on the chip. We communicate with this PHY + * through the MDIO bus on each controller, treating it as a + * "normal" PHY at the address found in the UTBIPA register. We assume + * that the UTBIPA register is valid. Either the MDIO bus code will set + * it to a value that doesn't conflict with other PHYs on the bus, or the + * value doesn't matter, as there are no other PHYs on the bus. + */ +static void uec_configure_serdes(struct net_device *dev) +{ + struct ucc_geth_private *ugeth = netdev_priv(dev); + struct ucc_geth_info *ug_info = ugeth->ug_info; + struct phy_device *tbiphy; + + if (!ug_info->tbi_node) { + dev_warn(&dev->dev, "SGMII mode requires that the device tree specify a tbi-handle\n"); + return; + } + + tbiphy = of_phy_find_device(ug_info->tbi_node); + if (!tbiphy) { + dev_err(&dev->dev, "error: Could not get TBI device\n"); + return; + } + + /* + * If the link is already up, we must already be ok, and don't need to + * configure and reset the TBI<->SerDes link. Maybe U-Boot configured + * everything for us? Resetting it takes the link down and requires + * several seconds for it to come back. + */ + if (phy_read(tbiphy, ENET_TBI_MII_SR) & TBISR_LSTATUS) { + put_device(&tbiphy->mdio.dev); + return; + } + + /* Single clk mode, mii mode off(for serdes communication) */ + phy_write(tbiphy, ENET_TBI_MII_ANA, TBIANA_SETTINGS); + + phy_write(tbiphy, ENET_TBI_MII_TBICON, TBICON_CLK_SELECT); + + phy_write(tbiphy, ENET_TBI_MII_CR, TBICR_SETTINGS); + + put_device(&tbiphy->mdio.dev); +} + static void ugeth_link_up(struct ucc_geth_private *ugeth, struct phy_device *phy, phy_interface_t interface, int speed, int duplex) @@ -1619,53 +1665,6 @@ static void adjust_link(struct net_device *dev) ugeth_link_down(ugeth); } -/* Initialize TBI PHY interface for communicating with the - * SERDES lynx PHY on the chip. We communicate with this PHY - * through the MDIO bus on each controller, treating it as a - * "normal" PHY at the address found in the UTBIPA register. We assume - * that the UTBIPA register is valid. Either the MDIO bus code will set - * it to a value that doesn't conflict with other PHYs on the bus, or the - * value doesn't matter, as there are no other PHYs on the bus. - */ -static void uec_configure_serdes(struct net_device *dev) -{ - struct ucc_geth_private *ugeth = netdev_priv(dev); - struct ucc_geth_info *ug_info = ugeth->ug_info; - struct phy_device *tbiphy; - - if (!ug_info->tbi_node) { - dev_warn(&dev->dev, "SGMII mode requires that the device " - "tree specify a tbi-handle\n"); - return; - } - - tbiphy = of_phy_find_device(ug_info->tbi_node); - if (!tbiphy) { - dev_err(&dev->dev, "error: Could not get TBI device\n"); - return; - } - - /* - * If the link is already up, we must already be ok, and don't need to - * configure and reset the TBI<->SerDes link. Maybe U-Boot configured - * everything for us? Resetting it takes the link down and requires - * several seconds for it to come back. - */ - if (phy_read(tbiphy, ENET_TBI_MII_SR) & TBISR_LSTATUS) { - put_device(&tbiphy->mdio.dev); - return; - } - - /* Single clk mode, mii mode off(for serdes communication) */ - phy_write(tbiphy, ENET_TBI_MII_ANA, TBIANA_SETTINGS); - - phy_write(tbiphy, ENET_TBI_MII_TBICON, TBICON_CLK_SELECT); - - phy_write(tbiphy, ENET_TBI_MII_CR, TBICR_SETTINGS); - - put_device(&tbiphy->mdio.dev); -} - /* Configure the PHY for dev. * returns 0 if success. -1 if failure */ From 02d4a6498b3028f64f93ac61c0ff346e8ab661e0 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Tue, 3 Dec 2024 13:43:20 +0100 Subject: [PATCH 0224/1450] net: freescale: ucc_geth: Introduce a helper to check Reduced modes A number of parallel MII interfaces also exist in a "Reduced" mode, usually with higher clock rates and fewer data lines, to ease the hardware design. This is what the 'R' stands for in RGMII, RMII, RTBI, RXAUI, etc. The UCC Geth controller has a special configuration bit that needs to be set when the MII mode is one of the supported reduced modes. Add a local helper for that. Signed-off-by: Maxime Chevallier Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/ucc_geth.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c index f6dd36dc03fea..57debcba124c9 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.c +++ b/drivers/net/ethernet/freescale/ucc_geth.c @@ -1258,6 +1258,13 @@ static int init_min_frame_len(u16 min_frame_length, return 0; } +static bool phy_interface_mode_is_reduced(phy_interface_t interface) +{ + return phy_interface_mode_is_rgmii(interface) || + interface == PHY_INTERFACE_MODE_RMII || + interface == PHY_INTERFACE_MODE_RTBI; +} + static int adjust_enet_interface(struct ucc_geth_private *ugeth) { struct ucc_geth_info *ug_info; @@ -1290,12 +1297,7 @@ static int adjust_enet_interface(struct ucc_geth_private *ugeth) upsmr = in_be32(&uf_regs->upsmr); upsmr &= ~(UCC_GETH_UPSMR_RPM | UCC_GETH_UPSMR_R10M | UCC_GETH_UPSMR_TBIM | UCC_GETH_UPSMR_RMM); - if ((ugeth->phy_interface == PHY_INTERFACE_MODE_RMII) || - (ugeth->phy_interface == PHY_INTERFACE_MODE_RGMII) || - (ugeth->phy_interface == PHY_INTERFACE_MODE_RGMII_ID) || - (ugeth->phy_interface == PHY_INTERFACE_MODE_RGMII_RXID) || - (ugeth->phy_interface == PHY_INTERFACE_MODE_RGMII_TXID) || - (ugeth->phy_interface == PHY_INTERFACE_MODE_RTBI)) { + if (phy_interface_mode_is_reduced(ugeth->phy_interface)) { if (ugeth->phy_interface != PHY_INTERFACE_MODE_RMII) upsmr |= UCC_GETH_UPSMR_RPM; switch (ugeth->max_speed) { @@ -1594,9 +1596,7 @@ static void ugeth_link_up(struct ucc_geth_private *ugeth, ~(MACCFG2_INTERFACE_MODE_MASK)) | MACCFG2_INTERFACE_MODE_NIBBLE); /* if reduced mode, re-set UPSMR.R10M */ - if (interface == PHY_INTERFACE_MODE_RMII || - phy_interface_mode_is_rgmii(interface) || - interface == PHY_INTERFACE_MODE_RTBI) { + if (phy_interface_mode_is_reduced(interface)) { if (speed == SPEED_10) upsmr |= UCC_GETH_UPSMR_R10M; else From 53036aa8d03178a8d056a24a52a301ad290877d4 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Tue, 3 Dec 2024 13:43:21 +0100 Subject: [PATCH 0225/1450] net: freescale: ucc_geth: phylink conversion ucc_geth is quite capable in terms of supported interfaces, and even includes an externally controlled PCS (well, TBI). Port that driver to phylink. Signed-off-by: Maxime Chevallier Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/Kconfig | 3 +- drivers/net/ethernet/freescale/ucc_geth.c | 445 ++++++++---------- drivers/net/ethernet/freescale/ucc_geth.h | 13 +- .../net/ethernet/freescale/ucc_geth_ethtool.c | 73 +-- 4 files changed, 209 insertions(+), 325 deletions(-) diff --git a/drivers/net/ethernet/freescale/Kconfig b/drivers/net/ethernet/freescale/Kconfig index 75401d2a5fb49..a2d7300925a82 100644 --- a/drivers/net/ethernet/freescale/Kconfig +++ b/drivers/net/ethernet/freescale/Kconfig @@ -81,8 +81,7 @@ config UCC_GETH tristate "Freescale QE Gigabit Ethernet" depends on QUICC_ENGINE && PPC32 select FSL_PQ_MDIO - select PHYLIB - select FIXED_PHY + select PHYLINK help This driver supports the Gigabit Ethernet mode of the QUICC Engine, which is available on some Freescale SOCs. diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c index 57debcba124c9..f47f8177a93b4 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.c +++ b/drivers/net/ethernet/freescale/ucc_geth.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -1265,84 +1266,6 @@ static bool phy_interface_mode_is_reduced(phy_interface_t interface) interface == PHY_INTERFACE_MODE_RTBI; } -static int adjust_enet_interface(struct ucc_geth_private *ugeth) -{ - struct ucc_geth_info *ug_info; - struct ucc_geth __iomem *ug_regs; - struct ucc_fast __iomem *uf_regs; - u32 upsmr, maccfg2; - u16 value; - - ugeth_vdbg("%s: IN", __func__); - - ug_info = ugeth->ug_info; - ug_regs = ugeth->ug_regs; - uf_regs = ugeth->uccf->uf_regs; - - /* Set MACCFG2 */ - maccfg2 = in_be32(&ug_regs->maccfg2); - - /* Disable frame length check */ - maccfg2 &= ~MACCFG2_LC; - maccfg2 &= ~MACCFG2_INTERFACE_MODE_MASK; - if ((ugeth->max_speed == SPEED_10) || - (ugeth->max_speed == SPEED_100)) - maccfg2 |= MACCFG2_INTERFACE_MODE_NIBBLE; - else if (ugeth->max_speed == SPEED_1000) - maccfg2 |= MACCFG2_INTERFACE_MODE_BYTE; - maccfg2 |= ug_info->padAndCrc; - out_be32(&ug_regs->maccfg2, maccfg2); - - /* Set UPSMR */ - upsmr = in_be32(&uf_regs->upsmr); - upsmr &= ~(UCC_GETH_UPSMR_RPM | UCC_GETH_UPSMR_R10M | - UCC_GETH_UPSMR_TBIM | UCC_GETH_UPSMR_RMM); - if (phy_interface_mode_is_reduced(ugeth->phy_interface)) { - if (ugeth->phy_interface != PHY_INTERFACE_MODE_RMII) - upsmr |= UCC_GETH_UPSMR_RPM; - switch (ugeth->max_speed) { - case SPEED_10: - upsmr |= UCC_GETH_UPSMR_R10M; - fallthrough; - case SPEED_100: - if (ugeth->phy_interface != PHY_INTERFACE_MODE_RTBI) - upsmr |= UCC_GETH_UPSMR_RMM; - } - } - if ((ugeth->phy_interface == PHY_INTERFACE_MODE_TBI) || - (ugeth->phy_interface == PHY_INTERFACE_MODE_RTBI)) { - upsmr |= UCC_GETH_UPSMR_TBIM; - } - if (ugeth->phy_interface == PHY_INTERFACE_MODE_SGMII) - upsmr |= UCC_GETH_UPSMR_SGMM; - - out_be32(&uf_regs->upsmr, upsmr); - - /* Disable autonegotiation in tbi mode, because by default it - comes up in autonegotiation mode. */ - /* Note that this depends on proper setting in utbipar register. */ - if ((ugeth->phy_interface == PHY_INTERFACE_MODE_TBI) || - (ugeth->phy_interface == PHY_INTERFACE_MODE_RTBI)) { - struct ucc_geth_info *ug_info = ugeth->ug_info; - struct phy_device *tbiphy; - - if (!ug_info->tbi_node) - pr_warn("TBI mode requires that the device tree specify a tbi-handle\n"); - - tbiphy = of_phy_find_device(ug_info->tbi_node); - if (!tbiphy) - pr_warn("Could not get TBI device\n"); - - value = phy_read(tbiphy, ENET_TBI_MII_CR); - value &= ~0x1000; /* Turn off autonegotiation */ - phy_write(tbiphy, ENET_TBI_MII_CR, value); - - put_device(&tbiphy->mdio.dev); - } - - return 0; -} - static int ugeth_graceful_stop_tx(struct ucc_geth_private *ugeth) { struct ucc_fast_private *uccf; @@ -1560,64 +1483,62 @@ static void uec_configure_serdes(struct net_device *dev) put_device(&tbiphy->mdio.dev); } -static void ugeth_link_up(struct ucc_geth_private *ugeth, - struct phy_device *phy, - phy_interface_t interface, int speed, int duplex) +static void ugeth_mac_link_up(struct phylink_config *config, struct phy_device *phy, + unsigned int mode, phy_interface_t interface, + int speed, int duplex, bool tx_pause, bool rx_pause) { + struct net_device *ndev = to_net_dev(config->dev); + struct ucc_geth_private *ugeth = netdev_priv(ndev); + struct ucc_geth_info *ug_info = ugeth->ug_info; struct ucc_geth __iomem *ug_regs = ugeth->ug_regs; struct ucc_fast __iomem *uf_regs = ugeth->uccf->uf_regs; - u32 tempval = in_be32(&ug_regs->maccfg2); - u32 upsmr = in_be32(&uf_regs->upsmr); - int new_state = 0; + u32 old_maccfg2, maccfg2 = in_be32(&ug_regs->maccfg2); + u32 old_upsmr, upsmr = in_be32(&uf_regs->upsmr); - /* Now we make sure that we can be in full duplex mode. - * If not, we operate in half-duplex mode. - */ - if (duplex != ugeth->oldduplex) { - new_state = 1; - if (duplex == DUPLEX_HALF) - tempval &= ~(MACCFG2_FDX); - else - tempval |= MACCFG2_FDX; - ugeth->oldduplex = duplex; - } + old_maccfg2 = maccfg2; + old_upsmr = upsmr; + + /* No length check */ + maccfg2 &= ~MACCFG2_LC; + maccfg2 &= ~MACCFG2_INTERFACE_MODE_MASK; + upsmr &= ~(UCC_GETH_UPSMR_RPM | UCC_GETH_UPSMR_R10M | + UCC_GETH_UPSMR_TBIM | UCC_GETH_UPSMR_RMM); + + if (speed == SPEED_10 || speed == SPEED_100) + maccfg2 |= MACCFG2_INTERFACE_MODE_NIBBLE; + else if (speed == SPEED_1000) + maccfg2 |= MACCFG2_INTERFACE_MODE_BYTE; + + maccfg2 |= ug_info->padAndCrc; + + if (phy_interface_mode_is_reduced(interface)) { + + if (interface != PHY_INTERFACE_MODE_RMII) + upsmr |= UCC_GETH_UPSMR_RPM; - if (speed != ugeth->oldspeed) { - new_state = 1; switch (speed) { - case SPEED_1000: - tempval = ((tempval & - ~(MACCFG2_INTERFACE_MODE_MASK)) | - MACCFG2_INTERFACE_MODE_BYTE); - break; - case SPEED_100: case SPEED_10: - tempval = ((tempval & - ~(MACCFG2_INTERFACE_MODE_MASK)) | - MACCFG2_INTERFACE_MODE_NIBBLE); - /* if reduced mode, re-set UPSMR.R10M */ - if (phy_interface_mode_is_reduced(interface)) { - if (speed == SPEED_10) - upsmr |= UCC_GETH_UPSMR_R10M; - else - upsmr &= ~UCC_GETH_UPSMR_R10M; - } - break; - default: - if (netif_msg_link(ugeth)) - pr_warn("%s: Speed (%d) is not 10/100/1000!", - netdev_name(ugeth->ndev), speed); - break; + upsmr |= UCC_GETH_UPSMR_R10M; + fallthrough; + case SPEED_100: + if (interface != PHY_INTERFACE_MODE_RTBI) + upsmr |= UCC_GETH_UPSMR_RMM; } - ugeth->oldspeed = speed; } - if (!ugeth->oldlink) { - new_state = 1; - ugeth->oldlink = 1; - } + if (interface == PHY_INTERFACE_MODE_TBI || + interface == PHY_INTERFACE_MODE_RTBI) + upsmr |= UCC_GETH_UPSMR_TBIM; + + if (interface == PHY_INTERFACE_MODE_SGMII) + upsmr |= UCC_GETH_UPSMR_SGMM; - if (new_state) { + if (duplex == DUPLEX_HALF) + maccfg2 &= ~(MACCFG2_FDX); + else + maccfg2 |= MACCFG2_FDX; + + if (maccfg2 != old_maccfg2 || upsmr != old_upsmr) { /* * To change the MAC configuration we need to disable * the controller. To do so, we have to either grab @@ -1628,69 +1549,79 @@ static void ugeth_link_up(struct ucc_geth_private *ugeth, ugeth_quiesce(ugeth); ugeth_disable(ugeth, COMM_DIR_RX_AND_TX); - out_be32(&ug_regs->maccfg2, tempval); + out_be32(&ug_regs->maccfg2, maccfg2); out_be32(&uf_regs->upsmr, upsmr); ugeth_enable(ugeth, COMM_DIR_RX_AND_TX); ugeth_activate(ugeth); } - if (netif_msg_link(ugeth)) - phy_print_status(phy); -} + if (interface == PHY_INTERFACE_MODE_SGMII) + uec_configure_serdes(ndev); -static void ugeth_link_down(struct ucc_geth_private *ugeth) -{ - ugeth->oldlink = 0; - ugeth->oldspeed = 0; - ugeth->oldduplex = -1; -} + if (!phylink_autoneg_inband(mode)) { + ug_info->aufc = 0; + ug_info->receiveFlowControl = rx_pause; + ug_info->transmitFlowControl = tx_pause; -/* Called every time the controller might need to be made - * aware of new link state. The PHY code conveys this - * information through variables in the ugeth structure, and this - * function converts those variables into the appropriate - * register values, and can bring down the device if needed. - */ + init_flow_control_params(ug_info->aufc, + ug_info->receiveFlowControl, + ug_info->transmitFlowControl, + ug_info->pausePeriod, + ug_info->extensionField, + &ugeth->uccf->uf_regs->upsmr, + &ugeth->ug_regs->uempr, + &ugeth->ug_regs->maccfg1); + } -static void adjust_link(struct net_device *dev) + ugeth_enable(ugeth, COMM_DIR_RX_AND_TX); +} + +static void ugeth_mac_link_down(struct phylink_config *config, + unsigned int mode, phy_interface_t interface) { - struct ucc_geth_private *ugeth = netdev_priv(dev); - struct phy_device *phydev = dev->phydev; + struct net_device *ndev = to_net_dev(config->dev); + struct ucc_geth_private *ugeth = netdev_priv(ndev); - if (phydev->link) - ugeth_link_up(ugeth, phydev, phydev->interface, - phydev->speed, phydev->duplex); - else - ugeth_link_down(ugeth); + ugeth_disable(ugeth, COMM_DIR_RX_AND_TX); } -/* Configure the PHY for dev. - * returns 0 if success. -1 if failure - */ -static int init_phy(struct net_device *dev) +static void ugeth_mac_config(struct phylink_config *config, unsigned int mode, + const struct phylink_link_state *state) { - struct ucc_geth_private *priv = netdev_priv(dev); - struct ucc_geth_info *ug_info = priv->ug_info; - struct phy_device *phydev; + struct net_device *ndev = to_net_dev(config->dev); + struct ucc_geth_private *ugeth = netdev_priv(ndev); + struct ucc_geth_info *ug_info = ugeth->ug_info; + u16 value; - priv->oldlink = 0; - priv->oldspeed = 0; - priv->oldduplex = -1; + if (state->interface == PHY_INTERFACE_MODE_TBI || + state->interface == PHY_INTERFACE_MODE_RTBI) { + struct phy_device *tbiphy; - phydev = of_phy_connect(dev, ug_info->phy_node, &adjust_link, 0, - priv->phy_interface); - if (!phydev) { - dev_err(&dev->dev, "Could not attach to PHY\n"); - return -ENODEV; - } + if (!ug_info->tbi_node) + pr_warn("TBI mode requires that the device tree specify a tbi-handle\n"); + + tbiphy = of_phy_find_device(ug_info->tbi_node); + if (!tbiphy) + pr_warn("Could not get TBI device\n"); + + value = phy_read(tbiphy, ENET_TBI_MII_CR); + value &= ~0x1000; /* Turn off autonegotiation */ + phy_write(tbiphy, ENET_TBI_MII_CR, value); - if (priv->phy_interface == PHY_INTERFACE_MODE_SGMII) - uec_configure_serdes(dev); + put_device(&tbiphy->mdio.dev); + } - phy_set_max_speed(phydev, priv->max_speed); + if (phylink_autoneg_inband(mode)) { + ug_info->aufc = 1; - return 0; + init_flow_control_params(ug_info->aufc, 1, 1, + ug_info->pausePeriod, + ug_info->extensionField, + &ugeth->uccf->uf_regs->upsmr, + &ugeth->ug_regs->uempr, + &ugeth->ug_regs->maccfg1); + } } static void ugeth_dump_regs(struct ucc_geth_private *ugeth) @@ -1962,7 +1893,6 @@ static void ucc_geth_set_multi(struct net_device *dev) static void ucc_geth_stop(struct ucc_geth_private *ugeth) { struct ucc_geth __iomem *ug_regs = ugeth->ug_regs; - struct phy_device *phydev = ugeth->ndev->phydev; ugeth_vdbg("%s: IN", __func__); @@ -1971,7 +1901,7 @@ static void ucc_geth_stop(struct ucc_geth_private *ugeth) * Must be done before disabling the controller * or deadlock may happen. */ - phy_stop(phydev); + phylink_stop(ugeth->phylink); /* Disable the controller */ ugeth_disable(ugeth, COMM_DIR_RX_AND_TX); @@ -3213,12 +3143,6 @@ static int ucc_geth_init_mac(struct ucc_geth_private *ugeth) goto err; } - err = adjust_enet_interface(ugeth); - if (err) { - netif_err(ugeth, ifup, dev, "Cannot configure net device, aborting\n"); - goto err; - } - /* Set MACSTNADDR1, MACSTNADDR2 */ /* For more details see the hardware spec. */ init_mac_station_addr_regs(dev->dev_addr[0], @@ -3230,12 +3154,6 @@ static int ucc_geth_init_mac(struct ucc_geth_private *ugeth) &ugeth->ug_regs->macstnaddr1, &ugeth->ug_regs->macstnaddr2); - err = ugeth_enable(ugeth, COMM_DIR_RX_AND_TX); - if (err) { - netif_err(ugeth, ifup, dev, "Cannot enable net device, aborting\n"); - goto err; - } - return 0; err: ucc_geth_stop(ugeth); @@ -3258,10 +3176,10 @@ static int ucc_geth_open(struct net_device *dev) return -EINVAL; } - err = init_phy(dev); + err = phylink_of_phy_connect(ugeth->phylink, ugeth->dev->of_node, 0); if (err) { - netif_err(ugeth, ifup, dev, "Cannot initialize PHY, aborting\n"); - return err; + dev_err(&dev->dev, "Could not attach to PHY\n"); + return -ENODEV; } err = ucc_geth_init_mac(ugeth); @@ -3277,7 +3195,7 @@ static int ucc_geth_open(struct net_device *dev) goto err; } - phy_start(dev->phydev); + phylink_start(ugeth->phylink); napi_enable(&ugeth->napi); netdev_reset_queue(dev); netif_start_queue(dev); @@ -3304,7 +3222,7 @@ static int ucc_geth_close(struct net_device *dev) cancel_work_sync(&ugeth->timeout_work); ucc_geth_stop(ugeth); - phy_disconnect(dev->phydev); + phylink_disconnect_phy(ugeth->phylink); free_irq(ugeth->ug_info->uf_info.irq, ugeth->ndev); @@ -3338,7 +3256,7 @@ static void ucc_geth_timeout_work(struct work_struct *work) ucc_geth_stop(ugeth); ucc_geth_init_mac(ugeth); /* Must start PHY here */ - phy_start(dev->phydev); + phylink_start(ugeth->phylink); netif_tx_start_all_queues(dev); } @@ -3363,6 +3281,7 @@ static int ucc_geth_suspend(struct platform_device *ofdev, pm_message_t state) { struct net_device *ndev = platform_get_drvdata(ofdev); struct ucc_geth_private *ugeth = netdev_priv(ndev); + bool mac_wol = false; if (!netif_running(ndev)) return 0; @@ -3380,10 +3299,13 @@ static int ucc_geth_suspend(struct platform_device *ofdev, pm_message_t state) setbits32(ugeth->uccf->p_uccm, UCC_GETH_UCCE_MPD); setbits32(&ugeth->ug_regs->maccfg2, MACCFG2_MPE); ucc_fast_enable(ugeth->uccf, COMM_DIR_RX_AND_TX); - } else if (!ugeth->phy_wol_en) { - phy_stop(ndev->phydev); + mac_wol = true; } + rtnl_lock(); + phylink_suspend(ugeth->phylink, mac_wol); + rtnl_unlock(); + return 0; } @@ -3417,12 +3339,9 @@ static int ucc_geth_resume(struct platform_device *ofdev) } } - ugeth->oldlink = 0; - ugeth->oldspeed = 0; - ugeth->oldduplex = -1; - - phy_stop(ndev->phydev); - phy_start(ndev->phydev); + rtnl_lock(); + phylink_resume(ugeth->phylink); + rtnl_unlock(); napi_enable(&ugeth->napi); netif_device_attach(ndev); @@ -3437,13 +3356,12 @@ static int ucc_geth_resume(struct platform_device *ofdev) static int ucc_geth_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) { + struct ucc_geth_private *ugeth = netdev_priv(dev); + if (!netif_running(dev)) return -EINVAL; - if (!dev->phydev) - return -ENODEV; - - return phy_mii_ioctl(dev->phydev, rq, cmd); + return phylink_mii_ioctl(ugeth->phylink, rq, cmd); } static const struct net_device_ops ucc_geth_netdev_ops = { @@ -3451,7 +3369,6 @@ static const struct net_device_ops ucc_geth_netdev_ops = { .ndo_stop = ucc_geth_close, .ndo_start_xmit = ucc_geth_start_xmit, .ndo_validate_addr = eth_validate_addr, - .ndo_change_carrier = fixed_phy_change_carrier, .ndo_set_mac_address = ucc_geth_set_mac_addr, .ndo_set_rx_mode = ucc_geth_set_multi, .ndo_tx_timeout = ucc_geth_timeout, @@ -3491,6 +3408,12 @@ static int ucc_geth_parse_clock(struct device_node *np, const char *which, return 0; } +struct phylink_mac_ops ugeth_mac_ops = { + .mac_link_up = ugeth_mac_link_up, + .mac_link_down = ugeth_mac_link_down, + .mac_config = ugeth_mac_config, +}; + static int ucc_geth_probe(struct platform_device* ofdev) { struct device *device = &ofdev->dev; @@ -3498,8 +3421,10 @@ static int ucc_geth_probe(struct platform_device* ofdev) struct net_device *dev = NULL; struct ucc_geth_private *ugeth = NULL; struct ucc_geth_info *ug_info; + struct device_node *phy_node; + struct phylink *phylink; struct resource res; - int err, ucc_num, max_speed = 0; + int err, ucc_num; const unsigned int *prop; phy_interface_t phy_interface; @@ -3537,57 +3462,35 @@ static int ucc_geth_probe(struct platform_device* ofdev) ug_info->uf_info.regs = res.start; ug_info->uf_info.irq = irq_of_parse_and_map(np, 0); - ug_info->phy_node = of_parse_phandle(np, "phy-handle", 0); - if (!ug_info->phy_node && of_phy_is_fixed_link(np)) { - /* - * In the case of a fixed PHY, the DT node associated - * to the PHY is the Ethernet MAC DT node. - */ - err = of_phy_register_fixed_link(np); - if (err) - return err; - ug_info->phy_node = of_node_get(np); - } - /* Find the TBI PHY node. If it's not there, we don't support SGMII */ ug_info->tbi_node = of_parse_phandle(np, "tbi-handle", 0); - prop = of_get_property(ug_info->phy_node, "interface", NULL); - if (prop) { - dev_err(&ofdev->dev, - "Device-tree property 'interface' is no longer supported. Please use 'phy-connection-type' instead."); - err = -EINVAL; - goto err_deregister_fixed_link; + phy_node = of_parse_phandle(np, "phy-handle", 0); + if (phy_node) { + prop = of_get_property(phy_node, "interface", NULL); + if (prop) { + dev_err(&ofdev->dev, + "Device-tree property 'interface' is no longer supported. Please use 'phy-connection-type' instead."); + of_node_put(phy_node); + err = -EINVAL; + goto err_put_tbi; + } + of_node_put(phy_node); } err = of_get_phy_mode(np, &phy_interface); if (err) { dev_err(&ofdev->dev, "Invalid phy-connection-type"); - goto err_deregister_fixed_link; - } - - /* get speed, or derive from PHY interface */ - if (max_speed == 0) - switch (phy_interface) { - case PHY_INTERFACE_MODE_GMII: - case PHY_INTERFACE_MODE_RGMII: - case PHY_INTERFACE_MODE_RGMII_ID: - case PHY_INTERFACE_MODE_RGMII_RXID: - case PHY_INTERFACE_MODE_RGMII_TXID: - case PHY_INTERFACE_MODE_TBI: - case PHY_INTERFACE_MODE_RTBI: - case PHY_INTERFACE_MODE_SGMII: - max_speed = SPEED_1000; - break; - default: - max_speed = SPEED_100; - break; - } + goto err_put_tbi; + } - if (max_speed == SPEED_1000) { + if (phy_interface == PHY_INTERFACE_MODE_GMII || + phy_interface_mode_is_rgmii(phy_interface) || + phy_interface == PHY_INTERFACE_MODE_TBI || + phy_interface == PHY_INTERFACE_MODE_RTBI || + phy_interface == PHY_INTERFACE_MODE_SGMII) { unsigned int snums = qe_get_num_of_snums(); - /* configure muram FIFOs for gigabit operation */ ug_info->uf_info.urfs = UCC_GETH_URFS_GIGA_INIT; ug_info->uf_info.urfet = UCC_GETH_URFET_GIGA_INIT; ug_info->uf_info.urfset = UCC_GETH_URFSET_GIGA_INIT; @@ -3616,7 +3519,7 @@ static int ucc_geth_probe(struct platform_device* ofdev) dev = devm_alloc_etherdev(&ofdev->dev, sizeof(*ugeth)); if (!dev) { err = -ENOMEM; - goto err_deregister_fixed_link; + goto err_put_tbi; } ugeth = netdev_priv(dev); @@ -3643,23 +3546,50 @@ static int ucc_geth_probe(struct platform_device* ofdev) dev->max_mtu = 1518; ugeth->msg_enable = netif_msg_init(debug.msg_enable, UGETH_MSG_DEFAULT); - ugeth->phy_interface = phy_interface; - ugeth->max_speed = max_speed; - /* Carrier starts down, phylib will bring it up */ - netif_carrier_off(dev); + ugeth->phylink_config.dev = &dev->dev; + ugeth->phylink_config.type = PHYLINK_NETDEV; + + ugeth->phylink_config.mac_capabilities = + MAC_SYM_PAUSE | MAC_10 | MAC_100 | MAC_1000FD; + + __set_bit(PHY_INTERFACE_MODE_MII, + ugeth->phylink_config.supported_interfaces); + __set_bit(PHY_INTERFACE_MODE_RMII, + ugeth->phylink_config.supported_interfaces); + __set_bit(PHY_INTERFACE_MODE_GMII, + ugeth->phylink_config.supported_interfaces); + phy_interface_set_rgmii(ugeth->phylink_config.supported_interfaces); + + if (ug_info->tbi_node) { + __set_bit(PHY_INTERFACE_MODE_SGMII, + ugeth->phylink_config.supported_interfaces); + __set_bit(PHY_INTERFACE_MODE_TBI, + ugeth->phylink_config.supported_interfaces); + __set_bit(PHY_INTERFACE_MODE_RTBI, + ugeth->phylink_config.supported_interfaces); + } + + phylink = phylink_create(&ugeth->phylink_config, dev_fwnode(&dev->dev), + phy_interface, &ugeth_mac_ops); + if (IS_ERR(phylink)) { + err = PTR_ERR(phylink); + goto err_put_tbi; + } + + ugeth->phylink = phylink; err = devm_register_netdev(&ofdev->dev, dev); if (err) { if (netif_msg_probe(ugeth)) pr_err("%s: Cannot register net device, aborting\n", dev->name); - goto err_deregister_fixed_link; + goto err_destroy_phylink; } err = of_get_ethdev_address(np, dev); if (err == -EPROBE_DEFER) - goto err_deregister_fixed_link; + goto err_destroy_phylink; ugeth->ug_info = ug_info; ugeth->dev = device; @@ -3668,11 +3598,11 @@ static int ucc_geth_probe(struct platform_device* ofdev) return 0; -err_deregister_fixed_link: - if (of_phy_is_fixed_link(np)) - of_phy_deregister_fixed_link(np); +err_destroy_phylink: + phylink_destroy(phylink); +err_put_tbi: of_node_put(ug_info->tbi_node); - of_node_put(ug_info->phy_node); + return err; } @@ -3680,13 +3610,10 @@ static void ucc_geth_remove(struct platform_device* ofdev) { struct net_device *dev = platform_get_drvdata(ofdev); struct ucc_geth_private *ugeth = netdev_priv(dev); - struct device_node *np = ofdev->dev.of_node; ucc_geth_memclean(ugeth); - if (of_phy_is_fixed_link(np)) - of_phy_deregister_fixed_link(np); + phylink_destroy(ugeth->phylink); of_node_put(ugeth->ug_info->tbi_node); - of_node_put(ugeth->ug_info->phy_node); } static const struct of_device_id ucc_geth_match[] = { diff --git a/drivers/net/ethernet/freescale/ucc_geth.h b/drivers/net/ethernet/freescale/ucc_geth.h index dfb7273270931..38789faae706b 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.h +++ b/drivers/net/ethernet/freescale/ucc_geth.h @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -1074,6 +1075,9 @@ struct ucc_geth_tad_params { u16 vid; }; +struct phylink; +struct phylink_config; + /* GETH protocol initialization structure */ struct ucc_geth_info { struct ucc_fast_info uf_info; @@ -1124,7 +1128,6 @@ struct ucc_geth_info { u32 eventRegMask; u16 pausePeriod; u16 extensionField; - struct device_node *phy_node; struct device_node *tbi_node; u8 weightfactor[NUM_TX_QUEUES]; u8 interruptcoalescingmaxvalue[NUM_RX_QUEUES]; @@ -1209,15 +1212,13 @@ struct ucc_geth_private { u16 skb_dirtytx[NUM_TX_QUEUES]; struct ugeth_mii_info *mii_info; - phy_interface_t phy_interface; - int max_speed; uint32_t msg_enable; - int oldspeed; - int oldduplex; - int oldlink; u32 wol_en; u32 phy_wol_en; + struct phylink *phylink; + struct phylink_config phylink_config; + struct device_node *node; }; diff --git a/drivers/net/ethernet/freescale/ucc_geth_ethtool.c b/drivers/net/ethernet/freescale/ucc_geth_ethtool.c index 89b323ef81450..1fb49e5a414a4 100644 --- a/drivers/net/ethernet/freescale/ucc_geth_ethtool.c +++ b/drivers/net/ethernet/freescale/ucc_geth_ethtool.c @@ -103,26 +103,18 @@ static const char rx_fw_stat_gstrings[][ETH_GSTRING_LEN] = { static int uec_get_ksettings(struct net_device *netdev, struct ethtool_link_ksettings *cmd) { - struct phy_device *phydev = netdev->phydev; - - if (!phydev) - return -ENODEV; - - phy_ethtool_ksettings_get(phydev, cmd); + struct ucc_geth_private *ugeth = netdev_priv(netdev); - return 0; + return phylink_ethtool_ksettings_get(ugeth->phylink, cmd); } static int uec_set_ksettings(struct net_device *netdev, const struct ethtool_link_ksettings *cmd) { - struct phy_device *phydev = netdev->phydev; - - if (!phydev) - return -ENODEV; + struct ucc_geth_private *ugeth = netdev_priv(netdev); - return phy_ethtool_ksettings_set(phydev, cmd); + return phylink_ethtool_ksettings_set(ugeth->phylink, cmd); } static void @@ -130,15 +122,8 @@ uec_get_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *pause) { struct ucc_geth_private *ugeth = netdev_priv(netdev); - struct phy_device *phydev = netdev->phydev; - - if (phydev) - pause->autoneg = phydev->autoneg; - if (ugeth->ug_info->receiveFlowControl) - pause->rx_pause = 1; - if (ugeth->ug_info->transmitFlowControl) - pause->tx_pause = 1; + return phylink_ethtool_get_pauseparam(ugeth->phylink, pause); } static int @@ -146,31 +131,11 @@ uec_set_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *pause) { struct ucc_geth_private *ugeth = netdev_priv(netdev); - struct phy_device *phydev = netdev->phydev; - int ret = 0; ugeth->ug_info->receiveFlowControl = pause->rx_pause; ugeth->ug_info->transmitFlowControl = pause->tx_pause; - if (phydev && phydev->autoneg) { - if (netif_running(netdev)) { - /* FIXME: automatically restart */ - netdev_info(netdev, "Please re-open the interface\n"); - } - } else { - struct ucc_geth_info *ug_info = ugeth->ug_info; - - ret = init_flow_control_params(ug_info->aufc, - ug_info->receiveFlowControl, - ug_info->transmitFlowControl, - ug_info->pausePeriod, - ug_info->extensionField, - &ugeth->uccf->uf_regs->upsmr, - &ugeth->ug_regs->uempr, - &ugeth->ug_regs->maccfg1); - } - - return ret; + return phylink_ethtool_set_pauseparam(ugeth->phylink, pause); } static uint32_t @@ -344,13 +309,8 @@ uec_get_drvinfo(struct net_device *netdev, static void uec_get_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) { struct ucc_geth_private *ugeth = netdev_priv(netdev); - struct phy_device *phydev = netdev->phydev; - - wol->supported = 0; - wol->wolopts = 0; - if (phydev) - phy_ethtool_get_wol(phydev, wol); + phylink_ethtool_get_wol(ugeth->phylink, wol); if (qe_alive_during_sleep()) wol->supported |= WAKE_MAGIC; @@ -361,19 +321,16 @@ static void uec_get_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) static int uec_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) { struct ucc_geth_private *ugeth = netdev_priv(netdev); - struct phy_device *phydev = netdev->phydev; int ret = 0; - if (phydev) { - ret = phy_ethtool_set_wol(phydev, wol); - if (ret == -EOPNOTSUPP) { - ugeth->phy_wol_en = 0; - } else if (ret) { - return ret; - } else { - ugeth->phy_wol_en = wol->wolopts; - goto out; - } + ret = phylink_ethtool_set_wol(ugeth->phylink, wol); + if (ret == -EOPNOTSUPP) { + ugeth->phy_wol_en = 0; + } else if (ret) { + return ret; + } else { + ugeth->phy_wol_en = wol->wolopts; + goto out; } /* If the PHY isn't handling the WoL and the MAC is asked to more than From e36d46b9af682bac7c376638cf0fd98d18b98653 Mon Sep 17 00:00:00 2001 From: Rosen Penev Date: Tue, 3 Dec 2024 15:13:37 -0800 Subject: [PATCH 0226/1450] net: simplify resource acquisition + ioremap get resource + request_mem_region + ioremap can all be done by a single function. Replace them with devm_platform_get_and_ioremap_resource or\ devm_platform_ioremap_resource where res is not used. Signed-off-by: Rosen Penev Reviewed-by: Vincent Mailhol # sja1000_platform.c Link: https://patch.msgid.link/20241203231337.182391-1-rosenp@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/can/sja1000/sja1000_platform.c | 15 ++-------- drivers/net/ethernet/freescale/fman/fman.c | 35 +++++----------------- drivers/net/ethernet/lantiq_etop.c | 25 ++-------------- drivers/net/mdio/mdio-octeon.c | 25 +++------------- 4 files changed, 17 insertions(+), 83 deletions(-) diff --git a/drivers/net/can/sja1000/sja1000_platform.c b/drivers/net/can/sja1000/sja1000_platform.c index c42ebe9da55ab..2d555f854008b 100644 --- a/drivers/net/can/sja1000/sja1000_platform.c +++ b/drivers/net/can/sja1000/sja1000_platform.c @@ -230,18 +230,9 @@ static int sp_probe(struct platform_device *pdev) return -ENODEV; } - res_mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res_mem) - return -ENODEV; - - if (!devm_request_mem_region(&pdev->dev, res_mem->start, - resource_size(res_mem), DRV_NAME)) - return -EBUSY; - - addr = devm_ioremap(&pdev->dev, res_mem->start, - resource_size(res_mem)); - if (!addr) - return -ENOMEM; + addr = devm_platform_get_and_ioremap_resource(pdev, 0, &res_mem); + if (IS_ERR(addr)) + return PTR_ERR(addr); if (of) { irq = platform_get_irq(pdev, 0); diff --git a/drivers/net/ethernet/freescale/fman/fman.c b/drivers/net/ethernet/freescale/fman/fman.c index fb416d60dcd72..11887458f050c 100644 --- a/drivers/net/ethernet/freescale/fman/fman.c +++ b/drivers/net/ethernet/freescale/fman/fman.c @@ -2690,13 +2690,12 @@ static struct fman *read_dts_node(struct platform_device *of_dev) { struct fman *fman; struct device_node *fm_node, *muram_node; + void __iomem *base_addr; struct resource *res; u32 val, range[2]; int err, irq; struct clk *clk; u32 clk_rate; - phys_addr_t phys_base_addr; - resource_size_t mem_size; fman = kzalloc(sizeof(*fman), GFP_KERNEL); if (!fman) @@ -2724,18 +2723,6 @@ static struct fman *read_dts_node(struct platform_device *of_dev) goto fman_node_put; fman->dts_params.err_irq = err; - /* Get the FM address */ - res = platform_get_resource(of_dev, IORESOURCE_MEM, 0); - if (!res) { - err = -EINVAL; - dev_err(&of_dev->dev, "%s: Can't get FMan memory resource\n", - __func__); - goto fman_node_put; - } - - phys_base_addr = res->start; - mem_size = resource_size(res); - clk = of_clk_get(fm_node, 0); if (IS_ERR(clk)) { err = PTR_ERR(clk); @@ -2803,24 +2790,16 @@ static struct fman *read_dts_node(struct platform_device *of_dev) } } - fman->dts_params.res = - devm_request_mem_region(&of_dev->dev, phys_base_addr, - mem_size, "fman"); - if (!fman->dts_params.res) { - err = -EBUSY; - dev_err(&of_dev->dev, "%s: request_mem_region() failed\n", - __func__); - goto fman_free; - } - - fman->dts_params.base_addr = - devm_ioremap(&of_dev->dev, phys_base_addr, mem_size); - if (!fman->dts_params.base_addr) { - err = -ENOMEM; + base_addr = devm_platform_get_and_ioremap_resource(of_dev, 0, &res); + if (IS_ERR(base_addr)) { + err = PTR_ERR(base_addr); dev_err(&of_dev->dev, "%s: devm_ioremap() failed\n", __func__); goto fman_free; } + fman->dts_params.base_addr = base_addr; + fman->dts_params.res = res; + fman->dev = &of_dev->dev; err = of_platform_populate(fm_node, NULL, NULL, &of_dev->dev); diff --git a/drivers/net/ethernet/lantiq_etop.c b/drivers/net/ethernet/lantiq_etop.c index 660dff5426e7f..83ce3bfefa5c2 100644 --- a/drivers/net/ethernet/lantiq_etop.c +++ b/drivers/net/ethernet/lantiq_etop.c @@ -90,7 +90,6 @@ struct ltq_etop_priv { struct net_device *netdev; struct platform_device *pdev; struct ltq_eth_data *pldata; - struct resource *res; struct mii_bus *mii_bus; @@ -643,31 +642,14 @@ ltq_etop_probe(struct platform_device *pdev) { struct net_device *dev; struct ltq_etop_priv *priv; - struct resource *res; int err; int i; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) { - dev_err(&pdev->dev, "failed to get etop resource\n"); - err = -ENOENT; - goto err_out; - } - - res = devm_request_mem_region(&pdev->dev, res->start, - resource_size(res), dev_name(&pdev->dev)); - if (!res) { - dev_err(&pdev->dev, "failed to request etop resource\n"); - err = -EBUSY; - goto err_out; - } - - ltq_etop_membase = devm_ioremap(&pdev->dev, res->start, - resource_size(res)); - if (!ltq_etop_membase) { + ltq_etop_membase = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(ltq_etop_membase)) { dev_err(&pdev->dev, "failed to remap etop engine %d\n", pdev->id); - err = -ENOMEM; + err = PTR_ERR(ltq_etop_membase); goto err_out; } @@ -679,7 +661,6 @@ ltq_etop_probe(struct platform_device *pdev) dev->netdev_ops = <q_eth_netdev_ops; dev->ethtool_ops = <q_etop_ethtool_ops; priv = netdev_priv(dev); - priv->res = res; priv->pdev = pdev; priv->pldata = dev_get_platdata(&pdev->dev); priv->netdev = dev; diff --git a/drivers/net/mdio/mdio-octeon.c b/drivers/net/mdio/mdio-octeon.c index 2beb83154d39c..cb53dccbde1a3 100644 --- a/drivers/net/mdio/mdio-octeon.c +++ b/drivers/net/mdio/mdio-octeon.c @@ -17,37 +17,20 @@ static int octeon_mdiobus_probe(struct platform_device *pdev) { struct cavium_mdiobus *bus; struct mii_bus *mii_bus; - struct resource *res_mem; - resource_size_t mdio_phys; - resource_size_t regsize; union cvmx_smix_en smi_en; - int err = -ENOENT; + int err; mii_bus = devm_mdiobus_alloc_size(&pdev->dev, sizeof(*bus)); if (!mii_bus) return -ENOMEM; - res_mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (res_mem == NULL) { - dev_err(&pdev->dev, "found no memory resource\n"); - return -ENXIO; - } - bus = mii_bus->priv; bus->mii_bus = mii_bus; - mdio_phys = res_mem->start; - regsize = resource_size(res_mem); - if (!devm_request_mem_region(&pdev->dev, mdio_phys, regsize, - res_mem->name)) { - dev_err(&pdev->dev, "request_mem_region failed\n"); - return -ENXIO; - } - - bus->register_base = devm_ioremap(&pdev->dev, mdio_phys, regsize); - if (!bus->register_base) { + bus->register_base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(bus->register_base)) { dev_err(&pdev->dev, "dev_ioremap failed\n"); - return -ENOMEM; + return PTR_ERR(bus->register_base); } smi_en.u64 = 0; From 6c36b5c244d6cb22ef8ea2f6b5da46f5171b37a5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Dec 2024 21:02:34 +0000 Subject: [PATCH 0227/1450] net: tipc: remove one synchronize_net() from tipc_nametbl_stop() tipc_exit_net() is very slow and is abused by syzbot. tipc_nametbl_stop() is called for each netns being dismantled. Calling synchronize_net() right before freeing tn->nametbl is a big hammer. Replace this with kfree_rcu(). Note that RCU is not properly used here, otherwise tn->nametbl should be cleared before the synchronize_net() or kfree_rcu(), or even before the cleanup loop. We might need to fix this at some point. Also note tipc uses other synchronize_rcu() calls, more work is needed to make tipc_exit_net() much faster. List of remaining calls to synchronize_rcu() tipc_detach_loopback() (dev_remove_pack()) tipc_bcast_stop() tipc_sk_rht_destroy() Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20241204210234.319484-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/tipc/name_table.c | 4 ++-- net/tipc/name_table.h | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index d1180370fdf41..e74940eab3a47 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -949,8 +949,8 @@ void tipc_nametbl_stop(struct net *net) } spin_unlock_bh(&tn->nametbl_lock); - synchronize_net(); - kfree(nt); + /* TODO: clear tn->nametbl, implement proper RCU rules ? */ + kfree_rcu(nt, rcu); } static int __tipc_nl_add_nametable_publ(struct tipc_nl_msg *msg, diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index 3bcd9ef8cee30..7ff6eeebaae64 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -90,6 +90,7 @@ struct publication { /** * struct name_table - table containing all existing port name publications + * @rcu: RCU callback head used for deferred freeing * @services: name sequence hash lists * @node_scope: all local publications with node scope * - used by name_distr during re-init of name table @@ -102,6 +103,7 @@ struct publication { * @snd_nxt: next sequence number to be used */ struct name_table { + struct rcu_head rcu; struct hlist_head services[TIPC_NAMETBL_SIZE]; struct list_head node_scope; struct list_head cluster_scope; From 48697bdfb65d21bab8c686830b04bf2e47b96d52 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Wed, 4 Dec 2024 16:32:39 +0000 Subject: [PATCH 0228/1450] selftests: net: cleanup busy_poller.c Fix various integer type conversions by using strtoull and a temporary variable which is bounds checked before being casted into the appropriate cfg_* variable for use by the test program. While here: - free the strdup'd cfg string for overall hygenie. - initialize napi_id = 0 in setup_queue to avoid warnings on some compilers. Signed-off-by: Joe Damato Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20241204163239.294123-1-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/busy_poller.c | 88 +++++++++++++---------- 1 file changed, 50 insertions(+), 38 deletions(-) diff --git a/tools/testing/selftests/net/busy_poller.c b/tools/testing/selftests/net/busy_poller.c index 99b0e8c17fcad..04c7ff577bb89 100644 --- a/tools/testing/selftests/net/busy_poller.c +++ b/tools/testing/selftests/net/busy_poller.c @@ -54,16 +54,16 @@ struct epoll_params { #define EPIOCGPARAMS _IOR(EPOLL_IOC_TYPE, 0x02, struct epoll_params) #endif -static uint32_t cfg_port = 8000; +static uint16_t cfg_port = 8000; static struct in_addr cfg_bind_addr = { .s_addr = INADDR_ANY }; static char *cfg_outfile; static int cfg_max_events = 8; -static int cfg_ifindex; +static uint32_t cfg_ifindex; /* busy poll params */ static uint32_t cfg_busy_poll_usecs; -static uint32_t cfg_busy_poll_budget; -static uint32_t cfg_prefer_busy_poll; +static uint16_t cfg_busy_poll_budget; +static uint8_t cfg_prefer_busy_poll; /* IRQ params */ static uint32_t cfg_defer_hard_irqs; @@ -79,6 +79,7 @@ static void usage(const char *filepath) static void parse_opts(int argc, char **argv) { + unsigned long long tmp; int ret; int c; @@ -86,31 +87,40 @@ static void parse_opts(int argc, char **argv) usage(argv[0]); while ((c = getopt(argc, argv, "p:m:b:u:P:g:o:d:r:s:i:")) != -1) { + /* most options take integer values, except o and b, so reduce + * code duplication a bit for the common case by calling + * strtoull here and leave bounds checking and casting per + * option below. + */ + if (c != 'o' && c != 'b') + tmp = strtoull(optarg, NULL, 0); + switch (c) { case 'u': - cfg_busy_poll_usecs = strtoul(optarg, NULL, 0); - if (cfg_busy_poll_usecs == ULONG_MAX || - cfg_busy_poll_usecs > UINT32_MAX) + if (tmp == ULLONG_MAX || tmp > UINT32_MAX) error(1, ERANGE, "busy_poll_usecs too large"); + + cfg_busy_poll_usecs = (uint32_t)tmp; break; case 'P': - cfg_prefer_busy_poll = strtoul(optarg, NULL, 0); - if (cfg_prefer_busy_poll == ULONG_MAX || - cfg_prefer_busy_poll > 1) + if (tmp == ULLONG_MAX || tmp > 1) error(1, ERANGE, "prefer busy poll should be 0 or 1"); + + cfg_prefer_busy_poll = (uint8_t)tmp; break; case 'g': - cfg_busy_poll_budget = strtoul(optarg, NULL, 0); - if (cfg_busy_poll_budget == ULONG_MAX || - cfg_busy_poll_budget > UINT16_MAX) + if (tmp == ULLONG_MAX || tmp > UINT16_MAX) error(1, ERANGE, "busy poll budget must be [0, UINT16_MAX]"); + + cfg_busy_poll_budget = (uint16_t)tmp; break; case 'p': - cfg_port = strtoul(optarg, NULL, 0); - if (cfg_port > UINT16_MAX) + if (tmp == ULLONG_MAX || tmp > UINT16_MAX) error(1, ERANGE, "port must be <= 65535"); + + cfg_port = (uint16_t)tmp; break; case 'b': ret = inet_aton(optarg, &cfg_bind_addr); @@ -124,41 +134,39 @@ static void parse_opts(int argc, char **argv) error(1, 0, "outfile invalid"); break; case 'm': - cfg_max_events = strtol(optarg, NULL, 0); - - if (cfg_max_events == LONG_MIN || - cfg_max_events == LONG_MAX || - cfg_max_events <= 0) + if (tmp == ULLONG_MAX || tmp > INT_MAX) error(1, ERANGE, - "max events must be > 0 and < LONG_MAX"); + "max events must be > 0 and <= INT_MAX"); + + cfg_max_events = (int)tmp; break; case 'd': - cfg_defer_hard_irqs = strtoul(optarg, NULL, 0); - - if (cfg_defer_hard_irqs == ULONG_MAX || - cfg_defer_hard_irqs > INT32_MAX) + if (tmp == ULLONG_MAX || tmp > INT32_MAX) error(1, ERANGE, "defer_hard_irqs must be <= INT32_MAX"); + + cfg_defer_hard_irqs = (uint32_t)tmp; break; case 'r': - cfg_gro_flush_timeout = strtoull(optarg, NULL, 0); - - if (cfg_gro_flush_timeout == ULLONG_MAX) + if (tmp == ULLONG_MAX || tmp > UINT64_MAX) error(1, ERANGE, - "gro_flush_timeout must be < ULLONG_MAX"); + "gro_flush_timeout must be < UINT64_MAX"); + + cfg_gro_flush_timeout = (uint64_t)tmp; break; case 's': - cfg_irq_suspend_timeout = strtoull(optarg, NULL, 0); - - if (cfg_irq_suspend_timeout == ULLONG_MAX) + if (tmp == ULLONG_MAX || tmp > UINT64_MAX) error(1, ERANGE, "irq_suspend_timeout must be < ULLONG_MAX"); + + cfg_irq_suspend_timeout = (uint64_t)tmp; break; case 'i': - cfg_ifindex = strtoul(optarg, NULL, 0); - if (cfg_ifindex == ULONG_MAX) + if (tmp == ULLONG_MAX || tmp > INT_MAX) error(1, ERANGE, - "ifindex must be < ULONG_MAX"); + "ifindex must be <= INT_MAX"); + + cfg_ifindex = (int)tmp; break; } } @@ -215,7 +223,7 @@ static void setup_queue(void) struct netdev_napi_set_req *set_req = NULL; struct ynl_sock *ys; struct ynl_error yerr; - uint32_t napi_id; + uint32_t napi_id = 0; ys = ynl_sock_create(&ynl_netdev_family, &yerr); if (!ys) @@ -277,8 +285,8 @@ static void run_poller(void) * here */ epoll_params.busy_poll_usecs = cfg_busy_poll_usecs; - epoll_params.busy_poll_budget = (uint16_t)cfg_busy_poll_budget; - epoll_params.prefer_busy_poll = (uint8_t)cfg_prefer_busy_poll; + epoll_params.busy_poll_budget = cfg_busy_poll_budget; + epoll_params.prefer_busy_poll = cfg_prefer_busy_poll; epoll_params.__pad = 0; val = 1; @@ -342,5 +350,9 @@ int main(int argc, char *argv[]) parse_opts(argc, argv); setup_queue(); run_poller(); + + if (cfg_outfile) + free(cfg_outfile); + return 0; } From bac3d0f21c5a42f042ac9b9f6dcbc11544efdefa Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 5 Dec 2024 10:42:00 +0000 Subject: [PATCH 0229/1450] net: phy: marvell: use phydev->eee_cfg.eee_enabled Rather than calling genphy_c45_ethtool_get_eee() to retrieve whether EEE is enabled, use the value stored in the phy_device eee_cfg structure. Signed-off-by: Russell King (Oracle) Reviewed-by: Heiner Kallweit Link: https://patch.msgid.link/E1tJ9J2-006LIh-Fl@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/phy/marvell.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index b885bc0fe6e07..ffe223ad9e5f1 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -1550,7 +1550,6 @@ static int m88e1540_get_fld(struct phy_device *phydev, u8 *msecs) static int m88e1540_set_fld(struct phy_device *phydev, const u8 *msecs) { - struct ethtool_keee eee; int val, ret; if (*msecs == ETHTOOL_PHY_FAST_LINK_DOWN_OFF) @@ -1560,8 +1559,7 @@ static int m88e1540_set_fld(struct phy_device *phydev, const u8 *msecs) /* According to the Marvell data sheet EEE must be disabled for * Fast Link Down detection to work properly */ - ret = genphy_c45_ethtool_get_eee(phydev, &eee); - if (!ret && eee.eee_enabled) { + if (phydev->eee_cfg.eee_enabled) { phydev_warn(phydev, "Fast Link Down detection requires EEE to be disabled!\n"); return -EBUSY; } From 92f7acb825ec272261a2057e0f9e0b1c76198dae Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 5 Dec 2024 10:42:05 +0000 Subject: [PATCH 0230/1450] net: phy: avoid genphy_c45_ethtool_get_eee() setting eee_enabled genphy_c45_ethtool_get_eee() is only called from phy_ethtool_get_eee(), which then calls eeecfg_to_eee(). eeecfg_to_eee() will overwrite keee.eee_enabled, so there's no point setting keee.eee_enabled in genphy_c45_ethtool_get_eee(). Remove this assignment. Signed-off-by: Russell King (Oracle) Reviewed-by: Heiner Kallweit Link: https://patch.msgid.link/E1tJ9J7-006LIn-Jr@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy-c45.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c index 944ae98ad1106..d162f78bc68db 100644 --- a/drivers/net/phy/phy-c45.c +++ b/drivers/net/phy/phy-c45.c @@ -1521,15 +1521,13 @@ EXPORT_SYMBOL(genphy_c45_eee_is_active); int genphy_c45_ethtool_get_eee(struct phy_device *phydev, struct ethtool_keee *data) { - bool is_enabled; int ret; ret = genphy_c45_eee_is_active(phydev, data->advertised, - data->lp_advertised, &is_enabled); + data->lp_advertised, NULL); if (ret < 0) return ret; - data->eee_enabled = is_enabled; data->eee_active = phydev->eee_active; linkmode_copy(data->supported, phydev->supported_eee); From 8f1c716090a7ed20fea802b63b37758169d59b81 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 5 Dec 2024 10:42:10 +0000 Subject: [PATCH 0231/1450] net: phy: remove genphy_c45_eee_is_active()'s is_enabled arg All callers to genphy_c45_eee_is_active() now pass NULL as the is_enabled argument, which means we never use the value computed in this function. Remove the argument and clean up this function. Signed-off-by: Russell King (Oracle) Reviewed-by: Heiner Kallweit Link: https://patch.msgid.link/E1tJ9JC-006LIt-Ne@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy-c45.c | 12 ++++-------- drivers/net/phy/phy.c | 5 ++--- include/linux/phy.h | 2 +- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c index d162f78bc68db..0dac08e85304b 100644 --- a/drivers/net/phy/phy-c45.c +++ b/drivers/net/phy/phy-c45.c @@ -1469,18 +1469,17 @@ EXPORT_SYMBOL_GPL(genphy_c45_plca_get_status); * @phydev: target phy_device struct * @adv: variable to store advertised linkmodes * @lp: variable to store LP advertised linkmodes - * @is_enabled: variable to store EEE enabled/disabled configuration value * * Description: this function will read local and link partner PHY * advertisements. Compare them return current EEE state. */ int genphy_c45_eee_is_active(struct phy_device *phydev, unsigned long *adv, - unsigned long *lp, bool *is_enabled) + unsigned long *lp) { __ETHTOOL_DECLARE_LINK_MODE_MASK(tmp_adv) = {}; __ETHTOOL_DECLARE_LINK_MODE_MASK(tmp_lp) = {}; __ETHTOOL_DECLARE_LINK_MODE_MASK(common); - bool eee_enabled, eee_active; + bool eee_active; int ret; ret = genphy_c45_read_eee_adv(phydev, tmp_adv); @@ -1491,9 +1490,8 @@ int genphy_c45_eee_is_active(struct phy_device *phydev, unsigned long *adv, if (ret) return ret; - eee_enabled = !linkmode_empty(tmp_adv); linkmode_and(common, tmp_adv, tmp_lp); - if (eee_enabled && !linkmode_empty(common)) + if (!linkmode_empty(tmp_adv) && !linkmode_empty(common)) eee_active = phy_check_valid(phydev->speed, phydev->duplex, common); else @@ -1503,8 +1501,6 @@ int genphy_c45_eee_is_active(struct phy_device *phydev, unsigned long *adv, linkmode_copy(adv, tmp_adv); if (lp) linkmode_copy(lp, tmp_lp); - if (is_enabled) - *is_enabled = eee_enabled; return eee_active; } @@ -1524,7 +1520,7 @@ int genphy_c45_ethtool_get_eee(struct phy_device *phydev, int ret; ret = genphy_c45_eee_is_active(phydev, data->advertised, - data->lp_advertised, NULL); + data->lp_advertised); if (ret < 0) return ret; diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 0c228aa180198..4cf3442542371 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -988,8 +988,7 @@ static int phy_check_link_status(struct phy_device *phydev) if (phydev->link && phydev->state != PHY_RUNNING) { phy_check_downshift(phydev); phydev->state = PHY_RUNNING; - err = genphy_c45_eee_is_active(phydev, - NULL, NULL, NULL); + err = genphy_c45_eee_is_active(phydev, NULL, NULL); phydev->eee_active = err > 0; phydev->enable_tx_lpi = phydev->eee_cfg.tx_lpi_enabled && phydev->eee_active; @@ -1658,7 +1657,7 @@ int phy_init_eee(struct phy_device *phydev, bool clk_stop_enable) if (!phydev->drv) return -EIO; - ret = genphy_c45_eee_is_active(phydev, NULL, NULL, NULL); + ret = genphy_c45_eee_is_active(phydev, NULL, NULL); if (ret < 0) return ret; if (!ret) diff --git a/include/linux/phy.h b/include/linux/phy.h index 61a1bc81f597f..bb157136351e7 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1991,7 +1991,7 @@ int genphy_c45_plca_set_cfg(struct phy_device *phydev, int genphy_c45_plca_get_status(struct phy_device *phydev, struct phy_plca_status *plca_st); int genphy_c45_eee_is_active(struct phy_device *phydev, unsigned long *adv, - unsigned long *lp, bool *is_enabled); + unsigned long *lp); int genphy_c45_ethtool_get_eee(struct phy_device *phydev, struct ethtool_keee *data); int genphy_c45_ethtool_set_eee(struct phy_device *phydev, From f899c594e138eda72804b16babbdeff92707d7b0 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 5 Dec 2024 10:42:15 +0000 Subject: [PATCH 0232/1450] net: phy: update phy_ethtool_get_eee() documentation Update the phy_ethtool_get_eee() documentation to make it clear that all members of struct ethtool_keee are written by this function. keee.supported, keee.advertised, keee.lp_advertised and keee.eee_active are all written by genphy_c45_ethtool_get_eee(). keee.tx_lpi_timer, keee.tx_lpi_enabled and keee.eee_enabled are all written by eeecfg_to_eee(). Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/E1tJ9JH-006LIz-SO@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 4cf3442542371..e4b04cdaa9950 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -1701,8 +1701,8 @@ EXPORT_SYMBOL(phy_get_eee_err); * @phydev: target phy_device struct * @data: ethtool_keee data * - * Description: reports the Supported/Advertisement/LP Advertisement - * capabilities, etc. + * Description: get the current EEE settings, filling in all members of + * @data. */ int phy_ethtool_get_eee(struct phy_device *phydev, struct ethtool_keee *data) { From 7b60c3bf93fa813e6522686025aae31ab54db2d2 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 4 Dec 2024 09:41:33 +0100 Subject: [PATCH 0233/1450] net: usb: lan78xx: Remove LAN8835 PHY fixup Remove the PHY fixup for the LAN8835 PHY in the lan78xx driver due to the following reasons: - There is no publicly available information about the LAN8835 PHY. However, it appears to be the integrated PHY used in the LAN7800 and LAN7850 USB Ethernet controllers. These PHYs use the GMII interface, not RGMII as configured by the fixup. - The correct driver for handling the LAN8835 PHY functionality is the Microchip PHY driver (`drivers/net/phy/microchip.c`), which properly supports these integrated PHYs. - The PHY ID `0x0007C130` is actually used by the LAN8742A PHY, which only supports RMII. This interface is incompatible with the LAN78xx MAC, as the LAN7801 (the only LAN78xx version without an integrated PHY) supports only RGMII. - The mask applied for this fixup is overly broad, inadvertently covering both Microchip LAN88xx PHYs and unrelated SMSC LAN8742A PHYs, leading to potential conflicts with other devices. - Testing has shown that removing this fixup for LAN7800 and LAN7850 does not result in any noticeable difference in functionality, as the Microchip PHY driver (`drivers/net/phy/microchip.c`) handles all necessary configurations for these integrated PHYs. - Registering this fixup globally (not limited to USB devices) risks conflicts by unintentionally modifying other interfaces whenever a LAN7801 adapter is connected to the system. Note that both LAN7800 and LAN7850 USB Ethernet controllers use an integrated PHY with the ID `0x0007C132`. Additionally, the LAN7515, a specialized part for Raspberry Pi, includes an integrated LAN7800 USB Ethernet controller and USB hub in a multifunctional chip design, and it also uses the same PHY ID (`0x0007C132`). Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241204084142.1152696-2-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/usb/lan78xx.c | 35 ----------------------------------- 1 file changed, 35 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 531b1b6a37d19..6e468e77d796a 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -473,7 +473,6 @@ struct lan78xx_net { }; /* define external phy id */ -#define PHY_LAN8835 (0x0007C130) #define PHY_KSZ9031RNX (0x00221620) /* use ethtool to change the level for any given device */ @@ -2234,29 +2233,6 @@ static void lan78xx_remove_irq_domain(struct lan78xx_net *dev) dev->domain_data.irqdomain = NULL; } -static int lan8835_fixup(struct phy_device *phydev) -{ - int buf; - struct lan78xx_net *dev = netdev_priv(phydev->attached_dev); - - /* LED2/PME_N/IRQ_N/RGMII_ID pin to IRQ_N mode */ - buf = phy_read_mmd(phydev, MDIO_MMD_PCS, 0x8010); - buf &= ~0x1800; - buf |= 0x0800; - phy_write_mmd(phydev, MDIO_MMD_PCS, 0x8010, buf); - - /* RGMII MAC TXC Delay Enable */ - lan78xx_write_reg(dev, MAC_RGMII_ID, - MAC_RGMII_ID_TXC_DELAY_EN_); - - /* RGMII TX DLL Tune Adjust */ - lan78xx_write_reg(dev, RGMII_TX_BYP_DLL, 0x3D00); - - dev->interface = PHY_INTERFACE_MODE_RGMII_TXID; - - return 1; -} - static int ksz9031rnx_fixup(struct phy_device *phydev) { struct lan78xx_net *dev = netdev_priv(phydev->attached_dev); @@ -2315,14 +2291,6 @@ static struct phy_device *lan7801_phy_init(struct lan78xx_net *dev) netdev_err(dev->net, "Failed to register fixup for PHY_KSZ9031RNX\n"); return NULL; } - /* external PHY fixup for LAN8835 */ - ret = phy_register_fixup_for_uid(PHY_LAN8835, 0xfffffff0, - lan8835_fixup); - if (ret < 0) { - netdev_err(dev->net, "Failed to register fixup for PHY_LAN8835\n"); - return NULL; - } - /* add more external PHY fixup here if needed */ phydev->is_internal = false; } @@ -2384,8 +2352,6 @@ static int lan78xx_phy_init(struct lan78xx_net *dev) } else { phy_unregister_fixup_for_uid(PHY_KSZ9031RNX, 0xfffffff0); - phy_unregister_fixup_for_uid(PHY_LAN8835, - 0xfffffff0); } } return -EIO; @@ -4243,7 +4209,6 @@ static void lan78xx_disconnect(struct usb_interface *intf) phydev = net->phydev; phy_unregister_fixup_for_uid(PHY_KSZ9031RNX, 0xfffffff0); - phy_unregister_fixup_for_uid(PHY_LAN8835, 0xfffffff0); phy_disconnect(net->phydev); From 6782d06a47ad6f8844e71f3912ab60a47f7bc7c3 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 4 Dec 2024 09:41:34 +0100 Subject: [PATCH 0234/1450] net: usb: lan78xx: Remove KSZ9031 PHY fixup Remove the KSZ9031RNX PHY fixup from the lan78xx driver. The fixup applied specific RGMII pad skew configurations globally, but these settings violate the RGMII specification and cause more harm than benefit. Key issues with the fixup: 1. **Non-Compliant Timing**: The fixup's delay settings fall outside the RGMII specification requirements of 1.5 ns to 2.0 ns: - RX Path: Total delay of **2.16 ns** (PHY internal delay of 1.2 ns + 0.96 ns skew). - TX Path: Total delay of **0.96 ns**, significantly below the RGMII minimum of 1.5 ns. 2. **Redundant or Incorrect Configurations**: - The RGMII skew registers written by the fixup do not meaningfully alter the PHY's default behavior and fail to account for its internal delays. - The TX_DATA pad skew was not configured, relying on power-on defaults that are insufficient for RGMII compliance. 3. **Micrel Driver Support**: By setting `PHY_INTERFACE_MODE_RGMII_ID`, the Micrel driver can calculate and assign appropriate skew values for the KSZ9031 PHY. This ensures better timing configurations without relying on external fixups. 4. **System Interference**: The fixup applied globally, reconfiguring all KSZ9031 PHYs in the system, even those unrelated to the LAN78xx adapter. This could lead to unintended and harmful behavior on unrelated interfaces. While the fixup is removed, a better mechanism is still needed to dynamically determine the optimal combination of PHY and MAC delays to fully meet RGMII requirements without relying on Device Tree or global fixups. This would allow for robust operation across different hardware configurations. The Micrel driver is capable of using the interface mode value to calculate and apply better skew values, providing a configuration much closer to the RGMII specification than the fixup. Removing the fixup ensures better default behavior and prevents harm to other system interfaces. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241204084142.1152696-3-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/usb/lan78xx.c | 38 +++++--------------------------------- 1 file changed, 5 insertions(+), 33 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 6e468e77d796a..918b88bd9524e 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -472,9 +472,6 @@ struct lan78xx_net { struct irq_domain_data domain_data; }; -/* define external phy id */ -#define PHY_KSZ9031RNX (0x00221620) - /* use ethtool to change the level for any given device */ static int msg_level = -1; module_param(msg_level, int, 0); @@ -2233,23 +2230,6 @@ static void lan78xx_remove_irq_domain(struct lan78xx_net *dev) dev->domain_data.irqdomain = NULL; } -static int ksz9031rnx_fixup(struct phy_device *phydev) -{ - struct lan78xx_net *dev = netdev_priv(phydev->attached_dev); - - /* Micrel9301RNX PHY configuration */ - /* RGMII Control Signal Pad Skew */ - phy_write_mmd(phydev, MDIO_MMD_WIS, 4, 0x0077); - /* RGMII RX Data Pad Skew */ - phy_write_mmd(phydev, MDIO_MMD_WIS, 5, 0x7777); - /* RGMII RX Clock Pad Skew */ - phy_write_mmd(phydev, MDIO_MMD_WIS, 8, 0x1FF); - - dev->interface = PHY_INTERFACE_MODE_RGMII_RXID; - - return 1; -} - static struct phy_device *lan7801_phy_init(struct lan78xx_net *dev) { u32 buf; @@ -2283,14 +2263,11 @@ static struct phy_device *lan7801_phy_init(struct lan78xx_net *dev) netdev_err(dev->net, "no PHY driver found\n"); return NULL; } - dev->interface = PHY_INTERFACE_MODE_RGMII; - /* external PHY fixup for KSZ9031RNX */ - ret = phy_register_fixup_for_uid(PHY_KSZ9031RNX, 0xfffffff0, - ksz9031rnx_fixup); - if (ret < 0) { - netdev_err(dev->net, "Failed to register fixup for PHY_KSZ9031RNX\n"); - return NULL; - } + dev->interface = PHY_INTERFACE_MODE_RGMII_ID; + /* The PHY driver is responsible to configure proper RGMII + * interface delays. Disable RGMII delays on MAC side. + */ + lan78xx_write_reg(dev, MAC_RGMII_ID, 0); phydev->is_internal = false; } @@ -2349,9 +2326,6 @@ static int lan78xx_phy_init(struct lan78xx_net *dev) if (phy_is_pseudo_fixed_link(phydev)) { fixed_phy_unregister(phydev); phy_device_free(phydev); - } else { - phy_unregister_fixup_for_uid(PHY_KSZ9031RNX, - 0xfffffff0); } } return -EIO; @@ -4208,8 +4182,6 @@ static void lan78xx_disconnect(struct usb_interface *intf) phydev = net->phydev; - phy_unregister_fixup_for_uid(PHY_KSZ9031RNX, 0xfffffff0); - phy_disconnect(net->phydev); if (phy_is_pseudo_fixed_link(phydev)) { From 39aa1d620d10cdd276f4728da50f136dbe939643 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 4 Dec 2024 09:41:35 +0100 Subject: [PATCH 0235/1450] net: usb: lan78xx: move functions to avoid forward definitions Move following functions to avoid forward declarations in the code: - lan78xx_start_hw() - lan78xx_stop_hw() - lan78xx_flush_fifo() - lan78xx_start_tx_path() - lan78xx_stop_tx_path() - lan78xx_flush_tx_fifo() - lan78xx_start_rx_path() - lan78xx_stop_rx_path() - lan78xx_flush_rx_fifo() These functions will be used in an upcoming PHYlink migration patch. No modifications to the functionality of the code are made. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241204084142.1152696-4-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/usb/lan78xx.c | 300 +++++++++++++++++++------------------- 1 file changed, 150 insertions(+), 150 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 918b88bd9524e..dd9b5d3abcb3f 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -808,6 +808,156 @@ static void lan78xx_update_stats(struct lan78xx_net *dev) usb_autopm_put_interface(dev->intf); } +static int lan78xx_start_hw(struct lan78xx_net *dev, u32 reg, u32 hw_enable) +{ + return lan78xx_update_reg(dev, reg, hw_enable, hw_enable); +} + +static int lan78xx_stop_hw(struct lan78xx_net *dev, u32 reg, u32 hw_enabled, + u32 hw_disabled) +{ + unsigned long timeout; + bool stopped = true; + int ret; + u32 buf; + + /* Stop the h/w block (if not already stopped) */ + + ret = lan78xx_read_reg(dev, reg, &buf); + if (ret < 0) + return ret; + + if (buf & hw_enabled) { + buf &= ~hw_enabled; + + ret = lan78xx_write_reg(dev, reg, buf); + if (ret < 0) + return ret; + + stopped = false; + timeout = jiffies + HW_DISABLE_TIMEOUT; + do { + ret = lan78xx_read_reg(dev, reg, &buf); + if (ret < 0) + return ret; + + if (buf & hw_disabled) + stopped = true; + else + msleep(HW_DISABLE_DELAY_MS); + } while (!stopped && !time_after(jiffies, timeout)); + } + + ret = stopped ? 0 : -ETIME; + + return ret; +} + +static int lan78xx_flush_fifo(struct lan78xx_net *dev, u32 reg, u32 fifo_flush) +{ + return lan78xx_update_reg(dev, reg, fifo_flush, fifo_flush); +} + +static int lan78xx_start_tx_path(struct lan78xx_net *dev) +{ + int ret; + + netif_dbg(dev, drv, dev->net, "start tx path"); + + /* Start the MAC transmitter */ + + ret = lan78xx_start_hw(dev, MAC_TX, MAC_TX_TXEN_); + if (ret < 0) + return ret; + + /* Start the Tx FIFO */ + + ret = lan78xx_start_hw(dev, FCT_TX_CTL, FCT_TX_CTL_EN_); + if (ret < 0) + return ret; + + return 0; +} + +static int lan78xx_stop_tx_path(struct lan78xx_net *dev) +{ + int ret; + + netif_dbg(dev, drv, dev->net, "stop tx path"); + + /* Stop the Tx FIFO */ + + ret = lan78xx_stop_hw(dev, FCT_TX_CTL, FCT_TX_CTL_EN_, FCT_TX_CTL_DIS_); + if (ret < 0) + return ret; + + /* Stop the MAC transmitter */ + + ret = lan78xx_stop_hw(dev, MAC_TX, MAC_TX_TXEN_, MAC_TX_TXD_); + if (ret < 0) + return ret; + + return 0; +} + +/* The caller must ensure the Tx path is stopped before calling + * lan78xx_flush_tx_fifo(). + */ +static int lan78xx_flush_tx_fifo(struct lan78xx_net *dev) +{ + return lan78xx_flush_fifo(dev, FCT_TX_CTL, FCT_TX_CTL_RST_); +} + +static int lan78xx_start_rx_path(struct lan78xx_net *dev) +{ + int ret; + + netif_dbg(dev, drv, dev->net, "start rx path"); + + /* Start the Rx FIFO */ + + ret = lan78xx_start_hw(dev, FCT_RX_CTL, FCT_RX_CTL_EN_); + if (ret < 0) + return ret; + + /* Start the MAC receiver*/ + + ret = lan78xx_start_hw(dev, MAC_RX, MAC_RX_RXEN_); + if (ret < 0) + return ret; + + return 0; +} + +static int lan78xx_stop_rx_path(struct lan78xx_net *dev) +{ + int ret; + + netif_dbg(dev, drv, dev->net, "stop rx path"); + + /* Stop the MAC receiver */ + + ret = lan78xx_stop_hw(dev, MAC_RX, MAC_RX_RXEN_, MAC_RX_RXD_); + if (ret < 0) + return ret; + + /* Stop the Rx FIFO */ + + ret = lan78xx_stop_hw(dev, FCT_RX_CTL, FCT_RX_CTL_EN_, FCT_RX_CTL_DIS_); + if (ret < 0) + return ret; + + return 0; +} + +/* The caller must ensure the Rx path is stopped before calling + * lan78xx_flush_rx_fifo(). + */ +static int lan78xx_flush_rx_fifo(struct lan78xx_net *dev) +{ + return lan78xx_flush_fifo(dev, FCT_RX_CTL, FCT_RX_CTL_RST_); +} + /* Loop until the read is completed with timeout called with phy_mutex held */ static int lan78xx_phy_wait_not_busy(struct lan78xx_net *dev) { @@ -2662,156 +2812,6 @@ static int lan78xx_urb_config_init(struct lan78xx_net *dev) return result; } -static int lan78xx_start_hw(struct lan78xx_net *dev, u32 reg, u32 hw_enable) -{ - return lan78xx_update_reg(dev, reg, hw_enable, hw_enable); -} - -static int lan78xx_stop_hw(struct lan78xx_net *dev, u32 reg, u32 hw_enabled, - u32 hw_disabled) -{ - unsigned long timeout; - bool stopped = true; - int ret; - u32 buf; - - /* Stop the h/w block (if not already stopped) */ - - ret = lan78xx_read_reg(dev, reg, &buf); - if (ret < 0) - return ret; - - if (buf & hw_enabled) { - buf &= ~hw_enabled; - - ret = lan78xx_write_reg(dev, reg, buf); - if (ret < 0) - return ret; - - stopped = false; - timeout = jiffies + HW_DISABLE_TIMEOUT; - do { - ret = lan78xx_read_reg(dev, reg, &buf); - if (ret < 0) - return ret; - - if (buf & hw_disabled) - stopped = true; - else - msleep(HW_DISABLE_DELAY_MS); - } while (!stopped && !time_after(jiffies, timeout)); - } - - ret = stopped ? 0 : -ETIME; - - return ret; -} - -static int lan78xx_flush_fifo(struct lan78xx_net *dev, u32 reg, u32 fifo_flush) -{ - return lan78xx_update_reg(dev, reg, fifo_flush, fifo_flush); -} - -static int lan78xx_start_tx_path(struct lan78xx_net *dev) -{ - int ret; - - netif_dbg(dev, drv, dev->net, "start tx path"); - - /* Start the MAC transmitter */ - - ret = lan78xx_start_hw(dev, MAC_TX, MAC_TX_TXEN_); - if (ret < 0) - return ret; - - /* Start the Tx FIFO */ - - ret = lan78xx_start_hw(dev, FCT_TX_CTL, FCT_TX_CTL_EN_); - if (ret < 0) - return ret; - - return 0; -} - -static int lan78xx_stop_tx_path(struct lan78xx_net *dev) -{ - int ret; - - netif_dbg(dev, drv, dev->net, "stop tx path"); - - /* Stop the Tx FIFO */ - - ret = lan78xx_stop_hw(dev, FCT_TX_CTL, FCT_TX_CTL_EN_, FCT_TX_CTL_DIS_); - if (ret < 0) - return ret; - - /* Stop the MAC transmitter */ - - ret = lan78xx_stop_hw(dev, MAC_TX, MAC_TX_TXEN_, MAC_TX_TXD_); - if (ret < 0) - return ret; - - return 0; -} - -/* The caller must ensure the Tx path is stopped before calling - * lan78xx_flush_tx_fifo(). - */ -static int lan78xx_flush_tx_fifo(struct lan78xx_net *dev) -{ - return lan78xx_flush_fifo(dev, FCT_TX_CTL, FCT_TX_CTL_RST_); -} - -static int lan78xx_start_rx_path(struct lan78xx_net *dev) -{ - int ret; - - netif_dbg(dev, drv, dev->net, "start rx path"); - - /* Start the Rx FIFO */ - - ret = lan78xx_start_hw(dev, FCT_RX_CTL, FCT_RX_CTL_EN_); - if (ret < 0) - return ret; - - /* Start the MAC receiver*/ - - ret = lan78xx_start_hw(dev, MAC_RX, MAC_RX_RXEN_); - if (ret < 0) - return ret; - - return 0; -} - -static int lan78xx_stop_rx_path(struct lan78xx_net *dev) -{ - int ret; - - netif_dbg(dev, drv, dev->net, "stop rx path"); - - /* Stop the MAC receiver */ - - ret = lan78xx_stop_hw(dev, MAC_RX, MAC_RX_RXEN_, MAC_RX_RXD_); - if (ret < 0) - return ret; - - /* Stop the Rx FIFO */ - - ret = lan78xx_stop_hw(dev, FCT_RX_CTL, FCT_RX_CTL_EN_, FCT_RX_CTL_DIS_); - if (ret < 0) - return ret; - - return 0; -} - -/* The caller must ensure the Rx path is stopped before calling - * lan78xx_flush_rx_fifo(). - */ -static int lan78xx_flush_rx_fifo(struct lan78xx_net *dev) -{ - return lan78xx_flush_fifo(dev, FCT_RX_CTL, FCT_RX_CTL_RST_); -} - static int lan78xx_reset(struct lan78xx_net *dev) { struct lan78xx_priv *pdata = (struct lan78xx_priv *)(dev->data[0]); From 9bcdc610cfabe8784f80b8c84f950cc5693f146b Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 4 Dec 2024 09:41:36 +0100 Subject: [PATCH 0236/1450] net: usb: lan78xx: Improve error reporting with %pe specifier Replace integer error codes with the `%pe` format specifier in register read and write error messages. This change provides human-readable error strings, making logs more informative and debugging easier. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241204084142.1152696-5-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/usb/lan78xx.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index dd9b5d3abcb3f..94320deaaeea8 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -621,8 +621,8 @@ static int lan78xx_read_reg(struct lan78xx_net *dev, u32 index, u32 *data) *data = *buf; } else if (net_ratelimit()) { netdev_warn(dev->net, - "Failed to read register index 0x%08x. ret = %d", - index, ret); + "Failed to read register index 0x%08x. ret = %pe", + index, ERR_PTR(ret)); } kfree(buf); @@ -652,8 +652,8 @@ static int lan78xx_write_reg(struct lan78xx_net *dev, u32 index, u32 data) if (unlikely(ret < 0) && net_ratelimit()) { netdev_warn(dev->net, - "Failed to write register index 0x%08x. ret = %d", - index, ret); + "Failed to write register index 0x%08x. ret = %pe", + index, ERR_PTR(ret)); } kfree(buf); From 32ee0dc764505278229078e496e7b56a6d65224b Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 4 Dec 2024 09:41:37 +0100 Subject: [PATCH 0237/1450] net: usb: lan78xx: Fix error handling in MII read/write functions Ensure proper error handling in `lan78xx_mdiobus_read` and `lan78xx_mdiobus_write` by checking return values of register read/write operations and returning errors to the caller. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241204084142.1152696-6-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/usb/lan78xx.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 94320deaaeea8..ee308be1e618d 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -2136,12 +2136,16 @@ static int lan78xx_mdiobus_read(struct mii_bus *bus, int phy_id, int idx) /* set the address, index & direction (read from PHY) */ addr = mii_access(phy_id, idx, MII_READ); ret = lan78xx_write_reg(dev, MII_ACC, addr); + if (ret < 0) + goto done; ret = lan78xx_phy_wait_not_busy(dev); if (ret < 0) goto done; ret = lan78xx_read_reg(dev, MII_DATA, &val); + if (ret < 0) + goto done; ret = (int)(val & 0xFFFF); @@ -2172,10 +2176,14 @@ static int lan78xx_mdiobus_write(struct mii_bus *bus, int phy_id, int idx, val = (u32)regval; ret = lan78xx_write_reg(dev, MII_DATA, val); + if (ret < 0) + goto done; /* set the address, index & direction (write to PHY) */ addr = mii_access(phy_id, idx, MII_WRITE); ret = lan78xx_write_reg(dev, MII_ACC, addr); + if (ret < 0) + goto done; ret = lan78xx_phy_wait_not_busy(dev); if (ret < 0) @@ -2184,7 +2192,7 @@ static int lan78xx_mdiobus_write(struct mii_bus *bus, int phy_id, int idx, done: mutex_unlock(&dev->phy_mutex); usb_autopm_put_interface(dev->intf); - return 0; + return ret; } static int lan78xx_mdio_init(struct lan78xx_net *dev) From 8b1b2ca83b200fa46fdfb81e80ad5fe34537e6d4 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 4 Dec 2024 09:41:38 +0100 Subject: [PATCH 0238/1450] net: usb: lan78xx: Improve error handling in EEPROM and OTP operations Refine error handling in EEPROM and OTP read/write functions by: - Return error values immediately upon detection. - Avoid overwriting correct error codes with `-EIO`. - Preserve initial error codes as they were appropriate for specific failures. - Use `-ETIMEDOUT` for timeout conditions instead of `-EIO`. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241204084142.1152696-7-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/usb/lan78xx.c | 240 ++++++++++++++++++++++++-------------- 1 file changed, 152 insertions(+), 88 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index ee308be1e618d..29f6e1a36e206 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -1000,8 +1000,8 @@ static int lan78xx_wait_eeprom(struct lan78xx_net *dev) do { ret = lan78xx_read_reg(dev, E2P_CMD, &val); - if (unlikely(ret < 0)) - return -EIO; + if (ret < 0) + return ret; if (!(val & E2P_CMD_EPC_BUSY_) || (val & E2P_CMD_EPC_TIMEOUT_)) @@ -1011,7 +1011,7 @@ static int lan78xx_wait_eeprom(struct lan78xx_net *dev) if (val & (E2P_CMD_EPC_TIMEOUT_ | E2P_CMD_EPC_BUSY_)) { netdev_warn(dev->net, "EEPROM read operation timeout"); - return -EIO; + return -ETIMEDOUT; } return 0; @@ -1025,8 +1025,8 @@ static int lan78xx_eeprom_confirm_not_busy(struct lan78xx_net *dev) do { ret = lan78xx_read_reg(dev, E2P_CMD, &val); - if (unlikely(ret < 0)) - return -EIO; + if (ret < 0) + return ret; if (!(val & E2P_CMD_EPC_BUSY_)) return 0; @@ -1035,75 +1035,81 @@ static int lan78xx_eeprom_confirm_not_busy(struct lan78xx_net *dev) } while (!time_after(jiffies, start_time + HZ)); netdev_warn(dev->net, "EEPROM is busy"); - return -EIO; + return -ETIMEDOUT; } static int lan78xx_read_raw_eeprom(struct lan78xx_net *dev, u32 offset, u32 length, u8 *data) { - u32 val; - u32 saved; + u32 val, saved; int i, ret; - int retval; /* depends on chip, some EEPROM pins are muxed with LED function. * disable & restore LED function to access EEPROM. */ ret = lan78xx_read_reg(dev, HW_CFG, &val); + if (ret < 0) + return ret; + saved = val; if (dev->chipid == ID_REV_CHIP_ID_7800_) { val &= ~(HW_CFG_LED1_EN_ | HW_CFG_LED0_EN_); ret = lan78xx_write_reg(dev, HW_CFG, val); + if (ret < 0) + return ret; } - retval = lan78xx_eeprom_confirm_not_busy(dev); - if (retval) - return retval; + ret = lan78xx_eeprom_confirm_not_busy(dev); + if (ret == -ETIMEDOUT) + goto read_raw_eeprom_done; + /* If USB fails, there is nothing to do */ + if (ret < 0) + return ret; for (i = 0; i < length; i++) { val = E2P_CMD_EPC_BUSY_ | E2P_CMD_EPC_CMD_READ_; val |= (offset & E2P_CMD_EPC_ADDR_MASK_); ret = lan78xx_write_reg(dev, E2P_CMD, val); - if (unlikely(ret < 0)) { - retval = -EIO; - goto exit; - } + if (ret < 0) + return ret; - retval = lan78xx_wait_eeprom(dev); - if (retval < 0) - goto exit; + ret = lan78xx_wait_eeprom(dev); + /* Looks like not USB specific error, try to recover */ + if (ret == -ETIMEDOUT) + goto read_raw_eeprom_done; + /* If USB fails, there is nothing to do */ + if (ret < 0) + return ret; ret = lan78xx_read_reg(dev, E2P_DATA, &val); - if (unlikely(ret < 0)) { - retval = -EIO; - goto exit; - } + if (ret < 0) + return ret; data[i] = val & 0xFF; offset++; } - retval = 0; -exit: +read_raw_eeprom_done: if (dev->chipid == ID_REV_CHIP_ID_7800_) - ret = lan78xx_write_reg(dev, HW_CFG, saved); + return lan78xx_write_reg(dev, HW_CFG, saved); - return retval; + return 0; } static int lan78xx_read_eeprom(struct lan78xx_net *dev, u32 offset, u32 length, u8 *data) { - u8 sig; int ret; + u8 sig; ret = lan78xx_read_raw_eeprom(dev, 0, 1, &sig); - if ((ret == 0) && (sig == EEPROM_INDICATOR)) - ret = lan78xx_read_raw_eeprom(dev, offset, length, data); - else - ret = -EINVAL; + if (ret < 0) + return ret; - return ret; + if (sig != EEPROM_INDICATOR) + return -ENODATA; + + return lan78xx_read_raw_eeprom(dev, offset, length, data); } static int lan78xx_write_raw_eeprom(struct lan78xx_net *dev, u32 offset, @@ -1112,113 +1118,144 @@ static int lan78xx_write_raw_eeprom(struct lan78xx_net *dev, u32 offset, u32 val; u32 saved; int i, ret; - int retval; /* depends on chip, some EEPROM pins are muxed with LED function. * disable & restore LED function to access EEPROM. */ ret = lan78xx_read_reg(dev, HW_CFG, &val); + if (ret < 0) + return ret; + saved = val; if (dev->chipid == ID_REV_CHIP_ID_7800_) { val &= ~(HW_CFG_LED1_EN_ | HW_CFG_LED0_EN_); ret = lan78xx_write_reg(dev, HW_CFG, val); + if (ret < 0) + return ret; } - retval = lan78xx_eeprom_confirm_not_busy(dev); - if (retval) - goto exit; + ret = lan78xx_eeprom_confirm_not_busy(dev); + /* Looks like not USB specific error, try to recover */ + if (ret == -ETIMEDOUT) + goto write_raw_eeprom_done; + /* If USB fails, there is nothing to do */ + if (ret < 0) + return ret; /* Issue write/erase enable command */ val = E2P_CMD_EPC_BUSY_ | E2P_CMD_EPC_CMD_EWEN_; ret = lan78xx_write_reg(dev, E2P_CMD, val); - if (unlikely(ret < 0)) { - retval = -EIO; - goto exit; - } + if (ret < 0) + return ret; - retval = lan78xx_wait_eeprom(dev); - if (retval < 0) - goto exit; + ret = lan78xx_wait_eeprom(dev); + /* Looks like not USB specific error, try to recover */ + if (ret == -ETIMEDOUT) + goto write_raw_eeprom_done; + /* If USB fails, there is nothing to do */ + if (ret < 0) + return ret; for (i = 0; i < length; i++) { /* Fill data register */ val = data[i]; ret = lan78xx_write_reg(dev, E2P_DATA, val); - if (ret < 0) { - retval = -EIO; - goto exit; - } + if (ret < 0) + return ret; /* Send "write" command */ val = E2P_CMD_EPC_BUSY_ | E2P_CMD_EPC_CMD_WRITE_; val |= (offset & E2P_CMD_EPC_ADDR_MASK_); ret = lan78xx_write_reg(dev, E2P_CMD, val); - if (ret < 0) { - retval = -EIO; - goto exit; - } + if (ret < 0) + return ret; - retval = lan78xx_wait_eeprom(dev); - if (retval < 0) - goto exit; + ret = lan78xx_wait_eeprom(dev); + /* Looks like not USB specific error, try to recover */ + if (ret == -ETIMEDOUT) + goto write_raw_eeprom_done; + /* If USB fails, there is nothing to do */ + if (ret < 0) + return ret; offset++; } - retval = 0; -exit: +write_raw_eeprom_done: if (dev->chipid == ID_REV_CHIP_ID_7800_) - ret = lan78xx_write_reg(dev, HW_CFG, saved); + return lan78xx_write_reg(dev, HW_CFG, saved); - return retval; + return 0; } static int lan78xx_read_raw_otp(struct lan78xx_net *dev, u32 offset, u32 length, u8 *data) { - int i; - u32 buf; unsigned long timeout; + int ret, i; + u32 buf; - lan78xx_read_reg(dev, OTP_PWR_DN, &buf); + ret = lan78xx_read_reg(dev, OTP_PWR_DN, &buf); + if (ret < 0) + return ret; if (buf & OTP_PWR_DN_PWRDN_N_) { /* clear it and wait to be cleared */ - lan78xx_write_reg(dev, OTP_PWR_DN, 0); + ret = lan78xx_write_reg(dev, OTP_PWR_DN, 0); + if (ret < 0) + return ret; timeout = jiffies + HZ; do { usleep_range(1, 10); - lan78xx_read_reg(dev, OTP_PWR_DN, &buf); + ret = lan78xx_read_reg(dev, OTP_PWR_DN, &buf); + if (ret < 0) + return ret; + if (time_after(jiffies, timeout)) { netdev_warn(dev->net, "timeout on OTP_PWR_DN"); - return -EIO; + return -ETIMEDOUT; } } while (buf & OTP_PWR_DN_PWRDN_N_); } for (i = 0; i < length; i++) { - lan78xx_write_reg(dev, OTP_ADDR1, - ((offset + i) >> 8) & OTP_ADDR1_15_11); - lan78xx_write_reg(dev, OTP_ADDR2, - ((offset + i) & OTP_ADDR2_10_3)); + ret = lan78xx_write_reg(dev, OTP_ADDR1, + ((offset + i) >> 8) & OTP_ADDR1_15_11); + if (ret < 0) + return ret; + + ret = lan78xx_write_reg(dev, OTP_ADDR2, + ((offset + i) & OTP_ADDR2_10_3)); + if (ret < 0) + return ret; + + ret = lan78xx_write_reg(dev, OTP_FUNC_CMD, OTP_FUNC_CMD_READ_); + if (ret < 0) + return ret; - lan78xx_write_reg(dev, OTP_FUNC_CMD, OTP_FUNC_CMD_READ_); - lan78xx_write_reg(dev, OTP_CMD_GO, OTP_CMD_GO_GO_); + ret = lan78xx_write_reg(dev, OTP_CMD_GO, OTP_CMD_GO_GO_); + if (ret < 0) + return ret; timeout = jiffies + HZ; do { udelay(1); - lan78xx_read_reg(dev, OTP_STATUS, &buf); + ret = lan78xx_read_reg(dev, OTP_STATUS, &buf); + if (ret < 0) + return ret; + if (time_after(jiffies, timeout)) { netdev_warn(dev->net, "timeout on OTP_STATUS"); - return -EIO; + return -ETIMEDOUT; } } while (buf & OTP_STATUS_BUSY_); - lan78xx_read_reg(dev, OTP_RD_DATA, &buf); + ret = lan78xx_read_reg(dev, OTP_RD_DATA, &buf); + if (ret < 0) + return ret; data[i] = (u8)(buf & 0xFF); } @@ -1232,45 +1269,72 @@ static int lan78xx_write_raw_otp(struct lan78xx_net *dev, u32 offset, int i; u32 buf; unsigned long timeout; + int ret; - lan78xx_read_reg(dev, OTP_PWR_DN, &buf); + ret = lan78xx_read_reg(dev, OTP_PWR_DN, &buf); + if (ret < 0) + return ret; if (buf & OTP_PWR_DN_PWRDN_N_) { /* clear it and wait to be cleared */ - lan78xx_write_reg(dev, OTP_PWR_DN, 0); + ret = lan78xx_write_reg(dev, OTP_PWR_DN, 0); + if (ret < 0) + return ret; timeout = jiffies + HZ; do { udelay(1); - lan78xx_read_reg(dev, OTP_PWR_DN, &buf); + ret = lan78xx_read_reg(dev, OTP_PWR_DN, &buf); + if (ret < 0) + return ret; + if (time_after(jiffies, timeout)) { netdev_warn(dev->net, "timeout on OTP_PWR_DN completion"); - return -EIO; + return -ETIMEDOUT; } } while (buf & OTP_PWR_DN_PWRDN_N_); } /* set to BYTE program mode */ - lan78xx_write_reg(dev, OTP_PRGM_MODE, OTP_PRGM_MODE_BYTE_); + ret = lan78xx_write_reg(dev, OTP_PRGM_MODE, OTP_PRGM_MODE_BYTE_); + if (ret < 0) + return ret; for (i = 0; i < length; i++) { - lan78xx_write_reg(dev, OTP_ADDR1, - ((offset + i) >> 8) & OTP_ADDR1_15_11); - lan78xx_write_reg(dev, OTP_ADDR2, - ((offset + i) & OTP_ADDR2_10_3)); - lan78xx_write_reg(dev, OTP_PRGM_DATA, data[i]); - lan78xx_write_reg(dev, OTP_TST_CMD, OTP_TST_CMD_PRGVRFY_); - lan78xx_write_reg(dev, OTP_CMD_GO, OTP_CMD_GO_GO_); + ret = lan78xx_write_reg(dev, OTP_ADDR1, + ((offset + i) >> 8) & OTP_ADDR1_15_11); + if (ret < 0) + return ret; + + ret = lan78xx_write_reg(dev, OTP_ADDR2, + ((offset + i) & OTP_ADDR2_10_3)); + if (ret < 0) + return ret; + + ret = lan78xx_write_reg(dev, OTP_PRGM_DATA, data[i]); + if (ret < 0) + return ret; + + ret = lan78xx_write_reg(dev, OTP_TST_CMD, OTP_TST_CMD_PRGVRFY_); + if (ret < 0) + return ret; + + ret = lan78xx_write_reg(dev, OTP_CMD_GO, OTP_CMD_GO_GO_); + if (ret < 0) + return ret; timeout = jiffies + HZ; do { udelay(1); - lan78xx_read_reg(dev, OTP_STATUS, &buf); + ret = lan78xx_read_reg(dev, OTP_STATUS, &buf); + if (ret < 0) + return ret; + if (time_after(jiffies, timeout)) { netdev_warn(dev->net, "Timeout on OTP_STATUS completion"); - return -EIO; + return -ETIMEDOUT; } } while (buf & OTP_STATUS_BUSY_); } From 77586156b517c1d38a22c0a8662fe9401ab0f580 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 4 Dec 2024 09:41:39 +0100 Subject: [PATCH 0239/1450] net: usb: lan78xx: Add error handling to lan78xx_init_ltm Convert `lan78xx_init_ltm` to return error codes and handle errors properly. Previously, errors during the LTM initialization process were not propagated, potentially leading to undetected issues. This patch ensures: - Errors in `lan78xx_read_reg` and `lan78xx_write_reg` are checked and handled. - Errors are logged with detailed messages using `%pe` for clarity. - The function exits immediately on error, returning the error code. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241204084142.1152696-8-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/usb/lan78xx.c | 50 ++++++++++++++++++++++++++++++--------- 1 file changed, 39 insertions(+), 11 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 29f6e1a36e206..33cda7f3dd12e 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -2807,13 +2807,16 @@ static int lan78xx_vlan_rx_kill_vid(struct net_device *netdev, return 0; } -static void lan78xx_init_ltm(struct lan78xx_net *dev) +static int lan78xx_init_ltm(struct lan78xx_net *dev) { + u32 regs[6] = { 0 }; int ret; u32 buf; - u32 regs[6] = { 0 }; ret = lan78xx_read_reg(dev, USB_CFG1, &buf); + if (ret < 0) + goto init_ltm_failed; + if (buf & USB_CFG1_LTM_ENABLE_) { u8 temp[2]; /* Get values from EEPROM first */ @@ -2824,7 +2827,7 @@ static void lan78xx_init_ltm(struct lan78xx_net *dev) 24, (u8 *)regs); if (ret < 0) - return; + return ret; } } else if (lan78xx_read_otp(dev, 0x3F, 2, temp) == 0) { if (temp[0] == 24) { @@ -2833,17 +2836,40 @@ static void lan78xx_init_ltm(struct lan78xx_net *dev) 24, (u8 *)regs); if (ret < 0) - return; + return ret; } } } - lan78xx_write_reg(dev, LTM_BELT_IDLE0, regs[0]); - lan78xx_write_reg(dev, LTM_BELT_IDLE1, regs[1]); - lan78xx_write_reg(dev, LTM_BELT_ACT0, regs[2]); - lan78xx_write_reg(dev, LTM_BELT_ACT1, regs[3]); - lan78xx_write_reg(dev, LTM_INACTIVE0, regs[4]); - lan78xx_write_reg(dev, LTM_INACTIVE1, regs[5]); + ret = lan78xx_write_reg(dev, LTM_BELT_IDLE0, regs[0]); + if (ret < 0) + goto init_ltm_failed; + + ret = lan78xx_write_reg(dev, LTM_BELT_IDLE1, regs[1]); + if (ret < 0) + goto init_ltm_failed; + + ret = lan78xx_write_reg(dev, LTM_BELT_ACT0, regs[2]); + if (ret < 0) + goto init_ltm_failed; + + ret = lan78xx_write_reg(dev, LTM_BELT_ACT1, regs[3]); + if (ret < 0) + goto init_ltm_failed; + + ret = lan78xx_write_reg(dev, LTM_INACTIVE0, regs[4]); + if (ret < 0) + goto init_ltm_failed; + + ret = lan78xx_write_reg(dev, LTM_INACTIVE1, regs[5]); + if (ret < 0) + goto init_ltm_failed; + + return 0; + +init_ltm_failed: + netdev_err(dev->net, "Failed to init LTM with error %pe\n", ERR_PTR(ret)); + return ret; } static int lan78xx_urb_config_init(struct lan78xx_net *dev) @@ -2939,7 +2965,9 @@ static int lan78xx_reset(struct lan78xx_net *dev) return ret; /* Init LTM */ - lan78xx_init_ltm(dev); + ret = lan78xx_init_ltm(dev); + if (ret < 0) + return ret; ret = lan78xx_write_reg(dev, BURST_CAP, dev->burst_cap); if (ret < 0) From 65520a70cb09200d916464ddaa04e996e689c576 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 4 Dec 2024 09:41:40 +0100 Subject: [PATCH 0240/1450] net: usb: lan78xx: Add error handling to set_rx_max_frame_length and set_mtu Improve error handling in `lan78xx_set_rx_max_frame_length` by: - Checking return values from register read/write operations and propagating errors. - Exiting immediately on failure to ensure proper error reporting. In `lan78xx_change_mtu`, log errors when changing MTU fails, using `%pe` for clear error representation. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241204084142.1152696-9-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/usb/lan78xx.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 33cda7f3dd12e..2d16c1fc850e9 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -2599,27 +2599,36 @@ static int lan78xx_phy_init(struct lan78xx_net *dev) static int lan78xx_set_rx_max_frame_length(struct lan78xx_net *dev, int size) { - u32 buf; bool rxenabled; + u32 buf; + int ret; - lan78xx_read_reg(dev, MAC_RX, &buf); + ret = lan78xx_read_reg(dev, MAC_RX, &buf); + if (ret < 0) + return ret; rxenabled = ((buf & MAC_RX_RXEN_) != 0); if (rxenabled) { buf &= ~MAC_RX_RXEN_; - lan78xx_write_reg(dev, MAC_RX, buf); + ret = lan78xx_write_reg(dev, MAC_RX, buf); + if (ret < 0) + return ret; } /* add 4 to size for FCS */ buf &= ~MAC_RX_MAX_SIZE_MASK_; buf |= (((size + 4) << MAC_RX_MAX_SIZE_SHIFT_) & MAC_RX_MAX_SIZE_MASK_); - lan78xx_write_reg(dev, MAC_RX, buf); + ret = lan78xx_write_reg(dev, MAC_RX, buf); + if (ret < 0) + return ret; if (rxenabled) { buf |= MAC_RX_RXEN_; - lan78xx_write_reg(dev, MAC_RX, buf); + ret = lan78xx_write_reg(dev, MAC_RX, buf); + if (ret < 0) + return ret; } return 0; @@ -2685,7 +2694,10 @@ static int lan78xx_change_mtu(struct net_device *netdev, int new_mtu) return ret; ret = lan78xx_set_rx_max_frame_length(dev, max_frame_len); - if (!ret) + if (ret < 0) + netdev_err(dev->net, "MTU changed to %d from %d failed with %pe\n", + new_mtu, netdev->mtu, ERR_PTR(ret)); + else WRITE_ONCE(netdev->mtu, new_mtu); usb_autopm_put_interface(dev->intf); From 0da202e6a56f6ec137fde151c1a1a9d39a4135c0 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 4 Dec 2024 09:41:41 +0100 Subject: [PATCH 0241/1450] net: usb: lan78xx: Add error handling to lan78xx_irq_bus_sync_unlock Update `lan78xx_irq_bus_sync_unlock` to handle errors in register read/write operations. If an error occurs, log it and exit the function appropriately. This ensures proper handling of failures during IRQ synchronization. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241204084142.1152696-10-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/usb/lan78xx.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 2d16c1fc850e9..2ae9565b5044f 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -2382,13 +2382,22 @@ static void lan78xx_irq_bus_sync_unlock(struct irq_data *irqd) struct lan78xx_net *dev = container_of(data, struct lan78xx_net, domain_data); u32 buf; + int ret; /* call register access here because irq_bus_lock & irq_bus_sync_unlock * are only two callbacks executed in non-atomic contex. */ - lan78xx_read_reg(dev, INT_EP_CTL, &buf); + ret = lan78xx_read_reg(dev, INT_EP_CTL, &buf); + if (ret < 0) + goto irq_bus_sync_unlock; + if (buf != data->irqenable) - lan78xx_write_reg(dev, INT_EP_CTL, data->irqenable); + ret = lan78xx_write_reg(dev, INT_EP_CTL, data->irqenable); + +irq_bus_sync_unlock: + if (ret < 0) + netdev_err(dev->net, "Failed to sync IRQ enable register: %pe\n", + ERR_PTR(ret)); mutex_unlock(&data->irq_lock); } From 48fb3d3c4be602f0977f81d20de7deb0e3807575 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 4 Dec 2024 09:41:42 +0100 Subject: [PATCH 0242/1450] net: usb: lan78xx: Improve error handling in dataport and multicast writes Update `lan78xx_dataport_write` and `lan78xx_deferred_multicast_write` to: - Handle errors during register read/write operations. - Exit immediately on errors and log them using `%pe` for clarity. - Avoid silent failures by propagating error codes properly. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241204084142.1152696-11-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/usb/lan78xx.c | 67 ++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 22 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 2ae9565b5044f..d5f6367d37148 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -1371,7 +1371,7 @@ static int lan78xx_dataport_wait_not_busy(struct lan78xx_net *dev) ret = lan78xx_read_reg(dev, DP_SEL, &dp_sel); if (unlikely(ret < 0)) - return -EIO; + return ret; if (dp_sel & DP_SEL_DPRDY_) return 0; @@ -1381,44 +1381,51 @@ static int lan78xx_dataport_wait_not_busy(struct lan78xx_net *dev) netdev_warn(dev->net, "%s timed out", __func__); - return -EIO; + return -ETIMEDOUT; } static int lan78xx_dataport_write(struct lan78xx_net *dev, u32 ram_select, u32 addr, u32 length, u32 *buf) { struct lan78xx_priv *pdata = (struct lan78xx_priv *)(dev->data[0]); - u32 dp_sel; int i, ret; - if (usb_autopm_get_interface(dev->intf) < 0) - return 0; + ret = usb_autopm_get_interface(dev->intf); + if (ret < 0) + return ret; mutex_lock(&pdata->dataport_mutex); ret = lan78xx_dataport_wait_not_busy(dev); if (ret < 0) - goto done; + goto dataport_write; - ret = lan78xx_read_reg(dev, DP_SEL, &dp_sel); - - dp_sel &= ~DP_SEL_RSEL_MASK_; - dp_sel |= ram_select; - ret = lan78xx_write_reg(dev, DP_SEL, dp_sel); + ret = lan78xx_update_reg(dev, DP_SEL, DP_SEL_RSEL_MASK_, ram_select); + if (ret < 0) + goto dataport_write; for (i = 0; i < length; i++) { ret = lan78xx_write_reg(dev, DP_ADDR, addr + i); + if (ret < 0) + goto dataport_write; ret = lan78xx_write_reg(dev, DP_DATA, buf[i]); + if (ret < 0) + goto dataport_write; ret = lan78xx_write_reg(dev, DP_CMD, DP_CMD_WRITE_); + if (ret < 0) + goto dataport_write; ret = lan78xx_dataport_wait_not_busy(dev); if (ret < 0) - goto done; + goto dataport_write; } -done: +dataport_write: + if (ret < 0) + netdev_warn(dev->net, "dataport write failed %pe", ERR_PTR(ret)); + mutex_unlock(&pdata->dataport_mutex); usb_autopm_put_interface(dev->intf); @@ -1454,23 +1461,39 @@ static void lan78xx_deferred_multicast_write(struct work_struct *param) struct lan78xx_priv *pdata = container_of(param, struct lan78xx_priv, set_multicast); struct lan78xx_net *dev = pdata->dev; - int i; + int i, ret; netif_dbg(dev, drv, dev->net, "deferred multicast write 0x%08x\n", pdata->rfe_ctl); - lan78xx_dataport_write(dev, DP_SEL_RSEL_VLAN_DA_, DP_SEL_VHF_VLAN_LEN, - DP_SEL_VHF_HASH_LEN, pdata->mchash_table); + ret = lan78xx_dataport_write(dev, DP_SEL_RSEL_VLAN_DA_, + DP_SEL_VHF_VLAN_LEN, + DP_SEL_VHF_HASH_LEN, pdata->mchash_table); + if (ret < 0) + goto multicast_write_done; for (i = 1; i < NUM_OF_MAF; i++) { - lan78xx_write_reg(dev, MAF_HI(i), 0); - lan78xx_write_reg(dev, MAF_LO(i), - pdata->pfilter_table[i][1]); - lan78xx_write_reg(dev, MAF_HI(i), - pdata->pfilter_table[i][0]); + ret = lan78xx_write_reg(dev, MAF_HI(i), 0); + if (ret < 0) + goto multicast_write_done; + + ret = lan78xx_write_reg(dev, MAF_LO(i), + pdata->pfilter_table[i][1]); + if (ret < 0) + goto multicast_write_done; + + ret = lan78xx_write_reg(dev, MAF_HI(i), + pdata->pfilter_table[i][0]); + if (ret < 0) + goto multicast_write_done; } - lan78xx_write_reg(dev, RFE_CTL, pdata->rfe_ctl); + ret = lan78xx_write_reg(dev, RFE_CTL, pdata->rfe_ctl); + +multicast_write_done: + if (ret < 0) + netdev_warn(dev->net, "multicast write failed %pe", ERR_PTR(ret)); + return; } static void lan78xx_set_multicast(struct net_device *netdev) From 18eabadd73ae60023ab05e376246bd725fb0c113 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 4 Dec 2024 13:11:21 +0100 Subject: [PATCH 0243/1450] vrf: Make pcpu_dstats update functions available to other modules. Currently vrf is the only module that uses NETDEV_PCPU_STAT_DSTATS. In order to make this kind of statistics available to other modules, we need to define the update functions in netdevice.h. Therefore, let's define dev_dstats_*() functions for RX and TX packet updates (packets, bytes and drops). Use these new functions in vrf.c instead of vrf_rx_stats() and the other manual counter updates. While there, update the type of the "len" variables to "unsigned int", so that there're aligned with both skb->len and the new dstats update functions. Signed-off-by: Guillaume Nault Link: https://patch.msgid.link/d7a552ee382c79f4854e7fcc224cf176cd21150d.1733313925.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/vrf.c | 49 +++++++++++---------------------------- include/linux/netdevice.h | 40 ++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 35 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 67d25f4f94eff..ca81b212a246b 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -122,16 +122,6 @@ struct net_vrf { int ifindex; }; -static void vrf_rx_stats(struct net_device *dev, int len) -{ - struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); - - u64_stats_update_begin(&dstats->syncp); - u64_stats_inc(&dstats->rx_packets); - u64_stats_add(&dstats->rx_bytes, len); - u64_stats_update_end(&dstats->syncp); -} - static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb) { vrf_dev->stats.tx_errors++; @@ -369,7 +359,7 @@ static bool qdisc_tx_is_default(const struct net_device *dev) static int vrf_local_xmit(struct sk_buff *skb, struct net_device *dev, struct dst_entry *dst) { - int len = skb->len; + unsigned int len = skb->len; skb_orphan(skb); @@ -382,15 +372,10 @@ static int vrf_local_xmit(struct sk_buff *skb, struct net_device *dev, skb->protocol = eth_type_trans(skb, dev); - if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) { - vrf_rx_stats(dev, len); - } else { - struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); - - u64_stats_update_begin(&dstats->syncp); - u64_stats_inc(&dstats->rx_drops); - u64_stats_update_end(&dstats->syncp); - } + if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) + dev_dstats_rx_add(dev, len); + else + dev_dstats_rx_dropped(dev); return NETDEV_TX_OK; } @@ -578,20 +563,14 @@ static netdev_tx_t is_ip_tx_frame(struct sk_buff *skb, struct net_device *dev) static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev) { - struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); - - int len = skb->len; - netdev_tx_t ret = is_ip_tx_frame(skb, dev); - - u64_stats_update_begin(&dstats->syncp); - if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { + unsigned int len = skb->len; + netdev_tx_t ret; - u64_stats_inc(&dstats->tx_packets); - u64_stats_add(&dstats->tx_bytes, len); - } else { - u64_stats_inc(&dstats->tx_drops); - } - u64_stats_update_end(&dstats->syncp); + ret = is_ip_tx_frame(skb, dev); + if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) + dev_dstats_tx_add(dev, len); + else + dev_dstats_tx_dropped(dev); return ret; } @@ -1364,7 +1343,7 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, if (!is_ndisc) { struct net_device *orig_dev = skb->dev; - vrf_rx_stats(vrf_dev, skb->len); + dev_dstats_rx_add(vrf_dev, skb->len); skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; @@ -1420,7 +1399,7 @@ static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev, goto out; } - vrf_rx_stats(vrf_dev, skb->len); + dev_dstats_rx_add(vrf_dev, skb->len); if (!list_empty(&vrf_dev->ptype_all)) { int err; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d1a8d98b132c6..1351054416812 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2854,6 +2854,46 @@ static inline void dev_lstats_add(struct net_device *dev, unsigned int len) u64_stats_update_end(&lstats->syncp); } +static inline void dev_dstats_rx_add(struct net_device *dev, + unsigned int len) +{ + struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); + + u64_stats_update_begin(&dstats->syncp); + u64_stats_inc(&dstats->rx_packets); + u64_stats_add(&dstats->rx_bytes, len); + u64_stats_update_end(&dstats->syncp); +} + +static inline void dev_dstats_rx_dropped(struct net_device *dev) +{ + struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); + + u64_stats_update_begin(&dstats->syncp); + u64_stats_inc(&dstats->rx_drops); + u64_stats_update_end(&dstats->syncp); +} + +static inline void dev_dstats_tx_add(struct net_device *dev, + unsigned int len) +{ + struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); + + u64_stats_update_begin(&dstats->syncp); + u64_stats_inc(&dstats->tx_packets); + u64_stats_add(&dstats->tx_bytes, len); + u64_stats_update_end(&dstats->syncp); +} + +static inline void dev_dstats_tx_dropped(struct net_device *dev) +{ + struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); + + u64_stats_update_begin(&dstats->syncp); + u64_stats_inc(&dstats->tx_drops); + u64_stats_update_end(&dstats->syncp); +} + #define __netdev_alloc_pcpu_stats(type, gfp) \ ({ \ typeof(type) __percpu *pcpu_stats = alloc_percpu_gfp(type, gfp);\ From be226352e8dc77d3313c096b2d8e7f69bf6980fc Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 4 Dec 2024 13:11:27 +0100 Subject: [PATCH 0244/1450] vxlan: Handle stats using NETDEV_PCPU_STAT_DSTATS. VXLAN uses the TSTATS infrastructure (dev_sw_netstats_*()) for RX and TX packet counters. It also uses the device core stats (dev_core_stats_*()) for RX and TX drops. Let's consolidate that using the DSTATS infrastructure, which can handle both packet counters and packet drops. Statistics that don't fit DSTATS are still updated atomically with DEV_STATS_INC(). While there, convert the "len" variable of vxlan_encap_bypass() to unsigned int, to respect the types of skb->len and dev_dstats_[rt]x_add(). Signed-off-by: Guillaume Nault Link: https://patch.msgid.link/145558b184b3cda77911ca5682b6eb83c3ffed8e.1733313925.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_core.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 9ea63059d52d7..b46a799bd3904 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -1818,14 +1818,14 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) if (unlikely(!(vxlan->dev->flags & IFF_UP))) { rcu_read_unlock(); - dev_core_stats_rx_dropped_inc(vxlan->dev); + dev_dstats_rx_dropped(vxlan->dev); vxlan_vnifilter_count(vxlan, vni, vninode, VXLAN_VNI_STATS_RX_DROPS, 0); reason = SKB_DROP_REASON_DEV_READY; goto drop; } - dev_sw_netstats_rx_add(vxlan->dev, skb->len); + dev_dstats_rx_add(vxlan->dev, skb->len); vxlan_vnifilter_count(vxlan, vni, vninode, VXLAN_VNI_STATS_RX, skb->len); gro_cells_receive(&vxlan->gro_cells, skb); @@ -1880,7 +1880,7 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) goto out; if (!pskb_may_pull(skb, arp_hdr_len(dev))) { - dev_core_stats_tx_dropped_inc(dev); + dev_dstats_tx_dropped(dev); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); goto out; @@ -1938,7 +1938,7 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) reply->pkt_type = PACKET_HOST; if (netif_rx(reply) == NET_RX_DROP) { - dev_core_stats_rx_dropped_inc(dev); + dev_dstats_rx_dropped(dev); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_RX_DROPS, 0); } @@ -2097,7 +2097,7 @@ static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) goto out; if (netif_rx(reply) == NET_RX_DROP) { - dev_core_stats_rx_dropped_inc(dev); + dev_dstats_rx_dropped(dev); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_RX_DROPS, 0); } @@ -2271,8 +2271,8 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, { union vxlan_addr loopback; union vxlan_addr *remote_ip = &dst_vxlan->default_dst.remote_ip; + unsigned int len = skb->len; struct net_device *dev; - int len = skb->len; skb->pkt_type = PACKET_HOST; skb->encapsulation = 0; @@ -2299,16 +2299,16 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, if ((dst_vxlan->cfg.flags & VXLAN_F_LEARN) && snoop) vxlan_snoop(dev, &loopback, eth_hdr(skb)->h_source, 0, vni); - dev_sw_netstats_tx_add(src_vxlan->dev, 1, len); + dev_dstats_tx_add(src_vxlan->dev, len); vxlan_vnifilter_count(src_vxlan, vni, NULL, VXLAN_VNI_STATS_TX, len); if (__netif_rx(skb) == NET_RX_SUCCESS) { - dev_sw_netstats_rx_add(dst_vxlan->dev, len); + dev_dstats_rx_add(dst_vxlan->dev, len); vxlan_vnifilter_count(dst_vxlan, vni, NULL, VXLAN_VNI_STATS_RX, len); } else { drop: - dev_core_stats_rx_dropped_inc(dev); + dev_dstats_rx_dropped(dev); vxlan_vnifilter_count(dst_vxlan, vni, NULL, VXLAN_VNI_STATS_RX_DROPS, 0); } @@ -2621,7 +2621,7 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, return; drop: - dev_core_stats_tx_dropped_inc(dev); + dev_dstats_tx_dropped(dev); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); kfree_skb_reason(skb, reason); return; @@ -2666,7 +2666,7 @@ static void vxlan_xmit_nh(struct sk_buff *skb, struct net_device *dev, return; drop: - dev_core_stats_tx_dropped_inc(dev); + dev_dstats_tx_dropped(dev); vxlan_vnifilter_count(netdev_priv(dev), vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); dev_kfree_skb(skb); @@ -2704,7 +2704,7 @@ static netdev_tx_t vxlan_xmit_nhid(struct sk_buff *skb, struct net_device *dev, return NETDEV_TX_OK; drop: - dev_core_stats_tx_dropped_inc(dev); + dev_dstats_tx_dropped(dev); vxlan_vnifilter_count(netdev_priv(dev), vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); dev_kfree_skb(skb); @@ -2801,7 +2801,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) !is_multicast_ether_addr(eth->h_dest)) vxlan_fdb_miss(vxlan, eth->h_dest); - dev_core_stats_tx_dropped_inc(dev); + dev_dstats_tx_dropped(dev); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); kfree_skb_reason(skb, SKB_DROP_REASON_VXLAN_NO_REMOTE); @@ -3371,7 +3371,7 @@ static void vxlan_setup(struct net_device *dev) dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = ETH_MAX_MTU; - dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; INIT_LIST_HEAD(&vxlan->next); timer_setup(&vxlan->age_timer, vxlan_cleanup, TIMER_DEFERRABLE); From 6fa6de30224619f836f4cb1209b5af3f5319806e Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 4 Dec 2024 13:11:30 +0100 Subject: [PATCH 0245/1450] geneve: Handle stats using NETDEV_PCPU_STAT_DSTATS. Geneve uses the TSTATS infrastructure (dev_sw_netstats_*()) for RX packet counters. All other counters are handled using atomic increments with DEV_STATS_INC(). Let's convert packet stats handling to DSTATS, which has a per-cpu counter for packet drops too, to avoid the cost of atomic increments in these cases. Statistics that don't fit DSTATS are still updated atomically with DEV_STATS_INC(). Signed-off-by: Guillaume Nault Link: https://patch.msgid.link/7af5c09f3c26f0f231fbe383822ca5d1ce0278fa.1733313925.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/geneve.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index bc658bc608854..642155cb83157 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -235,7 +235,7 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, vni_to_tunnel_id(gnvh->vni), gnvh->opt_len * 4); if (!tun_dst) { - DEV_STATS_INC(geneve->dev, rx_dropped); + dev_dstats_rx_dropped(geneve->dev); goto drop; } /* Update tunnel dst according to Geneve options. */ @@ -322,7 +322,7 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, len = skb->len; err = gro_cells_receive(&geneve->gro_cells, skb); if (likely(err == NET_RX_SUCCESS)) - dev_sw_netstats_rx_add(geneve->dev, len); + dev_dstats_rx_add(geneve->dev, len); return; drop: @@ -387,14 +387,14 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) if (unlikely((!geneve->cfg.inner_proto_inherit && inner_proto != htons(ETH_P_TEB)))) { - DEV_STATS_INC(geneve->dev, rx_dropped); + dev_dstats_rx_dropped(geneve->dev); goto drop; } opts_len = geneveh->opt_len * 4; if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, inner_proto, !net_eq(geneve->net, dev_net(geneve->dev)))) { - DEV_STATS_INC(geneve->dev, rx_dropped); + dev_dstats_rx_dropped(geneve->dev); goto drop; } @@ -1023,7 +1023,7 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev) if (unlikely(!info || !(info->mode & IP_TUNNEL_INFO_TX))) { netdev_dbg(dev, "no tunnel metadata\n"); dev_kfree_skb(skb); - DEV_STATS_INC(dev, tx_dropped); + dev_dstats_tx_dropped(dev); return NETDEV_TX_OK; } } else { @@ -1202,7 +1202,7 @@ static void geneve_setup(struct net_device *dev) dev->hw_features |= NETIF_F_RXCSUM; dev->hw_features |= NETIF_F_GSO_SOFTWARE; - dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; /* MTU range: 68 - (something less than 65535) */ dev->min_mtu = ETH_MIN_MTU; /* The max_mtu calculation does not take account of GENEVE From c77200c074917d0fd51e5c029c50c76c07b6d310 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 4 Dec 2024 13:11:32 +0100 Subject: [PATCH 0246/1450] bareudp: Handle stats using NETDEV_PCPU_STAT_DSTATS. Bareudp uses the TSTATS infrastructure (dev_sw_netstats_*()) for RX packet counters. It was also recently converted to use the device core stats (dev_core_stats_*()) for RX and TX drops (see commit 788d5d655bc9 ("bareudp: Use pcpu stats to update rx_dropped counter.")). Since core stats are to be avoided in drivers, and for consistency with VXLAN and Geneve, let's convert packet stats handling to DSTATS, which can handle RX/TX stats and packet drops. Statistics that don't fit DSTATS are still updated atomically with DEV_STATS_INC(). Signed-off-by: Guillaume Nault Link: https://patch.msgid.link/0f4f8448db3ff449ac6e939872b28cf3f8982da7.1733313925.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/bareudp.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c index a2abfade82dd1..70814303aab86 100644 --- a/drivers/net/bareudp.c +++ b/drivers/net/bareudp.c @@ -84,7 +84,7 @@ static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) if (skb_copy_bits(skb, BAREUDP_BASE_HLEN, &ipversion, sizeof(ipversion))) { - dev_core_stats_rx_dropped_inc(bareudp->dev); + dev_dstats_rx_dropped(bareudp->dev); goto drop; } ipversion >>= 4; @@ -94,7 +94,7 @@ static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) } else if (ipversion == 6 && bareudp->multi_proto_mode) { proto = htons(ETH_P_IPV6); } else { - dev_core_stats_rx_dropped_inc(bareudp->dev); + dev_dstats_rx_dropped(bareudp->dev); goto drop; } } else if (bareudp->ethertype == htons(ETH_P_MPLS_UC)) { @@ -108,7 +108,7 @@ static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) ipv4_is_multicast(tunnel_hdr->daddr)) { proto = htons(ETH_P_MPLS_MC); } else { - dev_core_stats_rx_dropped_inc(bareudp->dev); + dev_dstats_rx_dropped(bareudp->dev); goto drop; } } else { @@ -124,7 +124,7 @@ static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) (addr_type & IPV6_ADDR_MULTICAST)) { proto = htons(ETH_P_MPLS_MC); } else { - dev_core_stats_rx_dropped_inc(bareudp->dev); + dev_dstats_rx_dropped(bareudp->dev); goto drop; } } @@ -136,7 +136,7 @@ static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) proto, !net_eq(bareudp->net, dev_net(bareudp->dev)))) { - dev_core_stats_rx_dropped_inc(bareudp->dev); + dev_dstats_rx_dropped(bareudp->dev); goto drop; } @@ -144,7 +144,7 @@ static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) tun_dst = udp_tun_rx_dst(skb, family, key, 0, 0); if (!tun_dst) { - dev_core_stats_rx_dropped_inc(bareudp->dev); + dev_dstats_rx_dropped(bareudp->dev); goto drop; } skb_dst_set(skb, &tun_dst->dst); @@ -194,7 +194,7 @@ static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) len = skb->len; err = gro_cells_receive(&bareudp->gro_cells, skb); if (likely(err == NET_RX_SUCCESS)) - dev_sw_netstats_rx_add(bareudp->dev, len); + dev_dstats_rx_add(bareudp->dev, len); return 0; drop: @@ -589,7 +589,7 @@ static void bareudp_setup(struct net_device *dev) dev->priv_flags |= IFF_NO_QUEUE; dev->lltx = true; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; - dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; } static int bareudp_validate(struct nlattr *tb[], struct nlattr *data[], From 00ab246750821b226f14ebc94ad21431dc82820b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 6 Dec 2024 11:30:56 +0100 Subject: [PATCH 0247/1450] tools: ynl-gen-c: annotate valid choices for --mode This makes argparse validate the input and helps users understand which modes are possible. Signed-off-by: Johannes Berg Link: https://patch.msgid.link/20241206113100.e2ab5cf6937c.Ie149a0ca5df713860964b44fe9d9ae547f2e1553@changeid Signed-off-by: Jakub Kicinski --- tools/net/ynl/ynl-gen-c.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py index 8098bcbb6f404..7f6e5157770d3 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/ynl-gen-c.py @@ -2706,7 +2706,8 @@ def find_kernel_root(full_path): def main(): parser = argparse.ArgumentParser(description='Netlink simple parsing generator') - parser.add_argument('--mode', dest='mode', type=str, required=True) + parser.add_argument('--mode', dest='mode', type=str, required=True, + choices=('user', 'kernel', 'uapi')) parser.add_argument('--spec', dest='spec', type=str, required=True) parser.add_argument('--header', dest='header', action='store_true', default=None) parser.add_argument('--source', dest='header', action='store_false') From 81d89e6e88d5d592c1792940753d69d9753b3a8a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 6 Dec 2024 11:30:57 +0100 Subject: [PATCH 0248/1450] tools: ynl-gen-c: don't require -o argument Without -o the tool currently crashes, but it's not marked as required. The only thing we can't do without it is to generate the correct #include for user source files, but we can put a placeholder instead. Signed-off-by: Johannes Berg Link: https://patch.msgid.link/20241206113100.89d35bf124d6.I9228fb704e6d5c9d8e046ef15025a47a48439c1e@changeid Signed-off-by: Jakub Kicinski --- tools/net/ynl/ynl-gen-c.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py index 7f6e5157770d3..ec22889487952 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/ynl-gen-c.py @@ -2761,7 +2761,10 @@ def main(): cw.p('#define ' + hdr_prot) cw.nl() - hdr_file=os.path.basename(args.out_file[:-2]) + ".h" + if args.out_file: + hdr_file = os.path.basename(args.out_file[:-2]) + ".h" + else: + hdr_file = "generated_header_file.h" if args.mode == 'kernel': cw.p('#include ') From 3ca459eaba1bf96a8c7878de84fa8872259a01e3 Mon Sep 17 00:00:00 2001 From: Stas Sergeev Date: Thu, 5 Dec 2024 10:36:14 +0300 Subject: [PATCH 0249/1450] tun: fix group permission check Currently tun checks the group permission even if the user have matched. Besides going against the usual permission semantic, this has a very interesting implication: if the tun group is not among the supplementary groups of the tun user, then effectively no one can access the tun device. CAP_SYS_ADMIN still can, but its the same as not setting the tun ownership. This patch relaxes the group checking so that either the user match or the group match is enough. This avoids the situation when no one can access the device even though the ownership is properly set. Also I simplified the logic by removing the redundant inversions: tun_not_capable() --> !tun_capable() Signed-off-by: Stas Sergeev Reviewed-by: Willem de Bruijn Acked-by: Jason Wang Link: https://patch.msgid.link/20241205073614.294773-1-stsp2@yandex.ru Signed-off-by: Jakub Kicinski --- drivers/net/tun.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index d7a865ef370b6..8e94df88392c6 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -574,14 +574,18 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, return ret; } -static inline bool tun_not_capable(struct tun_struct *tun) +static inline bool tun_capable(struct tun_struct *tun) { const struct cred *cred = current_cred(); struct net *net = dev_net(tun->dev); - return ((uid_valid(tun->owner) && !uid_eq(cred->euid, tun->owner)) || - (gid_valid(tun->group) && !in_egroup_p(tun->group))) && - !ns_capable(net->user_ns, CAP_NET_ADMIN); + if (ns_capable(net->user_ns, CAP_NET_ADMIN)) + return 1; + if (uid_valid(tun->owner) && uid_eq(cred->euid, tun->owner)) + return 1; + if (gid_valid(tun->group) && in_egroup_p(tun->group)) + return 1; + return 0; } static void tun_set_real_num_queues(struct tun_struct *tun) @@ -2778,7 +2782,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) !!(tun->flags & IFF_MULTI_QUEUE)) return -EINVAL; - if (tun_not_capable(tun)) + if (!tun_capable(tun)) return -EPERM; err = security_tun_dev_open(tun->security); if (err < 0) From 6561f0e547be221f411fda5eddfcc5bd8bb058a5 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 5 Dec 2024 09:42:24 +0000 Subject: [PATCH 0250/1450] net: pcs: pcs-lynx: implement pcs_inband_caps() method Report the PCS in-band capabilities to phylink for the Lynx PCS. Reviewed-by: Maxime Chevallier Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1tJ8NM-006L5J-AH@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/pcs/pcs-lynx.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/drivers/net/pcs/pcs-lynx.c b/drivers/net/pcs/pcs-lynx.c index b79aedad855be..767a8c0714acc 100644 --- a/drivers/net/pcs/pcs-lynx.c +++ b/drivers/net/pcs/pcs-lynx.c @@ -35,6 +35,27 @@ enum sgmii_speed { #define phylink_pcs_to_lynx(pl_pcs) container_of((pl_pcs), struct lynx_pcs, pcs) #define lynx_to_phylink_pcs(lynx) (&(lynx)->pcs) +static unsigned int lynx_pcs_inband_caps(struct phylink_pcs *pcs, + phy_interface_t interface) +{ + switch (interface) { + case PHY_INTERFACE_MODE_1000BASEX: + case PHY_INTERFACE_MODE_SGMII: + case PHY_INTERFACE_MODE_QSGMII: + return LINK_INBAND_DISABLE | LINK_INBAND_ENABLE; + + case PHY_INTERFACE_MODE_10GBASER: + case PHY_INTERFACE_MODE_2500BASEX: + return LINK_INBAND_DISABLE; + + case PHY_INTERFACE_MODE_USXGMII: + return LINK_INBAND_ENABLE; + + default: + return 0; + } +} + static void lynx_pcs_get_state_usxgmii(struct mdio_device *pcs, struct phylink_link_state *state) { @@ -306,6 +327,7 @@ static void lynx_pcs_link_up(struct phylink_pcs *pcs, unsigned int neg_mode, } static const struct phylink_pcs_ops lynx_pcs_phylink_ops = { + .pcs_inband_caps = lynx_pcs_inband_caps, .pcs_get_state = lynx_pcs_get_state, .pcs_config = lynx_pcs_config, .pcs_an_restart = lynx_pcs_an_restart, From 520d29bdda86915b3caf8c72825a574bff212553 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 5 Dec 2024 09:42:29 +0000 Subject: [PATCH 0251/1450] net: pcs: pcs-mtk-lynxi: implement pcs_inband_caps() method Report the PCS in-band capabilities to phylink for the LynxI PCS. Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1tJ8NR-006L5P-E3@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/pcs/pcs-mtk-lynxi.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/net/pcs/pcs-mtk-lynxi.c b/drivers/net/pcs/pcs-mtk-lynxi.c index 4f63abe638c4c..7de8045352298 100644 --- a/drivers/net/pcs/pcs-mtk-lynxi.c +++ b/drivers/net/pcs/pcs-mtk-lynxi.c @@ -88,6 +88,21 @@ static struct mtk_pcs_lynxi *pcs_to_mtk_pcs_lynxi(struct phylink_pcs *pcs) return container_of(pcs, struct mtk_pcs_lynxi, pcs); } +static unsigned int mtk_pcs_lynxi_inband_caps(struct phylink_pcs *pcs, + phy_interface_t interface) +{ + switch (interface) { + case PHY_INTERFACE_MODE_1000BASEX: + case PHY_INTERFACE_MODE_2500BASEX: + case PHY_INTERFACE_MODE_SGMII: + case PHY_INTERFACE_MODE_QSGMII: + return LINK_INBAND_DISABLE | LINK_INBAND_ENABLE; + + default: + return 0; + } +} + static void mtk_pcs_lynxi_get_state(struct phylink_pcs *pcs, struct phylink_link_state *state) { @@ -241,6 +256,7 @@ static void mtk_pcs_lynxi_disable(struct phylink_pcs *pcs) } static const struct phylink_pcs_ops mtk_pcs_lynxi_ops = { + .pcs_inband_caps = mtk_pcs_lynxi_inband_caps, .pcs_get_state = mtk_pcs_lynxi_get_state, .pcs_config = mtk_pcs_lynxi_config, .pcs_an_restart = mtk_pcs_lynxi_restart_an, From 484d0170d6c6bbb5213d037664e9a551f793bacd Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 5 Dec 2024 09:42:34 +0000 Subject: [PATCH 0252/1450] net: pcs: xpcs: implement pcs_inband_caps() method Report the PCS inband capabilities to phylink for XPCS. Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1tJ8NW-006L5V-I9@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/pcs/pcs-xpcs.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c index 7246a910728d1..f70ca39f0905b 100644 --- a/drivers/net/pcs/pcs-xpcs.c +++ b/drivers/net/pcs/pcs-xpcs.c @@ -567,6 +567,33 @@ static int xpcs_validate(struct phylink_pcs *pcs, unsigned long *supported, return 0; } +static unsigned int xpcs_inband_caps(struct phylink_pcs *pcs, + phy_interface_t interface) +{ + struct dw_xpcs *xpcs = phylink_pcs_to_xpcs(pcs); + const struct dw_xpcs_compat *compat; + + compat = xpcs_find_compat(xpcs, interface); + if (!compat) + return 0; + + switch (compat->an_mode) { + case DW_AN_C73: + return LINK_INBAND_ENABLE; + + case DW_AN_C37_SGMII: + case DW_AN_C37_1000BASEX: + return LINK_INBAND_DISABLE | LINK_INBAND_ENABLE; + + case DW_10GBASER: + case DW_2500BASEX: + return LINK_INBAND_DISABLE; + + default: + return 0; + } +} + void xpcs_get_interfaces(struct dw_xpcs *xpcs, unsigned long *interfaces) { const struct dw_xpcs_compat *compat; @@ -1306,6 +1333,7 @@ static const struct dw_xpcs_desc xpcs_desc_list[] = { static const struct phylink_pcs_ops xpcs_phylink_ops = { .pcs_validate = xpcs_validate, + .pcs_inband_caps = xpcs_inband_caps, .pcs_pre_config = xpcs_pre_config, .pcs_config = xpcs_config, .pcs_get_state = xpcs_get_state, From 7ea2745766d776866cfbc981b21ed3cfdf50124e Mon Sep 17 00:00:00 2001 From: Justin Lai Date: Fri, 6 Dec 2024 16:48:51 +0800 Subject: [PATCH 0253/1450] rtase: Refine the if statement Refine the if statement to improve readability. Signed-off-by: Justin Lai Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/20241206084851.760475-1-justinlai0215@realtek.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/rtase/rtase_main.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/realtek/rtase/rtase_main.c b/drivers/net/ethernet/realtek/rtase/rtase_main.c index 6106aa5333bcc..585d0b21c9e00 100644 --- a/drivers/net/ethernet/realtek/rtase/rtase_main.c +++ b/drivers/net/ethernet/realtek/rtase/rtase_main.c @@ -2018,7 +2018,7 @@ static int rtase_init_board(struct pci_dev *pdev, struct net_device **dev_out, SET_NETDEV_DEV(dev, &pdev->dev); ret = pci_enable_device(pdev); - if (ret < 0) + if (ret) goto err_out_free_dev; /* make sure PCI base addr 1 is MMIO */ @@ -2034,7 +2034,7 @@ static int rtase_init_board(struct pci_dev *pdev, struct net_device **dev_out, } ret = pci_request_regions(pdev, KBUILD_MODNAME); - if (ret < 0) + if (ret) goto err_out_disable; ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); @@ -2110,7 +2110,7 @@ static int rtase_init_one(struct pci_dev *pdev, dev_dbg(&pdev->dev, "Automotive Switch Ethernet driver loaded\n"); ret = rtase_init_board(pdev, &dev, &ioaddr); - if (ret != 0) + if (ret) return ret; tp = netdev_priv(dev); @@ -2120,7 +2120,7 @@ static int rtase_init_one(struct pci_dev *pdev, /* identify chip attached to board */ ret = rtase_check_mac_version_valid(tp); - if (ret != 0) { + if (ret) { dev_err(&pdev->dev, "unknown chip version: 0x%08x, contact rtase maintainers (see MAINTAINERS file)\n", tp->hw_ver); @@ -2131,7 +2131,7 @@ static int rtase_init_one(struct pci_dev *pdev, rtase_init_hardware(tp); ret = rtase_alloc_interrupt(pdev, tp); - if (ret < 0) { + if (ret) { dev_err(&pdev->dev, "unable to alloc MSIX/MSI\n"); goto err_out_del_napi; } @@ -2176,7 +2176,7 @@ static int rtase_init_one(struct pci_dev *pdev, netif_carrier_off(dev); ret = register_netdev(dev); - if (ret != 0) + if (ret) goto err_out_free_dma; netdev_dbg(dev, "%pM, IRQ %d\n", dev->dev_addr, dev->irq); From 9d23e48654620fdccfcc74cc2cef04eaf7353d07 Mon Sep 17 00:00:00 2001 From: Cristian Ciocaltea Date: Wed, 23 Oct 2024 20:29:54 +0300 Subject: [PATCH 0254/1450] phy: rockchip: samsung-hdptx: Set drvdata before enabling runtime PM In some cases, rk_hdptx_phy_runtime_resume() may be invoked before platform_set_drvdata() is executed in ->probe(), leading to a NULL pointer dereference when using the return of dev_get_drvdata(). Ensure platform_set_drvdata() is called before devm_pm_runtime_enable(). Reported-by: Dmitry Osipenko Fixes: 553be2830c5f ("phy: rockchip: Add Samsung HDMI/eDP Combo PHY driver") Signed-off-by: Cristian Ciocaltea Reviewed-by: Heiko Stuebner Link: https://lore.kernel.org/r/20241023-phy-sam-hdptx-rpm-fix-v1-1-87f4c994e346@collabora.com Signed-off-by: Vinod Koul --- drivers/phy/rockchip/phy-rockchip-samsung-hdptx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/phy/rockchip/phy-rockchip-samsung-hdptx.c b/drivers/phy/rockchip/phy-rockchip-samsung-hdptx.c index ceab9c71d3b53..0965b9d4f9cf1 100644 --- a/drivers/phy/rockchip/phy-rockchip-samsung-hdptx.c +++ b/drivers/phy/rockchip/phy-rockchip-samsung-hdptx.c @@ -1101,6 +1101,8 @@ static int rk_hdptx_phy_probe(struct platform_device *pdev) return dev_err_probe(dev, PTR_ERR(hdptx->grf), "Could not get GRF syscon\n"); + platform_set_drvdata(pdev, hdptx); + ret = devm_pm_runtime_enable(dev); if (ret) return dev_err_probe(dev, ret, "Failed to enable runtime PM\n"); @@ -1110,7 +1112,6 @@ static int rk_hdptx_phy_probe(struct platform_device *pdev) return dev_err_probe(dev, PTR_ERR(hdptx->phy), "Failed to create HDMI PHY\n"); - platform_set_drvdata(pdev, hdptx); phy_set_drvdata(hdptx->phy, hdptx); phy_set_bus_width(hdptx->phy, 8); From 195c3d4631816f02071f0e01d2d2def51cf5067a Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Wed, 4 Dec 2024 11:26:52 +0530 Subject: [PATCH 0255/1450] octeontx2-pf: map skb data as device writeable Crypto hardware need write permission for in-place encrypt or decrypt operation on skb-data to support IPsec crypto offload. That patch uses skb_unshare to make skb data writeable for ipsec crypto offload and map skb fragment memory as device read-write. Signed-off-by: Bharat Bhushan Signed-off-by: David S. Miller --- .../ethernet/marvell/octeontx2/nic/otx2_txrx.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c index 04bc06a80e23a..3b0457e52a6a4 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "otx2_reg.h" #include "otx2_common.h" @@ -83,10 +84,17 @@ static unsigned int frag_num(unsigned int i) static dma_addr_t otx2_dma_map_skb_frag(struct otx2_nic *pfvf, struct sk_buff *skb, int seg, int *len) { + enum dma_data_direction dir = DMA_TO_DEVICE; const skb_frag_t *frag; struct page *page; int offset; + /* Crypto hardware need write permission for ipsec crypto offload */ + if (unlikely(xfrm_offload(skb))) { + dir = DMA_BIDIRECTIONAL; + skb = skb_unshare(skb, GFP_ATOMIC); + } + /* First segment is always skb->data */ if (!seg) { page = virt_to_page(skb->data); @@ -98,16 +106,22 @@ static dma_addr_t otx2_dma_map_skb_frag(struct otx2_nic *pfvf, offset = skb_frag_off(frag); *len = skb_frag_size(frag); } - return otx2_dma_map_page(pfvf, page, offset, *len, DMA_TO_DEVICE); + return otx2_dma_map_page(pfvf, page, offset, *len, dir); } static void otx2_dma_unmap_skb_frags(struct otx2_nic *pfvf, struct sg_list *sg) { + enum dma_data_direction dir = DMA_TO_DEVICE; + struct sk_buff *skb = NULL; int seg; + skb = (struct sk_buff *)sg->skb; + if (unlikely(xfrm_offload(skb))) + dir = DMA_BIDIRECTIONAL; + for (seg = 0; seg < sg->num_segs; seg++) { otx2_dma_unmap_page(pfvf, sg->dma_addr[seg], - sg->size[seg], DMA_TO_DEVICE); + sg->size[seg], dir); } sg->num_segs = 0; } From c460b7442a6b020c6f0a2a5f837436b1ce56e95b Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Wed, 4 Dec 2024 11:26:53 +0530 Subject: [PATCH 0256/1450] octeontx2-pf: Move skb fragment map/unmap to common code Move skb fragment map/unmap function to common file so as to reuse same for outbound IPsec crypto offload Signed-off-by: Bharat Bhushan Signed-off-by: David S. Miller --- .../marvell/octeontx2/nic/otx2_common.c | 46 +++++++++++++++++++ .../marvell/octeontx2/nic/otx2_common.h | 4 ++ .../marvell/octeontx2/nic/otx2_txrx.c | 46 ------------------- 3 files changed, 50 insertions(+), 46 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c index 523ecb798a7a2..971115a5d2cc0 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "otx2_reg.h" #include "otx2_common.h" @@ -1947,3 +1948,48 @@ EXPORT_SYMBOL(otx2_mbox_up_handler_ ## _fn_name); MBOX_UP_CGX_MESSAGES MBOX_UP_MCS_MESSAGES #undef M + +dma_addr_t otx2_dma_map_skb_frag(struct otx2_nic *pfvf, + struct sk_buff *skb, int seg, int *len) +{ + enum dma_data_direction dir = DMA_TO_DEVICE; + const skb_frag_t *frag; + struct page *page; + int offset; + + /* Crypto hardware need write permission for ipsec crypto offload */ + if (unlikely(xfrm_offload(skb))) { + dir = DMA_BIDIRECTIONAL; + skb = skb_unshare(skb, GFP_ATOMIC); + } + + /* First segment is always skb->data */ + if (!seg) { + page = virt_to_page(skb->data); + offset = offset_in_page(skb->data); + *len = skb_headlen(skb); + } else { + frag = &skb_shinfo(skb)->frags[seg - 1]; + page = skb_frag_page(frag); + offset = skb_frag_off(frag); + *len = skb_frag_size(frag); + } + return otx2_dma_map_page(pfvf, page, offset, *len, dir); +} + +void otx2_dma_unmap_skb_frags(struct otx2_nic *pfvf, struct sg_list *sg) +{ + enum dma_data_direction dir = DMA_TO_DEVICE; + struct sk_buff *skb = NULL; + int seg; + + skb = (struct sk_buff *)sg->skb; + if (unlikely(xfrm_offload(skb))) + dir = DMA_BIDIRECTIONAL; + + for (seg = 0; seg < sg->num_segs; seg++) { + otx2_dma_unmap_page(pfvf, sg->dma_addr[seg], + sg->size[seg], dir); + } + sg->num_segs = 0; +} diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h index 566848663fea7..bb78d825046de 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h @@ -1149,4 +1149,8 @@ static inline int mcam_entry_cmp(const void *a, const void *b) { return *(u16 *)a - *(u16 *)b; } + +dma_addr_t otx2_dma_map_skb_frag(struct otx2_nic *pfvf, + struct sk_buff *skb, int seg, int *len); +void otx2_dma_unmap_skb_frags(struct otx2_nic *pfvf, struct sg_list *sg); #endif /* OTX2_COMMON_H */ diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c index 3b0457e52a6a4..a49041e55c330 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c @@ -11,7 +11,6 @@ #include #include #include -#include #include "otx2_reg.h" #include "otx2_common.h" @@ -81,51 +80,6 @@ static unsigned int frag_num(unsigned int i) #endif } -static dma_addr_t otx2_dma_map_skb_frag(struct otx2_nic *pfvf, - struct sk_buff *skb, int seg, int *len) -{ - enum dma_data_direction dir = DMA_TO_DEVICE; - const skb_frag_t *frag; - struct page *page; - int offset; - - /* Crypto hardware need write permission for ipsec crypto offload */ - if (unlikely(xfrm_offload(skb))) { - dir = DMA_BIDIRECTIONAL; - skb = skb_unshare(skb, GFP_ATOMIC); - } - - /* First segment is always skb->data */ - if (!seg) { - page = virt_to_page(skb->data); - offset = offset_in_page(skb->data); - *len = skb_headlen(skb); - } else { - frag = &skb_shinfo(skb)->frags[seg - 1]; - page = skb_frag_page(frag); - offset = skb_frag_off(frag); - *len = skb_frag_size(frag); - } - return otx2_dma_map_page(pfvf, page, offset, *len, dir); -} - -static void otx2_dma_unmap_skb_frags(struct otx2_nic *pfvf, struct sg_list *sg) -{ - enum dma_data_direction dir = DMA_TO_DEVICE; - struct sk_buff *skb = NULL; - int seg; - - skb = (struct sk_buff *)sg->skb; - if (unlikely(xfrm_offload(skb))) - dir = DMA_BIDIRECTIONAL; - - for (seg = 0; seg < sg->num_segs; seg++) { - otx2_dma_unmap_page(pfvf, sg->dma_addr[seg], - sg->size[seg], dir); - } - sg->num_segs = 0; -} - static void otx2_xdp_snd_pkt_handler(struct otx2_nic *pfvf, struct otx2_snd_queue *sq, struct nix_cqe_tx_s *cqe) From a7ef63dbd5886c396aa1130d5ce42634ab1db91e Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Wed, 4 Dec 2024 11:26:54 +0530 Subject: [PATCH 0257/1450] octeontx2-af: Disable backpressure between CPT and NIX NIX can assert backpressure to CPT on the NIX<=>CPT link. Keep the backpressure disabled for now. NIX block anyways handles backpressure asserted by MAC due to PFC or flow control pkts. Signed-off-by: Bharat Bhushan Signed-off-by: David S. Miller --- .../net/ethernet/marvell/octeontx2/af/mbox.h | 4 ++ .../ethernet/marvell/octeontx2/af/rvu_nix.c | 68 ++++++++++++++++--- .../marvell/octeontx2/nic/otx2_common.c | 44 ++++++++++-- .../marvell/octeontx2/nic/otx2_common.h | 1 + .../marvell/octeontx2/nic/otx2_dcbnl.c | 3 + .../ethernet/marvell/octeontx2/nic/otx2_pf.c | 3 + 6 files changed, 106 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h index 62c07407eb941..005ca8a056c0a 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h @@ -313,6 +313,10 @@ M(NIX_BANDPROF_FREE, 0x801e, nix_bandprof_free, nix_bandprof_free_req, \ msg_rsp) \ M(NIX_BANDPROF_GET_HWINFO, 0x801f, nix_bandprof_get_hwinfo, msg_req, \ nix_bandprof_get_hwinfo_rsp) \ +M(NIX_CPT_BP_ENABLE, 0x8020, nix_cpt_bp_enable, nix_bp_cfg_req, \ + nix_bp_cfg_rsp) \ +M(NIX_CPT_BP_DISABLE, 0x8021, nix_cpt_bp_disable, nix_bp_cfg_req, \ + msg_rsp) \ M(NIX_READ_INLINE_IPSEC_CFG, 0x8023, nix_read_inline_ipsec_cfg, \ msg_req, nix_inline_ipsec_cfg) \ M(NIX_MCAST_GRP_CREATE, 0x802b, nix_mcast_grp_create, nix_mcast_grp_create_req, \ diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c index a5d1e2bddd58d..613655fcd34f4 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c @@ -569,9 +569,17 @@ void rvu_nix_flr_free_bpids(struct rvu *rvu, u16 pcifunc) mutex_unlock(&rvu->rsrc_lock); } -int rvu_mbox_handler_nix_bp_disable(struct rvu *rvu, - struct nix_bp_cfg_req *req, - struct msg_rsp *rsp) +static u16 nix_get_channel(u16 chan, bool cpt_link) +{ + /* CPT channel for a given link channel is always + * assumed to be BIT(11) set in link channel. + */ + return cpt_link ? chan | BIT(11) : chan; +} + +static int nix_bp_disable(struct rvu *rvu, + struct nix_bp_cfg_req *req, + struct msg_rsp *rsp, bool cpt_link) { u16 pcifunc = req->hdr.pcifunc; int blkaddr, pf, type, err; @@ -579,6 +587,7 @@ int rvu_mbox_handler_nix_bp_disable(struct rvu *rvu, struct rvu_pfvf *pfvf; struct nix_hw *nix_hw; struct nix_bp *bp; + u16 chan_v; u64 cfg; pf = rvu_get_pf(pcifunc); @@ -589,6 +598,9 @@ int rvu_mbox_handler_nix_bp_disable(struct rvu *rvu, if (is_sdp_pfvf(pcifunc)) type = NIX_INTF_TYPE_SDP; + if (cpt_link && !rvu->hw->cpt_links) + return 0; + pfvf = rvu_get_pfvf(rvu, pcifunc); err = nix_get_struct_ptrs(rvu, pcifunc, &nix_hw, &blkaddr); if (err) @@ -597,8 +609,9 @@ int rvu_mbox_handler_nix_bp_disable(struct rvu *rvu, bp = &nix_hw->bp; chan_base = pfvf->rx_chan_base + req->chan_base; for (chan = chan_base; chan < (chan_base + req->chan_cnt); chan++) { - cfg = rvu_read64(rvu, blkaddr, NIX_AF_RX_CHANX_CFG(chan)); - rvu_write64(rvu, blkaddr, NIX_AF_RX_CHANX_CFG(chan), + chan_v = nix_get_channel(chan, cpt_link); + cfg = rvu_read64(rvu, blkaddr, NIX_AF_RX_CHANX_CFG(chan_v)); + rvu_write64(rvu, blkaddr, NIX_AF_RX_CHANX_CFG(chan_v), cfg & ~BIT_ULL(16)); if (type == NIX_INTF_TYPE_LBK) { @@ -617,6 +630,20 @@ int rvu_mbox_handler_nix_bp_disable(struct rvu *rvu, return 0; } +int rvu_mbox_handler_nix_bp_disable(struct rvu *rvu, + struct nix_bp_cfg_req *req, + struct msg_rsp *rsp) +{ + return nix_bp_disable(rvu, req, rsp, false); +} + +int rvu_mbox_handler_nix_cpt_bp_disable(struct rvu *rvu, + struct nix_bp_cfg_req *req, + struct msg_rsp *rsp) +{ + return nix_bp_disable(rvu, req, rsp, true); +} + static int rvu_nix_get_bpid(struct rvu *rvu, struct nix_bp_cfg_req *req, int type, int chan_id) { @@ -696,15 +723,17 @@ static int rvu_nix_get_bpid(struct rvu *rvu, struct nix_bp_cfg_req *req, return bpid; } -int rvu_mbox_handler_nix_bp_enable(struct rvu *rvu, - struct nix_bp_cfg_req *req, - struct nix_bp_cfg_rsp *rsp) +static int nix_bp_enable(struct rvu *rvu, + struct nix_bp_cfg_req *req, + struct nix_bp_cfg_rsp *rsp, + bool cpt_link) { int blkaddr, pf, type, chan_id = 0; u16 pcifunc = req->hdr.pcifunc; struct rvu_pfvf *pfvf; u16 chan_base, chan; s16 bpid, bpid_base; + u16 chan_v; u64 cfg; pf = rvu_get_pf(pcifunc); @@ -717,6 +746,9 @@ int rvu_mbox_handler_nix_bp_enable(struct rvu *rvu, type != NIX_INTF_TYPE_SDP) return 0; + if (cpt_link && !rvu->hw->cpt_links) + return 0; + pfvf = rvu_get_pfvf(rvu, pcifunc); blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NIX, pcifunc); @@ -730,9 +762,11 @@ int rvu_mbox_handler_nix_bp_enable(struct rvu *rvu, return -EINVAL; } - cfg = rvu_read64(rvu, blkaddr, NIX_AF_RX_CHANX_CFG(chan)); + chan_v = nix_get_channel(chan, cpt_link); + + cfg = rvu_read64(rvu, blkaddr, NIX_AF_RX_CHANX_CFG(chan_v)); cfg &= ~GENMASK_ULL(8, 0); - rvu_write64(rvu, blkaddr, NIX_AF_RX_CHANX_CFG(chan), + rvu_write64(rvu, blkaddr, NIX_AF_RX_CHANX_CFG(chan_v), cfg | (bpid & GENMASK_ULL(8, 0)) | BIT_ULL(16)); chan_id++; bpid = rvu_nix_get_bpid(rvu, req, type, chan_id); @@ -750,6 +784,20 @@ int rvu_mbox_handler_nix_bp_enable(struct rvu *rvu, return 0; } +int rvu_mbox_handler_nix_bp_enable(struct rvu *rvu, + struct nix_bp_cfg_req *req, + struct nix_bp_cfg_rsp *rsp) +{ + return nix_bp_enable(rvu, req, rsp, false); +} + +int rvu_mbox_handler_nix_cpt_bp_enable(struct rvu *rvu, + struct nix_bp_cfg_req *req, + struct nix_bp_cfg_rsp *rsp) +{ + return nix_bp_enable(rvu, req, rsp, true); +} + static void nix_setup_lso_tso_l3(struct rvu *rvu, int blkaddr, u64 format, bool v4, u64 *fidx) { diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c index 971115a5d2cc0..4c8774899eaf5 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c @@ -17,6 +17,11 @@ #include "otx2_struct.h" #include "cn10k.h" +static bool otx2_is_pfc_enabled(struct otx2_nic *pfvf) +{ + return IS_ENABLED(CONFIG_DCB) && !!pfvf->pfc_en; +} + static void otx2_nix_rq_op_stats(struct queue_stats *stats, struct otx2_nic *pfvf, int qidx) { @@ -1723,18 +1728,43 @@ int otx2_nix_config_bp(struct otx2_nic *pfvf, bool enable) return -ENOMEM; req->chan_base = 0; -#ifdef CONFIG_DCB - req->chan_cnt = pfvf->pfc_en ? IEEE_8021QAZ_MAX_TCS : 1; - req->bpid_per_chan = pfvf->pfc_en ? 1 : 0; -#else - req->chan_cnt = 1; - req->bpid_per_chan = 0; -#endif + if (otx2_is_pfc_enabled(pfvf)) { + req->chan_cnt = IEEE_8021QAZ_MAX_TCS; + req->bpid_per_chan = 1; + } else { + req->chan_cnt = 1; + req->bpid_per_chan = 0; + } return otx2_sync_mbox_msg(&pfvf->mbox); } EXPORT_SYMBOL(otx2_nix_config_bp); +int otx2_nix_cpt_config_bp(struct otx2_nic *pfvf, bool enable) +{ + struct nix_bp_cfg_req *req; + + if (enable) + req = otx2_mbox_alloc_msg_nix_cpt_bp_enable(&pfvf->mbox); + else + req = otx2_mbox_alloc_msg_nix_cpt_bp_disable(&pfvf->mbox); + + if (!req) + return -ENOMEM; + + req->chan_base = 0; + if (otx2_is_pfc_enabled(pfvf)) { + req->chan_cnt = IEEE_8021QAZ_MAX_TCS; + req->bpid_per_chan = 1; + } else { + req->chan_cnt = 1; + req->bpid_per_chan = 0; + } + + return otx2_sync_mbox_msg(&pfvf->mbox); +} +EXPORT_SYMBOL(otx2_nix_cpt_config_bp); + /* Mbox message handlers */ void mbox_handler_cgx_stats(struct otx2_nic *pfvf, struct cgx_stats_rsp *rsp) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h index bb78d825046de..37a32ac0dae75 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h @@ -985,6 +985,7 @@ int otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool, int otx2_rxtx_enable(struct otx2_nic *pfvf, bool enable); void otx2_ctx_disable(struct mbox *mbox, int type, bool npa); int otx2_nix_config_bp(struct otx2_nic *pfvf, bool enable); +int otx2_nix_cpt_config_bp(struct otx2_nic *pfvf, bool enable); void otx2_cleanup_rx_cqes(struct otx2_nic *pfvf, struct otx2_cq_queue *cq, int qidx); void otx2_cleanup_tx_cqes(struct otx2_nic *pfvf, struct otx2_cq_queue *cq); int otx2_sq_init(struct otx2_nic *pfvf, u16 qidx, u16 sqb_aura); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_dcbnl.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_dcbnl.c index 294fba58b6709..f110dfa423605 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_dcbnl.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_dcbnl.c @@ -435,6 +435,9 @@ static int otx2_dcbnl_ieee_setpfc(struct net_device *dev, struct ieee_pfc *pfc) return err; } + /* Default disable backpressure on NIX-CPT */ + otx2_nix_cpt_config_bp(pfvf, false); + /* Request Per channel Bpids */ if (pfc->pfc_en) otx2_nix_config_bp(pfvf, true); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c index e310f99b17366..8ec0296dee84b 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c @@ -1551,6 +1551,9 @@ int otx2_init_hw_resources(struct otx2_nic *pf) if (err) goto err_free_npa_lf; + /* Default disable backpressure on NIX-CPT */ + otx2_nix_cpt_config_bp(pf, false); + /* Enable backpressure for CGX mapped PF/VFs */ if (!is_otx2_lbkvf(pf->pdev)) otx2_nix_config_bp(pf, true); From fe079ab05d49ffaac1e333cb38cf2c2792f7cf40 Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Wed, 4 Dec 2024 11:26:55 +0530 Subject: [PATCH 0258/1450] cn10k-ipsec: Init hardware for outbound ipsec crypto offload One crypto hardware logical function (cpt-lf) per netdev is required for outbound ipsec crypto offload. Allocate, attach and initialize one crypto hardware function when enabling outbound ipsec crypto offload. Crypto hardware function will be detached and freed on disabling outbound ipsec crypto offload. Signed-off-by: Bharat Bhushan Signed-off-by: David S. Miller --- MAINTAINERS | 1 + .../ethernet/marvell/octeontx2/nic/Makefile | 1 + .../marvell/octeontx2/nic/cn10k_ipsec.c | 415 ++++++++++++++++++ .../marvell/octeontx2/nic/cn10k_ipsec.h | 109 +++++ .../marvell/octeontx2/nic/otx2_common.h | 18 + .../ethernet/marvell/octeontx2/nic/otx2_pf.c | 14 +- .../ethernet/marvell/octeontx2/nic/otx2_vf.c | 10 +- 7 files changed, 566 insertions(+), 2 deletions(-) create mode 100644 drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c create mode 100644 drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.h diff --git a/MAINTAINERS b/MAINTAINERS index 79756f2100e00..2d75560e64acd 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13942,6 +13942,7 @@ M: Sunil Goutham M: Geetha sowjanya M: Subbaraya Sundeep M: hariprasad +M: Bharat Bhushan L: netdev@vger.kernel.org S: Supported F: drivers/net/ethernet/marvell/octeontx2/nic/ diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/Makefile b/drivers/net/ethernet/marvell/octeontx2/nic/Makefile index dbc9712668658..cb6513ab35e74 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/Makefile +++ b/drivers/net/ethernet/marvell/octeontx2/nic/Makefile @@ -15,5 +15,6 @@ rvu_rep-y := rep.o rvu_nicpf-$(CONFIG_DCB) += otx2_dcbnl.o rvu_nicpf-$(CONFIG_MACSEC) += cn10k_macsec.o +rvu_nicpf-$(CONFIG_XFRM_OFFLOAD) += cn10k_ipsec.o ccflags-y += -I$(srctree)/drivers/net/ethernet/marvell/octeontx2/af diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c new file mode 100644 index 0000000000000..e09ce42075c70 --- /dev/null +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c @@ -0,0 +1,415 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Marvell IPSEC offload driver + * + * Copyright (C) 2024 Marvell. + */ + +#include +#include +#include + +#include "otx2_common.h" +#include "cn10k_ipsec.h" + +static bool is_dev_support_ipsec_offload(struct pci_dev *pdev) +{ + return is_dev_cn10ka_b0(pdev) || is_dev_cn10kb(pdev); +} + +static bool cn10k_cpt_device_set_inuse(struct otx2_nic *pf) +{ + enum cn10k_cpt_hw_state_e state; + + while (true) { + state = atomic_cmpxchg(&pf->ipsec.cpt_state, + CN10K_CPT_HW_AVAILABLE, + CN10K_CPT_HW_IN_USE); + if (state == CN10K_CPT_HW_AVAILABLE) + return true; + if (state == CN10K_CPT_HW_UNAVAILABLE) + return false; + + mdelay(1); + } +} + +static void cn10k_cpt_device_set_available(struct otx2_nic *pf) +{ + atomic_set(&pf->ipsec.cpt_state, CN10K_CPT_HW_AVAILABLE); +} + +static void cn10k_cpt_device_set_unavailable(struct otx2_nic *pf) +{ + atomic_set(&pf->ipsec.cpt_state, CN10K_CPT_HW_UNAVAILABLE); +} + +static int cn10k_outb_cptlf_attach(struct otx2_nic *pf) +{ + struct rsrc_attach *attach; + int ret = -ENOMEM; + + mutex_lock(&pf->mbox.lock); + /* Get memory to put this msg */ + attach = otx2_mbox_alloc_msg_attach_resources(&pf->mbox); + if (!attach) + goto unlock; + + attach->cptlfs = true; + attach->modify = true; + + /* Send attach request to AF */ + ret = otx2_sync_mbox_msg(&pf->mbox); + +unlock: + mutex_unlock(&pf->mbox.lock); + return ret; +} + +static int cn10k_outb_cptlf_detach(struct otx2_nic *pf) +{ + struct rsrc_detach *detach; + int ret = -ENOMEM; + + mutex_lock(&pf->mbox.lock); + detach = otx2_mbox_alloc_msg_detach_resources(&pf->mbox); + if (!detach) + goto unlock; + + detach->partial = true; + detach->cptlfs = true; + + /* Send detach request to AF */ + ret = otx2_sync_mbox_msg(&pf->mbox); + +unlock: + mutex_unlock(&pf->mbox.lock); + return ret; +} + +static int cn10k_outb_cptlf_alloc(struct otx2_nic *pf) +{ + struct cpt_lf_alloc_req_msg *req; + int ret = -ENOMEM; + + mutex_lock(&pf->mbox.lock); + req = otx2_mbox_alloc_msg_cpt_lf_alloc(&pf->mbox); + if (!req) + goto unlock; + + /* PF function */ + req->nix_pf_func = pf->pcifunc; + /* Enable SE-IE Engine Group */ + req->eng_grpmsk = 1 << CN10K_DEF_CPT_IPSEC_EGRP; + + ret = otx2_sync_mbox_msg(&pf->mbox); + +unlock: + mutex_unlock(&pf->mbox.lock); + return ret; +} + +static void cn10k_outb_cptlf_free(struct otx2_nic *pf) +{ + mutex_lock(&pf->mbox.lock); + otx2_mbox_alloc_msg_cpt_lf_free(&pf->mbox); + otx2_sync_mbox_msg(&pf->mbox); + mutex_unlock(&pf->mbox.lock); +} + +static int cn10k_outb_cptlf_config(struct otx2_nic *pf) +{ + struct cpt_inline_ipsec_cfg_msg *req; + int ret = -ENOMEM; + + mutex_lock(&pf->mbox.lock); + req = otx2_mbox_alloc_msg_cpt_inline_ipsec_cfg(&pf->mbox); + if (!req) + goto unlock; + + req->dir = CPT_INLINE_OUTBOUND; + req->enable = 1; + req->nix_pf_func = pf->pcifunc; + ret = otx2_sync_mbox_msg(&pf->mbox); +unlock: + mutex_unlock(&pf->mbox.lock); + return ret; +} + +static void cn10k_outb_cptlf_iq_enable(struct otx2_nic *pf) +{ + u64 reg_val; + + /* Set Execution Enable of instruction queue */ + reg_val = otx2_read64(pf, CN10K_CPT_LF_INPROG); + reg_val |= BIT_ULL(16); + otx2_write64(pf, CN10K_CPT_LF_INPROG, reg_val); + + /* Set iqueue's enqueuing */ + reg_val = otx2_read64(pf, CN10K_CPT_LF_CTL); + reg_val |= BIT_ULL(0); + otx2_write64(pf, CN10K_CPT_LF_CTL, reg_val); +} + +static void cn10k_outb_cptlf_iq_disable(struct otx2_nic *pf) +{ + u32 inflight, grb_cnt, gwb_cnt; + u32 nq_ptr, dq_ptr; + int timeout = 20; + u64 reg_val; + int cnt; + + /* Disable instructions enqueuing */ + otx2_write64(pf, CN10K_CPT_LF_CTL, 0ull); + + /* Wait for instruction queue to become empty. + * CPT_LF_INPROG.INFLIGHT count is zero + */ + do { + reg_val = otx2_read64(pf, CN10K_CPT_LF_INPROG); + inflight = FIELD_GET(CPT_LF_INPROG_INFLIGHT, reg_val); + if (!inflight) + break; + + usleep_range(10000, 20000); + if (timeout-- < 0) { + netdev_err(pf->netdev, "Timeout to cleanup CPT IQ\n"); + break; + } + } while (1); + + /* Disable executions in the LF's queue, + * the queue should be empty at this point + */ + reg_val &= ~BIT_ULL(16); + otx2_write64(pf, CN10K_CPT_LF_INPROG, reg_val); + + /* Wait for instruction queue to become empty */ + cnt = 0; + do { + reg_val = otx2_read64(pf, CN10K_CPT_LF_INPROG); + if (reg_val & BIT_ULL(31)) + cnt = 0; + else + cnt++; + reg_val = otx2_read64(pf, CN10K_CPT_LF_Q_GRP_PTR); + nq_ptr = FIELD_GET(CPT_LF_Q_GRP_PTR_DQ_PTR, reg_val); + dq_ptr = FIELD_GET(CPT_LF_Q_GRP_PTR_DQ_PTR, reg_val); + } while ((cnt < 10) && (nq_ptr != dq_ptr)); + + cnt = 0; + do { + reg_val = otx2_read64(pf, CN10K_CPT_LF_INPROG); + inflight = FIELD_GET(CPT_LF_INPROG_INFLIGHT, reg_val); + grb_cnt = FIELD_GET(CPT_LF_INPROG_GRB_CNT, reg_val); + gwb_cnt = FIELD_GET(CPT_LF_INPROG_GWB_CNT, reg_val); + if (inflight == 0 && gwb_cnt < 40 && + (grb_cnt == 0 || grb_cnt == 40)) + cnt++; + else + cnt = 0; + } while (cnt < 10); +} + +/* Allocate memory for CPT outbound Instruction queue. + * Instruction queue memory format is: + * ----------------------------- + * | Instruction Group memory | + * | (CPT_LF_Q_SIZE[SIZE_DIV40] | + * | x 16 Bytes) | + * | | + * ----------------------------- <-- CPT_LF_Q_BASE[ADDR] + * | Flow Control (128 Bytes) | + * | | + * ----------------------------- + * | Instruction Memory | + * | (CPT_LF_Q_SIZE[SIZE_DIV40] | + * | × 40 × 64 bytes) | + * | | + * ----------------------------- + */ +static int cn10k_outb_cptlf_iq_alloc(struct otx2_nic *pf) +{ + struct cn10k_cpt_inst_queue *iq = &pf->ipsec.iq; + + iq->size = CN10K_CPT_INST_QLEN_BYTES + CN10K_CPT_Q_FC_LEN + + CN10K_CPT_INST_GRP_QLEN_BYTES + OTX2_ALIGN; + + iq->real_vaddr = dma_alloc_coherent(pf->dev, iq->size, + &iq->real_dma_addr, GFP_KERNEL); + if (!iq->real_vaddr) + return -ENOMEM; + + /* iq->vaddr/dma_addr points to Flow Control location */ + iq->vaddr = iq->real_vaddr + CN10K_CPT_INST_GRP_QLEN_BYTES; + iq->dma_addr = iq->real_dma_addr + CN10K_CPT_INST_GRP_QLEN_BYTES; + + /* Align pointers */ + iq->vaddr = PTR_ALIGN(iq->vaddr, OTX2_ALIGN); + iq->dma_addr = PTR_ALIGN(iq->dma_addr, OTX2_ALIGN); + return 0; +} + +static void cn10k_outb_cptlf_iq_free(struct otx2_nic *pf) +{ + struct cn10k_cpt_inst_queue *iq = &pf->ipsec.iq; + + if (iq->real_vaddr) + dma_free_coherent(pf->dev, iq->size, iq->real_vaddr, + iq->real_dma_addr); + + iq->real_vaddr = NULL; + iq->vaddr = NULL; +} + +static int cn10k_outb_cptlf_iq_init(struct otx2_nic *pf) +{ + u64 reg_val; + int ret; + + /* Allocate Memory for CPT IQ */ + ret = cn10k_outb_cptlf_iq_alloc(pf); + if (ret) + return ret; + + /* Disable IQ */ + cn10k_outb_cptlf_iq_disable(pf); + + /* Set IQ base address */ + otx2_write64(pf, CN10K_CPT_LF_Q_BASE, pf->ipsec.iq.dma_addr); + + /* Set IQ size */ + reg_val = FIELD_PREP(CPT_LF_Q_SIZE_DIV40, CN10K_CPT_SIZE_DIV40 + + CN10K_CPT_EXTRA_SIZE_DIV40); + otx2_write64(pf, CN10K_CPT_LF_Q_SIZE, reg_val); + + return 0; +} + +static int cn10k_outb_cptlf_init(struct otx2_nic *pf) +{ + int ret; + + /* Initialize CPTLF Instruction Queue (IQ) */ + ret = cn10k_outb_cptlf_iq_init(pf); + if (ret) + return ret; + + /* Configure CPTLF for outbound ipsec offload */ + ret = cn10k_outb_cptlf_config(pf); + if (ret) + goto iq_clean; + + /* Enable CPTLF IQ */ + cn10k_outb_cptlf_iq_enable(pf); + return 0; +iq_clean: + cn10k_outb_cptlf_iq_free(pf); + return ret; +} + +static int cn10k_outb_cpt_init(struct net_device *netdev) +{ + struct otx2_nic *pf = netdev_priv(netdev); + int ret; + + /* Attach a CPT LF for outbound ipsec offload */ + ret = cn10k_outb_cptlf_attach(pf); + if (ret) + return ret; + + /* Allocate a CPT LF for outbound ipsec offload */ + ret = cn10k_outb_cptlf_alloc(pf); + if (ret) + goto detach; + + /* Initialize the CPTLF for outbound ipsec offload */ + ret = cn10k_outb_cptlf_init(pf); + if (ret) + goto lf_free; + + pf->ipsec.io_addr = (__force u64)otx2_get_regaddr(pf, + CN10K_CPT_LF_NQX(0)); + + /* Set ipsec offload enabled for this device */ + pf->flags |= OTX2_FLAG_IPSEC_OFFLOAD_ENABLED; + + cn10k_cpt_device_set_available(pf); + return 0; + +lf_free: + cn10k_outb_cptlf_free(pf); +detach: + cn10k_outb_cptlf_detach(pf); + return ret; +} + +static int cn10k_outb_cpt_clean(struct otx2_nic *pf) +{ + int ret; + + if (!cn10k_cpt_device_set_inuse(pf)) { + netdev_err(pf->netdev, "CPT LF device unavailable\n"); + return -ENODEV; + } + + /* Set ipsec offload disabled for this device */ + pf->flags &= ~OTX2_FLAG_IPSEC_OFFLOAD_ENABLED; + + /* Disable CPTLF Instruction Queue (IQ) */ + cn10k_outb_cptlf_iq_disable(pf); + + /* Set IQ base address and size to 0 */ + otx2_write64(pf, CN10K_CPT_LF_Q_BASE, 0); + otx2_write64(pf, CN10K_CPT_LF_Q_SIZE, 0); + + /* Free CPTLF IQ */ + cn10k_outb_cptlf_iq_free(pf); + + /* Free and detach CPT LF */ + cn10k_outb_cptlf_free(pf); + ret = cn10k_outb_cptlf_detach(pf); + if (ret) + netdev_err(pf->netdev, "Failed to detach CPT LF\n"); + + cn10k_cpt_device_set_unavailable(pf); + return ret; +} + +int cn10k_ipsec_ethtool_init(struct net_device *netdev, bool enable) +{ + struct otx2_nic *pf = netdev_priv(netdev); + + /* IPsec offload supported on cn10k */ + if (!is_dev_support_ipsec_offload(pf->pdev)) + return -EOPNOTSUPP; + + /* Initialize CPT for outbound ipsec offload */ + if (enable) + return cn10k_outb_cpt_init(netdev); + + return cn10k_outb_cpt_clean(pf); +} + +int cn10k_ipsec_init(struct net_device *netdev) +{ + struct otx2_nic *pf = netdev_priv(netdev); + + if (!is_dev_support_ipsec_offload(pf->pdev)) + return 0; + + cn10k_cpt_device_set_unavailable(pf); + return 0; +} +EXPORT_SYMBOL(cn10k_ipsec_init); + +void cn10k_ipsec_clean(struct otx2_nic *pf) +{ + if (!is_dev_support_ipsec_offload(pf->pdev)) + return; + + if (!(pf->flags & OTX2_FLAG_IPSEC_OFFLOAD_ENABLED)) + return; + + cn10k_outb_cpt_clean(pf); +} +EXPORT_SYMBOL(cn10k_ipsec_clean); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.h b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.h new file mode 100644 index 0000000000000..f3eb5aee4b9d6 --- /dev/null +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Marvell IPSEC offload driver + * + * Copyright (C) 2024 Marvell. + */ + +#ifndef CN10K_IPSEC_H +#define CN10K_IPSEC_H + +#include + +/* CPT instruction size in bytes */ +#define CN10K_CPT_INST_SIZE 64 + +/* CPT instruction (CPT_INST_S) queue length */ +#define CN10K_CPT_INST_QLEN 8200 + +/* CPT instruction queue size passed to HW is in units of + * 40*CPT_INST_S messages. + */ +#define CN10K_CPT_SIZE_DIV40 (CN10K_CPT_INST_QLEN / 40) + +/* CPT needs 320 free entries */ +#define CN10K_CPT_INST_QLEN_EXTRA_BYTES (320 * CN10K_CPT_INST_SIZE) +#define CN10K_CPT_EXTRA_SIZE_DIV40 (320 / 40) + +/* CPT instruction queue length in bytes */ +#define CN10K_CPT_INST_QLEN_BYTES \ + ((CN10K_CPT_SIZE_DIV40 * 40 * CN10K_CPT_INST_SIZE) + \ + CN10K_CPT_INST_QLEN_EXTRA_BYTES) + +/* CPT instruction group queue length in bytes */ +#define CN10K_CPT_INST_GRP_QLEN_BYTES \ + ((CN10K_CPT_SIZE_DIV40 + CN10K_CPT_EXTRA_SIZE_DIV40) * 16) + +/* CPT FC length in bytes */ +#define CN10K_CPT_Q_FC_LEN 128 + +/* Default CPT engine group for ipsec offload */ +#define CN10K_DEF_CPT_IPSEC_EGRP 1 + +/* CN10K CPT LF registers */ +#define CPT_LFBASE (BLKTYPE_CPT << RVU_FUNC_BLKADDR_SHIFT) +#define CN10K_CPT_LF_CTL (CPT_LFBASE | 0x10) +#define CN10K_CPT_LF_INPROG (CPT_LFBASE | 0x40) +#define CN10K_CPT_LF_Q_BASE (CPT_LFBASE | 0xf0) +#define CN10K_CPT_LF_Q_SIZE (CPT_LFBASE | 0x100) +#define CN10K_CPT_LF_Q_INST_PTR (CPT_LFBASE | 0x110) +#define CN10K_CPT_LF_Q_GRP_PTR (CPT_LFBASE | 0x120) +#define CN10K_CPT_LF_NQX(a) (CPT_LFBASE | 0x400 | (a) << 3) +#define CN10K_CPT_LF_CTX_FLUSH (CPT_LFBASE | 0x510) + +struct cn10k_cpt_inst_queue { + u8 *vaddr; + u8 *real_vaddr; + dma_addr_t dma_addr; + dma_addr_t real_dma_addr; + u32 size; +}; + +enum cn10k_cpt_hw_state_e { + CN10K_CPT_HW_UNAVAILABLE, + CN10K_CPT_HW_AVAILABLE, + CN10K_CPT_HW_IN_USE +}; + +struct cn10k_ipsec { + /* Outbound CPT */ + u64 io_addr; + atomic_t cpt_state; + struct cn10k_cpt_inst_queue iq; +}; + +/* CPT LF_INPROG Register */ +#define CPT_LF_INPROG_INFLIGHT GENMASK_ULL(8, 0) +#define CPT_LF_INPROG_GRB_CNT GENMASK_ULL(39, 32) +#define CPT_LF_INPROG_GWB_CNT GENMASK_ULL(47, 40) + +/* CPT LF_Q_GRP_PTR Register */ +#define CPT_LF_Q_GRP_PTR_DQ_PTR GENMASK_ULL(14, 0) +#define CPT_LF_Q_GRP_PTR_NQ_PTR GENMASK_ULL(46, 32) + +/* CPT LF_Q_SIZE Register */ +#define CPT_LF_Q_BASE_ADDR GENMASK_ULL(52, 7) + +/* CPT LF_Q_SIZE Register */ +#define CPT_LF_Q_SIZE_DIV40 GENMASK_ULL(14, 0) + +#ifdef CONFIG_XFRM_OFFLOAD +int cn10k_ipsec_init(struct net_device *netdev); +void cn10k_ipsec_clean(struct otx2_nic *pf); +int cn10k_ipsec_ethtool_init(struct net_device *netdev, bool enable); +#else +static inline __maybe_unused int cn10k_ipsec_init(struct net_device *netdev) +{ + return 0; +} + +static inline __maybe_unused void cn10k_ipsec_clean(struct otx2_nic *pf) +{ +} + +static inline __maybe_unused +int cn10k_ipsec_ethtool_init(struct net_device *netdev, bool enable) +{ + return 0; +} +#endif +#endif // CN10K_IPSEC_H diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h index 37a32ac0dae75..5e2da67d58bb4 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h @@ -30,6 +30,7 @@ #include #include "qos.h" #include "rep.h" +#include "cn10k_ipsec.h" /* IPv4 flag more fragment bit */ #define IPV4_FLAG_MORE 0x20 @@ -40,6 +41,7 @@ #define PCI_DEVID_OCTEONTX2_RVU_AFVF 0xA0F8 #define PCI_SUBSYS_DEVID_96XX_RVU_PFVF 0xB200 +#define PCI_SUBSYS_DEVID_CN10K_A_RVU_PFVF 0xB900 #define PCI_SUBSYS_DEVID_CN10K_B_RVU_PFVF 0xBD00 #define PCI_DEVID_OCTEONTX2_SDP_REP 0xA0F7 @@ -448,6 +450,7 @@ struct otx2_nic { #define OTX2_FLAG_TC_MARK_ENABLED BIT_ULL(17) #define OTX2_FLAG_REP_MODE_ENABLED BIT_ULL(18) #define OTX2_FLAG_PORT_UP BIT_ULL(19) +#define OTX2_FLAG_IPSEC_OFFLOAD_ENABLED BIT_ULL(20) u64 flags; u64 *cq_op_addr; @@ -522,6 +525,9 @@ struct otx2_nic { u16 rep_pf_map[RVU_MAX_REP]; u16 esw_mode; #endif + + /* Inline ipsec */ + struct cn10k_ipsec ipsec; }; static inline bool is_otx2_lbkvf(struct pci_dev *pdev) @@ -572,6 +578,15 @@ static inline bool is_dev_cn10kb(struct pci_dev *pdev) return pdev->subsystem_device == PCI_SUBSYS_DEVID_CN10K_B_RVU_PFVF; } +static inline bool is_dev_cn10ka_b0(struct pci_dev *pdev) +{ + if (pdev->subsystem_device == PCI_SUBSYS_DEVID_CN10K_A_RVU_PFVF && + (pdev->revision & 0xFF) == 0x54) + return true; + + return false; +} + static inline void otx2_setup_dev_hw_settings(struct otx2_nic *pfvf) { struct otx2_hw *hw = &pfvf->hw; @@ -621,6 +636,9 @@ static inline void __iomem *otx2_get_regaddr(struct otx2_nic *nic, u64 offset) case BLKTYPE_NPA: blkaddr = BLKADDR_NPA; break; + case BLKTYPE_CPT: + blkaddr = BLKADDR_CPT0; + break; default: blkaddr = BLKADDR_RVUM; break; diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c index 8ec0296dee84b..2f652035d8547 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c @@ -26,6 +26,7 @@ #include "cn10k.h" #include "qos.h" #include +#include "cn10k_ipsec.h" #define DRV_NAME "rvu_nicpf" #define DRV_STRING "Marvell RVU NIC Physical Function Driver" @@ -2276,6 +2277,10 @@ static int otx2_set_features(struct net_device *netdev, return otx2_enable_rxvlan(pf, features & NETIF_F_HW_VLAN_CTAG_RX); + if (changed & NETIF_F_HW_ESP) + return cn10k_ipsec_ethtool_init(netdev, + features & NETIF_F_HW_ESP); + return otx2_handle_ntuple_tc_features(netdev, features); } @@ -3165,10 +3170,14 @@ static int otx2_probe(struct pci_dev *pdev, const struct pci_device_id *id) /* reset CGX/RPM MAC stats */ otx2_reset_mac_stats(pf); + err = cn10k_ipsec_init(netdev); + if (err) + goto err_mcs_free; + err = register_netdev(netdev); if (err) { dev_err(dev, "Failed to register netdevice\n"); - goto err_mcs_free; + goto err_ipsec_clean; } err = otx2_wq_init(pf); @@ -3209,6 +3218,8 @@ static int otx2_probe(struct pci_dev *pdev, const struct pci_device_id *id) otx2_mcam_flow_del(pf); err_unreg_netdev: unregister_netdev(netdev); +err_ipsec_clean: + cn10k_ipsec_clean(pf); err_mcs_free: cn10k_mcs_free(pf); err_del_mcam_entries: @@ -3406,6 +3417,7 @@ static void otx2_remove(struct pci_dev *pdev) otx2_unregister_dl(pf); unregister_netdev(netdev); + cn10k_ipsec_clean(pf); cn10k_mcs_free(pf); otx2_sriov_disable(pf->pdev); otx2_sriov_vfcfg_cleanup(pf); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c index 839fc77c11b23..e926c6ce96cff 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c @@ -14,6 +14,7 @@ #include "otx2_reg.h" #include "otx2_ptp.h" #include "cn10k.h" +#include "cn10k_ipsec.h" #define DRV_NAME "rvu_nicvf" #define DRV_STRING "Marvell RVU NIC Virtual Function Driver" @@ -693,10 +694,14 @@ static int otx2vf_probe(struct pci_dev *pdev, const struct pci_device_id *id) pdev->bus->number, n); } + err = cn10k_ipsec_init(netdev); + if (err) + goto err_ptp_destroy; + err = register_netdev(netdev); if (err) { dev_err(dev, "Failed to register netdevice\n"); - goto err_ptp_destroy; + goto err_ipsec_clean; } err = otx2_vf_wq_init(vf); @@ -730,6 +735,8 @@ static int otx2vf_probe(struct pci_dev *pdev, const struct pci_device_id *id) otx2_shutdown_tc(vf); err_unreg_netdev: unregister_netdev(netdev); +err_ipsec_clean: + cn10k_ipsec_clean(vf); err_ptp_destroy: otx2_ptp_destroy(vf); err_detach_rsrc: @@ -782,6 +789,7 @@ static void otx2vf_remove(struct pci_dev *pdev) unregister_netdev(netdev); if (vf->otx2_wq) destroy_workqueue(vf->otx2_wq); + cn10k_ipsec_clean(vf); otx2_ptp_destroy(vf); otx2_mcam_flow_del(vf); otx2_shutdown_tc(vf); From c45211c2369734d1b03c75165988878d16867040 Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Wed, 4 Dec 2024 11:26:56 +0530 Subject: [PATCH 0259/1450] cn10k-ipsec: Add SA add/del support for outb ipsec crypto offload This patch adds support to add and delete Security Association (SA) xfrm ops. Hardware maintains SA context in memory allocated by software. Each SA context is 128 byte aligned and size of each context is multiple of 128-byte. Add support for transport and tunnel ipsec mode, ESP protocol, aead aes-gcm-icv16, key size 128/192/256-bits with 32bit salt. Signed-off-by: Bharat Bhushan Signed-off-by: David S. Miller --- .../marvell/octeontx2/nic/cn10k_ipsec.c | 409 ++++++++++++++++++ .../marvell/octeontx2/nic/cn10k_ipsec.h | 114 +++++ 2 files changed, 523 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c index e09ce42075c70..106a241625dcb 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c @@ -375,6 +375,385 @@ static int cn10k_outb_cpt_clean(struct otx2_nic *pf) return ret; } +static void cn10k_cpt_inst_flush(struct otx2_nic *pf, struct cpt_inst_s *inst, + u64 size) +{ + struct otx2_lmt_info *lmt_info; + u64 val = 0, tar_addr = 0; + + lmt_info = per_cpu_ptr(pf->hw.lmt_info, smp_processor_id()); + /* FIXME: val[0:10] LMT_ID. + * [12:15] no of LMTST - 1 in the burst. + * [19:63] data size of each LMTST in the burst except first. + */ + val = (lmt_info->lmt_id & 0x7FF); + /* Target address for LMTST flush tells HW how many 128bit + * words are present. + * tar_addr[6:4] size of first LMTST - 1 in units of 128b. + */ + tar_addr |= pf->ipsec.io_addr | (((size / 16) - 1) & 0x7) << 4; + dma_wmb(); + memcpy((u64 *)lmt_info->lmt_addr, inst, size); + cn10k_lmt_flush(val, tar_addr); +} + +static int cn10k_wait_for_cpt_respose(struct otx2_nic *pf, + struct cpt_res_s *res) +{ + unsigned long timeout = jiffies + msecs_to_jiffies(100); + u64 *completion_ptr = (u64 *)res; + + do { + if (time_after(jiffies, timeout)) { + netdev_err(pf->netdev, "CPT response timeout\n"); + return -EBUSY; + } + } while ((READ_ONCE(*completion_ptr) & CN10K_CPT_COMP_E_MASK) == + CN10K_CPT_COMP_E_NOTDONE); + + if (!(res->compcode == CN10K_CPT_COMP_E_GOOD || + res->compcode == CN10K_CPT_COMP_E_WARN) || res->uc_compcode) { + netdev_err(pf->netdev, "compcode=%x doneint=%x\n", + res->compcode, res->doneint); + netdev_err(pf->netdev, "uc_compcode=%x uc_info=%llx esn=%llx\n", + res->uc_compcode, (u64)res->uc_info, res->esn); + } + return 0; +} + +static int cn10k_outb_write_sa(struct otx2_nic *pf, struct qmem *sa_info) +{ + dma_addr_t res_iova, dptr_iova, sa_iova; + struct cn10k_tx_sa_s *sa_dptr; + struct cpt_inst_s inst = {}; + struct cpt_res_s *res; + u32 sa_size, off; + u64 *sptr, *dptr; + u64 reg_val; + int ret; + + sa_iova = sa_info->iova; + if (!sa_iova) + return -EINVAL; + + res = dma_alloc_coherent(pf->dev, sizeof(struct cpt_res_s), + &res_iova, GFP_ATOMIC); + if (!res) + return -ENOMEM; + + sa_size = sizeof(struct cn10k_tx_sa_s); + sa_dptr = dma_alloc_coherent(pf->dev, sa_size, &dptr_iova, GFP_ATOMIC); + if (!sa_dptr) { + dma_free_coherent(pf->dev, sizeof(struct cpt_res_s), res, + res_iova); + return -ENOMEM; + } + + sptr = (__force u64 *)sa_info->base; + dptr = (__force u64 *)sa_dptr; + for (off = 0; off < (sa_size / 8); off++) + *(dptr + off) = (__force u64)cpu_to_be64(*(sptr + off)); + + res->compcode = CN10K_CPT_COMP_E_NOTDONE; + inst.res_addr = res_iova; + inst.dptr = (u64)dptr_iova; + inst.param2 = sa_size >> 3; + inst.dlen = sa_size; + inst.opcode_major = CN10K_IPSEC_MAJOR_OP_WRITE_SA; + inst.opcode_minor = CN10K_IPSEC_MINOR_OP_WRITE_SA; + inst.cptr = sa_iova; + inst.ctx_val = 1; + inst.egrp = CN10K_DEF_CPT_IPSEC_EGRP; + + /* Check if CPT-LF available */ + if (!cn10k_cpt_device_set_inuse(pf)) { + ret = -ENODEV; + goto free_mem; + } + + cn10k_cpt_inst_flush(pf, &inst, sizeof(struct cpt_inst_s)); + dma_wmb(); + ret = cn10k_wait_for_cpt_respose(pf, res); + if (ret) + goto set_available; + + /* Trigger CTX flush to write dirty data back to DRAM */ + reg_val = FIELD_PREP(CPT_LF_CTX_FLUSH, sa_iova >> 7); + otx2_write64(pf, CN10K_CPT_LF_CTX_FLUSH, reg_val); + +set_available: + cn10k_cpt_device_set_available(pf); +free_mem: + dma_free_coherent(pf->dev, sa_size, sa_dptr, dptr_iova); + dma_free_coherent(pf->dev, sizeof(struct cpt_res_s), res, res_iova); + return ret; +} + +static int cn10k_ipsec_get_hw_ctx_offset(void) +{ + /* Offset on Hardware-context offset in word */ + return (offsetof(struct cn10k_tx_sa_s, hw_ctx) / sizeof(u64)) & 0x7F; +} + +static int cn10k_ipsec_get_ctx_push_size(void) +{ + /* Context push size is round up and in multiple of 8 Byte */ + return (roundup(offsetof(struct cn10k_tx_sa_s, hw_ctx), 8) / 8) & 0x7F; +} + +static int cn10k_ipsec_get_aes_key_len(int key_len) +{ + /* key_len is aes key length in bytes */ + switch (key_len) { + case 16: + return CN10K_IPSEC_SA_AES_KEY_LEN_128; + case 24: + return CN10K_IPSEC_SA_AES_KEY_LEN_192; + default: + return CN10K_IPSEC_SA_AES_KEY_LEN_256; + } +} + +static void cn10k_outb_prepare_sa(struct xfrm_state *x, + struct cn10k_tx_sa_s *sa_entry) +{ + int key_len = (x->aead->alg_key_len + 7) / 8; + struct net_device *netdev = x->xso.dev; + u8 *key = x->aead->alg_key; + struct otx2_nic *pf; + u32 *tmp_salt; + u64 *tmp_key; + int idx; + + memset(sa_entry, 0, sizeof(struct cn10k_tx_sa_s)); + + /* context size, 128 Byte aligned up */ + pf = netdev_priv(netdev); + sa_entry->ctx_size = (pf->ipsec.sa_size / OTX2_ALIGN) & 0xF; + sa_entry->hw_ctx_off = cn10k_ipsec_get_hw_ctx_offset(); + sa_entry->ctx_push_size = cn10k_ipsec_get_ctx_push_size(); + + /* Ucode to skip two words of CPT_CTX_HW_S */ + sa_entry->ctx_hdr_size = 1; + + /* Allow Atomic operation (AOP) */ + sa_entry->aop_valid = 1; + + /* Outbound, ESP TRANSPORT/TUNNEL Mode, AES-GCM with */ + sa_entry->sa_dir = CN10K_IPSEC_SA_DIR_OUTB; + sa_entry->ipsec_protocol = CN10K_IPSEC_SA_IPSEC_PROTO_ESP; + sa_entry->enc_type = CN10K_IPSEC_SA_ENCAP_TYPE_AES_GCM; + sa_entry->iv_src = CN10K_IPSEC_SA_IV_SRC_PACKET; + if (x->props.mode == XFRM_MODE_TUNNEL) + sa_entry->ipsec_mode = CN10K_IPSEC_SA_IPSEC_MODE_TUNNEL; + else + sa_entry->ipsec_mode = CN10K_IPSEC_SA_IPSEC_MODE_TRANSPORT; + + /* Last 4 bytes are salt */ + key_len -= 4; + sa_entry->aes_key_len = cn10k_ipsec_get_aes_key_len(key_len); + memcpy(sa_entry->cipher_key, key, key_len); + tmp_key = (u64 *)sa_entry->cipher_key; + + for (idx = 0; idx < key_len / 8; idx++) + tmp_key[idx] = (__force u64)cpu_to_be64(tmp_key[idx]); + + memcpy(&sa_entry->iv_gcm_salt, key + key_len, 4); + tmp_salt = (u32 *)&sa_entry->iv_gcm_salt; + *tmp_salt = (__force u32)cpu_to_be32(*tmp_salt); + + /* Write SA context data to memory before enabling */ + wmb(); + + /* Enable SA */ + sa_entry->sa_valid = 1; +} + +static int cn10k_ipsec_validate_state(struct xfrm_state *x, + struct netlink_ext_ack *extack) +{ + if (x->props.aalgo != SADB_AALG_NONE) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot offload authenticated xfrm states"); + return -EINVAL; + } + if (x->props.ealgo != SADB_X_EALG_AES_GCM_ICV16) { + NL_SET_ERR_MSG_MOD(extack, + "Only AES-GCM-ICV16 xfrm state may be offloaded"); + return -EINVAL; + } + if (x->props.calgo != SADB_X_CALG_NONE) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot offload compressed xfrm states"); + return -EINVAL; + } + if (x->props.flags & XFRM_STATE_ESN) { + NL_SET_ERR_MSG_MOD(extack, "Cannot offload ESN xfrm states"); + return -EINVAL; + } + if (x->props.family != AF_INET && x->props.family != AF_INET6) { + NL_SET_ERR_MSG_MOD(extack, + "Only IPv4/v6 xfrm states may be offloaded"); + return -EINVAL; + } + if (x->xso.type != XFRM_DEV_OFFLOAD_CRYPTO) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot offload other than crypto-mode"); + return -EINVAL; + } + if (x->props.mode != XFRM_MODE_TRANSPORT && + x->props.mode != XFRM_MODE_TUNNEL) { + NL_SET_ERR_MSG_MOD(extack, + "Only tunnel/transport xfrm states may be offloaded"); + return -EINVAL; + } + if (x->id.proto != IPPROTO_ESP) { + NL_SET_ERR_MSG_MOD(extack, + "Only ESP xfrm state may be offloaded"); + return -EINVAL; + } + if (x->encap) { + NL_SET_ERR_MSG_MOD(extack, + "Encapsulated xfrm state may not be offloaded"); + return -EINVAL; + } + if (!x->aead) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot offload xfrm states without aead"); + return -EINVAL; + } + + if (x->aead->alg_icv_len != 128) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot offload xfrm states with AEAD ICV length other than 128bit"); + return -EINVAL; + } + if (x->aead->alg_key_len != 128 + 32 && + x->aead->alg_key_len != 192 + 32 && + x->aead->alg_key_len != 256 + 32) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot offload xfrm states with AEAD key length other than 128/192/256bit"); + return -EINVAL; + } + if (x->tfcpad) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot offload xfrm states with tfc padding"); + return -EINVAL; + } + if (!x->geniv) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot offload xfrm states without geniv"); + return -EINVAL; + } + if (strcmp(x->geniv, "seqiv")) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot offload xfrm states with geniv other than seqiv"); + return -EINVAL; + } + return 0; +} + +static int cn10k_ipsec_inb_add_state(struct xfrm_state *x, + struct netlink_ext_ack *extack) +{ + NL_SET_ERR_MSG_MOD(extack, "xfrm inbound offload not supported"); + return -EOPNOTSUPP; +} + +static int cn10k_ipsec_outb_add_state(struct xfrm_state *x, + struct netlink_ext_ack *extack) +{ + struct net_device *netdev = x->xso.dev; + struct cn10k_tx_sa_s *sa_entry; + struct qmem *sa_info; + struct otx2_nic *pf; + int err; + + err = cn10k_ipsec_validate_state(x, extack); + if (err) + return err; + + pf = netdev_priv(netdev); + + err = qmem_alloc(pf->dev, &sa_info, pf->ipsec.sa_size, OTX2_ALIGN); + if (err) + return err; + + sa_entry = (struct cn10k_tx_sa_s *)sa_info->base; + cn10k_outb_prepare_sa(x, sa_entry); + + err = cn10k_outb_write_sa(pf, sa_info); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Error writing outbound SA"); + qmem_free(pf->dev, sa_info); + return err; + } + + x->xso.offload_handle = (unsigned long)sa_info; + pf->ipsec.outb_sa_count++; + return 0; +} + +static int cn10k_ipsec_add_state(struct xfrm_state *x, + struct netlink_ext_ack *extack) +{ + if (x->xso.dir == XFRM_DEV_OFFLOAD_IN) + return cn10k_ipsec_inb_add_state(x, extack); + else + return cn10k_ipsec_outb_add_state(x, extack); +} + +static void cn10k_ipsec_del_state(struct xfrm_state *x) +{ + struct net_device *netdev = x->xso.dev; + struct cn10k_tx_sa_s *sa_entry; + struct qmem *sa_info; + struct otx2_nic *pf; + int err; + + if (x->xso.dir == XFRM_DEV_OFFLOAD_IN) + return; + + pf = netdev_priv(netdev); + + sa_info = (struct qmem *)x->xso.offload_handle; + sa_entry = (struct cn10k_tx_sa_s *)sa_info->base; + memset(sa_entry, 0, sizeof(struct cn10k_tx_sa_s)); + /* Disable SA in CPT h/w */ + sa_entry->ctx_push_size = cn10k_ipsec_get_ctx_push_size(); + sa_entry->ctx_size = (pf->ipsec.sa_size / OTX2_ALIGN) & 0xF; + sa_entry->aop_valid = 1; + + err = cn10k_outb_write_sa(pf, sa_info); + if (err) + netdev_err(netdev, "Error (%d) deleting SA\n", err); + + x->xso.offload_handle = 0; + qmem_free(pf->dev, sa_info); + + /* If no more SA's then update netdev feature for potential change + * in NETIF_F_HW_ESP. + */ + if (!--pf->ipsec.outb_sa_count) + queue_work(pf->ipsec.sa_workq, &pf->ipsec.sa_work); +} + +static const struct xfrmdev_ops cn10k_ipsec_xfrmdev_ops = { + .xdo_dev_state_add = cn10k_ipsec_add_state, + .xdo_dev_state_delete = cn10k_ipsec_del_state, +}; + +static void cn10k_ipsec_sa_wq_handler(struct work_struct *work) +{ + struct cn10k_ipsec *ipsec = container_of(work, struct cn10k_ipsec, + sa_work); + struct otx2_nic *pf = container_of(ipsec, struct otx2_nic, ipsec); + + rtnl_lock(); + netdev_update_features(pf->netdev); + rtnl_unlock(); +} + int cn10k_ipsec_ethtool_init(struct net_device *netdev, bool enable) { struct otx2_nic *pf = netdev_priv(netdev); @@ -387,16 +766,41 @@ int cn10k_ipsec_ethtool_init(struct net_device *netdev, bool enable) if (enable) return cn10k_outb_cpt_init(netdev); + /* Don't do CPT cleanup if SA installed */ + if (pf->ipsec.outb_sa_count) { + netdev_err(pf->netdev, "SA installed on this device\n"); + return -EBUSY; + } + return cn10k_outb_cpt_clean(pf); } int cn10k_ipsec_init(struct net_device *netdev) { struct otx2_nic *pf = netdev_priv(netdev); + u32 sa_size; if (!is_dev_support_ipsec_offload(pf->pdev)) return 0; + /* Each SA entry size is 128 Byte round up in size */ + sa_size = sizeof(struct cn10k_tx_sa_s) % OTX2_ALIGN ? + (sizeof(struct cn10k_tx_sa_s) / OTX2_ALIGN + 1) * + OTX2_ALIGN : sizeof(struct cn10k_tx_sa_s); + pf->ipsec.sa_size = sa_size; + + INIT_WORK(&pf->ipsec.sa_work, cn10k_ipsec_sa_wq_handler); + pf->ipsec.sa_workq = alloc_workqueue("cn10k_ipsec_sa_workq", 0, 0); + if (!pf->ipsec.sa_workq) { + netdev_err(pf->netdev, "SA alloc workqueue failed\n"); + return -ENOMEM; + } + + /* Set xfrm device ops + * NETIF_F_HW_ESP is not set as ipsec setup is not yet complete. + */ + netdev->xfrmdev_ops = &cn10k_ipsec_xfrmdev_ops; + cn10k_cpt_device_set_unavailable(pf); return 0; } @@ -410,6 +814,11 @@ void cn10k_ipsec_clean(struct otx2_nic *pf) if (!(pf->flags & OTX2_FLAG_IPSEC_OFFLOAD_ENABLED)) return; + if (pf->ipsec.sa_workq) { + destroy_workqueue(pf->ipsec.sa_workq); + pf->ipsec.sa_workq = NULL; + } + cn10k_outb_cpt_clean(pf); } EXPORT_SYMBOL(cn10k_ipsec_clean); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.h b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.h index f3eb5aee4b9d6..5ac4de4ae974b 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.h @@ -50,6 +50,20 @@ #define CN10K_CPT_LF_NQX(a) (CPT_LFBASE | 0x400 | (a) << 3) #define CN10K_CPT_LF_CTX_FLUSH (CPT_LFBASE | 0x510) +/* IPSEC Instruction opcodes */ +#define CN10K_IPSEC_MAJOR_OP_WRITE_SA 0x01UL +#define CN10K_IPSEC_MINOR_OP_WRITE_SA 0x09UL + +enum cn10k_cpt_comp_e { + CN10K_CPT_COMP_E_NOTDONE = 0x00, + CN10K_CPT_COMP_E_GOOD = 0x01, + CN10K_CPT_COMP_E_FAULT = 0x02, + CN10K_CPT_COMP_E_HWERR = 0x04, + CN10K_CPT_COMP_E_INSTERR = 0x05, + CN10K_CPT_COMP_E_WARN = 0x06, + CN10K_CPT_COMP_E_MASK = 0x3F +}; + struct cn10k_cpt_inst_queue { u8 *vaddr; u8 *real_vaddr; @@ -69,6 +83,103 @@ struct cn10k_ipsec { u64 io_addr; atomic_t cpt_state; struct cn10k_cpt_inst_queue iq; + + /* SA info */ + u32 sa_size; + u32 outb_sa_count; + struct work_struct sa_work; + struct workqueue_struct *sa_workq; +}; + +/* CN10K IPSEC Security Association (SA) */ +/* SA direction */ +#define CN10K_IPSEC_SA_DIR_INB 0 +#define CN10K_IPSEC_SA_DIR_OUTB 1 +/* SA protocol */ +#define CN10K_IPSEC_SA_IPSEC_PROTO_AH 0 +#define CN10K_IPSEC_SA_IPSEC_PROTO_ESP 1 +/* SA Encryption Type */ +#define CN10K_IPSEC_SA_ENCAP_TYPE_AES_GCM 5 +/* SA IPSEC mode Transport/Tunnel */ +#define CN10K_IPSEC_SA_IPSEC_MODE_TRANSPORT 0 +#define CN10K_IPSEC_SA_IPSEC_MODE_TUNNEL 1 +/* SA AES Key Length */ +#define CN10K_IPSEC_SA_AES_KEY_LEN_128 1 +#define CN10K_IPSEC_SA_AES_KEY_LEN_192 2 +#define CN10K_IPSEC_SA_AES_KEY_LEN_256 3 +/* IV Source */ +#define CN10K_IPSEC_SA_IV_SRC_COUNTER 0 +#define CN10K_IPSEC_SA_IV_SRC_PACKET 3 + +struct cn10k_tx_sa_s { + u64 esn_en : 1; /* W0 */ + u64 rsvd_w0_1_8 : 8; + u64 hw_ctx_off : 7; + u64 ctx_id : 16; + u64 rsvd_w0_32_47 : 16; + u64 ctx_push_size : 7; + u64 rsvd_w0_55 : 1; + u64 ctx_hdr_size : 2; + u64 aop_valid : 1; + u64 rsvd_w0_59 : 1; + u64 ctx_size : 4; + u64 w1; /* W1 */ + u64 sa_valid : 1; /* W2 */ + u64 sa_dir : 1; + u64 rsvd_w2_2_3 : 2; + u64 ipsec_mode : 1; + u64 ipsec_protocol : 1; + u64 aes_key_len : 2; + u64 enc_type : 3; + u64 rsvd_w2_11_19 : 9; + u64 iv_src : 2; + u64 rsvd_w2_22_31 : 10; + u64 rsvd_w2_32_63 : 32; + u64 w3; /* W3 */ + u8 cipher_key[32]; /* W4 - W7 */ + u32 rsvd_w8_0_31; /* W8 : IV */ + u32 iv_gcm_salt; + u64 rsvd_w9_w30[22]; /* W9 - W30 */ + u64 hw_ctx[6]; /* W31 - W36 */ +}; + +/* CPT Instruction Structure */ +struct cpt_inst_s { + u64 nixtxl : 3; /* W0 */ + u64 doneint : 1; + u64 rsvd_w0_4_15 : 12; + u64 dat_offset : 8; + u64 ext_param1 : 8; + u64 nixtx_offset : 20; + u64 rsvd_w0_52_63 : 12; + u64 res_addr; /* W1 */ + u64 tag : 32; /* W2 */ + u64 tt : 2; + u64 grp : 10; + u64 rsvd_w2_44_47 : 4; + u64 rvu_pf_func : 16; + u64 qord : 1; /* W3 */ + u64 rsvd_w3_1_2 : 2; + u64 wqe_ptr : 61; + u64 dlen : 16; /* W4 */ + u64 param2 : 16; + u64 param1 : 16; + u64 opcode_major : 8; + u64 opcode_minor : 8; + u64 dptr; /* W5 */ + u64 rptr; /* W6 */ + u64 cptr : 60; /* W7 */ + u64 ctx_val : 1; + u64 egrp : 3; +}; + +/* CPT Instruction Result Structure */ +struct cpt_res_s { + u64 compcode : 7; /* W0 */ + u64 doneint : 1; + u64 uc_compcode : 8; + u64 uc_info : 48; + u64 esn; /* W1 */ }; /* CPT LF_INPROG Register */ @@ -86,6 +197,9 @@ struct cn10k_ipsec { /* CPT LF_Q_SIZE Register */ #define CPT_LF_Q_SIZE_DIV40 GENMASK_ULL(14, 0) +/* CPT LF CTX Flush Register */ +#define CPT_LF_CTX_FLUSH GENMASK_ULL(45, 0) + #ifdef CONFIG_XFRM_OFFLOAD int cn10k_ipsec_init(struct net_device *netdev); void cn10k_ipsec_clean(struct otx2_nic *pf); From 6a77a158848a8c68930df27b8840660db8531222 Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Wed, 4 Dec 2024 11:26:57 +0530 Subject: [PATCH 0260/1450] cn10k-ipsec: Process outbound ipsec crypto offload Prepare and submit crypto hardware (CPT) instruction for outbound ipsec crypto offload. The CPT instruction have authentication offset, IV offset and encapsulation offset in input packet. Also provide SA context pointer which have details about algo, keys, salt etc. Crypto hardware encrypt, authenticate and provide the ESP packet to networking hardware. Signed-off-by: Bharat Bhushan Signed-off-by: David S. Miller --- .../marvell/octeontx2/nic/cn10k_ipsec.c | 219 ++++++++++++++++++ .../marvell/octeontx2/nic/cn10k_ipsec.h | 42 ++++ .../marvell/octeontx2/nic/otx2_common.c | 23 ++ .../marvell/octeontx2/nic/otx2_common.h | 3 + .../ethernet/marvell/octeontx2/nic/otx2_pf.c | 2 + .../marvell/octeontx2/nic/otx2_txrx.c | 32 ++- .../marvell/octeontx2/nic/otx2_txrx.h | 3 + 7 files changed, 321 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c index 106a241625dcb..9a9b06f4c2cc3 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c @@ -7,10 +7,15 @@ #include #include #include +#include +#include #include "otx2_common.h" +#include "otx2_struct.h" #include "cn10k_ipsec.h" +DEFINE_STATIC_KEY_FALSE(cn10k_ipsec_sa_enabled); + static bool is_dev_support_ipsec_offload(struct pci_dev *pdev) { return is_dev_cn10ka_b0(pdev) || is_dev_cn10kb(pdev); @@ -690,6 +695,9 @@ static int cn10k_ipsec_outb_add_state(struct xfrm_state *x, } x->xso.offload_handle = (unsigned long)sa_info; + /* Enable static branch when first SA setup */ + if (!pf->ipsec.outb_sa_count) + static_branch_enable(&cn10k_ipsec_sa_enabled); pf->ipsec.outb_sa_count++; return 0; } @@ -749,6 +757,8 @@ static void cn10k_ipsec_sa_wq_handler(struct work_struct *work) sa_work); struct otx2_nic *pf = container_of(ipsec, struct otx2_nic, ipsec); + /* Disable static branch when no more SA enabled */ + static_branch_disable(&cn10k_ipsec_sa_enabled); rtnl_lock(); netdev_update_features(pf->netdev); rtnl_unlock(); @@ -822,3 +832,212 @@ void cn10k_ipsec_clean(struct otx2_nic *pf) cn10k_outb_cpt_clean(pf); } EXPORT_SYMBOL(cn10k_ipsec_clean); + +static u16 cn10k_ipsec_get_ip_data_len(struct xfrm_state *x, + struct sk_buff *skb) +{ + struct ipv6hdr *ipv6h; + struct iphdr *iph; + u8 *src; + + src = (u8 *)skb->data + ETH_HLEN; + + if (x->props.family == AF_INET) { + iph = (struct iphdr *)src; + return ntohs(iph->tot_len); + } + + ipv6h = (struct ipv6hdr *)src; + return ntohs(ipv6h->payload_len) + sizeof(struct ipv6hdr); +} + +/* Prepare CPT and NIX SQE scatter/gather subdescriptor structure. + * SG of NIX and CPT are same in size. + * Layout of a NIX SQE and CPT SG entry: + * ----------------------------- + * | CPT Scatter Gather | + * | (SQE SIZE) | + * | | + * ----------------------------- + * | NIX SQE | + * | (SQE SIZE) | + * | | + * ----------------------------- + */ +bool otx2_sqe_add_sg_ipsec(struct otx2_nic *pfvf, struct otx2_snd_queue *sq, + struct sk_buff *skb, int num_segs, int *offset) +{ + struct cpt_sg_s *cpt_sg = NULL; + struct nix_sqe_sg_s *sg = NULL; + u64 dma_addr, *iova = NULL; + u64 *cpt_iova = NULL; + u16 *sg_lens = NULL; + int seg, len; + + sq->sg[sq->head].num_segs = 0; + cpt_sg = (struct cpt_sg_s *)(sq->sqe_base - sq->sqe_size); + + for (seg = 0; seg < num_segs; seg++) { + if ((seg % MAX_SEGS_PER_SG) == 0) { + sg = (struct nix_sqe_sg_s *)(sq->sqe_base + *offset); + sg->ld_type = NIX_SEND_LDTYPE_LDD; + sg->subdc = NIX_SUBDC_SG; + sg->segs = 0; + sg_lens = (void *)sg; + iova = (void *)sg + sizeof(*sg); + /* Next subdc always starts at a 16byte boundary. + * So if sg->segs is whether 2 or 3, offset += 16bytes. + */ + if ((num_segs - seg) >= (MAX_SEGS_PER_SG - 1)) + *offset += sizeof(*sg) + (3 * sizeof(u64)); + else + *offset += sizeof(*sg) + sizeof(u64); + + cpt_sg += (seg / MAX_SEGS_PER_SG) * 4; + cpt_iova = (void *)cpt_sg + sizeof(*cpt_sg); + } + dma_addr = otx2_dma_map_skb_frag(pfvf, skb, seg, &len); + if (dma_mapping_error(pfvf->dev, dma_addr)) + return false; + + sg_lens[seg % MAX_SEGS_PER_SG] = len; + sg->segs++; + *iova++ = dma_addr; + *cpt_iova++ = dma_addr; + + /* Save DMA mapping info for later unmapping */ + sq->sg[sq->head].dma_addr[seg] = dma_addr; + sq->sg[sq->head].size[seg] = len; + sq->sg[sq->head].num_segs++; + + *cpt_sg = *(struct cpt_sg_s *)sg; + cpt_sg->rsvd_63_50 = 0; + } + + sq->sg[sq->head].skb = (u64)skb; + return true; +} + +static u16 cn10k_ipsec_get_param1(u8 iv_offset) +{ + u16 param1_val; + + /* Set Crypto mode, disable L3/L4 checksum */ + param1_val = CN10K_IPSEC_INST_PARAM1_DIS_L4_CSUM | + CN10K_IPSEC_INST_PARAM1_DIS_L3_CSUM; + param1_val |= (u16)iv_offset << CN10K_IPSEC_INST_PARAM1_IV_OFFSET_SHIFT; + return param1_val; +} + +bool cn10k_ipsec_transmit(struct otx2_nic *pf, struct netdev_queue *txq, + struct otx2_snd_queue *sq, struct sk_buff *skb, + int num_segs, int size) +{ + struct cpt_inst_s inst; + struct cpt_res_s *res; + struct xfrm_state *x; + struct qmem *sa_info; + dma_addr_t dptr_iova; + struct sec_path *sp; + u8 encap_offset; + u8 auth_offset; + u8 gthr_size; + u8 iv_offset; + u16 dlen; + + /* Check for IPSEC offload enabled */ + if (!(pf->flags & OTX2_FLAG_IPSEC_OFFLOAD_ENABLED)) + goto drop; + + sp = skb_sec_path(skb); + if (unlikely(!sp->len)) + goto drop; + + x = xfrm_input_state(skb); + if (unlikely(!x)) + goto drop; + + if (x->props.mode != XFRM_MODE_TRANSPORT && + x->props.mode != XFRM_MODE_TUNNEL) + goto drop; + + dlen = cn10k_ipsec_get_ip_data_len(x, skb); + if (dlen == 0 && netif_msg_tx_err(pf)) { + netdev_err(pf->netdev, "Invalid IP header, ip-length zero\n"); + goto drop; + } + + /* Check for valid SA context */ + sa_info = (struct qmem *)x->xso.offload_handle; + if (!sa_info) + goto drop; + + memset(&inst, 0, sizeof(struct cpt_inst_s)); + + /* Get authentication offset */ + if (x->props.family == AF_INET) + auth_offset = sizeof(struct iphdr); + else + auth_offset = sizeof(struct ipv6hdr); + + /* IV offset is after ESP header */ + iv_offset = auth_offset + sizeof(struct ip_esp_hdr); + /* Encap will start after IV */ + encap_offset = iv_offset + GCM_RFC4106_IV_SIZE; + + /* CPT Instruction word-1 */ + res = (struct cpt_res_s *)(sq->cpt_resp->base + (64 * sq->head)); + res->compcode = 0; + inst.res_addr = sq->cpt_resp->iova + (64 * sq->head); + + /* CPT Instruction word-2 */ + inst.rvu_pf_func = pf->pcifunc; + + /* CPT Instruction word-3: + * Set QORD to force CPT_RES_S write completion + */ + inst.qord = 1; + + /* CPT Instruction word-4 */ + /* inst.dlen should not include ICV length */ + inst.dlen = dlen + ETH_HLEN - (x->aead->alg_icv_len / 8); + inst.opcode_major = CN10K_IPSEC_MAJOR_OP_OUTB_IPSEC; + inst.param1 = cn10k_ipsec_get_param1(iv_offset); + + inst.param2 = encap_offset << + CN10K_IPSEC_INST_PARAM2_ENC_DATA_OFFSET_SHIFT; + inst.param2 |= (u16)auth_offset << + CN10K_IPSEC_INST_PARAM2_AUTH_DATA_OFFSET_SHIFT; + + /* CPT Instruction word-5 */ + gthr_size = num_segs / MAX_SEGS_PER_SG; + gthr_size = (num_segs % MAX_SEGS_PER_SG) ? gthr_size + 1 : gthr_size; + + gthr_size &= 0xF; + dptr_iova = (sq->sqe_ring->iova + (sq->head * (sq->sqe_size * 2))); + inst.dptr = dptr_iova | ((u64)gthr_size << 60); + + /* CPT Instruction word-6 */ + inst.rptr = inst.dptr; + + /* CPT Instruction word-7 */ + inst.cptr = sa_info->iova; + inst.ctx_val = 1; + inst.egrp = CN10K_DEF_CPT_IPSEC_EGRP; + + /* CPT Instruction word-0 */ + inst.nixtxl = (size / 16) - 1; + inst.dat_offset = ETH_HLEN; + inst.nixtx_offset = sq->sqe_size; + + netdev_tx_sent_queue(txq, skb->len); + + /* Finally Flush the CPT instruction */ + sq->head++; + sq->head &= (sq->sqe_cnt - 1); + cn10k_cpt_inst_flush(pf, &inst, sizeof(struct cpt_inst_s)); + return true; +drop: + dev_kfree_skb_any(skb); + return false; +} diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.h b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.h index 5ac4de4ae974b..9965df0faa3e7 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.h @@ -9,6 +9,8 @@ #include +DECLARE_STATIC_KEY_FALSE(cn10k_ipsec_sa_enabled); + /* CPT instruction size in bytes */ #define CN10K_CPT_INST_SIZE 64 @@ -53,6 +55,7 @@ /* IPSEC Instruction opcodes */ #define CN10K_IPSEC_MAJOR_OP_WRITE_SA 0x01UL #define CN10K_IPSEC_MINOR_OP_WRITE_SA 0x09UL +#define CN10K_IPSEC_MAJOR_OP_OUTB_IPSEC 0x2AUL enum cn10k_cpt_comp_e { CN10K_CPT_COMP_E_NOTDONE = 0x00, @@ -143,6 +146,16 @@ struct cn10k_tx_sa_s { u64 hw_ctx[6]; /* W31 - W36 */ }; +/* CPT instruction parameter-1 */ +#define CN10K_IPSEC_INST_PARAM1_DIS_L4_CSUM 0x1 +#define CN10K_IPSEC_INST_PARAM1_DIS_L3_CSUM 0x2 +#define CN10K_IPSEC_INST_PARAM1_CRYPTO_MODE 0x20 +#define CN10K_IPSEC_INST_PARAM1_IV_OFFSET_SHIFT 8 + +/* CPT instruction parameter-2 */ +#define CN10K_IPSEC_INST_PARAM2_ENC_DATA_OFFSET_SHIFT 0 +#define CN10K_IPSEC_INST_PARAM2_AUTH_DATA_OFFSET_SHIFT 8 + /* CPT Instruction Structure */ struct cpt_inst_s { u64 nixtxl : 3; /* W0 */ @@ -182,6 +195,15 @@ struct cpt_res_s { u64 esn; /* W1 */ }; +/* CPT SG structure */ +struct cpt_sg_s { + u64 seg1_size : 16; + u64 seg2_size : 16; + u64 seg3_size : 16; + u64 segs : 2; + u64 rsvd_63_50 : 14; +}; + /* CPT LF_INPROG Register */ #define CPT_LF_INPROG_INFLIGHT GENMASK_ULL(8, 0) #define CPT_LF_INPROG_GRB_CNT GENMASK_ULL(39, 32) @@ -204,6 +226,11 @@ struct cpt_res_s { int cn10k_ipsec_init(struct net_device *netdev); void cn10k_ipsec_clean(struct otx2_nic *pf); int cn10k_ipsec_ethtool_init(struct net_device *netdev, bool enable); +bool otx2_sqe_add_sg_ipsec(struct otx2_nic *pfvf, struct otx2_snd_queue *sq, + struct sk_buff *skb, int num_segs, int *offset); +bool cn10k_ipsec_transmit(struct otx2_nic *pf, struct netdev_queue *txq, + struct otx2_snd_queue *sq, struct sk_buff *skb, + int num_segs, int size); #else static inline __maybe_unused int cn10k_ipsec_init(struct net_device *netdev) { @@ -219,5 +246,20 @@ int cn10k_ipsec_ethtool_init(struct net_device *netdev, bool enable) { return 0; } + +static inline bool __maybe_unused +otx2_sqe_add_sg_ipsec(struct otx2_nic *pfvf, struct otx2_snd_queue *sq, + struct sk_buff *skb, int num_segs, int *offset) +{ + return true; +} + +static inline bool __maybe_unused +cn10k_ipsec_transmit(struct otx2_nic *pf, struct netdev_queue *txq, + struct otx2_snd_queue *sq, struct sk_buff *skb, + int num_segs, int size) +{ + return true; +} #endif #endif // CN10K_IPSEC_H diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c index 4c8774899eaf5..bf56888e7fe7d 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c @@ -970,6 +970,29 @@ int otx2_sq_init(struct otx2_nic *pfvf, u16 qidx, u16 sqb_aura) if (err) return err; + /* Allocate memory for NIX SQE (which includes NIX SG) and CPT SG. + * SG of NIX and CPT are same in size. Allocate memory for CPT SG + * same as NIX SQE for base address alignment. + * Layout of a NIX SQE and CPT SG entry: + * ----------------------------- + * | CPT Scatter Gather | + * | (SQE SIZE) | + * | | + * ----------------------------- + * | NIX SQE | + * | (SQE SIZE) | + * | | + * ----------------------------- + */ + err = qmem_alloc(pfvf->dev, &sq->sqe_ring, qset->sqe_cnt, + sq->sqe_size * 2); + if (err) + return err; + + err = qmem_alloc(pfvf->dev, &sq->cpt_resp, qset->sqe_cnt, 64); + if (err) + return err; + if (qidx < pfvf->hw.tx_queues) { err = qmem_alloc(pfvf->dev, &sq->tso_hdrs, qset->sqe_cnt, TSO_HEADER_SIZE); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h index 5e2da67d58bb4..44d737a0dd092 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h @@ -57,6 +57,9 @@ #define NIX_PF_PFC_PRIO_MAX 8 #endif +/* Number of segments per SG structure */ +#define MAX_SEGS_PER_SG 3 + enum arua_mapped_qtypes { AURA_NIX_RQ, AURA_NIX_SQ, diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c index 2f652035d8547..e1dde93e8af82 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c @@ -1485,6 +1485,8 @@ static void otx2_free_sq_res(struct otx2_nic *pf) if (!sq->sqe) continue; qmem_free(pf->dev, sq->sqe); + qmem_free(pf->dev, sq->sqe_ring); + qmem_free(pf->dev, sq->cpt_resp); qmem_free(pf->dev, sq->tso_hdrs); kfree(sq->sg); kfree(sq->sqb_ptrs); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c index a49041e55c330..4e0133d1d8921 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "otx2_reg.h" #include "otx2_common.h" @@ -32,6 +33,17 @@ static bool otx2_xdp_rcv_pkt_handler(struct otx2_nic *pfvf, struct otx2_cq_queue *cq, bool *need_xdp_flush); +static void otx2_sq_set_sqe_base(struct otx2_snd_queue *sq, + struct sk_buff *skb) +{ + if (static_branch_unlikely(&cn10k_ipsec_sa_enabled) && + (xfrm_offload(skb))) + sq->sqe_base = sq->sqe_ring->base + sq->sqe_size + + (sq->head * (sq->sqe_size * 2)); + else + sq->sqe_base = sq->sqe->base; +} + static int otx2_nix_cq_op_status(struct otx2_nic *pfvf, struct otx2_cq_queue *cq) { @@ -593,7 +605,6 @@ void otx2_sqe_flush(void *dev, struct otx2_snd_queue *sq, sq->head &= (sq->sqe_cnt - 1); } -#define MAX_SEGS_PER_SG 3 /* Add SQE scatter/gather subdescriptor structure */ static bool otx2_sqe_add_sg(struct otx2_nic *pfvf, struct otx2_snd_queue *sq, struct sk_buff *skb, int num_segs, int *offset) @@ -1129,6 +1140,7 @@ bool otx2_sq_append_skb(void *dev, struct netdev_queue *txq, int offset, num_segs, free_desc; struct nix_sqe_hdr_s *sqe_hdr; struct otx2_nic *pfvf = dev; + bool ret; /* Check if there is enough room between producer * and consumer index. @@ -1145,6 +1157,7 @@ bool otx2_sq_append_skb(void *dev, struct netdev_queue *txq, /* If SKB doesn't fit in a single SQE, linearize it. * TODO: Consider adding JUMP descriptor instead. */ + if (unlikely(num_segs > OTX2_MAX_FRAGS_IN_SQE)) { if (__skb_linearize(skb)) { dev_kfree_skb_any(skb); @@ -1164,6 +1177,9 @@ bool otx2_sq_append_skb(void *dev, struct netdev_queue *txq, return true; } + /* Set sqe base address */ + otx2_sq_set_sqe_base(sq, skb); + /* Set SQE's SEND_HDR. * Do not clear the first 64bit as it contains constant info. */ @@ -1176,7 +1192,13 @@ bool otx2_sq_append_skb(void *dev, struct netdev_queue *txq, otx2_sqe_add_ext(pfvf, sq, skb, &offset); /* Add SG subdesc with data frags */ - if (!otx2_sqe_add_sg(pfvf, sq, skb, num_segs, &offset)) { + if (static_branch_unlikely(&cn10k_ipsec_sa_enabled) && + (xfrm_offload(skb))) + ret = otx2_sqe_add_sg_ipsec(pfvf, sq, skb, num_segs, &offset); + else + ret = otx2_sqe_add_sg(pfvf, sq, skb, num_segs, &offset); + + if (!ret) { otx2_dma_unmap_skb_frags(pfvf, &sq->sg[sq->head]); return false; } @@ -1185,11 +1207,15 @@ bool otx2_sq_append_skb(void *dev, struct netdev_queue *txq, sqe_hdr->sizem1 = (offset / 16) - 1; + if (static_branch_unlikely(&cn10k_ipsec_sa_enabled) && + (xfrm_offload(skb))) + return cn10k_ipsec_transmit(pfvf, txq, sq, skb, num_segs, + offset); + netdev_tx_sent_queue(txq, skb->len); /* Flush SQE to HW */ pfvf->hw_ops->sqe_flush(pfvf, sq, offset, qidx); - return true; } EXPORT_SYMBOL(otx2_sq_append_skb); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h index e1db5f9618772..d23810963fdbd 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h @@ -101,6 +101,9 @@ struct otx2_snd_queue { struct queue_stats stats; u16 sqb_count; u64 *sqb_ptrs; + /* SQE ring and CPT response queue for Inline IPSEC */ + struct qmem *sqe_ring; + struct qmem *cpt_resp; } ____cacheline_aligned_in_smp; enum cq_type { From 32188be805d052a91b999a723fd93698d83a7fa5 Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Wed, 4 Dec 2024 11:26:58 +0530 Subject: [PATCH 0261/1450] cn10k-ipsec: Allow ipsec crypto offload for skb with SA Allow to use hardware offload for outbound ipsec crypto mode if security association (SA) is set for a given skb. Signed-off-by: Bharat Bhushan Signed-off-by: David S. Miller --- .../ethernet/marvell/octeontx2/nic/cn10k_ipsec.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c index 9a9b06f4c2cc3..e9bf4632695e0 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c @@ -746,9 +746,24 @@ static void cn10k_ipsec_del_state(struct xfrm_state *x) queue_work(pf->ipsec.sa_workq, &pf->ipsec.sa_work); } +static bool cn10k_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *x) +{ + if (x->props.family == AF_INET) { + /* Offload with IPv4 options is not supported yet */ + if (ip_hdr(skb)->ihl > 5) + return false; + } else { + /* Offload with IPv6 extension headers is not support yet */ + if (ipv6_ext_hdr(ipv6_hdr(skb)->nexthdr)) + return false; + } + return true; +} + static const struct xfrmdev_ops cn10k_ipsec_xfrmdev_ops = { .xdo_dev_state_add = cn10k_ipsec_add_state, .xdo_dev_state_delete = cn10k_ipsec_del_state, + .xdo_dev_offload_ok = cn10k_ipsec_offload_ok, }; static void cn10k_ipsec_sa_wq_handler(struct work_struct *work) From b3ae3dc3a30f3de78c0c3675ea980639b9ba212c Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Wed, 4 Dec 2024 11:26:59 +0530 Subject: [PATCH 0262/1450] cn10k-ipsec: Enable outbound ipsec crypto offload Hardware is initialized and netdev transmit flow is hooked up for outbound ipsec crypto offload, so finally enable ipsec offload. Signed-off-by: Bharat Bhushan Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c index e9bf4632695e0..c333e04daad38 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c @@ -821,10 +821,10 @@ int cn10k_ipsec_init(struct net_device *netdev) return -ENOMEM; } - /* Set xfrm device ops - * NETIF_F_HW_ESP is not set as ipsec setup is not yet complete. - */ + /* Set xfrm device ops */ netdev->xfrmdev_ops = &cn10k_ipsec_xfrmdev_ops; + netdev->hw_features |= NETIF_F_HW_ESP; + netdev->hw_enc_features |= NETIF_F_HW_ESP; cn10k_cpt_device_set_unavailable(pf); return 0; From 52e8726d6782a14c7f9e0fea5a5bc8e6a1992fd4 Mon Sep 17 00:00:00 2001 From: Norbert van Bolhuis Date: Fri, 8 Nov 2024 13:52:30 +0100 Subject: [PATCH 0263/1450] wifi: brcmfmac: fix scatter-gather handling by detecting end of sg list The scatter-gather handling uses a pre-allocated list (with nents entries). If the driver runs out of sg entries it will result in an oops. Let's detect this instead and make the SDIO block transfer fail. Signed-off-by: Norbert van Bolhuis Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241108125609.107016-1-nvbolhuis@gmail.com --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c index 42d991d9f8cb4..60eb95fc19a5a 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c @@ -455,6 +455,11 @@ static int brcmf_sdiod_sglist_rw(struct brcmf_sdio_dev *sdiodev, if (sg_data_sz > max_req_sz - req_sz) sg_data_sz = max_req_sz - req_sz; + if (!sgl) { + /* out of (pre-allocated) scatterlist entries */ + ret = -ENOMEM; + goto exit; + } sg_set_buf(sgl, pkt_data, sg_data_sz); sg_cnt++; From 01e767d6f7832f1ef171816953547b466bba9937 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 11 Nov 2024 14:40:35 +0100 Subject: [PATCH 0264/1450] wifi: wlcore: testmode: Constify strutc nla_policy 'struct nla_policy' is not modified in this driver. Constifying this structure moves some data to a read-only section, so increase overall security, especially when the structure holds some function pointers. On a x86_64, with allmodconfig: Before: ====== text data bss dec hex filename 5062 528 0 5590 15d6 drivers/net/wireless/ti/wlcore/testmode.o After: ===== text data bss dec hex filename 5178 404 0 5582 15ce drivers/net/wireless/ti/wlcore/testmode.o Signed-off-by: Christophe JAILLET Reviewed-by: Jeff Johnson Signed-off-by: Kalle Valo Link: https://patch.msgid.link/78810e3ebb74ddbd3a4538f182bf1143b89baba7.1731332414.git.christophe.jaillet@wanadoo.fr --- drivers/net/wireless/ti/wlcore/testmode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/ti/wlcore/testmode.c b/drivers/net/wireless/ti/wlcore/testmode.c index 3f338b8096c77..fc8ea58bc1659 100644 --- a/drivers/net/wireless/ti/wlcore/testmode.c +++ b/drivers/net/wireless/ti/wlcore/testmode.c @@ -45,7 +45,7 @@ enum wl1271_tm_attrs { }; #define WL1271_TM_ATTR_MAX (__WL1271_TM_ATTR_AFTER_LAST - 1) -static struct nla_policy wl1271_tm_policy[WL1271_TM_ATTR_MAX + 1] = { +static const struct nla_policy wl1271_tm_policy[WL1271_TM_ATTR_MAX + 1] = { [WL1271_TM_ATTR_CMD_ID] = { .type = NLA_U32 }, [WL1271_TM_ATTR_ANSWER] = { .type = NLA_U8 }, [WL1271_TM_ATTR_DATA] = { .type = NLA_BINARY, From aba23b0a6a0df84b06ed0323ce127bf7257e4025 Mon Sep 17 00:00:00 2001 From: Renjaya Raga Zenta Date: Thu, 21 Nov 2024 14:09:27 +0700 Subject: [PATCH 0265/1450] wifi: brcmfmac: fix brcmf_vif_clear_mgmt_ies when stopping AP This removes the following error log when stopping AP: ieee80211 phy0: brcmf_vif_set_mgmt_ie: vndr ie set error : -52 It happened if: 1) previously wlan interface was in station mode (wpa_supplicant) and connected to a hotspot 2) then started AP mode (hostapd) 3) and then stopped AP mode. The error happened when it tried to clear BRCMF_VNDR_IE_PRBREQ_FLAG. This flag is not set in `brcmf_config_ap_mgmt_ie`, but BRCMF_VNDR_IE_ASSOCRSP_FLAG is set instead. Signed-off-by: Renjaya Raga Zenta Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241121-brcmfmac-v1-1-02fc3fb427c2@gmail.com --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index 689e779fe00fc..902ac3108782a 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -4999,12 +4999,16 @@ s32 brcmf_vif_set_mgmt_ie(struct brcmf_cfg80211_vif *vif, s32 pktflag, s32 brcmf_vif_clear_mgmt_ies(struct brcmf_cfg80211_vif *vif) { static const s32 pktflags[] = { - BRCMF_VNDR_IE_PRBREQ_FLAG, BRCMF_VNDR_IE_PRBRSP_FLAG, BRCMF_VNDR_IE_BEACON_FLAG }; int i; + if (vif->wdev.iftype == NL80211_IFTYPE_AP) + brcmf_vif_set_mgmt_ie(vif, BRCMF_VNDR_IE_ASSOCRSP_FLAG, NULL, 0); + else + brcmf_vif_set_mgmt_ie(vif, BRCMF_VNDR_IE_PRBREQ_FLAG, NULL, 0); + for (i = 0; i < ARRAY_SIZE(pktflags); i++) brcmf_vif_set_mgmt_ie(vif, pktflags[i], NULL, 0); From f143cece43dd05fa651fa14d97726b67b92e9d03 Mon Sep 17 00:00:00 2001 From: Pin-yen Lin Date: Wed, 27 Nov 2024 18:55:43 +0800 Subject: [PATCH 0266/1450] wifi: mwifiex: decrease timeout waiting for host sleep from 10s to 5s In commit 52250cbee7f6 ("mwifiex: use timeout variant for wait_event_interruptible") it was noted that sometimes we seemed to miss the signal that our host sleep settings took effect. A 10 second timeout was added to the code to make sure we didn't hang forever waiting. It appears that this problem still exists and we hit the timeout sometimes for Chromebooks in the field. Recently on ChromeOS we've started setting the DPM watchdog to trip if full system suspend takes over 10 seconds. Given the timeout in the original patch, obviously we're hitting the DPM watchdog before mwifiex gets a chance to timeout. While we could increase the DPM watchdog in ChromeOS to avoid this problem, it's probably better to simply decrease the timeout. Any time we're waiting several seconds for the firmware to respond it's likely that the firmware won't ever respond. With that in mind, decrease the timeout in mwifiex from 10 seconds to 5 seconds. Suggested-by: Doug Anderson Signed-off-by: Pin-yen Lin Reviewed-by: Douglas Anderson Acked-by: Brian Norris Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241127105709.4014302-1-treapking@chromium.org --- drivers/net/wireless/marvell/mwifiex/sta_ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/marvell/mwifiex/sta_ioctl.c b/drivers/net/wireless/marvell/mwifiex/sta_ioctl.c index e06a0622973e7..f79589cafe572 100644 --- a/drivers/net/wireless/marvell/mwifiex/sta_ioctl.c +++ b/drivers/net/wireless/marvell/mwifiex/sta_ioctl.c @@ -545,7 +545,7 @@ int mwifiex_enable_hs(struct mwifiex_adapter *adapter) if (wait_event_interruptible_timeout(adapter->hs_activate_wait_q, adapter->hs_activate_wait_q_woken, - (10 * HZ)) <= 0) { + (5 * HZ)) <= 0) { mwifiex_dbg(adapter, ERROR, "hs_activate_wait_q terminated\n"); return false; From cb1b78f1c726c938bd47497c1ab16b01ce967f37 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Tue, 10 Sep 2024 00:44:32 +0000 Subject: [PATCH 0267/1450] tools: hv: Fix a complier warning in the fcopy uio daemon hv_fcopy_uio_daemon.c:436:53: warning: '%s' directive output may be truncated writing up to 14 bytes into a region of size 10 [-Wformat-truncation=] 436 | snprintf(uio_dev_path, sizeof(uio_dev_path), "/dev/%s", uio_name); Also added 'static' for the array 'desc[]'. Fixes: 82b0945ce2c2 ("tools: hv: Add new fcopy application based on uio driver") Cc: stable@vger.kernel.org # 6.10+ Signed-off-by: Dexuan Cui Reviewed-by: Saurabh Sengar Link: https://lore.kernel.org/r/20240910004433.50254-1-decui@microsoft.com Signed-off-by: Wei Liu Message-ID: <20240910004433.50254-1-decui@microsoft.com> --- tools/hv/hv_fcopy_uio_daemon.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tools/hv/hv_fcopy_uio_daemon.c b/tools/hv/hv_fcopy_uio_daemon.c index 7a00f3066a980..12743d7f164f0 100644 --- a/tools/hv/hv_fcopy_uio_daemon.c +++ b/tools/hv/hv_fcopy_uio_daemon.c @@ -35,8 +35,6 @@ #define WIN8_SRV_MINOR 1 #define WIN8_SRV_VERSION (WIN8_SRV_MAJOR << 16 | WIN8_SRV_MINOR) -#define MAX_FOLDER_NAME 15 -#define MAX_PATH_LEN 15 #define FCOPY_UIO "/sys/bus/vmbus/devices/eb765408-105f-49b6-b4aa-c123b64d17d4/uio" #define FCOPY_VER_COUNT 1 @@ -51,7 +49,7 @@ static const int fw_versions[] = { #define HV_RING_SIZE 0x4000 /* 16KB ring buffer size */ -unsigned char desc[HV_RING_SIZE]; +static unsigned char desc[HV_RING_SIZE]; static int target_fd; static char target_fname[PATH_MAX]; @@ -409,8 +407,8 @@ int main(int argc, char *argv[]) struct vmbus_br txbr, rxbr; void *ring; uint32_t len = HV_RING_SIZE; - char uio_name[MAX_FOLDER_NAME] = {0}; - char uio_dev_path[MAX_PATH_LEN] = {0}; + char uio_name[NAME_MAX] = {0}; + char uio_dev_path[PATH_MAX] = {0}; static struct option long_options[] = { {"help", no_argument, 0, 'h' }, From bcc80dec91ee745b3d66f3e48f0ec2efdea97149 Mon Sep 17 00:00:00 2001 From: Naman Jain Date: Tue, 17 Sep 2024 11:09:17 +0530 Subject: [PATCH 0268/1450] x86/hyperv: Fix hv tsc page based sched_clock for hibernation read_hv_sched_clock_tsc() assumes that the Hyper-V clock counter is bigger than the variable hv_sched_clock_offset, which is cached during early boot, but depending on the timing this assumption may be false when a hibernated VM starts again (the clock counter starts from 0 again) and is resuming back (Note: hv_init_tsc_clocksource() is not called during hibernation/resume); consequently, read_hv_sched_clock_tsc() may return a negative integer (which is interpreted as a huge positive integer since the return type is u64) and new kernel messages are prefixed with huge timestamps before read_hv_sched_clock_tsc() grows big enough (which typically takes several seconds). Fix the issue by saving the Hyper-V clock counter just before the suspend, and using it to correct the hv_sched_clock_offset in resume. This makes hv tsc page based sched_clock continuous and ensures that post resume, it starts from where it left off during suspend. Override x86_platform.save_sched_clock_state and x86_platform.restore_sched_clock_state routines to correct this as soon as possible. Note: if Invariant TSC is available, the issue doesn't happen because 1) we don't register read_hv_sched_clock_tsc() for sched clock: See commit e5313f1c5404 ("clocksource/drivers/hyper-v: Rework clocksource and sched clock setup"); 2) the common x86 code adjusts TSC similarly: see __restore_processor_state() -> tsc_verify_tsc_adjust(true) and x86_platform.restore_sched_clock_state(). Cc: stable@vger.kernel.org Fixes: 1349401ff1aa ("clocksource/drivers/hyper-v: Suspend/resume Hyper-V clocksource for hibernation") Co-developed-by: Dexuan Cui Signed-off-by: Dexuan Cui Signed-off-by: Naman Jain Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20240917053917.76787-1-namjain@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <20240917053917.76787-1-namjain@linux.microsoft.com> --- arch/x86/kernel/cpu/mshyperv.c | 58 ++++++++++++++++++++++++++++++ drivers/clocksource/hyperv_timer.c | 14 +++++++- include/clocksource/hyperv_timer.h | 2 ++ 3 files changed, 73 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index d18078834deda..dc12fe5ef3caa 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -223,6 +223,63 @@ static void hv_machine_crash_shutdown(struct pt_regs *regs) hyperv_cleanup(); } #endif /* CONFIG_CRASH_DUMP */ + +static u64 hv_ref_counter_at_suspend; +static void (*old_save_sched_clock_state)(void); +static void (*old_restore_sched_clock_state)(void); + +/* + * Hyper-V clock counter resets during hibernation. Save and restore clock + * offset during suspend/resume, while also considering the time passed + * before suspend. This is to make sure that sched_clock using hv tsc page + * based clocksource, proceeds from where it left off during suspend and + * it shows correct time for the timestamps of kernel messages after resume. + */ +static void save_hv_clock_tsc_state(void) +{ + hv_ref_counter_at_suspend = hv_read_reference_counter(); +} + +static void restore_hv_clock_tsc_state(void) +{ + /* + * Adjust the offsets used by hv tsc clocksource to + * account for the time spent before hibernation. + * adjusted value = reference counter (time) at suspend + * - reference counter (time) now. + */ + hv_adj_sched_clock_offset(hv_ref_counter_at_suspend - hv_read_reference_counter()); +} + +/* + * Functions to override save_sched_clock_state and restore_sched_clock_state + * functions of x86_platform. The Hyper-V clock counter is reset during + * suspend-resume and the offset used to measure time needs to be + * corrected, post resume. + */ +static void hv_save_sched_clock_state(void) +{ + old_save_sched_clock_state(); + save_hv_clock_tsc_state(); +} + +static void hv_restore_sched_clock_state(void) +{ + restore_hv_clock_tsc_state(); + old_restore_sched_clock_state(); +} + +static void __init x86_setup_ops_for_tsc_pg_clock(void) +{ + if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE)) + return; + + old_save_sched_clock_state = x86_platform.save_sched_clock_state; + x86_platform.save_sched_clock_state = hv_save_sched_clock_state; + + old_restore_sched_clock_state = x86_platform.restore_sched_clock_state; + x86_platform.restore_sched_clock_state = hv_restore_sched_clock_state; +} #endif /* CONFIG_HYPERV */ static uint32_t __init ms_hyperv_platform(void) @@ -579,6 +636,7 @@ static void __init ms_hyperv_init_platform(void) /* Register Hyper-V specific clocksource */ hv_init_clocksource(); + x86_setup_ops_for_tsc_pg_clock(); hv_vtl_init_platform(); #endif /* diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c index 99177835cadec..b39dee7b93af0 100644 --- a/drivers/clocksource/hyperv_timer.c +++ b/drivers/clocksource/hyperv_timer.c @@ -27,7 +27,8 @@ #include static struct clock_event_device __percpu *hv_clock_event; -static u64 hv_sched_clock_offset __ro_after_init; +/* Note: offset can hold negative values after hibernation. */ +static u64 hv_sched_clock_offset __read_mostly; /* * If false, we're using the old mechanism for stimer0 interrupts @@ -470,6 +471,17 @@ static void resume_hv_clock_tsc(struct clocksource *arg) hv_set_msr(HV_MSR_REFERENCE_TSC, tsc_msr.as_uint64); } +/* + * Called during resume from hibernation, from overridden + * x86_platform.restore_sched_clock_state routine. This is to adjust offsets + * used to calculate time for hv tsc page based sched_clock, to account for + * time spent before hibernation. + */ +void hv_adj_sched_clock_offset(u64 offset) +{ + hv_sched_clock_offset -= offset; +} + #ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK static int hv_cs_enable(struct clocksource *cs) { diff --git a/include/clocksource/hyperv_timer.h b/include/clocksource/hyperv_timer.h index 6cdc873ac907f..aa5233b1eba97 100644 --- a/include/clocksource/hyperv_timer.h +++ b/include/clocksource/hyperv_timer.h @@ -38,6 +38,8 @@ extern void hv_remap_tsc_clocksource(void); extern unsigned long hv_get_tsc_pfn(void); extern struct ms_hyperv_tsc_page *hv_get_tsc_page(void); +extern void hv_adj_sched_clock_offset(u64 offset); + static __always_inline bool hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, u64 *cur_tsc, u64 *time) From 91ae69c7ed9e262f24240c425ad1eef2cf6639b7 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Wed, 16 Oct 2024 16:35:10 +0200 Subject: [PATCH 0269/1450] tools: hv: change permissions of NetworkManager configuration file Align permissions of the resulting .nmconnection file, instead of the input file from hv_kvp_daemon. To avoid the tiny time frame where the output file is world-readable, use umask instead of chmod. Fixes: 42999c904612 ("hv/hv_kvp_daemon:Support for keyfile based connection profile") Signed-off-by: Olaf Hering Reviewed-by: Shradha Gupta Link: https://lore.kernel.org/r/20241016143521.3735-1-olaf@aepfle.de Signed-off-by: Wei Liu Message-ID: <20241016143521.3735-1-olaf@aepfle.de> --- tools/hv/hv_set_ifconfig.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/hv/hv_set_ifconfig.sh b/tools/hv/hv_set_ifconfig.sh index 440a91b35823b..2f8baed2b8f79 100755 --- a/tools/hv/hv_set_ifconfig.sh +++ b/tools/hv/hv_set_ifconfig.sh @@ -81,7 +81,7 @@ echo "ONBOOT=yes" >> $1 cp $1 /etc/sysconfig/network-scripts/ -chmod 600 $2 +umask 0177 interface=$(echo $2 | awk -F - '{ print $2 }') filename="${2##*/}" From 67b5e1042d90d8a9814f22312c1147b4c9cd501a Mon Sep 17 00:00:00 2001 From: Easwar Hariharan Date: Wed, 30 Oct 2024 17:47:36 +0000 Subject: [PATCH 0270/1450] drivers: hv: Convert open-coded timeouts to secs_to_jiffies() We have several places where timeouts are open-coded as N (seconds) * HZ, but best practice is to use the utility functions from jiffies.h. Convert the timeouts to be compliant. This doesn't fix any bugs, it's a simple code improvement. Signed-off-by: Easwar Hariharan Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20241030-open-coded-timeouts-v3-2-9ba123facf88@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <20241030-open-coded-timeouts-v3-2-9ba123facf88@linux.microsoft.com> --- drivers/hv/hv_balloon.c | 9 +++++---- drivers/hv/hv_kvp.c | 4 ++-- drivers/hv/hv_snapshot.c | 3 ++- drivers/hv/vmbus_drv.c | 2 +- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index c38dcdfcb914d..a99112e6f0b85 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -756,7 +756,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, * adding succeeded, it is ok to proceed even if the memory was * not onlined in time. */ - wait_for_completion_timeout(&dm_device.ol_waitevent, 5 * HZ); + wait_for_completion_timeout(&dm_device.ol_waitevent, secs_to_jiffies(5)); post_status(&dm_device); } } @@ -1373,7 +1373,8 @@ static int dm_thread_func(void *dm_dev) struct hv_dynmem_device *dm = dm_dev; while (!kthread_should_stop()) { - wait_for_completion_interruptible_timeout(&dm_device.config_event, 1 * HZ); + wait_for_completion_interruptible_timeout(&dm_device.config_event, + secs_to_jiffies(1)); /* * The host expects us to post information on the memory * pressure every second. @@ -1748,7 +1749,7 @@ static int balloon_connect_vsp(struct hv_device *dev) if (ret) goto out; - t = wait_for_completion_timeout(&dm_device.host_event, 5 * HZ); + t = wait_for_completion_timeout(&dm_device.host_event, secs_to_jiffies(5)); if (t == 0) { ret = -ETIMEDOUT; goto out; @@ -1806,7 +1807,7 @@ static int balloon_connect_vsp(struct hv_device *dev) if (ret) goto out; - t = wait_for_completion_timeout(&dm_device.host_event, 5 * HZ); + t = wait_for_completion_timeout(&dm_device.host_event, secs_to_jiffies(5)); if (t == 0) { ret = -ETIMEDOUT; goto out; diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c index d35b60c061148..29e01247a0870 100644 --- a/drivers/hv/hv_kvp.c +++ b/drivers/hv/hv_kvp.c @@ -655,7 +655,7 @@ void hv_kvp_onchannelcallback(void *context) if (host_negotiatied == NEGO_NOT_STARTED) { host_negotiatied = NEGO_IN_PROGRESS; schedule_delayed_work(&kvp_host_handshake_work, - HV_UTIL_NEGO_TIMEOUT * HZ); + secs_to_jiffies(HV_UTIL_NEGO_TIMEOUT)); } return; } @@ -724,7 +724,7 @@ void hv_kvp_onchannelcallback(void *context) */ schedule_work(&kvp_sendkey_work); schedule_delayed_work(&kvp_timeout_work, - HV_UTIL_TIMEOUT * HZ); + secs_to_jiffies(HV_UTIL_TIMEOUT)); return; diff --git a/drivers/hv/hv_snapshot.c b/drivers/hv/hv_snapshot.c index 0d2184be16912..86d87486ed40b 100644 --- a/drivers/hv/hv_snapshot.c +++ b/drivers/hv/hv_snapshot.c @@ -193,7 +193,8 @@ static void vss_send_op(void) vss_transaction.state = HVUTIL_USERSPACE_REQ; schedule_delayed_work(&vss_timeout_work, op == VSS_OP_FREEZE ? - VSS_FREEZE_TIMEOUT * HZ : HV_UTIL_TIMEOUT * HZ); + secs_to_jiffies(VSS_FREEZE_TIMEOUT) : + secs_to_jiffies(HV_UTIL_TIMEOUT)); rc = hvutil_transport_send(hvt, vss_msg, sizeof(*vss_msg), NULL); if (rc) { diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 6d89d37b069ad..2892b8da20a5e 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -2507,7 +2507,7 @@ static int vmbus_bus_resume(struct device *dev) vmbus_request_offers(); if (wait_for_completion_timeout( - &vmbus_connection.ready_for_resume_event, 10 * HZ) == 0) + &vmbus_connection.ready_for_resume_event, secs_to_jiffies(10)) == 0) pr_err("Some vmbus device is missing after suspending?\n"); /* Reset the event for the next suspend. */ From a9640fcdd400463442846677e62b8208b81cb031 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Tue, 5 Nov 2024 09:14:04 +0100 Subject: [PATCH 0271/1450] tools/hv: terminate fcopy daemon if read from uio fails Terminate endless loop in reading fails, to avoid flooding syslog. This happens if the state of "Guest services" integration service is changed from "enabled" to "disabled" at runtime in the VM settings. In this case pread returns EIO. Also handle an interrupted system call, and continue in this case. Signed-off-by: Olaf Hering Reviewed-by: Saurabh Sengar Link: https://lore.kernel.org/r/20241105081437.15689-1-olaf@aepfle.de Signed-off-by: Wei Liu Message-ID: <20241105081437.15689-1-olaf@aepfle.de> --- tools/hv/hv_fcopy_uio_daemon.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/hv/hv_fcopy_uio_daemon.c b/tools/hv/hv_fcopy_uio_daemon.c index 12743d7f164f0..0198321d14a29 100644 --- a/tools/hv/hv_fcopy_uio_daemon.c +++ b/tools/hv/hv_fcopy_uio_daemon.c @@ -466,8 +466,10 @@ int main(int argc, char *argv[]) */ ret = pread(fcopy_fd, &tmp, sizeof(int), 0); if (ret < 0) { + if (errno == EINTR || errno == EAGAIN) + continue; syslog(LOG_ERR, "pread failed: %s", strerror(errno)); - continue; + goto close; } len = HV_RING_SIZE; From 96e052d1473843d644ceba2adf46d3d2180b8ca7 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Wed, 6 Nov 2024 07:42:46 -0800 Subject: [PATCH 0272/1450] Drivers: hv: util: Don't force error code to ENODEV in util_probe() If the util_init function call in util_probe() returns an error code, util_probe() always return ENODEV, and the error code from the util_init function is lost. The error message output in the caller, vmbus_probe(), doesn't show the real error code. Fix this by just returning the error code from the util_init function. There doesn't seem to be a reason to force ENODEV, as other errors such as ENOMEM can already be returned from util_probe(). And the code in call_driver_probe() implies that ENODEV should mean that a matching driver wasn't found, which is not the case here. Suggested-by: Dexuan Cui Signed-off-by: Michael Kelley Acked-by: Wei Liu Link: https://lore.kernel.org/r/20241106154247.2271-2-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20241106154247.2271-2-mhklinux@outlook.com> --- drivers/hv/hv_util.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c index c4f525325790f..3707222201344 100644 --- a/drivers/hv/hv_util.c +++ b/drivers/hv/hv_util.c @@ -590,10 +590,8 @@ static int util_probe(struct hv_device *dev, srv->channel = dev->channel; if (srv->util_init) { ret = srv->util_init(srv); - if (ret) { - ret = -ENODEV; + if (ret) goto error1; - } } /* From 07a756a49f4b4290b49ea46e089cbe6f79ff8d26 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Wed, 6 Nov 2024 07:42:47 -0800 Subject: [PATCH 0273/1450] Drivers: hv: util: Avoid accessing a ringbuffer not initialized yet If the KVP (or VSS) daemon starts before the VMBus channel's ringbuffer is fully initialized, we can hit the panic below: hv_utils: Registering HyperV Utility Driver hv_vmbus: registering driver hv_utils ... BUG: kernel NULL pointer dereference, address: 0000000000000000 CPU: 44 UID: 0 PID: 2552 Comm: hv_kvp_daemon Tainted: G E 6.11.0-rc3+ #1 RIP: 0010:hv_pkt_iter_first+0x12/0xd0 Call Trace: ... vmbus_recvpacket hv_kvp_onchannelcallback vmbus_on_event tasklet_action_common tasklet_action handle_softirqs irq_exit_rcu sysvec_hyperv_stimer0 asm_sysvec_hyperv_stimer0 ... kvp_register_done hvt_op_read vfs_read ksys_read __x64_sys_read This can happen because the KVP/VSS channel callback can be invoked even before the channel is fully opened: 1) as soon as hv_kvp_init() -> hvutil_transport_init() creates /dev/vmbus/hv_kvp, the kvp daemon can open the device file immediately and register itself to the driver by writing a message KVP_OP_REGISTER1 to the file (which is handled by kvp_on_msg() ->kvp_handle_handshake()) and reading the file for the driver's response, which is handled by hvt_op_read(), which calls hvt->on_read(), i.e. kvp_register_done(). 2) the problem with kvp_register_done() is that it can cause the channel callback to be called even before the channel is fully opened, and when the channel callback is starting to run, util_probe()-> vmbus_open() may have not initialized the ringbuffer yet, so the callback can hit the panic of NULL pointer dereference. To reproduce the panic consistently, we can add a "ssleep(10)" for KVP in __vmbus_open(), just before the first hv_ringbuffer_init(), and then we unload and reload the driver hv_utils, and run the daemon manually within the 10 seconds. Fix the panic by reordering the steps in util_probe() so the char dev entry used by the KVP or VSS daemon is not created until after vmbus_open() has completed. This reordering prevents the race condition from happening. Reported-by: Dexuan Cui Fixes: e0fa3e5e7df6 ("Drivers: hv: utils: fix a race on userspace daemons registration") Cc: stable@vger.kernel.org Signed-off-by: Michael Kelley Acked-by: Wei Liu Link: https://lore.kernel.org/r/20241106154247.2271-3-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20241106154247.2271-3-mhklinux@outlook.com> --- drivers/hv/hv_kvp.c | 6 ++++++ drivers/hv/hv_snapshot.c | 6 ++++++ drivers/hv/hv_util.c | 9 +++++++++ drivers/hv/hyperv_vmbus.h | 2 ++ include/linux/hyperv.h | 1 + 5 files changed, 24 insertions(+) diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c index 29e01247a0870..7400a5a4d2bd7 100644 --- a/drivers/hv/hv_kvp.c +++ b/drivers/hv/hv_kvp.c @@ -767,6 +767,12 @@ hv_kvp_init(struct hv_util_service *srv) */ kvp_transaction.state = HVUTIL_DEVICE_INIT; + return 0; +} + +int +hv_kvp_init_transport(void) +{ hvt = hvutil_transport_init(kvp_devname, CN_KVP_IDX, CN_KVP_VAL, kvp_on_msg, kvp_on_reset); if (!hvt) diff --git a/drivers/hv/hv_snapshot.c b/drivers/hv/hv_snapshot.c index 86d87486ed40b..bde637a96c379 100644 --- a/drivers/hv/hv_snapshot.c +++ b/drivers/hv/hv_snapshot.c @@ -389,6 +389,12 @@ hv_vss_init(struct hv_util_service *srv) */ vss_transaction.state = HVUTIL_DEVICE_INIT; + return 0; +} + +int +hv_vss_init_transport(void) +{ hvt = hvutil_transport_init(vss_devname, CN_VSS_IDX, CN_VSS_VAL, vss_on_msg, vss_on_reset); if (!hvt) { diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c index 3707222201344..36ee89c0358bd 100644 --- a/drivers/hv/hv_util.c +++ b/drivers/hv/hv_util.c @@ -141,6 +141,7 @@ static struct hv_util_service util_heartbeat = { static struct hv_util_service util_kvp = { .util_cb = hv_kvp_onchannelcallback, .util_init = hv_kvp_init, + .util_init_transport = hv_kvp_init_transport, .util_pre_suspend = hv_kvp_pre_suspend, .util_pre_resume = hv_kvp_pre_resume, .util_deinit = hv_kvp_deinit, @@ -149,6 +150,7 @@ static struct hv_util_service util_kvp = { static struct hv_util_service util_vss = { .util_cb = hv_vss_onchannelcallback, .util_init = hv_vss_init, + .util_init_transport = hv_vss_init_transport, .util_pre_suspend = hv_vss_pre_suspend, .util_pre_resume = hv_vss_pre_resume, .util_deinit = hv_vss_deinit, @@ -611,6 +613,13 @@ static int util_probe(struct hv_device *dev, if (ret) goto error; + if (srv->util_init_transport) { + ret = srv->util_init_transport(); + if (ret) { + vmbus_close(dev->channel); + goto error; + } + } return 0; error: diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index d2856023d53c9..52cb744b4d7fd 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -370,12 +370,14 @@ void vmbus_on_event(unsigned long data); void vmbus_on_msg_dpc(unsigned long data); int hv_kvp_init(struct hv_util_service *srv); +int hv_kvp_init_transport(void); void hv_kvp_deinit(void); int hv_kvp_pre_suspend(void); int hv_kvp_pre_resume(void); void hv_kvp_onchannelcallback(void *context); int hv_vss_init(struct hv_util_service *srv); +int hv_vss_init_transport(void); void hv_vss_deinit(void); int hv_vss_pre_suspend(void); int hv_vss_pre_resume(void); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 22c22fb910421..02a226bcf0edc 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1559,6 +1559,7 @@ struct hv_util_service { void *channel; void (*util_cb)(void *); int (*util_init)(struct hv_util_service *); + int (*util_init_transport)(void); void (*util_deinit)(void); int (*util_pre_suspend)(void); int (*util_pre_resume)(void); From 07dfa6e821e1c58cbd0f195173dddbd593721f9b Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 12 Nov 2024 16:04:01 +0100 Subject: [PATCH 0274/1450] hv/hv_kvp_daemon: Pass NIC name to hv_get_dns_info as well The reference implementation of hv_get_dns_info which is in the tree uses /etc/resolv.conf to get DNS servers and this does not require to know which NIC is queried. Distro specific implementations, however, may want to provide per-NIC, fine grained information. E.g. NetworkManager keeps track of DNS servers per connection. Similar to hv_get_dhcp_info, pass NIC name as a parameter to hv_get_dns_info script. Signed-off-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20241112150401.217094-1-vkuznets@redhat.com Signed-off-by: Wei Liu Message-ID: <20241112150401.217094-1-vkuznets@redhat.com> --- tools/hv/hv_kvp_daemon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c index ae57bf69ad4af..296a7a62c54d3 100644 --- a/tools/hv/hv_kvp_daemon.c +++ b/tools/hv/hv_kvp_daemon.c @@ -725,7 +725,7 @@ static void kvp_get_ipconfig_info(char *if_name, * . */ - sprintf(cmd, KVP_SCRIPTS_PATH "%s", "hv_get_dns_info"); + sprintf(cmd, KVP_SCRIPTS_PATH "%s %s", "hv_get_dns_info", if_name); /* * Execute the command to gather DNS info. From a4d024fe2e77063069c5f423f2f9be766450f0f9 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Mon, 2 Dec 2024 13:04:10 +0100 Subject: [PATCH 0275/1450] tools/hv: reduce resouce usage in hv_get_dns_info helper Remove the usage of cat. Replace the shell process with awk with 'exec'. Also use a generic shell because no bash specific features will be used. Signed-off-by: Olaf Hering Acked-by: Wei Liu Link: https://lore.kernel.org/r/20241202120432.21115-1-olaf@aepfle.de Signed-off-by: Wei Liu Message-ID: <20241202120432.21115-1-olaf@aepfle.de> --- tools/hv/hv_get_dns_info.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/hv/hv_get_dns_info.sh b/tools/hv/hv_get_dns_info.sh index 058c17b46ffcd..268521234d4b0 100755 --- a/tools/hv/hv_get_dns_info.sh +++ b/tools/hv/hv_get_dns_info.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh # This example script parses /etc/resolv.conf to retrive DNS information. # In the interest of keeping the KVP daemon code free of distro specific @@ -10,4 +10,4 @@ # this script can be based on the Network Manager APIs for retrieving DNS # entries. -cat /etc/resolv.conf 2>/dev/null | awk '/^nameserver/ { print $2 }' +exec awk '/^nameserver/ { print $2 }' /etc/resolv.conf 2>/dev/null From becc7fe329c09a7744fa908fca83418fa94a45a0 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Mon, 2 Dec 2024 13:40:52 +0100 Subject: [PATCH 0276/1450] tools/hv: add a .gitignore file Remove generated files from 'git status' output after 'make -C tools/hv'. Signed-off-by: Olaf Hering Link: https://lore.kernel.org/r/20241202124107.28650-1-olaf@aepfle.de Signed-off-by: Wei Liu Message-ID: <20241202124107.28650-1-olaf@aepfle.de> --- tools/hv/.gitignore | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 tools/hv/.gitignore diff --git a/tools/hv/.gitignore b/tools/hv/.gitignore new file mode 100644 index 0000000000000..0c5bc15d602f0 --- /dev/null +++ b/tools/hv/.gitignore @@ -0,0 +1,3 @@ +hv_fcopy_uio_daemon +hv_kvp_daemon +hv_vss_daemon From 175c71c2aceef173ae6d3dceb41edfc2ac0d5937 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Sun, 8 Dec 2024 23:47:17 +0000 Subject: [PATCH 0277/1450] tools/hv: reduce resource usage in hv_kvp_daemon hv_kvp_daemon uses popen(3) and system(3) as convinience helper to launch external helpers. These helpers are invoked via a temporary shell process. There is no need to keep this temporary process around while the helper runs. Replace this temporary shell with the actual helper process via 'exec'. Signed-off-by: Olaf Hering Link: https://lore.kernel.org/linux-hyperv/20241202123520.27812-1-olaf@aepfle.de/ Signed-off-by: Wei Liu --- tools/hv/hv_kvp_daemon.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c index 296a7a62c54d3..04ba035d67e9d 100644 --- a/tools/hv/hv_kvp_daemon.c +++ b/tools/hv/hv_kvp_daemon.c @@ -725,7 +725,7 @@ static void kvp_get_ipconfig_info(char *if_name, * . */ - sprintf(cmd, KVP_SCRIPTS_PATH "%s %s", "hv_get_dns_info", if_name); + sprintf(cmd, "exec %s %s", KVP_SCRIPTS_PATH "hv_get_dns_info", if_name); /* * Execute the command to gather DNS info. @@ -742,7 +742,7 @@ static void kvp_get_ipconfig_info(char *if_name, * Enabled: DHCP enabled. */ - sprintf(cmd, KVP_SCRIPTS_PATH "%s %s", "hv_get_dhcp_info", if_name); + sprintf(cmd, "exec %s %s", KVP_SCRIPTS_PATH "hv_get_dhcp_info", if_name); file = popen(cmd, "r"); if (file == NULL) @@ -1606,8 +1606,9 @@ static int kvp_set_ip_info(char *if_name, struct hv_kvp_ipaddr_value *new_val) * invoke the external script to do its magic. */ - str_len = snprintf(cmd, sizeof(cmd), KVP_SCRIPTS_PATH "%s %s %s", - "hv_set_ifconfig", if_filename, nm_filename); + str_len = snprintf(cmd, sizeof(cmd), "exec %s %s %s", + KVP_SCRIPTS_PATH "hv_set_ifconfig", + if_filename, nm_filename); /* * This is a little overcautious, but it's necessary to suppress some * false warnings from gcc 8.0.1. From d1fd972914239996dbd15c5142d7f6e09d95a002 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:29 +0000 Subject: [PATCH 0278/1450] ktime: Add us_to_ktime() Add a us_to_ktime() helper to go with ms_to_ktime() and ns_to_ktime(). Signed-off-by: David Howells cc: Thomas Gleixner cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-2-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/ktime.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/ktime.h b/include/linux/ktime.h index 3a4e723eae0f1..383ed99858024 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -222,6 +222,11 @@ static inline ktime_t ns_to_ktime(u64 ns) return ns; } +static inline ktime_t us_to_ktime(u64 us) +{ + return us * NSEC_PER_USEC; +} + static inline ktime_t ms_to_ktime(u64 ms) { return ms * NSEC_PER_MSEC; From 0e56ebde245e4799ce74d38419426f2a80d39950 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:30 +0000 Subject: [PATCH 0279/1450] rxrpc: Fix handling of received connection abort Fix the handling of a connection abort that we've received. Though the abort is at the connection level, it needs propagating to the calls on that connection. Whilst the propagation bit is performed, the calls aren't then woken up to go and process their termination, and as no further input is forthcoming, they just hang. Also add some tracing for the logging of connection aborts. Fixes: 248f219cb8bc ("rxrpc: Rewrite the data and ack handling code") Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-3-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 25 +++++++++++++++++++++++++ net/rxrpc/conn_event.c | 12 ++++++++---- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index d03e0bd8c028b..27c23873c8811 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -117,6 +117,7 @@ #define rxrpc_call_poke_traces \ EM(rxrpc_call_poke_abort, "Abort") \ EM(rxrpc_call_poke_complete, "Compl") \ + EM(rxrpc_call_poke_conn_abort, "Conn-abort") \ EM(rxrpc_call_poke_error, "Error") \ EM(rxrpc_call_poke_idle, "Idle") \ EM(rxrpc_call_poke_set_timeout, "Set-timo") \ @@ -282,6 +283,7 @@ EM(rxrpc_call_see_activate_client, "SEE act-clnt") \ EM(rxrpc_call_see_connect_failed, "SEE con-fail") \ EM(rxrpc_call_see_connected, "SEE connect ") \ + EM(rxrpc_call_see_conn_abort, "SEE conn-abt") \ EM(rxrpc_call_see_disconnected, "SEE disconn ") \ EM(rxrpc_call_see_distribute_error, "SEE dist-err") \ EM(rxrpc_call_see_input, "SEE input ") \ @@ -981,6 +983,29 @@ TRACE_EVENT(rxrpc_rx_abort, __entry->abort_code) ); +TRACE_EVENT(rxrpc_rx_conn_abort, + TP_PROTO(const struct rxrpc_connection *conn, const struct sk_buff *skb), + + TP_ARGS(conn, skb), + + TP_STRUCT__entry( + __field(unsigned int, conn) + __field(rxrpc_serial_t, serial) + __field(u32, abort_code) + ), + + TP_fast_assign( + __entry->conn = conn->debug_id; + __entry->serial = rxrpc_skb(skb)->hdr.serial; + __entry->abort_code = skb->priority; + ), + + TP_printk("C=%08x ABORT %08x ac=%d", + __entry->conn, + __entry->serial, + __entry->abort_code) + ); + TRACE_EVENT(rxrpc_rx_challenge, TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial, u32 version, u32 nonce, u32 min_level), diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 598b4ee389fc1..2a1396cd892f3 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -63,11 +63,12 @@ int rxrpc_abort_conn(struct rxrpc_connection *conn, struct sk_buff *skb, /* * Mark a connection as being remotely aborted. */ -static bool rxrpc_input_conn_abort(struct rxrpc_connection *conn, +static void rxrpc_input_conn_abort(struct rxrpc_connection *conn, struct sk_buff *skb) { - return rxrpc_set_conn_aborted(conn, skb, skb->priority, -ECONNABORTED, - RXRPC_CALL_REMOTELY_ABORTED); + trace_rxrpc_rx_conn_abort(conn, skb); + rxrpc_set_conn_aborted(conn, skb, skb->priority, -ECONNABORTED, + RXRPC_CALL_REMOTELY_ABORTED); } /* @@ -202,11 +203,14 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn) for (i = 0; i < RXRPC_MAXCALLS; i++) { call = conn->channels[i].call; - if (call) + if (call) { + rxrpc_see_call(call, rxrpc_call_see_conn_abort); rxrpc_set_call_completion(call, conn->completion, conn->abort_code, conn->error); + rxrpc_poke_call(call, rxrpc_call_poke_conn_abort); + } } _leave(""); From 29e03ec757292e55fa0f7efa051c84ddc4f3e668 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:31 +0000 Subject: [PATCH 0280/1450] rxrpc: Use umin() and umax() rather than min_t()/max_t() where possible Use umin() and umax() rather than min_t()/max_t() where the type specified is an unsigned type. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-4-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/call_event.c | 5 ++--- net/rxrpc/call_object.c | 4 ++-- net/rxrpc/conn_client.c | 2 +- net/rxrpc/input.c | 13 +++++-------- net/rxrpc/insecure.c | 2 +- net/rxrpc/io_thread.c | 2 +- net/rxrpc/output.c | 2 +- net/rxrpc/rtt.c | 6 +++--- net/rxrpc/rxkad.c | 6 +++--- net/rxrpc/rxperf.c | 2 +- net/rxrpc/sendmsg.c | 2 +- 11 files changed, 21 insertions(+), 25 deletions(-) diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 7bbb685047667..c4754cc9b8d47 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -233,8 +233,7 @@ static void rxrpc_close_tx_phase(struct rxrpc_call *call) static bool rxrpc_tx_window_has_space(struct rxrpc_call *call) { - unsigned int winsize = min_t(unsigned int, call->tx_winsize, - call->cong_cwnd + call->cong_extra); + unsigned int winsize = umin(call->tx_winsize, call->cong_cwnd + call->cong_extra); rxrpc_seq_t window = call->acks_hard_ack, wtop = window + winsize; rxrpc_seq_t tx_top = call->tx_top; int space; @@ -467,7 +466,7 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) } else { unsigned long nowj = jiffies, delayj, nextj; - delayj = max(nsecs_to_jiffies(delay), 1); + delayj = umax(nsecs_to_jiffies(delay), 1); nextj = nowj + delayj; if (time_before(nextj, call->timer.expires) || !timer_pending(&call->timer)) { diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index f9e983a12c149..0df647d1d3a25 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -220,9 +220,9 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, __set_bit(RXRPC_CALL_EXCLUSIVE, &call->flags); if (p->timeouts.normal) - call->next_rx_timo = min(p->timeouts.normal, 1); + call->next_rx_timo = umin(p->timeouts.normal, 1); if (p->timeouts.idle) - call->next_req_timo = min(p->timeouts.idle, 1); + call->next_req_timo = umin(p->timeouts.idle, 1); if (p->timeouts.hard) call->hard_timo = p->timeouts.hard; diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index bb11e8289d6dc..86fb18bcd1885 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -231,7 +231,7 @@ static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn) distance = id - id_cursor; if (distance < 0) distance = -distance; - limit = max_t(unsigned long, atomic_read(&rxnet->nr_conns) * 4, 1024); + limit = umax(atomic_read(&rxnet->nr_conns) * 4, 1024); if (distance > limit) goto mark_dont_reuse; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 16d49a861dbb5..49e35be7dc135 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -44,8 +44,7 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, if (test_and_clear_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags)) { summary->retrans_timeo = true; - call->cong_ssthresh = max_t(unsigned int, - summary->flight_size / 2, 2); + call->cong_ssthresh = umax(summary->flight_size / 2, 2); cwnd = 1; if (cwnd >= call->cong_ssthresh && call->cong_mode == RXRPC_CALL_SLOW_START) { @@ -113,8 +112,7 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, change = rxrpc_cong_begin_retransmission; call->cong_mode = RXRPC_CALL_FAST_RETRANSMIT; - call->cong_ssthresh = max_t(unsigned int, - summary->flight_size / 2, 2); + call->cong_ssthresh = umax(summary->flight_size / 2, 2); cwnd = call->cong_ssthresh + 3; call->cong_extra = 0; call->cong_dup_acks = 0; @@ -206,9 +204,8 @@ void rxrpc_congestion_degrade(struct rxrpc_call *call) rxrpc_inc_stat(call->rxnet, stat_tx_data_cwnd_reset); call->tx_last_sent = now; call->cong_mode = RXRPC_CALL_SLOW_START; - call->cong_ssthresh = max_t(unsigned int, call->cong_ssthresh, - call->cong_cwnd * 3 / 4); - call->cong_cwnd = max_t(unsigned int, call->cong_cwnd / 2, RXRPC_MIN_CWND); + call->cong_ssthresh = umax(call->cong_ssthresh, call->cong_cwnd * 3 / 4); + call->cong_cwnd = umax(call->cong_cwnd / 2, RXRPC_MIN_CWND); } /* @@ -709,7 +706,7 @@ static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb call->tx_winsize = rwind; } - mtu = min(ntohl(trailer->maxMTU), ntohl(trailer->ifMTU)); + mtu = umin(ntohl(trailer->maxMTU), ntohl(trailer->ifMTU)); peer = call->peer; if (mtu < peer->maxdata) { diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index 6716c021a532e..751eb621021de 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -19,7 +19,7 @@ static int none_init_connection_security(struct rxrpc_connection *conn, */ static struct rxrpc_txbuf *none_alloc_txbuf(struct rxrpc_call *call, size_t remain, gfp_t gfp) { - return rxrpc_alloc_data_txbuf(call, min_t(size_t, remain, RXRPC_JUMBO_DATALEN), 1, gfp); + return rxrpc_alloc_data_txbuf(call, umin(remain, RXRPC_JUMBO_DATALEN), 1, gfp); } static int none_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index 07c74c77d8021..7af5adf53b25c 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -558,7 +558,7 @@ int rxrpc_io_thread(void *data) } timeout = nsecs_to_jiffies(delay_ns); - timeout = max(timeout, 1UL); + timeout = umax(timeout, 1); schedule_timeout(timeout); __set_current_state(TASK_RUNNING); continue; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 5ea9601efd05a..85112ea31a39c 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -118,7 +118,7 @@ static void rxrpc_fill_out_ack(struct rxrpc_call *call, txb->kvec[1].iov_len = ack->nAcks; wrap = RXRPC_SACK_SIZE - sack; - to = min_t(unsigned int, ack->nAcks, RXRPC_SACK_SIZE); + to = umin(ack->nAcks, RXRPC_SACK_SIZE); if (sack + ack->nAcks <= RXRPC_SACK_SIZE) { memcpy(sackp, call->ackr_sack_table + sack, ack->nAcks); diff --git a/net/rxrpc/rtt.c b/net/rxrpc/rtt.c index cdab7b7d08a00..6dc51486b5a60 100644 --- a/net/rxrpc/rtt.c +++ b/net/rxrpc/rtt.c @@ -27,7 +27,7 @@ static u32 __rxrpc_set_rto(const struct rxrpc_peer *peer) static u32 rxrpc_bound_rto(u32 rto) { - return min(rto, RXRPC_RTO_MAX); + return umin(rto, RXRPC_RTO_MAX); } /* @@ -91,11 +91,11 @@ static void rxrpc_rtt_estimator(struct rxrpc_peer *peer, long sample_rtt_us) /* no previous measure. */ srtt = m << 3; /* take the measured time to be rtt */ peer->mdev_us = m << 1; /* make sure rto = 3*rtt */ - peer->rttvar_us = max(peer->mdev_us, rxrpc_rto_min_us(peer)); + peer->rttvar_us = umax(peer->mdev_us, rxrpc_rto_min_us(peer)); peer->mdev_max_us = peer->rttvar_us; } - peer->srtt_us = max(1U, srtt); + peer->srtt_us = umax(srtt, 1); } /* diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 48a1475e6b063..e3194d73dd846 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -150,11 +150,11 @@ static struct rxrpc_txbuf *rxkad_alloc_txbuf(struct rxrpc_call *call, size_t rem struct rxrpc_txbuf *txb; size_t shdr, space; - remain = min(remain, 65535 - sizeof(struct rxrpc_wire_header)); + remain = umin(remain, 65535 - sizeof(struct rxrpc_wire_header)); switch (call->conn->security_level) { default: - space = min_t(size_t, remain, RXRPC_JUMBO_DATALEN); + space = umin(remain, RXRPC_JUMBO_DATALEN); return rxrpc_alloc_data_txbuf(call, space, 1, gfp); case RXRPC_SECURITY_AUTH: shdr = sizeof(struct rxkad_level1_hdr); @@ -164,7 +164,7 @@ static struct rxrpc_txbuf *rxkad_alloc_txbuf(struct rxrpc_call *call, size_t rem break; } - space = min_t(size_t, round_down(RXRPC_JUMBO_DATALEN, RXKAD_ALIGN), remain + shdr); + space = umin(round_down(RXRPC_JUMBO_DATALEN, RXKAD_ALIGN), remain + shdr); space = round_up(space, RXKAD_ALIGN); txb = rxrpc_alloc_data_txbuf(call, space, RXKAD_ALIGN, gfp); diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c index 085e7892d3104..7ef93407be830 100644 --- a/net/rxrpc/rxperf.c +++ b/net/rxrpc/rxperf.c @@ -503,7 +503,7 @@ static int rxperf_process_call(struct rxperf_call *call) reply_len + sizeof(rxperf_magic_cookie)); while (reply_len > 0) { - len = min_t(size_t, reply_len, PAGE_SIZE); + len = umin(reply_len, PAGE_SIZE); bvec_set_page(&bv, ZERO_PAGE(0), len, 0); iov_iter_bvec(&msg.msg_iter, WRITE, &bv, 1, len); msg.msg_flags = MSG_MORE; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 6abb8eec1b2b1..b04afb5df2419 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -360,7 +360,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, /* append next segment of data to the current buffer */ if (msg_data_left(msg) > 0) { - size_t copy = min_t(size_t, txb->space, msg_data_left(msg)); + size_t copy = umin(txb->space, msg_data_left(msg)); _debug("add %zu", copy); if (!copy_from_iter_full(txb->kvec[0].iov_base + txb->offset, From efa95c32352b2ac7ff09d680144e22c0f25244cb Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:32 +0000 Subject: [PATCH 0281/1450] rxrpc: Clean up Tx header flags generation handling Clean up the generation of the header flags when building packet headers for transmission: (1) Assemble the flags in a local variable rather than in the txb->flags. (2) Do the flags masking and JUMBO-PACKET setting in one bit of code for both the main header and the jumbo headers. (3) Generate the REQUEST-ACK flag afresh each time. There's a possibility we might want to do jumbo retransmission packets in future. (4) Pass the local flags variable to the rxrpc_tx_data tracepoint rather than the combination of the txb flags and the wire header flags (the latter belong only to the first subpacket). Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-5-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 1 - net/rxrpc/ar-internal.h | 2 +- net/rxrpc/output.c | 18 ++++++++++++------ net/rxrpc/proc.c | 3 +-- 4 files changed, 14 insertions(+), 10 deletions(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 27c23873c8811..62064f63d6eb8 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -452,7 +452,6 @@ #define rxrpc_req_ack_traces \ EM(rxrpc_reqack_ack_lost, "ACK-LOST ") \ - EM(rxrpc_reqack_already_on, "ALREADY-ON") \ EM(rxrpc_reqack_more_rtt, "MORE-RTT ") \ EM(rxrpc_reqack_no_srv_last, "NO-SRVLAST") \ EM(rxrpc_reqack_old_rtt, "OLD-RTT ") \ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index d0fd37bdcfe9c..fcdfbc1d5aafa 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -110,7 +110,7 @@ struct rxrpc_net { atomic_t stat_tx_acks[256]; atomic_t stat_rx_acks[256]; - atomic_t stat_why_req_ack[8]; + atomic_t stat_why_req_ack[7]; atomic_t stat_io_loop; }; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 85112ea31a39c..50d5f2a02458a 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -330,6 +330,8 @@ static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_t struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; enum rxrpc_req_ack_trace why; struct rxrpc_connection *conn = call->conn; + bool last; + u8 flags; _enter("%x,{%d}", txb->seq, txb->len); @@ -339,6 +341,10 @@ static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_t txb->seq == 1) whdr->userStatus = RXRPC_USERSTATUS_SERVICE_UPGRADE; + txb->flags &= ~RXRPC_REQUEST_ACK; + flags = txb->flags & RXRPC_TXBUF_WIRE_FLAGS; + last = txb->flags & RXRPC_LAST_PACKET; + /* If our RTT cache needs working on, request an ACK. Also request * ACKs if a DATA packet appears to have been lost. * @@ -346,9 +352,7 @@ static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_t * service call, lest OpenAFS incorrectly send us an ACK with some * soft-ACKs in it and then never follow up with a proper hard ACK. */ - if (txb->flags & RXRPC_REQUEST_ACK) - why = rxrpc_reqack_already_on; - else if ((txb->flags & RXRPC_LAST_PACKET) && rxrpc_sending_to_client(txb)) + if (last && rxrpc_sending_to_client(txb)) why = rxrpc_reqack_no_srv_last; else if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events)) why = rxrpc_reqack_ack_lost; @@ -367,15 +371,17 @@ static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_t rxrpc_inc_stat(call->rxnet, stat_why_req_ack[why]); trace_rxrpc_req_ack(call->debug_id, txb->seq, why); - if (why != rxrpc_reqack_no_srv_last) + if (why != rxrpc_reqack_no_srv_last) { txb->flags |= RXRPC_REQUEST_ACK; + flags |= RXRPC_REQUEST_ACK; + } dont_set_request_ack: - whdr->flags = txb->flags & RXRPC_TXBUF_WIRE_FLAGS; + whdr->flags = flags; whdr->serial = htonl(txb->serial); whdr->cksum = txb->cksum; - trace_rxrpc_tx_data(call, txb->seq, txb->serial, txb->flags, false); + trace_rxrpc_tx_data(call, txb->seq, txb->serial, flags, false); } /* diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 263a2251e3d24..3b7e34dd4385e 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -519,9 +519,8 @@ int rxrpc_stats_show(struct seq_file *seq, void *v) atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_DELAY]), atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_IDLE])); seq_printf(seq, - "Why-Req-A: acklost=%u already=%u mrtt=%u ortt=%u\n", + "Why-Req-A: acklost=%u mrtt=%u ortt=%u\n", atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_ack_lost]), - atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_already_on]), atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_more_rtt]), atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_old_rtt])); seq_printf(seq, From cbe0d89095c31afcede96e4ce9cd58c4bed62d63 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:33 +0000 Subject: [PATCH 0282/1450] rxrpc: Don't set the MORE-PACKETS rxrpc wire header flag The MORE-PACKETS rxrpc header flag hasn't actually been looked at by anything since 1988 and not all implementations generate it. Change rxrpc so that it doesn't set MORE-PACKETS at all rather than setting it inconsistently. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-6-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/sendmsg.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index b04afb5df2419..546abb463c3f9 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -385,9 +385,6 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, (msg_data_left(msg) == 0 && !more)) { if (msg_data_left(msg) == 0 && !more) txb->flags |= RXRPC_LAST_PACKET; - else if (call->tx_top - call->acks_hard_ack < - call->tx_winsize) - txb->flags |= RXRPC_MORE_PACKETS; ret = call->security->secure_packet(call, txb); if (ret < 0) From ff992adbc470c86d2dcb66f5ed837fbb3c1a561e Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:34 +0000 Subject: [PATCH 0283/1450] rxrpc: Show stats counter for received reason-0 ACKs In /proc/net/rxrpc/stats, show the stats counter for received ACKs that have the reason code set to 0 as some implementations do this. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-7-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/proc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 3b7e34dd4385e..cdf32f0d8e0ea 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -508,7 +508,7 @@ int rxrpc_stats_show(struct seq_file *seq, void *v) atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_DELAY]), atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_IDLE])); seq_printf(seq, - "Ack-Rx : req=%u dup=%u oos=%u exw=%u nos=%u png=%u prs=%u dly=%u idl=%u\n", + "Ack-Rx : req=%u dup=%u oos=%u exw=%u nos=%u png=%u prs=%u dly=%u idl=%u z=%u\n", atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_REQUESTED]), atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_DUPLICATE]), atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_OUT_OF_SEQUENCE]), @@ -517,7 +517,8 @@ int rxrpc_stats_show(struct seq_file *seq, void *v) atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_PING]), atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_PING_RESPONSE]), atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_DELAY]), - atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_IDLE])); + atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_IDLE]), + atomic_read(&rxnet->stat_rx_acks[0])); seq_printf(seq, "Why-Req-A: acklost=%u mrtt=%u ortt=%u\n", atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_ack_lost]), From 8b5823ea437624b53ecf084b6dd582760f110394 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:35 +0000 Subject: [PATCH 0284/1450] rxrpc: Request an ACK on impending Tx stall Set the REQUEST-ACK flag on the DATA packet we're about to send if we're about to stall transmission because the app layer isn't keeping up supplying us with data to transmit. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-8-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 1 + net/rxrpc/ar-internal.h | 2 +- net/rxrpc/output.c | 7 ++++++- net/rxrpc/proc.c | 5 +++-- 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 62064f63d6eb8..d86b5f07d2926 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -452,6 +452,7 @@ #define rxrpc_req_ack_traces \ EM(rxrpc_reqack_ack_lost, "ACK-LOST ") \ + EM(rxrpc_reqack_app_stall, "APP-STALL ") \ EM(rxrpc_reqack_more_rtt, "MORE-RTT ") \ EM(rxrpc_reqack_no_srv_last, "NO-SRVLAST") \ EM(rxrpc_reqack_old_rtt, "OLD-RTT ") \ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index fcdfbc1d5aafa..d0fd37bdcfe9c 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -110,7 +110,7 @@ struct rxrpc_net { atomic_t stat_tx_acks[256]; atomic_t stat_rx_acks[256]; - atomic_t stat_why_req_ack[7]; + atomic_t stat_why_req_ack[8]; atomic_t stat_io_loop; }; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 50d5f2a02458a..b93a5d50be3e4 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -330,7 +330,7 @@ static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_t struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; enum rxrpc_req_ack_trace why; struct rxrpc_connection *conn = call->conn; - bool last; + bool last, more; u8 flags; _enter("%x,{%d}", txb->seq, txb->len); @@ -345,6 +345,9 @@ static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_t flags = txb->flags & RXRPC_TXBUF_WIRE_FLAGS; last = txb->flags & RXRPC_LAST_PACKET; + more = (!list_is_last(&txb->call_link, &call->tx_buffer) || + !list_empty(&call->tx_sendmsg)); + /* If our RTT cache needs working on, request an ACK. Also request * ACKs if a DATA packet appears to have been lost. * @@ -366,6 +369,8 @@ static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_t why = rxrpc_reqack_more_rtt; else if (ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), ktime_get_real())) why = rxrpc_reqack_old_rtt; + else if (!last && !more) + why = rxrpc_reqack_app_stall; else goto dont_set_request_ack; diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index cdf32f0d8e0ea..ce4d48bdfbe9e 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -520,10 +520,11 @@ int rxrpc_stats_show(struct seq_file *seq, void *v) atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_IDLE]), atomic_read(&rxnet->stat_rx_acks[0])); seq_printf(seq, - "Why-Req-A: acklost=%u mrtt=%u ortt=%u\n", + "Why-Req-A: acklost=%u mrtt=%u ortt=%u stall=%u\n", atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_ack_lost]), atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_more_rtt]), - atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_old_rtt])); + atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_old_rtt]), + atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_app_stall])); seq_printf(seq, "Why-Req-A: nolast=%u retx=%u slows=%u smtxw=%u\n", atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_no_srv_last]), From 420f8af502877a34dd371a7c8b6b943594487ebb Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:36 +0000 Subject: [PATCH 0285/1450] rxrpc: Use a large kvec[] in rxrpc_local rather than every rxrpc_txbuf Use a single large kvec[] in the rxrpc_local struct rather than one in every rxrpc_txbuf struct to build large packets to save on memory. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-9-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/ar-internal.h | 6 ++++++ net/rxrpc/output.c | 45 ++++++++++++++++++++++++++++++----------- 2 files changed, 39 insertions(+), 12 deletions(-) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index d0fd37bdcfe9c..ab8e565cb20bc 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -320,6 +320,12 @@ struct rxrpc_local { struct list_head new_client_calls; /* Newly created client calls need connection */ spinlock_t client_call_lock; /* Lock for ->new_client_calls */ struct sockaddr_rxrpc srx; /* local address */ + /* Provide a kvec table sufficiently large to manage either a DATA + * packet with a maximum set of jumbo subpackets or a PING ACK padded + * out to 64K with zeropages for PMTUD. + */ + struct kvec kvec[RXRPC_MAX_NR_JUMBO > 3 + 16 ? + RXRPC_MAX_NR_JUMBO : 3 + 16]; }; /* diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index b93a5d50be3e4..f8bb5250e8495 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -175,9 +175,11 @@ static void rxrpc_begin_rtt_probe(struct rxrpc_call *call, rxrpc_serial_t serial /* * Transmit an ACK packet. */ -static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) +static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb, + int nr_kv) { - struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; + struct kvec *kv = call->local->kvec; + struct rxrpc_wire_header *whdr = kv[0].iov_base; struct rxrpc_connection *conn; struct rxrpc_ackpacket *ack = (struct rxrpc_ackpacket *)(whdr + 1); struct msghdr msg; @@ -206,8 +208,9 @@ static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t rxrpc_inc_stat(call->rxnet, stat_tx_ack_send); - iov_iter_kvec(&msg.msg_iter, WRITE, txb->kvec, txb->nr_kvec, txb->len); + iov_iter_kvec(&msg.msg_iter, WRITE, kv, nr_kv, txb->len); rxrpc_local_dont_fragment(conn->local, false); + ret = do_udp_sendmsg(conn->local->socket, &msg, txb->len); call->peer->last_tx_at = ktime_get_seconds(); if (ret < 0) { @@ -233,6 +236,8 @@ void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why) { struct rxrpc_txbuf *txb; + struct kvec *kv = call->local->kvec; + int nr_kv; if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) return; @@ -248,12 +253,19 @@ void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, txb->ack_why = why; rxrpc_fill_out_ack(call, txb, ack_reason, serial); + + nr_kv = txb->nr_kvec; + kv[0] = txb->kvec[0]; + kv[1] = txb->kvec[1]; + kv[2] = txb->kvec[2]; + // TODO: Extend a path MTU probe ACK + call->ackr_nr_unacked = 0; atomic_set(&call->ackr_nr_consumed, 0); clear_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags); trace_rxrpc_send_ack(call, why, ack_reason, serial); - rxrpc_send_ack_packet(call, txb); + rxrpc_send_ack_packet(call, txb, nr_kv); rxrpc_put_txbuf(txb, rxrpc_txbuf_put_ack_tx); } @@ -324,12 +336,15 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call) /* * Prepare a (sub)packet for transmission. */ -static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_txbuf *txb, - rxrpc_serial_t serial) +static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_txbuf *txb, + rxrpc_serial_t serial, + int subpkt) { struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; enum rxrpc_req_ack_trace why; struct rxrpc_connection *conn = call->conn; + struct kvec *kv = &call->local->kvec[subpkt]; + size_t len = txb->len; bool last, more; u8 flags; @@ -385,8 +400,13 @@ static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_t whdr->flags = flags; whdr->serial = htonl(txb->serial); whdr->cksum = txb->cksum; + whdr->serviceId = htons(conn->service_id); + kv->iov_base = whdr; + // TODO: Convert into a jumbo header for tail subpackets trace_rxrpc_tx_data(call, txb->seq, txb->serial, flags, false); + kv->iov_len = len; + return len; } /* @@ -395,13 +415,15 @@ static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_t static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) { rxrpc_serial_t serial; + size_t len = 0; /* Each transmission of a Tx packet needs a new serial number */ serial = rxrpc_get_next_serial(call->conn); - rxrpc_prepare_data_subpacket(call, txb, serial); + len += rxrpc_prepare_data_subpacket(call, txb, serial, 0); + // TODO: Loop around adding tail subpackets - return txb->len; + return len; } /* @@ -442,7 +464,6 @@ static void rxrpc_tstamp_data_packets(struct rxrpc_call *call, struct rxrpc_txbu */ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) { - struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; struct rxrpc_connection *conn = call->conn; enum rxrpc_tx_point frag; struct msghdr msg; @@ -463,7 +484,7 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t } } - iov_iter_kvec(&msg.msg_iter, WRITE, txb->kvec, txb->nr_kvec, len); + iov_iter_kvec(&msg.msg_iter, WRITE, call->local->kvec, 1, len); msg.msg_name = &call->peer->srx.transport; msg.msg_namelen = call->peer->srx.transport_len; @@ -480,7 +501,7 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t /* send the packet with the don't fragment bit set if we currently * think it's small enough */ - if (txb->len >= call->peer->maxdata) { + if (len >= sizeof(struct rxrpc_wire_header) + call->peer->maxdata) { rxrpc_local_dont_fragment(conn->local, false); frag = rxrpc_tx_point_call_data_frag; } else { @@ -503,7 +524,7 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t rxrpc_inc_stat(call->rxnet, stat_tx_data_send_fail); trace_rxrpc_tx_fail(call->debug_id, txb->serial, ret, frag); } else { - trace_rxrpc_tx_packet(call->debug_id, whdr, frag); + trace_rxrpc_tx_packet(call->debug_id, call->local->kvec[0].iov_base, frag); } rxrpc_tx_backoff(call, ret); From eeaedc5449d9fccf2b56e844a018df9d3720d59e Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:37 +0000 Subject: [PATCH 0286/1450] rxrpc: Implement path-MTU probing using padded PING ACKs (RFC8899) Implement path-MTU probing (along the lines of RFC8899) by padding some of the PING ACKs we send. PING ACKs get their own individual responses quite apart from the acking of data (though, as ACKs, they fulfil that role also). The probing concentrates on packet sizes that correspond how many subpackets can be stuffed inside a jumbo packet as jumbo DATA packets are just aggregations of individual DATA packets and can be split easily for retransmission purposes. If we want to perform probing, we advertise this by setting the maximum number of jumbo subpackets to 0 in the ack trailer when we send an ACK and see if the peer is also advertising the service. This is interpreted by non-supporting Rx stacks as an indication that jumbo packets aren't supported. The MTU sizes advertised in the ACK trailer AF_RXRPC transmits are pegged at a maximum of 1444 unless pmtud is supported by both sides. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-10-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 124 +++++++++++++++++++++++++++++++++++ net/rxrpc/ar-internal.h | 25 +++++-- net/rxrpc/call_event.c | 5 ++ net/rxrpc/conn_event.c | 17 +++-- net/rxrpc/conn_object.c | 6 ++ net/rxrpc/input.c | 26 +++++--- net/rxrpc/io_thread.c | 6 ++ net/rxrpc/misc.c | 4 +- net/rxrpc/output.c | 67 +++++++++++++++---- net/rxrpc/peer_event.c | 104 +++++++++++++++++++++++++++-- net/rxrpc/peer_object.c | 24 +++++-- net/rxrpc/proc.c | 9 +-- net/rxrpc/protocol.h | 13 ++-- net/rxrpc/sysctl.c | 6 +- net/rxrpc/txbuf.c | 3 +- 15 files changed, 382 insertions(+), 57 deletions(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index d86b5f07d2926..9dcadad88e766 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -364,6 +364,7 @@ EM(rxrpc_propose_ack_ping_for_lost_ack, "LostAck") \ EM(rxrpc_propose_ack_ping_for_lost_reply, "LostRpl") \ EM(rxrpc_propose_ack_ping_for_0_retrans, "0-Retrn") \ + EM(rxrpc_propose_ack_ping_for_mtu_probe, "MTUProb") \ EM(rxrpc_propose_ack_ping_for_old_rtt, "OldRtt ") \ EM(rxrpc_propose_ack_ping_for_params, "Params ") \ EM(rxrpc_propose_ack_ping_for_rtt, "Rtt ") \ @@ -478,6 +479,11 @@ EM(rxrpc_txbuf_see_send_more, "SEE SEND+ ") \ E_(rxrpc_txbuf_see_unacked, "SEE UNACKED") +#define rxrpc_pmtud_reduce_traces \ + EM(rxrpc_pmtud_reduce_ack, "Ack ") \ + EM(rxrpc_pmtud_reduce_icmp, "Icmp ") \ + E_(rxrpc_pmtud_reduce_route, "Route") + /* * Generate enums for tracing information. */ @@ -498,6 +504,7 @@ enum rxrpc_congest_change { rxrpc_congest_changes } __mode(byte); enum rxrpc_conn_trace { rxrpc_conn_traces } __mode(byte); enum rxrpc_local_trace { rxrpc_local_traces } __mode(byte); enum rxrpc_peer_trace { rxrpc_peer_traces } __mode(byte); +enum rxrpc_pmtud_reduce_trace { rxrpc_pmtud_reduce_traces } __mode(byte); enum rxrpc_propose_ack_outcome { rxrpc_propose_ack_outcomes } __mode(byte); enum rxrpc_propose_ack_trace { rxrpc_propose_ack_traces } __mode(byte); enum rxrpc_receive_trace { rxrpc_receive_traces } __mode(byte); @@ -534,6 +541,7 @@ rxrpc_congest_changes; rxrpc_congest_modes; rxrpc_conn_traces; rxrpc_local_traces; +rxrpc_pmtud_reduce_traces; rxrpc_propose_ack_traces; rxrpc_receive_traces; rxrpc_recvmsg_traces; @@ -2040,6 +2048,122 @@ TRACE_EVENT(rxrpc_sack, __entry->sack) ); +TRACE_EVENT(rxrpc_pmtud_tx, + TP_PROTO(struct rxrpc_call *call), + + TP_ARGS(call), + + TP_STRUCT__entry( + __field(unsigned int, peer_debug_id) + __field(unsigned int, call_debug_id) + __field(rxrpc_serial_t, ping_serial) + __field(unsigned short, pmtud_trial) + __field(unsigned short, pmtud_good) + __field(unsigned short, pmtud_bad) + ), + + TP_fast_assign( + __entry->peer_debug_id = call->peer->debug_id; + __entry->call_debug_id = call->debug_id; + __entry->ping_serial = call->conn->pmtud_probe; + __entry->pmtud_trial = call->peer->pmtud_trial; + __entry->pmtud_good = call->peer->pmtud_good; + __entry->pmtud_bad = call->peer->pmtud_bad; + ), + + TP_printk("P=%08x c=%08x pr=%08x %u-%u-%u", + __entry->peer_debug_id, + __entry->call_debug_id, + __entry->ping_serial, + __entry->pmtud_good, + __entry->pmtud_trial, + __entry->pmtud_bad) + ); + +TRACE_EVENT(rxrpc_pmtud_rx, + TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t resp_serial), + + TP_ARGS(conn, resp_serial), + + TP_STRUCT__entry( + __field(unsigned int, peer_debug_id) + __field(unsigned int, call_debug_id) + __field(rxrpc_serial_t, ping_serial) + __field(rxrpc_serial_t, resp_serial) + __field(unsigned short, max_data) + __field(u8, jumbo_max) + ), + + TP_fast_assign( + __entry->peer_debug_id = conn->peer->debug_id; + __entry->call_debug_id = conn->pmtud_call; + __entry->ping_serial = conn->pmtud_probe; + __entry->resp_serial = resp_serial; + __entry->max_data = conn->peer->max_data; + __entry->jumbo_max = conn->peer->pmtud_jumbo; + ), + + TP_printk("P=%08x c=%08x pr=%08x rr=%08x max=%u jm=%u", + __entry->peer_debug_id, + __entry->call_debug_id, + __entry->ping_serial, + __entry->resp_serial, + __entry->max_data, + __entry->jumbo_max) + ); + +TRACE_EVENT(rxrpc_pmtud_lost, + TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t resp_serial), + + TP_ARGS(conn, resp_serial), + + TP_STRUCT__entry( + __field(unsigned int, peer_debug_id) + __field(unsigned int, call_debug_id) + __field(rxrpc_serial_t, ping_serial) + __field(rxrpc_serial_t, resp_serial) + ), + + TP_fast_assign( + __entry->peer_debug_id = conn->peer->debug_id; + __entry->call_debug_id = conn->pmtud_call; + __entry->ping_serial = conn->pmtud_probe; + __entry->resp_serial = resp_serial; + ), + + TP_printk("P=%08x c=%08x pr=%08x rr=%08x", + __entry->peer_debug_id, + __entry->call_debug_id, + __entry->ping_serial, + __entry->resp_serial) + ); + +TRACE_EVENT(rxrpc_pmtud_reduce, + TP_PROTO(struct rxrpc_peer *peer, rxrpc_serial_t serial, + unsigned int max_data, enum rxrpc_pmtud_reduce_trace reason), + + TP_ARGS(peer, serial, max_data, reason), + + TP_STRUCT__entry( + __field(unsigned int, peer_debug_id) + __field(rxrpc_serial_t, serial) + __field(unsigned int, max_data) + __field(enum rxrpc_pmtud_reduce_trace, reason) + ), + + TP_fast_assign( + __entry->peer_debug_id = peer->debug_id; + __entry->serial = serial; + __entry->max_data = max_data; + __entry->reason = reason; + ), + + TP_printk("P=%08x %s r=%08x m=%u", + __entry->peer_debug_id, + __print_symbolic(__entry->reason, rxrpc_pmtud_reduce_traces), + __entry->serial, __entry->max_data) + ); + #undef EM #undef E_ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index ab8e565cb20bc..69e6f4b20bad3 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -344,13 +344,25 @@ struct rxrpc_peer { time64_t last_tx_at; /* Last time packet sent here */ seqlock_t service_conn_lock; spinlock_t lock; /* access lock */ - unsigned int if_mtu; /* interface MTU for this peer */ - unsigned int mtu; /* network MTU for this peer */ - unsigned int maxdata; /* data size (MTU - hdrsize) */ - unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */ int debug_id; /* debug ID for printks */ struct sockaddr_rxrpc srx; /* remote address */ + /* Path MTU discovery [RFC8899] */ + unsigned int pmtud_trial; /* Current MTU probe size */ + unsigned int pmtud_good; /* Largest working MTU probe we've tried */ + unsigned int pmtud_bad; /* Smallest non-working MTU probe we've tried */ + bool pmtud_lost; /* T if MTU probe was lost */ + bool pmtud_probing; /* T if we have an active probe outstanding */ + bool pmtud_pending; /* T if a call to this peer should send a probe */ + u8 pmtud_jumbo; /* Max jumbo packets for the MTU */ + bool ackr_adv_pmtud; /* T if the peer advertises path-MTU */ + unsigned int ackr_max_data; /* Maximum data advertised by peer */ + seqcount_t mtu_lock; /* Lockless MTU access management */ + unsigned int if_mtu; /* Local interface MTU (- hdrsize) for this peer */ + unsigned int max_data; /* Maximum packet data capacity for this peer */ + unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */ + unsigned short tx_seg_max; /* Maximum number of transmissable segments */ + /* calculated RTT cache */ #define RXRPC_RTT_CACHE_SIZE 32 spinlock_t rtt_input_lock; /* RTT lock for input routine */ @@ -531,6 +543,8 @@ struct rxrpc_connection { int debug_id; /* debug ID for printks */ rxrpc_serial_t tx_serial; /* Outgoing packet serial number counter */ unsigned int hi_serial; /* highest serial number received */ + rxrpc_serial_t pmtud_probe; /* Serial of MTU probe (or 0) */ + unsigned int pmtud_call; /* ID of call used for probe */ u32 service_id; /* Service ID, possibly upgraded */ u32 security_level; /* Security level selected */ u8 security_ix; /* security type */ @@ -1155,6 +1169,7 @@ static inline struct rxrpc_net *rxrpc_net(struct net *net) */ void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why); +void rxrpc_send_probe_for_pmtud(struct rxrpc_call *call); int rxrpc_send_abort_packet(struct rxrpc_call *); void rxrpc_send_conn_abort(struct rxrpc_connection *conn); void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb); @@ -1166,6 +1181,8 @@ void rxrpc_transmit_one(struct rxrpc_call *call, struct rxrpc_txbuf *txb); */ void rxrpc_input_error(struct rxrpc_local *, struct sk_buff *); void rxrpc_peer_keepalive_worker(struct work_struct *); +void rxrpc_input_probe_for_pmtud(struct rxrpc_connection *conn, rxrpc_serial_t acked_serial, + bool sendmsg_fail); /* * peer_object.c diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index c4754cc9b8d47..1d889b6f03661 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -483,6 +483,11 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_disconnect_call(call); if (call->security) call->security->free_call_crypto(call); + } else { + if (skb && + call->peer->ackr_adv_pmtud && + call->peer->pmtud_pending) + rxrpc_send_probe_for_pmtud(call); } if (call->acks_hard_ack != call->tx_bottom) rxrpc_shrink_call_tx_buffer(call); diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 2a1396cd892f3..f6c02cc44d98e 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -92,7 +92,7 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, struct rxrpc_acktrailer trailer; size_t len; int ret, ioc; - u32 serial, mtu, call_id, padding; + u32 serial, max_mtu, if_mtu, call_id, padding; _enter("%d", conn->debug_id); @@ -150,8 +150,13 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, break; case RXRPC_PACKET_TYPE_ACK: - mtu = conn->peer->if_mtu; - mtu -= conn->peer->hdrsize; + if_mtu = conn->peer->if_mtu - conn->peer->hdrsize; + if (conn->peer->ackr_adv_pmtud) { + max_mtu = umax(conn->peer->max_data, rxrpc_rx_mtu); + } else { + if_mtu = umin(1444, if_mtu); + max_mtu = if_mtu; + } pkt.ack.bufferSpace = 0; pkt.ack.maxSkew = htons(skb ? skb->priority : 0); pkt.ack.firstPacket = htonl(chan->last_seq + 1); @@ -159,10 +164,10 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, pkt.ack.serial = htonl(skb ? sp->hdr.serial : 0); pkt.ack.reason = skb ? RXRPC_ACK_DUPLICATE : RXRPC_ACK_IDLE; pkt.ack.nAcks = 0; - trailer.maxMTU = htonl(rxrpc_rx_mtu); - trailer.ifMTU = htonl(mtu); + trailer.maxMTU = htonl(max_mtu); + trailer.ifMTU = htonl(if_mtu); trailer.rwind = htonl(rxrpc_rx_window_size); - trailer.jumbo_max = htonl(rxrpc_rx_jumbo_max); + trailer.jumbo_max = 0; pkt.whdr.flags |= RXRPC_SLOW_START_OK; padding = 0; iov[0].iov_len += sizeof(pkt.ack); diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 694c4df7a1a31..b0627398311b4 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -321,6 +321,12 @@ static void rxrpc_clean_up_connection(struct work_struct *work) list_del_init(&conn->proc_link); write_unlock(&rxnet->conn_lock); + if (conn->pmtud_probe) { + trace_rxrpc_pmtud_lost(conn, 0); + conn->peer->pmtud_probing = false; + conn->peer->pmtud_pending = true; + } + rxrpc_purge_queue(&conn->rx_queue); rxrpc_kill_client_conn(conn); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 49e35be7dc135..fd08d813ef296 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -692,8 +692,8 @@ static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb struct rxrpc_acktrailer *trailer) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct rxrpc_peer *peer; - unsigned int mtu; + struct rxrpc_peer *peer = call->peer; + unsigned int max_data; bool wake = false; u32 rwind = ntohl(trailer->rwind); @@ -706,14 +706,22 @@ static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb call->tx_winsize = rwind; } - mtu = umin(ntohl(trailer->maxMTU), ntohl(trailer->ifMTU)); + if (trailer->jumbo_max == 0) { + /* The peer says it supports pmtu discovery */ + peer->ackr_adv_pmtud = true; + } else { + peer->ackr_adv_pmtud = false; + } + + max_data = ntohl(trailer->maxMTU); + peer->ackr_max_data = max_data; - peer = call->peer; - if (mtu < peer->maxdata) { - spin_lock(&peer->lock); - peer->maxdata = mtu; - peer->mtu = mtu + peer->hdrsize; - spin_unlock(&peer->lock); + if (max_data < peer->max_data) { + trace_rxrpc_pmtud_reduce(peer, sp->hdr.serial, max_data, + rxrpc_pmtud_reduce_ack); + write_seqcount_begin(&peer->mtu_lock); + peer->max_data = max_data; + write_seqcount_end(&peer->mtu_lock); } if (wake) diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index 7af5adf53b25c..bd6d4f5e97b4c 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -364,6 +364,12 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, if (sp->hdr.callNumber == 0) return rxrpc_input_conn_packet(conn, skb); + /* Deal with path MTU discovery probing. */ + if (sp->hdr.type == RXRPC_PACKET_TYPE_ACK && + conn->pmtud_probe && + after_eq(sp->ack.acked_serial, conn->pmtud_probe)) + rxrpc_input_probe_for_pmtud(conn, sp->ack.acked_serial, false); + /* Call-bound packets are routed by connection channel. */ channel = sp->hdr.cid & RXRPC_CHANNELMASK; chan = &conn->channels[channel]; diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 657cf35089a62..8fcc8139d7717 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -46,13 +46,13 @@ unsigned int rxrpc_rx_window_size = 255; * Maximum Rx MTU size. This indicates to the sender the size of jumbo packet * made by gluing normal packets together that we're willing to handle. */ -unsigned int rxrpc_rx_mtu = 5692; +unsigned int rxrpc_rx_mtu = RXRPC_JUMBO(46); /* * The maximum number of fragments in a received jumbo packet that we tell the * sender that we're willing to handle. */ -unsigned int rxrpc_rx_jumbo_max = 4; +unsigned int rxrpc_rx_jumbo_max = 46; #ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY /* diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index f8bb5250e8495..a91be871ad963 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -82,10 +82,9 @@ static void rxrpc_fill_out_ack(struct rxrpc_call *call, struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; struct rxrpc_acktrailer *trailer = txb->kvec[2].iov_base + 3; struct rxrpc_ackpacket *ack = (struct rxrpc_ackpacket *)(whdr + 1); - unsigned int qsize, sack, wrap, to; + unsigned int qsize, sack, wrap, to, max_mtu, if_mtu; rxrpc_seq_t window, wtop; int rsize; - u32 mtu, jmax; u8 *filler = txb->kvec[2].iov_base; u8 *sackp = txb->kvec[1].iov_base; @@ -132,16 +131,22 @@ static void rxrpc_fill_out_ack(struct rxrpc_call *call, ack->reason = RXRPC_ACK_IDLE; } - mtu = call->peer->if_mtu; - mtu -= call->peer->hdrsize; - jmax = rxrpc_rx_jumbo_max; qsize = (window - 1) - call->rx_consumed; rsize = max_t(int, call->rx_winsize - qsize, 0); txb->ack_rwind = rsize; - trailer->maxMTU = htonl(rxrpc_rx_mtu); - trailer->ifMTU = htonl(mtu); + + if_mtu = call->peer->if_mtu - call->peer->hdrsize; + if (call->peer->ackr_adv_pmtud) { + max_mtu = umax(call->peer->max_data, rxrpc_rx_mtu); + } else { + if_mtu = umin(if_mtu, 1444); + max_mtu = if_mtu; + } + + trailer->maxMTU = htonl(max_mtu); + trailer->ifMTU = htonl(if_mtu); trailer->rwind = htonl(rsize); - trailer->jumbo_max = htonl(jmax); + trailer->jumbo_max = 0; /* Advertise pmtu discovery */ } /* @@ -176,7 +181,7 @@ static void rxrpc_begin_rtt_probe(struct rxrpc_call *call, rxrpc_serial_t serial * Transmit an ACK packet. */ static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb, - int nr_kv) + int nr_kv, enum rxrpc_propose_ack_trace why) { struct kvec *kv = call->local->kvec; struct rxrpc_wire_header *whdr = kv[0].iov_base; @@ -209,13 +214,16 @@ static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t rxrpc_inc_stat(call->rxnet, stat_tx_ack_send); iov_iter_kvec(&msg.msg_iter, WRITE, kv, nr_kv, txb->len); - rxrpc_local_dont_fragment(conn->local, false); + rxrpc_local_dont_fragment(conn->local, why == rxrpc_propose_ack_ping_for_mtu_probe); ret = do_udp_sendmsg(conn->local->socket, &msg, txb->len); call->peer->last_tx_at = ktime_get_seconds(); if (ret < 0) { trace_rxrpc_tx_fail(call->debug_id, txb->serial, ret, rxrpc_tx_point_call_ack); + if (why == rxrpc_propose_ack_ping_for_mtu_probe && + ret == -EMSGSIZE) + rxrpc_input_probe_for_pmtud(conn, txb->serial, true); } else { trace_rxrpc_tx_packet(call->debug_id, whdr, rxrpc_tx_point_call_ack); @@ -225,6 +233,13 @@ static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t if (txb->flags & RXRPC_REQUEST_ACK) call->peer->rtt_last_req = now; rxrpc_set_keepalive(call, now); + if (why == rxrpc_propose_ack_ping_for_mtu_probe) { + call->peer->pmtud_pending = false; + call->peer->pmtud_probing = true; + call->conn->pmtud_probe = txb->serial; + call->conn->pmtud_call = call->debug_id; + trace_rxrpc_pmtud_tx(call); + } } rxrpc_tx_backoff(call, ret); } @@ -254,21 +269,45 @@ void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, rxrpc_fill_out_ack(call, txb, ack_reason, serial); + /* Extend a path MTU probe ACK. */ nr_kv = txb->nr_kvec; kv[0] = txb->kvec[0]; kv[1] = txb->kvec[1]; kv[2] = txb->kvec[2]; - // TODO: Extend a path MTU probe ACK + if (why == rxrpc_propose_ack_ping_for_mtu_probe) { + size_t probe_mtu = call->peer->pmtud_trial + sizeof(struct rxrpc_wire_header); + + if (txb->len > probe_mtu) + goto skip; + while (txb->len < probe_mtu) { + size_t part = umin(probe_mtu - txb->len, PAGE_SIZE); + + kv[nr_kv].iov_base = page_address(ZERO_PAGE(0)); + kv[nr_kv].iov_len = part; + txb->len += part; + nr_kv++; + } + } call->ackr_nr_unacked = 0; atomic_set(&call->ackr_nr_consumed, 0); clear_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags); trace_rxrpc_send_ack(call, why, ack_reason, serial); - rxrpc_send_ack_packet(call, txb, nr_kv); + rxrpc_send_ack_packet(call, txb, nr_kv, why); +skip: rxrpc_put_txbuf(txb, rxrpc_txbuf_put_ack_tx); } +/* + * Send an ACK probe for path MTU discovery. + */ +void rxrpc_send_probe_for_pmtud(struct rxrpc_call *call) +{ + rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, + rxrpc_propose_ack_ping_for_mtu_probe); +} + /* * Send an ABORT call packet. */ @@ -501,7 +540,7 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t /* send the packet with the don't fragment bit set if we currently * think it's small enough */ - if (len >= sizeof(struct rxrpc_wire_header) + call->peer->maxdata) { + if (len >= sizeof(struct rxrpc_wire_header) + call->peer->max_data) { rxrpc_local_dont_fragment(conn->local, false); frag = rxrpc_tx_point_call_data_frag; } else { @@ -548,7 +587,7 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t RX_USER_ABORT, ret); } - _leave(" = %d [%u]", ret, call->peer->maxdata); + _leave(" = %d [%u]", ret, call->peer->max_data); return ret; } diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 552ba84a255c4..8fc9464a960ca 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -102,6 +102,8 @@ static struct rxrpc_peer *rxrpc_lookup_peer_local_rcu(struct rxrpc_local *local, */ static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu) { + unsigned int max_data; + /* wind down the local interface MTU */ if (mtu > 0 && peer->if_mtu == 65535 && mtu < peer->if_mtu) peer->if_mtu = mtu; @@ -120,11 +122,17 @@ static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu) } } - if (mtu < peer->mtu) { - spin_lock(&peer->lock); - peer->mtu = mtu; - peer->maxdata = peer->mtu - peer->hdrsize; - spin_unlock(&peer->lock); + max_data = max_t(int, mtu - peer->hdrsize, 500); + if (max_data < peer->max_data) { + if (peer->pmtud_good > max_data) + peer->pmtud_good = max_data; + if (peer->pmtud_bad > max_data + 1) + peer->pmtud_bad = max_data + 1; + + trace_rxrpc_pmtud_reduce(peer, 0, max_data, rxrpc_pmtud_reduce_icmp); + write_seqcount_begin(&peer->mtu_lock); + peer->max_data = max_data; + write_seqcount_end(&peer->mtu_lock); } } @@ -347,3 +355,89 @@ void rxrpc_peer_keepalive_worker(struct work_struct *work) _leave(""); } + +/* + * Do path MTU probing. + */ +void rxrpc_input_probe_for_pmtud(struct rxrpc_connection *conn, rxrpc_serial_t acked_serial, + bool sendmsg_fail) +{ + struct rxrpc_peer *peer = conn->peer; + unsigned int max_data = peer->max_data; + int good, trial, bad, jumbo; + + good = peer->pmtud_good; + trial = peer->pmtud_trial; + bad = peer->pmtud_bad; + if (good >= bad - 1) { + conn->pmtud_probe = 0; + peer->pmtud_lost = false; + return; + } + + if (!peer->pmtud_probing) + goto send_probe; + + if (sendmsg_fail || after(acked_serial, conn->pmtud_probe)) { + /* Retry a lost probe. */ + if (!peer->pmtud_lost) { + trace_rxrpc_pmtud_lost(conn, acked_serial); + conn->pmtud_probe = 0; + peer->pmtud_lost = true; + goto send_probe; + } + + /* The probed size didn't seem to get through. */ + bad = trial; + peer->pmtud_bad = bad; + if (bad <= max_data) + max_data = bad - 1; + } else { + /* It did get through. */ + good = trial; + peer->pmtud_good = good; + if (good > max_data) + max_data = good; + } + + max_data = umin(max_data, peer->ackr_max_data); + if (max_data != peer->max_data) { + preempt_disable(); + write_seqcount_begin(&peer->mtu_lock); + peer->max_data = max_data; + write_seqcount_end(&peer->mtu_lock); + preempt_enable(); + } + + jumbo = max_data + sizeof(struct rxrpc_jumbo_header); + jumbo /= RXRPC_JUMBO_SUBPKTLEN; + peer->pmtud_jumbo = jumbo; + + trace_rxrpc_pmtud_rx(conn, acked_serial); + conn->pmtud_probe = 0; + peer->pmtud_lost = false; + + if (good < RXRPC_JUMBO(2) && bad > RXRPC_JUMBO(2)) + trial = RXRPC_JUMBO(2); + else if (good < RXRPC_JUMBO(4) && bad > RXRPC_JUMBO(4)) + trial = RXRPC_JUMBO(4); + else if (good < RXRPC_JUMBO(3) && bad > RXRPC_JUMBO(3)) + trial = RXRPC_JUMBO(3); + else if (good < RXRPC_JUMBO(6) && bad > RXRPC_JUMBO(6)) + trial = RXRPC_JUMBO(6); + else if (good < RXRPC_JUMBO(5) && bad > RXRPC_JUMBO(5)) + trial = RXRPC_JUMBO(5); + else if (good < RXRPC_JUMBO(8) && bad > RXRPC_JUMBO(8)) + trial = RXRPC_JUMBO(8); + else if (good < RXRPC_JUMBO(7) && bad > RXRPC_JUMBO(7)) + trial = RXRPC_JUMBO(7); + else + trial = (good + bad) / 2; + peer->pmtud_trial = trial; + + if (good >= bad) + return; + +send_probe: + peer->pmtud_pending = true; +} diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 49dcda67a0d59..80ef6f06d5122 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -162,6 +162,11 @@ static void rxrpc_assess_MTU_size(struct rxrpc_local *local, #endif peer->if_mtu = 1500; + if (peer->max_data < peer->if_mtu - peer->hdrsize) { + trace_rxrpc_pmtud_reduce(peer, 0, peer->if_mtu - peer->hdrsize, + rxrpc_pmtud_reduce_route); + peer->max_data = peer->if_mtu - peer->hdrsize; + } memset(&fl, 0, sizeof(fl)); switch (peer->srx.transport.family) { @@ -199,8 +204,16 @@ static void rxrpc_assess_MTU_size(struct rxrpc_local *local, } peer->if_mtu = dst_mtu(dst); + peer->hdrsize += dst->header_len + dst->trailer_len; + peer->tx_seg_max = dst->dev->gso_max_segs; dst_release(dst); + peer->max_data = umin(RXRPC_JUMBO(1), peer->if_mtu - peer->hdrsize); + peer->pmtud_good = 500; + peer->pmtud_bad = peer->if_mtu - peer->hdrsize + 1; + peer->pmtud_trial = umin(peer->max_data, peer->pmtud_bad - 1); + peer->pmtud_pending = true; + _leave(" [if_mtu %u]", peer->if_mtu); } @@ -223,6 +236,7 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp, seqlock_init(&peer->service_conn_lock); spin_lock_init(&peer->lock); spin_lock_init(&peer->rtt_input_lock); + seqcount_init(&peer->mtu_lock); peer->debug_id = atomic_inc_return(&rxrpc_debug_id); rxrpc_peer_init_rtt(peer); @@ -242,9 +256,7 @@ static void rxrpc_init_peer(struct rxrpc_local *local, struct rxrpc_peer *peer, unsigned long hash_key) { peer->hash_key = hash_key; - rxrpc_assess_MTU_size(local, peer); - peer->mtu = peer->if_mtu; - peer->rtt_last_req = ktime_get_real(); + switch (peer->srx.transport.family) { case AF_INET: @@ -268,7 +280,11 @@ static void rxrpc_init_peer(struct rxrpc_local *local, struct rxrpc_peer *peer, } peer->hdrsize += sizeof(struct rxrpc_wire_header); - peer->maxdata = peer->mtu - peer->hdrsize; + peer->max_data = peer->if_mtu - peer->hdrsize; + + rxrpc_assess_MTU_size(local, peer); + + peer->rtt_last_req = ktime_get_real(); } /* diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index ce4d48bdfbe9e..44722c2260648 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -283,9 +283,7 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v) if (v == SEQ_START_TOKEN) { seq_puts(seq, - "Proto Local " - " Remote " - " Use SST MTU LastUse RTT RTO\n" + "Proto Local Remote Use SST Maxd LastUse RTT RTO\n" ); return 0; } @@ -298,13 +296,12 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v) now = ktime_get_seconds(); seq_printf(seq, - "UDP %-47.47s %-47.47s %3u" - " %3u %5u %6llus %8u %8u\n", + "UDP %-47.47s %-47.47s %3u %4u %5u %6llus %8u %8u\n", lbuff, rbuff, refcount_read(&peer->ref), peer->cong_ssthresh, - peer->mtu, + peer->max_data, now - peer->last_tx_at, peer->srtt_us >> 3, peer->rto_us); diff --git a/net/rxrpc/protocol.h b/net/rxrpc/protocol.h index 4fe6b4d20ada9..42f70e4636f82 100644 --- a/net/rxrpc/protocol.h +++ b/net/rxrpc/protocol.h @@ -92,11 +92,16 @@ struct rxrpc_jumbo_header { /* * The maximum number of subpackets that can possibly fit in a UDP packet is: * - * ((max_IP - IP_hdr - UDP_hdr) / RXRPC_JUMBO_SUBPKTLEN) + 1 - * = ((65535 - 28 - 28) / 1416) + 1 - * = 46 non-terminal packets and 1 terminal packet. + * (max_UDP - wirehdr + jumbohdr) / (jumbohdr + 1412) + * = ((65535 - 28 + 4) / 1416) + * = 45 non-terminal packets and 1 terminal packet. */ -#define RXRPC_MAX_NR_JUMBO 47 +#define RXRPC_MAX_NR_JUMBO 46 + +/* Size of a jumbo packet with N subpackets, excluding UDP+IP */ +#define RXRPC_JUMBO(N) ((int)sizeof(struct rxrpc_wire_header) + \ + RXRPC_JUMBO_DATALEN + \ + ((N) - 1) * RXRPC_JUMBO_SUBPKTLEN) /*****************************************************************************/ /* diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index 9bf9a1f6e4cb2..46a20cf4c4027 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -11,6 +11,8 @@ #include "ar-internal.h" static struct ctl_table_header *rxrpc_sysctl_reg_table; +static const unsigned int rxrpc_rx_mtu_min = 500; +static const unsigned int rxrpc_jumbo_max = RXRPC_MAX_NR_JUMBO; static const unsigned int four = 4; static const unsigned int max_backlog = RXRPC_BACKLOG_MAX - 1; static const unsigned int n_65535 = 65535; @@ -115,7 +117,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)SYSCTL_ONE, + .extra1 = (void *)&rxrpc_rx_mtu_min, .extra2 = (void *)&n_65535, }, { @@ -125,7 +127,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = (void *)SYSCTL_ONE, - .extra2 = (void *)&four, + .extra2 = (void *)&rxrpc_jumbo_max, }, }; diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c index c3913d8a50d34..2a4291617d40f 100644 --- a/net/rxrpc/txbuf.c +++ b/net/rxrpc/txbuf.c @@ -179,7 +179,8 @@ static void rxrpc_free_txbuf(struct rxrpc_txbuf *txb) trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 0, rxrpc_txbuf_free); for (i = 0; i < txb->nr_kvec; i++) - if (txb->kvec[i].iov_base) + if (txb->kvec[i].iov_base && + !is_zero_pfn(page_to_pfn(virt_to_page(txb->kvec[i].iov_base)))) page_frag_free(txb->kvec[i].iov_base); kfree(txb); atomic_dec(&rxrpc_nr_txbuf); From 3d2bdf73cea57d7f6bf314aa1c948af11af94980 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:38 +0000 Subject: [PATCH 0287/1450] rxrpc: Separate the packet length from the data length in rxrpc_txbuf Separate the packet length from the data length (txb->len) stored in the rxrpc_txbuf to make security calculations easier. Also store the allocation size as that's an upper bound on the size of the security wrapper and change a number of fields to unsigned short as the amount of data can't exceed the capacity of a UDP packet. Also, whilst we're at it, use kzalloc() for txbufs. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-11-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/ar-internal.h | 8 +++++--- net/rxrpc/insecure.c | 1 + net/rxrpc/output.c | 7 ++++--- net/rxrpc/rxkad.c | 44 ++++++++++++++++++++++------------------- net/rxrpc/sendmsg.c | 1 - net/rxrpc/txbuf.c | 7 ++----- 6 files changed, 36 insertions(+), 32 deletions(-) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 69e6f4b20bad3..a5c0bc9176416 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -821,9 +821,11 @@ struct rxrpc_txbuf { rxrpc_serial_t serial; /* Last serial number transmitted with */ unsigned int call_debug_id; unsigned int debug_id; - unsigned int len; /* Amount of data in buffer */ - unsigned int space; /* Remaining data space */ - unsigned int offset; /* Offset of fill point */ + unsigned short len; /* Amount of data in buffer */ + unsigned short space; /* Remaining data space */ + unsigned short offset; /* Offset of fill point */ + unsigned short pkt_len; /* Size of packet content */ + unsigned short alloc_size; /* Amount of bufferage allocated */ unsigned int flags; #define RXRPC_TXBUF_WIRE_FLAGS 0xff /* The wire protocol flags */ #define RXRPC_TXBUF_RESENT 0x100 /* Set if has been resent */ diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index 751eb621021de..d665f486be5fa 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -24,6 +24,7 @@ static struct rxrpc_txbuf *none_alloc_txbuf(struct rxrpc_call *call, size_t rema static int none_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) { + txb->pkt_len = txb->len; return 0; } diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index a91be871ad963..df9af4ad42609 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -383,11 +383,11 @@ static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc enum rxrpc_req_ack_trace why; struct rxrpc_connection *conn = call->conn; struct kvec *kv = &call->local->kvec[subpkt]; - size_t len = txb->len; + size_t len = txb->pkt_len; bool last, more; u8 flags; - _enter("%x,{%d}", txb->seq, txb->len); + _enter("%x,%zd", txb->seq, len); txb->serial = serial; @@ -441,6 +441,7 @@ static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc whdr->cksum = txb->cksum; whdr->serviceId = htons(conn->service_id); kv->iov_base = whdr; + len += sizeof(*whdr); // TODO: Convert into a jumbo header for tail subpackets trace_rxrpc_tx_data(call, txb->seq, txb->serial, flags, false); @@ -509,7 +510,7 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t size_t len; int ret; - _enter("%x,{%d}", txb->seq, txb->len); + _enter("%x,{%d}", txb->seq, txb->pkt_len); len = rxrpc_prepare_data_packet(call, txb); diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index e3194d73dd846..755897fab6265 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -148,14 +148,14 @@ static int rxkad_init_connection_security(struct rxrpc_connection *conn, static struct rxrpc_txbuf *rxkad_alloc_txbuf(struct rxrpc_call *call, size_t remain, gfp_t gfp) { struct rxrpc_txbuf *txb; - size_t shdr, space; + size_t shdr, alloc, limit, part; remain = umin(remain, 65535 - sizeof(struct rxrpc_wire_header)); switch (call->conn->security_level) { default: - space = umin(remain, RXRPC_JUMBO_DATALEN); - return rxrpc_alloc_data_txbuf(call, space, 1, gfp); + alloc = umin(remain, RXRPC_JUMBO_DATALEN); + return rxrpc_alloc_data_txbuf(call, alloc, 1, gfp); case RXRPC_SECURITY_AUTH: shdr = sizeof(struct rxkad_level1_hdr); break; @@ -164,15 +164,21 @@ static struct rxrpc_txbuf *rxkad_alloc_txbuf(struct rxrpc_call *call, size_t rem break; } - space = umin(round_down(RXRPC_JUMBO_DATALEN, RXKAD_ALIGN), remain + shdr); - space = round_up(space, RXKAD_ALIGN); + limit = round_down(RXRPC_JUMBO_DATALEN, RXKAD_ALIGN) - shdr; + if (remain < limit) { + part = remain; + alloc = round_up(shdr + part, RXKAD_ALIGN); + } else { + part = limit; + alloc = RXRPC_JUMBO_DATALEN; + } - txb = rxrpc_alloc_data_txbuf(call, space, RXKAD_ALIGN, gfp); + txb = rxrpc_alloc_data_txbuf(call, alloc, RXKAD_ALIGN, gfp); if (!txb) return NULL; txb->offset += shdr; - txb->space -= shdr; + txb->space = part; return txb; } @@ -263,13 +269,13 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, check = txb->seq ^ call->call_id; hdr->data_size = htonl((u32)check << 16 | txb->len); - txb->len += sizeof(struct rxkad_level1_hdr); - pad = txb->len; + txb->pkt_len = sizeof(struct rxkad_level1_hdr) + txb->len; + pad = txb->pkt_len; pad = RXKAD_ALIGN - pad; pad &= RXKAD_ALIGN - 1; if (pad) { memset(txb->kvec[0].iov_base + txb->offset, 0, pad); - txb->len += pad; + txb->pkt_len += pad; } /* start the encryption afresh */ @@ -298,7 +304,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, struct rxkad_level2_hdr *rxkhdr = (void *)(whdr + 1); struct rxrpc_crypt iv; struct scatterlist sg; - size_t pad; + size_t content, pad; u16 check; int ret; @@ -309,23 +315,20 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, rxkhdr->data_size = htonl(txb->len | (u32)check << 16); rxkhdr->checksum = 0; - txb->len += sizeof(struct rxkad_level2_hdr); - pad = txb->len; - pad = RXKAD_ALIGN - pad; - pad &= RXKAD_ALIGN - 1; - if (pad) { + content = sizeof(struct rxkad_level2_hdr) + txb->len; + txb->pkt_len = round_up(content, RXKAD_ALIGN); + pad = txb->pkt_len - content; + if (pad) memset(txb->kvec[0].iov_base + txb->offset, 0, pad); - txb->len += pad; - } /* encrypt from the session key */ token = call->conn->key->payload.data[0]; memcpy(&iv, token->kad->session_key, sizeof(iv)); - sg_init_one(&sg, rxkhdr, txb->len); + sg_init_one(&sg, rxkhdr, txb->pkt_len); skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, &sg, &sg, txb->len, iv.x); + skcipher_request_set_crypt(req, &sg, &sg, txb->pkt_len, iv.x); ret = crypto_skcipher_encrypt(req); skcipher_request_zero(req); return ret; @@ -384,6 +387,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) switch (call->conn->security_level) { case RXRPC_SECURITY_PLAIN: + txb->pkt_len = txb->len; ret = 0; break; case RXRPC_SECURITY_AUTH: diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 546abb463c3f9..786c1fb1369ab 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -391,7 +391,6 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, goto out; txb->kvec[0].iov_len += txb->len; - txb->len = txb->kvec[0].iov_len; rxrpc_queue_packet(rx, call, txb, notify_end_tx); txb = NULL; } diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c index 2a4291617d40f..8b7c854ed3d75 100644 --- a/net/rxrpc/txbuf.c +++ b/net/rxrpc/txbuf.c @@ -24,7 +24,7 @@ struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_ size_t total, hoff; void *buf; - txb = kmalloc(sizeof(*txb), gfp); + txb = kzalloc(sizeof(*txb), gfp); if (!txb) return NULL; @@ -49,14 +49,11 @@ struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_ txb->last_sent = KTIME_MIN; txb->call_debug_id = call->debug_id; txb->debug_id = atomic_inc_return(&rxrpc_txbuf_debug_ids); + txb->alloc_size = data_size; txb->space = data_size; - txb->len = 0; txb->offset = sizeof(*whdr); txb->flags = call->conn->out_clientflag; - txb->ack_why = 0; txb->seq = call->tx_prepared + 1; - txb->serial = 0; - txb->cksum = 0; txb->nr_kvec = 1; txb->kvec[0].iov_base = whdr; txb->kvec[0].iov_len = sizeof(*whdr); From b7313009c2e56d6e8bffd3d21c1a3a67a9149e2e Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:39 +0000 Subject: [PATCH 0288/1450] rxrpc: Prepare to be able to send jumbo DATA packets Prepare to be able to send jumbo DATA packets if the we decide to, but don't enable that yet. This will allow larger chunks of data to be sent without reducing the retryability as the subpackets in a jumbo packet can also be retransmitted individually. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-12-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/ar-internal.h | 18 +++++++++- net/rxrpc/call_event.c | 48 ++++++++++++++----------- net/rxrpc/input.c | 36 +++++++++++-------- net/rxrpc/insecure.c | 2 ++ net/rxrpc/output.c | 80 ++++++++++++++++++++++++++++------------- net/rxrpc/rxkad.c | 13 +++++++ 6 files changed, 137 insertions(+), 60 deletions(-) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index a5c0bc9176416..4386b2e6cca57 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -832,6 +832,7 @@ struct rxrpc_txbuf { __be16 cksum; /* Checksum to go in header */ unsigned short ack_rwind; /* ACK receive window */ u8 /*enum rxrpc_propose_ack_trace*/ ack_why; /* If ack, why */ + bool jumboable; /* Can be non-terminal jumbo subpacket */ u8 nr_kvec; /* Amount of kvec[] used */ struct kvec kvec[3]; }; @@ -862,6 +863,21 @@ static inline rxrpc_serial_t rxrpc_get_next_serial(struct rxrpc_connection *conn return serial; } +/* + * Allocate the next serial n numbers on a connection. 0 must be skipped. + */ +static inline rxrpc_serial_t rxrpc_get_next_serials(struct rxrpc_connection *conn, + unsigned int n) +{ + rxrpc_serial_t serial; + + serial = conn->tx_serial; + if (serial + n <= n) + serial = 1; + conn->tx_serial = serial + n; + return serial; +} + /* * af_rxrpc.c */ @@ -1176,7 +1192,7 @@ int rxrpc_send_abort_packet(struct rxrpc_call *); void rxrpc_send_conn_abort(struct rxrpc_connection *conn); void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb); void rxrpc_send_keepalive(struct rxrpc_peer *); -void rxrpc_transmit_one(struct rxrpc_call *call, struct rxrpc_txbuf *txb); +void rxrpc_transmit_data(struct rxrpc_call *call, struct rxrpc_txbuf *txb, int n); /* * peer_event.c diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 1d889b6f03661..3379adfaaf65f 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -124,7 +124,7 @@ void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb) ktime_sub(resend_at, now)); txb->flags |= RXRPC_TXBUF_RESENT; - rxrpc_transmit_one(call, txb); + rxrpc_transmit_data(call, txb, 1); did_send = true; now = ktime_get_real(); @@ -164,7 +164,7 @@ void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb) unacked = true; txb->flags |= RXRPC_TXBUF_RESENT; - rxrpc_transmit_one(call, txb); + rxrpc_transmit_data(call, txb, 1); did_send = true; rxrpc_inc_stat(call->rxnet, stat_tx_data_retrans); now = ktime_get_real(); @@ -231,15 +231,12 @@ static void rxrpc_close_tx_phase(struct rxrpc_call *call) } } -static bool rxrpc_tx_window_has_space(struct rxrpc_call *call) +static unsigned int rxrpc_tx_window_space(struct rxrpc_call *call) { - unsigned int winsize = umin(call->tx_winsize, call->cong_cwnd + call->cong_extra); - rxrpc_seq_t window = call->acks_hard_ack, wtop = window + winsize; - rxrpc_seq_t tx_top = call->tx_top; - int space; + int winsize = umin(call->tx_winsize, call->cong_cwnd + call->cong_extra); + int in_flight = call->tx_top - call->acks_hard_ack; - space = wtop - tx_top; - return space > 0; + return max(winsize - in_flight, 0); } /* @@ -247,7 +244,7 @@ static bool rxrpc_tx_window_has_space(struct rxrpc_call *call) */ static void rxrpc_decant_prepared_tx(struct rxrpc_call *call) { - struct rxrpc_txbuf *txb; + int space = rxrpc_tx_window_space(call); if (!test_bit(RXRPC_CALL_EXPOSED, &call->flags)) { if (list_empty(&call->tx_sendmsg)) @@ -255,22 +252,33 @@ static void rxrpc_decant_prepared_tx(struct rxrpc_call *call) rxrpc_expose_client_call(call); } - while ((txb = list_first_entry_or_null(&call->tx_sendmsg, - struct rxrpc_txbuf, call_link))) { + while (space > 0) { + struct rxrpc_txbuf *head = NULL, *txb; + int count = 0, limit = min(space, 1); + + if (list_empty(&call->tx_sendmsg)) + break; + spin_lock(&call->tx_lock); - list_del(&txb->call_link); + do { + txb = list_first_entry(&call->tx_sendmsg, + struct rxrpc_txbuf, call_link); + if (!head) + head = txb; + list_move_tail(&txb->call_link, &call->tx_buffer); + count++; + if (!txb->jumboable) + break; + } while (count < limit && !list_empty(&call->tx_sendmsg)); + spin_unlock(&call->tx_lock); call->tx_top = txb->seq; - list_add_tail(&txb->call_link, &call->tx_buffer); - if (txb->flags & RXRPC_LAST_PACKET) rxrpc_close_tx_phase(call); - rxrpc_transmit_one(call, txb); - - if (!rxrpc_tx_window_has_space(call)) - break; + space -= count; + rxrpc_transmit_data(call, head, count); } } @@ -285,7 +293,7 @@ static void rxrpc_transmit_some_data(struct rxrpc_call *call) case RXRPC_CALL_SERVER_SEND_REPLY: case RXRPC_CALL_CLIENT_SEND_REQUEST: - if (!rxrpc_tx_window_has_space(call)) + if (!rxrpc_tx_window_space(call)) return; if (list_empty(&call->tx_sendmsg)) { rxrpc_inc_stat(call->rxnet, stat_tx_data_underflow); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index fd08d813ef296..8398fa10ee8d7 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -693,9 +693,12 @@ static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_peer *peer = call->peer; - unsigned int max_data; + unsigned int max_data, capacity; bool wake = false; - u32 rwind = ntohl(trailer->rwind); + u32 max_mtu = ntohl(trailer->maxMTU); + //u32 if_mtu = ntohl(trailer->ifMTU); + u32 rwind = ntohl(trailer->rwind); + u32 jumbo_max = ntohl(trailer->jumbo_max); if (rwind > RXRPC_TX_MAX_WINDOW) rwind = RXRPC_TX_MAX_WINDOW; @@ -706,24 +709,29 @@ static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb call->tx_winsize = rwind; } - if (trailer->jumbo_max == 0) { - /* The peer says it supports pmtu discovery */ - peer->ackr_adv_pmtud = true; - } else { - peer->ackr_adv_pmtud = false; - } - - max_data = ntohl(trailer->maxMTU); - peer->ackr_max_data = max_data; + max_mtu = clamp(max_mtu, 500, 65535); + peer->ackr_max_data = max_mtu; - if (max_data < peer->max_data) { - trace_rxrpc_pmtud_reduce(peer, sp->hdr.serial, max_data, + if (max_mtu < peer->max_data) { + trace_rxrpc_pmtud_reduce(peer, sp->hdr.serial, max_mtu, rxrpc_pmtud_reduce_ack); write_seqcount_begin(&peer->mtu_lock); - peer->max_data = max_data; + peer->max_data = max_mtu; write_seqcount_end(&peer->mtu_lock); } + max_data = umin(max_mtu, peer->max_data); + capacity = max_data; + capacity += sizeof(struct rxrpc_jumbo_header); /* First subpacket has main hdr, not jumbo */ + capacity /= sizeof(struct rxrpc_jumbo_header) + RXRPC_JUMBO_DATALEN; + + if (jumbo_max == 0) { + /* The peer says it supports pmtu discovery */ + peer->ackr_adv_pmtud = true; + } else { + peer->ackr_adv_pmtud = false; + } + if (wake) wake_up(&call->waitq); } diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index d665f486be5fa..e068f9b79d023 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -25,6 +25,8 @@ static struct rxrpc_txbuf *none_alloc_txbuf(struct rxrpc_call *call, size_t rema static int none_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) { txb->pkt_len = txb->len; + if (txb->len == RXRPC_JUMBO_DATALEN) + txb->jumboable = true; return 0; } diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index df9af4ad42609..aededdd474d71 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -377,9 +377,10 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call) */ static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_txbuf *txb, rxrpc_serial_t serial, - int subpkt) + int subpkt, int nr_subpkts) { struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; + struct rxrpc_jumbo_header *jumbo = (void *)(whdr + 1) - sizeof(*jumbo); enum rxrpc_req_ack_trace why; struct rxrpc_connection *conn = call->conn; struct kvec *kv = &call->local->kvec[subpkt]; @@ -399,6 +400,11 @@ static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc flags = txb->flags & RXRPC_TXBUF_WIRE_FLAGS; last = txb->flags & RXRPC_LAST_PACKET; + if (subpkt < nr_subpkts - 1) { + len = RXRPC_JUMBO_DATALEN; + goto dont_set_request_ack; + } + more = (!list_is_last(&txb->call_link, &call->tx_buffer) || !list_empty(&call->tx_sendmsg)); @@ -436,13 +442,25 @@ static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc } dont_set_request_ack: - whdr->flags = flags; - whdr->serial = htonl(txb->serial); - whdr->cksum = txb->cksum; - whdr->serviceId = htons(conn->service_id); - kv->iov_base = whdr; - len += sizeof(*whdr); - // TODO: Convert into a jumbo header for tail subpackets + /* The jumbo header overlays the wire header in the txbuf. */ + if (subpkt < nr_subpkts - 1) + flags |= RXRPC_JUMBO_PACKET; + else + flags &= ~RXRPC_JUMBO_PACKET; + if (subpkt == 0) { + whdr->flags = flags; + whdr->serial = htonl(txb->serial); + whdr->cksum = txb->cksum; + whdr->serviceId = htons(conn->service_id); + kv->iov_base = whdr; + len += sizeof(*whdr); + } else { + jumbo->flags = flags; + jumbo->pad = 0; + jumbo->cksum = txb->cksum; + kv->iov_base = jumbo; + len += sizeof(*jumbo); + } trace_rxrpc_tx_data(call, txb->seq, txb->serial, flags, false); kv->iov_len = len; @@ -450,18 +468,22 @@ static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc } /* - * Prepare a packet for transmission. + * Prepare a (jumbo) packet for transmission. */ -static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) +static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *head, int n) { + struct rxrpc_txbuf *txb = head; rxrpc_serial_t serial; size_t len = 0; /* Each transmission of a Tx packet needs a new serial number */ - serial = rxrpc_get_next_serial(call->conn); + serial = rxrpc_get_next_serials(call->conn, n); - len += rxrpc_prepare_data_subpacket(call, txb, serial, 0); - // TODO: Loop around adding tail subpackets + for (int i = 0; i < n; i++) { + len += rxrpc_prepare_data_subpacket(call, txb, serial, i, n); + serial++; + txb = list_next_entry(txb, call_link); + } return len; } @@ -469,16 +491,24 @@ static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_tx /* * Set timeouts after transmitting a packet. */ -static void rxrpc_tstamp_data_packets(struct rxrpc_call *call, struct rxrpc_txbuf *txb) +static void rxrpc_tstamp_data_packets(struct rxrpc_call *call, struct rxrpc_txbuf *txb, int n) { + rxrpc_serial_t serial; ktime_t now = ktime_get_real(); bool ack_requested = txb->flags & RXRPC_REQUEST_ACK; + int i; call->tx_last_sent = now; - txb->last_sent = now; + + for (i = 0; i < n; i++) { + txb->last_sent = now; + ack_requested |= txb->flags & RXRPC_REQUEST_ACK; + serial = txb->serial; + txb = list_next_entry(txb, call_link); + } if (ack_requested) { - rxrpc_begin_rtt_probe(call, txb->serial, now, rxrpc_rtt_tx_data); + rxrpc_begin_rtt_probe(call, serial, now, rxrpc_rtt_tx_data); call->peer->rtt_last_req = now; if (call->peer->rtt_count > 1) { @@ -502,7 +532,7 @@ static void rxrpc_tstamp_data_packets(struct rxrpc_call *call, struct rxrpc_txbu /* * send a packet through the transport endpoint */ -static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) +static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb, int n) { struct rxrpc_connection *conn = call->conn; enum rxrpc_tx_point frag; @@ -512,7 +542,7 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t _enter("%x,{%d}", txb->seq, txb->pkt_len); - len = rxrpc_prepare_data_packet(call, txb); + len = rxrpc_prepare_data_packet(call, txb, n); if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { static int lose; @@ -524,7 +554,7 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t } } - iov_iter_kvec(&msg.msg_iter, WRITE, call->local->kvec, 1, len); + iov_iter_kvec(&msg.msg_iter, WRITE, call->local->kvec, n, len); msg.msg_name = &call->peer->srx.transport; msg.msg_namelen = call->peer->srx.transport_len; @@ -537,7 +567,7 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t * yet. */ if (txb->seq == call->tx_transmitted + 1) - call->tx_transmitted = txb->seq; + call->tx_transmitted = txb->seq + n - 1; /* send the packet with the don't fragment bit set if we currently * think it's small enough */ @@ -568,7 +598,7 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t } rxrpc_tx_backoff(call, ret); - if (ret == -EMSGSIZE && frag == rxrpc_tx_point_call_data_frag) { + if (ret == -EMSGSIZE && frag == rxrpc_tx_point_call_data_nofrag) { rxrpc_local_dont_fragment(conn->local, false); frag = rxrpc_tx_point_call_data_frag; goto retry; @@ -576,7 +606,7 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t done: if (ret >= 0) { - rxrpc_tstamp_data_packets(call, txb); + rxrpc_tstamp_data_packets(call, txb, n); } else { /* Cancel the call if the initial transmission fails, * particularly if that's due to network routing issues that @@ -776,13 +806,13 @@ static inline void rxrpc_instant_resend(struct rxrpc_call *call, } /* - * Transmit one packet. + * Transmit a packet, possibly gluing several subpackets together. */ -void rxrpc_transmit_one(struct rxrpc_call *call, struct rxrpc_txbuf *txb) +void rxrpc_transmit_data(struct rxrpc_call *call, struct rxrpc_txbuf *txb, int n) { int ret; - ret = rxrpc_send_data_packet(call, txb); + ret = rxrpc_send_data_packet(call, txb, n); if (ret < 0) { switch (ret) { case -ENETUNREACH: diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 755897fab6265..62b09d23ec08c 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -392,15 +392,28 @@ static int rxkad_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) break; case RXRPC_SECURITY_AUTH: ret = rxkad_secure_packet_auth(call, txb, req); + if (txb->alloc_size == RXRPC_JUMBO_DATALEN) + txb->jumboable = true; break; case RXRPC_SECURITY_ENCRYPT: ret = rxkad_secure_packet_encrypt(call, txb, req); + if (txb->alloc_size == RXRPC_JUMBO_DATALEN) + txb->jumboable = true; break; default: ret = -EPERM; break; } + /* Clear excess space in the packet */ + if (txb->pkt_len < txb->alloc_size) { + struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; + size_t gap = txb->alloc_size - txb->pkt_len; + void *p = whdr + 1; + + memset(p + txb->pkt_len, 0, gap); + } + skcipher_request_free(req); _leave(" = %d [set %x]", ret, y); return ret; From 149d002bee706f51772bd320cda90c922844bb0e Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:40 +0000 Subject: [PATCH 0289/1450] rxrpc: Add a tracepoint to show variables pertinent to jumbo packet size Add a tracepoint to be called right before packets are transmitted for the first time that shows variable values that are pertinent to how many subpackets will be added to a jumbo DATA packet. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-13-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 41 ++++++++++++++++++++++++++++++++++++ net/rxrpc/call_event.c | 2 ++ 2 files changed, 43 insertions(+) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 9dcadad88e766..71f07e726a903 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -903,6 +903,47 @@ TRACE_EVENT(rxrpc_txqueue, __entry->tx_winsize) ); +TRACE_EVENT(rxrpc_transmit, + TP_PROTO(struct rxrpc_call *call, int space), + + TP_ARGS(call, space), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(rxrpc_seq_t, seq) + __field(u16, space) + __field(u16, tx_winsize) + __field(u16, cong_cwnd) + __field(u16, cong_extra) + __field(u16, in_flight) + __field(u16, prepared) + __field(u16, pmtud_jumbo) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->seq = call->tx_bottom; + __entry->space = space; + __entry->tx_winsize = call->tx_winsize; + __entry->cong_cwnd = call->cong_cwnd; + __entry->cong_extra = call->cong_extra; + __entry->prepared = call->tx_prepared - call->tx_bottom; + __entry->in_flight = call->tx_top - call->acks_hard_ack; + __entry->pmtud_jumbo = call->peer->pmtud_jumbo; + ), + + TP_printk("c=%08x q=%08x sp=%u tw=%u cw=%u+%u pr=%u if=%u pj=%u", + __entry->call, + __entry->seq, + __entry->space, + __entry->tx_winsize, + __entry->cong_cwnd, + __entry->cong_extra, + __entry->prepared, + __entry->in_flight, + __entry->pmtud_jumbo) + ); + TRACE_EVENT(rxrpc_rx_data, TP_PROTO(unsigned int call, rxrpc_seq_t seq, rxrpc_serial_t serial, u8 flags), diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 3379adfaaf65f..1f716f09d4415 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -259,6 +259,8 @@ static void rxrpc_decant_prepared_tx(struct rxrpc_call *call) if (list_empty(&call->tx_sendmsg)) break; + trace_rxrpc_transmit(call, space); + spin_lock(&call->tx_lock); do { txb = list_first_entry(&call->tx_sendmsg, From 9e3cccd176b5ec6ff78693287fb03097e453e69c Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:41 +0000 Subject: [PATCH 0290/1450] rxrpc: Fix CPU time starvation in I/O thread Starvation can happen in the rxrpc I/O thread because it goes back to the top of the I/O loop after it does any one thing without trying to give any other connection or call CPU time. Also, because it processes one call packet at a time, it tries to do the retransmission loop after each ACK without checking to see if there are other ACKs already in the queue that can update the SACK state. Fix this by: (1) Add a received-packet queue on each call. (2) Distribute packets from the master Rx queue to the individual call, conn and error queues and 'poking' calls to add them to the attend queue first thing in the I/O thread. (3) Go through all the attention-seeking connections and calls before going back to the top of the I/O thread. Each queue is extracted as a whole and then gone through so that new additions to insert themselves into the queue. (4) Make the call event handler go through all the packets currently on the call's rx_queue before transmitting and retransmitting DATA packets. (5) Drop the skb argument from the call event handler as this is now replaced with the rx_queue. Instead, keep track of whether we received a packet or an ACK for the tests that used to rely on that. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-14-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 3 + net/rxrpc/ar-internal.h | 10 +++- net/rxrpc/call_accept.c | 2 +- net/rxrpc/call_event.c | 34 +++++++----- net/rxrpc/call_object.c | 2 + net/rxrpc/conn_client.c | 12 ++-- net/rxrpc/input.c | 2 +- net/rxrpc/io_thread.c | 104 ++++++++++++++++++----------------- net/rxrpc/peer_event.c | 2 +- 9 files changed, 96 insertions(+), 75 deletions(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 71f07e726a903..28fa7be31ff83 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -120,6 +120,7 @@ EM(rxrpc_call_poke_conn_abort, "Conn-abort") \ EM(rxrpc_call_poke_error, "Error") \ EM(rxrpc_call_poke_idle, "Idle") \ + EM(rxrpc_call_poke_rx_packet, "Rx-packet") \ EM(rxrpc_call_poke_set_timeout, "Set-timo") \ EM(rxrpc_call_poke_start, "Start") \ EM(rxrpc_call_poke_timer, "Timer") \ @@ -128,6 +129,7 @@ #define rxrpc_skb_traces \ EM(rxrpc_skb_eaten_by_unshare, "ETN unshare ") \ EM(rxrpc_skb_eaten_by_unshare_nomem, "ETN unshar-nm") \ + EM(rxrpc_skb_get_call_rx, "GET call-rx ") \ EM(rxrpc_skb_get_conn_secured, "GET conn-secd") \ EM(rxrpc_skb_get_conn_work, "GET conn-work") \ EM(rxrpc_skb_get_last_nack, "GET last-nack") \ @@ -139,6 +141,7 @@ EM(rxrpc_skb_new_error_report, "NEW error-rpt") \ EM(rxrpc_skb_new_jumbo_subpacket, "NEW jumbo-sub") \ EM(rxrpc_skb_new_unshared, "NEW unshared ") \ + EM(rxrpc_skb_put_call_rx, "PUT call-rx ") \ EM(rxrpc_skb_put_conn_secured, "PUT conn-secd") \ EM(rxrpc_skb_put_conn_work, "PUT conn-work") \ EM(rxrpc_skb_put_error_report, "PUT error-rep") \ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 4386b2e6cca57..55cc68dd1b403 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -705,6 +705,7 @@ struct rxrpc_call { /* Received data tracking */ struct sk_buff_head recvmsg_queue; /* Queue of packets ready for recvmsg() */ + struct sk_buff_head rx_queue; /* Queue of packets for this call to receive */ struct sk_buff_head rx_oos_queue; /* Queue of out of sequence packets */ rxrpc_seq_t rx_highest_seq; /* Higest sequence number received */ @@ -906,7 +907,7 @@ void rxrpc_propose_delay_ACK(struct rxrpc_call *, rxrpc_serial_t, void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *); void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb); -bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb); +bool rxrpc_input_call_event(struct rxrpc_call *call); /* * call_object.c @@ -1352,6 +1353,13 @@ static inline bool after_eq(u32 seq1, u32 seq2) return (s32)(seq1 - seq2) >= 0; } +static inline void rxrpc_queue_rx_call_packet(struct rxrpc_call *call, struct sk_buff *skb) +{ + rxrpc_get_skb(skb, rxrpc_skb_get_call_rx); + __skb_queue_tail(&call->rx_queue, skb); + rxrpc_poke_call(call, rxrpc_call_poke_rx_packet); +} + /* * debug tracing */ diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 0f5a1d77b890f..a6776b1604bab 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -408,7 +408,7 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local, } _leave(" = %p{%d}", call, call->debug_id); - rxrpc_input_call_event(call, skb); + rxrpc_queue_rx_call_packet(call, skb); rxrpc_put_call(call, rxrpc_call_put_input); return true; diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 1f716f09d4415..ef47de3f41c6d 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -324,10 +324,11 @@ static void rxrpc_send_initial_ping(struct rxrpc_call *call) /* * Handle retransmission and deferred ACK/abort generation. */ -bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) +bool rxrpc_input_call_event(struct rxrpc_call *call) { + struct sk_buff *skb; ktime_t now, t; - bool resend = false; + bool resend = false, did_receive = false, saw_ack = false; s32 abort_code; rxrpc_see_call(call, rxrpc_call_see_input); @@ -337,9 +338,6 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) call->debug_id, rxrpc_call_states[__rxrpc_call_state(call)], call->events); - if (__rxrpc_call_is_complete(call)) - goto out; - /* Handle abort request locklessly, vs rxrpc_propose_abort(). */ abort_code = smp_load_acquire(&call->send_abort); if (abort_code) { @@ -348,11 +346,21 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) goto out; } - if (skb && skb->mark == RXRPC_SKB_MARK_ERROR) - goto out; + while ((skb = __skb_dequeue(&call->rx_queue))) { + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + if (__rxrpc_call_is_complete(call) || + skb->mark == RXRPC_SKB_MARK_ERROR) { + rxrpc_free_skb(skb, rxrpc_skb_put_call_rx); + goto out; + } + + saw_ack |= sp->hdr.type == RXRPC_PACKET_TYPE_ACK; - if (skb) rxrpc_input_call_packet(call, skb); + rxrpc_free_skb(skb, rxrpc_skb_put_call_rx); + did_receive = true; + } /* If we see our async-event poke, check for timeout trippage. */ now = ktime_get_real(); @@ -418,12 +426,8 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_propose_ack_ping_for_keepalive); } - if (skb) { - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - - if (sp->hdr.type == RXRPC_PACKET_TYPE_ACK) - rxrpc_congestion_degrade(call); - } + if (saw_ack) + rxrpc_congestion_degrade(call); if (test_and_clear_bit(RXRPC_CALL_EV_INITIAL_PING, &call->events)) rxrpc_send_initial_ping(call); @@ -494,7 +498,7 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) if (call->security) call->security->free_call_crypto(call); } else { - if (skb && + if (did_receive && call->peer->ackr_adv_pmtud && call->peer->pmtud_pending) rxrpc_send_probe_for_pmtud(call); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 0df647d1d3a25..c026f16f891eb 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -148,6 +148,7 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, INIT_LIST_HEAD(&call->attend_link); INIT_LIST_HEAD(&call->tx_sendmsg); INIT_LIST_HEAD(&call->tx_buffer); + skb_queue_head_init(&call->rx_queue); skb_queue_head_init(&call->recvmsg_queue); skb_queue_head_init(&call->rx_oos_queue); init_waitqueue_head(&call->waitq); @@ -536,6 +537,7 @@ void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace why) static void rxrpc_cleanup_ring(struct rxrpc_call *call) { rxrpc_purge_queue(&call->recvmsg_queue); + rxrpc_purge_queue(&call->rx_queue); rxrpc_purge_queue(&call->rx_oos_queue); } diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 86fb18bcd1885..706631e6ac2f1 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -508,16 +508,18 @@ static void rxrpc_activate_channels(struct rxrpc_bundle *bundle) void rxrpc_connect_client_calls(struct rxrpc_local *local) { struct rxrpc_call *call; + LIST_HEAD(new_client_calls); - while ((call = list_first_entry_or_null(&local->new_client_calls, - struct rxrpc_call, wait_link)) - ) { + spin_lock(&local->client_call_lock); + list_splice_tail_init(&local->new_client_calls, &new_client_calls); + spin_unlock(&local->client_call_lock); + + while ((call = list_first_entry_or_null(&new_client_calls, + struct rxrpc_call, wait_link))) { struct rxrpc_bundle *bundle = call->bundle; - spin_lock(&local->client_call_lock); list_move_tail(&call->wait_link, &bundle->waiting_calls); rxrpc_see_call(call, rxrpc_call_see_waiting_call); - spin_unlock(&local->client_call_lock); if (rxrpc_bundle_has_space(bundle)) rxrpc_activate_channels(bundle); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 8398fa10ee8d7..96fe005c5e819 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1124,5 +1124,5 @@ void rxrpc_implicit_end_call(struct rxrpc_call *call, struct sk_buff *skb) break; } - rxrpc_input_call_event(call, skb); + rxrpc_input_call_event(call); } diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index bd6d4f5e97b4c..bc678a299bd87 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -338,7 +338,6 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, struct rxrpc_channel *chan; struct rxrpc_call *call = NULL; unsigned int channel; - bool ret; if (sp->hdr.securityIndex != conn->security_ix) return rxrpc_direct_abort(skb, rxrpc_eproto_wrong_security, @@ -425,9 +424,9 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, peer_srx, skb); } - ret = rxrpc_input_call_event(call, skb); + rxrpc_queue_rx_call_packet(call, skb); rxrpc_put_call(call, rxrpc_call_put_input); - return ret; + return true; } /* @@ -444,6 +443,8 @@ int rxrpc_io_thread(void *data) ktime_t now; #endif bool should_stop; + LIST_HEAD(conn_attend_q); + LIST_HEAD(call_attend_q); complete(&local->io_thread_ready); @@ -454,43 +455,25 @@ int rxrpc_io_thread(void *data) for (;;) { rxrpc_inc_stat(local->rxnet, stat_io_loop); - /* Deal with connections that want immediate attention. */ - conn = list_first_entry_or_null(&local->conn_attend_q, - struct rxrpc_connection, - attend_link); - if (conn) { - spin_lock_bh(&local->lock); - list_del_init(&conn->attend_link); - spin_unlock_bh(&local->lock); - - rxrpc_input_conn_event(conn, NULL); - rxrpc_put_connection(conn, rxrpc_conn_put_poke); - continue; + /* Inject a delay into packets if requested. */ +#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY + now = ktime_get_real(); + while ((skb = skb_peek(&local->rx_delay_queue))) { + if (ktime_before(now, skb->tstamp)) + break; + skb = skb_dequeue(&local->rx_delay_queue); + skb_queue_tail(&local->rx_queue, skb); } +#endif - if (test_and_clear_bit(RXRPC_CLIENT_CONN_REAP_TIMER, - &local->client_conn_flags)) - rxrpc_discard_expired_client_conns(local); - - /* Deal with calls that want immediate attention. */ - if ((call = list_first_entry_or_null(&local->call_attend_q, - struct rxrpc_call, - attend_link))) { - spin_lock_bh(&local->lock); - list_del_init(&call->attend_link); - spin_unlock_bh(&local->lock); - - trace_rxrpc_call_poked(call); - rxrpc_input_call_event(call, NULL); - rxrpc_put_call(call, rxrpc_call_put_poke); - continue; + if (!skb_queue_empty(&local->rx_queue)) { + spin_lock_irq(&local->rx_queue.lock); + skb_queue_splice_tail_init(&local->rx_queue, &rx_queue); + spin_unlock_irq(&local->rx_queue.lock); } - if (!list_empty(&local->new_client_calls)) - rxrpc_connect_client_calls(local); - - /* Process received packets and errors. */ - if ((skb = __skb_dequeue(&rx_queue))) { + /* Distribute packets and errors. */ + while ((skb = __skb_dequeue(&rx_queue))) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); switch (skb->mark) { case RXRPC_SKB_MARK_PACKET: @@ -514,27 +497,46 @@ int rxrpc_io_thread(void *data) rxrpc_free_skb(skb, rxrpc_skb_put_unknown); break; } - continue; } - /* Inject a delay into packets if requested. */ -#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY - now = ktime_get_real(); - while ((skb = skb_peek(&local->rx_delay_queue))) { - if (ktime_before(now, skb->tstamp)) - break; - skb = skb_dequeue(&local->rx_delay_queue); - skb_queue_tail(&local->rx_queue, skb); + /* Deal with connections that want immediate attention. */ + spin_lock_bh(&local->lock); + list_splice_tail_init(&local->conn_attend_q, &conn_attend_q); + spin_unlock_bh(&local->lock); + + while ((conn = list_first_entry_or_null(&conn_attend_q, + struct rxrpc_connection, + attend_link))) { + spin_lock_bh(&local->lock); + list_del_init(&conn->attend_link); + spin_unlock_bh(&local->lock); + rxrpc_input_conn_event(conn, NULL); + rxrpc_put_connection(conn, rxrpc_conn_put_poke); } -#endif - if (!skb_queue_empty(&local->rx_queue)) { - spin_lock_irq(&local->rx_queue.lock); - skb_queue_splice_tail_init(&local->rx_queue, &rx_queue); - spin_unlock_irq(&local->rx_queue.lock); - continue; + if (test_and_clear_bit(RXRPC_CLIENT_CONN_REAP_TIMER, + &local->client_conn_flags)) + rxrpc_discard_expired_client_conns(local); + + /* Deal with calls that want immediate attention. */ + spin_lock_bh(&local->lock); + list_splice_tail_init(&local->call_attend_q, &call_attend_q); + spin_unlock_bh(&local->lock); + + while ((call = list_first_entry_or_null(&call_attend_q, + struct rxrpc_call, + attend_link))) { + spin_lock_bh(&local->lock); + list_del_init(&call->attend_link); + spin_unlock_bh(&local->lock); + trace_rxrpc_call_poked(call); + rxrpc_input_call_event(call); + rxrpc_put_call(call, rxrpc_call_put_poke); } + if (!list_empty(&local->new_client_calls)) + rxrpc_connect_client_calls(local); + set_current_state(TASK_INTERRUPTIBLE); should_stop = kthread_should_stop(); if (!skb_queue_empty(&local->rx_queue) || diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 8fc9464a960ca..ff30e0c055079 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -224,7 +224,7 @@ static void rxrpc_distribute_error(struct rxrpc_peer *peer, struct sk_buff *skb, rxrpc_see_call(call, rxrpc_call_see_distribute_error); rxrpc_set_call_completion(call, compl, 0, -err); - rxrpc_input_call_event(call, skb); + rxrpc_input_call_event(call); spin_lock(&peer->lock); } From cd69a07b6d186eeb7df20d8bcbef18d7bbd84c4b Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:42 +0000 Subject: [PATCH 0291/1450] rxrpc: Fix injection of packet loss Fix the code that injects packet loss for testing to make sure call->tx_transmitted is updated. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-15-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/output.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index aededdd474d71..ca0da5e5d2784 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -544,16 +544,6 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t len = rxrpc_prepare_data_packet(call, txb, n); - if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { - static int lose; - if ((lose++ & 7) == 7) { - ret = 0; - trace_rxrpc_tx_data(call, txb->seq, txb->serial, - txb->flags, true); - goto done; - } - } - iov_iter_kvec(&msg.msg_iter, WRITE, call->local->kvec, n, len); msg.msg_name = &call->peer->srx.transport; @@ -579,6 +569,17 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t frag = rxrpc_tx_point_call_data_nofrag; } + if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { + static int lose; + + if ((lose++ & 7) == 7) { + ret = 0; + trace_rxrpc_tx_data(call, txb->seq, txb->serial, + txb->flags, true); + goto done; + } + } + retry: /* send the packet by UDP * - returns -EMSGSIZE if UDP would have to fragment the packet From 81e7761be58aa915cc825afc6ff35dec63bf0b2f Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:43 +0000 Subject: [PATCH 0292/1450] rxrpc: Only set DF=1 on initial DATA transmission Change how the DF flag is managed on DATA transmissions. Set it on initial transmission and don't set it on retransmissions. Then remove the handling for EMSGSIZE in rxrpc_send_data_packet() and just pretend it didn't happen, leaving it to the retransmission path to retry. The path-MTU discovery using PING ACKs is then used to probe for the maximum DATA size - though notification by ICMP will be used if one is received. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-16-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/ar-internal.h | 1 + net/rxrpc/output.c | 32 ++++++++++++++++---------------- net/rxrpc/proc.c | 5 +++-- 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 55cc68dd1b403..84efa21f176cc 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -98,6 +98,7 @@ struct rxrpc_net { atomic_t stat_tx_data_send; atomic_t stat_tx_data_send_frag; atomic_t stat_tx_data_send_fail; + atomic_t stat_tx_data_send_msgsize; atomic_t stat_tx_data_underflow; atomic_t stat_tx_data_cwnd_reset; atomic_t stat_rx_data; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index ca0da5e5d2784..3d992023f80f2 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -552,16 +552,11 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t msg.msg_controllen = 0; msg.msg_flags = MSG_SPLICE_PAGES; - /* Track what we've attempted to transmit at least once so that the - * retransmission algorithm doesn't try to resend what we haven't sent - * yet. + /* Send the packet with the don't fragment bit set unless we think it's + * too big or if this is a retransmission. */ - if (txb->seq == call->tx_transmitted + 1) - call->tx_transmitted = txb->seq + n - 1; - - /* send the packet with the don't fragment bit set if we currently - * think it's small enough */ - if (len >= sizeof(struct rxrpc_wire_header) + call->peer->max_data) { + if (txb->seq == call->tx_transmitted + 1 && + len >= sizeof(struct rxrpc_wire_header) + call->peer->max_data) { rxrpc_local_dont_fragment(conn->local, false); frag = rxrpc_tx_point_call_data_frag; } else { @@ -569,6 +564,13 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t frag = rxrpc_tx_point_call_data_nofrag; } + /* Track what we've attempted to transmit at least once so that the + * retransmission algorithm doesn't try to resend what we haven't sent + * yet. + */ + if (txb->seq == call->tx_transmitted + 1) + call->tx_transmitted = txb->seq + n - 1; + if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { static int lose; @@ -580,7 +582,6 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t } } -retry: /* send the packet by UDP * - returns -EMSGSIZE if UDP would have to fragment the packet * to go out of the interface @@ -591,7 +592,11 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t ret = do_udp_sendmsg(conn->local->socket, &msg, len); conn->peer->last_tx_at = ktime_get_seconds(); - if (ret < 0) { + if (ret == -EMSGSIZE) { + rxrpc_inc_stat(call->rxnet, stat_tx_data_send_msgsize); + trace_rxrpc_tx_packet(call->debug_id, call->local->kvec[0].iov_base, frag); + ret = 0; + } else if (ret < 0) { rxrpc_inc_stat(call->rxnet, stat_tx_data_send_fail); trace_rxrpc_tx_fail(call->debug_id, txb->serial, ret, frag); } else { @@ -599,11 +604,6 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t } rxrpc_tx_backoff(call, ret); - if (ret == -EMSGSIZE && frag == rxrpc_tx_point_call_data_nofrag) { - rxrpc_local_dont_fragment(conn->local, false); - frag = rxrpc_tx_point_call_data_frag; - goto retry; - } done: if (ret >= 0) { diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 44722c2260648..249e1ed9c5c9f 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -473,10 +473,11 @@ int rxrpc_stats_show(struct seq_file *seq, void *v) struct rxrpc_net *rxnet = rxrpc_net(seq_file_single_net(seq)); seq_printf(seq, - "Data : send=%u sendf=%u fail=%u\n", + "Data : send=%u sendf=%u fail=%u emsz=%u\n", atomic_read(&rxnet->stat_tx_data_send), atomic_read(&rxnet->stat_tx_data_send_frag), - atomic_read(&rxnet->stat_tx_data_send_fail)); + atomic_read(&rxnet->stat_tx_data_send_fail), + atomic_read(&rxnet->stat_tx_data_send_msgsize)); seq_printf(seq, "Data-Tx : nr=%u retrans=%u uf=%u cwr=%u\n", atomic_read(&rxnet->stat_tx_data), From 976b0ca5aae741ef33f4cf4079a9a026331eae88 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:44 +0000 Subject: [PATCH 0293/1450] rxrpc: Timestamp DATA packets before transmitting them Move to setting the timestamp on DATA packets before transmitting them as part of the preparation. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-17-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/output.c | 56 ++++++++++++++-------------------------------- 1 file changed, 17 insertions(+), 39 deletions(-) diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 3d992023f80f2..400c3389d4925 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -377,7 +377,8 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call) */ static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_txbuf *txb, rxrpc_serial_t serial, - int subpkt, int nr_subpkts) + int subpkt, int nr_subpkts, + ktime_t now) { struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; struct rxrpc_jumbo_header *jumbo = (void *)(whdr + 1) - sizeof(*jumbo); @@ -437,8 +438,9 @@ static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc rxrpc_inc_stat(call->rxnet, stat_why_req_ack[why]); trace_rxrpc_req_ack(call->debug_id, txb->seq, why); if (why != rxrpc_reqack_no_srv_last) { - txb->flags |= RXRPC_REQUEST_ACK; flags |= RXRPC_REQUEST_ACK; + rxrpc_begin_rtt_probe(call, serial, now, rxrpc_rtt_tx_data); + call->peer->rtt_last_req = now; } dont_set_request_ack: @@ -474,49 +476,25 @@ static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_tx { struct rxrpc_txbuf *txb = head; rxrpc_serial_t serial; + ktime_t now = ktime_get_real(); size_t len = 0; /* Each transmission of a Tx packet needs a new serial number */ serial = rxrpc_get_next_serials(call->conn, n); for (int i = 0; i < n; i++) { - len += rxrpc_prepare_data_subpacket(call, txb, serial, i, n); - serial++; - txb = list_next_entry(txb, call_link); - } - - return len; -} - -/* - * Set timeouts after transmitting a packet. - */ -static void rxrpc_tstamp_data_packets(struct rxrpc_call *call, struct rxrpc_txbuf *txb, int n) -{ - rxrpc_serial_t serial; - ktime_t now = ktime_get_real(); - bool ack_requested = txb->flags & RXRPC_REQUEST_ACK; - int i; - - call->tx_last_sent = now; - - for (i = 0; i < n; i++) { txb->last_sent = now; - ack_requested |= txb->flags & RXRPC_REQUEST_ACK; - serial = txb->serial; + len += rxrpc_prepare_data_subpacket(call, txb, serial, i, n, now); + serial++; txb = list_next_entry(txb, call_link); } - if (ack_requested) { - rxrpc_begin_rtt_probe(call, serial, now, rxrpc_rtt_tx_data); - - call->peer->rtt_last_req = now; - if (call->peer->rtt_count > 1) { - ktime_t delay = rxrpc_get_rto_backoff(call->peer, false); + /* Set timeouts */ + if (call->peer->rtt_count > 1) { + ktime_t delay = rxrpc_get_rto_backoff(call->peer, false); - call->ack_lost_at = ktime_add(now, delay); - trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_lost_ack); - } + call->ack_lost_at = ktime_add(now, delay); + trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_lost_ack); } if (!test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags)) { @@ -527,6 +505,7 @@ static void rxrpc_tstamp_data_packets(struct rxrpc_call *call, struct rxrpc_txbu } rxrpc_set_keepalive(call, now); + return len; } /* @@ -538,6 +517,7 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t enum rxrpc_tx_point frag; struct msghdr msg; size_t len; + bool new_call = test_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags); int ret; _enter("%x,{%d}", txb->seq, txb->pkt_len); @@ -605,20 +585,18 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t rxrpc_tx_backoff(call, ret); -done: - if (ret >= 0) { - rxrpc_tstamp_data_packets(call, txb, n); - } else { + if (ret < 0) { /* Cancel the call if the initial transmission fails, * particularly if that's due to network routing issues that * aren't going away anytime soon. The layer above can arrange * the retransmission. */ - if (!test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags)) + if (new_call) rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, RX_USER_ABORT, ret); } +done: _leave(" = %d [%u]", ret, call->peer->max_data); return ret; } From 6396b48ac0a77165f9c2c40ab03d6c8188c89739 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:45 +0000 Subject: [PATCH 0294/1450] rxrpc: Don't need barrier for ->tx_bottom and ->acks_hard_ack We don't need a barrier for the ->tx_bottom value (which indicates the lowest sequence still in the transmission queue) and the ->acks_hard_ack value (which tracks the DATA packets hard-ack'd by the latest ACK packet received and thus indicates which DATA packets can now be discarded) as the app thread doesn't use either value as a reference to memory to access. Rather, the app thread merely uses these as a guide to how much space is available in the transmission queue Change the code to use READ/WRITE_ONCE() instead. Also, change rxrpc_check_tx_space() to use the same value for tx_bottom throughout. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-18-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/sendmsg.c | 8 +++++--- net/rxrpc/txbuf.c | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 786c1fb1369ab..467c9402882e6 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -94,9 +94,11 @@ static int rxrpc_wait_to_be_connected(struct rxrpc_call *call, long *timeo) */ static bool rxrpc_check_tx_space(struct rxrpc_call *call, rxrpc_seq_t *_tx_win) { + rxrpc_seq_t tx_bottom = READ_ONCE(call->tx_bottom); + if (_tx_win) - *_tx_win = call->tx_bottom; - return call->tx_prepared - call->tx_bottom < 256; + *_tx_win = tx_bottom; + return call->tx_prepared - tx_bottom < 256; } /* @@ -138,7 +140,7 @@ static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx, rtt = 2; timeout = rtt; - tx_start = smp_load_acquire(&call->acks_hard_ack); + tx_start = READ_ONCE(call->acks_hard_ack); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c index 8b7c854ed3d75..0cc8f49d69a9e 100644 --- a/net/rxrpc/txbuf.c +++ b/net/rxrpc/txbuf.c @@ -214,14 +214,14 @@ void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *call) while ((txb = list_first_entry_or_null(&call->tx_buffer, struct rxrpc_txbuf, call_link))) { - hard_ack = smp_load_acquire(&call->acks_hard_ack); + hard_ack = call->acks_hard_ack; if (before(hard_ack, txb->seq)) break; if (txb->seq != call->tx_bottom + 1) rxrpc_see_txbuf(txb, rxrpc_txbuf_see_out_of_step); ASSERTCMP(txb->seq, ==, call->tx_bottom + 1); - smp_store_release(&call->tx_bottom, call->tx_bottom + 1); + WRITE_ONCE(call->tx_bottom, call->tx_bottom + 1); list_del_rcu(&txb->call_link); trace_rxrpc_txqueue(call, rxrpc_txqueue_dequeue); From b341a0263b1b804d329f864c2dc24815364510ec Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:46 +0000 Subject: [PATCH 0295/1450] rxrpc: Implement progressive transmission queue struct We need to scan the buffers in the transmission queue occasionally when processing ACKs, but the transmission queue is currently a linked list of transmission buffers which, when we eventually expand the Tx window to 8192 packets will be very slow to walk. Instead, pull the fields we need to examine a lot (last sent time, retransmitted flag) into a new struct rxrpc_txqueue and make each one hold an array of 32 or 64 packets. The transmission queue is then a list of these structs, each pointing to a contiguous set of packets. Scanning is then a lot faster as the flags and timestamps are concentrated in the CPU dcache. The transmission timestamps are stored as a number of microseconds from a base ktime to reduce memory requirements. This should be fine provided we manage to transmit an entire buffer within an hour. This will make implementing RACK-TLP [RFC8985] easier as it will be less costly to scan the transmission buffers. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-19-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 98 ++++++++++++++--- net/rxrpc/ar-internal.h | 47 ++++++-- net/rxrpc/call_event.c | 204 ++++++++++++++++++++++------------- net/rxrpc/call_object.c | 38 ++++--- net/rxrpc/input.c | 72 ++++++++++--- net/rxrpc/output.c | 163 ++++++++++++++-------------- net/rxrpc/sendmsg.c | 69 +++++++++--- net/rxrpc/txbuf.c | 41 +------ 8 files changed, 467 insertions(+), 265 deletions(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 28fa7be31ff83..e6cf9ec940aac 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -297,7 +297,6 @@ #define rxrpc_txqueue_traces \ EM(rxrpc_txqueue_await_reply, "AWR") \ - EM(rxrpc_txqueue_dequeue, "DEQ") \ EM(rxrpc_txqueue_end, "END") \ EM(rxrpc_txqueue_queue, "QUE") \ EM(rxrpc_txqueue_queue_last, "QLS") \ @@ -482,6 +481,19 @@ EM(rxrpc_txbuf_see_send_more, "SEE SEND+ ") \ E_(rxrpc_txbuf_see_unacked, "SEE UNACKED") +#define rxrpc_tq_traces \ + EM(rxrpc_tq_alloc, "ALLOC") \ + EM(rxrpc_tq_cleaned, "CLEAN") \ + EM(rxrpc_tq_decant, "DCNT ") \ + EM(rxrpc_tq_decant_advance, "DCNT>") \ + EM(rxrpc_tq_queue, "QUEUE") \ + EM(rxrpc_tq_queue_dup, "QUE!!") \ + EM(rxrpc_tq_rotate, "ROT ") \ + EM(rxrpc_tq_rotate_and_free, "ROT-F") \ + EM(rxrpc_tq_rotate_and_keep, "ROT-K") \ + EM(rxrpc_tq_transmit, "XMIT ") \ + E_(rxrpc_tq_transmit_advance, "XMIT>") + #define rxrpc_pmtud_reduce_traces \ EM(rxrpc_pmtud_reduce_ack, "Ack ") \ EM(rxrpc_pmtud_reduce_icmp, "Icmp ") \ @@ -518,6 +530,7 @@ enum rxrpc_rtt_tx_trace { rxrpc_rtt_tx_traces } __mode(byte); enum rxrpc_sack_trace { rxrpc_sack_traces } __mode(byte); enum rxrpc_skb_trace { rxrpc_skb_traces } __mode(byte); enum rxrpc_timer_trace { rxrpc_timer_traces } __mode(byte); +enum rxrpc_tq_trace { rxrpc_tq_traces } __mode(byte); enum rxrpc_tx_point { rxrpc_tx_points } __mode(byte); enum rxrpc_txbuf_trace { rxrpc_txbuf_traces } __mode(byte); enum rxrpc_txqueue_trace { rxrpc_txqueue_traces } __mode(byte); @@ -554,6 +567,7 @@ rxrpc_rtt_tx_traces; rxrpc_sack_traces; rxrpc_skb_traces; rxrpc_timer_traces; +rxrpc_tq_traces; rxrpc_tx_points; rxrpc_txbuf_traces; rxrpc_txqueue_traces; @@ -881,7 +895,7 @@ TRACE_EVENT(rxrpc_txqueue, __field(rxrpc_seq_t, acks_hard_ack) __field(rxrpc_seq_t, tx_bottom) __field(rxrpc_seq_t, tx_top) - __field(rxrpc_seq_t, tx_prepared) + __field(rxrpc_seq_t, send_top) __field(int, tx_winsize) ), @@ -891,7 +905,7 @@ TRACE_EVENT(rxrpc_txqueue, __entry->acks_hard_ack = call->acks_hard_ack; __entry->tx_bottom = call->tx_bottom; __entry->tx_top = call->tx_top; - __entry->tx_prepared = call->tx_prepared; + __entry->send_top = call->send_top; __entry->tx_winsize = call->tx_winsize; ), @@ -902,14 +916,14 @@ TRACE_EVENT(rxrpc_txqueue, __entry->acks_hard_ack, __entry->tx_top - __entry->tx_bottom, __entry->tx_top - __entry->acks_hard_ack, - __entry->tx_prepared - __entry->tx_bottom, + __entry->send_top - __entry->tx_top, __entry->tx_winsize) ); TRACE_EVENT(rxrpc_transmit, - TP_PROTO(struct rxrpc_call *call, int space), + TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t send_top, int space), - TP_ARGS(call, space), + TP_ARGS(call, send_top, space), TP_STRUCT__entry( __field(unsigned int, call) @@ -925,12 +939,12 @@ TRACE_EVENT(rxrpc_transmit, TP_fast_assign( __entry->call = call->debug_id; - __entry->seq = call->tx_bottom; + __entry->seq = call->tx_top + 1; __entry->space = space; __entry->tx_winsize = call->tx_winsize; __entry->cong_cwnd = call->cong_cwnd; __entry->cong_extra = call->cong_extra; - __entry->prepared = call->tx_prepared - call->tx_bottom; + __entry->prepared = send_top - call->tx_bottom; __entry->in_flight = call->tx_top - call->acks_hard_ack; __entry->pmtud_jumbo = call->peer->pmtud_jumbo; ), @@ -947,6 +961,32 @@ TRACE_EVENT(rxrpc_transmit, __entry->pmtud_jumbo) ); +TRACE_EVENT(rxrpc_tx_rotate, + TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq, rxrpc_seq_t to), + + TP_ARGS(call, seq, to), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(rxrpc_seq_t, seq) + __field(rxrpc_seq_t, to) + __field(rxrpc_seq_t, top) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->seq = seq; + __entry->to = to; + __entry->top = call->tx_top; + ), + + TP_printk("c=%08x q=%08x-%08x-%08x", + __entry->call, + __entry->seq, + __entry->to, + __entry->top) + ); + TRACE_EVENT(rxrpc_rx_data, TP_PROTO(unsigned int call, rxrpc_seq_t seq, rxrpc_serial_t serial, u8 flags), @@ -1621,10 +1661,11 @@ TRACE_EVENT(rxrpc_drop_ack, ); TRACE_EVENT(rxrpc_retransmit, - TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq, - rxrpc_serial_t serial, ktime_t expiry), + TP_PROTO(struct rxrpc_call *call, + struct rxrpc_send_data_req *req, + struct rxrpc_txbuf *txb, ktime_t expiry), - TP_ARGS(call, seq, serial, expiry), + TP_ARGS(call, req, txb, expiry), TP_STRUCT__entry( __field(unsigned int, call) @@ -1635,8 +1676,8 @@ TRACE_EVENT(rxrpc_retransmit, TP_fast_assign( __entry->call = call->debug_id; - __entry->seq = seq; - __entry->serial = serial; + __entry->seq = req->seq; + __entry->serial = txb->serial; __entry->expiry = expiry; ), @@ -1714,9 +1755,9 @@ TRACE_EVENT(rxrpc_reset_cwnd, __entry->cwnd = call->cong_cwnd; __entry->extra = call->cong_extra; __entry->hard_ack = call->acks_hard_ack; - __entry->prepared = call->tx_prepared - call->tx_bottom; + __entry->prepared = call->send_top - call->tx_bottom; __entry->since_last_tx = ktime_sub(now, call->tx_last_sent); - __entry->has_data = !list_empty(&call->tx_sendmsg); + __entry->has_data = call->tx_bottom != call->tx_top; ), TP_printk("c=%08x q=%08x %s cw=%u+%u pr=%u tm=%llu d=%u", @@ -2024,6 +2065,33 @@ TRACE_EVENT(rxrpc_txbuf, __entry->ref) ); +TRACE_EVENT(rxrpc_tq, + TP_PROTO(struct rxrpc_call *call, struct rxrpc_txqueue *tq, + rxrpc_seq_t seq, enum rxrpc_tq_trace trace), + + TP_ARGS(call, tq, seq, trace), + + TP_STRUCT__entry( + __field(unsigned int, call_debug_id) + __field(rxrpc_seq_t, qbase) + __field(rxrpc_seq_t, seq) + __field(enum rxrpc_tq_trace, trace) + ), + + TP_fast_assign( + __entry->call_debug_id = call->debug_id; + __entry->qbase = tq ? tq->qbase : call->tx_qbase; + __entry->seq = seq; + __entry->trace = trace; + ), + + TP_printk("c=%08x bq=%08x q=%08x %s", + __entry->call_debug_id, + __entry->qbase, + __entry->seq, + __print_symbolic(__entry->trace, rxrpc_tq_traces)) + ); + TRACE_EVENT(rxrpc_poke_call, TP_PROTO(struct rxrpc_call *call, bool busy, enum rxrpc_call_poke_trace what), diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 84efa21f176cc..bcce4862b0b75 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -30,6 +30,7 @@ struct rxrpc_crypt { struct key_preparsed_payload; struct rxrpc_connection; struct rxrpc_txbuf; +struct rxrpc_txqueue; /* * Mark applied to socket buffers in skb->mark. skb->priority is used @@ -691,13 +692,17 @@ struct rxrpc_call { unsigned short rx_pkt_offset; /* Current recvmsg packet offset */ unsigned short rx_pkt_len; /* Current recvmsg packet len */ + /* Sendmsg data tracking. */ + rxrpc_seq_t send_top; /* Highest Tx slot filled by sendmsg. */ + struct rxrpc_txqueue *send_queue; /* Queue that sendmsg is writing into */ + /* Transmitted data tracking. */ spinlock_t tx_lock; /* Transmit queue lock */ - struct list_head tx_sendmsg; /* Sendmsg prepared packets */ - struct list_head tx_buffer; /* Buffer of transmissible packets */ + struct rxrpc_txqueue *tx_queue; /* Start of transmission buffers */ + struct rxrpc_txqueue *tx_qtail; /* End of transmission buffers */ + rxrpc_seq_t tx_qbase; /* First slot in tx_queue */ rxrpc_seq_t tx_bottom; /* First packet in buffer */ rxrpc_seq_t tx_transmitted; /* Highest packet transmitted */ - rxrpc_seq_t tx_prepared; /* Highest Tx slot prepared. */ rxrpc_seq_t tx_top; /* Highest Tx slot allocated. */ u16 tx_backoff; /* Delay to insert due to Tx failure (ms) */ u8 tx_winsize; /* Maximum size of Tx window */ @@ -815,9 +820,6 @@ struct rxrpc_send_params { * Buffer of data to be output as a packet. */ struct rxrpc_txbuf { - struct list_head call_link; /* Link in call->tx_sendmsg/tx_buffer */ - struct list_head tx_link; /* Link in live Enc queue or Tx queue */ - ktime_t last_sent; /* Time at which last transmitted */ refcount_t ref; rxrpc_seq_t seq; /* Sequence number of this packet */ rxrpc_serial_t serial; /* Last serial number transmitted with */ @@ -849,6 +851,36 @@ static inline bool rxrpc_sending_to_client(const struct rxrpc_txbuf *txb) return !rxrpc_sending_to_server(txb); } +/* + * Transmit queue element, including RACK [RFC8985] per-segment metadata. The + * transmission timestamp is in usec from the base. + */ +struct rxrpc_txqueue { + /* Start with the members we want to prefetch. */ + struct rxrpc_txqueue *next; + ktime_t xmit_ts_base; + rxrpc_seq_t qbase; + + /* The arrays we want to pack into as few cache lines as possible. */ + struct { +#define RXRPC_NR_TXQUEUE BITS_PER_LONG +#define RXRPC_TXQ_MASK (RXRPC_NR_TXQUEUE - 1) + struct rxrpc_txbuf *bufs[RXRPC_NR_TXQUEUE]; + unsigned int segment_xmit_ts[RXRPC_NR_TXQUEUE]; + } ____cacheline_aligned; +}; + +/* + * Data transmission request. + */ +struct rxrpc_send_data_req { + ktime_t now; /* Current time */ + struct rxrpc_txqueue *tq; /* Tx queue segment holding first DATA */ + rxrpc_seq_t seq; /* Sequence of first data */ + int n; /* Number of DATA packets to glue into jumbo */ + bool did_send; /* T if did actually send */ +}; + #include /* @@ -905,7 +937,6 @@ void rxrpc_propose_ping(struct rxrpc_call *call, u32 serial, enum rxrpc_propose_ack_trace why); void rxrpc_propose_delay_ACK(struct rxrpc_call *, rxrpc_serial_t, enum rxrpc_propose_ack_trace); -void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *); void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb); bool rxrpc_input_call_event(struct rxrpc_call *call); @@ -1191,10 +1222,10 @@ void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why); void rxrpc_send_probe_for_pmtud(struct rxrpc_call *call); int rxrpc_send_abort_packet(struct rxrpc_call *); +void rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req *req); void rxrpc_send_conn_abort(struct rxrpc_connection *conn); void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb); void rxrpc_send_keepalive(struct rxrpc_peer *); -void rxrpc_transmit_data(struct rxrpc_call *call, struct rxrpc_txbuf *txb, int n); /* * peer_event.c diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index ef47de3f41c6d..90e3d93956750 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -62,57 +62,85 @@ static void rxrpc_congestion_timeout(struct rxrpc_call *call) set_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags); } +/* + * Retransmit one or more packets. + */ +static void rxrpc_retransmit_data(struct rxrpc_call *call, + struct rxrpc_send_data_req *req, + ktime_t rto) +{ + struct rxrpc_txqueue *tq = req->tq; + unsigned int ix = req->seq & RXRPC_TXQ_MASK; + struct rxrpc_txbuf *txb = tq->bufs[ix]; + ktime_t xmit_ts, resend_at; + + _enter("%x,%x,%x,%x", tq->qbase, req->seq, ix, txb->debug_id); + + xmit_ts = ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[ix]); + resend_at = ktime_add(xmit_ts, rto); + trace_rxrpc_retransmit(call, req, txb, + ktime_sub(resend_at, req->now)); + + txb->flags |= RXRPC_TXBUF_RESENT; + rxrpc_send_data_packet(call, req); + rxrpc_inc_stat(call->rxnet, stat_tx_data_retrans); + + req->tq = NULL; + req->n = 0; + req->did_send = true; + req->now = ktime_get_real(); +} + /* * Perform retransmission of NAK'd and unack'd packets. */ void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb) { + struct rxrpc_send_data_req req = { + .now = ktime_get_real(), + }; struct rxrpc_ackpacket *ack = NULL; struct rxrpc_skb_priv *sp; + struct rxrpc_txqueue *tq; struct rxrpc_txbuf *txb; - rxrpc_seq_t transmitted = call->tx_transmitted; + rxrpc_seq_t transmitted = call->tx_transmitted, seq; ktime_t next_resend = KTIME_MAX, rto = ns_to_ktime(call->peer->rto_us * NSEC_PER_USEC); - ktime_t resend_at = KTIME_MAX, now, delay; + ktime_t resend_at = KTIME_MAX, delay; bool unacked = false, did_send = false; - unsigned int i; + unsigned int qix; _enter("{%d,%d}", call->acks_hard_ack, call->tx_top); - now = ktime_get_real(); - - if (list_empty(&call->tx_buffer)) + if (call->tx_bottom == call->tx_top) goto no_resend; trace_rxrpc_resend(call, ack_skb); - txb = list_first_entry(&call->tx_buffer, struct rxrpc_txbuf, call_link); + tq = call->tx_queue; + seq = call->tx_bottom; - /* Scan the soft ACK table without dropping the lock and resend any - * explicitly NAK'd packets. - */ + /* Scan the soft ACK table and resend any explicitly NAK'd packets. */ if (ack_skb) { sp = rxrpc_skb(ack_skb); ack = (void *)ack_skb->data + sizeof(struct rxrpc_wire_header); - for (i = 0; i < sp->ack.nr_acks; i++) { - rxrpc_seq_t seq; + for (int i = 0; i < sp->ack.nr_acks; i++) { + rxrpc_seq_t aseq; if (ack->acks[i] & 1) continue; - seq = sp->ack.first_ack + i; - if (after(txb->seq, transmitted)) - break; - if (after(txb->seq, seq)) - continue; /* A new hard ACK probably came in */ - list_for_each_entry_from(txb, &call->tx_buffer, call_link) { - if (txb->seq == seq) - goto found_txb; - } - goto no_further_resend; + aseq = sp->ack.first_ack + i; + while (after_eq(aseq, tq->qbase + RXRPC_NR_TXQUEUE)) + tq = tq->next; + seq = aseq; + qix = seq - tq->qbase; + txb = tq->bufs[qix]; + if (after(seq, transmitted)) + goto no_further_resend; - found_txb: - resend_at = ktime_add(txb->last_sent, rto); + resend_at = ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[qix]); + resend_at = ktime_add(resend_at, rto); if (after(txb->serial, call->acks_highest_serial)) { - if (ktime_after(resend_at, now) && + if (ktime_after(resend_at, req.now) && ktime_before(resend_at, next_resend)) next_resend = resend_at; continue; /* Ack point not yet reached */ @@ -120,17 +148,13 @@ void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb) rxrpc_see_txbuf(txb, rxrpc_txbuf_see_unacked); - trace_rxrpc_retransmit(call, txb->seq, txb->serial, - ktime_sub(resend_at, now)); - - txb->flags |= RXRPC_TXBUF_RESENT; - rxrpc_transmit_data(call, txb, 1); - did_send = true; - now = ktime_get_real(); + req.tq = tq; + req.seq = seq; + req.n = 1; + rxrpc_retransmit_data(call, &req, rto); - if (list_is_last(&txb->call_link, &call->tx_buffer)) + if (after_eq(seq, call->tx_top)) goto no_further_resend; - txb = list_next_entry(txb, call_link); } } @@ -139,35 +163,43 @@ void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb) * ACK'd or NACK'd in due course, so don't worry about it here; here we * need to consider retransmitting anything beyond that point. */ - if (after_eq(call->acks_prev_seq, call->tx_transmitted)) + seq = call->acks_prev_seq; + if (after_eq(seq, call->tx_transmitted)) goto no_further_resend; + seq++; - list_for_each_entry_from(txb, &call->tx_buffer, call_link) { - resend_at = ktime_add(txb->last_sent, rto); + while (after_eq(seq, tq->qbase + RXRPC_NR_TXQUEUE)) + tq = tq->next; - if (before_eq(txb->seq, call->acks_prev_seq)) + while (before_eq(seq, call->tx_transmitted)) { + qix = seq - tq->qbase; + if (qix >= RXRPC_NR_TXQUEUE) { + tq = tq->next; continue; - if (after(txb->seq, call->tx_transmitted)) - break; /* Not transmitted yet */ + } + txb = tq->bufs[qix]; + resend_at = ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[qix]); + resend_at = ktime_add(resend_at, rto); if (ack && ack->reason == RXRPC_ACK_PING_RESPONSE && before(txb->serial, ntohl(ack->serial))) goto do_resend; /* Wasn't accounted for by a more recent ping. */ - if (ktime_after(resend_at, now)) { + if (ktime_after(resend_at, req.now)) { if (ktime_before(resend_at, next_resend)) next_resend = resend_at; + seq++; continue; } do_resend: unacked = true; - txb->flags |= RXRPC_TXBUF_RESENT; - rxrpc_transmit_data(call, txb, 1); - did_send = true; - rxrpc_inc_stat(call->rxnet, stat_tx_data_retrans); - now = ktime_get_real(); + req.tq = tq; + req.seq = seq; + req.n = 1; + rxrpc_retransmit_data(call, &req, rto); + seq++; } no_further_resend: @@ -175,7 +207,8 @@ void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb) if (resend_at < KTIME_MAX) { delay = rxrpc_get_rto_backoff(call->peer, did_send); resend_at = ktime_add(resend_at, delay); - trace_rxrpc_timer_set(call, resend_at - now, rxrpc_timer_trace_resend_reset); + trace_rxrpc_timer_set(call, resend_at - req.now, + rxrpc_timer_trace_resend_reset); } call->resend_at = resend_at; @@ -186,11 +219,11 @@ void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb) * that an ACK got lost somewhere. Send a ping to find out instead of * retransmitting data. */ - if (!did_send) { + if (!req.did_send) { ktime_t next_ping = ktime_add_us(call->acks_latest_ts, call->peer->srtt_us >> 3); - if (ktime_sub(next_ping, now) <= 0) + if (ktime_sub(next_ping, req.now) <= 0) rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, rxrpc_propose_ack_ping_for_0_retrans); } @@ -240,47 +273,68 @@ static unsigned int rxrpc_tx_window_space(struct rxrpc_call *call) } /* - * Decant some if the sendmsg prepared queue into the transmission buffer. + * Transmit some as-yet untransmitted data. */ -static void rxrpc_decant_prepared_tx(struct rxrpc_call *call) +static void rxrpc_transmit_fresh_data(struct rxrpc_call *call) { int space = rxrpc_tx_window_space(call); if (!test_bit(RXRPC_CALL_EXPOSED, &call->flags)) { - if (list_empty(&call->tx_sendmsg)) + if (call->send_top == call->tx_top) return; rxrpc_expose_client_call(call); } while (space > 0) { - struct rxrpc_txbuf *head = NULL, *txb; - int count = 0, limit = min(space, 1); - - if (list_empty(&call->tx_sendmsg)) + struct rxrpc_send_data_req req = { + .now = ktime_get_real(), + .seq = call->tx_transmitted + 1, + .n = 0, + }; + struct rxrpc_txqueue *tq; + struct rxrpc_txbuf *txb; + rxrpc_seq_t send_top, seq; + int limit = min(space, 1); + + /* Order send_top before the contents of the new txbufs and + * txqueue pointers + */ + send_top = smp_load_acquire(&call->send_top); + if (call->tx_top == send_top) break; - trace_rxrpc_transmit(call, space); + trace_rxrpc_transmit(call, send_top, space); + + tq = call->tx_qtail; + seq = call->tx_top; + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_decant); - spin_lock(&call->tx_lock); do { - txb = list_first_entry(&call->tx_sendmsg, - struct rxrpc_txbuf, call_link); - if (!head) - head = txb; - list_move_tail(&txb->call_link, &call->tx_buffer); - count++; + int ix; + + seq++; + ix = seq & RXRPC_TXQ_MASK; + if (!ix) { + tq = tq->next; + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_decant_advance); + } + if (!req.tq) + req.tq = tq; + txb = tq->bufs[ix]; + req.n++; if (!txb->jumboable) break; - } while (count < limit && !list_empty(&call->tx_sendmsg)); + } while (req.n < limit && before(seq, send_top)); - spin_unlock(&call->tx_lock); - - call->tx_top = txb->seq; - if (txb->flags & RXRPC_LAST_PACKET) + if (txb->flags & RXRPC_LAST_PACKET) { rxrpc_close_tx_phase(call); + tq = NULL; + } + call->tx_qtail = tq; + call->tx_top = seq; - space -= count; - rxrpc_transmit_data(call, head, count); + space -= req.n; + rxrpc_send_data_packet(call, &req); } } @@ -288,7 +342,7 @@ static void rxrpc_transmit_some_data(struct rxrpc_call *call) { switch (__rxrpc_call_state(call)) { case RXRPC_CALL_SERVER_ACK_REQUEST: - if (list_empty(&call->tx_sendmsg)) + if (call->tx_bottom == READ_ONCE(call->send_top)) return; rxrpc_begin_service_reply(call); fallthrough; @@ -297,11 +351,11 @@ static void rxrpc_transmit_some_data(struct rxrpc_call *call) case RXRPC_CALL_CLIENT_SEND_REQUEST: if (!rxrpc_tx_window_space(call)) return; - if (list_empty(&call->tx_sendmsg)) { + if (call->tx_bottom == READ_ONCE(call->send_top)) { rxrpc_inc_stat(call->rxnet, stat_tx_data_underflow); return; } - rxrpc_decant_prepared_tx(call); + rxrpc_transmit_fresh_data(call); break; default: return; @@ -503,8 +557,6 @@ bool rxrpc_input_call_event(struct rxrpc_call *call) call->peer->pmtud_pending) rxrpc_send_probe_for_pmtud(call); } - if (call->acks_hard_ack != call->tx_bottom) - rxrpc_shrink_call_tx_buffer(call); _leave(""); return true; diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index c026f16f891eb..a9682b31a4f9f 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -146,8 +146,6 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, INIT_LIST_HEAD(&call->recvmsg_link); INIT_LIST_HEAD(&call->sock_link); INIT_LIST_HEAD(&call->attend_link); - INIT_LIST_HEAD(&call->tx_sendmsg); - INIT_LIST_HEAD(&call->tx_buffer); skb_queue_head_init(&call->rx_queue); skb_queue_head_init(&call->recvmsg_queue); skb_queue_head_init(&call->rx_oos_queue); @@ -532,9 +530,26 @@ void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace why) } /* - * Clean up the Rx skb ring. + * Clean up the transmission buffers. */ -static void rxrpc_cleanup_ring(struct rxrpc_call *call) +static void rxrpc_cleanup_tx_buffers(struct rxrpc_call *call) +{ + struct rxrpc_txqueue *tq, *next; + + for (tq = call->tx_queue; tq; tq = next) { + next = tq->next; + for (int i = 0; i < RXRPC_NR_TXQUEUE; i++) + if (tq->bufs[i]) + rxrpc_put_txbuf(tq->bufs[i], rxrpc_txbuf_put_cleaned); + trace_rxrpc_tq(call, tq, 0, rxrpc_tq_cleaned); + kfree(tq); + } +} + +/* + * Clean up the receive buffers. + */ +static void rxrpc_cleanup_rx_buffers(struct rxrpc_call *call) { rxrpc_purge_queue(&call->recvmsg_queue); rxrpc_purge_queue(&call->rx_queue); @@ -673,23 +688,12 @@ static void rxrpc_rcu_free_call(struct rcu_head *rcu) static void rxrpc_destroy_call(struct work_struct *work) { struct rxrpc_call *call = container_of(work, struct rxrpc_call, destroyer); - struct rxrpc_txbuf *txb; del_timer_sync(&call->timer); rxrpc_free_skb(call->cong_last_nack, rxrpc_skb_put_last_nack); - rxrpc_cleanup_ring(call); - while ((txb = list_first_entry_or_null(&call->tx_sendmsg, - struct rxrpc_txbuf, call_link))) { - list_del(&txb->call_link); - rxrpc_put_txbuf(txb, rxrpc_txbuf_put_cleaned); - } - while ((txb = list_first_entry_or_null(&call->tx_buffer, - struct rxrpc_txbuf, call_link))) { - list_del(&txb->call_link); - rxrpc_put_txbuf(txb, rxrpc_txbuf_put_cleaned); - } - + rxrpc_cleanup_tx_buffers(call); + rxrpc_cleanup_rx_buffers(call); rxrpc_put_txbuf(call->tx_pending, rxrpc_txbuf_put_cleaned); rxrpc_put_connection(call->conn, rxrpc_conn_put_call); rxrpc_deactivate_bundle(call->bundle); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 96fe005c5e819..cfdd23042d4c7 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -214,24 +214,71 @@ void rxrpc_congestion_degrade(struct rxrpc_call *call) static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, struct rxrpc_ack_summary *summary) { - struct rxrpc_txbuf *txb; + struct rxrpc_txqueue *tq = call->tx_queue; + rxrpc_seq_t seq = call->tx_bottom + 1; bool rot_last = false; - list_for_each_entry_rcu(txb, &call->tx_buffer, call_link, false) { - if (before_eq(txb->seq, call->acks_hard_ack)) - continue; - if (txb->flags & RXRPC_LAST_PACKET) { + _enter("%x,%x,%x", call->tx_bottom, call->acks_hard_ack, to); + + trace_rxrpc_tx_rotate(call, seq, to); + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate); + + /* We may have a left over fully-consumed buffer at the front that we + * couldn't drop before (rotate_and_keep below). + */ + if (seq == call->tx_qbase + RXRPC_NR_TXQUEUE) { + call->tx_qbase += RXRPC_NR_TXQUEUE; + call->tx_queue = tq->next; + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate_and_free); + kfree(tq); + tq = call->tx_queue; + } + + do { + unsigned int ix = seq - call->tx_qbase; + + _debug("tq=%x seq=%x i=%d f=%x", tq->qbase, seq, ix, tq->bufs[ix]->flags); + if (tq->bufs[ix]->flags & RXRPC_LAST_PACKET) { set_bit(RXRPC_CALL_TX_LAST, &call->flags); rot_last = true; } - if (txb->seq == to) - break; - } + rxrpc_put_txbuf(tq->bufs[ix], rxrpc_txbuf_put_rotated); + tq->bufs[ix] = NULL; + + WRITE_ONCE(call->tx_bottom, seq); + WRITE_ONCE(call->acks_hard_ack, seq); + trace_rxrpc_txqueue(call, (rot_last ? + rxrpc_txqueue_rotate_last : + rxrpc_txqueue_rotate)); - if (rot_last) + seq++; + if (!(seq & RXRPC_TXQ_MASK)) { + prefetch(tq->next); + if (tq != call->tx_qtail) { + call->tx_qbase += RXRPC_NR_TXQUEUE; + call->tx_queue = tq->next; + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate_and_free); + kfree(tq); + tq = call->tx_queue; + } else { + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate_and_keep); + tq = NULL; + break; + } + } + + } while (before_eq(seq, to)); + + if (rot_last) { set_bit(RXRPC_CALL_TX_ALL_ACKED, &call->flags); + if (tq) { + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate_and_free); + kfree(tq); + call->tx_queue = NULL; + } + } - _enter("%x,%x,%x,%d", to, call->acks_hard_ack, call->tx_top, rot_last); + _debug("%x,%x,%x,%d", to, call->acks_hard_ack, call->tx_top, rot_last); if (call->acks_lowest_nak == call->acks_hard_ack) { call->acks_lowest_nak = to; @@ -240,11 +287,6 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, call->acks_lowest_nak = to; } - smp_store_release(&call->acks_hard_ack, to); - - trace_rxrpc_txqueue(call, (rot_last ? - rxrpc_txqueue_rotate_last : - rxrpc_txqueue_rotate)); wake_up(&call->waitq); return rot_last; } diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 400c3389d4925..c2044d5932374 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -375,10 +375,10 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call) /* * Prepare a (sub)packet for transmission. */ -static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_txbuf *txb, - rxrpc_serial_t serial, - int subpkt, int nr_subpkts, - ktime_t now) +static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, + struct rxrpc_send_data_req *req, + struct rxrpc_txbuf *txb, + rxrpc_serial_t serial, int subpkt) { struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; struct rxrpc_jumbo_header *jumbo = (void *)(whdr + 1) - sizeof(*jumbo); @@ -386,7 +386,7 @@ static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc struct rxrpc_connection *conn = call->conn; struct kvec *kv = &call->local->kvec[subpkt]; size_t len = txb->pkt_len; - bool last, more; + bool last; u8 flags; _enter("%x,%zd", txb->seq, len); @@ -401,14 +401,11 @@ static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc flags = txb->flags & RXRPC_TXBUF_WIRE_FLAGS; last = txb->flags & RXRPC_LAST_PACKET; - if (subpkt < nr_subpkts - 1) { + if (subpkt < req->n - 1) { len = RXRPC_JUMBO_DATALEN; goto dont_set_request_ack; } - more = (!list_is_last(&txb->call_link, &call->tx_buffer) || - !list_empty(&call->tx_sendmsg)); - /* If our RTT cache needs working on, request an ACK. Also request * ACKs if a DATA packet appears to have been lost. * @@ -430,7 +427,7 @@ static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc why = rxrpc_reqack_more_rtt; else if (ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), ktime_get_real())) why = rxrpc_reqack_old_rtt; - else if (!last && !more) + else if (!last && !after(READ_ONCE(call->send_top), txb->seq)) why = rxrpc_reqack_app_stall; else goto dont_set_request_ack; @@ -439,13 +436,13 @@ static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc trace_rxrpc_req_ack(call->debug_id, txb->seq, why); if (why != rxrpc_reqack_no_srv_last) { flags |= RXRPC_REQUEST_ACK; - rxrpc_begin_rtt_probe(call, serial, now, rxrpc_rtt_tx_data); - call->peer->rtt_last_req = now; + rxrpc_begin_rtt_probe(call, serial, req->now, rxrpc_rtt_tx_data); + call->peer->rtt_last_req = req->now; } dont_set_request_ack: /* The jumbo header overlays the wire header in the txbuf. */ - if (subpkt < nr_subpkts - 1) + if (subpkt < req->n - 1) flags |= RXRPC_JUMBO_PACKET; else flags &= ~RXRPC_JUMBO_PACKET; @@ -469,62 +466,100 @@ static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc return len; } +/* + * Prepare a transmission queue object for initial transmission. Returns the + * number of microseconds since the transmission queue base timestamp. + */ +static unsigned int rxrpc_prepare_txqueue(struct rxrpc_txqueue *tq, + struct rxrpc_send_data_req *req) +{ + if (!tq) + return 0; + if (tq->xmit_ts_base == KTIME_MIN) { + tq->xmit_ts_base = req->now; + return 0; + } + return ktime_to_us(ktime_sub(req->now, tq->xmit_ts_base)); +} + /* * Prepare a (jumbo) packet for transmission. */ -static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *head, int n) +static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req *req) { - struct rxrpc_txbuf *txb = head; + struct rxrpc_txqueue *tq = req->tq; rxrpc_serial_t serial; - ktime_t now = ktime_get_real(); + unsigned int xmit_ts; + rxrpc_seq_t seq = req->seq; size_t len = 0; + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_transmit); + /* Each transmission of a Tx packet needs a new serial number */ - serial = rxrpc_get_next_serials(call->conn, n); + serial = rxrpc_get_next_serials(call->conn, req->n); - for (int i = 0; i < n; i++) { - txb->last_sent = now; - len += rxrpc_prepare_data_subpacket(call, txb, serial, i, n, now); - serial++; - txb = list_next_entry(txb, call_link); - } + call->tx_last_sent = req->now; + xmit_ts = rxrpc_prepare_txqueue(tq, req); + prefetch(tq->next); - /* Set timeouts */ - if (call->peer->rtt_count > 1) { - ktime_t delay = rxrpc_get_rto_backoff(call->peer, false); + for (int i = 0;;) { + int ix = seq & RXRPC_TXQ_MASK; + struct rxrpc_txbuf *txb = tq->bufs[seq & RXRPC_TXQ_MASK]; - call->ack_lost_at = ktime_add(now, delay); - trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_lost_ack); + _debug("prep[%u] tq=%x q=%x", i, tq->qbase, seq); + tq->segment_xmit_ts[ix] = xmit_ts; + len += rxrpc_prepare_data_subpacket(call, req, txb, serial, i); + serial++; + seq++; + i++; + if (i >= req->n) + break; + if (!(seq & RXRPC_TXQ_MASK)) { + tq = tq->next; + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_transmit_advance); + xmit_ts = rxrpc_prepare_txqueue(tq, req); + } } + /* Set timeouts */ if (!test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags)) { ktime_t delay = ms_to_ktime(READ_ONCE(call->next_rx_timo)); - call->expect_rx_by = ktime_add(now, delay); + call->expect_rx_by = ktime_add(req->now, delay); trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_expect_rx); } + if (call->resend_at == KTIME_MAX) { + ktime_t delay = rxrpc_get_rto_backoff(call->peer, false); + + call->resend_at = ktime_add(req->now, delay); + trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_resend); + } - rxrpc_set_keepalive(call, now); + rxrpc_set_keepalive(call, req->now); return len; } /* - * send a packet through the transport endpoint + * Send one or more packets through the transport endpoint */ -static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb, int n) +void rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req *req) { struct rxrpc_connection *conn = call->conn; enum rxrpc_tx_point frag; + struct rxrpc_txqueue *tq = req->tq; + struct rxrpc_txbuf *txb; struct msghdr msg; + rxrpc_seq_t seq = req->seq; size_t len; bool new_call = test_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags); int ret; - _enter("%x,{%d}", txb->seq, txb->pkt_len); + _enter("%x,%x-%x", tq->qbase, seq, seq + req->n - 1); - len = rxrpc_prepare_data_packet(call, txb, n); + len = rxrpc_prepare_data_packet(call, req); + txb = tq->bufs[seq & RXRPC_TXQ_MASK]; - iov_iter_kvec(&msg.msg_iter, WRITE, call->local->kvec, n, len); + iov_iter_kvec(&msg.msg_iter, WRITE, call->local->kvec, req->n, len); msg.msg_name = &call->peer->srx.transport; msg.msg_namelen = call->peer->srx.transport_len; @@ -535,7 +570,7 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t /* Send the packet with the don't fragment bit set unless we think it's * too big or if this is a retransmission. */ - if (txb->seq == call->tx_transmitted + 1 && + if (seq == call->tx_transmitted + 1 && len >= sizeof(struct rxrpc_wire_header) + call->peer->max_data) { rxrpc_local_dont_fragment(conn->local, false); frag = rxrpc_tx_point_call_data_frag; @@ -548,8 +583,8 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t * retransmission algorithm doesn't try to resend what we haven't sent * yet. */ - if (txb->seq == call->tx_transmitted + 1) - call->tx_transmitted = txb->seq + n - 1; + if (seq == call->tx_transmitted + 1) + call->tx_transmitted = seq + req->n - 1; if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { static int lose; @@ -586,19 +621,21 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t rxrpc_tx_backoff(call, ret); if (ret < 0) { - /* Cancel the call if the initial transmission fails, - * particularly if that's due to network routing issues that - * aren't going away anytime soon. The layer above can arrange - * the retransmission. + /* Cancel the call if the initial transmission fails or if we + * hit due to network routing issues that aren't going away + * anytime soon. The layer above can arrange the + * retransmission. */ - if (new_call) + if (new_call || + ret == -ENETUNREACH || + ret == -EHOSTUNREACH || + ret == -ECONNREFUSED) rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, RX_USER_ABORT, ret); } done: _leave(" = %d [%u]", ret, call->peer->max_data); - return ret; } /* @@ -773,41 +810,3 @@ void rxrpc_send_keepalive(struct rxrpc_peer *peer) peer->last_tx_at = ktime_get_seconds(); _leave(""); } - -/* - * Schedule an instant Tx resend. - */ -static inline void rxrpc_instant_resend(struct rxrpc_call *call, - struct rxrpc_txbuf *txb) -{ - if (!__rxrpc_call_is_complete(call)) - kdebug("resend"); -} - -/* - * Transmit a packet, possibly gluing several subpackets together. - */ -void rxrpc_transmit_data(struct rxrpc_call *call, struct rxrpc_txbuf *txb, int n) -{ - int ret; - - ret = rxrpc_send_data_packet(call, txb, n); - if (ret < 0) { - switch (ret) { - case -ENETUNREACH: - case -EHOSTUNREACH: - case -ECONNREFUSED: - rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, - 0, ret); - break; - default: - _debug("need instant resend %d", ret); - rxrpc_instant_resend(call, txb); - } - } else { - ktime_t delay = ns_to_ktime(call->peer->rto_us * NSEC_PER_USEC); - - call->resend_at = ktime_add(ktime_get_real(), delay); - trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_resend_tx); - } -} diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 467c9402882e6..85b35b11755dd 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -98,7 +98,7 @@ static bool rxrpc_check_tx_space(struct rxrpc_call *call, rxrpc_seq_t *_tx_win) if (_tx_win) *_tx_win = tx_bottom; - return call->tx_prepared - tx_bottom < 256; + return call->send_top - tx_bottom < 256; } /* @@ -242,36 +242,74 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, struct rxrpc_txbuf *txb, rxrpc_notify_end_tx_t notify_end_tx) { + struct rxrpc_txqueue *sq = call->send_queue; rxrpc_seq_t seq = txb->seq; bool poke, last = txb->flags & RXRPC_LAST_PACKET; - + int ix = seq & RXRPC_TXQ_MASK; rxrpc_inc_stat(call->rxnet, stat_tx_data); - ASSERTCMP(txb->seq, ==, call->tx_prepared + 1); - - /* We have to set the timestamp before queueing as the retransmit - * algorithm can see the packet as soon as we queue it. - */ - txb->last_sent = ktime_get_real(); + ASSERTCMP(txb->seq, ==, call->send_top + 1); if (last) trace_rxrpc_txqueue(call, rxrpc_txqueue_queue_last); else trace_rxrpc_txqueue(call, rxrpc_txqueue_queue); + if (WARN_ON_ONCE(sq->bufs[ix])) + trace_rxrpc_tq(call, sq, seq, rxrpc_tq_queue_dup); + else + trace_rxrpc_tq(call, sq, seq, rxrpc_tq_queue); + /* Add the packet to the call's output buffer */ spin_lock(&call->tx_lock); - poke = list_empty(&call->tx_sendmsg); - list_add_tail(&txb->call_link, &call->tx_sendmsg); - call->tx_prepared = seq; - if (last) + poke = (READ_ONCE(call->tx_bottom) == call->send_top); + sq->bufs[ix] = txb; + /* Order send_top after the queue->next pointer and txb content. */ + smp_store_release(&call->send_top, seq); + if (last) { rxrpc_notify_end_tx(rx, call, notify_end_tx); + call->send_queue = NULL; + } spin_unlock(&call->tx_lock); if (poke) rxrpc_poke_call(call, rxrpc_call_poke_start); } +/* + * Allocate a new txqueue unit and add it to the transmission queue. + */ +static int rxrpc_alloc_txqueue(struct sock *sk, struct rxrpc_call *call) +{ + struct rxrpc_txqueue *tq; + + tq = kzalloc(sizeof(*tq), sk->sk_allocation); + if (!tq) + return -ENOMEM; + + tq->xmit_ts_base = KTIME_MIN; + for (int i = 0; i < RXRPC_NR_TXQUEUE; i++) + tq->segment_xmit_ts[i] = UINT_MAX; + + if (call->send_queue) { + tq->qbase = call->send_top + 1; + call->send_queue->next = tq; + call->send_queue = tq; + } else if (WARN_ON(call->tx_queue)) { + kfree(tq); + return -ENOMEM; + } else { + tq->qbase = 0; + call->tx_qbase = 0; + call->send_queue = tq; + call->tx_qtail = tq; + call->tx_queue = tq; + } + + trace_rxrpc_tq(call, tq, call->send_top, rxrpc_tq_alloc); + return 0; +} + /* * send data through a socket * - must be called in process context @@ -346,6 +384,13 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, if (!rxrpc_check_tx_space(call, NULL)) goto wait_for_space; + /* See if we need to begin/extend the Tx queue. */ + if (!call->send_queue || !((call->send_top + 1) & RXRPC_TXQ_MASK)) { + ret = rxrpc_alloc_txqueue(sk, call); + if (ret < 0) + goto maybe_error; + } + /* Work out the maximum size of a packet. Assume that * the security header is going to be in the padded * region (enc blocksize), but the trailer is not. diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c index 0cc8f49d69a9e..067223c8c35f2 100644 --- a/net/rxrpc/txbuf.c +++ b/net/rxrpc/txbuf.c @@ -43,17 +43,14 @@ struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_ whdr = buf + hoff; - INIT_LIST_HEAD(&txb->call_link); - INIT_LIST_HEAD(&txb->tx_link); refcount_set(&txb->ref, 1); - txb->last_sent = KTIME_MIN; txb->call_debug_id = call->debug_id; txb->debug_id = atomic_inc_return(&rxrpc_txbuf_debug_ids); txb->alloc_size = data_size; txb->space = data_size; txb->offset = sizeof(*whdr); txb->flags = call->conn->out_clientflag; - txb->seq = call->tx_prepared + 1; + txb->seq = call->send_top + 1; txb->nr_kvec = 1; txb->kvec[0].iov_base = whdr; txb->kvec[0].iov_len = sizeof(*whdr); @@ -114,8 +111,6 @@ struct rxrpc_txbuf *rxrpc_alloc_ack_txbuf(struct rxrpc_call *call, size_t sack_s filler = buf + sizeof(*whdr) + sizeof(*ack) + 1; trailer = buf + sizeof(*whdr) + sizeof(*ack) + 1 + 3; - INIT_LIST_HEAD(&txb->call_link); - INIT_LIST_HEAD(&txb->tx_link); refcount_set(&txb->ref, 1); txb->call_debug_id = call->debug_id; txb->debug_id = atomic_inc_return(&rxrpc_txbuf_debug_ids); @@ -200,37 +195,3 @@ void rxrpc_put_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what) rxrpc_free_txbuf(txb); } } - -/* - * Shrink the transmit buffer. - */ -void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *call) -{ - struct rxrpc_txbuf *txb; - rxrpc_seq_t hard_ack = smp_load_acquire(&call->acks_hard_ack); - bool wake = false; - - _enter("%x/%x/%x", call->tx_bottom, call->acks_hard_ack, call->tx_top); - - while ((txb = list_first_entry_or_null(&call->tx_buffer, - struct rxrpc_txbuf, call_link))) { - hard_ack = call->acks_hard_ack; - if (before(hard_ack, txb->seq)) - break; - - if (txb->seq != call->tx_bottom + 1) - rxrpc_see_txbuf(txb, rxrpc_txbuf_see_out_of_step); - ASSERTCMP(txb->seq, ==, call->tx_bottom + 1); - WRITE_ONCE(call->tx_bottom, call->tx_bottom + 1); - list_del_rcu(&txb->call_link); - - trace_rxrpc_txqueue(call, rxrpc_txqueue_dequeue); - - rxrpc_put_txbuf(txb, rxrpc_txbuf_put_rotated); - if (after(call->acks_hard_ack, call->tx_bottom + 128)) - wake = true; - } - - if (wake) - wake_up(&call->waitq); -} From 692c4caa074c0d6092bd713babc6fc3872b5592a Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:47 +0000 Subject: [PATCH 0296/1450] rxrpc: call->acks_hard_ack is now the same call->tx_bottom, so remove it Now that packets are removed from the Tx queue in the rotation function rather than being cleaned up later, call->acks_hard_ack now advances in step with call->tx_bottom, so remove it. Some of the places call->acks_hard_ack is used in the rxrpc tracepoints are replaced by call->acks_first_seq instead as that's the peer's reported idea of the hard-ACK point. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-20-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 20 ++++++++++---------- net/rxrpc/ar-internal.h | 1 - net/rxrpc/call_event.c | 4 ++-- net/rxrpc/input.c | 17 ++++++++--------- net/rxrpc/proc.c | 6 +++--- net/rxrpc/sendmsg.c | 6 +++--- 6 files changed, 26 insertions(+), 28 deletions(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index e6cf9ec940aac..0f253287de00b 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -892,8 +892,8 @@ TRACE_EVENT(rxrpc_txqueue, TP_STRUCT__entry( __field(unsigned int, call) __field(enum rxrpc_txqueue_trace, why) - __field(rxrpc_seq_t, acks_hard_ack) __field(rxrpc_seq_t, tx_bottom) + __field(rxrpc_seq_t, acks_first_seq) __field(rxrpc_seq_t, tx_top) __field(rxrpc_seq_t, send_top) __field(int, tx_winsize) @@ -902,8 +902,8 @@ TRACE_EVENT(rxrpc_txqueue, TP_fast_assign( __entry->call = call->debug_id; __entry->why = why; - __entry->acks_hard_ack = call->acks_hard_ack; __entry->tx_bottom = call->tx_bottom; + __entry->acks_first_seq = call->acks_first_seq; __entry->tx_top = call->tx_top; __entry->send_top = call->send_top; __entry->tx_winsize = call->tx_winsize; @@ -913,9 +913,9 @@ TRACE_EVENT(rxrpc_txqueue, __entry->call, __print_symbolic(__entry->why, rxrpc_txqueue_traces), __entry->tx_bottom, - __entry->acks_hard_ack, - __entry->tx_top - __entry->tx_bottom, - __entry->tx_top - __entry->acks_hard_ack, + __entry->acks_first_seq, + __entry->acks_first_seq - __entry->tx_bottom, + __entry->tx_top - __entry->acks_first_seq, __entry->send_top - __entry->tx_top, __entry->tx_winsize) ); @@ -945,7 +945,7 @@ TRACE_EVENT(rxrpc_transmit, __entry->cong_cwnd = call->cong_cwnd; __entry->cong_extra = call->cong_extra; __entry->prepared = send_top - call->tx_bottom; - __entry->in_flight = call->tx_top - call->acks_hard_ack; + __entry->in_flight = call->tx_top - call->tx_bottom; __entry->pmtud_jumbo = call->peer->pmtud_jumbo; ), @@ -1707,7 +1707,7 @@ TRACE_EVENT(rxrpc_congest, TP_fast_assign( __entry->call = call->debug_id; __entry->change = change; - __entry->hard_ack = call->acks_hard_ack; + __entry->hard_ack = call->acks_first_seq; __entry->top = call->tx_top; __entry->lowest_nak = call->acks_lowest_nak; __entry->ack_serial = ack_serial; @@ -1754,7 +1754,7 @@ TRACE_EVENT(rxrpc_reset_cwnd, __entry->mode = call->cong_mode; __entry->cwnd = call->cong_cwnd; __entry->extra = call->cong_extra; - __entry->hard_ack = call->acks_hard_ack; + __entry->hard_ack = call->acks_first_seq; __entry->prepared = call->send_top - call->tx_bottom; __entry->since_last_tx = ktime_sub(now, call->tx_last_sent); __entry->has_data = call->tx_bottom != call->tx_top; @@ -1855,7 +1855,7 @@ TRACE_EVENT(rxrpc_resend, TP_fast_assign( struct rxrpc_skb_priv *sp = ack ? rxrpc_skb(ack) : NULL; __entry->call = call->debug_id; - __entry->seq = call->acks_hard_ack; + __entry->seq = call->acks_first_seq; __entry->transmitted = call->tx_transmitted; __entry->ack_serial = sp ? sp->hdr.serial : 0; ), @@ -1944,7 +1944,7 @@ TRACE_EVENT(rxrpc_call_reset, __entry->call_id = call->call_id; __entry->call_serial = call->rx_serial; __entry->conn_serial = call->conn->hi_serial; - __entry->tx_seq = call->acks_hard_ack; + __entry->tx_seq = call->acks_first_seq; __entry->rx_seq = call->rx_highest_seq; ), diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index bcce4862b0b75..6683043cee3f9 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -759,7 +759,6 @@ struct rxrpc_call { ktime_t acks_latest_ts; /* Timestamp of latest ACK received */ rxrpc_seq_t acks_first_seq; /* first sequence number received */ rxrpc_seq_t acks_prev_seq; /* Highest previousPacket received */ - rxrpc_seq_t acks_hard_ack; /* Latest hard-ack point */ rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */ rxrpc_serial_t acks_highest_serial; /* Highest serial number ACK'd */ }; diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 90e3d93956750..2311e5c737e84 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -109,7 +109,7 @@ void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb) bool unacked = false, did_send = false; unsigned int qix; - _enter("{%d,%d}", call->acks_hard_ack, call->tx_top); + _enter("{%d,%d}", call->tx_bottom, call->tx_top); if (call->tx_bottom == call->tx_top) goto no_resend; @@ -267,7 +267,7 @@ static void rxrpc_close_tx_phase(struct rxrpc_call *call) static unsigned int rxrpc_tx_window_space(struct rxrpc_call *call) { int winsize = umin(call->tx_winsize, call->cong_cwnd + call->cong_extra); - int in_flight = call->tx_top - call->acks_hard_ack; + int in_flight = call->tx_top - call->tx_bottom; return max(winsize - in_flight, 0); } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index cfdd23042d4c7..afb87a3322da9 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -40,7 +40,7 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, bool resend = false; summary->flight_size = - (call->tx_top - call->acks_hard_ack) - summary->nr_acks; + (call->tx_top - call->tx_bottom) - summary->nr_acks; if (test_and_clear_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags)) { summary->retrans_timeo = true; @@ -175,7 +175,7 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, * state. */ if (test_bit(RXRPC_CALL_TX_LAST, &call->flags) || - summary->nr_acks != call->tx_top - call->acks_hard_ack) { + summary->nr_acks != call->tx_top - call->tx_bottom) { call->cong_extra++; wake_up(&call->waitq); } @@ -218,7 +218,7 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, rxrpc_seq_t seq = call->tx_bottom + 1; bool rot_last = false; - _enter("%x,%x,%x", call->tx_bottom, call->acks_hard_ack, to); + _enter("%x,%x", call->tx_bottom, to); trace_rxrpc_tx_rotate(call, seq, to); trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate); @@ -246,7 +246,6 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, tq->bufs[ix] = NULL; WRITE_ONCE(call->tx_bottom, seq); - WRITE_ONCE(call->acks_hard_ack, seq); trace_rxrpc_txqueue(call, (rot_last ? rxrpc_txqueue_rotate_last : rxrpc_txqueue_rotate)); @@ -278,9 +277,9 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, } } - _debug("%x,%x,%x,%d", to, call->acks_hard_ack, call->tx_top, rot_last); + _debug("%x,%x,%x,%d", to, call->tx_bottom, call->tx_top, rot_last); - if (call->acks_lowest_nak == call->acks_hard_ack) { + if (call->acks_lowest_nak == call->tx_bottom) { call->acks_lowest_nak = to; } else if (after(to, call->acks_lowest_nak)) { summary->new_low_nack = true; @@ -968,7 +967,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) if (unlikely(summary.ack_reason == RXRPC_ACK_OUT_OF_SEQUENCE) && first_soft_ack == 1 && prev_pkt == 0 && - call->acks_hard_ack == 0 && + call->tx_bottom == 0 && rxrpc_is_client_call(call)) { rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, 0, -ENETRESET); @@ -1033,13 +1032,13 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) goto send_response; } - if (before(hard_ack, call->acks_hard_ack) || + if (before(hard_ack, call->tx_bottom) || after(hard_ack, call->tx_top)) return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_outside_window); if (nr_acks > call->tx_top - hard_ack) return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_sack_overflow); - if (after(hard_ack, call->acks_hard_ack)) { + if (after(hard_ack, call->tx_bottom)) { if (rxrpc_rotate_tx_window(call, hard_ack, &summary)) { rxrpc_end_tx_phase(call, false, rxrpc_eproto_unexpected_ack); goto send_response; diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 249e1ed9c5c9f..a8325b8e33c2c 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -52,7 +52,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) struct rxrpc_call *call; struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); enum rxrpc_call_state state; - rxrpc_seq_t acks_hard_ack; + rxrpc_seq_t tx_bottom; char lbuff[50], rbuff[50]; long timeout = 0; @@ -79,7 +79,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) if (state != RXRPC_CALL_SERVER_PREALLOC) timeout = ktime_ms_delta(READ_ONCE(call->expect_rx_by), ktime_get_real()); - acks_hard_ack = READ_ONCE(call->acks_hard_ack); + tx_bottom = READ_ONCE(call->tx_bottom); seq_printf(seq, "UDP %-47.47s %-47.47s %4x %08x %08x %s %3u" " %-8.8s %08x %08x %08x %02x %08x %02x %08x %02x %06lx\n", @@ -93,7 +93,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) rxrpc_call_states[state], call->abort_code, call->debug_id, - acks_hard_ack, READ_ONCE(call->tx_top) - acks_hard_ack, + tx_bottom, READ_ONCE(call->tx_top) - tx_bottom, call->ackr_window, call->ackr_wtop - call->ackr_window, call->rx_serial, call->cong_cwnd, diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 85b35b11755dd..dfbf9f4b24b6d 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -140,7 +140,7 @@ static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx, rtt = 2; timeout = rtt; - tx_start = READ_ONCE(call->acks_hard_ack); + tx_start = READ_ONCE(call->tx_bottom); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); @@ -197,8 +197,8 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, DECLARE_WAITQUEUE(myself, current); int ret; - _enter(",{%u,%u,%u,%u}", - call->tx_bottom, call->acks_hard_ack, call->tx_top, call->tx_winsize); + _enter(",{%u,%u,%u}", + call->tx_bottom, call->tx_top, call->tx_winsize); add_wait_queue(&call->waitq, &myself); From 203457e11b591f80ada571f981dd5f4d683b0009 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:48 +0000 Subject: [PATCH 0297/1450] rxrpc: Replace call->acks_first_seq with tracking of the hard ACK point Replace the call->acks_first_seq variable (which holds ack.firstPacket from the latest ACK packet and indicates the sequence number of the first ack slot in the SACK table) with call->acks_hard_ack which will hold the highest sequence hard ACK'd. This is 1 less than call->acks_first_seq, but it fits in the same schema as the other tracking variables which hold the sequence of a packet, not one past it. This will fix the rxrpc_congest tracepoint's calculation of SACK window size which shows one fewer than it should - and will occasionally go to -1. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-21-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 68 +++++++++++++++++------------------- net/rxrpc/ar-internal.h | 2 +- net/rxrpc/input.c | 56 ++++++++++++++--------------- 3 files changed, 59 insertions(+), 67 deletions(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 0f253287de00b..91108e0de3af9 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -893,7 +893,7 @@ TRACE_EVENT(rxrpc_txqueue, __field(unsigned int, call) __field(enum rxrpc_txqueue_trace, why) __field(rxrpc_seq_t, tx_bottom) - __field(rxrpc_seq_t, acks_first_seq) + __field(rxrpc_seq_t, acks_hard_ack) __field(rxrpc_seq_t, tx_top) __field(rxrpc_seq_t, send_top) __field(int, tx_winsize) @@ -903,19 +903,19 @@ TRACE_EVENT(rxrpc_txqueue, __entry->call = call->debug_id; __entry->why = why; __entry->tx_bottom = call->tx_bottom; - __entry->acks_first_seq = call->acks_first_seq; + __entry->acks_hard_ack = call->acks_hard_ack; __entry->tx_top = call->tx_top; __entry->send_top = call->send_top; __entry->tx_winsize = call->tx_winsize; ), - TP_printk("c=%08x %s f=%08x h=%08x n=%u/%u/%u/%u", + TP_printk("c=%08x %s b=%08x h=%08x n=%u/%u/%u/%u", __entry->call, __print_symbolic(__entry->why, rxrpc_txqueue_traces), __entry->tx_bottom, - __entry->acks_first_seq, - __entry->acks_first_seq - __entry->tx_bottom, - __entry->tx_top - __entry->acks_first_seq, + __entry->acks_hard_ack, + __entry->acks_hard_ack - __entry->tx_bottom, + __entry->tx_top - __entry->acks_hard_ack, __entry->send_top - __entry->tx_top, __entry->tx_winsize) ); @@ -1015,11 +1015,9 @@ TRACE_EVENT(rxrpc_rx_data, ); TRACE_EVENT(rxrpc_rx_ack, - TP_PROTO(struct rxrpc_call *call, - rxrpc_serial_t serial, rxrpc_serial_t ack_serial, - rxrpc_seq_t first, rxrpc_seq_t prev, u8 reason, u8 n_acks), + TP_PROTO(struct rxrpc_call *call, struct rxrpc_skb_priv *sp), - TP_ARGS(call, serial, ack_serial, first, prev, reason, n_acks), + TP_ARGS(call, sp), TP_STRUCT__entry( __field(unsigned int, call) @@ -1032,13 +1030,13 @@ TRACE_EVENT(rxrpc_rx_ack, ), TP_fast_assign( - __entry->call = call->debug_id; - __entry->serial = serial; - __entry->ack_serial = ack_serial; - __entry->first = first; - __entry->prev = prev; - __entry->reason = reason; - __entry->n_acks = n_acks; + __entry->call = call->debug_id; + __entry->serial = sp->hdr.serial; + __entry->ack_serial = sp->ack.acked_serial; + __entry->first = sp->ack.first_ack; + __entry->prev = sp->ack.prev_ack; + __entry->reason = sp->ack.reason; + __entry->n_acks = sp->ack.nr_acks; ), TP_printk("c=%08x %08x %s r=%08x f=%08x p=%08x n=%u", @@ -1707,7 +1705,7 @@ TRACE_EVENT(rxrpc_congest, TP_fast_assign( __entry->call = call->debug_id; __entry->change = change; - __entry->hard_ack = call->acks_first_seq; + __entry->hard_ack = call->acks_hard_ack; __entry->top = call->tx_top; __entry->lowest_nak = call->acks_lowest_nak; __entry->ack_serial = ack_serial; @@ -1754,7 +1752,7 @@ TRACE_EVENT(rxrpc_reset_cwnd, __entry->mode = call->cong_mode; __entry->cwnd = call->cong_cwnd; __entry->extra = call->cong_extra; - __entry->hard_ack = call->acks_first_seq; + __entry->hard_ack = call->acks_hard_ack; __entry->prepared = call->send_top - call->tx_bottom; __entry->since_last_tx = ktime_sub(now, call->tx_last_sent); __entry->has_data = call->tx_bottom != call->tx_top; @@ -1855,7 +1853,7 @@ TRACE_EVENT(rxrpc_resend, TP_fast_assign( struct rxrpc_skb_priv *sp = ack ? rxrpc_skb(ack) : NULL; __entry->call = call->debug_id; - __entry->seq = call->acks_first_seq; + __entry->seq = call->acks_hard_ack; __entry->transmitted = call->tx_transmitted; __entry->ack_serial = sp ? sp->hdr.serial : 0; ), @@ -1944,7 +1942,7 @@ TRACE_EVENT(rxrpc_call_reset, __entry->call_id = call->call_id; __entry->call_serial = call->rx_serial; __entry->conn_serial = call->conn->hi_serial; - __entry->tx_seq = call->acks_first_seq; + __entry->tx_seq = call->acks_hard_ack; __entry->rx_seq = call->rx_highest_seq; ), @@ -1976,38 +1974,36 @@ TRACE_EVENT(rxrpc_notify_socket, ); TRACE_EVENT(rxrpc_rx_discard_ack, - TP_PROTO(unsigned int debug_id, rxrpc_serial_t serial, - rxrpc_seq_t first_soft_ack, rxrpc_seq_t call_ackr_first, - rxrpc_seq_t prev_pkt, rxrpc_seq_t call_ackr_prev), + TP_PROTO(struct rxrpc_call *call, rxrpc_serial_t serial, + rxrpc_seq_t hard_ack, rxrpc_seq_t prev_pkt), - TP_ARGS(debug_id, serial, first_soft_ack, call_ackr_first, - prev_pkt, call_ackr_prev), + TP_ARGS(call, serial, hard_ack, prev_pkt), TP_STRUCT__entry( __field(unsigned int, debug_id) __field(rxrpc_serial_t, serial) - __field(rxrpc_seq_t, first_soft_ack) - __field(rxrpc_seq_t, call_ackr_first) + __field(rxrpc_seq_t, hard_ack) __field(rxrpc_seq_t, prev_pkt) - __field(rxrpc_seq_t, call_ackr_prev) + __field(rxrpc_seq_t, acks_hard_ack) + __field(rxrpc_seq_t, acks_prev_seq) ), TP_fast_assign( - __entry->debug_id = debug_id; + __entry->debug_id = call->debug_id; __entry->serial = serial; - __entry->first_soft_ack = first_soft_ack; - __entry->call_ackr_first = call_ackr_first; + __entry->hard_ack = hard_ack; __entry->prev_pkt = prev_pkt; - __entry->call_ackr_prev = call_ackr_prev; + __entry->acks_hard_ack = call->acks_hard_ack; + __entry->acks_prev_seq = call->acks_prev_seq; ), TP_printk("c=%08x r=%08x %08x<%08x %08x<%08x", __entry->debug_id, __entry->serial, - __entry->first_soft_ack, - __entry->call_ackr_first, + __entry->hard_ack, + __entry->acks_hard_ack, __entry->prev_pkt, - __entry->call_ackr_prev) + __entry->acks_prev_seq) ); TRACE_EVENT(rxrpc_req_ack, diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 6683043cee3f9..3e57cef7385f7 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -757,7 +757,7 @@ struct rxrpc_call { /* Transmission-phase ACK management (ACKs we've received). */ ktime_t acks_latest_ts; /* Timestamp of latest ACK received */ - rxrpc_seq_t acks_first_seq; /* first sequence number received */ + rxrpc_seq_t acks_hard_ack; /* Highest sequence hard acked */ rxrpc_seq_t acks_prev_seq; /* Highest previousPacket received */ rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */ rxrpc_serial_t acks_highest_serial; /* Highest serial number ACK'd */ diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index afb87a3322da9..b89fd0dee324b 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -782,12 +782,12 @@ static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb */ static rxrpc_seq_t rxrpc_input_check_prev_ack(struct rxrpc_call *call, struct rxrpc_ack_summary *summary, - rxrpc_seq_t seq) + rxrpc_seq_t hard_ack) { struct sk_buff *skb = call->cong_last_nack; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); unsigned int i, new_acks = 0, retained_nacks = 0; - rxrpc_seq_t old_seq = sp->ack.first_ack; + rxrpc_seq_t seq = hard_ack + 1, old_seq = sp->ack.first_ack; u8 *acks = skb->data + sizeof(struct rxrpc_wire_header) + sizeof(struct rxrpc_ackpacket); if (after_eq(seq, old_seq + sp->ack.nr_acks)) { @@ -810,7 +810,7 @@ static rxrpc_seq_t rxrpc_input_check_prev_ack(struct rxrpc_call *call, summary->nr_retained_nacks = retained_nacks; } - return old_seq + sp->ack.nr_acks; + return old_seq + sp->ack.nr_acks - 1; } /* @@ -825,22 +825,23 @@ static rxrpc_seq_t rxrpc_input_check_prev_ack(struct rxrpc_call *call, static void rxrpc_input_soft_acks(struct rxrpc_call *call, struct rxrpc_ack_summary *summary, struct sk_buff *skb, - rxrpc_seq_t seq, rxrpc_seq_t since) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); unsigned int i, old_nacks = 0; - rxrpc_seq_t lowest_nak = seq + sp->ack.nr_acks; + rxrpc_seq_t lowest_nak = call->acks_hard_ack + sp->ack.nr_acks + 1; + rxrpc_seq_t seq = call->acks_hard_ack; u8 *acks = skb->data + sizeof(struct rxrpc_wire_header) + sizeof(struct rxrpc_ackpacket); for (i = 0; i < sp->ack.nr_acks; i++) { + seq++; if (acks[i] == RXRPC_ACK_TYPE_ACK) { summary->nr_acks++; - if (after_eq(seq, since)) + if (after(seq, since)) summary->nr_new_acks++; } else { summary->saw_nacks = true; - if (before(seq, since)) { + if (before_eq(seq, since)) { /* Overlap with previous ACK */ old_nacks++; } else { @@ -851,7 +852,6 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, if (before(seq, lowest_nak)) lowest_nak = seq; } - seq++; } if (lowest_nak != call->acks_lowest_nak) { @@ -874,21 +874,21 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, * with respect to the ack state conveyed by preceding ACKs. */ static bool rxrpc_is_ack_valid(struct rxrpc_call *call, - rxrpc_seq_t first_pkt, rxrpc_seq_t prev_pkt) + rxrpc_seq_t hard_ack, rxrpc_seq_t prev_pkt) { - rxrpc_seq_t base = READ_ONCE(call->acks_first_seq); + rxrpc_seq_t base = READ_ONCE(call->acks_hard_ack); - if (after(first_pkt, base)) + if (after(hard_ack, base)) return true; /* The window advanced */ - if (before(first_pkt, base)) + if (before(hard_ack, base)) return false; /* firstPacket regressed */ if (after_eq(prev_pkt, call->acks_prev_seq)) return true; /* previousPacket hasn't regressed. */ /* Some rx implementations put a serial number in previousPacket. */ - if (after_eq(prev_pkt, base + call->tx_winsize)) + if (after(prev_pkt, base + call->tx_winsize)) return false; return true; } @@ -906,8 +906,8 @@ static bool rxrpc_is_ack_valid(struct rxrpc_call *call, static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_ack_summary summary = { 0 }; - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_acktrailer trailer; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); rxrpc_serial_t ack_serial, acked_serial; rxrpc_seq_t first_soft_ack, hard_ack, prev_pkt, since; int nr_acks, offset, ioffset; @@ -925,9 +925,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) summary.ack_reason = (sp->ack.reason < RXRPC_ACK__INVALID ? sp->ack.reason : RXRPC_ACK__INVALID); - trace_rxrpc_rx_ack(call, ack_serial, acked_serial, - first_soft_ack, prev_pkt, - summary.ack_reason, nr_acks); + trace_rxrpc_rx_ack(call, sp); rxrpc_inc_stat(call->rxnet, stat_rx_acks[summary.ack_reason]); if (acked_serial != 0) { @@ -952,7 +950,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) * lost the call because it switched to a different peer. */ if (unlikely(summary.ack_reason == RXRPC_ACK_EXCEEDS_WINDOW) && - first_soft_ack == 1 && + hard_ack == 0 && prev_pkt == 0 && rxrpc_is_client_call(call)) { rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, @@ -965,7 +963,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) * if we still have it buffered to the beginning. */ if (unlikely(summary.ack_reason == RXRPC_ACK_OUT_OF_SEQUENCE) && - first_soft_ack == 1 && + hard_ack == 0 && prev_pkt == 0 && call->tx_bottom == 0 && rxrpc_is_client_call(call)) { @@ -975,10 +973,8 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) } /* Discard any out-of-order or duplicate ACKs (outside lock). */ - if (!rxrpc_is_ack_valid(call, first_soft_ack, prev_pkt)) { - trace_rxrpc_rx_discard_ack(call->debug_id, ack_serial, - first_soft_ack, call->acks_first_seq, - prev_pkt, call->acks_prev_seq); + if (!rxrpc_is_ack_valid(call, hard_ack, prev_pkt)) { + trace_rxrpc_rx_discard_ack(call, ack_serial, hard_ack, prev_pkt); goto send_response; } @@ -992,17 +988,17 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) skb_condense(skb); if (call->cong_last_nack) { - since = rxrpc_input_check_prev_ack(call, &summary, first_soft_ack); + since = rxrpc_input_check_prev_ack(call, &summary, hard_ack); rxrpc_free_skb(call->cong_last_nack, rxrpc_skb_put_last_nack); call->cong_last_nack = NULL; } else { - summary.nr_new_acks = first_soft_ack - call->acks_first_seq; - call->acks_lowest_nak = first_soft_ack + nr_acks; - since = first_soft_ack; + summary.nr_new_acks = hard_ack - call->acks_hard_ack; + call->acks_lowest_nak = hard_ack + nr_acks; + since = hard_ack; } call->acks_latest_ts = skb->tstamp; - call->acks_first_seq = first_soft_ack; + call->acks_hard_ack = hard_ack; call->acks_prev_seq = prev_pkt; switch (summary.ack_reason) { @@ -1018,7 +1014,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) if (trailer.maxMTU) rxrpc_input_ack_trailer(call, skb, &trailer); - if (first_soft_ack == 0) + if (hard_ack + 1 == 0) return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_zero); /* Ignore ACKs unless we are or have just been transmitting. */ @@ -1048,7 +1044,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) if (nr_acks > 0) { if (offset > (int)skb->len - nr_acks) return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_short_sack); - rxrpc_input_soft_acks(call, &summary, skb, first_soft_ack, since); + rxrpc_input_soft_acks(call, &summary, skb, since); rxrpc_get_skb(skb, rxrpc_skb_get_last_nack); call->cong_last_nack = skb; } From f003e4038f0e14b3b374f7dae76dfeef9591f006 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:49 +0000 Subject: [PATCH 0298/1450] rxrpc: Display stats about jumbo packets transmitted and received In /proc/net/rxrpc/stats, display statistics about the numbers of different sizes of jumbo packets transmitted and received, showing counts for 1 subpacket (ie. a non-jumbo packet), 2 subpackets, 3, ... to 8 and then 9+. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-22-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/ar-internal.h | 2 ++ net/rxrpc/input.c | 6 +++++- net/rxrpc/output.c | 5 ++++- net/rxrpc/proc.c | 26 ++++++++++++++++++++++++++ 4 files changed, 37 insertions(+), 2 deletions(-) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 3e57cef7385f7..840293f913a38 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -111,6 +111,8 @@ struct rxrpc_net { atomic_t stat_tx_ack_skip; atomic_t stat_tx_acks[256]; atomic_t stat_rx_acks[256]; + atomic_t stat_tx_jumbo[10]; + atomic_t stat_rx_jumbo[10]; atomic_t stat_why_req_ack[8]; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index b89fd0dee324b..8d7ab4b9d7d03 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -568,7 +568,7 @@ static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb unsigned int offset = sizeof(struct rxrpc_wire_header); unsigned int len = skb->len - offset; bool notify = false; - int ack_reason = 0; + int ack_reason = 0, count = 1, stat_ix; while (sp->hdr.flags & RXRPC_JUMBO_PACKET) { if (len < RXRPC_JUMBO_SUBPKTLEN) @@ -597,12 +597,16 @@ static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb sp->hdr.serial++; offset += RXRPC_JUMBO_SUBPKTLEN; len -= RXRPC_JUMBO_SUBPKTLEN; + count++; } sp->offset = offset; sp->len = len; rxrpc_input_data_one(call, skb, ¬ify, &ack_serial, &ack_reason); + stat_ix = umin(count, ARRAY_SIZE(call->rxnet->stat_rx_jumbo)) - 1; + atomic_inc(&call->rxnet->stat_rx_jumbo[stat_ix]); + if (ack_reason > 0) { rxrpc_send_ACK(call, ack_reason, ack_serial, rxrpc_propose_ack_input_data); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index c2044d5932374..3886777d1bb60 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -552,10 +552,13 @@ void rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req rxrpc_seq_t seq = req->seq; size_t len; bool new_call = test_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags); - int ret; + int ret, stat_ix; _enter("%x,%x-%x", tq->qbase, seq, seq + req->n - 1); + stat_ix = umin(req->n, ARRAY_SIZE(call->rxnet->stat_tx_jumbo)) - 1; + atomic_inc(&call->rxnet->stat_tx_jumbo[stat_ix]); + len = rxrpc_prepare_data_packet(call, req); txb = tq->bufs[seq & RXRPC_TXQ_MASK]; diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index a8325b8e33c2c..5f974ec13d694 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -529,6 +529,30 @@ int rxrpc_stats_show(struct seq_file *seq, void *v) atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_retrans]), atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_slow_start]), atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_small_txwin])); + seq_printf(seq, + "Jumbo-Tx : %u,%u,%u,%u,%u,%u,%u,%u,%u,%u\n", + atomic_read(&rxnet->stat_tx_jumbo[0]), + atomic_read(&rxnet->stat_tx_jumbo[1]), + atomic_read(&rxnet->stat_tx_jumbo[2]), + atomic_read(&rxnet->stat_tx_jumbo[3]), + atomic_read(&rxnet->stat_tx_jumbo[4]), + atomic_read(&rxnet->stat_tx_jumbo[5]), + atomic_read(&rxnet->stat_tx_jumbo[6]), + atomic_read(&rxnet->stat_tx_jumbo[7]), + atomic_read(&rxnet->stat_tx_jumbo[8]), + atomic_read(&rxnet->stat_tx_jumbo[9])); + seq_printf(seq, + "Jumbo-Rx : %u,%u,%u,%u,%u,%u,%u,%u,%u,%u\n", + atomic_read(&rxnet->stat_rx_jumbo[0]), + atomic_read(&rxnet->stat_rx_jumbo[1]), + atomic_read(&rxnet->stat_rx_jumbo[2]), + atomic_read(&rxnet->stat_rx_jumbo[3]), + atomic_read(&rxnet->stat_rx_jumbo[4]), + atomic_read(&rxnet->stat_rx_jumbo[5]), + atomic_read(&rxnet->stat_rx_jumbo[6]), + atomic_read(&rxnet->stat_rx_jumbo[7]), + atomic_read(&rxnet->stat_rx_jumbo[8]), + atomic_read(&rxnet->stat_rx_jumbo[9])); seq_printf(seq, "Buffers : txb=%u rxb=%u\n", atomic_read(&rxrpc_nr_txbuf), @@ -566,6 +590,8 @@ int rxrpc_stats_clear(struct file *file, char *buf, size_t size) atomic_set(&rxnet->stat_tx_ack_skip, 0); memset(&rxnet->stat_tx_acks, 0, sizeof(rxnet->stat_tx_acks)); memset(&rxnet->stat_rx_acks, 0, sizeof(rxnet->stat_rx_acks)); + memset(&rxnet->stat_tx_jumbo, 0, sizeof(rxnet->stat_tx_jumbo)); + memset(&rxnet->stat_rx_jumbo, 0, sizeof(rxnet->stat_rx_jumbo)); memset(&rxnet->stat_why_req_ack, 0, sizeof(rxnet->stat_why_req_ack)); From f7dd0dc9651326f609579fa81cbdda69b0467c2a Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:50 +0000 Subject: [PATCH 0299/1450] rxrpc: Adjust names and types of congestion-related fields Adjust some of the names of fields and constants to make them look a bit more like the TCP congestion symbol names, such as flight_size -> in_flight and congest_mode to ca_state. Move the persistent congestion-related fields from the rxrpc_ack_summary struct into the rxrpc_call struct rather than copying them out and back in again. The rxrpc_congest tracepoint can fetch them from the call struct. Rename the counters for soft acks and nacks to have an 's' on the front to reflect the softness, e.g. nr_acks -> nr_sacks. Make fields counting numbers of packets or numbers of acks u16 rather than u8 to allow for windows of up to 8192 DATA packets in flight in future. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-23-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 58 ++++++++------ net/rxrpc/ar-internal.h | 51 ++++++------ net/rxrpc/conn_client.c | 4 +- net/rxrpc/input.c | 151 ++++++++++++++++------------------- net/rxrpc/output.c | 2 +- 5 files changed, 132 insertions(+), 134 deletions(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 91108e0de3af9..d47b8235fad38 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -378,11 +378,11 @@ EM(rxrpc_propose_ack_rx_idle, "RxIdle ") \ E_(rxrpc_propose_ack_terminal_ack, "ClTerm ") -#define rxrpc_congest_modes \ - EM(RXRPC_CALL_CONGEST_AVOIDANCE, "CongAvoid") \ - EM(RXRPC_CALL_FAST_RETRANSMIT, "FastReTx ") \ - EM(RXRPC_CALL_PACKET_LOSS, "PktLoss ") \ - E_(RXRPC_CALL_SLOW_START, "SlowStart") +#define rxrpc_ca_states \ + EM(RXRPC_CA_CONGEST_AVOIDANCE, "CongAvoid") \ + EM(RXRPC_CA_FAST_RETRANSMIT, "FastReTx ") \ + EM(RXRPC_CA_PACKET_LOSS, "PktLoss ") \ + E_(RXRPC_CA_SLOW_START, "SlowStart") #define rxrpc_congest_changes \ EM(rxrpc_cong_begin_retransmission, " Retrans") \ @@ -550,11 +550,11 @@ enum rxrpc_txqueue_trace { rxrpc_txqueue_traces } __mode(byte); rxrpc_abort_reasons; rxrpc_bundle_traces; +rxrpc_ca_states; rxrpc_call_poke_traces; rxrpc_call_traces; rxrpc_client_traces; rxrpc_congest_changes; -rxrpc_congest_modes; rxrpc_conn_traces; rxrpc_local_traces; rxrpc_pmtud_reduce_traces; @@ -1688,27 +1688,39 @@ TRACE_EVENT(rxrpc_retransmit, TRACE_EVENT(rxrpc_congest, TP_PROTO(struct rxrpc_call *call, struct rxrpc_ack_summary *summary, - rxrpc_serial_t ack_serial, enum rxrpc_congest_change change), + rxrpc_serial_t ack_serial), - TP_ARGS(call, summary, ack_serial, change), + TP_ARGS(call, summary, ack_serial), TP_STRUCT__entry( __field(unsigned int, call) - __field(enum rxrpc_congest_change, change) + __field(enum rxrpc_ca_state, ca_state) __field(rxrpc_seq_t, hard_ack) __field(rxrpc_seq_t, top) __field(rxrpc_seq_t, lowest_nak) __field(rxrpc_serial_t, ack_serial) + __field(u16, nr_sacks) + __field(u16, nr_snacks) + __field(u16, cwnd) + __field(u16, ssthresh) + __field(u16, cumul_acks) + __field(u16, dup_acks) __field_struct(struct rxrpc_ack_summary, sum) ), TP_fast_assign( __entry->call = call->debug_id; - __entry->change = change; + __entry->ca_state = call->cong_ca_state; __entry->hard_ack = call->acks_hard_ack; __entry->top = call->tx_top; __entry->lowest_nak = call->acks_lowest_nak; __entry->ack_serial = ack_serial; + __entry->nr_sacks = call->acks_nr_sacks; + __entry->nr_snacks = call->acks_nr_snacks; + __entry->cwnd = call->cong_cwnd; + __entry->ssthresh = call->cong_ssthresh; + __entry->cumul_acks = call->cong_cumul_acks; + __entry->dup_acks = call->cong_dup_acks; memcpy(&__entry->sum, summary, sizeof(__entry->sum)); ), @@ -1717,17 +1729,17 @@ TRACE_EVENT(rxrpc_congest, __entry->ack_serial, __print_symbolic(__entry->sum.ack_reason, rxrpc_ack_names), __entry->hard_ack, - __print_symbolic(__entry->sum.mode, rxrpc_congest_modes), - __entry->sum.cwnd, - __entry->sum.ssthresh, - __entry->sum.nr_acks, __entry->sum.nr_retained_nacks, - __entry->sum.nr_new_acks, - __entry->sum.nr_new_nacks, + __print_symbolic(__entry->ca_state, rxrpc_ca_states), + __entry->cwnd, + __entry->ssthresh, + __entry->nr_sacks, __entry->sum.nr_retained_snacks, + __entry->sum.nr_new_sacks, + __entry->sum.nr_new_snacks, __entry->top - __entry->hard_ack, - __entry->sum.cumulative_acks, - __entry->sum.dup_acks, - __entry->lowest_nak, __entry->sum.new_low_nack ? "!" : "", - __print_symbolic(__entry->change, rxrpc_congest_changes), + __entry->cumul_acks, + __entry->dup_acks, + __entry->lowest_nak, __entry->sum.new_low_snack ? "!" : "", + __print_symbolic(__entry->sum.change, rxrpc_congest_changes), __entry->sum.retrans_timeo ? " rTxTo" : "") ); @@ -1738,7 +1750,7 @@ TRACE_EVENT(rxrpc_reset_cwnd, TP_STRUCT__entry( __field(unsigned int, call) - __field(enum rxrpc_congest_mode, mode) + __field(enum rxrpc_ca_state, ca_state) __field(unsigned short, cwnd) __field(unsigned short, extra) __field(rxrpc_seq_t, hard_ack) @@ -1749,7 +1761,7 @@ TRACE_EVENT(rxrpc_reset_cwnd, TP_fast_assign( __entry->call = call->debug_id; - __entry->mode = call->cong_mode; + __entry->ca_state = call->cong_ca_state; __entry->cwnd = call->cong_cwnd; __entry->extra = call->cong_extra; __entry->hard_ack = call->acks_hard_ack; @@ -1761,7 +1773,7 @@ TRACE_EVENT(rxrpc_reset_cwnd, TP_printk("c=%08x q=%08x %s cw=%u+%u pr=%u tm=%llu d=%u", __entry->call, __entry->hard_ack, - __print_symbolic(__entry->mode, rxrpc_congest_modes), + __print_symbolic(__entry->ca_state, rxrpc_ca_states), __entry->cwnd, __entry->extra, __entry->prepared, diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 840293f913a38..f6e6b2ab6c2a1 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -623,13 +623,13 @@ enum rxrpc_call_state { /* * Call Tx congestion management modes. */ -enum rxrpc_congest_mode { - RXRPC_CALL_SLOW_START, - RXRPC_CALL_CONGEST_AVOIDANCE, - RXRPC_CALL_PACKET_LOSS, - RXRPC_CALL_FAST_RETRANSMIT, - NR__RXRPC_CONGEST_MODES -}; +enum rxrpc_ca_state { + RXRPC_CA_SLOW_START, + RXRPC_CA_CONGEST_AVOIDANCE, + RXRPC_CA_PACKET_LOSS, + RXRPC_CA_FAST_RETRANSMIT, + NR__RXRPC_CA_STATES +} __mode(byte); /* * RxRPC call definition @@ -727,12 +727,12 @@ struct rxrpc_call { */ #define RXRPC_TX_SMSS RXRPC_JUMBO_DATALEN #define RXRPC_MIN_CWND 4 - u8 cong_cwnd; /* Congestion window size */ + enum rxrpc_ca_state cong_ca_state; /* Congestion control state */ u8 cong_extra; /* Extra to send for congestion management */ - u8 cong_ssthresh; /* Slow-start threshold */ - enum rxrpc_congest_mode cong_mode:8; /* Congestion management mode */ - u8 cong_dup_acks; /* Count of ACKs showing missing packets */ - u8 cong_cumul_acks; /* Cumulative ACK count */ + u16 cong_cwnd; /* Congestion window size */ + u16 cong_ssthresh; /* Slow-start threshold */ + u16 cong_dup_acks; /* Count of ACKs showing missing packets */ + u16 cong_cumul_acks; /* Cumulative ACK count */ ktime_t cong_tstamp; /* Last time cwnd was changed */ struct sk_buff *cong_last_nack; /* Last ACK with nacks received */ @@ -763,27 +763,24 @@ struct rxrpc_call { rxrpc_seq_t acks_prev_seq; /* Highest previousPacket received */ rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */ rxrpc_serial_t acks_highest_serial; /* Highest serial number ACK'd */ + unsigned short acks_nr_sacks; /* Number of soft acks recorded */ + unsigned short acks_nr_snacks; /* Number of soft nacks recorded */ }; /* * Summary of a new ACK and the changes it made to the Tx buffer packet states. */ struct rxrpc_ack_summary { - u16 nr_acks; /* Number of ACKs in packet */ - u16 nr_new_acks; /* Number of new ACKs in packet */ - u16 nr_new_nacks; /* Number of new nacks in packet */ - u16 nr_retained_nacks; /* Number of nacks retained between ACKs */ - u8 ack_reason; - bool saw_nacks; /* Saw NACKs in packet */ - bool new_low_nack; /* T if new low NACK found */ - bool retrans_timeo; /* T if reTx due to timeout happened */ - u8 flight_size; /* Number of unreceived transmissions */ - /* Place to stash values for tracing */ - enum rxrpc_congest_mode mode:8; - u8 cwnd; - u8 ssthresh; - u8 dup_acks; - u8 cumulative_acks; + u16 in_flight; /* Number of unreceived transmissions */ + u16 nr_new_hacks; /* Number of rotated new ACKs */ + u16 nr_new_sacks; /* Number of new soft ACKs in packet */ + u16 nr_new_snacks; /* Number of new soft nacks in packet */ + u16 nr_retained_snacks; /* Number of nacks retained between ACKs */ + u8 ack_reason; + bool saw_snacks:1; /* T if we saw a soft NACK */ + bool new_low_snack:1; /* T if new low soft NACK found */ + bool retrans_timeo:1; /* T if reTx due to timeout happened */ + u8 /*enum rxrpc_congest_change*/ change; }; /* diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 706631e6ac2f1..5f76bd90567c0 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -437,9 +437,9 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, call->dest_srx.srx_service = conn->service_id; call->cong_ssthresh = call->peer->cong_ssthresh; if (call->cong_cwnd >= call->cong_ssthresh) - call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; + call->cong_ca_state = RXRPC_CA_CONGEST_AVOIDANCE; else - call->cong_mode = RXRPC_CALL_SLOW_START; + call->cong_ca_state = RXRPC_CA_SLOW_START; chan->call_id = call_id; chan->call_debug_id = call->debug_id; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 8d7ab4b9d7d03..c25d816aafeed 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -34,49 +34,41 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, struct rxrpc_ack_summary *summary, rxrpc_serial_t acked_serial) { - enum rxrpc_congest_change change = rxrpc_cong_no_change; - unsigned int cumulative_acks = call->cong_cumul_acks; - unsigned int cwnd = call->cong_cwnd; bool resend = false; - summary->flight_size = - (call->tx_top - call->tx_bottom) - summary->nr_acks; + summary->change = rxrpc_cong_no_change; + summary->in_flight = (call->tx_top - call->tx_bottom) - call->acks_nr_sacks; if (test_and_clear_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags)) { summary->retrans_timeo = true; - call->cong_ssthresh = umax(summary->flight_size / 2, 2); - cwnd = 1; - if (cwnd >= call->cong_ssthresh && - call->cong_mode == RXRPC_CALL_SLOW_START) { - call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; + call->cong_ssthresh = umax(summary->in_flight / 2, 2); + call->cong_cwnd = 1; + if (call->cong_cwnd >= call->cong_ssthresh && + call->cong_ca_state == RXRPC_CA_SLOW_START) { + call->cong_ca_state = RXRPC_CA_CONGEST_AVOIDANCE; call->cong_tstamp = skb->tstamp; - cumulative_acks = 0; + call->cong_cumul_acks = 0; } } - cumulative_acks += summary->nr_new_acks; - if (cumulative_acks > 255) - cumulative_acks = 255; + call->cong_cumul_acks += summary->nr_new_sacks; + if (call->cong_cumul_acks > 255) + call->cong_cumul_acks = 255; - summary->cwnd = call->cong_cwnd; - summary->ssthresh = call->cong_ssthresh; - summary->cumulative_acks = cumulative_acks; - summary->dup_acks = call->cong_dup_acks; - - switch (call->cong_mode) { - case RXRPC_CALL_SLOW_START: - if (summary->saw_nacks) + switch (call->cong_ca_state) { + case RXRPC_CA_SLOW_START: + if (summary->saw_snacks) goto packet_loss_detected; - if (summary->cumulative_acks > 0) - cwnd += 1; - if (cwnd >= call->cong_ssthresh) { - call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; + if (call->cong_cumul_acks > 0) + call->cong_cwnd += 1; + if (call->cong_cwnd >= call->cong_ssthresh) { + call->cong_ca_state = RXRPC_CA_CONGEST_AVOIDANCE; call->cong_tstamp = skb->tstamp; } goto out; - case RXRPC_CALL_CONGEST_AVOIDANCE: - if (summary->saw_nacks) + case RXRPC_CA_CONGEST_AVOIDANCE: + if (summary->saw_snacks) goto packet_loss_detected; /* We analyse the number of packets that get ACK'd per RTT @@ -88,18 +80,18 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, ktime_add_us(call->cong_tstamp, call->peer->srtt_us >> 3))) goto out_no_clear_ca; - change = rxrpc_cong_rtt_window_end; + summary->change = rxrpc_cong_rtt_window_end; call->cong_tstamp = skb->tstamp; - if (cumulative_acks >= cwnd) - cwnd++; + if (call->cong_cumul_acks >= call->cong_cwnd) + call->cong_cwnd++; goto out; - case RXRPC_CALL_PACKET_LOSS: - if (!summary->saw_nacks) + case RXRPC_CA_PACKET_LOSS: + if (!summary->saw_snacks) goto resume_normality; - if (summary->new_low_nack) { - change = rxrpc_cong_new_low_nack; + if (summary->new_low_snack) { + summary->change = rxrpc_cong_new_low_nack; call->cong_dup_acks = 1; if (call->cong_extra > 1) call->cong_extra = 1; @@ -110,29 +102,29 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, if (call->cong_dup_acks < 3) goto send_extra_data; - change = rxrpc_cong_begin_retransmission; - call->cong_mode = RXRPC_CALL_FAST_RETRANSMIT; - call->cong_ssthresh = umax(summary->flight_size / 2, 2); - cwnd = call->cong_ssthresh + 3; + summary->change = rxrpc_cong_begin_retransmission; + call->cong_ca_state = RXRPC_CA_FAST_RETRANSMIT; + call->cong_ssthresh = umax(summary->in_flight / 2, 2); + call->cong_cwnd = call->cong_ssthresh + 3; call->cong_extra = 0; call->cong_dup_acks = 0; resend = true; goto out; - case RXRPC_CALL_FAST_RETRANSMIT: - if (!summary->new_low_nack) { - if (summary->nr_new_acks == 0) - cwnd += 1; + case RXRPC_CA_FAST_RETRANSMIT: + if (!summary->new_low_snack) { + if (summary->nr_new_sacks == 0) + call->cong_cwnd += 1; call->cong_dup_acks++; if (call->cong_dup_acks == 2) { - change = rxrpc_cong_retransmit_again; + summary->change = rxrpc_cong_retransmit_again; call->cong_dup_acks = 0; resend = true; } } else { - change = rxrpc_cong_progress; - cwnd = call->cong_ssthresh; - if (!summary->saw_nacks) + summary->change = rxrpc_cong_progress; + call->cong_cwnd = call->cong_ssthresh; + if (!summary->saw_snacks) goto resume_normality; } goto out; @@ -143,30 +135,27 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, } resume_normality: - change = rxrpc_cong_cleared_nacks; + summary->change = rxrpc_cong_cleared_nacks; call->cong_dup_acks = 0; call->cong_extra = 0; call->cong_tstamp = skb->tstamp; - if (cwnd < call->cong_ssthresh) - call->cong_mode = RXRPC_CALL_SLOW_START; + if (call->cong_cwnd < call->cong_ssthresh) + call->cong_ca_state = RXRPC_CA_SLOW_START; else - call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; + call->cong_ca_state = RXRPC_CA_CONGEST_AVOIDANCE; out: - cumulative_acks = 0; + call->cong_cumul_acks = 0; out_no_clear_ca: - if (cwnd >= RXRPC_TX_MAX_WINDOW) - cwnd = RXRPC_TX_MAX_WINDOW; - call->cong_cwnd = cwnd; - call->cong_cumul_acks = cumulative_acks; - summary->mode = call->cong_mode; - trace_rxrpc_congest(call, summary, acked_serial, change); + if (call->cong_cwnd >= RXRPC_TX_MAX_WINDOW) + call->cong_cwnd = RXRPC_TX_MAX_WINDOW; + trace_rxrpc_congest(call, summary, acked_serial); if (resend) rxrpc_resend(call, skb); return; packet_loss_detected: - change = rxrpc_cong_saw_nack; - call->cong_mode = RXRPC_CALL_PACKET_LOSS; + summary->change = rxrpc_cong_saw_nack; + call->cong_ca_state = RXRPC_CA_PACKET_LOSS; call->cong_dup_acks = 0; goto send_extra_data; @@ -175,7 +164,7 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, * state. */ if (test_bit(RXRPC_CALL_TX_LAST, &call->flags) || - summary->nr_acks != call->tx_top - call->tx_bottom) { + call->acks_nr_sacks != call->tx_top - call->tx_bottom) { call->cong_extra++; wake_up(&call->waitq); } @@ -189,8 +178,8 @@ void rxrpc_congestion_degrade(struct rxrpc_call *call) { ktime_t rtt, now; - if (call->cong_mode != RXRPC_CALL_SLOW_START && - call->cong_mode != RXRPC_CALL_CONGEST_AVOIDANCE) + if (call->cong_ca_state != RXRPC_CA_SLOW_START && + call->cong_ca_state != RXRPC_CA_CONGEST_AVOIDANCE) return; if (__rxrpc_call_state(call) == RXRPC_CALL_CLIENT_AWAIT_REPLY) return; @@ -203,7 +192,7 @@ void rxrpc_congestion_degrade(struct rxrpc_call *call) trace_rxrpc_reset_cwnd(call, now); rxrpc_inc_stat(call->rxnet, stat_tx_data_cwnd_reset); call->tx_last_sent = now; - call->cong_mode = RXRPC_CALL_SLOW_START; + call->cong_ca_state = RXRPC_CA_SLOW_START; call->cong_ssthresh = umax(call->cong_ssthresh, call->cong_cwnd * 3 / 4); call->cong_cwnd = umax(call->cong_cwnd / 2, RXRPC_MIN_CWND); } @@ -282,7 +271,7 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, if (call->acks_lowest_nak == call->tx_bottom) { call->acks_lowest_nak = to; } else if (after(to, call->acks_lowest_nak)) { - summary->new_low_nack = true; + summary->new_low_snack = true; call->acks_lowest_nak = to; } @@ -795,11 +784,11 @@ static rxrpc_seq_t rxrpc_input_check_prev_ack(struct rxrpc_call *call, u8 *acks = skb->data + sizeof(struct rxrpc_wire_header) + sizeof(struct rxrpc_ackpacket); if (after_eq(seq, old_seq + sp->ack.nr_acks)) { - summary->nr_new_acks += sp->ack.nr_nacks; - summary->nr_new_acks += seq - (old_seq + sp->ack.nr_acks); - summary->nr_retained_nacks = 0; + summary->nr_new_sacks += sp->ack.nr_nacks; + summary->nr_new_sacks += seq - (old_seq + sp->ack.nr_acks); + summary->nr_retained_snacks = 0; } else if (seq == old_seq) { - summary->nr_retained_nacks = sp->ack.nr_nacks; + summary->nr_retained_snacks = sp->ack.nr_nacks; } else { for (i = 0; i < sp->ack.nr_acks; i++) { if (acks[i] == RXRPC_ACK_TYPE_NACK) { @@ -810,8 +799,8 @@ static rxrpc_seq_t rxrpc_input_check_prev_ack(struct rxrpc_call *call, } } - summary->nr_new_acks += new_acks; - summary->nr_retained_nacks = retained_nacks; + summary->nr_new_sacks += new_acks; + summary->nr_retained_snacks = retained_nacks; } return old_seq + sp->ack.nr_acks - 1; @@ -840,16 +829,16 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, for (i = 0; i < sp->ack.nr_acks; i++) { seq++; if (acks[i] == RXRPC_ACK_TYPE_ACK) { - summary->nr_acks++; + call->acks_nr_sacks++; if (after(seq, since)) - summary->nr_new_acks++; + summary->nr_new_sacks++; } else { - summary->saw_nacks = true; + summary->saw_snacks = true; if (before_eq(seq, since)) { /* Overlap with previous ACK */ old_nacks++; } else { - summary->nr_new_nacks++; + summary->nr_new_snacks++; sp->ack.nr_nacks++; } @@ -860,7 +849,7 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, if (lowest_nak != call->acks_lowest_nak) { call->acks_lowest_nak = lowest_nak; - summary->new_low_nack = true; + summary->new_low_snack = true; } /* We *can* have more nacks than we did - the peer is permitted to drop @@ -868,9 +857,9 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, * possible for the nack distribution to change whilst the number of * nacks stays the same or goes down. */ - if (old_nacks < summary->nr_retained_nacks) - summary->nr_new_acks += summary->nr_retained_nacks - old_nacks; - summary->nr_retained_nacks = old_nacks; + if (old_nacks < summary->nr_retained_snacks) + summary->nr_new_sacks += summary->nr_retained_snacks - old_nacks; + summary->nr_retained_snacks = old_nacks; } /* @@ -996,7 +985,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_free_skb(call->cong_last_nack, rxrpc_skb_put_last_nack); call->cong_last_nack = NULL; } else { - summary.nr_new_acks = hard_ack - call->acks_hard_ack; + summary.nr_new_sacks = hard_ack - call->acks_hard_ack; call->acks_lowest_nak = hard_ack + nr_acks; since = hard_ack; } @@ -1054,7 +1043,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) } if (test_bit(RXRPC_CALL_TX_LAST, &call->flags) && - summary.nr_acks == call->tx_top - hard_ack && + call->acks_nr_sacks == call->tx_top - hard_ack && rxrpc_is_client_call(call)) rxrpc_propose_ping(call, ack_serial, rxrpc_propose_ack_ping_for_lost_reply); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 3886777d1bb60..7ed928b6f0e16 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -419,7 +419,7 @@ static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, why = rxrpc_reqack_ack_lost; else if (txb->flags & RXRPC_TXBUF_RESENT) why = rxrpc_reqack_retrans; - else if (call->cong_mode == RXRPC_CALL_SLOW_START && call->cong_cwnd <= 2) + else if (call->cong_ca_state == RXRPC_CA_SLOW_START && call->cong_cwnd <= 2) why = rxrpc_reqack_slow_start; else if (call->tx_winsize <= 2) why = rxrpc_reqack_small_txwin; From 9b052c6b92f9316d670bf50566f70e183d0d19cb Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:51 +0000 Subject: [PATCH 0300/1450] rxrpc: Use the new rxrpc_tx_queue struct to more efficiently process ACKs With the change in the structure of the transmission buffer to store buffers in bunches of 32 or 64 (BITS_PER_LONG) we can place sets of per-buffer flags into the rxrpc_tx_queue struct rather than storing them in rxrpc_tx_buf, thereby vastly increasing efficiency when assessing the SACK table in an ACK packet. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-24-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 86 ++++++++++-- net/rxrpc/ar-internal.h | 23 +++- net/rxrpc/call_event.c | 181 ++++++++++++------------- net/rxrpc/call_object.c | 1 - net/rxrpc/input.c | 252 ++++++++++++++++++++++------------- net/rxrpc/output.c | 10 +- net/rxrpc/sendmsg.c | 3 + 7 files changed, 352 insertions(+), 204 deletions(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index d47b8235fad38..609522a5bd0f7 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -132,7 +132,6 @@ EM(rxrpc_skb_get_call_rx, "GET call-rx ") \ EM(rxrpc_skb_get_conn_secured, "GET conn-secd") \ EM(rxrpc_skb_get_conn_work, "GET conn-work") \ - EM(rxrpc_skb_get_last_nack, "GET last-nack") \ EM(rxrpc_skb_get_local_work, "GET locl-work") \ EM(rxrpc_skb_get_reject_work, "GET rej-work ") \ EM(rxrpc_skb_get_to_recvmsg, "GET to-recv ") \ @@ -147,7 +146,6 @@ EM(rxrpc_skb_put_error_report, "PUT error-rep") \ EM(rxrpc_skb_put_input, "PUT input ") \ EM(rxrpc_skb_put_jumbo_subpacket, "PUT jumbo-sub") \ - EM(rxrpc_skb_put_last_nack, "PUT last-nack") \ EM(rxrpc_skb_put_purge, "PUT purge ") \ EM(rxrpc_skb_put_rotate, "PUT rotate ") \ EM(rxrpc_skb_put_unknown, "PUT unknown ") \ @@ -499,6 +497,11 @@ EM(rxrpc_pmtud_reduce_icmp, "Icmp ") \ E_(rxrpc_pmtud_reduce_route, "Route") +#define rxrpc_rotate_traces \ + EM(rxrpc_rotate_trace_hack, "hard-ack") \ + EM(rxrpc_rotate_trace_sack, "soft-ack") \ + E_(rxrpc_rotate_trace_snak, "soft-nack") + /* * Generate enums for tracing information. */ @@ -525,6 +528,7 @@ enum rxrpc_propose_ack_trace { rxrpc_propose_ack_traces } __mode(byte); enum rxrpc_receive_trace { rxrpc_receive_traces } __mode(byte); enum rxrpc_recvmsg_trace { rxrpc_recvmsg_traces } __mode(byte); enum rxrpc_req_ack_trace { rxrpc_req_ack_traces } __mode(byte); +enum rxrpc_rotate_trace { rxrpc_rotate_traces } __mode(byte); enum rxrpc_rtt_rx_trace { rxrpc_rtt_rx_traces } __mode(byte); enum rxrpc_rtt_tx_trace { rxrpc_rtt_tx_traces } __mode(byte); enum rxrpc_sack_trace { rxrpc_sack_traces } __mode(byte); @@ -562,6 +566,7 @@ rxrpc_propose_ack_traces; rxrpc_receive_traces; rxrpc_recvmsg_traces; rxrpc_req_ack_traces; +rxrpc_rotate_traces; rxrpc_rtt_rx_traces; rxrpc_rtt_tx_traces; rxrpc_sack_traces; @@ -1667,6 +1672,7 @@ TRACE_EVENT(rxrpc_retransmit, TP_STRUCT__entry( __field(unsigned int, call) + __field(unsigned int, qbase) __field(rxrpc_seq_t, seq) __field(rxrpc_serial_t, serial) __field(ktime_t, expiry) @@ -1674,13 +1680,15 @@ TRACE_EVENT(rxrpc_retransmit, TP_fast_assign( __entry->call = call->debug_id; + __entry->qbase = req->tq->qbase; __entry->seq = req->seq; __entry->serial = txb->serial; __entry->expiry = expiry; ), - TP_printk("c=%08x q=%x r=%x xp=%lld", + TP_printk("c=%08x tq=%x q=%x r=%x xp=%lld", __entry->call, + __entry->qbase, __entry->seq, __entry->serial, ktime_to_us(__entry->expiry)) @@ -1724,7 +1732,7 @@ TRACE_EVENT(rxrpc_congest, memcpy(&__entry->sum, summary, sizeof(__entry->sum)); ), - TP_printk("c=%08x r=%08x %s q=%08x %s cw=%u ss=%u nA=%u,%u+%u,%u b=%u u=%u d=%u l=%x%s%s%s", + TP_printk("c=%08x r=%08x %s q=%08x %s cw=%u ss=%u A=%u+%u/%u+%u r=%u b=%u u=%u d=%u l=%x%s%s%s", __entry->call, __entry->ack_serial, __print_symbolic(__entry->sum.ack_reason, rxrpc_ack_names), @@ -1732,9 +1740,9 @@ TRACE_EVENT(rxrpc_congest, __print_symbolic(__entry->ca_state, rxrpc_ca_states), __entry->cwnd, __entry->ssthresh, - __entry->nr_sacks, __entry->sum.nr_retained_snacks, - __entry->sum.nr_new_sacks, - __entry->sum.nr_new_snacks, + __entry->nr_sacks, __entry->sum.nr_new_sacks, + __entry->nr_snacks, __entry->sum.nr_new_snacks, + __entry->sum.nr_new_hacks, __entry->top - __entry->hard_ack, __entry->cumul_acks, __entry->dup_acks, @@ -1850,10 +1858,36 @@ TRACE_EVENT(rxrpc_connect_call, &__entry->srx.transport) ); +TRACE_EVENT(rxrpc_apply_acks, + TP_PROTO(struct rxrpc_call *call, struct rxrpc_txqueue *tq), + + TP_ARGS(call, tq), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(unsigned int, nr_rep) + __field(rxrpc_seq_t, qbase) + __field(unsigned long, acks) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->qbase = tq->qbase; + __entry->acks = tq->segment_acked; + __entry->nr_rep = tq->nr_reported_acks; + ), + + TP_printk("c=%08x tq=%x acks=%016lx rep=%u", + __entry->call, + __entry->qbase, + __entry->acks, + __entry->nr_rep) + ); + TRACE_EVENT(rxrpc_resend, - TP_PROTO(struct rxrpc_call *call, struct sk_buff *ack), + TP_PROTO(struct rxrpc_call *call, rxrpc_serial_t ack_serial), - TP_ARGS(call, ack), + TP_ARGS(call, ack_serial), TP_STRUCT__entry( __field(unsigned int, call) @@ -1863,11 +1897,10 @@ TRACE_EVENT(rxrpc_resend, ), TP_fast_assign( - struct rxrpc_skb_priv *sp = ack ? rxrpc_skb(ack) : NULL; __entry->call = call->debug_id; __entry->seq = call->acks_hard_ack; __entry->transmitted = call->tx_transmitted; - __entry->ack_serial = sp ? sp->hdr.serial : 0; + __entry->ack_serial = ack_serial; ), TP_printk("c=%08x r=%x q=%x tq=%x", @@ -1877,6 +1910,37 @@ TRACE_EVENT(rxrpc_resend, __entry->transmitted) ); +TRACE_EVENT(rxrpc_rotate, + TP_PROTO(struct rxrpc_call *call, struct rxrpc_txqueue *tq, + struct rxrpc_ack_summary *summary, rxrpc_seq_t seq, + enum rxrpc_rotate_trace trace), + + TP_ARGS(call, tq, summary, seq, trace), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(rxrpc_seq_t, qbase) + __field(rxrpc_seq_t, seq) + __field(unsigned int, nr_rep) + __field(enum rxrpc_rotate_trace, trace) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->qbase = tq->qbase; + __entry->seq = seq; + __entry->nr_rep = tq->nr_reported_acks; + __entry->trace = trace; + ), + + TP_printk("c=%08x tq=%x q=%x nr=%x %s", + __entry->call, + __entry->qbase, + __entry->seq, + __entry->nr_rep, + __print_symbolic(__entry->trace, rxrpc_rotate_traces)) + ); + TRACE_EVENT(rxrpc_rx_icmp, TP_PROTO(struct rxrpc_peer *peer, struct sock_extended_err *ee, struct sockaddr_rxrpc *srx), diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index f6e6b2ab6c2a1..9a70f0b86570a 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -214,9 +214,8 @@ struct rxrpc_skb_priv { rxrpc_seq_t first_ack; /* First packet in acks table */ rxrpc_seq_t prev_ack; /* Highest seq seen */ rxrpc_serial_t acked_serial; /* Packet in response to (or 0) */ + u16 nr_acks; /* Number of acks+nacks */ u8 reason; /* Reason for ack */ - u8 nr_acks; /* Number of acks+nacks */ - u8 nr_nacks; /* Number of nacks */ } ack; }; struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */ @@ -734,7 +733,6 @@ struct rxrpc_call { u16 cong_dup_acks; /* Count of ACKs showing missing packets */ u16 cong_cumul_acks; /* Cumulative ACK count */ ktime_t cong_tstamp; /* Last time cwnd was changed */ - struct sk_buff *cong_last_nack; /* Last ACK with nacks received */ /* Receive-phase ACK management (ACKs we send). */ u8 ackr_reason; /* reason to ACK */ @@ -775,11 +773,10 @@ struct rxrpc_ack_summary { u16 nr_new_hacks; /* Number of rotated new ACKs */ u16 nr_new_sacks; /* Number of new soft ACKs in packet */ u16 nr_new_snacks; /* Number of new soft nacks in packet */ - u16 nr_retained_snacks; /* Number of nacks retained between ACKs */ u8 ack_reason; - bool saw_snacks:1; /* T if we saw a soft NACK */ bool new_low_snack:1; /* T if new low soft NACK found */ bool retrans_timeo:1; /* T if reTx due to timeout happened */ + bool need_retransmit:1; /* T if we need transmission */ u8 /*enum rxrpc_congest_change*/ change; }; @@ -858,6 +855,10 @@ struct rxrpc_txqueue { struct rxrpc_txqueue *next; ktime_t xmit_ts_base; rxrpc_seq_t qbase; + u8 nr_reported_acks; /* Number of segments explicitly acked/nacked */ + unsigned long segment_acked; /* Bit-per-buf: Set if ACK'd */ + unsigned long segment_lost; /* Bit-per-buf: Set if declared lost */ + unsigned long segment_retransmitted; /* Bit-per-buf: Set if retransmitted */ /* The arrays we want to pack into as few cache lines as possible. */ struct { @@ -935,7 +936,7 @@ void rxrpc_propose_ping(struct rxrpc_call *call, u32 serial, enum rxrpc_propose_ack_trace why); void rxrpc_propose_delay_ACK(struct rxrpc_call *, rxrpc_serial_t, enum rxrpc_propose_ack_trace); -void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb); +void rxrpc_resend(struct rxrpc_call *call, rxrpc_serial_t ack_serial, bool ping_response); bool rxrpc_input_call_event(struct rxrpc_call *call); @@ -1383,6 +1384,16 @@ static inline bool after_eq(u32 seq1, u32 seq2) return (s32)(seq1 - seq2) >= 0; } +static inline u32 earliest(u32 seq1, u32 seq2) +{ + return before(seq1, seq2) ? seq1 : seq2; +} + +static inline u32 latest(u32 seq1, u32 seq2) +{ + return after(seq1, seq2) ? seq1 : seq2; +} + static inline void rxrpc_queue_rx_call_packet(struct rxrpc_call *call, struct sk_buff *skb) { rxrpc_get_skb(skb, rxrpc_skb_get_call_rx); diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 2311e5c737e84..e25921d39d4d7 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -65,9 +65,9 @@ static void rxrpc_congestion_timeout(struct rxrpc_call *call) /* * Retransmit one or more packets. */ -static void rxrpc_retransmit_data(struct rxrpc_call *call, +static bool rxrpc_retransmit_data(struct rxrpc_call *call, struct rxrpc_send_data_req *req, - ktime_t rto) + ktime_t rto, bool skip_too_young) { struct rxrpc_txqueue *tq = req->tq; unsigned int ix = req->seq & RXRPC_TXQ_MASK; @@ -78,9 +78,11 @@ static void rxrpc_retransmit_data(struct rxrpc_call *call, xmit_ts = ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[ix]); resend_at = ktime_add(xmit_ts, rto); - trace_rxrpc_retransmit(call, req, txb, - ktime_sub(resend_at, req->now)); + trace_rxrpc_retransmit(call, req, txb, ktime_sub(resend_at, req->now)); + if (skip_too_young && ktime_after(resend_at, req->now)) + return false; + __set_bit(ix, &tq->segment_retransmitted); txb->flags |= RXRPC_TXBUF_RESENT; rxrpc_send_data_packet(call, req); rxrpc_inc_stat(call->rxnet, stat_tx_data_retrans); @@ -89,128 +91,119 @@ static void rxrpc_retransmit_data(struct rxrpc_call *call, req->n = 0; req->did_send = true; req->now = ktime_get_real(); + return true; } /* * Perform retransmission of NAK'd and unack'd packets. */ -void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb) +void rxrpc_resend(struct rxrpc_call *call, rxrpc_serial_t ack_serial, bool ping_response) { struct rxrpc_send_data_req req = { .now = ktime_get_real(), }; - struct rxrpc_ackpacket *ack = NULL; - struct rxrpc_skb_priv *sp; - struct rxrpc_txqueue *tq; - struct rxrpc_txbuf *txb; - rxrpc_seq_t transmitted = call->tx_transmitted, seq; - ktime_t next_resend = KTIME_MAX, rto = ns_to_ktime(call->peer->rto_us * NSEC_PER_USEC); - ktime_t resend_at = KTIME_MAX, delay; - bool unacked = false, did_send = false; - unsigned int qix; + struct rxrpc_txqueue *tq = call->tx_queue; + ktime_t lowest_xmit_ts = KTIME_MAX, rto = ns_to_ktime(call->peer->rto_us * NSEC_PER_USEC); + bool unacked = false; _enter("{%d,%d}", call->tx_bottom, call->tx_top); - if (call->tx_bottom == call->tx_top) - goto no_resend; + if (call->tx_bottom == call->tx_top) { + call->resend_at = KTIME_MAX; + trace_rxrpc_timer_can(call, rxrpc_timer_trace_resend); + return; + } - trace_rxrpc_resend(call, ack_skb); - tq = call->tx_queue; - seq = call->tx_bottom; + trace_rxrpc_resend(call, ack_serial); - /* Scan the soft ACK table and resend any explicitly NAK'd packets. */ - if (ack_skb) { - sp = rxrpc_skb(ack_skb); - ack = (void *)ack_skb->data + sizeof(struct rxrpc_wire_header); + /* Scan the transmission queue, looking for explicitly NAK'd packets. */ + do { + unsigned long naks = ~tq->segment_acked; + rxrpc_seq_t tq_top = tq->qbase + RXRPC_NR_TXQUEUE - 1; - for (int i = 0; i < sp->ack.nr_acks; i++) { - rxrpc_seq_t aseq; + if (after(tq->qbase, call->tx_transmitted)) + break; - if (ack->acks[i] & 1) - continue; - aseq = sp->ack.first_ack + i; - while (after_eq(aseq, tq->qbase + RXRPC_NR_TXQUEUE)) - tq = tq->next; - seq = aseq; - qix = seq - tq->qbase; - txb = tq->bufs[qix]; - if (after(seq, transmitted)) - goto no_further_resend; - - resend_at = ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[qix]); - resend_at = ktime_add(resend_at, rto); - if (after(txb->serial, call->acks_highest_serial)) { - if (ktime_after(resend_at, req.now) && - ktime_before(resend_at, next_resend)) - next_resend = resend_at; + if (tq->nr_reported_acks < RXRPC_NR_TXQUEUE) + naks &= (1UL << tq->nr_reported_acks) - 1; + + _debug("retr %16lx %u c=%08x [%x]", + tq->segment_acked, tq->nr_reported_acks, call->debug_id, tq->qbase); + _debug("nack %16lx", naks); + + while (naks) { + unsigned int ix = __ffs(naks); + struct rxrpc_txbuf *txb = tq->bufs[ix]; + + __clear_bit(ix, &naks); + if (after(txb->serial, call->acks_highest_serial)) continue; /* Ack point not yet reached */ - } rxrpc_see_txbuf(txb, rxrpc_txbuf_see_unacked); req.tq = tq; - req.seq = seq; + req.seq = tq->qbase + ix; req.n = 1; - rxrpc_retransmit_data(call, &req, rto); - - if (after_eq(seq, call->tx_top)) - goto no_further_resend; + rxrpc_retransmit_data(call, &req, rto, false); } - } - /* Fast-forward through the Tx queue to the point the peer says it has - * seen. Anything between the soft-ACK table and that point will get - * ACK'd or NACK'd in due course, so don't worry about it here; here we - * need to consider retransmitting anything beyond that point. - */ - seq = call->acks_prev_seq; - if (after_eq(seq, call->tx_transmitted)) - goto no_further_resend; - seq++; - - while (after_eq(seq, tq->qbase + RXRPC_NR_TXQUEUE)) - tq = tq->next; - - while (before_eq(seq, call->tx_transmitted)) { - qix = seq - tq->qbase; - if (qix >= RXRPC_NR_TXQUEUE) { - tq = tq->next; - continue; + /* Anything after the soft-ACK table up to and including + * ack.previousPacket will get ACK'd or NACK'd in due course, + * so don't worry about those here. We do, however, need to + * consider retransmitting anything beyond that point. + */ + if (tq->nr_reported_acks < RXRPC_NR_TXQUEUE && + after(tq_top, call->acks_prev_seq)) { + rxrpc_seq_t start = latest(call->acks_prev_seq, + tq->qbase + tq->nr_reported_acks); + rxrpc_seq_t stop = earliest(tq_top, call->tx_transmitted); + + _debug("unrep %x-%x", start, stop); + for (rxrpc_seq_t seq = start; before(seq, stop); seq++) { + struct rxrpc_txbuf *txb = tq->bufs[seq & RXRPC_TXQ_MASK]; + + if (ping_response && + before(txb->serial, call->acks_highest_serial)) + break; /* Wasn't accounted for by a more recent ping. */ + req.tq = tq; + req.seq = seq; + req.n = 1; + if (rxrpc_retransmit_data(call, &req, rto, true)) + unacked = true; + } } - txb = tq->bufs[qix]; - resend_at = ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[qix]); - resend_at = ktime_add(resend_at, rto); - if (ack && ack->reason == RXRPC_ACK_PING_RESPONSE && - before(txb->serial, ntohl(ack->serial))) - goto do_resend; /* Wasn't accounted for by a more recent ping. */ + /* Work out the next retransmission timeout. */ + if (ktime_before(tq->xmit_ts_base, lowest_xmit_ts)) { + unsigned int lowest_us = UINT_MAX; - if (ktime_after(resend_at, req.now)) { - if (ktime_before(resend_at, next_resend)) - next_resend = resend_at; - seq++; - continue; - } + for (int i = 0; i < RXRPC_NR_TXQUEUE; i++) + if (!test_bit(i, &tq->segment_acked) && + tq->segment_xmit_ts[i] < lowest_us) + lowest_us = tq->segment_xmit_ts[i]; + _debug("lowest[%x] %llx %u", tq->qbase, tq->xmit_ts_base, lowest_us); - do_resend: - unacked = true; + if (lowest_us != UINT_MAX) { + ktime_t lowest_ns = ktime_add_us(tq->xmit_ts_base, lowest_us); - req.tq = tq; - req.seq = seq; - req.n = 1; - rxrpc_retransmit_data(call, &req, rto); - seq++; - } + if (ktime_before(lowest_ns, lowest_xmit_ts)) + lowest_xmit_ts = lowest_ns; + } + } + } while ((tq = tq->next)); + + if (lowest_xmit_ts < KTIME_MAX) { + ktime_t delay = rxrpc_get_rto_backoff(call->peer, req.did_send); + ktime_t resend_at = ktime_add(lowest_xmit_ts, delay); -no_further_resend: -no_resend: - if (resend_at < KTIME_MAX) { - delay = rxrpc_get_rto_backoff(call->peer, did_send); - resend_at = ktime_add(resend_at, delay); + _debug("delay %llu %lld", delay, ktime_sub(resend_at, req.now)); + call->resend_at = resend_at; trace_rxrpc_timer_set(call, resend_at - req.now, rxrpc_timer_trace_resend_reset); + } else { + call->resend_at = KTIME_MAX; + trace_rxrpc_timer_can(call, rxrpc_timer_trace_resend); } - call->resend_at = resend_at; if (unacked) rxrpc_congestion_timeout(call); @@ -494,7 +487,7 @@ bool rxrpc_input_call_event(struct rxrpc_call *call) if (resend && __rxrpc_call_state(call) != RXRPC_CALL_CLIENT_RECV_REPLY && !test_bit(RXRPC_CALL_TX_ALL_ACKED, &call->flags)) - rxrpc_resend(call, NULL); + rxrpc_resend(call, 0, false); if (test_and_clear_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags)) rxrpc_send_ACK(call, RXRPC_ACK_IDLE, 0, diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index a9682b31a4f9f..bba058055c97f 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -691,7 +691,6 @@ static void rxrpc_destroy_call(struct work_struct *work) del_timer_sync(&call->timer); - rxrpc_free_skb(call->cong_last_nack, rxrpc_skb_put_last_nack); rxrpc_cleanup_tx_buffers(call); rxrpc_cleanup_rx_buffers(call); rxrpc_put_txbuf(call->tx_pending, rxrpc_txbuf_put_cleaned); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index c25d816aafeed..6e7ff133b5aac 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -34,8 +34,6 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, struct rxrpc_ack_summary *summary, rxrpc_serial_t acked_serial) { - bool resend = false; - summary->change = rxrpc_cong_no_change; summary->in_flight = (call->tx_top - call->tx_bottom) - call->acks_nr_sacks; @@ -52,12 +50,13 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, } call->cong_cumul_acks += summary->nr_new_sacks; + call->cong_cumul_acks += summary->nr_new_hacks; if (call->cong_cumul_acks > 255) call->cong_cumul_acks = 255; switch (call->cong_ca_state) { case RXRPC_CA_SLOW_START: - if (summary->saw_snacks) + if (call->acks_nr_snacks > 0) goto packet_loss_detected; if (call->cong_cumul_acks > 0) call->cong_cwnd += 1; @@ -68,7 +67,7 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, goto out; case RXRPC_CA_CONGEST_AVOIDANCE: - if (summary->saw_snacks) + if (call->acks_nr_snacks > 0) goto packet_loss_detected; /* We analyse the number of packets that get ACK'd per RTT @@ -87,7 +86,7 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, goto out; case RXRPC_CA_PACKET_LOSS: - if (!summary->saw_snacks) + if (call->acks_nr_snacks == 0) goto resume_normality; if (summary->new_low_snack) { @@ -108,7 +107,7 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, call->cong_cwnd = call->cong_ssthresh + 3; call->cong_extra = 0; call->cong_dup_acks = 0; - resend = true; + summary->need_retransmit = true; goto out; case RXRPC_CA_FAST_RETRANSMIT: @@ -119,12 +118,12 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, if (call->cong_dup_acks == 2) { summary->change = rxrpc_cong_retransmit_again; call->cong_dup_acks = 0; - resend = true; + summary->need_retransmit = true; } } else { summary->change = rxrpc_cong_progress; call->cong_cwnd = call->cong_ssthresh; - if (!summary->saw_snacks) + if (call->acks_nr_snacks == 0) goto resume_normality; } goto out; @@ -149,8 +148,6 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, if (call->cong_cwnd >= RXRPC_TX_MAX_WINDOW) call->cong_cwnd = RXRPC_TX_MAX_WINDOW; trace_rxrpc_congest(call, summary, acked_serial); - if (resend) - rxrpc_resend(call, skb); return; packet_loss_detected: @@ -212,6 +209,13 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, trace_rxrpc_tx_rotate(call, seq, to); trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate); + if (call->acks_lowest_nak == call->tx_bottom) { + call->acks_lowest_nak = to; + } else if (after(to, call->acks_lowest_nak)) { + summary->new_low_snack = true; + call->acks_lowest_nak = to; + } + /* We may have a left over fully-consumed buffer at the front that we * couldn't drop before (rotate_and_keep below). */ @@ -231,6 +235,25 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, set_bit(RXRPC_CALL_TX_LAST, &call->flags); rot_last = true; } + + if (ix == tq->nr_reported_acks) { + /* Packet directly hard ACK'd. */ + tq->nr_reported_acks++; + summary->nr_new_hacks++; + __set_bit(ix, &tq->segment_acked); + trace_rxrpc_rotate(call, tq, summary, seq, rxrpc_rotate_trace_hack); + } else if (test_bit(ix, &tq->segment_acked)) { + /* Soft ACK -> hard ACK. */ + call->acks_nr_sacks--; + trace_rxrpc_rotate(call, tq, summary, seq, rxrpc_rotate_trace_sack); + } else { + /* Soft NAK -> hard ACK. */ + call->acks_nr_snacks--; + summary->nr_new_hacks++; + __set_bit(ix, &tq->segment_acked); + trace_rxrpc_rotate(call, tq, summary, seq, rxrpc_rotate_trace_snak); + } + rxrpc_put_txbuf(tq->bufs[ix], rxrpc_txbuf_put_rotated); tq->bufs[ix] = NULL; @@ -268,13 +291,6 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, _debug("%x,%x,%x,%d", to, call->tx_bottom, call->tx_top, rot_last); - if (call->acks_lowest_nak == call->tx_bottom) { - call->acks_lowest_nak = to; - } else if (after(to, call->acks_lowest_nak)) { - summary->new_low_snack = true; - call->acks_lowest_nak = to; - } - wake_up(&call->waitq); return rot_last; } @@ -293,11 +309,6 @@ static void rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun, call->resend_at = KTIME_MAX; trace_rxrpc_timer_can(call, rxrpc_timer_trace_resend); - if (unlikely(call->cong_last_nack)) { - rxrpc_free_skb(call->cong_last_nack, rxrpc_skb_put_last_nack); - call->cong_last_nack = NULL; - } - switch (__rxrpc_call_state(call)) { case RXRPC_CALL_CLIENT_SEND_REQUEST: case RXRPC_CALL_CLIENT_AWAIT_REPLY: @@ -770,40 +781,92 @@ static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb wake_up(&call->waitq); } +#if defined(CONFIG_X86) && __GNUC__ && !defined(__clang__) +/* Clang doesn't support the %z constraint modifier */ +#define shiftr_adv_rotr(shift_from, rotate_into) ({ \ + asm(" shr%z1 %1\n" \ + " inc %0\n" \ + " rcr%z2 %2\n" \ + : "+d"(shift_from), "+m"(*(shift_from)), "+rm"(rotate_into) \ + ); \ + }) +#else +#define shiftr_adv_rotr(shift_from, rotate_into) ({ \ + typeof(rotate_into) __bit0 = *(shift_from) & 1; \ + *(shift_from) >>= 1; \ + shift_from++; \ + rotate_into >>= 1; \ + rotate_into |= __bit0 << (sizeof(rotate_into) * 8 - 1); \ + }) +#endif + /* - * Determine how many nacks from the previous ACK have now been satisfied. + * Process a batch of soft ACKs specific to a transmission queue segment. */ -static rxrpc_seq_t rxrpc_input_check_prev_ack(struct rxrpc_call *call, - struct rxrpc_ack_summary *summary, - rxrpc_seq_t hard_ack) +static void rxrpc_input_soft_ack_tq(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + unsigned long extracted_acks, + int nr_reported, + rxrpc_seq_t seq, + rxrpc_seq_t *lowest_nak) { - struct sk_buff *skb = call->cong_last_nack; - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - unsigned int i, new_acks = 0, retained_nacks = 0; - rxrpc_seq_t seq = hard_ack + 1, old_seq = sp->ack.first_ack; - u8 *acks = skb->data + sizeof(struct rxrpc_wire_header) + sizeof(struct rxrpc_ackpacket); + unsigned long old_reported, flipped, new_acks, a_to_n, n_to_a; + int new, a, n; + + old_reported = ~0UL >> (RXRPC_NR_TXQUEUE - tq->nr_reported_acks); + _enter("{%x,%lx,%d},%lx,%d,%x", + tq->qbase, tq->segment_acked, tq->nr_reported_acks, + extracted_acks, nr_reported, seq); + + _debug("[%x]", tq->qbase); + _debug("tq %16lx %u", tq->segment_acked, tq->nr_reported_acks); + _debug("sack %16lx %u", extracted_acks, nr_reported); + + /* See how many previously logged ACKs/NAKs have flipped. */ + flipped = (tq->segment_acked ^ extracted_acks) & old_reported; + if (flipped) { + n_to_a = ~tq->segment_acked & flipped; /* Old NAK -> ACK */ + a_to_n = tq->segment_acked & flipped; /* Old ACK -> NAK */ + a = hweight_long(n_to_a); + n = hweight_long(a_to_n); + _debug("flip %16lx", flipped); + _debug("ntoa %16lx %d", n_to_a, a); + _debug("aton %16lx %d", a_to_n, n); + call->acks_nr_sacks += a - n; + call->acks_nr_snacks += n - a; + summary->nr_new_sacks += a; + summary->nr_new_snacks += n; + } - if (after_eq(seq, old_seq + sp->ack.nr_acks)) { - summary->nr_new_sacks += sp->ack.nr_nacks; - summary->nr_new_sacks += seq - (old_seq + sp->ack.nr_acks); - summary->nr_retained_snacks = 0; - } else if (seq == old_seq) { - summary->nr_retained_snacks = sp->ack.nr_nacks; - } else { - for (i = 0; i < sp->ack.nr_acks; i++) { - if (acks[i] == RXRPC_ACK_TYPE_NACK) { - if (before(old_seq + i, seq)) - new_acks++; - else - retained_nacks++; - } + /* See how many new ACKs/NAKs have been acquired. */ + new = nr_reported - tq->nr_reported_acks; + if (new > 0) { + new_acks = extracted_acks & ~old_reported; + if (new_acks) { + a = hweight_long(new_acks); + n = new - a; + _debug("new_a %16lx new=%d a=%d n=%d", new_acks, new, a, n); + call->acks_nr_sacks += a; + call->acks_nr_snacks += n; + summary->nr_new_sacks += a; + summary->nr_new_snacks += n; + } else { + call->acks_nr_snacks += new; + summary->nr_new_snacks += new; } - - summary->nr_new_sacks += new_acks; - summary->nr_retained_snacks = retained_nacks; } - return old_seq + sp->ack.nr_acks - 1; + tq->nr_reported_acks = nr_reported; + tq->segment_acked = extracted_acks; + trace_rxrpc_apply_acks(call, tq); + + if (extracted_acks != ~0UL) { + rxrpc_seq_t lowest = seq + ffz(extracted_acks); + + if (before(lowest, *lowest_nak)) + *lowest_nak = lowest; + } } /* @@ -817,39 +880,50 @@ static rxrpc_seq_t rxrpc_input_check_prev_ack(struct rxrpc_call *call, */ static void rxrpc_input_soft_acks(struct rxrpc_call *call, struct rxrpc_ack_summary *summary, - struct sk_buff *skb, - rxrpc_seq_t since) + struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - unsigned int i, old_nacks = 0; - rxrpc_seq_t lowest_nak = call->acks_hard_ack + sp->ack.nr_acks + 1; - rxrpc_seq_t seq = call->acks_hard_ack; + struct rxrpc_txqueue *tq = call->tx_queue; + unsigned long extracted = ~0UL; + unsigned int nr = 0; + rxrpc_seq_t seq = call->acks_hard_ack + 1; + rxrpc_seq_t lowest_nak = seq + sp->ack.nr_acks; u8 *acks = skb->data + sizeof(struct rxrpc_wire_header) + sizeof(struct rxrpc_ackpacket); - for (i = 0; i < sp->ack.nr_acks; i++) { - seq++; - if (acks[i] == RXRPC_ACK_TYPE_ACK) { - call->acks_nr_sacks++; - if (after(seq, since)) - summary->nr_new_sacks++; - } else { - summary->saw_snacks = true; - if (before_eq(seq, since)) { - /* Overlap with previous ACK */ - old_nacks++; - } else { - summary->nr_new_snacks++; - sp->ack.nr_nacks++; - } + _enter("%x,%x,%u", tq->qbase, seq, sp->ack.nr_acks); + + while (after(seq, tq->qbase + RXRPC_NR_TXQUEUE - 1)) + tq = tq->next; - if (before(seq, lowest_nak)) - lowest_nak = seq; + for (unsigned int i = 0; i < sp->ack.nr_acks; i++) { + /* Decant ACKs until we hit a txqueue boundary. */ + shiftr_adv_rotr(acks, extracted); + if (i == 256) { + acks -= i; + i = 0; } + seq++; + nr++; + if ((seq & RXRPC_TXQ_MASK) != 0) + continue; + + _debug("bound %16lx %u", extracted, nr); + + rxrpc_input_soft_ack_tq(call, summary, tq, extracted, RXRPC_NR_TXQUEUE, + seq - RXRPC_NR_TXQUEUE, &lowest_nak); + extracted = ~0UL; + nr = 0; + tq = tq->next; + prefetch(tq); } - if (lowest_nak != call->acks_lowest_nak) { - call->acks_lowest_nak = lowest_nak; - summary->new_low_snack = true; + if (nr) { + unsigned int nr_reported = seq & RXRPC_TXQ_MASK; + + extracted >>= RXRPC_NR_TXQUEUE - nr_reported; + _debug("tail %16lx %u", extracted, nr_reported); + rxrpc_input_soft_ack_tq(call, summary, tq, extracted, nr_reported, + seq & ~RXRPC_TXQ_MASK, &lowest_nak); } /* We *can* have more nacks than we did - the peer is permitted to drop @@ -857,9 +931,14 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, * possible for the nack distribution to change whilst the number of * nacks stays the same or goes down. */ - if (old_nacks < summary->nr_retained_snacks) - summary->nr_new_sacks += summary->nr_retained_snacks - old_nacks; - summary->nr_retained_snacks = old_nacks; + if (lowest_nak != call->acks_lowest_nak) { + call->acks_lowest_nak = lowest_nak; + summary->new_low_snack = true; + } + + _debug("summary A=%d+%d N=%d+%d", + call->acks_nr_sacks, summary->nr_new_sacks, + call->acks_nr_snacks, summary->nr_new_snacks); } /* @@ -902,7 +981,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) struct rxrpc_acktrailer trailer; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); rxrpc_serial_t ack_serial, acked_serial; - rxrpc_seq_t first_soft_ack, hard_ack, prev_pkt, since; + rxrpc_seq_t first_soft_ack, hard_ack, prev_pkt; int nr_acks, offset, ioffset; _enter(""); @@ -920,6 +999,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) trace_rxrpc_rx_ack(call, sp); rxrpc_inc_stat(call->rxnet, stat_rx_acks[summary.ack_reason]); + prefetch(call->tx_queue); if (acked_serial != 0) { switch (summary.ack_reason) { @@ -980,16 +1060,6 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) if (nr_acks > 0) skb_condense(skb); - if (call->cong_last_nack) { - since = rxrpc_input_check_prev_ack(call, &summary, hard_ack); - rxrpc_free_skb(call->cong_last_nack, rxrpc_skb_put_last_nack); - call->cong_last_nack = NULL; - } else { - summary.nr_new_sacks = hard_ack - call->acks_hard_ack; - call->acks_lowest_nak = hard_ack + nr_acks; - since = hard_ack; - } - call->acks_latest_ts = skb->tstamp; call->acks_hard_ack = hard_ack; call->acks_prev_seq = prev_pkt; @@ -1037,9 +1107,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) if (nr_acks > 0) { if (offset > (int)skb->len - nr_acks) return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_short_sack); - rxrpc_input_soft_acks(call, &summary, skb, since); - rxrpc_get_skb(skb, rxrpc_skb_get_last_nack); - call->cong_last_nack = skb; + rxrpc_input_soft_acks(call, &summary, skb); } if (test_bit(RXRPC_CALL_TX_LAST, &call->flags) && @@ -1049,6 +1117,8 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_propose_ack_ping_for_lost_reply); rxrpc_congestion_management(call, skb, &summary, acked_serial); + if (summary.need_retransmit) + rxrpc_resend(call, ack_serial, summary.ack_reason == RXRPC_ACK_PING_RESPONSE); send_response: if (summary.ack_reason == RXRPC_ACK_PING) diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 7ed928b6f0e16..978c2dc6a7d45 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -461,7 +461,7 @@ static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, len += sizeof(*jumbo); } - trace_rxrpc_tx_data(call, txb->seq, txb->serial, flags, false); + trace_rxrpc_tx_data(call, txb->seq, txb->serial, txb->flags | flags, false); kv->iov_len = len; return len; } @@ -522,6 +522,13 @@ static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_se } /* Set timeouts */ + if (call->peer->rtt_count > 1) { + ktime_t delay = rxrpc_get_rto_backoff(call->peer, false); + + call->ack_lost_at = ktime_add(req->now, delay); + trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_lost_ack); + } + if (!test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags)) { ktime_t delay = ms_to_ktime(READ_ONCE(call->next_rx_timo)); @@ -596,6 +603,7 @@ void rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req ret = 0; trace_rxrpc_tx_data(call, txb->seq, txb->serial, txb->flags, true); + conn->peer->last_tx_at = ktime_get_seconds(); goto done; } } diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index dfbf9f4b24b6d..381b25597f4e8 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -299,6 +299,9 @@ static int rxrpc_alloc_txqueue(struct sock *sk, struct rxrpc_call *call) kfree(tq); return -ENOMEM; } else { + /* We start at seq 1, so pretend seq 0 is hard-acked. */ + tq->nr_reported_acks = 1; + tq->segment_acked = 1UL; tq->qbase = 0; call->tx_qbase = 0; call->send_queue = tq; From dcdff0d8e3b61033b28c72926997d458949fcc05 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:52 +0000 Subject: [PATCH 0301/1450] rxrpc: Store the DATA serial in the txqueue and use this in RTT calc Store the serial number set on a DATA packet at the point of transmission in the rxrpc_txqueue struct and when an ACK is received, match the reference number in the ACK by trawling the txqueue rather than sharing an RTT table with ACK RTT. This can be done as part of Tx queue rotation. This means we have a lot more RTT samples available and is faster to search with all the serial numbers packed together into a few cachelines rather than being hung off different txbufs. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-25-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 14 ++---- net/rxrpc/ar-internal.h | 4 ++ net/rxrpc/call_event.c | 8 +-- net/rxrpc/input.c | 94 +++++++++++++++++++++++------------- net/rxrpc/output.c | 6 ++- 5 files changed, 79 insertions(+), 47 deletions(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 609522a5bd0f7..798bea0853c4c 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -337,11 +337,10 @@ E_(rxrpc_rtt_tx_ping, "PING") #define rxrpc_rtt_rx_traces \ - EM(rxrpc_rtt_rx_other_ack, "OACK") \ + EM(rxrpc_rtt_rx_data_ack, "DACK") \ EM(rxrpc_rtt_rx_obsolete, "OBSL") \ EM(rxrpc_rtt_rx_lost, "LOST") \ - EM(rxrpc_rtt_rx_ping_response, "PONG") \ - E_(rxrpc_rtt_rx_requested_ack, "RACK") + E_(rxrpc_rtt_rx_ping_response, "PONG") #define rxrpc_timer_traces \ EM(rxrpc_timer_trace_delayed_ack, "DelayAck ") \ @@ -1695,10 +1694,9 @@ TRACE_EVENT(rxrpc_retransmit, ); TRACE_EVENT(rxrpc_congest, - TP_PROTO(struct rxrpc_call *call, struct rxrpc_ack_summary *summary, - rxrpc_serial_t ack_serial), + TP_PROTO(struct rxrpc_call *call, struct rxrpc_ack_summary *summary), - TP_ARGS(call, summary, ack_serial), + TP_ARGS(call, summary), TP_STRUCT__entry( __field(unsigned int, call) @@ -1706,7 +1704,6 @@ TRACE_EVENT(rxrpc_congest, __field(rxrpc_seq_t, hard_ack) __field(rxrpc_seq_t, top) __field(rxrpc_seq_t, lowest_nak) - __field(rxrpc_serial_t, ack_serial) __field(u16, nr_sacks) __field(u16, nr_snacks) __field(u16, cwnd) @@ -1722,7 +1719,6 @@ TRACE_EVENT(rxrpc_congest, __entry->hard_ack = call->acks_hard_ack; __entry->top = call->tx_top; __entry->lowest_nak = call->acks_lowest_nak; - __entry->ack_serial = ack_serial; __entry->nr_sacks = call->acks_nr_sacks; __entry->nr_snacks = call->acks_nr_snacks; __entry->cwnd = call->cong_cwnd; @@ -1734,7 +1730,7 @@ TRACE_EVENT(rxrpc_congest, TP_printk("c=%08x r=%08x %s q=%08x %s cw=%u ss=%u A=%u+%u/%u+%u r=%u b=%u u=%u d=%u l=%x%s%s%s", __entry->call, - __entry->ack_serial, + __entry->sum.acked_serial, __print_symbolic(__entry->sum.ack_reason, rxrpc_ack_names), __entry->hard_ack, __print_symbolic(__entry->ca_state, rxrpc_ca_states), diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 9a70f0b86570a..297be421639cc 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -769,6 +769,7 @@ struct rxrpc_call { * Summary of a new ACK and the changes it made to the Tx buffer packet states. */ struct rxrpc_ack_summary { + rxrpc_serial_t acked_serial; /* Serial number ACK'd */ u16 in_flight; /* Number of unreceived transmissions */ u16 nr_new_hacks; /* Number of rotated new ACKs */ u16 nr_new_sacks; /* Number of new soft ACKs in packet */ @@ -777,6 +778,7 @@ struct rxrpc_ack_summary { bool new_low_snack:1; /* T if new low soft NACK found */ bool retrans_timeo:1; /* T if reTx due to timeout happened */ bool need_retransmit:1; /* T if we need transmission */ + bool rtt_sample_avail:1; /* T if RTT sample available */ u8 /*enum rxrpc_congest_change*/ change; }; @@ -859,12 +861,14 @@ struct rxrpc_txqueue { unsigned long segment_acked; /* Bit-per-buf: Set if ACK'd */ unsigned long segment_lost; /* Bit-per-buf: Set if declared lost */ unsigned long segment_retransmitted; /* Bit-per-buf: Set if retransmitted */ + unsigned long rtt_samples; /* Bit-per-buf: Set if available for RTT */ /* The arrays we want to pack into as few cache lines as possible. */ struct { #define RXRPC_NR_TXQUEUE BITS_PER_LONG #define RXRPC_TXQ_MASK (RXRPC_NR_TXQUEUE - 1) struct rxrpc_txbuf *bufs[RXRPC_NR_TXQUEUE]; + unsigned int segment_serial[RXRPC_NR_TXQUEUE]; unsigned int segment_xmit_ts[RXRPC_NR_TXQUEUE]; } ____cacheline_aligned; }; diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index e25921d39d4d7..f71773b18e221 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -159,11 +159,11 @@ void rxrpc_resend(struct rxrpc_call *call, rxrpc_serial_t ack_serial, bool ping_ rxrpc_seq_t stop = earliest(tq_top, call->tx_transmitted); _debug("unrep %x-%x", start, stop); - for (rxrpc_seq_t seq = start; before(seq, stop); seq++) { - struct rxrpc_txbuf *txb = tq->bufs[seq & RXRPC_TXQ_MASK]; + for (rxrpc_seq_t seq = start; before_eq(seq, stop); seq++) { + rxrpc_serial_t serial = tq->segment_serial[seq & RXRPC_TXQ_MASK]; if (ping_response && - before(txb->serial, call->acks_highest_serial)) + before(serial, call->acks_highest_serial)) break; /* Wasn't accounted for by a more recent ping. */ req.tq = tq; req.seq = seq; @@ -198,7 +198,7 @@ void rxrpc_resend(struct rxrpc_call *call, rxrpc_serial_t ack_serial, bool ping_ _debug("delay %llu %lld", delay, ktime_sub(resend_at, req.now)); call->resend_at = resend_at; - trace_rxrpc_timer_set(call, resend_at - req.now, + trace_rxrpc_timer_set(call, ktime_sub(resend_at, req.now), rxrpc_timer_trace_resend_reset); } else { call->resend_at = KTIME_MAX; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 6e7ff133b5aac..41b4fb56f96cd 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -30,9 +30,7 @@ static void rxrpc_proto_abort(struct rxrpc_call *call, rxrpc_seq_t seq, * Do TCP-style congestion management [RFC 5681]. */ static void rxrpc_congestion_management(struct rxrpc_call *call, - struct sk_buff *skb, - struct rxrpc_ack_summary *summary, - rxrpc_serial_t acked_serial) + struct rxrpc_ack_summary *summary) { summary->change = rxrpc_cong_no_change; summary->in_flight = (call->tx_top - call->tx_bottom) - call->acks_nr_sacks; @@ -44,7 +42,7 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, if (call->cong_cwnd >= call->cong_ssthresh && call->cong_ca_state == RXRPC_CA_SLOW_START) { call->cong_ca_state = RXRPC_CA_CONGEST_AVOIDANCE; - call->cong_tstamp = skb->tstamp; + call->cong_tstamp = call->acks_latest_ts; call->cong_cumul_acks = 0; } } @@ -62,7 +60,7 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, call->cong_cwnd += 1; if (call->cong_cwnd >= call->cong_ssthresh) { call->cong_ca_state = RXRPC_CA_CONGEST_AVOIDANCE; - call->cong_tstamp = skb->tstamp; + call->cong_tstamp = call->acks_latest_ts; } goto out; @@ -75,12 +73,12 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, */ if (call->peer->rtt_count == 0) goto out; - if (ktime_before(skb->tstamp, + if (ktime_before(call->acks_latest_ts, ktime_add_us(call->cong_tstamp, call->peer->srtt_us >> 3))) goto out_no_clear_ca; summary->change = rxrpc_cong_rtt_window_end; - call->cong_tstamp = skb->tstamp; + call->cong_tstamp = call->acks_latest_ts; if (call->cong_cumul_acks >= call->cong_cwnd) call->cong_cwnd++; goto out; @@ -137,7 +135,7 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, summary->change = rxrpc_cong_cleared_nacks; call->cong_dup_acks = 0; call->cong_extra = 0; - call->cong_tstamp = skb->tstamp; + call->cong_tstamp = call->acks_latest_ts; if (call->cong_cwnd < call->cong_ssthresh) call->cong_ca_state = RXRPC_CA_SLOW_START; else @@ -147,7 +145,7 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, out_no_clear_ca: if (call->cong_cwnd >= RXRPC_TX_MAX_WINDOW) call->cong_cwnd = RXRPC_TX_MAX_WINDOW; - trace_rxrpc_congest(call, summary, acked_serial); + trace_rxrpc_congest(call, summary); return; packet_loss_detected: @@ -194,11 +192,29 @@ void rxrpc_congestion_degrade(struct rxrpc_call *call) call->cong_cwnd = umax(call->cong_cwnd / 2, RXRPC_MIN_CWND); } +/* + * Add an RTT sample derived from an ACK'd DATA packet. + */ +static void rxrpc_add_data_rtt_sample(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + int ix, + rxrpc_serial_t ack_serial) +{ + rxrpc_peer_add_rtt(call, rxrpc_rtt_rx_data_ack, -1, + summary->acked_serial, ack_serial, + ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[ix]), + call->acks_latest_ts); + summary->rtt_sample_avail = false; + __clear_bit(ix, &tq->rtt_samples); /* Prevent repeat RTT sample */ +} + /* * Apply a hard ACK by advancing the Tx window. */ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, - struct rxrpc_ack_summary *summary) + struct rxrpc_ack_summary *summary, + rxrpc_serial_t ack_serial) { struct rxrpc_txqueue *tq = call->tx_queue; rxrpc_seq_t seq = call->tx_bottom + 1; @@ -236,6 +252,11 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, rot_last = true; } + if (summary->rtt_sample_avail && + summary->acked_serial == tq->segment_serial[ix] && + test_bit(ix, &tq->rtt_samples)) + rxrpc_add_data_rtt_sample(call, summary, tq, ix, ack_serial); + if (ix == tq->nr_reported_acks) { /* Packet directly hard ACK'd. */ tq->nr_reported_acks++; @@ -348,7 +369,7 @@ static bool rxrpc_receiving_reply(struct rxrpc_call *call) } if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags)) { - if (!rxrpc_rotate_tx_window(call, top, &summary)) { + if (!rxrpc_rotate_tx_window(call, top, &summary, 0)) { rxrpc_proto_abort(call, top, rxrpc_eproto_early_reply); return false; } @@ -800,6 +821,19 @@ static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb }) #endif +/* + * Deal with RTT samples from soft ACKs. + */ +static void rxrpc_input_soft_rtt(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + rxrpc_serial_t ack_serial) +{ + for (int ix = 0; ix < RXRPC_NR_TXQUEUE; ix++) + if (summary->acked_serial == tq->segment_serial[ix]) + return rxrpc_add_data_rtt_sample(call, summary, tq, ix, ack_serial); +} + /* * Process a batch of soft ACKs specific to a transmission queue segment. */ @@ -909,6 +943,8 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, _debug("bound %16lx %u", extracted, nr); + if (summary->rtt_sample_avail) + rxrpc_input_soft_rtt(call, summary, tq, sp->hdr.serial); rxrpc_input_soft_ack_tq(call, summary, tq, extracted, RXRPC_NR_TXQUEUE, seq - RXRPC_NR_TXQUEUE, &lowest_nak); extracted = ~0UL; @@ -980,7 +1016,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) struct rxrpc_ack_summary summary = { 0 }; struct rxrpc_acktrailer trailer; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - rxrpc_serial_t ack_serial, acked_serial; + rxrpc_serial_t ack_serial; rxrpc_seq_t first_soft_ack, hard_ack, prev_pkt; int nr_acks, offset, ioffset; @@ -989,11 +1025,11 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) offset = sizeof(struct rxrpc_wire_header) + sizeof(struct rxrpc_ackpacket); ack_serial = sp->hdr.serial; - acked_serial = sp->ack.acked_serial; first_soft_ack = sp->ack.first_ack; prev_pkt = sp->ack.prev_ack; nr_acks = sp->ack.nr_acks; hard_ack = first_soft_ack - 1; + summary.acked_serial = sp->ack.acked_serial; summary.ack_reason = (sp->ack.reason < RXRPC_ACK__INVALID ? sp->ack.reason : RXRPC_ACK__INVALID); @@ -1001,21 +1037,12 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_inc_stat(call->rxnet, stat_rx_acks[summary.ack_reason]); prefetch(call->tx_queue); - if (acked_serial != 0) { - switch (summary.ack_reason) { - case RXRPC_ACK_PING_RESPONSE: - rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial, - rxrpc_rtt_rx_ping_response); - break; - case RXRPC_ACK_REQUESTED: - rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial, - rxrpc_rtt_rx_requested_ack); - break; - default: - rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial, - rxrpc_rtt_rx_other_ack); - break; - } + if (summary.acked_serial != 0) { + if (summary.ack_reason == RXRPC_ACK_PING_RESPONSE) + rxrpc_complete_rtt_probe(call, skb->tstamp, summary.acked_serial, + ack_serial, rxrpc_rtt_rx_ping_response); + else + summary.rtt_sample_avail = true; } /* If we get an EXCEEDS_WINDOW ACK from the server, it probably @@ -1068,8 +1095,9 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) case RXRPC_ACK_PING: break; default: - if (acked_serial && after(acked_serial, call->acks_highest_serial)) - call->acks_highest_serial = acked_serial; + if (summary.acked_serial && + after(summary.acked_serial, call->acks_highest_serial)) + call->acks_highest_serial = summary.acked_serial; break; } @@ -1098,7 +1126,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_sack_overflow); if (after(hard_ack, call->tx_bottom)) { - if (rxrpc_rotate_tx_window(call, hard_ack, &summary)) { + if (rxrpc_rotate_tx_window(call, hard_ack, &summary, ack_serial)) { rxrpc_end_tx_phase(call, false, rxrpc_eproto_unexpected_ack); goto send_response; } @@ -1116,7 +1144,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_propose_ping(call, ack_serial, rxrpc_propose_ack_ping_for_lost_reply); - rxrpc_congestion_management(call, skb, &summary, acked_serial); + rxrpc_congestion_management(call, &summary); if (summary.need_retransmit) rxrpc_resend(call, ack_serial, summary.ack_reason == RXRPC_ACK_PING_RESPONSE); @@ -1136,7 +1164,7 @@ static void rxrpc_input_ackall(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_ack_summary summary = { 0 }; - if (rxrpc_rotate_tx_window(call, call->tx_top, &summary)) + if (rxrpc_rotate_tx_window(call, call->tx_top, &summary, 0)) rxrpc_end_tx_phase(call, false, rxrpc_eproto_unexpected_ackall); } diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 978c2dc6a7d45..20bf45317264b 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -436,7 +436,7 @@ static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, trace_rxrpc_req_ack(call->debug_id, txb->seq, why); if (why != rxrpc_reqack_no_srv_last) { flags |= RXRPC_REQUEST_ACK; - rxrpc_begin_rtt_probe(call, serial, req->now, rxrpc_rtt_tx_data); + trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_data, -1, serial); call->peer->rtt_last_req = req->now; } dont_set_request_ack: @@ -508,6 +508,10 @@ static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_se _debug("prep[%u] tq=%x q=%x", i, tq->qbase, seq); tq->segment_xmit_ts[ix] = xmit_ts; + tq->segment_serial[ix] = serial; + if (i + 1 == req->n) + /* Only sample the last subpacket in a jumbo. */ + __set_bit(ix, &tq->rtt_samples); len += rxrpc_prepare_data_subpacket(call, req, txb, serial, i); serial++; seq++; From 7903d4438b3f50b2f44af1ce4560631e0e0a9779 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:53 +0000 Subject: [PATCH 0302/1450] rxrpc: Don't use received skbuff timestamps Don't use received skbuff timestamps, but rather set a timestamp when an ack is processed so that the time taken to get to rxrpc_input_ack() is included in the RTT. The timestamp of the latest ACK received is tracked in call->acks_latest_ts. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-26-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/input.c | 19 ++++++++++--------- net/rxrpc/local_object.c | 3 --- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 41b4fb56f96cd..c682e95e15dca 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1037,14 +1037,6 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_inc_stat(call->rxnet, stat_rx_acks[summary.ack_reason]); prefetch(call->tx_queue); - if (summary.acked_serial != 0) { - if (summary.ack_reason == RXRPC_ACK_PING_RESPONSE) - rxrpc_complete_rtt_probe(call, skb->tstamp, summary.acked_serial, - ack_serial, rxrpc_rtt_rx_ping_response); - else - summary.rtt_sample_avail = true; - } - /* If we get an EXCEEDS_WINDOW ACK from the server, it probably * indicates that the client address changed due to NAT. The server * lost the call because it switched to a different peer. @@ -1087,7 +1079,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) if (nr_acks > 0) skb_condense(skb); - call->acks_latest_ts = skb->tstamp; + call->acks_latest_ts = ktime_get_real(); call->acks_hard_ack = hard_ack; call->acks_prev_seq = prev_pkt; @@ -1108,6 +1100,15 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) if (hard_ack + 1 == 0) return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_zero); + if (summary.acked_serial != 0) { + if (summary.ack_reason == RXRPC_ACK_PING_RESPONSE) + rxrpc_complete_rtt_probe(call, call->acks_latest_ts, + summary.acked_serial, ack_serial, + rxrpc_rtt_rx_ping_response); + else + summary.rtt_sample_avail = true; + } + /* Ignore ACKs unless we are or have just been transmitting. */ switch (__rxrpc_call_state(call)) { case RXRPC_CALL_CLIENT_SEND_REQUEST: diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 2792d23046053..a74a4b43904fb 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -215,9 +215,6 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) /* we want to set the don't fragment bit */ rxrpc_local_dont_fragment(local, true); - - /* We want receive timestamps. */ - sock_enable_timestamps(usk); break; default: From c637bd066841de6d0a204898a62f1d9bb8fa1b7f Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:54 +0000 Subject: [PATCH 0303/1450] rxrpc: Generate rtt_min Generate rtt_min as this is required by RACK-TLP. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-27-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- lib/win_minmax.c | 1 + net/rxrpc/ar-internal.h | 2 ++ net/rxrpc/rtt.c | 20 ++++++++++++++++---- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/lib/win_minmax.c b/lib/win_minmax.c index ec10506834b62..1682e614309c7 100644 --- a/lib/win_minmax.c +++ b/lib/win_minmax.c @@ -97,3 +97,4 @@ u32 minmax_running_min(struct minmax *m, u32 win, u32 t, u32 meas) return minmax_subwin_update(m, win, &val); } +EXPORT_SYMBOL(minmax_running_min); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 297be421639cc..d0d0ab4539095 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -371,6 +371,8 @@ struct rxrpc_peer { spinlock_t rtt_input_lock; /* RTT lock for input routine */ ktime_t rtt_last_req; /* Time of last RTT request */ unsigned int rtt_count; /* Number of samples we've got */ + unsigned int rtt_taken; /* Number of samples taken (wrapping) */ + struct minmax min_rtt; /* Estimated minimum RTT */ u32 srtt_us; /* smoothed round trip time << 3 in usecs */ u32 mdev_us; /* medium deviation */ diff --git a/net/rxrpc/rtt.c b/net/rxrpc/rtt.c index 6dc51486b5a60..8048467f4bee8 100644 --- a/net/rxrpc/rtt.c +++ b/net/rxrpc/rtt.c @@ -127,16 +127,27 @@ static void rxrpc_set_rto(struct rxrpc_peer *peer) peer->rto_us = rxrpc_bound_rto(rto); } -static void rxrpc_ack_update_rtt(struct rxrpc_peer *peer, long rtt_us) +static void rxrpc_update_rtt_min(struct rxrpc_peer *peer, ktime_t resp_time, long rtt_us) +{ + /* Window size 5mins in approx usec (ipv4.sysctl_tcp_min_rtt_wlen) */ + u32 wlen_us = 5ULL * NSEC_PER_SEC / 1024; + + minmax_running_min(&peer->min_rtt, wlen_us, resp_time / 1024, + (u32)rtt_us ? : jiffies_to_usecs(1)); +} + +static void rxrpc_ack_update_rtt(struct rxrpc_peer *peer, ktime_t resp_time, long rtt_us) { if (rtt_us < 0) return; - //rxrpc_update_rtt_min(peer, rtt_us); + /* Update RACK min RTT [RFC8985 6.1 Step 1]. */ + rxrpc_update_rtt_min(peer, resp_time, rtt_us); + rxrpc_rtt_estimator(peer, rtt_us); rxrpc_set_rto(peer); - /* RFC6298: only reset backoff on valid RTT measurement. */ + /* Only reset backoff on valid RTT measurement [RFC6298]. */ peer->backoff = 0; } @@ -157,9 +168,10 @@ void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, return; spin_lock(&peer->rtt_input_lock); - rxrpc_ack_update_rtt(peer, rtt_us); + rxrpc_ack_update_rtt(peer, resp_time, rtt_us); if (peer->rtt_count < 3) peer->rtt_count++; + peer->rtt_taken++; spin_unlock(&peer->rtt_input_lock); trace_rxrpc_rtt_rx(call, why, rtt_slot, send_serial, resp_serial, From 93dfca65a1df42a3c8b1094299dc42ab8f18e5c8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:55 +0000 Subject: [PATCH 0304/1450] rxrpc: Adjust the rxrpc_rtt_rx tracepoint Adjust the rxrpc_rtt_rx tracepoint in the following ways: (1) Display the collected RTT sample in the rxrpc_rtt_rx trace. (2) Move the division of srtt by 8 to the TP_printk() rather doing it before invoking the trace point. (3) Display the min_rtt value. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 14 ++++++++++---- net/rxrpc/input.c | 4 ++-- net/rxrpc/rtt.c | 2 +- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 798bea0853c4c..6e929f4448acf 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -1415,9 +1415,9 @@ TRACE_EVENT(rxrpc_rtt_rx, TP_PROTO(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, int slot, rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, - u32 rtt, u32 rto), + u32 rtt, u32 srtt, u32 rto), - TP_ARGS(call, why, slot, send_serial, resp_serial, rtt, rto), + TP_ARGS(call, why, slot, send_serial, resp_serial, rtt, srtt, rto), TP_STRUCT__entry( __field(unsigned int, call) @@ -1426,7 +1426,9 @@ TRACE_EVENT(rxrpc_rtt_rx, __field(rxrpc_serial_t, send_serial) __field(rxrpc_serial_t, resp_serial) __field(u32, rtt) + __field(u32, srtt) __field(u32, rto) + __field(u32, min_rtt) ), TP_fast_assign( @@ -1436,17 +1438,21 @@ TRACE_EVENT(rxrpc_rtt_rx, __entry->send_serial = send_serial; __entry->resp_serial = resp_serial; __entry->rtt = rtt; + __entry->srtt = srtt; __entry->rto = rto; + __entry->min_rtt = minmax_get(&call->peer->min_rtt) ), - TP_printk("c=%08x [%d] %s sr=%08x rr=%08x rtt=%u rto=%u", + TP_printk("c=%08x [%d] %s sr=%08x rr=%08x rtt=%u srtt=%u rto=%u min=%u", __entry->call, __entry->slot, __print_symbolic(__entry->why, rxrpc_rtt_rx_traces), __entry->send_serial, __entry->resp_serial, __entry->rtt, - __entry->rto) + __entry->srtt / 8, + __entry->rto, + __entry->min_rtt) ); TRACE_EVENT(rxrpc_timer_set, diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index c682e95e15dca..1eb9c22aba516 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -740,7 +740,7 @@ static void rxrpc_complete_rtt_probe(struct rxrpc_call *call, */ if (after(acked_serial, orig_serial)) { trace_rxrpc_rtt_rx(call, rxrpc_rtt_rx_obsolete, i, - orig_serial, acked_serial, 0, 0); + orig_serial, acked_serial, 0, 0, 0); clear_bit(i + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail); smp_wmb(); set_bit(i, &call->rtt_avail); @@ -748,7 +748,7 @@ static void rxrpc_complete_rtt_probe(struct rxrpc_call *call, } if (!matched) - trace_rxrpc_rtt_rx(call, rxrpc_rtt_rx_lost, 9, 0, acked_serial, 0, 0); + trace_rxrpc_rtt_rx(call, rxrpc_rtt_rx_lost, 9, 0, acked_serial, 0, 0, 0); } /* diff --git a/net/rxrpc/rtt.c b/net/rxrpc/rtt.c index 8048467f4bee8..e0b7d99854b41 100644 --- a/net/rxrpc/rtt.c +++ b/net/rxrpc/rtt.c @@ -175,7 +175,7 @@ void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, spin_unlock(&peer->rtt_input_lock); trace_rxrpc_rtt_rx(call, why, rtt_slot, send_serial, resp_serial, - peer->srtt_us >> 3, peer->rto_us); + rtt_us, peer->srtt_us, peer->rto_us); } /* From a3d7f46d983fb2ed528b9cceb457c067fe4277a2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:56 +0000 Subject: [PATCH 0305/1450] rxrpc: Display userStatus in rxrpc_rx_ack trace Display the userStatus field from the Rx packet header in the rxrpc_rx_ack trace line. This is used for flow control purposes by FS.StoreData-type kafs RPC calls. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 6e929f4448acf..7681c67f7d65f 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -1031,11 +1031,13 @@ TRACE_EVENT(rxrpc_rx_ack, __field(rxrpc_seq_t, prev) __field(u8, reason) __field(u8, n_acks) + __field(u8, user_status) ), TP_fast_assign( __entry->call = call->debug_id; __entry->serial = sp->hdr.serial; + __entry->user_status = sp->hdr.userStatus; __entry->ack_serial = sp->ack.acked_serial; __entry->first = sp->ack.first_ack; __entry->prev = sp->ack.prev_ack; @@ -1043,11 +1045,12 @@ TRACE_EVENT(rxrpc_rx_ack, __entry->n_acks = sp->ack.nr_acks; ), - TP_printk("c=%08x %08x %s r=%08x f=%08x p=%08x n=%u", + TP_printk("c=%08x %08x %s r=%08x us=%02x f=%08x p=%08x n=%u", __entry->call, __entry->serial, __print_symbolic(__entry->reason, rxrpc_ack_names), __entry->ack_serial, + __entry->user_status, __entry->first, __entry->prev, __entry->n_acks) From 5c0ceba23bb47085d6c9c53bff08a29634ee4e7e Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:57 +0000 Subject: [PATCH 0306/1450] rxrpc: Fix the calculation and use of RTO Make the following changes to the calculation and use of RTO: (1) Fix rxrpc_resend() to use the backed-off RTO value obtained by calling rxrpc_get_rto_backoff() rather than extracting the value itself. Without this, it may retransmit packets too early. (2) The RTO value being similar to the RTT causes a lot of extraneous resends because the RTT doesn't end up taking account of clearing out of the receive queue on the server. Worse, responses to PING-ACKs are made as fast as possible and so are less than the DATA-requested-ACK RTT and so skew the RTT down. Fix this by putting a lower bound on the RTO by adding 100ms to it and limiting the lower end to 200ms. Fixes: c410bf01933e ("rxrpc: Fix the excessive initial retransmission timeout") Fixes: 37473e416234 ("rxrpc: Clean up the resend algorithm") Signed-off-by: David Howells Suggested-by: Simon Wilkinson cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Jakub Kicinski --- net/rxrpc/call_event.c | 3 ++- net/rxrpc/rtt.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index f71773b18e221..4390c97e3ba60 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -103,7 +103,8 @@ void rxrpc_resend(struct rxrpc_call *call, rxrpc_serial_t ack_serial, bool ping_ .now = ktime_get_real(), }; struct rxrpc_txqueue *tq = call->tx_queue; - ktime_t lowest_xmit_ts = KTIME_MAX, rto = ns_to_ktime(call->peer->rto_us * NSEC_PER_USEC); + ktime_t lowest_xmit_ts = KTIME_MAX; + ktime_t rto = rxrpc_get_rto_backoff(call->peer, false); bool unacked = false; _enter("{%d,%d}", call->tx_bottom, call->tx_top); diff --git a/net/rxrpc/rtt.c b/net/rxrpc/rtt.c index e0b7d99854b41..3f1ec8e420a65 100644 --- a/net/rxrpc/rtt.c +++ b/net/rxrpc/rtt.c @@ -27,7 +27,7 @@ static u32 __rxrpc_set_rto(const struct rxrpc_peer *peer) static u32 rxrpc_bound_rto(u32 rto) { - return umin(rto, RXRPC_RTO_MAX); + return clamp(200000, rto + 100000, RXRPC_RTO_MAX); } /* From 0130eff911b13e0ad5fc2eebd44833cacd5a8b0b Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:58 +0000 Subject: [PATCH 0307/1450] rxrpc: Fix initial resend timeout The constant for the initial resend timeout is in milliseconds, but the variable it's assigned to is in microseconds. Fix the constant to be in microseconds. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Jakub Kicinski --- net/rxrpc/rtt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rxrpc/rtt.c b/net/rxrpc/rtt.c index 3f1ec8e420a65..aff75e168de86 100644 --- a/net/rxrpc/rtt.c +++ b/net/rxrpc/rtt.c @@ -12,7 +12,7 @@ #include "ar-internal.h" #define RXRPC_RTO_MAX (120 * USEC_PER_SEC) -#define RXRPC_TIMEOUT_INIT ((unsigned int)(1 * MSEC_PER_SEC)) /* RFC6298 2.1 initial RTO value */ +#define RXRPC_TIMEOUT_INIT ((unsigned int)(1 * USEC_PER_SEC)) /* RFC6298 2.1 initial RTO value */ #define rxrpc_jiffies32 ((u32)jiffies) /* As rxrpc_jiffies32 */ static u32 rxrpc_rto_min_us(struct rxrpc_peer *peer) From fe24a5494390d22ff645fd201d2bf1669fa3aab1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:59 +0000 Subject: [PATCH 0308/1450] rxrpc: Send jumbo DATA packets Send jumbo DATA packets if the path-MTU probing using padded PING ACK packets shows up sufficient capacity to do so. This allows larger chunks of data to be sent without reducing the retryability as the subpackets in a jumbo packet can also be retransmitted individually. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Jakub Kicinski --- net/rxrpc/ar-internal.h | 1 + net/rxrpc/call_event.c | 2 +- net/rxrpc/call_object.c | 1 + net/rxrpc/input.c | 3 +++ 4 files changed, 6 insertions(+), 1 deletion(-) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index d0d0ab4539095..1307749a1a740 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -710,6 +710,7 @@ struct rxrpc_call { u16 tx_backoff; /* Delay to insert due to Tx failure (ms) */ u8 tx_winsize; /* Maximum size of Tx window */ #define RXRPC_TX_MAX_WINDOW 128 + u8 tx_jumbo_max; /* Maximum subpkts peer will accept */ ktime_t tx_last_sent; /* Last time a transmission occurred */ /* Received data tracking */ diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 4390c97e3ba60..39772459426ba 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -288,7 +288,7 @@ static void rxrpc_transmit_fresh_data(struct rxrpc_call *call) struct rxrpc_txqueue *tq; struct rxrpc_txbuf *txb; rxrpc_seq_t send_top, seq; - int limit = min(space, 1); + int limit = min(space, max(call->peer->pmtud_jumbo, 1)); /* Order send_top before the contents of the new txbufs and * txqueue pointers diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index bba058055c97f..e0644e9a8d218 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -155,6 +155,7 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, refcount_set(&call->ref, 1); call->debug_id = debug_id; call->tx_total_len = -1; + call->tx_jumbo_max = 1; call->next_rx_timo = 20 * HZ; call->next_req_timo = 1 * HZ; call->ackr_window = 1; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 1eb9c22aba516..a7a249872a541 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -796,8 +796,11 @@ static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb peer->ackr_adv_pmtud = true; } else { peer->ackr_adv_pmtud = false; + capacity = clamp(capacity, 1, jumbo_max); } + call->tx_jumbo_max = capacity; + if (wake) wake_up(&call->waitq); } From 08d55d7cf3f33c730ce2694393efe16b7983a9c8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:47:00 +0000 Subject: [PATCH 0309/1450] rxrpc: Don't allocate a txbuf for an ACK transmission Don't allocate an rxrpc_txbuf struct for an ACK transmission. There's now no need as the memory to hold the ACK content is allocated with a page frag allocator. The allocation and freeing of a txbuf is just unnecessary overhead. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 2 - net/rxrpc/ar-internal.h | 5 +- net/rxrpc/output.c | 210 ++++++++++++++++++++++------------- net/rxrpc/txbuf.c | 76 ------------- 4 files changed, 131 insertions(+), 162 deletions(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 7681c67f7d65f..326a4c257aeae 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -462,13 +462,11 @@ /* ---- Must update size of stat_why_req_ack[] if more are added! */ #define rxrpc_txbuf_traces \ - EM(rxrpc_txbuf_alloc_ack, "ALLOC ACK ") \ EM(rxrpc_txbuf_alloc_data, "ALLOC DATA ") \ EM(rxrpc_txbuf_free, "FREE ") \ EM(rxrpc_txbuf_get_buffer, "GET BUFFER ") \ EM(rxrpc_txbuf_get_trans, "GET TRANS ") \ EM(rxrpc_txbuf_get_retrans, "GET RETRANS") \ - EM(rxrpc_txbuf_put_ack_tx, "PUT ACK TX ") \ EM(rxrpc_txbuf_put_cleaned, "PUT CLEANED") \ EM(rxrpc_txbuf_put_nomem, "PUT NOMEM ") \ EM(rxrpc_txbuf_put_rotated, "PUT ROTATED") \ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 1307749a1a740..db93d7f78902f 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -834,11 +834,9 @@ struct rxrpc_txbuf { #define RXRPC_TXBUF_WIRE_FLAGS 0xff /* The wire protocol flags */ #define RXRPC_TXBUF_RESENT 0x100 /* Set if has been resent */ __be16 cksum; /* Checksum to go in header */ - unsigned short ack_rwind; /* ACK receive window */ - u8 /*enum rxrpc_propose_ack_trace*/ ack_why; /* If ack, why */ bool jumboable; /* Can be non-terminal jumbo subpacket */ u8 nr_kvec; /* Amount of kvec[] used */ - struct kvec kvec[3]; + struct kvec kvec[1]; }; static inline bool rxrpc_sending_to_server(const struct rxrpc_txbuf *txb) @@ -1364,7 +1362,6 @@ static inline void rxrpc_sysctl_exit(void) {} extern atomic_t rxrpc_nr_txbuf; struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_size, size_t data_align, gfp_t gfp); -struct rxrpc_txbuf *rxrpc_alloc_ack_txbuf(struct rxrpc_call *call, size_t sack_size); void rxrpc_get_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what); void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what); void rxrpc_put_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 20bf45317264b..a7de8a02f4199 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -71,22 +71,97 @@ static void rxrpc_set_keepalive(struct rxrpc_call *call, ktime_t now) trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_keepalive); } +/* + * Allocate transmission buffers for an ACK and attach them to local->kv[]. + */ +static int rxrpc_alloc_ack(struct rxrpc_call *call, size_t sack_size) +{ + struct rxrpc_wire_header *whdr; + struct rxrpc_acktrailer *trailer; + struct rxrpc_ackpacket *ack; + struct kvec *kv = call->local->kvec; + gfp_t gfp = rcu_read_lock_held() ? GFP_ATOMIC | __GFP_NOWARN : GFP_NOFS; + void *buf, *buf2 = NULL; + u8 *filler; + + buf = page_frag_alloc(&call->local->tx_alloc, + sizeof(*whdr) + sizeof(*ack) + 1 + 3 + sizeof(*trailer), gfp); + if (!buf) + return -ENOMEM; + + if (sack_size) { + buf2 = page_frag_alloc(&call->local->tx_alloc, sack_size, gfp); + if (!buf2) { + page_frag_free(buf); + return -ENOMEM; + } + } + + whdr = buf; + ack = buf + sizeof(*whdr); + filler = buf + sizeof(*whdr) + sizeof(*ack) + 1; + trailer = buf + sizeof(*whdr) + sizeof(*ack) + 1 + 3; + + kv[0].iov_base = whdr; + kv[0].iov_len = sizeof(*whdr) + sizeof(*ack); + kv[1].iov_base = buf2; + kv[1].iov_len = sack_size; + kv[2].iov_base = filler; + kv[2].iov_len = 3 + sizeof(*trailer); + return 3; /* Number of kvec[] used. */ +} + +static void rxrpc_free_ack(struct rxrpc_call *call) +{ + page_frag_free(call->local->kvec[0].iov_base); + if (call->local->kvec[1].iov_base) + page_frag_free(call->local->kvec[1].iov_base); +} + +/* + * Record the beginning of an RTT probe. + */ +static void rxrpc_begin_rtt_probe(struct rxrpc_call *call, rxrpc_serial_t serial, + ktime_t now, enum rxrpc_rtt_tx_trace why) +{ + unsigned long avail = call->rtt_avail; + int rtt_slot = 9; + + if (!(avail & RXRPC_CALL_RTT_AVAIL_MASK)) + goto no_slot; + + rtt_slot = __ffs(avail & RXRPC_CALL_RTT_AVAIL_MASK); + if (!test_and_clear_bit(rtt_slot, &call->rtt_avail)) + goto no_slot; + + call->rtt_serial[rtt_slot] = serial; + call->rtt_sent_at[rtt_slot] = now; + smp_wmb(); /* Write data before avail bit */ + set_bit(rtt_slot + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail); + + trace_rxrpc_rtt_tx(call, why, rtt_slot, serial); + return; + +no_slot: + trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_no_slot, rtt_slot, serial); +} + /* * Fill out an ACK packet. */ -static void rxrpc_fill_out_ack(struct rxrpc_call *call, - struct rxrpc_txbuf *txb, - u8 ack_reason, - rxrpc_serial_t serial) +static int rxrpc_fill_out_ack(struct rxrpc_call *call, int nr_kv, u8 ack_reason, + rxrpc_serial_t serial_to_ack, rxrpc_serial_t *_ack_serial) { - struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; - struct rxrpc_acktrailer *trailer = txb->kvec[2].iov_base + 3; + struct kvec *kv = call->local->kvec; + struct rxrpc_wire_header *whdr = kv[0].iov_base; + struct rxrpc_acktrailer *trailer = kv[2].iov_base + 3; struct rxrpc_ackpacket *ack = (struct rxrpc_ackpacket *)(whdr + 1); unsigned int qsize, sack, wrap, to, max_mtu, if_mtu; rxrpc_seq_t window, wtop; + ktime_t now = ktime_get_real(); int rsize; - u8 *filler = txb->kvec[2].iov_base; - u8 *sackp = txb->kvec[1].iov_base; + u8 *filler = kv[2].iov_base; + u8 *sackp = kv[1].iov_base; rxrpc_inc_stat(call->rxnet, stat_tx_ack_fill); @@ -94,14 +169,25 @@ static void rxrpc_fill_out_ack(struct rxrpc_call *call, wtop = call->ackr_wtop; sack = call->ackr_sack_base % RXRPC_SACK_SIZE; + *_ack_serial = rxrpc_get_next_serial(call->conn); + + whdr->epoch = htonl(call->conn->proto.epoch); + whdr->cid = htonl(call->cid); + whdr->callNumber = htonl(call->call_id); + whdr->serial = htonl(*_ack_serial); whdr->seq = 0; whdr->type = RXRPC_PACKET_TYPE_ACK; - txb->flags |= RXRPC_SLOW_START_OK; + whdr->flags = call->conn->out_clientflag | RXRPC_SLOW_START_OK; + whdr->userStatus = 0; + whdr->securityIndex = call->security_ix; + whdr->_rsvd = 0; + whdr->serviceId = htons(call->dest_srx.srx_service); + ack->bufferSpace = 0; ack->maxSkew = 0; ack->firstPacket = htonl(window); ack->previousPacket = htonl(call->rx_highest_seq); - ack->serial = htonl(serial); + ack->serial = htonl(serial_to_ack); ack->reason = ack_reason; ack->nAcks = wtop - window; filler[0] = 0; @@ -109,12 +195,10 @@ static void rxrpc_fill_out_ack(struct rxrpc_call *call, filler[2] = 0; if (ack_reason == RXRPC_ACK_PING) - txb->flags |= RXRPC_REQUEST_ACK; + whdr->flags |= RXRPC_REQUEST_ACK; if (after(wtop, window)) { - txb->len += ack->nAcks; - txb->kvec[1].iov_base = sackp; - txb->kvec[1].iov_len = ack->nAcks; + kv[1].iov_len = ack->nAcks; wrap = RXRPC_SACK_SIZE - sack; to = umin(ack->nAcks, RXRPC_SACK_SIZE); @@ -133,7 +217,6 @@ static void rxrpc_fill_out_ack(struct rxrpc_call *call, qsize = (window - 1) - call->rx_consumed; rsize = max_t(int, call->rx_winsize - qsize, 0); - txb->ack_rwind = rsize; if_mtu = call->peer->if_mtu - call->peer->hdrsize; if (call->peer->ackr_adv_pmtud) { @@ -147,48 +230,27 @@ static void rxrpc_fill_out_ack(struct rxrpc_call *call, trailer->ifMTU = htonl(if_mtu); trailer->rwind = htonl(rsize); trailer->jumbo_max = 0; /* Advertise pmtu discovery */ -} - -/* - * Record the beginning of an RTT probe. - */ -static void rxrpc_begin_rtt_probe(struct rxrpc_call *call, rxrpc_serial_t serial, - ktime_t now, enum rxrpc_rtt_tx_trace why) -{ - unsigned long avail = call->rtt_avail; - int rtt_slot = 9; - if (!(avail & RXRPC_CALL_RTT_AVAIL_MASK)) - goto no_slot; - - rtt_slot = __ffs(avail & RXRPC_CALL_RTT_AVAIL_MASK); - if (!test_and_clear_bit(rtt_slot, &call->rtt_avail)) - goto no_slot; - - call->rtt_serial[rtt_slot] = serial; - call->rtt_sent_at[rtt_slot] = now; - smp_wmb(); /* Write data before avail bit */ - set_bit(rtt_slot + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail); - - trace_rxrpc_rtt_tx(call, why, rtt_slot, serial); - return; - -no_slot: - trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_no_slot, rtt_slot, serial); + if (ack_reason == RXRPC_ACK_PING) + rxrpc_begin_rtt_probe(call, *_ack_serial, now, rxrpc_rtt_tx_ping); + if (whdr->flags & RXRPC_REQUEST_ACK) + call->peer->rtt_last_req = now; + rxrpc_set_keepalive(call, now); + return nr_kv; } /* * Transmit an ACK packet. */ -static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb, - int nr_kv, enum rxrpc_propose_ack_trace why) +static void rxrpc_send_ack_packet(struct rxrpc_call *call, int nr_kv, size_t len, + rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why) { struct kvec *kv = call->local->kvec; struct rxrpc_wire_header *whdr = kv[0].iov_base; + struct rxrpc_acktrailer *trailer = kv[2].iov_base + 3; struct rxrpc_connection *conn; struct rxrpc_ackpacket *ack = (struct rxrpc_ackpacket *)(whdr + 1); struct msghdr msg; - ktime_t now; int ret; if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) @@ -202,41 +264,31 @@ static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t msg.msg_controllen = 0; msg.msg_flags = MSG_SPLICE_PAGES; - whdr->flags = txb->flags & RXRPC_TXBUF_WIRE_FLAGS; - - txb->serial = rxrpc_get_next_serial(conn); - whdr->serial = htonl(txb->serial); - trace_rxrpc_tx_ack(call->debug_id, txb->serial, + trace_rxrpc_tx_ack(call->debug_id, serial, ntohl(ack->firstPacket), ntohl(ack->serial), ack->reason, ack->nAcks, - txb->ack_rwind); + ntohl(trailer->rwind)); rxrpc_inc_stat(call->rxnet, stat_tx_ack_send); - iov_iter_kvec(&msg.msg_iter, WRITE, kv, nr_kv, txb->len); + iov_iter_kvec(&msg.msg_iter, WRITE, kv, nr_kv, len); rxrpc_local_dont_fragment(conn->local, why == rxrpc_propose_ack_ping_for_mtu_probe); - ret = do_udp_sendmsg(conn->local->socket, &msg, txb->len); + ret = do_udp_sendmsg(conn->local->socket, &msg, len); call->peer->last_tx_at = ktime_get_seconds(); if (ret < 0) { - trace_rxrpc_tx_fail(call->debug_id, txb->serial, ret, + trace_rxrpc_tx_fail(call->debug_id, serial, ret, rxrpc_tx_point_call_ack); if (why == rxrpc_propose_ack_ping_for_mtu_probe && ret == -EMSGSIZE) - rxrpc_input_probe_for_pmtud(conn, txb->serial, true); + rxrpc_input_probe_for_pmtud(conn, serial, true); } else { trace_rxrpc_tx_packet(call->debug_id, whdr, rxrpc_tx_point_call_ack); - now = ktime_get_real(); - if (ack->reason == RXRPC_ACK_PING) - rxrpc_begin_rtt_probe(call, txb->serial, now, rxrpc_rtt_tx_ping); - if (txb->flags & RXRPC_REQUEST_ACK) - call->peer->rtt_last_req = now; - rxrpc_set_keepalive(call, now); if (why == rxrpc_propose_ack_ping_for_mtu_probe) { call->peer->pmtud_pending = false; call->peer->pmtud_probing = true; - call->conn->pmtud_probe = txb->serial; + call->conn->pmtud_probe = serial; call->conn->pmtud_call = call->debug_id; trace_rxrpc_pmtud_tx(call); } @@ -248,10 +300,11 @@ static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t * Queue an ACK for immediate transmission. */ void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, - rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why) + rxrpc_serial_t serial_to_ack, enum rxrpc_propose_ack_trace why) { - struct rxrpc_txbuf *txb; struct kvec *kv = call->local->kvec; + rxrpc_serial_t ack_serial; + size_t len; int nr_kv; if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) @@ -259,32 +312,29 @@ void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, rxrpc_inc_stat(call->rxnet, stat_tx_acks[ack_reason]); - txb = rxrpc_alloc_ack_txbuf(call, call->ackr_wtop - call->ackr_window); - if (!txb) { + nr_kv = rxrpc_alloc_ack(call, call->ackr_wtop - call->ackr_window); + if (nr_kv < 0) { kleave(" = -ENOMEM"); return; } - txb->ack_why = why; - - rxrpc_fill_out_ack(call, txb, ack_reason, serial); + nr_kv = rxrpc_fill_out_ack(call, nr_kv, ack_reason, serial_to_ack, &ack_serial); + len = kv[0].iov_len; + len += kv[1].iov_len; + len += kv[2].iov_len; /* Extend a path MTU probe ACK. */ - nr_kv = txb->nr_kvec; - kv[0] = txb->kvec[0]; - kv[1] = txb->kvec[1]; - kv[2] = txb->kvec[2]; if (why == rxrpc_propose_ack_ping_for_mtu_probe) { size_t probe_mtu = call->peer->pmtud_trial + sizeof(struct rxrpc_wire_header); - if (txb->len > probe_mtu) + if (len > probe_mtu) goto skip; - while (txb->len < probe_mtu) { - size_t part = umin(probe_mtu - txb->len, PAGE_SIZE); + while (len < probe_mtu) { + size_t part = umin(probe_mtu - len, PAGE_SIZE); kv[nr_kv].iov_base = page_address(ZERO_PAGE(0)); kv[nr_kv].iov_len = part; - txb->len += part; + len += part; nr_kv++; } } @@ -293,10 +343,10 @@ void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, atomic_set(&call->ackr_nr_consumed, 0); clear_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags); - trace_rxrpc_send_ack(call, why, ack_reason, serial); - rxrpc_send_ack_packet(call, txb, nr_kv, why); + trace_rxrpc_send_ack(call, why, ack_reason, ack_serial); + rxrpc_send_ack_packet(call, nr_kv, len, ack_serial, why); skip: - rxrpc_put_txbuf(txb, rxrpc_txbuf_put_ack_tx); + rxrpc_free_ack(call); } /* diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c index 067223c8c35f2..131d9e55c8e97 100644 --- a/net/rxrpc/txbuf.c +++ b/net/rxrpc/txbuf.c @@ -73,82 +73,6 @@ struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_ return txb; } -/* - * Allocate and partially initialise an ACK packet. - */ -struct rxrpc_txbuf *rxrpc_alloc_ack_txbuf(struct rxrpc_call *call, size_t sack_size) -{ - struct rxrpc_wire_header *whdr; - struct rxrpc_acktrailer *trailer; - struct rxrpc_ackpacket *ack; - struct rxrpc_txbuf *txb; - gfp_t gfp = rcu_read_lock_held() ? GFP_ATOMIC | __GFP_NOWARN : GFP_NOFS; - void *buf, *buf2 = NULL; - u8 *filler; - - txb = kmalloc(sizeof(*txb), gfp); - if (!txb) - return NULL; - - buf = page_frag_alloc(&call->local->tx_alloc, - sizeof(*whdr) + sizeof(*ack) + 1 + 3 + sizeof(*trailer), gfp); - if (!buf) { - kfree(txb); - return NULL; - } - - if (sack_size) { - buf2 = page_frag_alloc(&call->local->tx_alloc, sack_size, gfp); - if (!buf2) { - page_frag_free(buf); - kfree(txb); - return NULL; - } - } - - whdr = buf; - ack = buf + sizeof(*whdr); - filler = buf + sizeof(*whdr) + sizeof(*ack) + 1; - trailer = buf + sizeof(*whdr) + sizeof(*ack) + 1 + 3; - - refcount_set(&txb->ref, 1); - txb->call_debug_id = call->debug_id; - txb->debug_id = atomic_inc_return(&rxrpc_txbuf_debug_ids); - txb->space = 0; - txb->len = sizeof(*whdr) + sizeof(*ack) + 3 + sizeof(*trailer); - txb->offset = 0; - txb->flags = call->conn->out_clientflag; - txb->ack_rwind = 0; - txb->seq = 0; - txb->serial = 0; - txb->cksum = 0; - txb->nr_kvec = 3; - txb->kvec[0].iov_base = whdr; - txb->kvec[0].iov_len = sizeof(*whdr) + sizeof(*ack); - txb->kvec[1].iov_base = buf2; - txb->kvec[1].iov_len = sack_size; - txb->kvec[2].iov_base = filler; - txb->kvec[2].iov_len = 3 + sizeof(*trailer); - - whdr->epoch = htonl(call->conn->proto.epoch); - whdr->cid = htonl(call->cid); - whdr->callNumber = htonl(call->call_id); - whdr->seq = 0; - whdr->type = RXRPC_PACKET_TYPE_ACK; - whdr->flags = 0; - whdr->userStatus = 0; - whdr->securityIndex = call->security_ix; - whdr->_rsvd = 0; - whdr->serviceId = htons(call->dest_srx.srx_service); - - get_page(virt_to_head_page(trailer)); - - trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 1, - rxrpc_txbuf_alloc_ack); - atomic_inc(&rxrpc_nr_txbuf); - return txb; -} - void rxrpc_get_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what) { int r; From a2ea9a9072607c2fd6442bd1ffb4dbdbf882aed7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:47:01 +0000 Subject: [PATCH 0310/1450] rxrpc: Use irq-disabling spinlocks between app and I/O thread Where a spinlock is used by both the application thread and the I/O thread, use irq-disabling locking so that an interrupt taken on the app thread doesn't also slow down the I/O thread. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Jakub Kicinski --- net/rxrpc/af_rxrpc.c | 4 ++-- net/rxrpc/ar-internal.h | 1 - net/rxrpc/call_accept.c | 20 ++++++++++---------- net/rxrpc/call_object.c | 15 +++++++-------- net/rxrpc/conn_client.c | 12 ++++++------ net/rxrpc/conn_event.c | 8 ++++---- net/rxrpc/conn_object.c | 8 ++++---- net/rxrpc/input.c | 5 +---- net/rxrpc/io_thread.c | 8 ++++---- net/rxrpc/peer_event.c | 8 ++++---- net/rxrpc/peer_object.c | 1 + net/rxrpc/recvmsg.c | 18 +++++++++--------- net/rxrpc/security.c | 4 ++-- net/rxrpc/sendmsg.c | 2 -- 14 files changed, 54 insertions(+), 60 deletions(-) diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 9d8bd0b37e41d..86873399f7d50 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -408,9 +408,9 @@ void rxrpc_kernel_shutdown_call(struct socket *sock, struct rxrpc_call *call) /* Make sure we're not going to call back into a kernel service */ if (call->notify_rx) { - spin_lock(&call->notify_lock); + spin_lock_irq(&call->notify_lock); call->notify_rx = rxrpc_dummy_notify_rx; - spin_unlock(&call->notify_lock); + spin_unlock_irq(&call->notify_lock); } } mutex_unlock(&call->user_mutex); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index db93d7f78902f..ffd80dc88f40f 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -700,7 +700,6 @@ struct rxrpc_call { struct rxrpc_txqueue *send_queue; /* Queue that sendmsg is writing into */ /* Transmitted data tracking. */ - spinlock_t tx_lock; /* Transmit queue lock */ struct rxrpc_txqueue *tx_queue; /* Start of transmission buffers */ struct rxrpc_txqueue *tx_qtail; /* End of transmission buffers */ rxrpc_seq_t tx_qbase; /* First slot in tx_queue */ diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index a6776b1604bab..e685034ce4f7c 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -188,8 +188,8 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) /* Make sure that there aren't any incoming calls in progress before we * clear the preallocation buffers. */ - spin_lock(&rx->incoming_lock); - spin_unlock(&rx->incoming_lock); + spin_lock_irq(&rx->incoming_lock); + spin_unlock_irq(&rx->incoming_lock); head = b->peer_backlog_head; tail = b->peer_backlog_tail; @@ -343,7 +343,7 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local, if (sp->hdr.type != RXRPC_PACKET_TYPE_DATA) return rxrpc_protocol_error(skb, rxrpc_eproto_no_service_call); - read_lock(&local->services_lock); + read_lock_irq(&local->services_lock); /* Weed out packets to services we're not offering. Packets that would * begin a call are explicitly rejected and the rest are just @@ -399,12 +399,12 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local, spin_unlock(&conn->state_lock); spin_unlock(&rx->incoming_lock); - read_unlock(&local->services_lock); + read_unlock_irq(&local->services_lock); if (hlist_unhashed(&call->error_link)) { - spin_lock(&call->peer->lock); + spin_lock_irq(&call->peer->lock); hlist_add_head(&call->error_link, &call->peer->error_targets); - spin_unlock(&call->peer->lock); + spin_unlock_irq(&call->peer->lock); } _leave(" = %p{%d}", call, call->debug_id); @@ -413,20 +413,20 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local, return true; unsupported_service: - read_unlock(&local->services_lock); + read_unlock_irq(&local->services_lock); return rxrpc_direct_abort(skb, rxrpc_abort_service_not_offered, RX_INVALID_OPERATION, -EOPNOTSUPP); unsupported_security: - read_unlock(&local->services_lock); + read_unlock_irq(&local->services_lock); return rxrpc_direct_abort(skb, rxrpc_abort_service_not_offered, RX_INVALID_OPERATION, -EKEYREJECTED); no_call: spin_unlock(&rx->incoming_lock); - read_unlock(&local->services_lock); + read_unlock_irq(&local->services_lock); _leave(" = f [%u]", skb->mark); return false; discard: - read_unlock(&local->services_lock); + read_unlock_irq(&local->services_lock); return true; } diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index e0644e9a8d218..75cd0b06e14cb 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -49,7 +49,7 @@ void rxrpc_poke_call(struct rxrpc_call *call, enum rxrpc_call_poke_trace what) bool busy; if (!test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) { - spin_lock_bh(&local->lock); + spin_lock_irq(&local->lock); busy = !list_empty(&call->attend_link); trace_rxrpc_poke_call(call, busy, what); if (!busy && !rxrpc_try_get_call(call, rxrpc_call_get_poke)) @@ -57,7 +57,7 @@ void rxrpc_poke_call(struct rxrpc_call *call, enum rxrpc_call_poke_trace what) if (!busy) { list_add_tail(&call->attend_link, &local->call_attend_q); } - spin_unlock_bh(&local->lock); + spin_unlock_irq(&local->lock); if (!busy) rxrpc_wake_up_io_thread(local); } @@ -151,7 +151,6 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, skb_queue_head_init(&call->rx_oos_queue); init_waitqueue_head(&call->waitq); spin_lock_init(&call->notify_lock); - spin_lock_init(&call->tx_lock); refcount_set(&call->ref, 1); call->debug_id = debug_id; call->tx_total_len = -1; @@ -302,9 +301,9 @@ static int rxrpc_connect_call(struct rxrpc_call *call, gfp_t gfp) trace_rxrpc_client(NULL, -1, rxrpc_client_queue_new_call); rxrpc_get_call(call, rxrpc_call_get_io_thread); - spin_lock(&local->client_call_lock); + spin_lock_irq(&local->client_call_lock); list_add_tail(&call->wait_link, &local->new_client_calls); - spin_unlock(&local->client_call_lock); + spin_unlock_irq(&local->client_call_lock); rxrpc_wake_up_io_thread(local); return 0; @@ -434,7 +433,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, /* * Set up an incoming call. call->conn points to the connection. - * This is called in BH context and isn't allowed to fail. + * This is called with interrupts disabled and isn't allowed to fail. */ void rxrpc_incoming_call(struct rxrpc_sock *rx, struct rxrpc_call *call, @@ -576,7 +575,7 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) rxrpc_put_call_slot(call); /* Make sure we don't get any more notifications */ - spin_lock(&rx->recvmsg_lock); + spin_lock_irq(&rx->recvmsg_lock); if (!list_empty(&call->recvmsg_link)) { _debug("unlinking once-pending call %p { e=%lx f=%lx }", @@ -589,7 +588,7 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) call->recvmsg_link.next = NULL; call->recvmsg_link.prev = NULL; - spin_unlock(&rx->recvmsg_lock); + spin_unlock_irq(&rx->recvmsg_lock); if (put) rxrpc_put_call(call, rxrpc_call_put_unnotify); diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 5f76bd90567c0..db00991978906 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -510,9 +510,9 @@ void rxrpc_connect_client_calls(struct rxrpc_local *local) struct rxrpc_call *call; LIST_HEAD(new_client_calls); - spin_lock(&local->client_call_lock); + spin_lock_irq(&local->client_call_lock); list_splice_tail_init(&local->new_client_calls, &new_client_calls); - spin_unlock(&local->client_call_lock); + spin_unlock_irq(&local->client_call_lock); while ((call = list_first_entry_or_null(&new_client_calls, struct rxrpc_call, wait_link))) { @@ -547,9 +547,9 @@ void rxrpc_expose_client_call(struct rxrpc_call *call) set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); trace_rxrpc_client(conn, channel, rxrpc_client_exposed); - spin_lock(&call->peer->lock); + spin_lock_irq(&call->peer->lock); hlist_add_head(&call->error_link, &call->peer->error_targets); - spin_unlock(&call->peer->lock); + spin_unlock_irq(&call->peer->lock); } } @@ -590,9 +590,9 @@ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call ASSERTCMP(call->call_id, ==, 0); ASSERT(!test_bit(RXRPC_CALL_EXPOSED, &call->flags)); /* May still be on ->new_client_calls. */ - spin_lock(&local->client_call_lock); + spin_lock_irq(&local->client_call_lock); list_del_init(&call->wait_link); - spin_unlock(&local->client_call_lock); + spin_unlock_irq(&local->client_call_lock); return; } diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index f6c02cc44d98e..6b29a294ee07d 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -26,7 +26,7 @@ static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, struct sk_buff bool aborted = false; if (conn->state != RXRPC_CONN_ABORTED) { - spin_lock(&conn->state_lock); + spin_lock_irq(&conn->state_lock); if (conn->state != RXRPC_CONN_ABORTED) { conn->abort_code = abort_code; conn->error = err; @@ -37,7 +37,7 @@ static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, struct sk_buff set_bit(RXRPC_CONN_EV_ABORT_CALLS, &conn->events); aborted = true; } - spin_unlock(&conn->state_lock); + spin_unlock_irq(&conn->state_lock); } return aborted; @@ -261,10 +261,10 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, if (ret < 0) return ret; - spin_lock(&conn->state_lock); + spin_lock_irq(&conn->state_lock); if (conn->state == RXRPC_CONN_SERVICE_CHALLENGING) conn->state = RXRPC_CONN_SERVICE; - spin_unlock(&conn->state_lock); + spin_unlock_irq(&conn->state_lock); if (conn->state == RXRPC_CONN_SERVICE) { /* Offload call state flipping to the I/O thread. As diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index b0627398311b4..7eba4d7d9a380 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -31,13 +31,13 @@ void rxrpc_poke_conn(struct rxrpc_connection *conn, enum rxrpc_conn_trace why) if (WARN_ON_ONCE(!local)) return; - spin_lock_bh(&local->lock); + spin_lock_irq(&local->lock); busy = !list_empty(&conn->attend_link); if (!busy) { rxrpc_get_connection(conn, why); list_add_tail(&conn->attend_link, &local->conn_attend_q); } - spin_unlock_bh(&local->lock); + spin_unlock_irq(&local->lock); rxrpc_wake_up_io_thread(local); } @@ -196,9 +196,9 @@ void rxrpc_disconnect_call(struct rxrpc_call *call) call->peer->cong_ssthresh = call->cong_ssthresh; if (!hlist_unhashed(&call->error_link)) { - spin_lock(&call->peer->lock); + spin_lock_irq(&call->peer->lock); hlist_del_init(&call->error_link); - spin_unlock(&call->peer->lock); + spin_unlock_irq(&call->peer->lock); } if (rxrpc_is_client_call(call)) { diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index a7a249872a541..821e10c030868 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -424,7 +424,7 @@ static void rxrpc_input_queue_data(struct rxrpc_call *call, struct sk_buff *skb, struct rxrpc_skb_priv *sp = rxrpc_skb(skb); bool last = sp->hdr.flags & RXRPC_LAST_PACKET; - __skb_queue_tail(&call->recvmsg_queue, skb); + skb_queue_tail(&call->recvmsg_queue, skb); rxrpc_input_update_ack_window(call, window, wtop); trace_rxrpc_receive(call, last ? why + 1 : why, sp->hdr.serial, sp->hdr.seq); if (last) @@ -501,7 +501,6 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb, rxrpc_get_skb(skb, rxrpc_skb_get_to_recvmsg); - spin_lock(&call->recvmsg_queue.lock); rxrpc_input_queue_data(call, skb, window, wtop, rxrpc_receive_queue); *_notify = true; @@ -523,8 +522,6 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb, rxrpc_receive_queue_oos); } - spin_unlock(&call->recvmsg_queue.lock); - call->ackr_sack_base = sack; } else { unsigned int slot; diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index bc678a299bd87..fbacf2056f643 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -500,9 +500,9 @@ int rxrpc_io_thread(void *data) } /* Deal with connections that want immediate attention. */ - spin_lock_bh(&local->lock); + spin_lock_irq(&local->lock); list_splice_tail_init(&local->conn_attend_q, &conn_attend_q); - spin_unlock_bh(&local->lock); + spin_unlock_irq(&local->lock); while ((conn = list_first_entry_or_null(&conn_attend_q, struct rxrpc_connection, @@ -519,9 +519,9 @@ int rxrpc_io_thread(void *data) rxrpc_discard_expired_client_conns(local); /* Deal with calls that want immediate attention. */ - spin_lock_bh(&local->lock); + spin_lock_irq(&local->lock); list_splice_tail_init(&local->call_attend_q, &call_attend_q); - spin_unlock_bh(&local->lock); + spin_unlock_irq(&local->lock); while ((call = list_first_entry_or_null(&call_attend_q, struct rxrpc_call, diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index ff30e0c055079..d82e44a3901b0 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -213,23 +213,23 @@ static void rxrpc_distribute_error(struct rxrpc_peer *peer, struct sk_buff *skb, struct rxrpc_call *call; HLIST_HEAD(error_targets); - spin_lock(&peer->lock); + spin_lock_irq(&peer->lock); hlist_move_list(&peer->error_targets, &error_targets); while (!hlist_empty(&error_targets)) { call = hlist_entry(error_targets.first, struct rxrpc_call, error_link); hlist_del_init(&call->error_link); - spin_unlock(&peer->lock); + spin_unlock_irq(&peer->lock); rxrpc_see_call(call, rxrpc_call_see_distribute_error); rxrpc_set_call_completion(call, compl, 0, -err); rxrpc_input_call_event(call); - spin_lock(&peer->lock); + spin_lock_irq(&peer->lock); } - spin_unlock(&peer->lock); + spin_unlock_irq(&peer->lock); } /* diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 80ef6f06d5122..27b34ed4d76aa 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -320,6 +320,7 @@ static void rxrpc_free_peer(struct rxrpc_peer *peer) * Set up a new incoming peer. There shouldn't be any other matching peers * since we've already done a search in the list from the non-reentrant context * (the data_ready handler) that is the only place we can add new peers. + * Called with interrupts disabled. */ void rxrpc_new_incoming_peer(struct rxrpc_local *local, struct rxrpc_peer *peer) { diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index a482f88c5fc5b..32cd5f1d541db 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -36,16 +36,16 @@ void rxrpc_notify_socket(struct rxrpc_call *call) sk = &rx->sk; if (rx && sk->sk_state < RXRPC_CLOSE) { if (call->notify_rx) { - spin_lock(&call->notify_lock); + spin_lock_irq(&call->notify_lock); call->notify_rx(sk, call, call->user_call_ID); - spin_unlock(&call->notify_lock); + spin_unlock_irq(&call->notify_lock); } else { - spin_lock(&rx->recvmsg_lock); + spin_lock_irq(&rx->recvmsg_lock); if (list_empty(&call->recvmsg_link)) { rxrpc_get_call(call, rxrpc_call_get_notify_socket); list_add_tail(&call->recvmsg_link, &rx->recvmsg_q); } - spin_unlock(&rx->recvmsg_lock); + spin_unlock_irq(&rx->recvmsg_lock); if (!sock_flag(sk, SOCK_DEAD)) { _debug("call %ps", sk->sk_data_ready); @@ -337,14 +337,14 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, * We also want to weed out calls that got requeued whilst we were * shovelling data out. */ - spin_lock(&rx->recvmsg_lock); + spin_lock_irq(&rx->recvmsg_lock); l = rx->recvmsg_q.next; call = list_entry(l, struct rxrpc_call, recvmsg_link); if (!rxrpc_call_is_complete(call) && skb_queue_empty(&call->recvmsg_queue)) { list_del_init(&call->recvmsg_link); - spin_unlock(&rx->recvmsg_lock); + spin_unlock_irq(&rx->recvmsg_lock); release_sock(&rx->sk); trace_rxrpc_recvmsg(call->debug_id, rxrpc_recvmsg_unqueue, 0); rxrpc_put_call(call, rxrpc_call_put_recvmsg); @@ -355,7 +355,7 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, list_del_init(&call->recvmsg_link); else rxrpc_get_call(call, rxrpc_call_get_recvmsg); - spin_unlock(&rx->recvmsg_lock); + spin_unlock_irq(&rx->recvmsg_lock); call_debug_id = call->debug_id; trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_dequeue, 0); @@ -445,9 +445,9 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, error_requeue_call: if (!(flags & MSG_PEEK)) { - spin_lock(&rx->recvmsg_lock); + spin_lock_irq(&rx->recvmsg_lock); list_add(&call->recvmsg_link, &rx->recvmsg_q); - spin_unlock(&rx->recvmsg_lock); + spin_unlock_irq(&rx->recvmsg_lock); trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_requeue, 0); } else { rxrpc_put_call(call, rxrpc_call_put_recvmsg); diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c index cb8dd1d3b1d49..9784adc8f2759 100644 --- a/net/rxrpc/security.c +++ b/net/rxrpc/security.c @@ -114,10 +114,10 @@ int rxrpc_init_client_conn_security(struct rxrpc_connection *conn) if (conn->state == RXRPC_CONN_CLIENT_UNSECURED) { ret = conn->security->init_connection_security(conn, token); if (ret == 0) { - spin_lock(&conn->state_lock); + spin_lock_irq(&conn->state_lock); if (conn->state == RXRPC_CONN_CLIENT_UNSECURED) conn->state = RXRPC_CONN_CLIENT; - spin_unlock(&conn->state_lock); + spin_unlock_irq(&conn->state_lock); } } mutex_unlock(&conn->security_lock); diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 381b25597f4e8..df501a7c92fa3 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -261,7 +261,6 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, trace_rxrpc_tq(call, sq, seq, rxrpc_tq_queue); /* Add the packet to the call's output buffer */ - spin_lock(&call->tx_lock); poke = (READ_ONCE(call->tx_bottom) == call->send_top); sq->bufs[ix] = txb; /* Order send_top after the queue->next pointer and txb content. */ @@ -270,7 +269,6 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, rxrpc_notify_end_tx(rx, call, notify_end_tx); call->send_queue = NULL; } - spin_unlock(&call->tx_lock); if (poke) rxrpc_poke_call(call, rxrpc_call_poke_start); From 547a9acd4c5e95190c6c93a6d8628c5b8b74a4d6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:47:02 +0000 Subject: [PATCH 0311/1450] rxrpc: Tidy up the ACK parsing a bit Tidy up the ACK parsing in the following ways: (1) Put the serial number of the ACK packet into the rxrpc_ack_summary struct and access it from there whilst parsing an ACK. (2) Be consistent about using "if (summary.acked_serial)" rather than "if (summary.acked_serial != 0)". Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Jakub Kicinski --- net/rxrpc/ar-internal.h | 1 + net/rxrpc/input.c | 55 +++++++++++++++++++---------------------- 2 files changed, 27 insertions(+), 29 deletions(-) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index ffd80dc88f40f..aa240b4b4bec3 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -771,6 +771,7 @@ struct rxrpc_call { * Summary of a new ACK and the changes it made to the Tx buffer packet states. */ struct rxrpc_ack_summary { + rxrpc_serial_t ack_serial; /* Serial number of ACK */ rxrpc_serial_t acked_serial; /* Serial number ACK'd */ u16 in_flight; /* Number of unreceived transmissions */ u16 nr_new_hacks; /* Number of rotated new ACKs */ diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 821e10c030868..036cf440b63b0 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -198,11 +198,10 @@ void rxrpc_congestion_degrade(struct rxrpc_call *call) static void rxrpc_add_data_rtt_sample(struct rxrpc_call *call, struct rxrpc_ack_summary *summary, struct rxrpc_txqueue *tq, - int ix, - rxrpc_serial_t ack_serial) + int ix) { rxrpc_peer_add_rtt(call, rxrpc_rtt_rx_data_ack, -1, - summary->acked_serial, ack_serial, + summary->acked_serial, summary->ack_serial, ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[ix]), call->acks_latest_ts); summary->rtt_sample_avail = false; @@ -213,8 +212,7 @@ static void rxrpc_add_data_rtt_sample(struct rxrpc_call *call, * Apply a hard ACK by advancing the Tx window. */ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, - struct rxrpc_ack_summary *summary, - rxrpc_serial_t ack_serial) + struct rxrpc_ack_summary *summary) { struct rxrpc_txqueue *tq = call->tx_queue; rxrpc_seq_t seq = call->tx_bottom + 1; @@ -255,7 +253,7 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, if (summary->rtt_sample_avail && summary->acked_serial == tq->segment_serial[ix] && test_bit(ix, &tq->rtt_samples)) - rxrpc_add_data_rtt_sample(call, summary, tq, ix, ack_serial); + rxrpc_add_data_rtt_sample(call, summary, tq, ix); if (ix == tq->nr_reported_acks) { /* Packet directly hard ACK'd. */ @@ -369,7 +367,7 @@ static bool rxrpc_receiving_reply(struct rxrpc_call *call) } if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags)) { - if (!rxrpc_rotate_tx_window(call, top, &summary, 0)) { + if (!rxrpc_rotate_tx_window(call, top, &summary)) { rxrpc_proto_abort(call, top, rxrpc_eproto_early_reply); return false; } @@ -826,12 +824,11 @@ static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb */ static void rxrpc_input_soft_rtt(struct rxrpc_call *call, struct rxrpc_ack_summary *summary, - struct rxrpc_txqueue *tq, - rxrpc_serial_t ack_serial) + struct rxrpc_txqueue *tq) { for (int ix = 0; ix < RXRPC_NR_TXQUEUE; ix++) if (summary->acked_serial == tq->segment_serial[ix]) - return rxrpc_add_data_rtt_sample(call, summary, tq, ix, ack_serial); + return rxrpc_add_data_rtt_sample(call, summary, tq, ix); } /* @@ -944,7 +941,7 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, _debug("bound %16lx %u", extracted, nr); if (summary->rtt_sample_avail) - rxrpc_input_soft_rtt(call, summary, tq, sp->hdr.serial); + rxrpc_input_soft_rtt(call, summary, tq); rxrpc_input_soft_ack_tq(call, summary, tq, extracted, RXRPC_NR_TXQUEUE, seq - RXRPC_NR_TXQUEUE, &lowest_nak); extracted = ~0UL; @@ -1016,7 +1013,6 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) struct rxrpc_ack_summary summary = { 0 }; struct rxrpc_acktrailer trailer; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - rxrpc_serial_t ack_serial; rxrpc_seq_t first_soft_ack, hard_ack, prev_pkt; int nr_acks, offset, ioffset; @@ -1024,14 +1020,14 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) offset = sizeof(struct rxrpc_wire_header) + sizeof(struct rxrpc_ackpacket); - ack_serial = sp->hdr.serial; - first_soft_ack = sp->ack.first_ack; - prev_pkt = sp->ack.prev_ack; - nr_acks = sp->ack.nr_acks; - hard_ack = first_soft_ack - 1; - summary.acked_serial = sp->ack.acked_serial; - summary.ack_reason = (sp->ack.reason < RXRPC_ACK__INVALID ? - sp->ack.reason : RXRPC_ACK__INVALID); + summary.ack_serial = sp->hdr.serial; + first_soft_ack = sp->ack.first_ack; + prev_pkt = sp->ack.prev_ack; + nr_acks = sp->ack.nr_acks; + hard_ack = first_soft_ack - 1; + summary.acked_serial = sp->ack.acked_serial; + summary.ack_reason = (sp->ack.reason < RXRPC_ACK__INVALID ? + sp->ack.reason : RXRPC_ACK__INVALID); trace_rxrpc_rx_ack(call, sp); rxrpc_inc_stat(call->rxnet, stat_rx_acks[summary.ack_reason]); @@ -1066,7 +1062,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) /* Discard any out-of-order or duplicate ACKs (outside lock). */ if (!rxrpc_is_ack_valid(call, hard_ack, prev_pkt)) { - trace_rxrpc_rx_discard_ack(call, ack_serial, hard_ack, prev_pkt); + trace_rxrpc_rx_discard_ack(call, summary.ack_serial, hard_ack, prev_pkt); goto send_response; } @@ -1100,10 +1096,10 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) if (hard_ack + 1 == 0) return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_zero); - if (summary.acked_serial != 0) { + if (summary.acked_serial) { if (summary.ack_reason == RXRPC_ACK_PING_RESPONSE) rxrpc_complete_rtt_probe(call, call->acks_latest_ts, - summary.acked_serial, ack_serial, + summary.acked_serial, summary.ack_serial, rxrpc_rtt_rx_ping_response); else summary.rtt_sample_avail = true; @@ -1127,7 +1123,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_sack_overflow); if (after(hard_ack, call->tx_bottom)) { - if (rxrpc_rotate_tx_window(call, hard_ack, &summary, ack_serial)) { + if (rxrpc_rotate_tx_window(call, hard_ack, &summary)) { rxrpc_end_tx_phase(call, false, rxrpc_eproto_unexpected_ack); goto send_response; } @@ -1142,19 +1138,20 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) if (test_bit(RXRPC_CALL_TX_LAST, &call->flags) && call->acks_nr_sacks == call->tx_top - hard_ack && rxrpc_is_client_call(call)) - rxrpc_propose_ping(call, ack_serial, + rxrpc_propose_ping(call, summary.ack_serial, rxrpc_propose_ack_ping_for_lost_reply); rxrpc_congestion_management(call, &summary); if (summary.need_retransmit) - rxrpc_resend(call, ack_serial, summary.ack_reason == RXRPC_ACK_PING_RESPONSE); + rxrpc_resend(call, summary.ack_serial, + summary.ack_reason == RXRPC_ACK_PING_RESPONSE); send_response: if (summary.ack_reason == RXRPC_ACK_PING) - rxrpc_send_ACK(call, RXRPC_ACK_PING_RESPONSE, ack_serial, + rxrpc_send_ACK(call, RXRPC_ACK_PING_RESPONSE, summary.ack_serial, rxrpc_propose_ack_respond_to_ping); else if (sp->hdr.flags & RXRPC_REQUEST_ACK) - rxrpc_send_ACK(call, RXRPC_ACK_REQUESTED, ack_serial, + rxrpc_send_ACK(call, RXRPC_ACK_REQUESTED, summary.ack_serial, rxrpc_propose_ack_respond_to_ack); } @@ -1165,7 +1162,7 @@ static void rxrpc_input_ackall(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_ack_summary summary = { 0 }; - if (rxrpc_rotate_tx_window(call, call->tx_top, &summary, 0)) + if (rxrpc_rotate_tx_window(call, call->tx_top, &summary)) rxrpc_end_tx_phase(call, false, rxrpc_eproto_unexpected_ackall); } From 372d12d191cb80720319e224d401fd82c602e9e4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:47:03 +0000 Subject: [PATCH 0312/1450] rxrpc: Add a reason indicator to the tx_data tracepoint Add an indicator to the rxrpc_tx_data tracepoint to indicate what triggered the transmission of a particular packet. At this point, it's only normal transmission and retransmission, plus the tracepoint is also used to record loss injection, but in a future patch, TLP-induced (re-)transmission will also be a thing. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 21 ++++++++++++++------- net/rxrpc/ar-internal.h | 1 + net/rxrpc/call_event.c | 12 ++++++++---- net/rxrpc/output.c | 6 +++--- 4 files changed, 26 insertions(+), 14 deletions(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 326a4c257aeae..d79623fff7464 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -302,6 +302,11 @@ EM(rxrpc_txqueue_rotate_last, "RLS") \ E_(rxrpc_txqueue_wait, "WAI") +#define rxrpc_txdata_traces \ + EM(rxrpc_txdata_inject_loss, " *INJ-LOSS*") \ + EM(rxrpc_txdata_new_data, " ") \ + E_(rxrpc_txdata_retransmit, " *RETRANS*") + #define rxrpc_receive_traces \ EM(rxrpc_receive_end, "END") \ EM(rxrpc_receive_front, "FRN") \ @@ -534,6 +539,7 @@ enum rxrpc_timer_trace { rxrpc_timer_traces } __mode(byte); enum rxrpc_tq_trace { rxrpc_tq_traces } __mode(byte); enum rxrpc_tx_point { rxrpc_tx_points } __mode(byte); enum rxrpc_txbuf_trace { rxrpc_txbuf_traces } __mode(byte); +enum rxrpc_txdata_trace { rxrpc_txdata_traces } __mode(byte); enum rxrpc_txqueue_trace { rxrpc_txqueue_traces } __mode(byte); #endif /* end __RXRPC_DECLARE_TRACE_ENUMS_ONCE_ONLY */ @@ -572,6 +578,7 @@ rxrpc_timer_traces; rxrpc_tq_traces; rxrpc_tx_points; rxrpc_txbuf_traces; +rxrpc_txdata_traces; rxrpc_txqueue_traces; /* @@ -1222,9 +1229,10 @@ TRACE_EVENT(rxrpc_tx_packet, TRACE_EVENT(rxrpc_tx_data, TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq, - rxrpc_serial_t serial, unsigned int flags, bool lose), + rxrpc_serial_t serial, unsigned int flags, + enum rxrpc_txdata_trace trace), - TP_ARGS(call, seq, serial, flags, lose), + TP_ARGS(call, seq, serial, flags, trace), TP_STRUCT__entry( __field(unsigned int, call) @@ -1233,7 +1241,7 @@ TRACE_EVENT(rxrpc_tx_data, __field(u32, cid) __field(u32, call_id) __field(u16, flags) - __field(bool, lose) + __field(enum rxrpc_txdata_trace, trace) ), TP_fast_assign( @@ -1243,18 +1251,17 @@ TRACE_EVENT(rxrpc_tx_data, __entry->seq = seq; __entry->serial = serial; __entry->flags = flags; - __entry->lose = lose; + __entry->trace = trace; ), - TP_printk("c=%08x DATA %08x:%08x %08x q=%08x fl=%02x%s%s", + TP_printk("c=%08x DATA %08x:%08x %08x q=%08x fl=%02x%s", __entry->call, __entry->cid, __entry->call_id, __entry->serial, __entry->seq, __entry->flags & RXRPC_TXBUF_WIRE_FLAGS, - __entry->flags & RXRPC_TXBUF_RESENT ? " *RETRANS*" : "", - __entry->lose ? " *LOSE*" : "") + __print_symbolic(__entry->trace, rxrpc_txdata_traces)) ); TRACE_EVENT(rxrpc_tx_ack, diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index aa240b4b4bec3..139575032ae2b 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -883,6 +883,7 @@ struct rxrpc_send_data_req { rxrpc_seq_t seq; /* Sequence of first data */ int n; /* Number of DATA packets to glue into jumbo */ bool did_send; /* T if did actually send */ + int /* enum rxrpc_txdata_trace */ trace; }; #include diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 39772459426ba..99d9502564cc9 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -101,6 +101,7 @@ void rxrpc_resend(struct rxrpc_call *call, rxrpc_serial_t ack_serial, bool ping_ { struct rxrpc_send_data_req req = { .now = ktime_get_real(), + .trace = rxrpc_txdata_retransmit, }; struct rxrpc_txqueue *tq = call->tx_queue; ktime_t lowest_xmit_ts = KTIME_MAX; @@ -269,7 +270,8 @@ static unsigned int rxrpc_tx_window_space(struct rxrpc_call *call) /* * Transmit some as-yet untransmitted data. */ -static void rxrpc_transmit_fresh_data(struct rxrpc_call *call) +static void rxrpc_transmit_fresh_data(struct rxrpc_call *call, + enum rxrpc_txdata_trace trace) { int space = rxrpc_tx_window_space(call); @@ -284,6 +286,7 @@ static void rxrpc_transmit_fresh_data(struct rxrpc_call *call) .now = ktime_get_real(), .seq = call->tx_transmitted + 1, .n = 0, + .trace = trace, }; struct rxrpc_txqueue *tq; struct rxrpc_txbuf *txb; @@ -332,7 +335,8 @@ static void rxrpc_transmit_fresh_data(struct rxrpc_call *call) } } -static void rxrpc_transmit_some_data(struct rxrpc_call *call) +static void rxrpc_transmit_some_data(struct rxrpc_call *call, + enum rxrpc_txdata_trace trace) { switch (__rxrpc_call_state(call)) { case RXRPC_CALL_SERVER_ACK_REQUEST: @@ -349,7 +353,7 @@ static void rxrpc_transmit_some_data(struct rxrpc_call *call) rxrpc_inc_stat(call->rxnet, stat_tx_data_underflow); return; } - rxrpc_transmit_fresh_data(call); + rxrpc_transmit_fresh_data(call, trace); break; default: return; @@ -463,7 +467,7 @@ bool rxrpc_input_call_event(struct rxrpc_call *call) resend = true; } - rxrpc_transmit_some_data(call); + rxrpc_transmit_some_data(call, rxrpc_txdata_new_data); now = ktime_get_real(); t = ktime_sub(call->keepalive_at, now); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index a7de8a02f4199..2633f955d1d0e 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -511,7 +511,7 @@ static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, len += sizeof(*jumbo); } - trace_rxrpc_tx_data(call, txb->seq, txb->serial, txb->flags | flags, false); + trace_rxrpc_tx_data(call, txb->seq, txb->serial, flags, req->trace); kv->iov_len = len; return len; } @@ -655,8 +655,8 @@ void rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req if ((lose++ & 7) == 7) { ret = 0; - trace_rxrpc_tx_data(call, txb->seq, txb->serial, - txb->flags, true); + trace_rxrpc_tx_data(call, txb->seq, txb->serial, txb->flags, + rxrpc_txdata_inject_loss); conn->peer->last_tx_at = ktime_get_seconds(); goto done; } From b509934094fd52ac3a49ee2a2c144e885517069f Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:47:04 +0000 Subject: [PATCH 0313/1450] rxrpc: Add a reason indicator to the tx_ack tracepoint Record the reason for the transmission of an ACK in the rxrpc_tx_ack tracepoint, and not just in the rxrpc_propose_ack tracepoint. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 13 +++++++++---- net/rxrpc/conn_event.c | 3 ++- net/rxrpc/output.c | 2 +- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index d79623fff7464..0cfc8e1baf1f9 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -375,6 +375,7 @@ EM(rxrpc_propose_ack_processing_op, "ProcOp ") \ EM(rxrpc_propose_ack_respond_to_ack, "Rsp2Ack") \ EM(rxrpc_propose_ack_respond_to_ping, "Rsp2Png") \ + EM(rxrpc_propose_ack_retransmit, "Retrans") \ EM(rxrpc_propose_ack_retry_tx, "RetryTx") \ EM(rxrpc_propose_ack_rotate_rx, "RxAck ") \ EM(rxrpc_propose_ack_rx_idle, "RxIdle ") \ @@ -1267,9 +1268,10 @@ TRACE_EVENT(rxrpc_tx_data, TRACE_EVENT(rxrpc_tx_ack, TP_PROTO(unsigned int call, rxrpc_serial_t serial, rxrpc_seq_t ack_first, rxrpc_serial_t ack_serial, - u8 reason, u8 n_acks, u16 rwind), + u8 reason, u8 n_acks, u16 rwind, + enum rxrpc_propose_ack_trace trace), - TP_ARGS(call, serial, ack_first, ack_serial, reason, n_acks, rwind), + TP_ARGS(call, serial, ack_first, ack_serial, reason, n_acks, rwind, trace), TP_STRUCT__entry( __field(unsigned int, call) @@ -1279,6 +1281,7 @@ TRACE_EVENT(rxrpc_tx_ack, __field(u8, reason) __field(u8, n_acks) __field(u16, rwind) + __field(enum rxrpc_propose_ack_trace, trace) ), TP_fast_assign( @@ -1289,16 +1292,18 @@ TRACE_EVENT(rxrpc_tx_ack, __entry->reason = reason; __entry->n_acks = n_acks; __entry->rwind = rwind; + __entry->trace = trace; ), - TP_printk(" c=%08x ACK %08x %s f=%08x r=%08x n=%u rw=%u", + TP_printk(" c=%08x ACK %08x %s f=%08x r=%08x n=%u rw=%u %s", __entry->call, __entry->serial, __print_symbolic(__entry->reason, rxrpc_ack_names), __entry->ack_first, __entry->ack_serial, __entry->n_acks, - __entry->rwind) + __entry->rwind, + __print_symbolic(__entry->trace, rxrpc_propose_ack_traces)) ); TRACE_EVENT(rxrpc_receive, diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 6b29a294ee07d..713e04394ceb7 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -177,7 +177,8 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, trace_rxrpc_tx_ack(chan->call_debug_id, serial, ntohl(pkt.ack.firstPacket), ntohl(pkt.ack.serial), - pkt.ack.reason, 0, rxrpc_rx_window_size); + pkt.ack.reason, 0, rxrpc_rx_window_size, + rxrpc_propose_ack_retransmit); break; default: diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 2633f955d1d0e..74c3ff55b4825 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -267,7 +267,7 @@ static void rxrpc_send_ack_packet(struct rxrpc_call *call, int nr_kv, size_t len trace_rxrpc_tx_ack(call->debug_id, serial, ntohl(ack->firstPacket), ntohl(ack->serial), ack->reason, ack->nAcks, - ntohl(trailer->rwind)); + ntohl(trailer->rwind), why); rxrpc_inc_stat(call->rxnet, stat_tx_ack_send); From b40ef2b85a7d117dd323b5910e504899e0a3e7dc Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:47:05 +0000 Subject: [PATCH 0314/1450] rxrpc: Manage RTT per-call rather than per-peer Manage the determination of RTT on a per-call (ie. per-RPC op) basis rather than on a per-peer basis, averaging across all calls going to that peer. The problem is that the RTT measurements from the initial packets on a call may be off because the server may do some setting up (such as getting a lock on a file) before accepting the rest of the data in the RPC and, further, the RTT may be affected by server-side file operations, for instance if a large amount of data is being written or read. Note: When handling the FS.StoreData-type RPCs, for example, the server uses the userStatus field in the header of ACK packets as supplementary flow control to aid in managing this. AF_RXRPC does not yet support this, but it should be added. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 2 +- net/rxrpc/ar-internal.h | 39 +++++++++------- net/rxrpc/call_event.c | 18 +++---- net/rxrpc/call_object.c | 2 + net/rxrpc/input.c | 10 ++-- net/rxrpc/output.c | 14 +++--- net/rxrpc/peer_object.c | 9 +--- net/rxrpc/proc.c | 6 +-- net/rxrpc/rtt.c | 91 ++++++++++++++++++------------------ net/rxrpc/sendmsg.c | 2 +- 10 files changed, 97 insertions(+), 96 deletions(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 0cfc8e1baf1f9..71df5c48a413d 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -1453,7 +1453,7 @@ TRACE_EVENT(rxrpc_rtt_rx, __entry->rtt = rtt; __entry->srtt = srtt; __entry->rto = rto; - __entry->min_rtt = minmax_get(&call->peer->min_rtt) + __entry->min_rtt = minmax_get(&call->min_rtt) ), TP_printk("c=%08x [%d] %s sr=%08x rr=%08x rtt=%u srtt=%u rto=%u min=%u", diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 139575032ae2b..a9d732ba6df0d 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -366,20 +366,9 @@ struct rxrpc_peer { unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */ unsigned short tx_seg_max; /* Maximum number of transmissable segments */ - /* calculated RTT cache */ -#define RXRPC_RTT_CACHE_SIZE 32 - spinlock_t rtt_input_lock; /* RTT lock for input routine */ - ktime_t rtt_last_req; /* Time of last RTT request */ - unsigned int rtt_count; /* Number of samples we've got */ - unsigned int rtt_taken; /* Number of samples taken (wrapping) */ - struct minmax min_rtt; /* Estimated minimum RTT */ - - u32 srtt_us; /* smoothed round trip time << 3 in usecs */ - u32 mdev_us; /* medium deviation */ - u32 mdev_max_us; /* maximal mdev for the last rtt period */ - u32 rttvar_us; /* smoothed mdev_max */ - u32 rto_us; /* Retransmission timeout in usec */ - u8 backoff; /* Backoff timeout (as shift) */ + /* Calculated RTT cache */ + unsigned int recent_srtt_us; + unsigned int recent_rto_us; u8 cong_ssthresh; /* Congestion slow-start threshold */ }; @@ -765,6 +754,18 @@ struct rxrpc_call { rxrpc_serial_t acks_highest_serial; /* Highest serial number ACK'd */ unsigned short acks_nr_sacks; /* Number of soft acks recorded */ unsigned short acks_nr_snacks; /* Number of soft nacks recorded */ + + /* Calculated RTT cache */ + ktime_t rtt_last_req; /* Time of last RTT request */ + unsigned int rtt_count; /* Number of samples we've got */ + unsigned int rtt_taken; /* Number of samples taken (wrapping) */ + struct minmax min_rtt; /* Estimated minimum RTT */ + u32 srtt_us; /* smoothed round trip time << 3 in usecs */ + u32 mdev_us; /* medium deviation */ + u32 mdev_max_us; /* maximal mdev for the last rtt period */ + u32 rttvar_us; /* smoothed mdev_max */ + u32 rto_us; /* Retransmission timeout in usec */ + u8 backoff; /* Backoff timeout (as shift) */ }; /* @@ -1287,10 +1288,12 @@ static inline int rxrpc_abort_eproto(struct rxrpc_call *call, /* * rtt.c */ -void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace, int, - rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t); -ktime_t rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans); -void rxrpc_peer_init_rtt(struct rxrpc_peer *); +void rxrpc_call_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, + int rtt_slot, + rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, + ktime_t send_time, ktime_t resp_time); +ktime_t rxrpc_get_rto_backoff(struct rxrpc_call *call, bool retrans); +void rxrpc_call_init_rtt(struct rxrpc_call *call); /* * rxkad.c diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 99d9502564cc9..7af2755442512 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -44,8 +44,8 @@ void rxrpc_propose_delay_ACK(struct rxrpc_call *call, rxrpc_serial_t serial, trace_rxrpc_propose_ack(call, why, RXRPC_ACK_DELAY, serial); - if (call->peer->srtt_us) - delay = (call->peer->srtt_us >> 3) * NSEC_PER_USEC; + if (call->srtt_us) + delay = (call->srtt_us >> 3) * NSEC_PER_USEC; else delay = ms_to_ktime(READ_ONCE(rxrpc_soft_ack_delay)); ktime_add_ms(delay, call->tx_backoff); @@ -105,7 +105,7 @@ void rxrpc_resend(struct rxrpc_call *call, rxrpc_serial_t ack_serial, bool ping_ }; struct rxrpc_txqueue *tq = call->tx_queue; ktime_t lowest_xmit_ts = KTIME_MAX; - ktime_t rto = rxrpc_get_rto_backoff(call->peer, false); + ktime_t rto = rxrpc_get_rto_backoff(call, false); bool unacked = false; _enter("{%d,%d}", call->tx_bottom, call->tx_top); @@ -195,7 +195,7 @@ void rxrpc_resend(struct rxrpc_call *call, rxrpc_serial_t ack_serial, bool ping_ } while ((tq = tq->next)); if (lowest_xmit_ts < KTIME_MAX) { - ktime_t delay = rxrpc_get_rto_backoff(call->peer, req.did_send); + ktime_t delay = rxrpc_get_rto_backoff(call, req.did_send); ktime_t resend_at = ktime_add(lowest_xmit_ts, delay); _debug("delay %llu %lld", delay, ktime_sub(resend_at, req.now)); @@ -216,7 +216,7 @@ void rxrpc_resend(struct rxrpc_call *call, rxrpc_serial_t ack_serial, bool ping_ */ if (!req.did_send) { ktime_t next_ping = ktime_add_us(call->acks_latest_ts, - call->peer->srtt_us >> 3); + call->srtt_us >> 3); if (ktime_sub(next_ping, req.now) <= 0) rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, @@ -366,8 +366,8 @@ static void rxrpc_transmit_some_data(struct rxrpc_call *call, */ static void rxrpc_send_initial_ping(struct rxrpc_call *call) { - if (call->peer->rtt_count < 3 || - ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), + if (call->rtt_count < 3 || + ktime_before(ktime_add_ms(call->rtt_last_req, 1000), ktime_get_real())) rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, rxrpc_propose_ack_ping_for_params); @@ -499,10 +499,10 @@ bool rxrpc_input_call_event(struct rxrpc_call *call) rxrpc_propose_ack_rx_idle); if (call->ackr_nr_unacked > 2) { - if (call->peer->rtt_count < 3) + if (call->rtt_count < 3) rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, rxrpc_propose_ack_ping_for_rtt); - else if (ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), + else if (ktime_before(ktime_add_ms(call->rtt_last_req, 1000), ktime_get_real())) rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, rxrpc_propose_ack_ping_for_old_rtt); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 75cd0b06e14cb..fb4ee0d2e9e16 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -176,6 +176,8 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, call->cong_cwnd = RXRPC_MIN_CWND; call->cong_ssthresh = RXRPC_TX_MAX_WINDOW; + rxrpc_call_init_rtt(call); + call->rxnet = rxnet; call->rtt_avail = RXRPC_CALL_RTT_AVAIL_MASK; atomic_inc(&rxnet->nr_calls); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 036cf440b63b0..9f308bd512e93 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -71,11 +71,11 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, /* We analyse the number of packets that get ACK'd per RTT * period and increase the window if we managed to fill it. */ - if (call->peer->rtt_count == 0) + if (call->rtt_count == 0) goto out; if (ktime_before(call->acks_latest_ts, ktime_add_us(call->cong_tstamp, - call->peer->srtt_us >> 3))) + call->srtt_us >> 3))) goto out_no_clear_ca; summary->change = rxrpc_cong_rtt_window_end; call->cong_tstamp = call->acks_latest_ts; @@ -179,7 +179,7 @@ void rxrpc_congestion_degrade(struct rxrpc_call *call) if (__rxrpc_call_state(call) == RXRPC_CALL_CLIENT_AWAIT_REPLY) return; - rtt = ns_to_ktime(call->peer->srtt_us * (1000 / 8)); + rtt = ns_to_ktime(call->srtt_us * (NSEC_PER_USEC / 8)); now = ktime_get_real(); if (!ktime_before(ktime_add(call->tx_last_sent, rtt), now)) return; @@ -200,7 +200,7 @@ static void rxrpc_add_data_rtt_sample(struct rxrpc_call *call, struct rxrpc_txqueue *tq, int ix) { - rxrpc_peer_add_rtt(call, rxrpc_rtt_rx_data_ack, -1, + rxrpc_call_add_rtt(call, rxrpc_rtt_rx_data_ack, -1, summary->acked_serial, summary->ack_serial, ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[ix]), call->acks_latest_ts); @@ -725,7 +725,7 @@ static void rxrpc_complete_rtt_probe(struct rxrpc_call *call, clear_bit(i + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail); smp_mb(); /* Read data before setting avail bit */ set_bit(i, &call->rtt_avail); - rxrpc_peer_add_rtt(call, type, i, acked_serial, ack_serial, + rxrpc_call_add_rtt(call, type, i, acked_serial, ack_serial, sent_at, resp_time); matched = true; } diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 74c3ff55b4825..ecaf3becee40d 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -234,7 +234,7 @@ static int rxrpc_fill_out_ack(struct rxrpc_call *call, int nr_kv, u8 ack_reason, if (ack_reason == RXRPC_ACK_PING) rxrpc_begin_rtt_probe(call, *_ack_serial, now, rxrpc_rtt_tx_ping); if (whdr->flags & RXRPC_REQUEST_ACK) - call->peer->rtt_last_req = now; + call->rtt_last_req = now; rxrpc_set_keepalive(call, now); return nr_kv; } @@ -473,9 +473,9 @@ static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, why = rxrpc_reqack_slow_start; else if (call->tx_winsize <= 2) why = rxrpc_reqack_small_txwin; - else if (call->peer->rtt_count < 3 && txb->seq & 1) + else if (call->rtt_count < 3) why = rxrpc_reqack_more_rtt; - else if (ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), ktime_get_real())) + else if (ktime_before(ktime_add_ms(call->rtt_last_req, 1000), ktime_get_real())) why = rxrpc_reqack_old_rtt; else if (!last && !after(READ_ONCE(call->send_top), txb->seq)) why = rxrpc_reqack_app_stall; @@ -487,7 +487,7 @@ static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, if (why != rxrpc_reqack_no_srv_last) { flags |= RXRPC_REQUEST_ACK; trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_data, -1, serial); - call->peer->rtt_last_req = req->now; + call->rtt_last_req = req->now; } dont_set_request_ack: @@ -576,8 +576,8 @@ static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_se } /* Set timeouts */ - if (call->peer->rtt_count > 1) { - ktime_t delay = rxrpc_get_rto_backoff(call->peer, false); + if (call->rtt_count > 1) { + ktime_t delay = rxrpc_get_rto_backoff(call, false); call->ack_lost_at = ktime_add(req->now, delay); trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_lost_ack); @@ -590,7 +590,7 @@ static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_se trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_expect_rx); } if (call->resend_at == KTIME_MAX) { - ktime_t delay = rxrpc_get_rto_backoff(call->peer, false); + ktime_t delay = rxrpc_get_rto_backoff(call, false); call->resend_at = ktime_add(req->now, delay); trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_resend); diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 27b34ed4d76aa..e1c63129586b1 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -235,12 +235,9 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp, peer->service_conns = RB_ROOT; seqlock_init(&peer->service_conn_lock); spin_lock_init(&peer->lock); - spin_lock_init(&peer->rtt_input_lock); seqcount_init(&peer->mtu_lock); peer->debug_id = atomic_inc_return(&rxrpc_debug_id); - - rxrpc_peer_init_rtt(peer); - + peer->recent_srtt_us = UINT_MAX; peer->cong_ssthresh = RXRPC_TX_MAX_WINDOW; trace_rxrpc_peer(peer->debug_id, 1, why); } @@ -283,8 +280,6 @@ static void rxrpc_init_peer(struct rxrpc_local *local, struct rxrpc_peer *peer, peer->max_data = peer->if_mtu - peer->hdrsize; rxrpc_assess_MTU_size(local, peer); - - peer->rtt_last_req = ktime_get_real(); } /* @@ -496,7 +491,7 @@ EXPORT_SYMBOL(rxrpc_kernel_get_call_peer); */ unsigned int rxrpc_kernel_get_srtt(const struct rxrpc_peer *peer) { - return peer->rtt_count > 0 ? peer->srtt_us >> 3 : UINT_MAX; + return READ_ONCE(peer->recent_srtt_us); } EXPORT_SYMBOL(rxrpc_kernel_get_srtt); diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 5f974ec13d694..d803562ca0ac1 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -296,15 +296,15 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v) now = ktime_get_seconds(); seq_printf(seq, - "UDP %-47.47s %-47.47s %3u %4u %5u %6llus %8u %8u\n", + "UDP %-47.47s %-47.47s %3u %4u %5u %6llus %8d %8d\n", lbuff, rbuff, refcount_read(&peer->ref), peer->cong_ssthresh, peer->max_data, now - peer->last_tx_at, - peer->srtt_us >> 3, - peer->rto_us); + READ_ONCE(peer->recent_srtt_us), + READ_ONCE(peer->recent_rto_us)); return 0; } diff --git a/net/rxrpc/rtt.c b/net/rxrpc/rtt.c index aff75e168de86..7474f88d7b180 100644 --- a/net/rxrpc/rtt.c +++ b/net/rxrpc/rtt.c @@ -15,14 +15,14 @@ #define RXRPC_TIMEOUT_INIT ((unsigned int)(1 * USEC_PER_SEC)) /* RFC6298 2.1 initial RTO value */ #define rxrpc_jiffies32 ((u32)jiffies) /* As rxrpc_jiffies32 */ -static u32 rxrpc_rto_min_us(struct rxrpc_peer *peer) +static u32 rxrpc_rto_min_us(struct rxrpc_call *call) { return 200; } -static u32 __rxrpc_set_rto(const struct rxrpc_peer *peer) +static u32 __rxrpc_set_rto(const struct rxrpc_call *call) { - return (peer->srtt_us >> 3) + peer->rttvar_us; + return (call->srtt_us >> 3) + call->rttvar_us; } static u32 rxrpc_bound_rto(u32 rto) @@ -40,10 +40,10 @@ static u32 rxrpc_bound_rto(u32 rto) * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ -static void rxrpc_rtt_estimator(struct rxrpc_peer *peer, long sample_rtt_us) +static void rxrpc_rtt_estimator(struct rxrpc_call *call, long sample_rtt_us) { long m = sample_rtt_us; /* RTT */ - u32 srtt = peer->srtt_us; + u32 srtt = call->srtt_us; /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev @@ -66,7 +66,7 @@ static void rxrpc_rtt_estimator(struct rxrpc_peer *peer, long sample_rtt_us) srtt += m; /* rtt = 7/8 rtt + 1/8 new */ if (m < 0) { m = -m; /* m is now abs(error) */ - m -= (peer->mdev_us >> 2); /* similar update on mdev */ + m -= (call->mdev_us >> 2); /* similar update on mdev */ /* This is similar to one of Eifel findings. * Eifel blocks mdev updates when rtt decreases. * This solution is a bit different: we use finer gain @@ -78,31 +78,31 @@ static void rxrpc_rtt_estimator(struct rxrpc_peer *peer, long sample_rtt_us) if (m > 0) m >>= 3; } else { - m -= (peer->mdev_us >> 2); /* similar update on mdev */ + m -= (call->mdev_us >> 2); /* similar update on mdev */ } - peer->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */ - if (peer->mdev_us > peer->mdev_max_us) { - peer->mdev_max_us = peer->mdev_us; - if (peer->mdev_max_us > peer->rttvar_us) - peer->rttvar_us = peer->mdev_max_us; + call->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */ + if (call->mdev_us > call->mdev_max_us) { + call->mdev_max_us = call->mdev_us; + if (call->mdev_max_us > call->rttvar_us) + call->rttvar_us = call->mdev_max_us; } } else { /* no previous measure. */ srtt = m << 3; /* take the measured time to be rtt */ - peer->mdev_us = m << 1; /* make sure rto = 3*rtt */ - peer->rttvar_us = umax(peer->mdev_us, rxrpc_rto_min_us(peer)); - peer->mdev_max_us = peer->rttvar_us; + call->mdev_us = m << 1; /* make sure rto = 3*rtt */ + call->rttvar_us = umax(call->mdev_us, rxrpc_rto_min_us(call)); + call->mdev_max_us = call->rttvar_us; } - peer->srtt_us = umax(srtt, 1); + call->srtt_us = umax(srtt, 1); } /* * Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ -static void rxrpc_set_rto(struct rxrpc_peer *peer) +static void rxrpc_set_rto(struct rxrpc_call *call) { u32 rto; @@ -113,7 +113,7 @@ static void rxrpc_set_rto(struct rxrpc_peer *peer) * is invisible. Actually, Linux-2.4 also generates erratic * ACKs in some circumstances. */ - rto = __rxrpc_set_rto(peer); + rto = __rxrpc_set_rto(call); /* 2. Fixups made earlier cannot be right. * If we do not estimate RTO correctly without them, @@ -124,73 +124,73 @@ static void rxrpc_set_rto(struct rxrpc_peer *peer) /* NOTE: clamping at RXRPC_RTO_MIN is not required, current algo * guarantees that rto is higher. */ - peer->rto_us = rxrpc_bound_rto(rto); + call->rto_us = rxrpc_bound_rto(rto); } -static void rxrpc_update_rtt_min(struct rxrpc_peer *peer, ktime_t resp_time, long rtt_us) +static void rxrpc_update_rtt_min(struct rxrpc_call *call, ktime_t resp_time, long rtt_us) { /* Window size 5mins in approx usec (ipv4.sysctl_tcp_min_rtt_wlen) */ u32 wlen_us = 5ULL * NSEC_PER_SEC / 1024; - minmax_running_min(&peer->min_rtt, wlen_us, resp_time / 1024, + minmax_running_min(&call->min_rtt, wlen_us, resp_time / 1024, (u32)rtt_us ? : jiffies_to_usecs(1)); } -static void rxrpc_ack_update_rtt(struct rxrpc_peer *peer, ktime_t resp_time, long rtt_us) +static void rxrpc_ack_update_rtt(struct rxrpc_call *call, ktime_t resp_time, long rtt_us) { if (rtt_us < 0) return; /* Update RACK min RTT [RFC8985 6.1 Step 1]. */ - rxrpc_update_rtt_min(peer, resp_time, rtt_us); + rxrpc_update_rtt_min(call, resp_time, rtt_us); - rxrpc_rtt_estimator(peer, rtt_us); - rxrpc_set_rto(peer); + rxrpc_rtt_estimator(call, rtt_us); + rxrpc_set_rto(call); /* Only reset backoff on valid RTT measurement [RFC6298]. */ - peer->backoff = 0; + call->backoff = 0; } /* * Add RTT information to cache. This is called in softirq mode and has - * exclusive access to the peer RTT data. + * exclusive access to the call RTT data. */ -void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, +void rxrpc_call_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, int rtt_slot, rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, ktime_t send_time, ktime_t resp_time) { - struct rxrpc_peer *peer = call->peer; s64 rtt_us; rtt_us = ktime_to_us(ktime_sub(resp_time, send_time)); if (rtt_us < 0) return; - spin_lock(&peer->rtt_input_lock); - rxrpc_ack_update_rtt(peer, resp_time, rtt_us); - if (peer->rtt_count < 3) - peer->rtt_count++; - peer->rtt_taken++; - spin_unlock(&peer->rtt_input_lock); + rxrpc_ack_update_rtt(call, resp_time, rtt_us); + if (call->rtt_count < 3) + call->rtt_count++; + call->rtt_taken++; + + WRITE_ONCE(call->peer->recent_srtt_us, call->srtt_us / 8); + WRITE_ONCE(call->peer->recent_rto_us, call->rto_us); trace_rxrpc_rtt_rx(call, why, rtt_slot, send_serial, resp_serial, - rtt_us, peer->srtt_us, peer->rto_us); + rtt_us, call->srtt_us, call->rto_us); } /* * Get the retransmission timeout to set in nanoseconds, backing it off each * time we retransmit. */ -ktime_t rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans) +ktime_t rxrpc_get_rto_backoff(struct rxrpc_call *call, bool retrans) { u64 timo_us; - u32 backoff = READ_ONCE(peer->backoff); + u32 backoff = READ_ONCE(call->backoff); - timo_us = peer->rto_us; + timo_us = call->rto_us; timo_us <<= backoff; if (retrans && timo_us * 2 <= RXRPC_RTO_MAX) - WRITE_ONCE(peer->backoff, backoff + 1); + WRITE_ONCE(call->backoff, backoff + 1); if (timo_us < 1) timo_us = 1; @@ -198,10 +198,11 @@ ktime_t rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans) return ns_to_ktime(timo_us * NSEC_PER_USEC); } -void rxrpc_peer_init_rtt(struct rxrpc_peer *peer) +void rxrpc_call_init_rtt(struct rxrpc_call *call) { - peer->rto_us = RXRPC_TIMEOUT_INIT; - peer->mdev_us = RXRPC_TIMEOUT_INIT; - peer->backoff = 0; - //minmax_reset(&peer->rtt_min, rxrpc_jiffies32, ~0U); + call->rtt_last_req = KTIME_MIN; + call->rto_us = RXRPC_TIMEOUT_INIT; + call->mdev_us = RXRPC_TIMEOUT_INIT; + call->backoff = 0; + //minmax_reset(&call->rtt_min, rxrpc_jiffies32, ~0U); } diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index df501a7c92fa3..c4c8b718cafaa 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -134,7 +134,7 @@ static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx, rxrpc_seq_t tx_start, tx_win; signed long rtt, timeout; - rtt = READ_ONCE(call->peer->srtt_us) >> 3; + rtt = READ_ONCE(call->srtt_us) >> 3; rtt = usecs_to_jiffies(rtt) * 2; if (rtt < 2) rtt = 2; From 4ee4c2f82b81c088d1514b04c28c84c15e98ba1a Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:47:06 +0000 Subject: [PATCH 0315/1450] rxrpc: Fix request for an ACK when cwnd is minimum rxrpc_prepare_data_subpacket() sets the REQUEST-ACK flag on the outgoing DATA packet under a number of circumstances, including, theoretically, when the cwnd is at minimum (or less). However, the minimum in this function is hard-coded as 2, but the actual minimum is RXRPC_MIN_CWND (which is currently 4) and so this never occurs. Without this, we will miss the request of some ACKs, potentially leading to a transmission stall until a timeout occurs on one side or the other that leads to an ACK being generated. Fix the function to use RXRPC_MIN_CWND rather than a hard-coded number. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Jakub Kicinski --- net/rxrpc/output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index ecaf3becee40d..f934551a9b1c6 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -469,7 +469,7 @@ static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, why = rxrpc_reqack_ack_lost; else if (txb->flags & RXRPC_TXBUF_RESENT) why = rxrpc_reqack_retrans; - else if (call->cong_ca_state == RXRPC_CA_SLOW_START && call->cong_cwnd <= 2) + else if (call->cong_ca_state == RXRPC_CA_SLOW_START && call->cong_cwnd <= RXRPC_MIN_CWND) why = rxrpc_reqack_slow_start; else if (call->tx_winsize <= 2) why = rxrpc_reqack_small_txwin; From 7c482665931b6ce7bc72fa5feae6c35567070296 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:47:07 +0000 Subject: [PATCH 0316/1450] rxrpc: Implement RACK/TLP to deal with transmission stalls [RFC8985] When an rxrpc call is in its transmission phase and is sending a lot of packets, stalls occasionally occur that cause severe performance degradation (eg. increasing the transmission time for a 256MiB payload from 0.7s to 2.5s over a 10G link). rxrpc already implements TCP-style congestion control [RFC5681] and this helps mitigate the effects, but occasionally we're missing a time event that deals with a missing ACK, leading to a stall until the RTO expires. Fix this by implementing RACK/TLP in rxrpc. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 342 ++++++++++++++++++++++++++-- net/rxrpc/Makefile | 1 + net/rxrpc/ar-internal.h | 107 ++++++++- net/rxrpc/call_event.c | 247 +++++++-------------- net/rxrpc/call_object.c | 3 +- net/rxrpc/input.c | 117 ++++++---- net/rxrpc/input_rack.c | 418 +++++++++++++++++++++++++++++++++++ net/rxrpc/io_thread.c | 1 + net/rxrpc/output.c | 41 +++- 9 files changed, 1041 insertions(+), 236 deletions(-) create mode 100644 net/rxrpc/input_rack.c diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 71df5c48a413d..2f119d18a061f 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -305,7 +305,9 @@ #define rxrpc_txdata_traces \ EM(rxrpc_txdata_inject_loss, " *INJ-LOSS*") \ EM(rxrpc_txdata_new_data, " ") \ - E_(rxrpc_txdata_retransmit, " *RETRANS*") + EM(rxrpc_txdata_retransmit, " *RETRANS*") \ + EM(rxrpc_txdata_tlp_new_data, " *TLP-NEW*") \ + E_(rxrpc_txdata_tlp_retransmit, " *TLP-RETRANS*") #define rxrpc_receive_traces \ EM(rxrpc_receive_end, "END") \ @@ -353,11 +355,12 @@ EM(rxrpc_timer_trace_hard, "HardLimit") \ EM(rxrpc_timer_trace_idle, "IdleLimit") \ EM(rxrpc_timer_trace_keepalive, "KeepAlive") \ - EM(rxrpc_timer_trace_lost_ack, "LostAck ") \ EM(rxrpc_timer_trace_ping, "DelayPing") \ - EM(rxrpc_timer_trace_resend, "Resend ") \ - EM(rxrpc_timer_trace_resend_reset, "ResendRst") \ - E_(rxrpc_timer_trace_resend_tx, "ResendTx ") + EM(rxrpc_timer_trace_rack_off, "RACK-OFF ") \ + EM(rxrpc_timer_trace_rack_zwp, "RACK-ZWP ") \ + EM(rxrpc_timer_trace_rack_reo, "RACK-Reo ") \ + EM(rxrpc_timer_trace_rack_tlp_pto, "TLP-PTO ") \ + E_(rxrpc_timer_trace_rack_rto, "RTO ") #define rxrpc_propose_ack_traces \ EM(rxrpc_propose_ack_client_tx_end, "ClTxEnd") \ @@ -478,9 +481,9 @@ EM(rxrpc_txbuf_put_rotated, "PUT ROTATED") \ EM(rxrpc_txbuf_put_send_aborted, "PUT SEND-X ") \ EM(rxrpc_txbuf_put_trans, "PUT TRANS ") \ + EM(rxrpc_txbuf_see_lost, "SEE LOST ") \ EM(rxrpc_txbuf_see_out_of_step, "OUT-OF-STEP") \ - EM(rxrpc_txbuf_see_send_more, "SEE SEND+ ") \ - E_(rxrpc_txbuf_see_unacked, "SEE UNACKED") + E_(rxrpc_txbuf_see_send_more, "SEE SEND+ ") #define rxrpc_tq_traces \ EM(rxrpc_tq_alloc, "ALLOC") \ @@ -505,6 +508,24 @@ EM(rxrpc_rotate_trace_sack, "soft-ack") \ E_(rxrpc_rotate_trace_snak, "soft-nack") +#define rxrpc_rack_timer_modes \ + EM(RXRPC_CALL_RACKTIMER_OFF, "---") \ + EM(RXRPC_CALL_RACKTIMER_RACK_REORDER, "REO") \ + EM(RXRPC_CALL_RACKTIMER_TLP_PTO, "TLP") \ + E_(RXRPC_CALL_RACKTIMER_RTO, "RTO") + +#define rxrpc_tlp_probe_traces \ + EM(rxrpc_tlp_probe_trace_busy, "busy") \ + EM(rxrpc_tlp_probe_trace_transmit_new, "transmit-new") \ + E_(rxrpc_tlp_probe_trace_retransmit, "retransmit") + +#define rxrpc_tlp_ack_traces \ + EM(rxrpc_tlp_ack_trace_acked, "acked") \ + EM(rxrpc_tlp_ack_trace_dup_acked, "dup-acked") \ + EM(rxrpc_tlp_ack_trace_hard_beyond, "hard-beyond") \ + EM(rxrpc_tlp_ack_trace_incomplete, "incomplete") \ + E_(rxrpc_tlp_ack_trace_new_data, "new-data") + /* * Generate enums for tracing information. */ @@ -537,6 +558,8 @@ enum rxrpc_rtt_tx_trace { rxrpc_rtt_tx_traces } __mode(byte); enum rxrpc_sack_trace { rxrpc_sack_traces } __mode(byte); enum rxrpc_skb_trace { rxrpc_skb_traces } __mode(byte); enum rxrpc_timer_trace { rxrpc_timer_traces } __mode(byte); +enum rxrpc_tlp_ack_trace { rxrpc_tlp_ack_traces } __mode(byte); +enum rxrpc_tlp_probe_trace { rxrpc_tlp_probe_traces } __mode(byte); enum rxrpc_tq_trace { rxrpc_tq_traces } __mode(byte); enum rxrpc_tx_point { rxrpc_tx_points } __mode(byte); enum rxrpc_txbuf_trace { rxrpc_txbuf_traces } __mode(byte); @@ -567,6 +590,7 @@ rxrpc_conn_traces; rxrpc_local_traces; rxrpc_pmtud_reduce_traces; rxrpc_propose_ack_traces; +rxrpc_rack_timer_modes; rxrpc_receive_traces; rxrpc_recvmsg_traces; rxrpc_req_ack_traces; @@ -576,6 +600,8 @@ rxrpc_rtt_tx_traces; rxrpc_sack_traces; rxrpc_skb_traces; rxrpc_timer_traces; +rxrpc_tlp_ack_traces; +rxrpc_tlp_probe_traces; rxrpc_tq_traces; rxrpc_tx_points; rxrpc_txbuf_traces; @@ -618,6 +644,20 @@ TRACE_EVENT(rxrpc_local, __entry->usage) ); +TRACE_EVENT(rxrpc_iothread_rx, + TP_PROTO(struct rxrpc_local *local, unsigned int nr_rx), + TP_ARGS(local, nr_rx), + TP_STRUCT__entry( + __field(unsigned int, local) + __field(unsigned int, nr_rx) + ), + TP_fast_assign( + __entry->local = local->debug_id; + __entry->nr_rx = nr_rx; + ), + TP_printk("L=%08x nrx=%u", __entry->local, __entry->nr_rx) + ); + TRACE_EVENT(rxrpc_peer, TP_PROTO(unsigned int peer_debug_id, int ref, enum rxrpc_peer_trace why), @@ -1684,16 +1724,15 @@ TRACE_EVENT(rxrpc_drop_ack, TRACE_EVENT(rxrpc_retransmit, TP_PROTO(struct rxrpc_call *call, struct rxrpc_send_data_req *req, - struct rxrpc_txbuf *txb, ktime_t expiry), + struct rxrpc_txbuf *txb), - TP_ARGS(call, req, txb, expiry), + TP_ARGS(call, req, txb), TP_STRUCT__entry( __field(unsigned int, call) __field(unsigned int, qbase) __field(rxrpc_seq_t, seq) __field(rxrpc_serial_t, serial) - __field(ktime_t, expiry) ), TP_fast_assign( @@ -1701,15 +1740,13 @@ TRACE_EVENT(rxrpc_retransmit, __entry->qbase = req->tq->qbase; __entry->seq = req->seq; __entry->serial = txb->serial; - __entry->expiry = expiry; ), - TP_printk("c=%08x tq=%x q=%x r=%x xp=%lld", + TP_printk("c=%08x tq=%x q=%x r=%x", __entry->call, __entry->qbase, __entry->seq, - __entry->serial, - ktime_to_us(__entry->expiry)) + __entry->serial) ); TRACE_EVENT(rxrpc_congest, @@ -1767,9 +1804,9 @@ TRACE_EVENT(rxrpc_congest, ); TRACE_EVENT(rxrpc_reset_cwnd, - TP_PROTO(struct rxrpc_call *call, ktime_t now), + TP_PROTO(struct rxrpc_call *call, ktime_t since_last_tx, ktime_t rtt), - TP_ARGS(call, now), + TP_ARGS(call, since_last_tx, rtt), TP_STRUCT__entry( __field(unsigned int, call) @@ -1779,6 +1816,7 @@ TRACE_EVENT(rxrpc_reset_cwnd, __field(rxrpc_seq_t, hard_ack) __field(rxrpc_seq_t, prepared) __field(ktime_t, since_last_tx) + __field(ktime_t, rtt) __field(bool, has_data) ), @@ -1789,18 +1827,20 @@ TRACE_EVENT(rxrpc_reset_cwnd, __entry->extra = call->cong_extra; __entry->hard_ack = call->acks_hard_ack; __entry->prepared = call->send_top - call->tx_bottom; - __entry->since_last_tx = ktime_sub(now, call->tx_last_sent); + __entry->since_last_tx = since_last_tx; + __entry->rtt = rtt; __entry->has_data = call->tx_bottom != call->tx_top; ), - TP_printk("c=%08x q=%08x %s cw=%u+%u pr=%u tm=%llu d=%u", + TP_printk("c=%08x q=%08x %s cw=%u+%u pr=%u tm=%llu/%llu d=%u", __entry->call, __entry->hard_ack, __print_symbolic(__entry->ca_state, rxrpc_ca_states), __entry->cwnd, __entry->extra, __entry->prepared, - ktime_to_ns(__entry->since_last_tx), + ktime_to_us(__entry->since_last_tx), + ktime_to_us(__entry->rtt), __entry->has_data) ); @@ -1925,6 +1965,32 @@ TRACE_EVENT(rxrpc_resend, __entry->transmitted) ); +TRACE_EVENT(rxrpc_resend_lost, + TP_PROTO(struct rxrpc_call *call, struct rxrpc_txqueue *tq, unsigned long lost), + + TP_ARGS(call, tq, lost), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(rxrpc_seq_t, qbase) + __field(u8, nr_rep) + __field(unsigned long, lost) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->qbase = tq->qbase; + __entry->nr_rep = tq->nr_reported_acks; + __entry->lost = lost; + ), + + TP_printk("c=%08x tq=%x lost=%016lx nr=%u", + __entry->call, + __entry->qbase, + __entry->lost, + __entry->nr_rep) + ); + TRACE_EVENT(rxrpc_rotate, TP_PROTO(struct rxrpc_call *call, struct rxrpc_txqueue *tq, struct rxrpc_ack_summary *summary, rxrpc_seq_t seq, @@ -2363,6 +2429,244 @@ TRACE_EVENT(rxrpc_pmtud_reduce, __entry->serial, __entry->max_data) ); +TRACE_EVENT(rxrpc_rack, + TP_PROTO(struct rxrpc_call *call, ktime_t timo), + + TP_ARGS(call, timo), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(rxrpc_serial_t, ack_serial) + __field(rxrpc_seq_t, seq) + __field(enum rxrpc_rack_timer_mode, mode) + __field(unsigned short, nr_sent) + __field(unsigned short, nr_lost) + __field(unsigned short, nr_resent) + __field(unsigned short, nr_sacked) + __field(ktime_t, timo) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->ack_serial = call->rx_serial; + __entry->seq = call->rack_end_seq; + __entry->mode = call->rack_timer_mode; + __entry->nr_sent = call->tx_nr_sent; + __entry->nr_lost = call->tx_nr_lost; + __entry->nr_resent = call->tx_nr_resent; + __entry->nr_sacked = call->acks_nr_sacks; + __entry->timo = timo; + ), + + TP_printk("c=%08x r=%08x q=%08x %s slrs=%u,%u,%u,%u t=%lld", + __entry->call, __entry->ack_serial, __entry->seq, + __print_symbolic(__entry->mode, rxrpc_rack_timer_modes), + __entry->nr_sent, __entry->nr_lost, + __entry->nr_resent, __entry->nr_sacked, + ktime_to_us(__entry->timo)) + ); + +TRACE_EVENT(rxrpc_rack_update, + TP_PROTO(struct rxrpc_call *call, struct rxrpc_ack_summary *summary), + + TP_ARGS(call, summary), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(rxrpc_serial_t, ack_serial) + __field(rxrpc_seq_t, seq) + __field(int, xmit_ts) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->ack_serial = call->rx_serial; + __entry->seq = call->rack_end_seq; + __entry->xmit_ts = ktime_sub(call->acks_latest_ts, call->rack_xmit_ts); + ), + + TP_printk("c=%08x r=%08x q=%08x xt=%lld", + __entry->call, __entry->ack_serial, __entry->seq, + ktime_to_us(__entry->xmit_ts)) + ); + +TRACE_EVENT(rxrpc_rack_scan_loss, + TP_PROTO(struct rxrpc_call *call), + + TP_ARGS(call), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(ktime_t, rack_rtt) + __field(ktime_t, rack_reo_wnd) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->rack_rtt = call->rack_rtt; + __entry->rack_reo_wnd = call->rack_reo_wnd; + ), + + TP_printk("c=%08x rtt=%lld reow=%lld", + __entry->call, ktime_to_us(__entry->rack_rtt), + ktime_to_us(__entry->rack_reo_wnd)) + ); + +TRACE_EVENT(rxrpc_rack_scan_loss_tq, + TP_PROTO(struct rxrpc_call *call, const struct rxrpc_txqueue *tq, + unsigned long nacks), + + TP_ARGS(call, tq, nacks), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(rxrpc_seq_t, qbase) + __field(unsigned long, nacks) + __field(unsigned long, lost) + __field(unsigned long, retrans) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->qbase = tq->qbase; + __entry->nacks = nacks; + __entry->lost = tq->segment_lost; + __entry->retrans = tq->segment_retransmitted; + ), + + TP_printk("c=%08x q=%08x n=%lx l=%lx r=%lx", + __entry->call, __entry->qbase, + __entry->nacks, __entry->lost, __entry->retrans) + ); + +TRACE_EVENT(rxrpc_rack_detect_loss, + TP_PROTO(struct rxrpc_call *call, struct rxrpc_ack_summary *summary, + rxrpc_seq_t seq), + + TP_ARGS(call, summary, seq), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(rxrpc_serial_t, ack_serial) + __field(rxrpc_seq_t, seq) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->ack_serial = call->rx_serial; + __entry->seq = seq; + ), + + TP_printk("c=%08x r=%08x q=%08x", + __entry->call, __entry->ack_serial, __entry->seq) + ); + +TRACE_EVENT(rxrpc_rack_mark_loss_tq, + TP_PROTO(struct rxrpc_call *call, const struct rxrpc_txqueue *tq), + + TP_ARGS(call, tq), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(rxrpc_seq_t, qbase) + __field(rxrpc_seq_t, trans) + __field(unsigned long, acked) + __field(unsigned long, lost) + __field(unsigned long, retrans) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->qbase = tq->qbase; + __entry->trans = call->tx_transmitted; + __entry->acked = tq->segment_acked; + __entry->lost = tq->segment_lost; + __entry->retrans = tq->segment_retransmitted; + ), + + TP_printk("c=%08x tq=%08x txq=%08x a=%lx l=%lx r=%lx", + __entry->call, __entry->qbase, __entry->trans, + __entry->acked, __entry->lost, __entry->retrans) + ); + +TRACE_EVENT(rxrpc_tlp_probe, + TP_PROTO(struct rxrpc_call *call, enum rxrpc_tlp_probe_trace trace), + + TP_ARGS(call, trace), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(rxrpc_serial_t, serial) + __field(rxrpc_seq_t, seq) + __field(enum rxrpc_tlp_probe_trace, trace) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->serial = call->tlp_serial; + __entry->seq = call->tlp_seq; + __entry->trace = trace; + ), + + TP_printk("c=%08x r=%08x pq=%08x %s", + __entry->call, __entry->serial, __entry->seq, + __print_symbolic(__entry->trace, rxrpc_tlp_probe_traces)) + ); + +TRACE_EVENT(rxrpc_tlp_ack, + TP_PROTO(struct rxrpc_call *call, struct rxrpc_ack_summary *summary, + enum rxrpc_tlp_ack_trace trace), + + TP_ARGS(call, summary, trace), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(rxrpc_serial_t, serial) + __field(rxrpc_seq_t, tlp_seq) + __field(rxrpc_seq_t, hard_ack) + __field(enum rxrpc_tlp_ack_trace, trace) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->serial = call->tlp_serial; + __entry->tlp_seq = call->tlp_seq; + __entry->hard_ack = call->acks_hard_ack; + __entry->trace = trace; + ), + + TP_printk("c=%08x r=%08x pq=%08x hq=%08x %s", + __entry->call, __entry->serial, + __entry->tlp_seq, __entry->hard_ack, + __print_symbolic(__entry->trace, rxrpc_tlp_ack_traces)) + ); + +TRACE_EVENT(rxrpc_rack_timer, + TP_PROTO(struct rxrpc_call *call, ktime_t delay, bool exp), + + TP_ARGS(call, delay, exp), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(bool, exp) + __field(enum rxrpc_rack_timer_mode, mode) + __field(ktime_t, delay) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->exp = exp; + __entry->mode = call->rack_timer_mode; + __entry->delay = delay; + ), + + TP_printk("c=%08x %s %s to=%lld", + __entry->call, + __entry->exp ? "Exp" : "Set", + __print_symbolic(__entry->mode, rxrpc_rack_timer_modes), + ktime_to_us(__entry->delay)) + ); + #undef EM #undef E_ diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile index ac5caf5a48e16..210b75e3179e9 100644 --- a/net/rxrpc/Makefile +++ b/net/rxrpc/Makefile @@ -16,6 +16,7 @@ rxrpc-y := \ conn_object.o \ conn_service.o \ input.o \ + input_rack.o \ insecure.o \ io_thread.o \ key.o \ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index a9d732ba6df0d..0c0a3c89dba32 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -621,6 +621,18 @@ enum rxrpc_ca_state { NR__RXRPC_CA_STATES } __mode(byte); +/* + * Current purpose of call RACK timer. According to the RACK-TLP protocol + * [RFC8985], the transmission timer (call->rack_timo_at) may only be used for + * one of these at once. + */ +enum rxrpc_rack_timer_mode { + RXRPC_CALL_RACKTIMER_OFF, /* Timer not running */ + RXRPC_CALL_RACKTIMER_RACK_REORDER, /* RACK reordering timer */ + RXRPC_CALL_RACKTIMER_TLP_PTO, /* TLP timeout */ + RXRPC_CALL_RACKTIMER_RTO, /* Retransmission timeout */ +} __mode(byte); + /* * RxRPC call definition * - matched by { connection, call_id } @@ -638,8 +650,7 @@ struct rxrpc_call { struct mutex user_mutex; /* User access mutex */ struct sockaddr_rxrpc dest_srx; /* Destination address */ ktime_t delay_ack_at; /* When DELAY ACK needs to happen */ - ktime_t ack_lost_at; /* When ACK is figured as lost */ - ktime_t resend_at; /* When next resend needs to happen */ + ktime_t rack_timo_at; /* When ACK is figured as lost */ ktime_t ping_at; /* When next to send a ping */ ktime_t keepalive_at; /* When next to send a keepalive ping */ ktime_t expect_rx_by; /* When we expect to get a packet by */ @@ -695,8 +706,12 @@ struct rxrpc_call { rxrpc_seq_t tx_bottom; /* First packet in buffer */ rxrpc_seq_t tx_transmitted; /* Highest packet transmitted */ rxrpc_seq_t tx_top; /* Highest Tx slot allocated. */ + rxrpc_serial_t tx_last_serial; /* Serial of last DATA transmitted */ u16 tx_backoff; /* Delay to insert due to Tx failure (ms) */ - u8 tx_winsize; /* Maximum size of Tx window */ + u16 tx_nr_sent; /* Number of packets sent, but unacked */ + u16 tx_nr_lost; /* Number of packets marked lost */ + u16 tx_nr_resent; /* Number of packets resent, but unacked */ + u16 tx_winsize; /* Maximum size of Tx window */ #define RXRPC_TX_MAX_WINDOW 128 u8 tx_jumbo_max; /* Maximum subpkts peer will accept */ ktime_t tx_last_sent; /* Last time a transmission occurred */ @@ -725,6 +740,25 @@ struct rxrpc_call { u16 cong_cumul_acks; /* Cumulative ACK count */ ktime_t cong_tstamp; /* Last time cwnd was changed */ + /* RACK-TLP [RFC8985] state. */ + ktime_t rack_xmit_ts; /* Latest transmission timestamp */ + ktime_t rack_rtt; /* RTT of most recently ACK'd segment */ + ktime_t rack_rtt_ts; /* Timestamp of rack_rtt */ + ktime_t rack_reo_wnd; /* Reordering window */ + unsigned int rack_reo_wnd_mult; /* Multiplier applied to rack_reo_wnd */ + int rack_reo_wnd_persist; /* Num loss recoveries before reset reo_wnd */ + rxrpc_seq_t rack_fack; /* Highest sequence so far ACK'd */ + rxrpc_seq_t rack_end_seq; /* Highest sequence seen */ + rxrpc_seq_t rack_dsack_round; /* DSACK opt recv'd in latest roundtrip */ + bool rack_dsack_round_none; /* T if dsack_round is "None" */ + bool rack_reordering_seen; /* T if detected reordering event */ + enum rxrpc_rack_timer_mode rack_timer_mode; /* Current mode of RACK timer */ + bool tlp_is_retrans; /* T if unacked TLP retransmission */ + rxrpc_serial_t tlp_serial; /* Serial of TLP probe (or 0 if none in progress) */ + rxrpc_seq_t tlp_seq; /* Sequence of TLP probe */ + unsigned int tlp_rtt_taken; /* Last time RTT taken */ + ktime_t tlp_max_ack_delay; /* Sender budget for max delayed ACK interval */ + /* Receive-phase ACK management (ACKs we send). */ u8 ackr_reason; /* reason to ACK */ u16 ackr_sack_base; /* Starting slot in SACK table ring */ @@ -783,6 +817,9 @@ struct rxrpc_ack_summary { bool retrans_timeo:1; /* T if reTx due to timeout happened */ bool need_retransmit:1; /* T if we need transmission */ bool rtt_sample_avail:1; /* T if RTT sample available */ + bool in_fast_or_rto_recovery:1; + bool exiting_fast_or_rto_recovery:1; + bool tlp_probe_acked:1; /* T if the TLP probe seq was acked */ u8 /*enum rxrpc_congest_change*/ change; }; @@ -864,6 +901,7 @@ struct rxrpc_txqueue { unsigned long segment_lost; /* Bit-per-buf: Set if declared lost */ unsigned long segment_retransmitted; /* Bit-per-buf: Set if retransmitted */ unsigned long rtt_samples; /* Bit-per-buf: Set if available for RTT */ + unsigned long ever_retransmitted; /* Bit-per-buf: Set if ever retransmitted */ /* The arrays we want to pack into as few cache lines as possible. */ struct { @@ -883,7 +921,9 @@ struct rxrpc_send_data_req { struct rxrpc_txqueue *tq; /* Tx queue segment holding first DATA */ rxrpc_seq_t seq; /* Sequence of first data */ int n; /* Number of DATA packets to glue into jumbo */ + bool retrans; /* T if this is a retransmission */ bool did_send; /* T if did actually send */ + bool tlp_probe; /* T if this is a TLP probe */ int /* enum rxrpc_txdata_trace */ trace; }; @@ -943,8 +983,9 @@ void rxrpc_propose_ping(struct rxrpc_call *call, u32 serial, enum rxrpc_propose_ack_trace why); void rxrpc_propose_delay_ACK(struct rxrpc_call *, rxrpc_serial_t, enum rxrpc_propose_ack_trace); -void rxrpc_resend(struct rxrpc_call *call, rxrpc_serial_t ack_serial, bool ping_response); - +void rxrpc_resend_tlp(struct rxrpc_call *call); +void rxrpc_transmit_some_data(struct rxrpc_call *call, unsigned int limit, + enum rxrpc_txdata_trace trace); bool rxrpc_input_call_event(struct rxrpc_call *call); /* @@ -1123,6 +1164,32 @@ void rxrpc_congestion_degrade(struct rxrpc_call *); void rxrpc_input_call_packet(struct rxrpc_call *, struct sk_buff *); void rxrpc_implicit_end_call(struct rxrpc_call *, struct sk_buff *); +/* + * input_rack.c + */ +void rxrpc_input_rack_one(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + unsigned int ix); +void rxrpc_input_rack(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + unsigned long new_acks); +void rxrpc_rack_detect_loss_and_arm_timer(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary); +ktime_t rxrpc_tlp_calc_pto(struct rxrpc_call *call, ktime_t now); +void rxrpc_tlp_send_probe(struct rxrpc_call *call); +void rxrpc_tlp_process_ack(struct rxrpc_call *call, struct rxrpc_ack_summary *summary); +void rxrpc_rack_timer_expired(struct rxrpc_call *call, ktime_t overran_by); + +/* Initialise TLP state [RFC8958 7.1]. */ +static inline void rxrpc_tlp_init(struct rxrpc_call *call) +{ + call->tlp_serial = 0; + call->tlp_seq = call->acks_hard_ack; + call->tlp_is_retrans = false; +} + /* * io_thread.c */ @@ -1402,6 +1469,11 @@ static inline u32 latest(u32 seq1, u32 seq2) return after(seq1, seq2) ? seq1 : seq2; } +static inline bool rxrpc_seq_in_txq(const struct rxrpc_txqueue *tq, rxrpc_seq_t seq) +{ + return (seq & (RXRPC_NR_TXQUEUE - 1)) == tq->qbase; +} + static inline void rxrpc_queue_rx_call_packet(struct rxrpc_call *call, struct sk_buff *skb) { rxrpc_get_skb(skb, rxrpc_skb_get_call_rx); @@ -1409,6 +1481,31 @@ static inline void rxrpc_queue_rx_call_packet(struct rxrpc_call *call, struct sk rxrpc_poke_call(call, rxrpc_call_poke_rx_packet); } +/* + * Calculate how much space there is for transmitting more DATA packets. + */ +static inline unsigned int rxrpc_tx_window_space(const struct rxrpc_call *call) +{ + int winsize = umin(call->tx_winsize, call->cong_cwnd + call->cong_extra); + int transmitted = call->tx_top - call->tx_bottom; + + return max(winsize - transmitted, 0); +} + +static inline unsigned int rxrpc_left_out(const struct rxrpc_call *call) +{ + return call->acks_nr_sacks + call->tx_nr_lost; +} + +/* + * Calculate the number of transmitted DATA packets assumed to be in flight + * [approx RFC6675]. + */ +static inline unsigned int rxrpc_tx_in_flight(const struct rxrpc_call *call) +{ + return call->tx_nr_sent - rxrpc_left_out(call) + call->tx_nr_resent; +} + /* * debug tracing */ diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 7af2755442512..8e477f7f88501 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -54,35 +54,21 @@ void rxrpc_propose_delay_ACK(struct rxrpc_call *call, rxrpc_serial_t serial, trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_delayed_ack); } -/* - * Handle congestion being detected by the retransmit timeout. - */ -static void rxrpc_congestion_timeout(struct rxrpc_call *call) -{ - set_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags); -} - /* * Retransmit one or more packets. */ static bool rxrpc_retransmit_data(struct rxrpc_call *call, - struct rxrpc_send_data_req *req, - ktime_t rto, bool skip_too_young) + struct rxrpc_send_data_req *req) { struct rxrpc_txqueue *tq = req->tq; unsigned int ix = req->seq & RXRPC_TXQ_MASK; struct rxrpc_txbuf *txb = tq->bufs[ix]; - ktime_t xmit_ts, resend_at; _enter("%x,%x,%x,%x", tq->qbase, req->seq, ix, txb->debug_id); - xmit_ts = ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[ix]); - resend_at = ktime_add(xmit_ts, rto); - trace_rxrpc_retransmit(call, req, txb, ktime_sub(resend_at, req->now)); - if (skip_too_young && ktime_after(resend_at, req->now)) - return false; + req->retrans = true; + trace_rxrpc_retransmit(call, req, txb); - __set_bit(ix, &tq->segment_retransmitted); txb->flags |= RXRPC_TXBUF_RESENT; rxrpc_send_data_packet(call, req); rxrpc_inc_stat(call->rxnet, stat_tx_data_retrans); @@ -97,133 +83,76 @@ static bool rxrpc_retransmit_data(struct rxrpc_call *call, /* * Perform retransmission of NAK'd and unack'd packets. */ -void rxrpc_resend(struct rxrpc_call *call, rxrpc_serial_t ack_serial, bool ping_response) +static void rxrpc_resend(struct rxrpc_call *call) { struct rxrpc_send_data_req req = { .now = ktime_get_real(), .trace = rxrpc_txdata_retransmit, }; - struct rxrpc_txqueue *tq = call->tx_queue; - ktime_t lowest_xmit_ts = KTIME_MAX; - ktime_t rto = rxrpc_get_rto_backoff(call, false); - bool unacked = false; + struct rxrpc_txqueue *tq; _enter("{%d,%d}", call->tx_bottom, call->tx_top); - if (call->tx_bottom == call->tx_top) { - call->resend_at = KTIME_MAX; - trace_rxrpc_timer_can(call, rxrpc_timer_trace_resend); - return; - } + trace_rxrpc_resend(call, call->acks_highest_serial); - trace_rxrpc_resend(call, ack_serial); - - /* Scan the transmission queue, looking for explicitly NAK'd packets. */ - do { - unsigned long naks = ~tq->segment_acked; - rxrpc_seq_t tq_top = tq->qbase + RXRPC_NR_TXQUEUE - 1; + /* Scan the transmission queue, looking for lost packets. */ + for (tq = call->tx_queue; tq; tq = tq->next) { + unsigned long lost = tq->segment_lost; if (after(tq->qbase, call->tx_transmitted)) break; - if (tq->nr_reported_acks < RXRPC_NR_TXQUEUE) - naks &= (1UL << tq->nr_reported_acks) - 1; - _debug("retr %16lx %u c=%08x [%x]", tq->segment_acked, tq->nr_reported_acks, call->debug_id, tq->qbase); - _debug("nack %16lx", naks); + _debug("lost %16lx", lost); - while (naks) { - unsigned int ix = __ffs(naks); + trace_rxrpc_resend_lost(call, tq, lost); + while (lost) { + unsigned int ix = __ffs(lost); struct rxrpc_txbuf *txb = tq->bufs[ix]; - __clear_bit(ix, &naks); - if (after(txb->serial, call->acks_highest_serial)) - continue; /* Ack point not yet reached */ - - rxrpc_see_txbuf(txb, rxrpc_txbuf_see_unacked); + __clear_bit(ix, &lost); + rxrpc_see_txbuf(txb, rxrpc_txbuf_see_lost); req.tq = tq; req.seq = tq->qbase + ix; req.n = 1; - rxrpc_retransmit_data(call, &req, rto, false); - } - - /* Anything after the soft-ACK table up to and including - * ack.previousPacket will get ACK'd or NACK'd in due course, - * so don't worry about those here. We do, however, need to - * consider retransmitting anything beyond that point. - */ - if (tq->nr_reported_acks < RXRPC_NR_TXQUEUE && - after(tq_top, call->acks_prev_seq)) { - rxrpc_seq_t start = latest(call->acks_prev_seq, - tq->qbase + tq->nr_reported_acks); - rxrpc_seq_t stop = earliest(tq_top, call->tx_transmitted); - - _debug("unrep %x-%x", start, stop); - for (rxrpc_seq_t seq = start; before_eq(seq, stop); seq++) { - rxrpc_serial_t serial = tq->segment_serial[seq & RXRPC_TXQ_MASK]; - - if (ping_response && - before(serial, call->acks_highest_serial)) - break; /* Wasn't accounted for by a more recent ping. */ - req.tq = tq; - req.seq = seq; - req.n = 1; - if (rxrpc_retransmit_data(call, &req, rto, true)) - unacked = true; - } + rxrpc_retransmit_data(call, &req); } - - /* Work out the next retransmission timeout. */ - if (ktime_before(tq->xmit_ts_base, lowest_xmit_ts)) { - unsigned int lowest_us = UINT_MAX; - - for (int i = 0; i < RXRPC_NR_TXQUEUE; i++) - if (!test_bit(i, &tq->segment_acked) && - tq->segment_xmit_ts[i] < lowest_us) - lowest_us = tq->segment_xmit_ts[i]; - _debug("lowest[%x] %llx %u", tq->qbase, tq->xmit_ts_base, lowest_us); - - if (lowest_us != UINT_MAX) { - ktime_t lowest_ns = ktime_add_us(tq->xmit_ts_base, lowest_us); - - if (ktime_before(lowest_ns, lowest_xmit_ts)) - lowest_xmit_ts = lowest_ns; - } - } - } while ((tq = tq->next)); - - if (lowest_xmit_ts < KTIME_MAX) { - ktime_t delay = rxrpc_get_rto_backoff(call, req.did_send); - ktime_t resend_at = ktime_add(lowest_xmit_ts, delay); - - _debug("delay %llu %lld", delay, ktime_sub(resend_at, req.now)); - call->resend_at = resend_at; - trace_rxrpc_timer_set(call, ktime_sub(resend_at, req.now), - rxrpc_timer_trace_resend_reset); - } else { - call->resend_at = KTIME_MAX; - trace_rxrpc_timer_can(call, rxrpc_timer_trace_resend); } - if (unacked) - rxrpc_congestion_timeout(call); + rxrpc_get_rto_backoff(call, req.did_send); + _leave(""); +} - /* If there was nothing that needed retransmission then it's likely - * that an ACK got lost somewhere. Send a ping to find out instead of - * retransmitting data. - */ - if (!req.did_send) { - ktime_t next_ping = ktime_add_us(call->acks_latest_ts, - call->srtt_us >> 3); +/* + * Resend the highest-seq DATA packet so far transmitted for RACK-TLP [RFC8985 7.3]. + */ +void rxrpc_resend_tlp(struct rxrpc_call *call) +{ + struct rxrpc_send_data_req req = { + .now = ktime_get_real(), + .seq = call->tx_transmitted, + .n = 1, + .tlp_probe = true, + .trace = rxrpc_txdata_tlp_retransmit, + }; - if (ktime_sub(next_ping, req.now) <= 0) - rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, - rxrpc_propose_ack_ping_for_0_retrans); + /* There's a chance it'll be on the tail segment of the queue. */ + req.tq = READ_ONCE(call->tx_qtail); + if (req.tq && + before(call->tx_transmitted, req.tq->qbase + RXRPC_NR_TXQUEUE)) { + rxrpc_retransmit_data(call, &req); + return; } - _leave(""); + for (req.tq = call->tx_queue; req.tq; req.tq = req.tq->next) { + if (after_eq(call->tx_transmitted, req.tq->qbase) && + before(call->tx_transmitted, req.tq->qbase + RXRPC_NR_TXQUEUE)) { + rxrpc_retransmit_data(call, &req); + return; + } + } } /* @@ -259,18 +188,10 @@ static void rxrpc_close_tx_phase(struct rxrpc_call *call) } } -static unsigned int rxrpc_tx_window_space(struct rxrpc_call *call) -{ - int winsize = umin(call->tx_winsize, call->cong_cwnd + call->cong_extra); - int in_flight = call->tx_top - call->tx_bottom; - - return max(winsize - in_flight, 0); -} - /* - * Transmit some as-yet untransmitted data. + * Transmit some as-yet untransmitted data, to a maximum of the supplied limit. */ -static void rxrpc_transmit_fresh_data(struct rxrpc_call *call, +static void rxrpc_transmit_fresh_data(struct rxrpc_call *call, unsigned int limit, enum rxrpc_txdata_trace trace) { int space = rxrpc_tx_window_space(call); @@ -335,8 +256,8 @@ static void rxrpc_transmit_fresh_data(struct rxrpc_call *call, } } -static void rxrpc_transmit_some_data(struct rxrpc_call *call, - enum rxrpc_txdata_trace trace) +void rxrpc_transmit_some_data(struct rxrpc_call *call, unsigned int limit, + enum rxrpc_txdata_trace trace) { switch (__rxrpc_call_state(call)) { case RXRPC_CALL_SERVER_ACK_REQUEST: @@ -353,7 +274,7 @@ static void rxrpc_transmit_some_data(struct rxrpc_call *call, rxrpc_inc_stat(call->rxnet, stat_tx_data_underflow); return; } - rxrpc_transmit_fresh_data(call, trace); + rxrpc_transmit_fresh_data(call, limit, trace); break; default: return; @@ -380,7 +301,7 @@ bool rxrpc_input_call_event(struct rxrpc_call *call) { struct sk_buff *skb; ktime_t now, t; - bool resend = false, did_receive = false, saw_ack = false; + bool did_receive = false, saw_ack = false; s32 abort_code; rxrpc_see_call(call, rxrpc_call_see_input); @@ -398,21 +319,33 @@ bool rxrpc_input_call_event(struct rxrpc_call *call) goto out; } - while ((skb = __skb_dequeue(&call->rx_queue))) { - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + do { + skb = __skb_dequeue(&call->rx_queue); + if (skb) { + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + if (__rxrpc_call_is_complete(call) || + skb->mark == RXRPC_SKB_MARK_ERROR) { + rxrpc_free_skb(skb, rxrpc_skb_put_call_rx); + goto out; + } + + saw_ack |= sp->hdr.type == RXRPC_PACKET_TYPE_ACK; - if (__rxrpc_call_is_complete(call) || - skb->mark == RXRPC_SKB_MARK_ERROR) { + rxrpc_input_call_packet(call, skb); rxrpc_free_skb(skb, rxrpc_skb_put_call_rx); - goto out; + did_receive = true; } - saw_ack |= sp->hdr.type == RXRPC_PACKET_TYPE_ACK; + t = ktime_sub(call->rack_timo_at, ktime_get_real()); + if (t <= 0) { + trace_rxrpc_timer_exp(call, t, + rxrpc_timer_trace_rack_off + call->rack_timer_mode); + call->rack_timo_at = KTIME_MAX; + rxrpc_rack_timer_expired(call, t); + } - rxrpc_input_call_packet(call, skb); - rxrpc_free_skb(skb, rxrpc_skb_put_call_rx); - did_receive = true; - } + } while (!skb_queue_empty(&call->rx_queue)); /* If we see our async-event poke, check for timeout trippage. */ now = ktime_get_real(); @@ -445,13 +378,6 @@ bool rxrpc_input_call_event(struct rxrpc_call *call) rxrpc_propose_ack_delayed_ack); } - t = ktime_sub(call->ack_lost_at, now); - if (t <= 0) { - trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_lost_ack); - call->ack_lost_at = KTIME_MAX; - set_bit(RXRPC_CALL_EV_ACK_LOST, &call->events); - } - t = ktime_sub(call->ping_at, now); if (t <= 0) { trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_ping); @@ -460,15 +386,6 @@ bool rxrpc_input_call_event(struct rxrpc_call *call) rxrpc_propose_ack_ping_for_keepalive); } - t = ktime_sub(call->resend_at, now); - if (t <= 0) { - trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_resend); - call->resend_at = KTIME_MAX; - resend = true; - } - - rxrpc_transmit_some_data(call, rxrpc_txdata_new_data); - now = ktime_get_real(); t = ktime_sub(call->keepalive_at, now); if (t <= 0) { @@ -478,21 +395,30 @@ bool rxrpc_input_call_event(struct rxrpc_call *call) rxrpc_propose_ack_ping_for_keepalive); } + if (test_and_clear_bit(RXRPC_CALL_EV_INITIAL_PING, &call->events)) + rxrpc_send_initial_ping(call); + + rxrpc_transmit_some_data(call, UINT_MAX, rxrpc_txdata_new_data); + if (saw_ack) rxrpc_congestion_degrade(call); - if (test_and_clear_bit(RXRPC_CALL_EV_INITIAL_PING, &call->events)) - rxrpc_send_initial_ping(call); + if (did_receive && + (__rxrpc_call_state(call) == RXRPC_CALL_CLIENT_SEND_REQUEST || + __rxrpc_call_state(call) == RXRPC_CALL_SERVER_SEND_REPLY)) { + t = ktime_sub(call->rack_timo_at, ktime_get_real()); + trace_rxrpc_rack(call, t); + } /* Process events */ if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events)) rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, rxrpc_propose_ack_ping_for_lost_ack); - if (resend && + if (call->tx_nr_lost > 0 && __rxrpc_call_state(call) != RXRPC_CALL_CLIENT_RECV_REPLY && !test_bit(RXRPC_CALL_TX_ALL_ACKED, &call->flags)) - rxrpc_resend(call, 0, false); + rxrpc_resend(call); if (test_and_clear_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags)) rxrpc_send_ACK(call, RXRPC_ACK_IDLE, 0, @@ -520,8 +446,7 @@ bool rxrpc_input_call_event(struct rxrpc_call *call) set(call->expect_req_by); set(call->expect_rx_by); set(call->delay_ack_at); - set(call->ack_lost_at); - set(call->resend_at); + set(call->rack_timo_at); set(call->keepalive_at); set(call->ping_at); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index fb4ee0d2e9e16..5a543c3f6fb08 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -160,8 +160,7 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, call->ackr_window = 1; call->ackr_wtop = 1; call->delay_ack_at = KTIME_MAX; - call->ack_lost_at = KTIME_MAX; - call->resend_at = KTIME_MAX; + call->rack_timo_at = KTIME_MAX; call->ping_at = KTIME_MAX; call->keepalive_at = KTIME_MAX; call->expect_rx_by = KTIME_MAX; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 9f308bd512e93..4974b5accafa3 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -27,13 +27,13 @@ static void rxrpc_proto_abort(struct rxrpc_call *call, rxrpc_seq_t seq, } /* - * Do TCP-style congestion management [RFC 5681]. + * Do TCP-style congestion management [RFC5681]. */ static void rxrpc_congestion_management(struct rxrpc_call *call, struct rxrpc_ack_summary *summary) { summary->change = rxrpc_cong_no_change; - summary->in_flight = (call->tx_top - call->tx_bottom) - call->acks_nr_sacks; + summary->in_flight = rxrpc_tx_in_flight(call); if (test_and_clear_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags)) { summary->retrans_timeo = true; @@ -106,9 +106,12 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, call->cong_extra = 0; call->cong_dup_acks = 0; summary->need_retransmit = true; + summary->in_fast_or_rto_recovery = true; goto out; case RXRPC_CA_FAST_RETRANSMIT: + rxrpc_tlp_init(call); + summary->in_fast_or_rto_recovery = true; if (!summary->new_low_snack) { if (summary->nr_new_sacks == 0) call->cong_cwnd += 1; @@ -121,8 +124,10 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, } else { summary->change = rxrpc_cong_progress; call->cong_cwnd = call->cong_ssthresh; - if (call->acks_nr_snacks == 0) + if (call->acks_nr_snacks == 0) { + summary->exiting_fast_or_rto_recovery = true; goto resume_normality; + } } goto out; @@ -171,7 +176,7 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, */ void rxrpc_congestion_degrade(struct rxrpc_call *call) { - ktime_t rtt, now; + ktime_t rtt, now, time_since; if (call->cong_ca_state != RXRPC_CA_SLOW_START && call->cong_ca_state != RXRPC_CA_CONGEST_AVOIDANCE) @@ -181,10 +186,11 @@ void rxrpc_congestion_degrade(struct rxrpc_call *call) rtt = ns_to_ktime(call->srtt_us * (NSEC_PER_USEC / 8)); now = ktime_get_real(); - if (!ktime_before(ktime_add(call->tx_last_sent, rtt), now)) + time_since = ktime_sub(now, call->tx_last_sent); + if (ktime_before(time_since, rtt)) return; - trace_rxrpc_reset_cwnd(call, now); + trace_rxrpc_reset_cwnd(call, time_since, rtt); rxrpc_inc_stat(call->rxnet, stat_tx_data_cwnd_reset); call->tx_last_sent = now; call->cong_ca_state = RXRPC_CA_SLOW_START; @@ -200,11 +206,11 @@ static void rxrpc_add_data_rtt_sample(struct rxrpc_call *call, struct rxrpc_txqueue *tq, int ix) { + ktime_t xmit_ts = ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[ix]); + rxrpc_call_add_rtt(call, rxrpc_rtt_rx_data_ack, -1, summary->acked_serial, summary->ack_serial, - ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[ix]), - call->acks_latest_ts); - summary->rtt_sample_avail = false; + xmit_ts, call->acks_latest_ts); __clear_bit(ix, &tq->rtt_samples); /* Prevent repeat RTT sample */ } @@ -216,7 +222,7 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, { struct rxrpc_txqueue *tq = call->tx_queue; rxrpc_seq_t seq = call->tx_bottom + 1; - bool rot_last = false; + bool rot_last = false, trace = false; _enter("%x,%x", call->tx_bottom, to); @@ -250,14 +256,16 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, rot_last = true; } - if (summary->rtt_sample_avail && - summary->acked_serial == tq->segment_serial[ix] && + if (summary->acked_serial == tq->segment_serial[ix] && test_bit(ix, &tq->rtt_samples)) rxrpc_add_data_rtt_sample(call, summary, tq, ix); if (ix == tq->nr_reported_acks) { /* Packet directly hard ACK'd. */ tq->nr_reported_acks++; + rxrpc_input_rack_one(call, summary, tq, ix); + if (seq == call->tlp_seq) + summary->tlp_probe_acked = true; summary->nr_new_hacks++; __set_bit(ix, &tq->segment_acked); trace_rxrpc_rotate(call, tq, summary, seq, rxrpc_rotate_trace_hack); @@ -268,11 +276,21 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, } else { /* Soft NAK -> hard ACK. */ call->acks_nr_snacks--; + rxrpc_input_rack_one(call, summary, tq, ix); + if (seq == call->tlp_seq) + summary->tlp_probe_acked = true; summary->nr_new_hacks++; __set_bit(ix, &tq->segment_acked); trace_rxrpc_rotate(call, tq, summary, seq, rxrpc_rotate_trace_snak); } + call->tx_nr_sent--; + if (__test_and_clear_bit(ix, &tq->segment_lost)) + call->tx_nr_lost--; + if (__test_and_clear_bit(ix, &tq->segment_retransmitted)) + call->tx_nr_resent--; + __clear_bit(ix, &tq->ever_retransmitted); + rxrpc_put_txbuf(tq->bufs[ix], rxrpc_txbuf_put_rotated); tq->bufs[ix] = NULL; @@ -282,7 +300,10 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, rxrpc_txqueue_rotate)); seq++; + trace = true; if (!(seq & RXRPC_TXQ_MASK)) { + trace_rxrpc_rack_update(call, summary); + trace = false; prefetch(tq->next); if (tq != call->tx_qtail) { call->tx_qbase += RXRPC_NR_TXQUEUE; @@ -299,6 +320,9 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, } while (before_eq(seq, to)); + if (trace) + trace_rxrpc_rack_update(call, summary); + if (rot_last) { set_bit(RXRPC_CALL_TX_ALL_ACKED, &call->flags); if (tq) { @@ -325,8 +349,10 @@ static void rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun, { ASSERT(test_bit(RXRPC_CALL_TX_LAST, &call->flags)); - call->resend_at = KTIME_MAX; - trace_rxrpc_timer_can(call, rxrpc_timer_trace_resend); + call->rack_timer_mode = RXRPC_CALL_RACKTIMER_OFF; + call->rack_timo_at = KTIME_MAX; + trace_rxrpc_rack_timer(call, 0, false); + trace_rxrpc_timer_can(call, rxrpc_timer_trace_rack_off + call->rack_timer_mode); switch (__rxrpc_call_state(call)) { case RXRPC_CALL_CLIENT_SEND_REQUEST: @@ -842,10 +868,13 @@ static void rxrpc_input_soft_ack_tq(struct rxrpc_call *call, rxrpc_seq_t seq, rxrpc_seq_t *lowest_nak) { - unsigned long old_reported, flipped, new_acks, a_to_n, n_to_a; + unsigned long old_reported = 0, flipped, new_acks = 0; + unsigned long a_to_n, n_to_a = 0; int new, a, n; - old_reported = ~0UL >> (RXRPC_NR_TXQUEUE - tq->nr_reported_acks); + if (tq->nr_reported_acks > 0) + old_reported = ~0UL >> (RXRPC_NR_TXQUEUE - tq->nr_reported_acks); + _enter("{%x,%lx,%d},%lx,%d,%x", tq->qbase, tq->segment_acked, tq->nr_reported_acks, extracted_acks, nr_reported, seq); @@ -898,6 +927,18 @@ static void rxrpc_input_soft_ack_tq(struct rxrpc_call *call, if (before(lowest, *lowest_nak)) *lowest_nak = lowest; } + + if (summary->acked_serial) + rxrpc_input_soft_rtt(call, summary, tq); + + new_acks |= n_to_a; + if (new_acks) + rxrpc_input_rack(call, summary, tq, new_acks); + + if (call->tlp_serial && + rxrpc_seq_in_txq(tq, call->tlp_seq) && + test_bit(call->tlp_seq - tq->qbase, &new_acks)) + summary->tlp_probe_acked = true; } /* @@ -940,8 +981,6 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, _debug("bound %16lx %u", extracted, nr); - if (summary->rtt_sample_avail) - rxrpc_input_soft_rtt(call, summary, tq); rxrpc_input_soft_ack_tq(call, summary, tq, extracted, RXRPC_NR_TXQUEUE, seq - RXRPC_NR_TXQUEUE, &lowest_nak); extracted = ~0UL; @@ -1063,7 +1102,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) /* Discard any out-of-order or duplicate ACKs (outside lock). */ if (!rxrpc_is_ack_valid(call, hard_ack, prev_pkt)) { trace_rxrpc_rx_discard_ack(call, summary.ack_serial, hard_ack, prev_pkt); - goto send_response; + goto send_response; /* Still respond if requested. */ } trailer.maxMTU = 0; @@ -1079,14 +1118,19 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) call->acks_hard_ack = hard_ack; call->acks_prev_seq = prev_pkt; - switch (summary.ack_reason) { - case RXRPC_ACK_PING: - break; - default: - if (summary.acked_serial && - after(summary.acked_serial, call->acks_highest_serial)) - call->acks_highest_serial = summary.acked_serial; - break; + if (summary.acked_serial) { + switch (summary.ack_reason) { + case RXRPC_ACK_PING_RESPONSE: + rxrpc_complete_rtt_probe(call, call->acks_latest_ts, + summary.acked_serial, summary.ack_serial, + rxrpc_rtt_rx_ping_response); + break; + default: + if (after(summary.acked_serial, call->acks_highest_serial)) + call->acks_highest_serial = summary.acked_serial; + summary.rtt_sample_avail = true; + break; + } } /* Parse rwind and mtu sizes if provided. */ @@ -1096,15 +1140,6 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) if (hard_ack + 1 == 0) return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_zero); - if (summary.acked_serial) { - if (summary.ack_reason == RXRPC_ACK_PING_RESPONSE) - rxrpc_complete_rtt_probe(call, call->acks_latest_ts, - summary.acked_serial, summary.ack_serial, - rxrpc_rtt_rx_ping_response); - else - summary.rtt_sample_avail = true; - } - /* Ignore ACKs unless we are or have just been transmitting. */ switch (__rxrpc_call_state(call)) { case RXRPC_CALL_CLIENT_SEND_REQUEST: @@ -1141,10 +1176,14 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_propose_ping(call, summary.ack_serial, rxrpc_propose_ack_ping_for_lost_reply); + /* Drive the congestion management algorithm first and then RACK-TLP as + * the latter depends on the state/change in state in the former. + */ rxrpc_congestion_management(call, &summary); - if (summary.need_retransmit) - rxrpc_resend(call, summary.ack_serial, - summary.ack_reason == RXRPC_ACK_PING_RESPONSE); + rxrpc_rack_detect_loss_and_arm_timer(call, &summary); + rxrpc_tlp_process_ack(call, &summary); + if (call->tlp_serial && after_eq(summary.acked_serial, call->tlp_serial)) + call->tlp_serial = 0; send_response: if (summary.ack_reason == RXRPC_ACK_PING) diff --git a/net/rxrpc/input_rack.c b/net/rxrpc/input_rack.c new file mode 100644 index 0000000000000..13c371261e0a5 --- /dev/null +++ b/net/rxrpc/input_rack.c @@ -0,0 +1,418 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* RACK-TLP [RFC8958] Implementation + * + * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include "ar-internal.h" + +static bool rxrpc_rack_sent_after(ktime_t t1, rxrpc_seq_t seq1, + ktime_t t2, rxrpc_seq_t seq2) +{ + if (ktime_after(t1, t2)) + return true; + return t1 == t2 && after(seq1, seq2); +} + +/* + * Mark a packet lost. + */ +static void rxrpc_rack_mark_lost(struct rxrpc_call *call, + struct rxrpc_txqueue *tq, unsigned int ix) +{ + if (__test_and_set_bit(ix, &tq->segment_lost)) { + if (__test_and_clear_bit(ix, &tq->segment_retransmitted)) + call->tx_nr_resent--; + } else { + call->tx_nr_lost++; + } + tq->segment_xmit_ts[ix] = UINT_MAX; +} + +/* + * Get the transmission time of a packet in the Tx queue. + */ +static ktime_t rxrpc_get_xmit_ts(const struct rxrpc_txqueue *tq, unsigned int ix) +{ + if (tq->segment_xmit_ts[ix] == UINT_MAX) + return KTIME_MAX; + return ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[ix]); +} + +/* + * Get a bitmask of nack bits for a queue segment and mask off any that aren't + * yet reported. + */ +static unsigned long rxrpc_tq_nacks(const struct rxrpc_txqueue *tq) +{ + unsigned long nacks = ~tq->segment_acked; + + if (tq->nr_reported_acks < RXRPC_NR_TXQUEUE) + nacks &= (1UL << tq->nr_reported_acks) - 1; + return nacks; +} + +/* + * Update the RACK state for the most recently sent packet that has been + * delivered [RFC8958 6.2 Step 2]. + */ +static void rxrpc_rack_update(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + unsigned int ix) +{ + rxrpc_seq_t seq = tq->qbase + ix; + ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix); + ktime_t rtt = ktime_sub(call->acks_latest_ts, xmit_ts); + + if (__test_and_clear_bit(ix, &tq->segment_lost)) + call->tx_nr_lost--; + + if (test_bit(ix, &tq->segment_retransmitted)) { + /* Use Rx.serial instead of TCP.ACK.ts_option.echo_reply. */ + if (before(call->acks_highest_serial, tq->segment_serial[ix])) + return; + if (rtt < minmax_get(&call->min_rtt)) + return; + } + + /* The RACK algorithm requires the segment ACKs to be traversed in + * order of segment transmission - but the only thing this seems to + * matter for is that RACK.rtt is set to the rtt of the most recently + * transmitted segment. We should be able to achieve the same by only + * setting RACK.rtt if the xmit time is greater. + */ + if (ktime_after(xmit_ts, call->rack_rtt_ts)) { + call->rack_rtt = rtt; + call->rack_rtt_ts = xmit_ts; + } + + if (rxrpc_rack_sent_after(xmit_ts, seq, call->rack_xmit_ts, call->rack_end_seq)) { + call->rack_rtt = rtt; + call->rack_xmit_ts = xmit_ts; + call->rack_end_seq = seq; + } +} + +/* + * Detect data segment reordering [RFC8958 6.2 Step 3]. + */ +static void rxrpc_rack_detect_reordering(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + unsigned int ix) +{ + rxrpc_seq_t seq = tq->qbase + ix; + + /* Track the highest sequence number so far ACK'd. This is not + * necessarily the same as ack.firstPacket + ack.nAcks - 1 as the peer + * could put a NACK in the last SACK slot. + */ + if (after(seq, call->rack_fack)) + call->rack_fack = seq; + else if (before(seq, call->rack_fack) && + test_bit(ix, &tq->segment_retransmitted)) + call->rack_reordering_seen = true; +} + +void rxrpc_input_rack_one(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + unsigned int ix) +{ + rxrpc_rack_update(call, summary, tq, ix); + rxrpc_rack_detect_reordering(call, summary, tq, ix); +} + +void rxrpc_input_rack(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + unsigned long new_acks) +{ + while (new_acks) { + unsigned int ix = __ffs(new_acks); + + __clear_bit(ix, &new_acks); + rxrpc_input_rack_one(call, summary, tq, ix); + } + + trace_rxrpc_rack_update(call, summary); +} + +/* + * Update the reordering window [RFC8958 6.2 Step 4]. Returns the updated + * duration of the reordering window. + * + * Note that the Rx protocol doesn't have a 'DSACK option' per se, but ACKs can + * be given a 'DUPLICATE' reason with the serial number referring to the + * duplicated DATA packet. Rx does not inform as to whether this was a + * reception of the same packet twice or of a retransmission of a packet we + * already received (though this could be determined by the transmitter based + * on the serial number). + */ +static ktime_t rxrpc_rack_update_reo_wnd(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary) +{ + rxrpc_seq_t snd_una = call->acks_lowest_nak; /* Lowest unack'd seq */ + rxrpc_seq_t snd_nxt = call->tx_transmitted + 1; /* Next seq to be sent */ + bool have_dsack_option = summary->ack_reason == RXRPC_ACK_DUPLICATE; + int dup_thresh = 3; + + /* DSACK-based reordering window adaptation */ + if (!call->rack_dsack_round_none && + after_eq(snd_una, call->rack_dsack_round)) + call->rack_dsack_round_none = true; + + /* Grow the reordering window per round that sees DSACK. Reset the + * window after 16 DSACK-free recoveries. + */ + if (call->rack_dsack_round_none && have_dsack_option) { + call->rack_dsack_round_none = false; + call->rack_dsack_round = snd_nxt; + call->rack_reo_wnd_mult++; + call->rack_reo_wnd_persist = 16; + } else if (summary->exiting_fast_or_rto_recovery) { + call->rack_reo_wnd_persist--; + if (call->rack_reo_wnd_persist <= 0) + call->rack_reo_wnd_mult = 1; + } + + if (!call->rack_reordering_seen) { + if (summary->in_fast_or_rto_recovery) + return 0; + if (call->acks_nr_sacks >= dup_thresh) + return 0; + } + + return us_to_ktime(umin(call->rack_reo_wnd_mult * minmax_get(&call->min_rtt) / 4, + call->srtt_us >> 3)); +} + +/* + * Detect losses [RFC8958 6.2 Step 5]. + */ +static ktime_t rxrpc_rack_detect_loss(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary) +{ + struct rxrpc_txqueue *tq; + ktime_t timeout = 0, lost_after, now = ktime_get_real(); + + call->rack_reo_wnd = rxrpc_rack_update_reo_wnd(call, summary); + lost_after = ktime_add(call->rack_rtt, call->rack_reo_wnd); + trace_rxrpc_rack_scan_loss(call); + + for (tq = call->tx_queue; tq; tq = tq->next) { + unsigned long nacks = rxrpc_tq_nacks(tq); + + if (after(tq->qbase, call->tx_transmitted)) + break; + trace_rxrpc_rack_scan_loss_tq(call, tq, nacks); + + /* Skip ones marked lost but not yet retransmitted */ + nacks &= ~tq->segment_lost | tq->segment_retransmitted; + + while (nacks) { + unsigned int ix = __ffs(nacks); + rxrpc_seq_t seq = tq->qbase + ix; + ktime_t remaining; + ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix); + + __clear_bit(ix, &nacks); + + if (rxrpc_rack_sent_after(call->rack_xmit_ts, call->rack_end_seq, + xmit_ts, seq)) { + remaining = ktime_sub(ktime_add(xmit_ts, lost_after), now); + if (remaining <= 0) { + rxrpc_rack_mark_lost(call, tq, ix); + trace_rxrpc_rack_detect_loss(call, summary, seq); + } else { + timeout = max(remaining, timeout); + } + } + } + } + + return timeout; +} + +/* + * Detect losses and set a timer to retry the detection [RFC8958 6.2 Step 5]. + */ +void rxrpc_rack_detect_loss_and_arm_timer(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary) +{ + ktime_t timeout = rxrpc_rack_detect_loss(call, summary); + + if (timeout) { + call->rack_timer_mode = RXRPC_CALL_RACKTIMER_RACK_REORDER; + call->rack_timo_at = ktime_add(ktime_get_real(), timeout); + trace_rxrpc_rack_timer(call, timeout, false); + trace_rxrpc_timer_set(call, timeout, rxrpc_timer_trace_rack_reo); + } +} + +/* + * Handle RACK-TLP RTO expiration [RFC8958 6.3]. + */ +static void rxrpc_rack_mark_losses_on_rto(struct rxrpc_call *call) +{ + struct rxrpc_txqueue *tq; + rxrpc_seq_t snd_una = call->acks_lowest_nak; /* Lowest unack'd seq */ + ktime_t lost_after = ktime_add(call->rack_rtt, call->rack_reo_wnd); + ktime_t deadline = ktime_sub(ktime_get_real(), lost_after); + + for (tq = call->tx_queue; tq; tq = tq->next) { + unsigned long unacked = ~tq->segment_acked; + + trace_rxrpc_rack_mark_loss_tq(call, tq); + while (unacked) { + unsigned int ix = __ffs(unacked); + rxrpc_seq_t seq = tq->qbase + ix; + ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix); + + if (after(seq, call->tx_transmitted)) + return; + __clear_bit(ix, &unacked); + + if (seq == snd_una || + ktime_before(xmit_ts, deadline)) + rxrpc_rack_mark_lost(call, tq, ix); + } + } +} + +/* + * Calculate the TLP loss probe timeout (PTO) [RFC8958 7.2]. + */ +ktime_t rxrpc_tlp_calc_pto(struct rxrpc_call *call, ktime_t now) +{ + unsigned int flight_size = rxrpc_tx_in_flight(call); + ktime_t rto_at = ktime_add(call->tx_last_sent, + rxrpc_get_rto_backoff(call, false)); + ktime_t pto; + + if (call->rtt_count > 0) { + /* Use 2*SRTT as the timeout. */ + pto = ns_to_ktime(call->srtt_us * NSEC_PER_USEC / 4); + if (flight_size) + pto = ktime_add(pto, call->tlp_max_ack_delay); + } else { + pto = NSEC_PER_SEC; + } + + if (ktime_after(ktime_add(now, pto), rto_at)) + pto = ktime_sub(rto_at, now); + return pto; +} + +/* + * Send a TLP loss probe on PTO expiration [RFC8958 7.3]. + */ +void rxrpc_tlp_send_probe(struct rxrpc_call *call) +{ + unsigned int in_flight = rxrpc_tx_in_flight(call); + + if (after_eq(call->acks_hard_ack, call->tx_transmitted)) + return; /* Everything we transmitted has been acked. */ + + /* There must be no other loss probe still in flight and we need to + * have taken a new RTT sample since last probe or the start of + * connection. + */ + if (!call->tlp_serial && + call->tlp_rtt_taken != call->rtt_taken) { + call->tlp_is_retrans = false; + if (after(call->send_top, call->tx_transmitted) && + rxrpc_tx_window_space(call) > 0) { + /* Transmit the lowest-sequence unsent DATA */ + call->tx_last_serial = 0; + rxrpc_transmit_some_data(call, 1, rxrpc_txdata_tlp_new_data); + call->tlp_serial = call->tx_last_serial; + call->tlp_seq = call->tx_transmitted; + trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_transmit_new); + in_flight = rxrpc_tx_in_flight(call); + } else { + /* Retransmit the highest-sequence DATA sent */ + call->tx_last_serial = 0; + rxrpc_resend_tlp(call); + call->tlp_is_retrans = true; + trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_retransmit); + } + } else { + trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_busy); + } + + if (in_flight != 0) { + ktime_t rto = rxrpc_get_rto_backoff(call, false); + + call->rack_timer_mode = RXRPC_CALL_RACKTIMER_RTO; + call->rack_timo_at = ktime_add(ktime_get_real(), rto); + trace_rxrpc_rack_timer(call, rto, false); + trace_rxrpc_timer_set(call, rto, rxrpc_timer_trace_rack_rto); + } +} + +/* + * Detect losses using the ACK of a TLP loss probe [RFC8958 7.4]. + */ +void rxrpc_tlp_process_ack(struct rxrpc_call *call, struct rxrpc_ack_summary *summary) +{ + if (!call->tlp_serial || after(call->tlp_seq, call->acks_hard_ack)) + return; + + if (!call->tlp_is_retrans) { + /* TLP of new data delivered */ + trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_new_data); + call->tlp_serial = 0; + } else if (summary->ack_reason == RXRPC_ACK_DUPLICATE && + summary->acked_serial == call->tlp_serial) { + /* General Case: Detected packet losses using RACK [7.4.1] */ + trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_dup_acked); + call->tlp_serial = 0; + } else if (after(call->acks_hard_ack, call->tlp_seq)) { + /* Repaired the single loss */ + trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_hard_beyond); + call->tlp_serial = 0; + // TODO: Invoke congestion control to react to the loss + // event the probe has repaired + } else if (summary->tlp_probe_acked) { + trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_acked); + /* Special Case: Detected a single loss repaired by the loss + * probe [7.4.2] + */ + call->tlp_serial = 0; + } else { + trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_incomplete); + } +} + +/* + * Handle RACK timer expiration; returns true to request a resend. + */ +void rxrpc_rack_timer_expired(struct rxrpc_call *call, ktime_t overran_by) +{ + struct rxrpc_ack_summary summary = {}; + enum rxrpc_rack_timer_mode mode = call->rack_timer_mode; + + trace_rxrpc_rack_timer(call, overran_by, true); + call->rack_timer_mode = RXRPC_CALL_RACKTIMER_OFF; + + switch (mode) { + case RXRPC_CALL_RACKTIMER_RACK_REORDER: + rxrpc_rack_detect_loss_and_arm_timer(call, &summary); + break; + case RXRPC_CALL_RACKTIMER_TLP_PTO: + rxrpc_tlp_send_probe(call); + break; + case RXRPC_CALL_RACKTIMER_RTO: + // Might need to poke the congestion algo in some way + rxrpc_rack_mark_losses_on_rto(call); + break; + //case RXRPC_CALL_RACKTIMER_ZEROWIN: + default: + pr_warn("Unexpected rack timer %u", call->rack_timer_mode); + } +} diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index fbacf2056f643..2925c7fc82cfb 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -470,6 +470,7 @@ int rxrpc_io_thread(void *data) spin_lock_irq(&local->rx_queue.lock); skb_queue_splice_tail_init(&local->rx_queue, &rx_queue); spin_unlock_irq(&local->rx_queue.lock); + trace_rxrpc_iothread_rx(local, skb_queue_len(&rx_queue)); } /* Distribute packets and errors. */ diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index f934551a9b1c6..6f7a125d6e908 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -542,12 +542,14 @@ static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_se unsigned int xmit_ts; rxrpc_seq_t seq = req->seq; size_t len = 0; + bool start_tlp = false; trace_rxrpc_tq(call, tq, seq, rxrpc_tq_transmit); /* Each transmission of a Tx packet needs a new serial number */ serial = rxrpc_get_next_serials(call->conn, req->n); + call->tx_last_serial = serial + req->n - 1; call->tx_last_sent = req->now; xmit_ts = rxrpc_prepare_txqueue(tq, req); prefetch(tq->next); @@ -557,6 +559,18 @@ static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_se struct rxrpc_txbuf *txb = tq->bufs[seq & RXRPC_TXQ_MASK]; _debug("prep[%u] tq=%x q=%x", i, tq->qbase, seq); + + /* Record (re-)transmission for RACK [RFC8985 6.1]. */ + if (__test_and_clear_bit(ix, &tq->segment_lost)) + call->tx_nr_lost--; + if (req->retrans) { + __set_bit(ix, &tq->ever_retransmitted); + __set_bit(ix, &tq->segment_retransmitted); + call->tx_nr_resent++; + } else { + call->tx_nr_sent++; + start_tlp = true; + } tq->segment_xmit_ts[ix] = xmit_ts; tq->segment_serial[ix] = serial; if (i + 1 == req->n) @@ -576,11 +590,24 @@ static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_se } /* Set timeouts */ - if (call->rtt_count > 1) { - ktime_t delay = rxrpc_get_rto_backoff(call, false); + if (req->tlp_probe) { + /* Sending TLP loss probe [RFC8985 7.3]. */ + call->tlp_serial = serial - 1; + call->tlp_seq = seq - 1; + } else if (start_tlp) { + /* Schedule TLP loss probe [RFC8985 7.2]. */ + ktime_t pto; + + if (!test_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags)) + /* The first packet may take longer to elicit a response. */ + pto = NSEC_PER_SEC; + else + pto = rxrpc_tlp_calc_pto(call, req->now); - call->ack_lost_at = ktime_add(req->now, delay); - trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_lost_ack); + call->rack_timer_mode = RXRPC_CALL_RACKTIMER_TLP_PTO; + call->rack_timo_at = ktime_add(req->now, pto); + trace_rxrpc_rack_timer(call, pto, false); + trace_rxrpc_timer_set(call, pto, rxrpc_timer_trace_rack_tlp_pto); } if (!test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags)) { @@ -589,12 +616,6 @@ static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_se call->expect_rx_by = ktime_add(req->now, delay); trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_expect_rx); } - if (call->resend_at == KTIME_MAX) { - ktime_t delay = rxrpc_get_rto_backoff(call, false); - - call->resend_at = ktime_add(req->now, delay); - trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_resend); - } rxrpc_set_keepalive(call, req->now); return len; From 2d20773aec14996b6cc4db92d885028319be683d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 6 Dec 2024 22:38:11 +0000 Subject: [PATCH 0317/1450] mctp: no longer rely on net->dev_index_head[] mctp_dump_addrinfo() is one of the last users of net->dev_index_head[] in the control path. Switch to for_each_netdev_dump() for better scalability. Use C99 for mctp_device_rtnl_msg_handlers[] to prepare future RTNL removal from mctp_dump_addrinfo() (mdev->addrs is not yet RCU protected) Signed-off-by: Eric Dumazet Cc: Matt Johnston Reviewed-by: Kuniyuki Iwashima Acked-by: Jeremy Kerr Link: https://patch.msgid.link/20241206223811.1343076-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/mctp/device.c | 50 ++++++++++++++++++----------------------------- 1 file changed, 19 insertions(+), 31 deletions(-) diff --git a/net/mctp/device.c b/net/mctp/device.c index 26ce34b7e88e1..8e0724c56723d 100644 --- a/net/mctp/device.c +++ b/net/mctp/device.c @@ -20,8 +20,7 @@ #include struct mctp_dump_cb { - int h; - int idx; + unsigned long ifindex; size_t a_idx; }; @@ -115,43 +114,29 @@ static int mctp_dump_addrinfo(struct sk_buff *skb, struct netlink_callback *cb) { struct mctp_dump_cb *mcb = (void *)cb->ctx; struct net *net = sock_net(skb->sk); - struct hlist_head *head; struct net_device *dev; struct ifaddrmsg *hdr; struct mctp_dev *mdev; - int ifindex; - int idx = 0, rc; + int ifindex, rc; hdr = nlmsg_data(cb->nlh); // filter by ifindex if requested ifindex = hdr->ifa_index; rcu_read_lock(); - for (; mcb->h < NETDEV_HASHENTRIES; mcb->h++, mcb->idx = 0) { - idx = 0; - head = &net->dev_index_head[mcb->h]; - hlist_for_each_entry_rcu(dev, head, index_hlist) { - if (idx >= mcb->idx && - (ifindex == 0 || ifindex == dev->ifindex)) { - mdev = __mctp_dev_get(dev); - if (mdev) { - rc = mctp_dump_dev_addrinfo(mdev, - skb, cb); - mctp_dev_put(mdev); - // Error indicates full buffer, this - // callback will get retried. - if (rc < 0) - goto out; - } - } - idx++; - // reset for next iteration - mcb->a_idx = 0; - } + for_each_netdev_dump(net, dev, mcb->ifindex) { + if (ifindex && ifindex != dev->ifindex) + continue; + mdev = __mctp_dev_get(dev); + if (!mdev) + continue; + rc = mctp_dump_dev_addrinfo(mdev, skb, cb); + mctp_dev_put(mdev); + if (rc < 0) + break; + mcb->a_idx = 0; } -out: rcu_read_unlock(); - mcb->idx = idx; return skb->len; } @@ -531,9 +516,12 @@ static struct notifier_block mctp_dev_nb = { }; static const struct rtnl_msg_handler mctp_device_rtnl_msg_handlers[] = { - {THIS_MODULE, PF_MCTP, RTM_NEWADDR, mctp_rtm_newaddr, NULL, 0}, - {THIS_MODULE, PF_MCTP, RTM_DELADDR, mctp_rtm_deladdr, NULL, 0}, - {THIS_MODULE, PF_MCTP, RTM_GETADDR, NULL, mctp_dump_addrinfo, 0}, + {.owner = THIS_MODULE, .protocol = PF_MCTP, .msgtype = RTM_NEWADDR, + .doit = mctp_rtm_newaddr}, + {.owner = THIS_MODULE, .protocol = PF_MCTP, .msgtype = RTM_DELADDR, + .doit = mctp_rtm_deladdr}, + {.owner = THIS_MODULE, .protocol = PF_MCTP, .msgtype = RTM_GETADDR, + .dumpit = mctp_dump_addrinfo}, }; int __init mctp_device_init(void) From ca7858880590d4f1dfe73b2cbf372b8ed80a6d81 Mon Sep 17 00:00:00 2001 From: Jesse Van Gavere Date: Fri, 6 Dec 2024 21:42:02 +0100 Subject: [PATCH 0318/1450] net: dsa: microchip: Make MDIO bus name unique In configurations with 2 or more DSA clusters it will fail to allocate unique MDIO bus names as only the switch ID is used, fix this by using a combination of the tree ID and switch ID when needed Signed-off-by: Jesse Van Gavere Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241206204202.649912-1-jesse.vangavere@scioteq.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/microchip/ksz_common.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index 920443ee8ffd0..f5822c57be32e 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -2550,7 +2550,11 @@ static int ksz_mdio_register(struct ksz_device *dev) bus->read = ksz_sw_mdio_read; bus->write = ksz_sw_mdio_write; bus->name = "ksz user smi"; - snprintf(bus->id, MII_BUS_ID_SIZE, "SMI-%d", ds->index); + if (ds->dst->index != 0) { + snprintf(bus->id, MII_BUS_ID_SIZE, "SMI-%d-%d", ds->dst->index, ds->index); + } else { + snprintf(bus->id, MII_BUS_ID_SIZE, "SMI-%d", ds->index); + } } ret = ksz_parse_dt_phy_config(dev, bus, mdio_np); From 3f330db30638b6489d548084a7e8843374d41ad0 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 5 Dec 2024 08:59:14 -0800 Subject: [PATCH 0319/1450] net: reformat kdoc return statements kernel-doc -Wall warns about missing Return: statement for non-void functions. We have a number of kdocs in our headers which are missing the colon, IOW they use * Return some value or * Returns some value Having the colon makes some sense, it should help kdoc parser avoid false positives. So add them. This is mostly done with a sed script, and removing the unnecessary cases (mostly the comments which aren't kdoc). Acked-by: Johannes Berg Acked-by: Richard Cochran Acked-by: Sergey Ryazanov Reviewed-by: Edward Cree Acked-by: Alexandra Winter Acked-by: Pablo Neira Ayuso Link: https://patch.msgid.link/20241205165914.1071102-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/etherdevice.h | 18 ++++++------ include/linux/ethtool.h | 6 ++-- include/linux/if_vlan.h | 28 +++++++++---------- include/linux/netdevice.h | 14 ++++++---- include/linux/netfilter/x_tables.h | 2 +- include/linux/netfilter_netdev.h | 3 +- include/linux/ptp_clock_kernel.h | 4 +-- include/linux/rfkill.h | 2 +- include/linux/rtnetlink.h | 2 +- include/linux/skbuff.h | 16 +++++------ include/linux/wwan.h | 2 +- include/net/cfg80211.h | 2 +- include/net/dst.h | 2 +- include/net/genetlink.h | 6 ++-- include/net/ipv6.h | 2 +- include/net/iucv/iucv.h | 30 ++++++++++---------- include/net/netfilter/nf_tproxy.h | 4 +-- include/net/netlink.h | 44 +++++++++++++++--------------- include/net/page_pool/helpers.h | 9 ++---- include/net/pkt_cls.h | 4 +-- include/net/tcp.h | 2 +- 21 files changed, 101 insertions(+), 101 deletions(-) diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index ecf203f010343..9a1eacf35d370 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -81,7 +81,7 @@ static const u8 eth_ipv6_mcast_addr_base[ETH_ALEN] __aligned(2) = * is_link_local_ether_addr - Determine if given Ethernet address is link-local * @addr: Pointer to a six-byte array containing the Ethernet address * - * Return true if address is link local reserved addr (01:80:c2:00:00:0X) per + * Return: true if address is link local reserved addr (01:80:c2:00:00:0X) per * IEEE 802.1Q 8.6.3 Frame filtering. * * Please note: addr must be aligned to u16. @@ -104,7 +104,7 @@ static inline bool is_link_local_ether_addr(const u8 *addr) * is_zero_ether_addr - Determine if give Ethernet address is all zeros. * @addr: Pointer to a six-byte array containing the Ethernet address * - * Return true if the address is all zeroes. + * Return: true if the address is all zeroes. * * Please note: addr must be aligned to u16. */ @@ -123,7 +123,7 @@ static inline bool is_zero_ether_addr(const u8 *addr) * is_multicast_ether_addr - Determine if the Ethernet address is a multicast. * @addr: Pointer to a six-byte array containing the Ethernet address * - * Return true if the address is a multicast address. + * Return: true if the address is a multicast address. * By definition the broadcast address is also a multicast address. */ static inline bool is_multicast_ether_addr(const u8 *addr) @@ -157,7 +157,7 @@ static inline bool is_multicast_ether_addr_64bits(const u8 *addr) * is_local_ether_addr - Determine if the Ethernet address is locally-assigned one (IEEE 802). * @addr: Pointer to a six-byte array containing the Ethernet address * - * Return true if the address is a local address. + * Return: true if the address is a local address. */ static inline bool is_local_ether_addr(const u8 *addr) { @@ -168,7 +168,7 @@ static inline bool is_local_ether_addr(const u8 *addr) * is_broadcast_ether_addr - Determine if the Ethernet address is broadcast * @addr: Pointer to a six-byte array containing the Ethernet address * - * Return true if the address is the broadcast address. + * Return: true if the address is the broadcast address. * * Please note: addr must be aligned to u16. */ @@ -183,7 +183,7 @@ static inline bool is_broadcast_ether_addr(const u8 *addr) * is_unicast_ether_addr - Determine if the Ethernet address is unicast * @addr: Pointer to a six-byte array containing the Ethernet address * - * Return true if the address is a unicast address. + * Return: true if the address is a unicast address. */ static inline bool is_unicast_ether_addr(const u8 *addr) { @@ -197,7 +197,7 @@ static inline bool is_unicast_ether_addr(const u8 *addr) * Check that the Ethernet address (MAC) is not 00:00:00:00:00:00, is not * a multicast address, and is not FF:FF:FF:FF:FF:FF. * - * Return true if the address is valid. + * Return: true if the address is valid. * * Please note: addr must be aligned to u16. */ @@ -214,7 +214,7 @@ static inline bool is_valid_ether_addr(const u8 *addr) * * Check that the value from the Ethertype/length field is a valid Ethertype. * - * Return true if the valid is an 802.3 supported Ethertype. + * Return: true if the valid is an 802.3 supported Ethertype. */ static inline bool eth_proto_is_802_3(__be16 proto) { @@ -458,7 +458,7 @@ static inline bool ether_addr_is_ip_mcast(const u8 *addr) * ether_addr_to_u64 - Convert an Ethernet address into a u64 value. * @addr: Pointer to a six-byte array containing the Ethernet address * - * Return a u64 value of the address + * Return: a u64 value of the address */ static inline u64 ether_addr_to_u64(const u8 *addr) { diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index b8b935b526033..e217c6321ed0d 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -257,7 +257,7 @@ struct ethtool_link_ksettings { * @mode : one of the ETHTOOL_LINK_MODE_*_BIT * (not atomic, no bound checking) * - * Returns true/false. + * Returns: true/false. */ #define ethtool_link_ksettings_test_link_mode(ptr, name, mode) \ test_bit(ETHTOOL_LINK_MODE_ ## mode ## _BIT, (ptr)->link_modes.name) @@ -1199,7 +1199,7 @@ ethtool_params_from_link_mode(struct ethtool_link_ksettings *link_ksettings, * @dev: pointer to net_device structure * @vclock_index: pointer to pointer of vclock index * - * Return number of phc vclocks + * Return: number of phc vclocks */ int ethtool_get_phc_vclocks(struct net_device *dev, int **vclock_index); @@ -1253,7 +1253,7 @@ static inline int ethtool_mm_frag_size_min_to_add(u32 val_min, u32 *val_add, * ethtool_get_ts_info_by_layer - Obtains time stamping capabilities from the MAC or PHY layer. * @dev: pointer to net_device structure * @info: buffer to hold the result - * Returns zero on success, non-zero otherwise. + * Returns: zero on success, non-zero otherwise. */ int ethtool_get_ts_info_by_layer(struct net_device *dev, struct kernel_ethtool_ts_info *info); diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index c1645c86eed96..d6326b53e3364 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -310,7 +310,7 @@ static inline bool vlan_uses_dev(const struct net_device *dev) * eth_type_vlan - check for valid vlan ether type. * @ethertype: ether type to check * - * Returns true if the ether type is a vlan ether type. + * Returns: true if the ether type is a vlan ether type. */ static inline bool eth_type_vlan(__be16 ethertype) { @@ -341,9 +341,9 @@ static inline bool vlan_hw_offload_capable(netdev_features_t features, * @mac_len: MAC header length including outer vlan headers * * Inserts the VLAN tag into @skb as part of the payload at offset mac_len - * Returns error if skb_cow_head fails. - * * Does not change skb->protocol so this function can be used during receive. + * + * Returns: error if skb_cow_head fails. */ static inline int __vlan_insert_inner_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci, @@ -390,9 +390,9 @@ static inline int __vlan_insert_inner_tag(struct sk_buff *skb, * @vlan_tci: VLAN TCI to insert * * Inserts the VLAN tag into @skb as part of the payload - * Returns error if skb_cow_head fails. - * * Does not change skb->protocol so this function can be used during receive. + * + * Returns: error if skb_cow_head fails. */ static inline int __vlan_insert_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) @@ -533,7 +533,7 @@ static inline void __vlan_hwaccel_put_tag(struct sk_buff *skb, * @skb: skbuff to query * @vlan_tci: buffer to store value * - * Returns error if the skb is not of VLAN type + * Returns: error if the skb is not of VLAN type */ static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) { @@ -551,7 +551,7 @@ static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) * @skb: skbuff to query * @vlan_tci: buffer to store value * - * Returns error if @skb->vlan_tci is not set correctly + * Returns: error if @skb->vlan_tci is not set correctly */ static inline int __vlan_hwaccel_get_tag(const struct sk_buff *skb, u16 *vlan_tci) @@ -570,7 +570,7 @@ static inline int __vlan_hwaccel_get_tag(const struct sk_buff *skb, * @skb: skbuff to query * @vlan_tci: buffer to store value * - * Returns error if the skb is not VLAN tagged + * Returns: error if the skb is not VLAN tagged */ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) { @@ -587,7 +587,7 @@ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) * @type: first vlan protocol * @depth: buffer to store length of eth and vlan tags in bytes * - * Returns the EtherType of the packet, regardless of whether it is + * Returns: the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. */ static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, @@ -629,7 +629,7 @@ static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, * vlan_get_protocol - get protocol EtherType. * @skb: skbuff to query * - * Returns the EtherType of the packet, regardless of whether it is + * Returns: the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. */ static inline __be16 vlan_get_protocol(const struct sk_buff *skb) @@ -710,7 +710,7 @@ static inline void vlan_set_encap_proto(struct sk_buff *skb, * Expects the skb to contain a VLAN tag in the payload, and to have skb->data * pointing at the MAC header. * - * Returns a new pointer to skb->data, or NULL on failure to pull. + * Returns: a new pointer to skb->data, or NULL on failure to pull. */ static inline void *vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci) { @@ -727,7 +727,7 @@ static inline void *vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci) * skb_vlan_tagged - check if skb is vlan tagged. * @skb: skbuff to query * - * Returns true if the skb is tagged, regardless of whether it is hardware + * Returns: true if the skb is tagged, regardless of whether it is hardware * accelerated or not. */ static inline bool skb_vlan_tagged(const struct sk_buff *skb) @@ -743,7 +743,7 @@ static inline bool skb_vlan_tagged(const struct sk_buff *skb) * skb_vlan_tagged_multi - check if skb is vlan tagged with multiple headers. * @skb: skbuff to query * - * Returns true if the skb is tagged with multiple vlan headers, regardless + * Returns: true if the skb is tagged with multiple vlan headers, regardless * of whether it is hardware accelerated or not. */ static inline bool skb_vlan_tagged_multi(struct sk_buff *skb) @@ -774,7 +774,7 @@ static inline bool skb_vlan_tagged_multi(struct sk_buff *skb) * @skb: skbuff to query * @features: features to be checked * - * Returns features without unsafe ones if the skb has multiple tags. + * Returns: features without unsafe ones if the skb has multiple tags. */ static inline netdev_features_t vlan_features_check(struct sk_buff *skb, netdev_features_t features) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1351054416812..d917949bba03f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -509,7 +509,7 @@ static inline bool napi_prefer_busy_poll(struct napi_struct *n) * is scheduled for example in the context of delayed timer * that can be skipped if a NAPI is already scheduled. * - * Return True if NAPI is scheduled, False otherwise. + * Return: True if NAPI is scheduled, False otherwise. */ static inline bool napi_is_scheduled(struct napi_struct *n) { @@ -524,7 +524,7 @@ bool napi_schedule_prep(struct napi_struct *n); * * Schedule NAPI poll routine to be called if it is not already * running. - * Return true if we schedule a NAPI or false if not. + * Return: true if we schedule a NAPI or false if not. * Refer to napi_schedule_prep() for additional reason on why * a NAPI might not be scheduled. */ @@ -558,7 +558,7 @@ static inline void napi_schedule_irqoff(struct napi_struct *n) * Mark NAPI processing as complete. Should only be called if poll budget * has not been completely consumed. * Prefer over napi_complete(). - * Return false if device should avoid rearming interrupts. + * Return: false if device should avoid rearming interrupts. */ bool napi_complete_done(struct napi_struct *n, int work_done); @@ -3851,7 +3851,7 @@ static inline bool netif_attr_test_mask(unsigned long j, * @online_mask: bitmask for CPUs/Rx queues that are online * @nr_bits: number of bits in the bitmask * - * Returns true if a CPU/Rx queue is online. + * Returns: true if a CPU/Rx queue is online. */ static inline bool netif_attr_test_online(unsigned long j, const unsigned long *online_mask, @@ -3871,7 +3871,8 @@ static inline bool netif_attr_test_online(unsigned long j, * @srcp: the cpumask/Rx queue mask pointer * @nr_bits: number of bits in the bitmask * - * Returns >= nr_bits if no further CPUs/Rx queues set. + * Returns: next (after n) CPU/Rx queue index in the mask; + * >= nr_bits if no further CPUs/Rx queues set. */ static inline unsigned int netif_attrmask_next(int n, const unsigned long *srcp, unsigned int nr_bits) @@ -3893,7 +3894,8 @@ static inline unsigned int netif_attrmask_next(int n, const unsigned long *srcp, * @src2p: the second CPUs/Rx queues mask pointer * @nr_bits: number of bits in the bitmask * - * Returns >= nr_bits if no further CPUs/Rx queues set in both. + * Returns: next (after n) CPU/Rx queue index set in both masks; + * >= nr_bits if no further CPUs/Rx queues set in both. */ static inline int netif_attrmask_next_and(int n, const unsigned long *src1p, const unsigned long *src2p, diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 5897f3dbaf7c3..f39f688d72852 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -357,7 +357,7 @@ extern struct static_key xt_tee_enabled; * Begin packet processing : all readers must wait the end * 1) Must be called with preemption disabled * 2) softirqs must be disabled too (or we should use this_cpu_add()) - * Returns : + * Returns: * 1 if no recursion on this cpu * 0 if recursion detected */ diff --git a/include/linux/netfilter_netdev.h b/include/linux/netfilter_netdev.h index 8676316547cc4..3175073a66ba6 100644 --- a/include/linux/netfilter_netdev.h +++ b/include/linux/netfilter_netdev.h @@ -66,7 +66,6 @@ static inline bool nf_hook_egress_active(void) * @rc: result code which shall be returned by __dev_queue_xmit() on failure * @dev: netdev whose egress hooks shall be applied to @skb * - * Returns @skb on success or %NULL if the packet was consumed or filtered. * Caller must hold rcu_read_lock. * * On ingress, packets are classified first by tc, then by netfilter. @@ -81,6 +80,8 @@ static inline bool nf_hook_egress_active(void) * called recursively by tunnel drivers such as vxlan, the flag is reverted to * false after sch_handle_egress(). This ensures that netfilter is applied * both on the overlay and underlying network. + * + * Returns: @skb on success or %NULL if the packet was consumed or filtered. */ static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc, struct net_device *dev) diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index c892d22ce0a7b..0d68d09bedd14 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -307,7 +307,7 @@ static inline u64 adjust_by_scaled_ppm(u64 base, long scaled_ppm) * @info: Structure describing the new clock. * @parent: Pointer to the parent device of the new clock. * - * Returns a valid pointer on success or PTR_ERR on failure. If PHC + * Returns: a valid pointer on success or PTR_ERR on failure. If PHC * support is missing at the configuration level, this function * returns NULL, and drivers are expected to gracefully handle that * case separately. @@ -445,7 +445,7 @@ int ptp_get_vclocks_index(int pclock_index, int **vclock_index); * @hwtstamp: timestamp * @vclock_index: phc index of ptp vclock. * - * Returns converted timestamp, or 0 on error. + * Returns: converted timestamp, or 0 on error. */ ktime_t ptp_convert_timestamp(const ktime_t *hwtstamp, int vclock_index); #else diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h index 997b341973859..6816e4c5f3f0a 100644 --- a/include/linux/rfkill.h +++ b/include/linux/rfkill.h @@ -241,7 +241,7 @@ bool rfkill_soft_blocked(struct rfkill *rfkill); * rfkill_find_type - Helper for finding rfkill type by name * @name: the name of the type * - * Returns enum rfkill_type that corresponds to the name. + * Returns: enum rfkill_type that corresponds to the name. */ enum rfkill_type rfkill_find_type(const char *name); diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 14b88f5519208..811ce44113f6f 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -78,7 +78,7 @@ static inline bool lockdep_rtnl_is_held(void) * rtnl_dereference - fetch RCU pointer when updates are prevented by RTNL * @p: The pointer to read, prior to dereferencing * - * Return the value of the specified RCU-protected pointer, but omit + * Return: the value of the specified RCU-protected pointer, but omit * the READ_ONCE(), because caller holds RTNL. */ #define rtnl_dereference(p) \ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 95452d1a07fcf..69624b394cd94 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1134,7 +1134,7 @@ static inline bool skb_pfmemalloc(const struct sk_buff *skb) * skb_dst - returns skb dst_entry * @skb: buffer * - * Returns skb dst_entry, regardless of reference taken or not. + * Returns: skb dst_entry, regardless of reference taken or not. */ static inline struct dst_entry *skb_dst(const struct sk_buff *skb) { @@ -1222,7 +1222,7 @@ static inline bool skb_wifi_acked_valid(const struct sk_buff *skb) * skb_unref - decrement the skb's reference count * @skb: buffer * - * Returns true if we can free the skb. + * Returns: true if we can free the skb. */ static inline bool skb_unref(struct sk_buff *skb) { @@ -1344,7 +1344,7 @@ struct sk_buff_fclones { * @sk: socket * @skb: buffer * - * Returns true if skb is a fast clone, and its clone is not freed. + * Returns: true if skb is a fast clone, and its clone is not freed. * Some drivers call skb_orphan() in their ndo_start_xmit(), * so we also check that didn't happen. */ @@ -3516,7 +3516,7 @@ static inline struct page *__dev_alloc_page_noprof(gfp_t gfp_mask) * A page shouldn't be considered for reusing/recycling if it was allocated * under memory pressure or at a distant memory node. * - * Returns false if this page should be returned to page allocator, true + * Returns: false if this page should be returned to page allocator, true * otherwise. */ static inline bool dev_page_is_reusable(const struct page *page) @@ -3633,7 +3633,7 @@ int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, * skb_frag_address - gets the address of the data contained in a paged fragment * @frag: the paged fragment buffer * - * Returns the address of the data within @frag. The page must already + * Returns: the address of the data within @frag. The page must already * be mapped. */ static inline void *skb_frag_address(const skb_frag_t *frag) @@ -3648,7 +3648,7 @@ static inline void *skb_frag_address(const skb_frag_t *frag) * skb_frag_address_safe - gets the address of the data contained in a paged fragment * @frag: the paged fragment buffer * - * Returns the address of the data within @frag. Checks that the page + * Returns: the address of the data within @frag. Checks that the page * is mapped and returns %NULL otherwise. */ static inline void *skb_frag_address_safe(const skb_frag_t *frag) @@ -3890,7 +3890,7 @@ static inline int skb_linearize(struct sk_buff *skb) * skb_has_shared_frag - can any frag be overwritten * @skb: buffer to test * - * Return true if the skb has at least one frag that might be modified + * Return: true if the skb has at least one frag that might be modified * by an external entity (as in vmsplice()/sendfile()) */ static inline bool skb_has_shared_frag(const struct sk_buff *skb) @@ -4612,7 +4612,7 @@ static inline void __skb_reset_checksum_unnecessary(struct sk_buff *skb) /* Check if we need to perform checksum complete validation. * - * Returns true if checksum complete is needed, false otherwise + * Returns: true if checksum complete is needed, false otherwise * (either checksum is unnecessary or zero checksum is allowed). */ static inline bool __skb_checksum_validate_needed(struct sk_buff *skb, diff --git a/include/linux/wwan.h b/include/linux/wwan.h index 79c781875c09f..a4d6cc0c9f68b 100644 --- a/include/linux/wwan.h +++ b/include/linux/wwan.h @@ -97,7 +97,7 @@ struct wwan_port_caps { * * This function must be balanced with a call to wwan_remove_port(). * - * Returns a valid pointer to wwan_port on success or PTR_ERR on failure + * Returns: a valid pointer to wwan_port on success or PTR_ERR on failure */ struct wwan_port *wwan_create_port(struct device *parent, enum wwan_port_type type, diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 27acf1292a5ca..182f7965048f6 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5957,7 +5957,7 @@ int wiphy_register(struct wiphy *wiphy); * @wiphy: the wiphy to check the locking on * @p: The pointer to read, prior to dereferencing * - * Return the value of the specified RCU-protected pointer, but omit the + * Return: the value of the specified RCU-protected pointer, but omit the * READ_ONCE(), because caller holds the wiphy mutex used for updates. */ #define wiphy_dereference(wiphy, p) \ diff --git a/include/net/dst.h b/include/net/dst.h index 08647c99d79c9..78c78cdce0e9a 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -307,7 +307,7 @@ static inline bool dst_hold_safe(struct dst_entry *dst) * @skb: buffer * * If dst is not yet refcounted and not destroyed, grab a ref on it. - * Returns true if dst is refcounted. + * Returns: true if dst is refcounted. */ static inline bool skb_dst_force(struct sk_buff *skb) { diff --git a/include/net/genetlink.h b/include/net/genetlink.h index d096cc6352de5..a03d567658328 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -354,7 +354,7 @@ __genlmsg_iput(struct sk_buff *skb, const struct genl_info *info, int flags) * such requests) or a struct initialized by genl_info_init_ntf() * when constructing notifications. * - * Returns pointer to new genetlink header. + * Returns: pointer to new genetlink header. */ static inline void * genlmsg_iput(struct sk_buff *skb, const struct genl_info *info) @@ -366,7 +366,7 @@ genlmsg_iput(struct sk_buff *skb, const struct genl_info *info) * genlmsg_nlhdr - Obtain netlink header from user specified header * @user_hdr: user header as returned from genlmsg_put() * - * Returns pointer to netlink header. + * Returns: pointer to netlink header. */ static inline struct nlmsghdr *genlmsg_nlhdr(void *user_hdr) { @@ -435,7 +435,7 @@ static inline void genl_dump_check_consistent(struct netlink_callback *cb, * @flags: netlink message flags * @cmd: generic netlink command * - * Returns pointer to user specific header + * Returns: pointer to user specific header */ static inline void *genlmsg_put_reply(struct sk_buff *skb, struct genl_info *info, diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 248bfb26e2af9..f5c43ad1565e2 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -471,7 +471,7 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk, /* This helper is specialized for BIG TCP needs. * It assumes the hop_jumbo_hdr will immediately follow the IPV6 header. * It assumes headers are already in skb->head. - * Returns 0, or IPPROTO_TCP if a BIG TCP packet is there. + * Returns: 0, or IPPROTO_TCP if a BIG TCP packet is there. */ static inline int ipv6_has_hopopt_jumbo(const struct sk_buff *skb) { diff --git a/include/net/iucv/iucv.h b/include/net/iucv/iucv.h index dd9e93c122602..9804fa5d9c677 100644 --- a/include/net/iucv/iucv.h +++ b/include/net/iucv/iucv.h @@ -202,7 +202,7 @@ struct iucv_handler { * * Registers a driver with IUCV. * - * Returns 0 on success, -ENOMEM if the memory allocation for the pathid + * Returns: 0 on success, -ENOMEM if the memory allocation for the pathid * table failed, or -EIO if IUCV_DECLARE_BUFFER failed on all cpus. */ int iucv_register(struct iucv_handler *handler, int smp); @@ -224,7 +224,7 @@ void iucv_unregister(struct iucv_handler *handle, int smp); * * Allocate a new path structure for use with iucv_connect. * - * Returns NULL if the memory allocation failed or a pointer to the + * Returns: NULL if the memory allocation failed or a pointer to the * path structure. */ static inline struct iucv_path *iucv_path_alloc(u16 msglim, u8 flags, gfp_t gfp) @@ -260,7 +260,7 @@ static inline void iucv_path_free(struct iucv_path *path) * This function is issued after the user received a connection pending * external interrupt and now wishes to complete the IUCV communication path. * - * Returns the result of the CP IUCV call. + * Returns: the result of the CP IUCV call. */ int iucv_path_accept(struct iucv_path *path, struct iucv_handler *handler, u8 *userdata, void *private); @@ -278,7 +278,7 @@ int iucv_path_accept(struct iucv_path *path, struct iucv_handler *handler, * successfully, you are not able to use the path until you receive an IUCV * Connection Complete external interrupt. * - * Returns the result of the CP IUCV call. + * Returns: the result of the CP IUCV call. */ int iucv_path_connect(struct iucv_path *path, struct iucv_handler *handler, u8 *userid, u8 *system, u8 *userdata, @@ -292,7 +292,7 @@ int iucv_path_connect(struct iucv_path *path, struct iucv_handler *handler, * This function temporarily suspends incoming messages on an IUCV path. * You can later reactivate the path by invoking the iucv_resume function. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_path_quiesce(struct iucv_path *path, u8 *userdata); @@ -304,7 +304,7 @@ int iucv_path_quiesce(struct iucv_path *path, u8 *userdata); * This function resumes incoming messages on an IUCV path that has * been stopped with iucv_path_quiesce. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_path_resume(struct iucv_path *path, u8 *userdata); @@ -315,7 +315,7 @@ int iucv_path_resume(struct iucv_path *path, u8 *userdata); * * This function terminates an IUCV path. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_path_sever(struct iucv_path *path, u8 *userdata); @@ -327,7 +327,7 @@ int iucv_path_sever(struct iucv_path *path, u8 *userdata); * * Cancels a message you have sent. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_purge(struct iucv_path *path, struct iucv_message *msg, u32 srccls); @@ -347,7 +347,7 @@ int iucv_message_purge(struct iucv_path *path, struct iucv_message *msg, * * Locking: local_bh_enable/local_bh_disable * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_receive(struct iucv_path *path, struct iucv_message *msg, u8 flags, void *buffer, size_t size, size_t *residual); @@ -367,7 +367,7 @@ int iucv_message_receive(struct iucv_path *path, struct iucv_message *msg, * * Locking: no locking. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int __iucv_message_receive(struct iucv_path *path, struct iucv_message *msg, u8 flags, void *buffer, size_t size, @@ -382,7 +382,7 @@ int __iucv_message_receive(struct iucv_path *path, struct iucv_message *msg, * are notified of a message and the time that you complete the message, * the message may be rejected. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_reject(struct iucv_path *path, struct iucv_message *msg); @@ -399,7 +399,7 @@ int iucv_message_reject(struct iucv_path *path, struct iucv_message *msg); * pathid, msgid, and trgcls. Prmmsg signifies the data is moved into * the parameter list. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_reply(struct iucv_path *path, struct iucv_message *msg, u8 flags, void *reply, size_t size); @@ -419,7 +419,7 @@ int iucv_message_reply(struct iucv_path *path, struct iucv_message *msg, * * Locking: local_bh_enable/local_bh_disable * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_send(struct iucv_path *path, struct iucv_message *msg, u8 flags, u32 srccls, void *buffer, size_t size); @@ -439,7 +439,7 @@ int iucv_message_send(struct iucv_path *path, struct iucv_message *msg, * * Locking: no locking. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int __iucv_message_send(struct iucv_path *path, struct iucv_message *msg, u8 flags, u32 srccls, void *buffer, size_t size); @@ -461,7 +461,7 @@ int __iucv_message_send(struct iucv_path *path, struct iucv_message *msg, * reply to the message and a buffer is provided into which IUCV moves * the reply to this message. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_send2way(struct iucv_path *path, struct iucv_message *msg, u8 flags, u32 srccls, void *buffer, size_t size, diff --git a/include/net/netfilter/nf_tproxy.h b/include/net/netfilter/nf_tproxy.h index 5adf6fda11e82..06985530517b5 100644 --- a/include/net/netfilter/nf_tproxy.h +++ b/include/net/netfilter/nf_tproxy.h @@ -49,7 +49,7 @@ __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr); * * nf_tproxy_handle_time_wait4() consumes the socket reference passed in. * - * Returns the listener socket if there's one, the TIME_WAIT socket if + * Returns: the listener socket if there's one, the TIME_WAIT socket if * no such listener is found, or NULL if the TCP header is incomplete. */ struct sock * @@ -108,7 +108,7 @@ nf_tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr, * * nf_tproxy_handle_time_wait6() consumes the socket reference passed in. * - * Returns the listener socket if there's one, the TIME_WAIT socket if + * Returns: the listener socket if there's one, the TIME_WAIT socket if * no such listener is found, or NULL if the TCP header is incomplete. */ struct sock * diff --git a/include/net/netlink.h b/include/net/netlink.h index 39eaa6be6ca8d..e015ffbed819c 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -649,7 +649,7 @@ static inline int nlmsg_ok(const struct nlmsghdr *nlh, int remaining) * @nlh: netlink message header * @remaining: number of bytes remaining in message stream * - * Returns the next netlink message in the message stream and + * Returns: the next netlink message in the message stream and * decrements remaining by the size of the current message. */ static inline struct nlmsghdr * @@ -676,7 +676,7 @@ nlmsg_next(const struct nlmsghdr *nlh, int *remaining) * exceeding maxtype will be rejected, policy must be specified, attributes * will be validated in the strictest way possible. * - * Returns 0 on success or a negative error code. + * Returns: 0 on success or a negative error code. */ static inline int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, @@ -701,7 +701,7 @@ static inline int nla_parse(struct nlattr **tb, int maxtype, * exceeding maxtype will be ignored and attributes from the policy are not * always strictly validated (only for new attributes). * - * Returns 0 on success or a negative error code. + * Returns: 0 on success or a negative error code. */ static inline int nla_parse_deprecated(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, @@ -726,7 +726,7 @@ static inline int nla_parse_deprecated(struct nlattr **tb, int maxtype, * exceeding maxtype will be rejected as well as trailing data, but the * policy is not completely strictly validated (only for new attributes). * - * Returns 0 on success or a negative error code. + * Returns: 0 on success or a negative error code. */ static inline int nla_parse_deprecated_strict(struct nlattr **tb, int maxtype, const struct nlattr *head, @@ -833,7 +833,7 @@ nlmsg_parse_deprecated_strict(const struct nlmsghdr *nlh, int hdrlen, * @hdrlen: length of family specific header * @attrtype: type of attribute to look for * - * Returns the first attribute which matches the specified type. + * Returns: the first attribute which matches the specified type. */ static inline struct nlattr *nlmsg_find_attr(const struct nlmsghdr *nlh, int hdrlen, int attrtype) @@ -854,7 +854,7 @@ static inline struct nlattr *nlmsg_find_attr(const struct nlmsghdr *nlh, * specified policy. Validation is done in liberal mode. * See documentation of struct nla_policy for more details. * - * Returns 0 on success or a negative error code. + * Returns: 0 on success or a negative error code. */ static inline int nla_validate_deprecated(const struct nlattr *head, int len, int maxtype, @@ -877,7 +877,7 @@ static inline int nla_validate_deprecated(const struct nlattr *head, int len, * specified policy. Validation is done in strict mode. * See documentation of struct nla_policy for more details. * - * Returns 0 on success or a negative error code. + * Returns: 0 on success or a negative error code. */ static inline int nla_validate(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, @@ -914,7 +914,7 @@ static inline int nlmsg_validate_deprecated(const struct nlmsghdr *nlh, * nlmsg_report - need to report back to application? * @nlh: netlink message header * - * Returns 1 if a report back to the application is requested. + * Returns: 1 if a report back to the application is requested. */ static inline int nlmsg_report(const struct nlmsghdr *nlh) { @@ -925,7 +925,7 @@ static inline int nlmsg_report(const struct nlmsghdr *nlh) * nlmsg_seq - return the seq number of netlink message * @nlh: netlink message header * - * Returns 0 if netlink message is NULL + * Returns: 0 if netlink message is NULL */ static inline u32 nlmsg_seq(const struct nlmsghdr *nlh) { @@ -952,7 +952,7 @@ static inline u32 nlmsg_seq(const struct nlmsghdr *nlh) * @payload: length of message payload * @flags: message flags * - * Returns NULL if the tailroom of the skb is insufficient to store + * Returns: NULL if the tailroom of the skb is insufficient to store * the message header and payload. */ static inline struct nlmsghdr *nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, @@ -971,7 +971,7 @@ static inline struct nlmsghdr *nlmsg_put(struct sk_buff *skb, u32 portid, u32 se * * Append data to an existing nlmsg, used when constructing a message * with multiple fixed-format headers (which is rare). - * Returns NULL if the tailroom of the skb is insufficient to store + * Returns: NULL if the tailroom of the skb is insufficient to store * the extra payload. */ static inline void *nlmsg_append(struct sk_buff *skb, u32 size) @@ -993,7 +993,7 @@ static inline void *nlmsg_append(struct sk_buff *skb, u32 size) * @payload: length of message payload * @flags: message flags * - * Returns NULL if the tailroom of the skb is insufficient to store + * Returns: NULL if the tailroom of the skb is insufficient to store * the message header and payload. */ static inline struct nlmsghdr *nlmsg_put_answer(struct sk_buff *skb, @@ -1050,7 +1050,7 @@ static inline void nlmsg_end(struct sk_buff *skb, struct nlmsghdr *nlh) * nlmsg_get_pos - return current position in netlink message * @skb: socket buffer the message is stored in * - * Returns a pointer to the current tail of the message. + * Returns: a pointer to the current tail of the message. */ static inline void *nlmsg_get_pos(struct sk_buff *skb) { @@ -1276,7 +1276,7 @@ static inline int nla_ok(const struct nlattr *nla, int remaining) * @nla: netlink attribute * @remaining: number of bytes remaining in attribute stream * - * Returns the next netlink attribute in the attribute stream and + * Returns: the next netlink attribute in the attribute stream and * decrements remaining by the size of the current attribute. */ static inline struct nlattr *nla_next(const struct nlattr *nla, int *remaining) @@ -1292,7 +1292,7 @@ static inline struct nlattr *nla_next(const struct nlattr *nla, int *remaining) * @nla: attribute containing the nested attributes * @attrtype: type of attribute to look for * - * Returns the first attribute which matches the specified type. + * Returns: the first attribute which matches the specified type. */ static inline struct nlattr * nla_find_nested(const struct nlattr *nla, int attrtype) @@ -2091,7 +2091,7 @@ static inline int nla_get_flag(const struct nlattr *nla) * nla_get_msecs - return payload of msecs attribute * @nla: msecs netlink attribute * - * Returns the number of milliseconds in jiffies. + * Returns: the number of milliseconds in jiffies. */ static inline unsigned long nla_get_msecs(const struct nlattr *nla) { @@ -2183,7 +2183,7 @@ static inline void *nla_memdup_noprof(const struct nlattr *src, gfp_t gfp) * marked their nest attributes with NLA_F_NESTED flag. New APIs should use * nla_nest_start() which sets the flag. * - * Returns the container attribute or NULL on error + * Returns: the container attribute or NULL on error */ static inline struct nlattr *nla_nest_start_noflag(struct sk_buff *skb, int attrtype) @@ -2204,7 +2204,7 @@ static inline struct nlattr *nla_nest_start_noflag(struct sk_buff *skb, * Unlike nla_nest_start_noflag(), mark the nest attribute with NLA_F_NESTED * flag. This is the preferred function to use in new code. * - * Returns the container attribute or NULL on error + * Returns: the container attribute or NULL on error */ static inline struct nlattr *nla_nest_start(struct sk_buff *skb, int attrtype) { @@ -2219,7 +2219,7 @@ static inline struct nlattr *nla_nest_start(struct sk_buff *skb, int attrtype) * Corrects the container attribute header to include the all * appended attributes. * - * Returns the total data length of the skb. + * Returns: the total data length of the skb. */ static inline int nla_nest_end(struct sk_buff *skb, struct nlattr *start) { @@ -2252,7 +2252,7 @@ static inline void nla_nest_cancel(struct sk_buff *skb, struct nlattr *start) * specified policy. Attributes with a type exceeding maxtype will be * ignored. See documentation of struct nla_policy for more details. * - * Returns 0 on success or a negative error code. + * Returns: 0 on success or a negative error code. */ static inline int __nla_validate_nested(const struct nlattr *start, int maxtype, const struct nla_policy *policy, @@ -2285,7 +2285,7 @@ nla_validate_nested_deprecated(const struct nlattr *start, int maxtype, * nla_need_padding_for_64bit - test 64-bit alignment of the next attribute * @skb: socket buffer the message is stored in * - * Return true if padding is needed to align the next attribute (nla_data()) to + * Return: true if padding is needed to align the next attribute (nla_data()) to * a 64-bit aligned area. */ static inline bool nla_need_padding_for_64bit(struct sk_buff *skb) @@ -2312,7 +2312,7 @@ static inline bool nla_need_padding_for_64bit(struct sk_buff *skb) * This will only be done in architectures which do not have * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS defined. * - * Returns zero on success or a negative error code. + * Returns: zero on success or a negative error code. */ static inline int nla_align_64bit(struct sk_buff *skb, int padattr) { diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index 793e6fd78bc5c..26caa2c209125 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -104,8 +104,7 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool) * * Get a page fragment from the page allocator or page_pool caches. * - * Return: - * Return allocated page fragment, otherwise return NULL. + * Return: allocated page fragment, otherwise return NULL. */ static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool, unsigned int *offset, @@ -155,8 +154,7 @@ static inline struct page *page_pool_alloc(struct page_pool *pool, * depending on the requested size in order to allocate memory with least memory * utilization and performance penalty. * - * Return: - * Return allocated page or page fragment, otherwise return NULL. + * Return: allocated page or page fragment, otherwise return NULL. */ static inline struct page *page_pool_dev_alloc(struct page_pool *pool, unsigned int *offset, @@ -190,8 +188,7 @@ static inline void *page_pool_alloc_va(struct page_pool *pool, * This is just a thin wrapper around the page_pool_alloc() API, and * it returns va of the allocated page or page fragment. * - * Return: - * Return the va for the allocated page or page fragment, otherwise return NULL. + * Return: the va for the allocated page or page fragment, otherwise return NULL. */ static inline void *page_pool_dev_alloc_va(struct page_pool *pool, unsigned int *size) diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index cf199af85c52e..22c5ab4269d76 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -319,7 +319,7 @@ tcf_exts_hw_stats_update(const struct tcf_exts *exts, * tcf_exts_has_actions - check if at least one action is present * @exts: tc filter extensions handle * - * Returns true if at least one action is present. + * Returns: true if at least one action is present. */ static inline bool tcf_exts_has_actions(struct tcf_exts *exts) { @@ -501,7 +501,7 @@ int __tcf_em_tree_match(struct sk_buff *, struct tcf_ematch_tree *, * through all ematches respecting their logic relations returning * as soon as the result is obvious. * - * Returns 1 if the ematch tree as-one matches, no ematches are configured + * Returns: 1 if the ematch tree as-one matches, no ematches are configured * or ematch is not enabled in the kernel, otherwise 0 is returned. */ static inline int tcf_em_tree_match(struct sk_buff *skb, diff --git a/include/net/tcp.h b/include/net/tcp.h index e9b37b76e894b..5b2b04835688f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1817,7 +1817,7 @@ int tcp_sigpool_hash_skb_data(struct tcp_sigpool *hp, * @id: tcp_sigpool that was previously allocated by tcp_sigpool_alloc_ahash() * @c: returned tcp_sigpool for usage (uninitialized on failure) * - * Returns 0 on success, error otherwise. + * Returns: 0 on success, error otherwise. */ int tcp_sigpool_start(unsigned int id, struct tcp_sigpool *c); /** From 9234a37a495dc34cece943bec495ab541e4143ab Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 5 Dec 2024 16:40:50 +0100 Subject: [PATCH 0320/1450] vxlan: In vxlan_rcv(), access flags through the vxlan netdevice vxlan_sock.flags is constructed from vxlan_dev.cfg.flags, as the subset of flags (named VXLAN_F_RCV_FLAGS) that is important from the point of view of socket sharing. Attempts to reconfigure these flags during the vxlan netdev lifetime are also bounced. It is therefore immaterial whether we access the flags through the vxlan_dev or through the socket. Convert the socket accesses to netdevice accesses in this separate patch to make the conversions that take place in the following patches more obvious. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/5d237ffd731055e524d7b7c436de43358d8743d2.1733412063.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_core.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index b46a799bd3904..1ac2dcdd493ef 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -1717,7 +1717,7 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) /* For backwards compatibility, only allow reserved fields to be * used by VXLAN extensions if explicitly requested. */ - if (vs->flags & VXLAN_F_GPE) { + if (vxlan->cfg.flags & VXLAN_F_GPE) { if (!vxlan_parse_gpe_proto(&unparsed, &protocol)) goto drop; unparsed.vx_flags &= ~VXLAN_GPE_USED_BITS; @@ -1730,8 +1730,8 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) goto drop; } - if (vs->flags & VXLAN_F_REMCSUM_RX) { - reason = vxlan_remcsum(&unparsed, skb, vs->flags); + if (vxlan->cfg.flags & VXLAN_F_REMCSUM_RX) { + reason = vxlan_remcsum(&unparsed, skb, vxlan->cfg.flags); if (unlikely(reason)) goto drop; } @@ -1756,8 +1756,8 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) memset(md, 0, sizeof(*md)); } - if (vs->flags & VXLAN_F_GBP) - vxlan_parse_gbp_hdr(&unparsed, skb, vs->flags, md); + if (vxlan->cfg.flags & VXLAN_F_GBP) + vxlan_parse_gbp_hdr(&unparsed, skb, vxlan->cfg.flags, md); /* Note that GBP and GPE can never be active together. This is * ensured in vxlan_dev_configure. */ From 0f09ae907818d593e55c4b058d286a0914a43c3f Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 5 Dec 2024 16:40:51 +0100 Subject: [PATCH 0321/1450] vxlan: vxlan_rcv() callees: Move clearing of unparsed flags out In order to migrate away from the use of unparsed to detect invalid flags, move all the code that actually clears the flags from callees directly to vxlan_rcv(). Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/2857871d929375c881b9defe378473c8200ead9b.1733412063.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_core.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 1ac2dcdd493ef..c2254b0ac9645 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -1562,7 +1562,7 @@ static enum skb_drop_reason vxlan_remcsum(struct vxlanhdr *unparsed, size_t start, offset; if (!(unparsed->vx_flags & VXLAN_HF_RCO) || skb->remcsum_offload) - goto out; + return SKB_NOT_DROPPED_YET; start = vxlan_rco_start(unparsed->vx_vni); offset = start + vxlan_rco_offset(unparsed->vx_vni); @@ -1573,10 +1573,6 @@ static enum skb_drop_reason vxlan_remcsum(struct vxlanhdr *unparsed, skb_remcsum_process(skb, (void *)(vxlan_hdr(skb) + 1), start, offset, !!(vxflags & VXLAN_F_REMCSUM_NOPARTIAL)); -out: - unparsed->vx_flags &= ~VXLAN_HF_RCO; - unparsed->vx_vni &= VXLAN_VNI_MASK; - return SKB_NOT_DROPPED_YET; } @@ -1588,7 +1584,7 @@ static void vxlan_parse_gbp_hdr(struct vxlanhdr *unparsed, struct metadata_dst *tun_dst; if (!(unparsed->vx_flags & VXLAN_HF_GBP)) - goto out; + return; md->gbp = ntohs(gbp->policy_id); @@ -1607,8 +1603,6 @@ static void vxlan_parse_gbp_hdr(struct vxlanhdr *unparsed, /* In flow-based mode, GBP is carried in dst_metadata */ if (!(vxflags & VXLAN_F_COLLECT_METADATA)) skb->mark = md->gbp; -out: - unparsed->vx_flags &= ~VXLAN_GBP_USED_BITS; } static enum skb_drop_reason vxlan_set_mac(struct vxlan_dev *vxlan, @@ -1734,6 +1728,8 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) reason = vxlan_remcsum(&unparsed, skb, vxlan->cfg.flags); if (unlikely(reason)) goto drop; + unparsed.vx_flags &= ~VXLAN_HF_RCO; + unparsed.vx_vni &= VXLAN_VNI_MASK; } if (vxlan_collect_metadata(vs)) { @@ -1756,8 +1752,10 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) memset(md, 0, sizeof(*md)); } - if (vxlan->cfg.flags & VXLAN_F_GBP) + if (vxlan->cfg.flags & VXLAN_F_GBP) { vxlan_parse_gbp_hdr(&unparsed, skb, vxlan->cfg.flags, md); + unparsed.vx_flags &= ~VXLAN_GBP_USED_BITS; + } /* Note that GBP and GPE can never be active together. This is * ensured in vxlan_dev_configure. */ From fe3dcbcfae522fae9c62954488398562ff6b5ece Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 5 Dec 2024 16:40:52 +0100 Subject: [PATCH 0322/1450] vxlan: vxlan_rcv() callees: Drop the unparsed argument The functions vxlan_remcsum() and vxlan_parse_gbp_hdr() take both the SKB and the unparsed VXLAN header. Now that unparsed adjustment is handled directly by vxlan_rcv(), drop this argument, and have the function derive it from the SKB on its own. vxlan_parse_gpe_proto() does not take SKB, so keep the header parameter. However const it so that it's clear that the intention is that it does not get changed. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/5ea651f4e06485ba1a84a8eb556a457c39f0dfd4.1733412063.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_core.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index c2254b0ac9645..37c20cf93f92b 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -622,9 +622,9 @@ static int vxlan_fdb_append(struct vxlan_fdb *f, return 1; } -static bool vxlan_parse_gpe_proto(struct vxlanhdr *hdr, __be16 *protocol) +static bool vxlan_parse_gpe_proto(const struct vxlanhdr *hdr, __be16 *protocol) { - struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)hdr; + const struct vxlanhdr_gpe *gpe = (const struct vxlanhdr_gpe *)hdr; /* Need to have Next Protocol set for interfaces in GPE mode. */ if (!gpe->np_applied) @@ -1554,18 +1554,17 @@ static void vxlan_sock_release(struct vxlan_dev *vxlan) #endif } -static enum skb_drop_reason vxlan_remcsum(struct vxlanhdr *unparsed, - struct sk_buff *skb, - u32 vxflags) +static enum skb_drop_reason vxlan_remcsum(struct sk_buff *skb, u32 vxflags) { + const struct vxlanhdr *vh = vxlan_hdr(skb); enum skb_drop_reason reason; size_t start, offset; - if (!(unparsed->vx_flags & VXLAN_HF_RCO) || skb->remcsum_offload) + if (!(vh->vx_flags & VXLAN_HF_RCO) || skb->remcsum_offload) return SKB_NOT_DROPPED_YET; - start = vxlan_rco_start(unparsed->vx_vni); - offset = start + vxlan_rco_offset(unparsed->vx_vni); + start = vxlan_rco_start(vh->vx_vni); + offset = start + vxlan_rco_offset(vh->vx_vni); reason = pskb_may_pull_reason(skb, offset + sizeof(u16)); if (reason) @@ -1576,14 +1575,16 @@ static enum skb_drop_reason vxlan_remcsum(struct vxlanhdr *unparsed, return SKB_NOT_DROPPED_YET; } -static void vxlan_parse_gbp_hdr(struct vxlanhdr *unparsed, - struct sk_buff *skb, u32 vxflags, +static void vxlan_parse_gbp_hdr(struct sk_buff *skb, u32 vxflags, struct vxlan_metadata *md) { - struct vxlanhdr_gbp *gbp = (struct vxlanhdr_gbp *)unparsed; + const struct vxlanhdr *vh = vxlan_hdr(skb); + const struct vxlanhdr_gbp *gbp; struct metadata_dst *tun_dst; - if (!(unparsed->vx_flags & VXLAN_HF_GBP)) + gbp = (const struct vxlanhdr_gbp *)vh; + + if (!(vh->vx_flags & VXLAN_HF_GBP)) return; md->gbp = ntohs(gbp->policy_id); @@ -1712,7 +1713,7 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) * used by VXLAN extensions if explicitly requested. */ if (vxlan->cfg.flags & VXLAN_F_GPE) { - if (!vxlan_parse_gpe_proto(&unparsed, &protocol)) + if (!vxlan_parse_gpe_proto(vxlan_hdr(skb), &protocol)) goto drop; unparsed.vx_flags &= ~VXLAN_GPE_USED_BITS; raw_proto = true; @@ -1725,7 +1726,7 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) } if (vxlan->cfg.flags & VXLAN_F_REMCSUM_RX) { - reason = vxlan_remcsum(&unparsed, skb, vxlan->cfg.flags); + reason = vxlan_remcsum(skb, vxlan->cfg.flags); if (unlikely(reason)) goto drop; unparsed.vx_flags &= ~VXLAN_HF_RCO; @@ -1753,7 +1754,7 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) } if (vxlan->cfg.flags & VXLAN_F_GBP) { - vxlan_parse_gbp_hdr(&unparsed, skb, vxlan->cfg.flags, md); + vxlan_parse_gbp_hdr(skb, vxlan->cfg.flags, md); unparsed.vx_flags &= ~VXLAN_GBP_USED_BITS; } /* Note that GBP and GPE can never be active together. This is From e713130dfb4d6b5a2cd42f33a94b6ac983d2989d Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 5 Dec 2024 16:40:53 +0100 Subject: [PATCH 0323/1450] vxlan: vxlan_rcv(): Extract vxlan_hdr(skb) to a named variable Having a named reference to the VXLAN header is more handy than having to conjure it anew through vxlan_hdr() on every use. Add a new variable and convert several open-coded sites. Additionally, convert one "unparsed" use to the new variable as well. Thus the only "unparsed" uses that remain are the flag-clearing and the header validity check at the end. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Mateusz Polchlopek Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/2a0a940e883c435a0fdbcdc1d03c4858957ad00e.1733412063.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_core.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 37c20cf93f92b..f114568b67a3f 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -1667,6 +1667,7 @@ static bool vxlan_ecn_decapsulate(struct vxlan_sock *vs, void *oiph, static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) { struct vxlan_vni_node *vninode = NULL; + const struct vxlanhdr *vh; struct vxlan_dev *vxlan; struct vxlan_sock *vs; struct vxlanhdr unparsed; @@ -1685,11 +1686,11 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) goto drop; unparsed = *vxlan_hdr(skb); + vh = vxlan_hdr(skb); /* VNI flag always required to be set */ - if (!(unparsed.vx_flags & VXLAN_HF_VNI)) { + if (!(vh->vx_flags & VXLAN_HF_VNI)) { netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n", - ntohl(vxlan_hdr(skb)->vx_flags), - ntohl(vxlan_hdr(skb)->vx_vni)); + ntohl(vh->vx_flags), ntohl(vh->vx_vni)); reason = SKB_DROP_REASON_VXLAN_INVALID_HDR; /* Return non vxlan pkt */ goto drop; @@ -1701,7 +1702,7 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) if (!vs) goto drop; - vni = vxlan_vni(vxlan_hdr(skb)->vx_vni); + vni = vxlan_vni(vh->vx_vni); vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni, &vninode); if (!vxlan) { @@ -1713,7 +1714,7 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) * used by VXLAN extensions if explicitly requested. */ if (vxlan->cfg.flags & VXLAN_F_GPE) { - if (!vxlan_parse_gpe_proto(vxlan_hdr(skb), &protocol)) + if (!vxlan_parse_gpe_proto(vh, &protocol)) goto drop; unparsed.vx_flags &= ~VXLAN_GPE_USED_BITS; raw_proto = true; From e4f8647767cfac0291def86ddfac23b925294701 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 5 Dec 2024 16:40:54 +0100 Subject: [PATCH 0324/1450] vxlan: Track reserved bits explicitly as part of the configuration In order to make it possible to configure which bits in VXLAN header should be considered reserved, introduce a new field vxlan_config::reserved_bits. Have it cover the whole header, except for the VNI-present bit and the bits for VNI itself, and have individual enabled features clear more bits off reserved_bits. (This is expressed as first constructing a used_bits set, and then inverting it to get the reserved_bits. The set of used_bits will be useful on its own for validation of user-set reserved_bits in a following patch.) The patch also moves a comment relevant to the validation from the unparsed validation site up to the new site. Logically this patch should add the new comment, and a later patch that removes the unparsed bits would remove the old comment. But keeping both legs in the same patch is better from the history spelunking point of view. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/984dbf98d5940d3900268dbffaf70961f731d4a4.1733412063.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_core.c | 41 +++++++++++++++++++++++++--------- include/net/vxlan.h | 1 + 2 files changed, 31 insertions(+), 11 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index f114568b67a3f..e50f4cb70193f 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -1710,9 +1710,20 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) goto drop; } - /* For backwards compatibility, only allow reserved fields to be - * used by VXLAN extensions if explicitly requested. - */ + if (vh->vx_flags & vxlan->cfg.reserved_bits.vx_flags || + vh->vx_vni & vxlan->cfg.reserved_bits.vx_vni) { + /* If the header uses bits besides those enabled by the + * netdevice configuration, treat this as a malformed packet. + * This behavior diverges from VXLAN RFC (RFC7348) which + * stipulates that bits in reserved in reserved fields are to be + * ignored. The approach here maintains compatibility with + * previous stack code, and also is more robust and provides a + * little more security in adding extensions to VXLAN. + */ + reason = SKB_DROP_REASON_VXLAN_INVALID_HDR; + goto drop; + } + if (vxlan->cfg.flags & VXLAN_F_GPE) { if (!vxlan_parse_gpe_proto(vh, &protocol)) goto drop; @@ -1763,14 +1774,6 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) */ if (unparsed.vx_flags || unparsed.vx_vni) { - /* If there are any unprocessed flags remaining treat - * this as a malformed packet. This behavior diverges from - * VXLAN RFC (RFC7348) which stipulates that bits in reserved - * in reserved fields are to be ignored. The approach here - * maintains compatibility with previous stack code, and also - * is more robust and provides a little more security in - * adding extensions to VXLAN. - */ reason = SKB_DROP_REASON_VXLAN_INVALID_HDR; goto drop; } @@ -4070,6 +4073,10 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], struct net_device *dev, struct vxlan_config *conf, bool changelink, struct netlink_ext_ack *extack) { + struct vxlanhdr used_bits = { + .vx_flags = VXLAN_HF_VNI, + .vx_vni = VXLAN_VNI_MASK, + }; struct vxlan_dev *vxlan = netdev_priv(dev); int err = 0; @@ -4296,6 +4303,8 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], extack); if (err) return err; + used_bits.vx_flags |= VXLAN_HF_RCO; + used_bits.vx_vni |= ~VXLAN_VNI_MASK; } if (data[IFLA_VXLAN_GBP]) { @@ -4303,6 +4312,7 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], VXLAN_F_GBP, changelink, false, extack); if (err) return err; + used_bits.vx_flags |= VXLAN_GBP_USED_BITS; } if (data[IFLA_VXLAN_GPE]) { @@ -4311,8 +4321,17 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], extack); if (err) return err; + + used_bits.vx_flags |= VXLAN_GPE_USED_BITS; } + /* For backwards compatibility, only allow reserved fields to be + * used by VXLAN extensions if explicitly requested. + */ + conf->reserved_bits = (struct vxlanhdr) { + .vx_flags = ~used_bits.vx_flags, + .vx_vni = ~used_bits.vx_vni, + }; if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_NOPARTIAL, VXLAN_F_REMCSUM_NOPARTIAL, changelink, diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 33ba6fc151cf8..2dd23ee2bacdd 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -227,6 +227,7 @@ struct vxlan_config { unsigned int addrmax; bool no_share; enum ifla_vxlan_df df; + struct vxlanhdr reserved_bits; }; enum { From 752b1c8d8b409f2b03e61e153696689ee081bf07 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 5 Dec 2024 16:40:55 +0100 Subject: [PATCH 0325/1450] vxlan: Bump error counters for header mismatches The VXLAN driver so far has not increased the error counters for packets that set reserved bits. It does so for other packet errors, so do it for this case as well. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/d096084167d56706d620afe5136cf37a2d34d1b9.1733412063.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_core.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index e50f4cb70193f..92832a396ab79 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -1721,6 +1721,10 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) * little more security in adding extensions to VXLAN. */ reason = SKB_DROP_REASON_VXLAN_INVALID_HDR; + DEV_STATS_INC(vxlan->dev, rx_frame_errors); + DEV_STATS_INC(vxlan->dev, rx_errors); + vxlan_vnifilter_count(vxlan, vni, vninode, + VXLAN_VNI_STATS_RX_ERRORS, 0); goto drop; } From bb16786ed6fdff3a67ba33ed928ae138fd4254b5 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 5 Dec 2024 16:40:56 +0100 Subject: [PATCH 0326/1450] vxlan: vxlan_rcv(): Drop unparsed The code currently validates the VXLAN header in two ways: first by comparing it with the set of reserved bits, constructed ahead of time during the netdevice construction; and second by gradually clearing the bits off a separate copy of VXLAN header, "unparsed". Drop the latter validation method. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/4559f16c5664c189b3a4ee6f5da91f552ad4821c.1733412063.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_core.c | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 92832a396ab79..ff5684a2103a8 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -1670,7 +1670,6 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) const struct vxlanhdr *vh; struct vxlan_dev *vxlan; struct vxlan_sock *vs; - struct vxlanhdr unparsed; struct vxlan_metadata _md; struct vxlan_metadata *md = &_md; __be16 protocol = htons(ETH_P_TEB); @@ -1685,7 +1684,6 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) if (reason) goto drop; - unparsed = *vxlan_hdr(skb); vh = vxlan_hdr(skb); /* VNI flag always required to be set */ if (!(vh->vx_flags & VXLAN_HF_VNI)) { @@ -1695,8 +1693,6 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) /* Return non vxlan pkt */ goto drop; } - unparsed.vx_flags &= ~VXLAN_HF_VNI; - unparsed.vx_vni &= ~VXLAN_VNI_MASK; vs = rcu_dereference_sk_user_data(sk); if (!vs) @@ -1731,7 +1727,6 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) if (vxlan->cfg.flags & VXLAN_F_GPE) { if (!vxlan_parse_gpe_proto(vh, &protocol)) goto drop; - unparsed.vx_flags &= ~VXLAN_GPE_USED_BITS; raw_proto = true; } @@ -1745,8 +1740,6 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) reason = vxlan_remcsum(skb, vxlan->cfg.flags); if (unlikely(reason)) goto drop; - unparsed.vx_flags &= ~VXLAN_HF_RCO; - unparsed.vx_vni &= VXLAN_VNI_MASK; } if (vxlan_collect_metadata(vs)) { @@ -1769,19 +1762,12 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) memset(md, 0, sizeof(*md)); } - if (vxlan->cfg.flags & VXLAN_F_GBP) { + if (vxlan->cfg.flags & VXLAN_F_GBP) vxlan_parse_gbp_hdr(skb, vxlan->cfg.flags, md); - unparsed.vx_flags &= ~VXLAN_GBP_USED_BITS; - } /* Note that GBP and GPE can never be active together. This is * ensured in vxlan_dev_configure. */ - if (unparsed.vx_flags || unparsed.vx_vni) { - reason = SKB_DROP_REASON_VXLAN_INVALID_HDR; - goto drop; - } - if (!raw_proto) { reason = vxlan_set_mac(vxlan, vs, skb, vni); if (reason) From 6c11379b104e3718135fd7fc37bb254b41e4cf65 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 5 Dec 2024 16:40:57 +0100 Subject: [PATCH 0327/1450] vxlan: Add an attribute to make VXLAN header validation configurable The set of bits that the VXLAN netdevice currently considers reserved is defined by the features enabled at the netdevice construction. In order to make this configurable, add an attribute, IFLA_VXLAN_RESERVED_BITS. The payload is a pair of big-endian u32's covering the VXLAN header. This is validated against the set of flags used by the various enabled VXLAN features, and attempts to override bits used by an enabled feature are bounced. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/c657275e5ceed301e62c69fe8e559e32909442e2.1733412063.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_core.c | 53 +++++++++++++++++++++++++++++----- include/uapi/linux/if_link.h | 1 + 2 files changed, 47 insertions(+), 7 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index ff5684a2103a8..43cf672b7b9fc 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -3428,6 +3428,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_VNIFILTER] = { .type = NLA_U8 }, [IFLA_VXLAN_LOCALBYPASS] = NLA_POLICY_MAX(NLA_U8, 1), [IFLA_VXLAN_LABEL_POLICY] = NLA_POLICY_MAX(NLA_U32, VXLAN_LABEL_MAX), + [IFLA_VXLAN_RESERVED_BITS] = NLA_POLICY_EXACT_LEN(sizeof(struct vxlanhdr)), }; static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[], @@ -4315,13 +4316,44 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], used_bits.vx_flags |= VXLAN_GPE_USED_BITS; } - /* For backwards compatibility, only allow reserved fields to be - * used by VXLAN extensions if explicitly requested. - */ - conf->reserved_bits = (struct vxlanhdr) { - .vx_flags = ~used_bits.vx_flags, - .vx_vni = ~used_bits.vx_vni, - }; + if (data[IFLA_VXLAN_RESERVED_BITS]) { + struct vxlanhdr reserved_bits; + + if (changelink) { + NL_SET_ERR_MSG_ATTR(extack, + data[IFLA_VXLAN_RESERVED_BITS], + "Cannot change reserved_bits"); + return -EOPNOTSUPP; + } + + nla_memcpy(&reserved_bits, data[IFLA_VXLAN_RESERVED_BITS], + sizeof(reserved_bits)); + if (used_bits.vx_flags & reserved_bits.vx_flags || + used_bits.vx_vni & reserved_bits.vx_vni) { + __be64 ub_be64, rb_be64; + + memcpy(&ub_be64, &used_bits, sizeof(ub_be64)); + memcpy(&rb_be64, &reserved_bits, sizeof(rb_be64)); + + NL_SET_ERR_MSG_ATTR_FMT(extack, + data[IFLA_VXLAN_RESERVED_BITS], + "Used bits %#018llx cannot overlap reserved bits %#018llx", + be64_to_cpu(ub_be64), + be64_to_cpu(rb_be64)); + return -EINVAL; + } + + conf->reserved_bits = reserved_bits; + } else { + /* For backwards compatibility, only allow reserved fields to be + * used by VXLAN extensions if explicitly requested. + */ + conf->reserved_bits = (struct vxlanhdr) { + .vx_flags = ~used_bits.vx_flags, + .vx_vni = ~used_bits.vx_vni, + }; + } + if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_NOPARTIAL, VXLAN_F_REMCSUM_NOPARTIAL, changelink, @@ -4506,6 +4538,8 @@ static size_t vxlan_get_size(const struct net_device *dev) nla_total_size(0) + /* IFLA_VXLAN_GPE */ nla_total_size(0) + /* IFLA_VXLAN_REMCSUM_NOPARTIAL */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_VNIFILTER */ + /* IFLA_VXLAN_RESERVED_BITS */ + nla_total_size(sizeof(struct vxlanhdr)) + 0; } @@ -4608,6 +4642,11 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) !!(vxlan->cfg.flags & VXLAN_F_VNIFILTER))) goto nla_put_failure; + if (nla_put(skb, IFLA_VXLAN_RESERVED_BITS, + sizeof(vxlan->cfg.reserved_bits), + &vxlan->cfg.reserved_bits)) + goto nla_put_failure; + return 0; nla_put_failure: diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 2575e0cd9b482..77730c340c8f3 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1394,6 +1394,7 @@ enum { IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */ IFLA_VXLAN_LOCALBYPASS, IFLA_VXLAN_LABEL_POLICY, /* IPv6 flow label policy; ifla_vxlan_label_policy */ + IFLA_VXLAN_RESERVED_BITS, __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) From 8653eb21d68c6882ce5716b04379431817310b85 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 5 Dec 2024 16:40:58 +0100 Subject: [PATCH 0328/1450] selftests: net: lib: Rename ip_link_master() to ip_link_set_master() Let's have a verb in that function name to make it clearer what's going on. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/fbf7c53a429b340b9cff5831280ea8c305a224f9.1733412063.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fdb_notify.sh | 6 +++--- tools/testing/selftests/net/lib.sh | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/fdb_notify.sh b/tools/testing/selftests/net/fdb_notify.sh index c03151e7791c6..c159230c9b623 100755 --- a/tools/testing/selftests/net/fdb_notify.sh +++ b/tools/testing/selftests/net/fdb_notify.sh @@ -49,7 +49,7 @@ test_dup_vxlan_self() { ip_link_add br up type bridge vlan_filtering 1 ip_link_add vx up type vxlan id 2000 dstport 4789 - ip_link_master vx br + ip_link_set_master vx br do_test_dup add "vxlan" dev vx self dst 192.0.2.1 do_test_dup del "vxlan" dev vx self dst 192.0.2.1 @@ -59,7 +59,7 @@ test_dup_vxlan_master() { ip_link_add br up type bridge vlan_filtering 1 ip_link_add vx up type vxlan id 2000 dstport 4789 - ip_link_master vx br + ip_link_set_master vx br do_test_dup add "vxlan master" dev vx master do_test_dup del "vxlan master" dev vx master @@ -79,7 +79,7 @@ test_dup_macvlan_master() ip_link_add br up type bridge vlan_filtering 1 ip_link_add dd up type dummy ip_link_add mv up link dd type macvlan mode passthru - ip_link_master mv br + ip_link_set_master mv br do_test_dup add "macvlan master" dev mv self do_test_dup del "macvlan master" dev mv self diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index 8994fec1c38f6..5ea6537acd2b2 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -451,7 +451,7 @@ ip_link_add() defer ip link del dev "$name" } -ip_link_master() +ip_link_set_master() { local member=$1; shift local master=$1; shift From d76ccb2ec368c8a44f64839140cd253c19f6a79a Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 5 Dec 2024 16:40:59 +0100 Subject: [PATCH 0329/1450] selftests: net: lib: Add several autodefer helpers Add ip_link_set_addr(), ip_link_set_up(), ip_addr_add() and ip_route_add() to the suite of helpers that automatically schedule a corresponding cleanup. When setting a new MAC, one needs to remember the old address first. Move mac_get() from forwarding/ to that end. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/add6bcbe30828fd01363266df20c338cf13aaf25.1733412063.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/lib.sh | 7 ---- tools/testing/selftests/net/lib.sh | 39 +++++++++++++++++++ 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 7337f398f9cc9..1fd40bada6949 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -932,13 +932,6 @@ packets_rate() echo $(((t1 - t0) / interval)) } -mac_get() -{ - local if_name=$1 - - ip -j link show dev $if_name | jq -r '.[]["address"]' -} - ether_addr_to_u64() { local addr="$1" diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index 5ea6537acd2b2..2cd5c743b2d99 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -435,6 +435,13 @@ xfail_on_veth() fi } +mac_get() +{ + local if_name=$1 + + ip -j link show dev $if_name | jq -r '.[]["address"]' +} + kill_process() { local pid=$1; shift @@ -459,3 +466,35 @@ ip_link_set_master() ip link set dev "$member" master "$master" defer ip link set dev "$member" nomaster } + +ip_link_set_addr() +{ + local name=$1; shift + local addr=$1; shift + + local old_addr=$(mac_get "$name") + ip link set dev "$name" address "$addr" + defer ip link set dev "$name" address "$old_addr" +} + +ip_link_set_up() +{ + local name=$1; shift + + ip link set dev "$name" up + defer ip link set dev "$name" down +} + +ip_addr_add() +{ + local name=$1; shift + + ip addr add dev "$name" "$@" + defer ip addr del dev "$name" "$@" +} + +ip_route_add() +{ + ip route add "$@" + defer ip route del "$@" +} From d84b5dccf3ebdeeabef910d1c19b931c84f67884 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 5 Dec 2024 16:41:00 +0100 Subject: [PATCH 0330/1450] selftests: forwarding: Add a selftest for the new reserved_bits UAPI Run VXLAN packets through a gateway. Flip individual bits of the packet and/or reserved bits of the gateway, and check that the gateway treats the packets as expected. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/388bef3c30ebc887d4e64cd86a362e2df2f2d2e1.1733412063.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- .../testing/selftests/net/forwarding/Makefile | 1 + .../net/forwarding/vxlan_reserved.sh | 352 ++++++++++++++++++ 2 files changed, 353 insertions(+) create mode 100755 tools/testing/selftests/net/forwarding/vxlan_reserved.sh diff --git a/tools/testing/selftests/net/forwarding/Makefile b/tools/testing/selftests/net/forwarding/Makefile index 7d885cff8d79b..00bde7b6f39ee 100644 --- a/tools/testing/selftests/net/forwarding/Makefile +++ b/tools/testing/selftests/net/forwarding/Makefile @@ -105,6 +105,7 @@ TEST_PROGS = bridge_fdb_learning_limit.sh \ vxlan_bridge_1q_port_8472_ipv6.sh \ vxlan_bridge_1q_port_8472.sh \ vxlan_bridge_1q.sh \ + vxlan_reserved.sh \ vxlan_symmetric_ipv6.sh \ vxlan_symmetric.sh diff --git a/tools/testing/selftests/net/forwarding/vxlan_reserved.sh b/tools/testing/selftests/net/forwarding/vxlan_reserved.sh new file mode 100755 index 0000000000000..46c31794b91b3 --- /dev/null +++ b/tools/testing/selftests/net/forwarding/vxlan_reserved.sh @@ -0,0 +1,352 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# +--------------------+ +# | H1 (vrf) | +# | + $h1 | +# | | 192.0.2.1/28 | +# +----|---------------+ +# | +# +----|--------------------------------+ +# | SW | | +# | +--|------------------------------+ | +# | | + $swp1 BR1 (802.1d) | | +# | | | | +# | | + vx1 (vxlan) | | +# | | local 192.0.2.17 | | +# | | id 1000 dstport $VXPORT | | +# | +---------------------------------+ | +# | | +# | 192.0.2.32/28 via 192.0.2.18 | +# | | +# | + $rp1 | +# | | 192.0.2.17/28 | +# +--|----------------------------------+ +# | +# +--|----------------------------------+ +# | | | +# | + $rp2 | +# | 192.0.2.18/28 | +# | | +# | VRP2 (vrf) | +# +-------------------------------------+ + +: ${VXPORT:=4789} +: ${ALL_TESTS:=" + default_test + plain_test + reserved_0_test + reserved_10_test + reserved_31_test + reserved_56_test + reserved_63_test + "} + +NUM_NETIFS=4 +source lib.sh + +h1_create() +{ + simple_if_init $h1 192.0.2.1/28 + defer simple_if_fini $h1 192.0.2.1/28 + + tc qdisc add dev $h1 clsact + defer tc qdisc del dev $h1 clsact + + tc filter add dev $h1 ingress pref 77 \ + prot ip flower skip_hw ip_proto icmp action drop + defer tc filter del dev $h1 ingress pref 77 +} + +switch_create() +{ + ip_link_add br1 type bridge vlan_filtering 0 mcast_snooping 0 + # Make sure the bridge uses the MAC address of the local port and not + # that of the VxLAN's device. + ip_link_set_addr br1 $(mac_get $swp1) + ip_link_set_up br1 + + ip_link_set_up $rp1 + ip_addr_add $rp1 192.0.2.17/28 + ip_route_add 192.0.2.32/28 nexthop via 192.0.2.18 + + ip_link_set_master $swp1 br1 + ip_link_set_up $swp1 +} + +vrp2_create() +{ + simple_if_init $rp2 192.0.2.18/28 + defer simple_if_fini $rp2 192.0.2.18/28 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + swp1=${NETIFS[p2]} + + rp1=${NETIFS[p3]} + rp2=${NETIFS[p4]} + + vrf_prepare + defer vrf_cleanup + + forwarding_enable + defer forwarding_restore + + h1_create + switch_create + + vrp2_create +} + +vxlan_header_bytes() +{ + local vni=$1; shift + local -a extra_bits=("$@") + local -a bits + local i + + for ((i=0; i < 64; i++)); do + bits[i]=0 + done + + # Bit 4 is the I flag and is always on. + bits[4]=1 + + for i in ${extra_bits[@]}; do + bits[i]=1 + done + + # Bits 32..55 carry the VNI + local mask=0x800000 + for ((i=0; i < 24; i++)); do + bits[$((i + 32))]=$(((vni & mask) != 0)) + ((mask >>= 1)) + done + + local bytes + for ((i=0; i < 8; i++)); do + local byte=0 + local j + for ((j=0; j < 8; j++)); do + local bit=${bits[8 * i + j]} + ((byte += bit << (7 - j))) + done + bytes+=$(printf %02x $byte): + done + + echo ${bytes%:} +} + +neg_bytes() +{ + local bytes=$1; shift + + local -A neg=([0]=f [1]=e [2]=d [3]=c [4]=b [5]=a [6]=9 [7]=8 + [8]=7 [9]=6 [a]=5 [b]=4 [c]=3 [d]=2 [e]=1 [f]=0 [:]=:) + local out + local i + + for ((i=0; i < ${#bytes}; i++)); do + local c=${bytes:$i:1} + out+=${neg[$c]} + done + echo $out +} + +vxlan_ping_do() +{ + local count=$1; shift + local dev=$1; shift + local next_hop_mac=$1; shift + local dest_ip=$1; shift + local dest_mac=$1; shift + local vni=$1; shift + local reserved_bits=$1; shift + + local vxlan_header=$(vxlan_header_bytes $vni $reserved_bits) + + $MZ $dev -c $count -d 100msec -q \ + -b $next_hop_mac -B $dest_ip \ + -t udp sp=23456,dp=$VXPORT,p=$(: + )"$vxlan_header:"$( : VXLAN + )"$dest_mac:"$( : ETH daddr + )"00:11:22:33:44:55:"$( : ETH saddr + )"08:00:"$( : ETH type + )"45:"$( : IP version + IHL + )"00:"$( : IP TOS + )"00:54:"$( : IP total length + )"99:83:"$( : IP identification + )"40:00:"$( : IP flags + frag off + )"40:"$( : IP TTL + )"01:"$( : IP proto + )"00:00:"$( : IP header csum + )"$(ipv4_to_bytes 192.0.2.3):"$( : IP saddr + )"$(ipv4_to_bytes 192.0.2.1):"$( : IP daddr + )"08:"$( : ICMP type + )"00:"$( : ICMP code + )"8b:f2:"$( : ICMP csum + )"1f:6a:"$( : ICMP request identifier + )"00:01:"$( : ICMP request seq. number + )"4f:ff:c5:5b:00:00:00:00:"$( : ICMP payload + )"6d:74:0b:00:00:00:00:00:"$( : + )"10:11:12:13:14:15:16:17:"$( : + )"18:19:1a:1b:1c:1d:1e:1f:"$( : + )"20:21:22:23:24:25:26:27:"$( : + )"28:29:2a:2b:2c:2d:2e:2f:"$( : + )"30:31:32:33:34:35:36:37" +} + +vxlan_device_add() +{ + ip_link_add vx1 up type vxlan id 1000 \ + local 192.0.2.17 dstport "$VXPORT" \ + nolearning noudpcsum tos inherit ttl 100 "$@" + ip_link_set_master vx1 br1 +} + +vxlan_all_reserved_bits() +{ + local i + + for ((i=0; i < 64; i++)); do + if ((i == 4 || i >= 32 && i < 56)); then + continue + fi + echo $i + done +} + +vxlan_ping_vanilla() +{ + vxlan_ping_do 10 $rp2 $(mac_get $rp1) 192.0.2.17 $(mac_get $h1) 1000 +} + +vxlan_ping_reserved() +{ + for bit in $(vxlan_all_reserved_bits); do + vxlan_ping_do 1 $rp2 $(mac_get $rp1) \ + 192.0.2.17 $(mac_get $h1) 1000 "$bit" + ((n++)) + done +} + +vxlan_ping_test() +{ + local what=$1; shift + local get_stat=$1; shift + local expect=$1; shift + + RET=0 + + local t0=$($get_stat) + + "$@" + check_err $? "Failure when running $@" + + local t1=$($get_stat) + local delta=$((t1 - t0)) + + ((expect == delta)) + check_err $? "Expected to capture $expect packets, got $delta." + + log_test "$what" +} + +__default_test_do() +{ + local n_allowed_bits=$1; shift + local what=$1; shift + + vxlan_ping_test "$what: clean packets" \ + "tc_rule_stats_get $h1 77 ingress" \ + 10 vxlan_ping_vanilla + + local t0=$(link_stats_get vx1 rx errors) + vxlan_ping_test "$what: mangled packets" \ + "tc_rule_stats_get $h1 77 ingress" \ + $n_allowed_bits vxlan_ping_reserved + local t1=$(link_stats_get vx1 rx errors) + + RET=0 + local expect=$((39 - n_allowed_bits)) + local delta=$((t1 - t0)) + ((expect == delta)) + check_err $? "Expected $expect error packets, got $delta." + log_test "$what: drops reported" +} + +default_test_do() +{ + vxlan_device_add + __default_test_do 0 "Default" +} + +default_test() +{ + in_defer_scope \ + default_test_do +} + +plain_test_do() +{ + vxlan_device_add reserved_bits 0xf7ffffff000000ff + __default_test_do 0 "reserved_bits 0xf7ffffff000000ff" +} + +plain_test() +{ + in_defer_scope \ + plain_test_do +} + +reserved_test() +{ + local bit=$1; shift + + local allowed_bytes=$(vxlan_header_bytes 0xffffff $bit) + local reserved_bytes=$(neg_bytes $allowed_bytes) + local reserved_bits=${reserved_bytes//:/} + + vxlan_device_add reserved_bits 0x$reserved_bits + __default_test_do 1 "reserved_bits 0x$reserved_bits" +} + +reserved_0_test() +{ + in_defer_scope \ + reserved_test 0 +} + +reserved_10_test() +{ + in_defer_scope \ + reserved_test 10 +} + +reserved_31_test() +{ + in_defer_scope \ + reserved_test 31 +} + +reserved_56_test() +{ + in_defer_scope \ + reserved_test 56 +} + +reserved_63_test() +{ + in_defer_scope \ + reserved_test 63 +} + +trap cleanup EXIT + +setup_prepare +setup_wait +tests_run + +exit $EXIT_STATUS From 8a241ef9b9b86afc087e5b3e4a95cb60f9ee796c Mon Sep 17 00:00:00 2001 From: Shinas Rasheed Date: Thu, 5 Dec 2024 22:41:34 -0800 Subject: [PATCH 0331/1450] octeon_ep: add ndo ops for VFs in PF driver These APIs are needed to support applications that use netlink to get VF information from a PF driver. Signed-off-by: Shinas Rasheed Link: https://patch.msgid.link/20241206064135.2331790-1-srasheed@marvell.com Signed-off-by: Jakub Kicinski --- .../ethernet/marvell/octeon_ep/octep_main.c | 39 +++++++++++++++++++ .../ethernet/marvell/octeon_ep/octep_main.h | 1 + .../marvell/octeon_ep/octep_pfvf_mbox.c | 23 ++++++++++- .../marvell/octeon_ep/octep_pfvf_mbox.h | 6 ++- 4 files changed, 65 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c index 549436efc2048..3a9825883d79a 100644 --- a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c @@ -1137,6 +1137,43 @@ static int octep_set_features(struct net_device *dev, netdev_features_t features return err; } +static int octep_get_vf_config(struct net_device *dev, int vf, + struct ifla_vf_info *ivi) +{ + struct octep_device *oct = netdev_priv(dev); + + ivi->vf = vf; + ether_addr_copy(ivi->mac, oct->vf_info[vf].mac_addr); + ivi->spoofchk = true; + ivi->linkstate = IFLA_VF_LINK_STATE_ENABLE; + ivi->trusted = false; + + return 0; +} + +static int octep_set_vf_mac(struct net_device *dev, int vf, u8 *mac) +{ + struct octep_device *oct = netdev_priv(dev); + int err; + + if (!is_valid_ether_addr(mac)) { + dev_err(&oct->pdev->dev, "Invalid MAC Address %pM\n", mac); + return -EADDRNOTAVAIL; + } + + dev_dbg(&oct->pdev->dev, "set vf-%d mac to %pM\n", vf, mac); + ether_addr_copy(oct->vf_info[vf].mac_addr, mac); + oct->vf_info[vf].flags |= OCTEON_PFVF_FLAG_MAC_SET_BY_PF; + + err = octep_ctrl_net_set_mac_addr(oct, vf, mac, true); + if (err) + dev_err(&oct->pdev->dev, + "Set VF%d MAC address failed via host control Mbox\n", + vf); + + return err; +} + static const struct net_device_ops octep_netdev_ops = { .ndo_open = octep_open, .ndo_stop = octep_stop, @@ -1146,6 +1183,8 @@ static const struct net_device_ops octep_netdev_ops = { .ndo_set_mac_address = octep_set_mac, .ndo_change_mtu = octep_change_mtu, .ndo_set_features = octep_set_features, + .ndo_get_vf_config = octep_get_vf_config, + .ndo_set_vf_mac = octep_set_vf_mac }; /** diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_main.h b/drivers/net/ethernet/marvell/octeon_ep/octep_main.h index fee59e0e0138f..3b56916af4688 100644 --- a/drivers/net/ethernet/marvell/octeon_ep/octep_main.h +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_main.h @@ -220,6 +220,7 @@ struct octep_iface_link_info { /* The Octeon VF device specific info data structure.*/ struct octep_pfvf_info { u8 mac_addr[ETH_ALEN]; + u32 flags; u32 mbox_version; }; diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c b/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c index e6eb98d70f3c4..ebecdd29f3bd0 100644 --- a/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c @@ -156,12 +156,23 @@ static void octep_pfvf_set_mac_addr(struct octep_device *oct, u32 vf_id, { int err; + if (oct->vf_info[vf_id].flags & OCTEON_PFVF_FLAG_MAC_SET_BY_PF) { + dev_err(&oct->pdev->dev, + "VF%d attempted to override administrative set MAC address\n", + vf_id); + rsp->s_set_mac.type = OCTEP_PFVF_MBOX_TYPE_RSP_NACK; + return; + } + err = octep_ctrl_net_set_mac_addr(oct, vf_id, cmd.s_set_mac.mac_addr, true); if (err) { rsp->s_set_mac.type = OCTEP_PFVF_MBOX_TYPE_RSP_NACK; - dev_err(&oct->pdev->dev, "Set VF MAC address failed via host control Mbox\n"); + dev_err(&oct->pdev->dev, "Set VF%d MAC address failed via host control Mbox\n", + vf_id); return; } + + ether_addr_copy(oct->vf_info[vf_id].mac_addr, cmd.s_set_mac.mac_addr); rsp->s_set_mac.type = OCTEP_PFVF_MBOX_TYPE_RSP_ACK; } @@ -171,10 +182,18 @@ static void octep_pfvf_get_mac_addr(struct octep_device *oct, u32 vf_id, { int err; + if (oct->vf_info[vf_id].flags & OCTEON_PFVF_FLAG_MAC_SET_BY_PF) { + dev_dbg(&oct->pdev->dev, "VF%d MAC address set by PF\n", vf_id); + ether_addr_copy(rsp->s_set_mac.mac_addr, + oct->vf_info[vf_id].mac_addr); + rsp->s_set_mac.type = OCTEP_PFVF_MBOX_TYPE_RSP_ACK; + return; + } err = octep_ctrl_net_get_mac_addr(oct, vf_id, rsp->s_set_mac.mac_addr); if (err) { rsp->s_set_mac.type = OCTEP_PFVF_MBOX_TYPE_RSP_NACK; - dev_err(&oct->pdev->dev, "Get VF MAC address failed via host control Mbox\n"); + dev_err(&oct->pdev->dev, "Get VF%d MAC address failed via host control Mbox\n", + vf_id); return; } rsp->s_set_mac.type = OCTEP_PFVF_MBOX_TYPE_RSP_ACK; diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.h b/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.h index 0dc6eead292a3..386a095a99bc4 100644 --- a/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.h +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.h @@ -8,8 +8,6 @@ #ifndef _OCTEP_PFVF_MBOX_H_ #define _OCTEP_PFVF_MBOX_H_ -/* VF flags */ -#define OCTEON_PFVF_FLAG_MAC_SET_BY_PF BIT_ULL(0) /* PF has set VF MAC address */ #define OCTEON_SDP_16K_HW_FRS 16380UL #define OCTEON_SDP_64K_HW_FRS 65531UL @@ -23,6 +21,10 @@ enum octep_pfvf_mbox_version { #define OCTEP_PFVF_MBOX_VERSION_CURRENT OCTEP_PFVF_MBOX_VERSION_V2 +/* VF flags */ +/* PF has set VF MAC address */ +#define OCTEON_PFVF_FLAG_MAC_SET_BY_PF BIT(0) + enum octep_pfvf_mbox_opcode { OCTEP_PFVF_MBOX_CMD_VERSION, OCTEP_PFVF_MBOX_CMD_SET_MTU, From afc6e39e824ad0e44b2af50a97885caec8d213d1 Mon Sep 17 00:00:00 2001 From: Dimitri Fedrau Date: Mon, 9 Dec 2024 11:46:15 +0100 Subject: [PATCH 0332/1450] power: supply: gpio-charger: Fix set charge current limits Fix set charge current limits for devices which allow to set the lowest charge current limit to be greater zero. If requested charge current limit is below lowest limit, the index equals current_limit_map_size which leads to accessing memory beyond allocated memory. Fixes: be2919d8355e ("power: supply: gpio-charger: add charge-current-limit feature") Cc: stable@vger.kernel.org Signed-off-by: Dimitri Fedrau Link: https://lore.kernel.org/r/20241209-fix-charge-current-limit-v1-1-760d9b8f2af3@liebherr.com Signed-off-by: Sebastian Reichel --- drivers/power/supply/gpio-charger.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/power/supply/gpio-charger.c b/drivers/power/supply/gpio-charger.c index 68212b39785be..6139f736ecbe4 100644 --- a/drivers/power/supply/gpio-charger.c +++ b/drivers/power/supply/gpio-charger.c @@ -67,6 +67,14 @@ static int set_charge_current_limit(struct gpio_charger *gpio_charger, int val) if (gpio_charger->current_limit_map[i].limit_ua <= val) break; } + + /* + * If a valid charge current limit isn't found, default to smallest + * current limitation for safety reasons. + */ + if (i >= gpio_charger->current_limit_map_size) + i = gpio_charger->current_limit_map_size - 1; + mapping = gpio_charger->current_limit_map[i]; for (i = 0; i < ndescs; i++) { From e5f84d1cf562f7b45e28d6e5f6490626f870f81c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sun, 8 Dec 2024 15:59:26 +0100 Subject: [PATCH 0333/1450] power: supply: cros_charge-control: add mutex for driver data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Concurrent accesses through sysfs may lead to inconsistent state in the priv data. Introduce a mutex to avoid this. Fixes: c6ed48ef5259 ("power: supply: add ChromeOS EC based charge control driver") Cc: stable@vger.kernel.org Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20241208-cros_charge-control-v2-v1-1-8d168d0f08a3@weissschuh.net Signed-off-by: Sebastian Reichel --- drivers/power/supply/cros_charge-control.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/power/supply/cros_charge-control.c b/drivers/power/supply/cros_charge-control.c index 17c53591ce197..58ca6d9ed6132 100644 --- a/drivers/power/supply/cros_charge-control.c +++ b/drivers/power/supply/cros_charge-control.c @@ -7,8 +7,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -49,6 +51,7 @@ struct cros_chctl_priv { struct attribute *attributes[_CROS_CHCTL_ATTR_COUNT]; struct attribute_group group; + struct mutex lock; /* protects fields below and cros_ec */ enum power_supply_charge_behaviour current_behaviour; u8 current_start_threshold, current_end_threshold; }; @@ -85,6 +88,8 @@ static int cros_chctl_configure_ec(struct cros_chctl_priv *priv) { struct ec_params_charge_control req = {}; + lockdep_assert_held(&priv->lock); + req.cmd = EC_CHARGE_CONTROL_CMD_SET; switch (priv->current_behaviour) { @@ -159,6 +164,7 @@ static ssize_t charge_control_start_threshold_show(struct device *dev, struct cros_chctl_priv *priv = cros_chctl_attr_to_priv(&attr->attr, CROS_CHCTL_ATTR_START_THRESHOLD); + guard(mutex)(&priv->lock); return sysfs_emit(buf, "%u\n", (unsigned int)priv->current_start_threshold); } @@ -169,6 +175,7 @@ static ssize_t charge_control_start_threshold_store(struct device *dev, struct cros_chctl_priv *priv = cros_chctl_attr_to_priv(&attr->attr, CROS_CHCTL_ATTR_START_THRESHOLD); + guard(mutex)(&priv->lock); return cros_chctl_store_threshold(dev, priv, 0, buf, count); } @@ -178,6 +185,7 @@ static ssize_t charge_control_end_threshold_show(struct device *dev, struct devi struct cros_chctl_priv *priv = cros_chctl_attr_to_priv(&attr->attr, CROS_CHCTL_ATTR_END_THRESHOLD); + guard(mutex)(&priv->lock); return sysfs_emit(buf, "%u\n", (unsigned int)priv->current_end_threshold); } @@ -187,6 +195,7 @@ static ssize_t charge_control_end_threshold_store(struct device *dev, struct dev struct cros_chctl_priv *priv = cros_chctl_attr_to_priv(&attr->attr, CROS_CHCTL_ATTR_END_THRESHOLD); + guard(mutex)(&priv->lock); return cros_chctl_store_threshold(dev, priv, 1, buf, count); } @@ -195,6 +204,7 @@ static ssize_t charge_behaviour_show(struct device *dev, struct device_attribute struct cros_chctl_priv *priv = cros_chctl_attr_to_priv(&attr->attr, CROS_CHCTL_ATTR_CHARGE_BEHAVIOUR); + guard(mutex)(&priv->lock); return power_supply_charge_behaviour_show(dev, EC_CHARGE_CONTROL_BEHAVIOURS, priv->current_behaviour, buf); } @@ -210,6 +220,7 @@ static ssize_t charge_behaviour_store(struct device *dev, struct device_attribut if (ret < 0) return ret; + guard(mutex)(&priv->lock); priv->current_behaviour = ret; ret = cros_chctl_configure_ec(priv); @@ -290,6 +301,10 @@ static int cros_chctl_probe(struct platform_device *pdev) if (!priv) return -ENOMEM; + ret = devm_mutex_init(dev, &priv->lock); + if (ret) + return ret; + ret = cros_ec_get_cmd_versions(cros_ec, EC_CMD_CHARGE_CONTROL); if (ret < 0) return ret; @@ -327,7 +342,8 @@ static int cros_chctl_probe(struct platform_device *pdev) priv->current_end_threshold = 100; /* Bring EC into well-known state */ - ret = cros_chctl_configure_ec(priv); + scoped_guard(mutex, &priv->lock) + ret = cros_chctl_configure_ec(priv); if (ret < 0) return ret; From e65a1b7fad0e112573eea7d64d4ab4fc513b8695 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sun, 8 Dec 2024 15:59:27 +0100 Subject: [PATCH 0334/1450] power: supply: cros_charge-control: allow start_threshold == end_threshold MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow setting the start and stop thresholds to the same value. There is no reason to disallow it. Suggested-by: Thomas Koch Fixes: c6ed48ef5259 ("power: supply: add ChromeOS EC based charge control driver") Cc: stable@vger.kernel.org Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20241208-cros_charge-control-v2-v1-2-8d168d0f08a3@weissschuh.net Signed-off-by: Sebastian Reichel --- drivers/power/supply/cros_charge-control.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/power/supply/cros_charge-control.c b/drivers/power/supply/cros_charge-control.c index 58ca6d9ed6132..108b121db4423 100644 --- a/drivers/power/supply/cros_charge-control.c +++ b/drivers/power/supply/cros_charge-control.c @@ -139,11 +139,11 @@ static ssize_t cros_chctl_store_threshold(struct device *dev, struct cros_chctl_ return -EINVAL; if (is_end_threshold) { - if (val <= priv->current_start_threshold) + if (val < priv->current_start_threshold) return -EINVAL; priv->current_end_threshold = val; } else { - if (val >= priv->current_end_threshold) + if (val > priv->current_end_threshold) return -EINVAL; priv->current_start_threshold = val; } From c28dc9fc24f5fa802d44ef7620a511035bdd803e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sun, 8 Dec 2024 15:59:28 +0100 Subject: [PATCH 0335/1450] power: supply: cros_charge-control: hide start threshold on v2 cmd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ECs implementing the v2 command will not stop charging when the end threshold is reached. Instead they will begin discharging until the start threshold is reached, leading to permanent charge and discharge cycles. This defeats the point of the charge control mechanism. Avoid the issue by hiding the start threshold on v2 systems. Instead on those systems program the EC with start == end which forces the EC to reach and stay at that level. v1 does not support thresholds and v3 works correctly, at least judging from the code. Reported-by: Thomas Koch Fixes: c6ed48ef5259 ("power: supply: add ChromeOS EC based charge control driver") Cc: stable@vger.kernel.org Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20241208-cros_charge-control-v2-v1-3-8d168d0f08a3@weissschuh.net Signed-off-by: Sebastian Reichel --- drivers/power/supply/cros_charge-control.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/power/supply/cros_charge-control.c b/drivers/power/supply/cros_charge-control.c index 108b121db4423..9b0a7500296b4 100644 --- a/drivers/power/supply/cros_charge-control.c +++ b/drivers/power/supply/cros_charge-control.c @@ -139,6 +139,10 @@ static ssize_t cros_chctl_store_threshold(struct device *dev, struct cros_chctl_ return -EINVAL; if (is_end_threshold) { + /* Start threshold is not exposed, use fixed value */ + if (priv->cmd_version == 2) + priv->current_start_threshold = val == 100 ? 0 : val; + if (val < priv->current_start_threshold) return -EINVAL; priv->current_end_threshold = val; @@ -234,12 +238,10 @@ static umode_t cros_chtl_attr_is_visible(struct kobject *kobj, struct attribute { struct cros_chctl_priv *priv = cros_chctl_attr_to_priv(attr, n); - if (priv->cmd_version < 2) { - if (n == CROS_CHCTL_ATTR_START_THRESHOLD) - return 0; - if (n == CROS_CHCTL_ATTR_END_THRESHOLD) - return 0; - } + if (n == CROS_CHCTL_ATTR_START_THRESHOLD && priv->cmd_version < 3) + return 0; + else if (n == CROS_CHCTL_ATTR_END_THRESHOLD && priv->cmd_version < 2) + return 0; return attr->mode; } From 070927427d82debf5797e60c96165f2666c0bc42 Mon Sep 17 00:00:00 2001 From: Nikita Yushchenko Date: Sat, 7 Dec 2024 00:21:39 +0500 Subject: [PATCH 0336/1450] net: renesas: rswitch: do not deinit disabled ports In rswitch_ether_port_init_all(), only enabled ports are initialized. Then, rswitch_ether_port_deinit_all() shall also only deinitialize enabled ports. Signed-off-by: Nikita Yushchenko Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/20241206192140.1714-1-nikita.yoush@cogentembedded.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/renesas/rswitch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/renesas/rswitch.c b/drivers/net/ethernet/renesas/rswitch.c index 8d18dae4d8fbf..3ba4a605f7cfc 100644 --- a/drivers/net/ethernet/renesas/rswitch.c +++ b/drivers/net/ethernet/renesas/rswitch.c @@ -1527,7 +1527,7 @@ static void rswitch_ether_port_deinit_all(struct rswitch_private *priv) { unsigned int i; - for (i = 0; i < RSWITCH_NUM_PORTS; i++) { + rswitch_for_each_enabled_port(priv, i) { phy_exit(priv->rdev[i]->serdes); rswitch_ether_port_deinit_one(priv->rdev[i]); } From 32fd46f5b69e9a2e1206d576359e533e5b7c4694 Mon Sep 17 00:00:00 2001 From: Nikita Yushchenko Date: Sat, 7 Dec 2024 00:21:40 +0500 Subject: [PATCH 0337/1450] net: renesas: rswitch: remove speed from gwca structure This field is set but never used. GWCA is rswitch CPU interface module which connects rswitch to the host over AXI bus. Speed of the switch ports is not anyhow related to GWCA operation. Signed-off-by: Nikita Yushchenko Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/20241206192140.1714-2-nikita.yoush@cogentembedded.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/renesas/rswitch.c | 3 --- drivers/net/ethernet/renesas/rswitch.h | 1 - 2 files changed, 4 deletions(-) diff --git a/drivers/net/ethernet/renesas/rswitch.c b/drivers/net/ethernet/renesas/rswitch.c index 3ba4a605f7cfc..b754cc96e268d 100644 --- a/drivers/net/ethernet/renesas/rswitch.c +++ b/drivers/net/ethernet/renesas/rswitch.c @@ -1901,9 +1901,6 @@ static int rswitch_device_alloc(struct rswitch_private *priv, unsigned int index if (err < 0) goto out_get_params; - if (rdev->priv->gwca.speed < rdev->etha->speed) - rdev->priv->gwca.speed = rdev->etha->speed; - err = rswitch_rxdmac_alloc(ndev); if (err < 0) goto out_rxdmac; diff --git a/drivers/net/ethernet/renesas/rswitch.h b/drivers/net/ethernet/renesas/rswitch.h index 72e3ff596d318..303883369b94a 100644 --- a/drivers/net/ethernet/renesas/rswitch.h +++ b/drivers/net/ethernet/renesas/rswitch.h @@ -993,7 +993,6 @@ struct rswitch_gwca { DECLARE_BITMAP(used, RSWITCH_MAX_NUM_QUEUES); u32 tx_irq_bits[RSWITCH_NUM_IRQ_REGS]; u32 rx_irq_bits[RSWITCH_NUM_IRQ_REGS]; - int speed; }; #define NUM_QUEUES_PER_NDEV 2 From 31cdd8418234e70043abd26894b57eb201489cba Mon Sep 17 00:00:00 2001 From: "Jan Petrous (OSS)" Date: Thu, 5 Dec 2024 17:42:58 +0100 Subject: [PATCH 0338/1450] net: stmmac: Fix CSR divider comment The comment in declaration of STMMAC_CSR_250_300M incorrectly describes the constant as '/* MDC = clk_scr_i/122 */' but the DWC Ether QOS Handbook version 5.20a says it is CSR clock/124. Signed-off-by: Jan Petrous (OSS) Reviewed-by: Jacob Keller Reviewed-by: Russell King (Oracle) Link: https://patch.msgid.link/20241205-upstream_s32cc_gmac-v8-1-ec1d180df815@oss.nxp.com Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index d79ff252cfdc1..75cbfb5763582 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -33,7 +33,7 @@ #define STMMAC_CSR_20_35M 0x2 /* MDC = clk_scr_i/16 */ #define STMMAC_CSR_35_60M 0x3 /* MDC = clk_scr_i/26 */ #define STMMAC_CSR_150_250M 0x4 /* MDC = clk_scr_i/102 */ -#define STMMAC_CSR_250_300M 0x5 /* MDC = clk_scr_i/122 */ +#define STMMAC_CSR_250_300M 0x5 /* MDC = clk_scr_i/124 */ /* MTL algorithms identifiers */ #define MTL_TX_ALGORITHM_WRR 0x0 From c8fab05d021dfc04401102f9fa1de07fc8f75d8d Mon Sep 17 00:00:00 2001 From: "Jan Petrous (OSS)" Date: Thu, 5 Dec 2024 17:42:59 +0100 Subject: [PATCH 0339/1450] net: stmmac: Extend CSR calc support Add support for CSR clock range up to 800 MHz. Reviewed-by: Jacob Keller Reviewed-by: Russell King (Oracle) Signed-off-by: Jan Petrous (OSS) Link: https://patch.msgid.link/20241205-upstream_s32cc_gmac-v8-2-ec1d180df815@oss.nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/common.h | 2 ++ drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 4 ++++ include/linux/stmmac.h | 2 ++ 3 files changed, 8 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h index 1367fa5c9b8ea..70d601f454811 100644 --- a/drivers/net/ethernet/stmicro/stmmac/common.h +++ b/drivers/net/ethernet/stmicro/stmmac/common.h @@ -257,6 +257,8 @@ struct stmmac_safety_stats { #define CSR_F_150M 150000000 #define CSR_F_250M 250000000 #define CSR_F_300M 300000000 +#define CSR_F_500M 500000000 +#define CSR_F_800M 800000000 #define MAC_CSR_H_FRQ_MASK 0x20 diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 9b262cdad60b2..3cb7ad6ccc4e5 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -325,6 +325,10 @@ static void stmmac_clk_csr_set(struct stmmac_priv *priv) priv->clk_csr = STMMAC_CSR_150_250M; else if ((clk_rate >= CSR_F_250M) && (clk_rate <= CSR_F_300M)) priv->clk_csr = STMMAC_CSR_250_300M; + else if ((clk_rate >= CSR_F_300M) && (clk_rate < CSR_F_500M)) + priv->clk_csr = STMMAC_CSR_300_500M; + else if ((clk_rate >= CSR_F_500M) && (clk_rate < CSR_F_800M)) + priv->clk_csr = STMMAC_CSR_500_800M; } if (priv->plat->flags & STMMAC_FLAG_HAS_SUN8I) { diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 75cbfb5763582..865d0fe26f98c 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -34,6 +34,8 @@ #define STMMAC_CSR_35_60M 0x3 /* MDC = clk_scr_i/26 */ #define STMMAC_CSR_150_250M 0x4 /* MDC = clk_scr_i/102 */ #define STMMAC_CSR_250_300M 0x5 /* MDC = clk_scr_i/124 */ +#define STMMAC_CSR_300_500M 0x6 /* MDC = clk_scr_i/204 */ +#define STMMAC_CSR_500_800M 0x7 /* MDC = clk_scr_i/324 */ /* MTL algorithms identifiers */ #define MTL_TX_ALGORITHM_WRR 0x0 From cb09f61a9ab84369c62f2ef7f8a2b797f596f6d1 Mon Sep 17 00:00:00 2001 From: "Jan Petrous (OSS)" Date: Thu, 5 Dec 2024 17:43:00 +0100 Subject: [PATCH 0340/1450] net: stmmac: Fix clock rate variables size The clock API clk_get_rate() returns unsigned long value. Expand affected members of stmmac platform data and convert the stmmac_clk_csr_set() and dwmac4_core_init() methods to defining the unsigned long clk_rate local variables. Reviewed-by: Andrew Lunn Reviewed-by: Serge Semin Reviewed-by: Russell King (Oracle) Signed-off-by: Jan Petrous (OSS) Link: https://patch.msgid.link/20241205-upstream_s32cc_gmac-v8-3-ec1d180df815@oss.nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c | 2 +- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 2 +- drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 2 +- include/linux/stmmac.h | 6 +++--- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c index 901a3c1959fa5..2a5b38723635b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c @@ -777,7 +777,7 @@ static void ethqos_ptp_clk_freq_config(struct stmmac_priv *priv) netdev_err(priv->dev, "Failed to max out clk_ptp_ref: %d\n", err); plat_dat->clk_ptp_rate = clk_get_rate(plat_dat->clk_ptp_ref); - netdev_dbg(priv->dev, "PTP rate %d\n", plat_dat->clk_ptp_rate); + netdev_dbg(priv->dev, "PTP rate %lu\n", plat_dat->clk_ptp_rate); } static int qcom_ethqos_probe(struct platform_device *pdev) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index c25781874aa78..c36f90a782c50 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -27,7 +27,7 @@ static void dwmac4_core_init(struct mac_device_info *hw, struct stmmac_priv *priv = netdev_priv(dev); void __iomem *ioaddr = hw->pcsr; u32 value = readl(ioaddr + GMAC_CONFIG); - u32 clk_rate; + unsigned long clk_rate; value |= GMAC_CORE_INIT; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 3cb7ad6ccc4e5..d45fd7a3acd5c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -301,7 +301,7 @@ static void stmmac_global_err(struct stmmac_priv *priv) */ static void stmmac_clk_csr_set(struct stmmac_priv *priv) { - u32 clk_rate; + unsigned long clk_rate; clk_rate = clk_get_rate(priv->plat->stmmac_clk); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index 3ac32444e4922..06e07e6e180b7 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -640,7 +640,7 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) dev_info(&pdev->dev, "PTP uses main clock\n"); } else { plat->clk_ptp_rate = clk_get_rate(plat->clk_ptp_ref); - dev_dbg(&pdev->dev, "PTP rate %d\n", plat->clk_ptp_rate); + dev_dbg(&pdev->dev, "PTP rate %lu\n", plat->clk_ptp_rate); } plat->stmmac_rst = devm_reset_control_get_optional(&pdev->dev, diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 865d0fe26f98c..c9878a612e532 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -252,8 +252,8 @@ struct plat_stmmacenet_data { struct clk *stmmac_clk; struct clk *pclk; struct clk *clk_ptp_ref; - unsigned int clk_ptp_rate; - unsigned int clk_ref_rate; + unsigned long clk_ptp_rate; + unsigned long clk_ref_rate; unsigned int mult_fact_100ns; s32 ptp_max_adj; u32 cdc_error_adj; @@ -265,7 +265,7 @@ struct plat_stmmacenet_data { int mac_port_sel_speed; int has_xgmac; u8 vlan_fail_q; - unsigned int eee_usecs_rate; + unsigned long eee_usecs_rate; struct pci_dev *pdev; int int_snapshot_num; int msi_mac_vec; From 386aa60abdb600a4e5ad818e6dba171685942e54 Mon Sep 17 00:00:00 2001 From: "Jan Petrous (OSS)" Date: Thu, 5 Dec 2024 17:43:01 +0100 Subject: [PATCH 0341/1450] net: phy: Add helper for mapping RGMII link speed to clock rate The RGMII interface supports three data rates: 10/100 Mbps and 1 Gbps. These speeds correspond to clock frequencies of 2.5/25 MHz and 125 MHz, respectively. Many Ethernet drivers, including glues in stmmac, follow a similar pattern of converting RGMII speed to clock frequency. To simplify code, define the helper rgmii_clock(speed) to convert connection speed to clock frequency. Suggested-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Reviewed-by: Russell King (Oracle) Signed-off-by: Jan Petrous (OSS) Link: https://patch.msgid.link/20241205-upstream_s32cc_gmac-v8-4-ec1d180df815@oss.nxp.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/include/linux/phy.h b/include/linux/phy.h index bb157136351e7..e597a32cc787a 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -298,6 +298,29 @@ static inline const char *phy_modes(phy_interface_t interface) } } +/** + * rgmii_clock - map link speed to the clock rate + * @speed: link speed value + * + * Description: maps RGMII supported link speeds + * into the clock rates. + * + * Returns: clock rate or negative errno + */ +static inline long rgmii_clock(int speed) +{ + switch (speed) { + case SPEED_10: + return 2500000; + case SPEED_100: + return 25000000; + case SPEED_1000: + return 125000000; + default: + return -EINVAL; + } +} + #define PHY_INIT_TIMEOUT 100000 #define PHY_FORCE_TIMEOUT 10 From 37b66c483e4c8a72cd1fd22f8ced05cc40f9e128 Mon Sep 17 00:00:00 2001 From: "Jan Petrous (OSS)" Date: Thu, 5 Dec 2024 17:43:02 +0100 Subject: [PATCH 0342/1450] net: dwmac-dwc-qos-eth: Use helper rgmii_clock Utilize a new helper function rgmii_clock(). Reviewed-by: Andrew Lunn Reviewed-by: Russell King (Oracle) Signed-off-by: Jan Petrous (OSS) Link: https://patch.msgid.link/20241205-upstream_s32cc_gmac-v8-5-ec1d180df815@oss.nxp.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c index 83290e707df53..bd4eb187f8c64 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c @@ -181,24 +181,19 @@ static void dwc_qos_remove(struct platform_device *pdev) static void tegra_eqos_fix_speed(void *priv, unsigned int speed, unsigned int mode) { struct tegra_eqos *eqos = priv; - unsigned long rate = 125000000; bool needs_calibration = false; + long rate = 125000000; u32 value; int err; switch (speed) { case SPEED_1000: - needs_calibration = true; - rate = 125000000; - break; - case SPEED_100: needs_calibration = true; - rate = 25000000; - break; + fallthrough; case SPEED_10: - rate = 2500000; + rate = rgmii_clock(speed); break; default: From 839b75ea4d940f810650a2ce11c91d94c5f01aa3 Mon Sep 17 00:00:00 2001 From: "Jan Petrous (OSS)" Date: Thu, 5 Dec 2024 17:43:03 +0100 Subject: [PATCH 0343/1450] net: dwmac-imx: Use helper rgmii_clock Utilize a new helper function rgmii_clock(). Reviewed-by: Andrew Lunn Reviewed-by: Russell King (Oracle) Signed-off-by: Jan Petrous (OSS) Link: https://patch.msgid.link/20241205-upstream_s32cc_gmac-v8-6-ec1d180df815@oss.nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c index 641f3cd019a3c..43e0fbba4f77b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c @@ -186,7 +186,7 @@ static void imx_dwmac_fix_speed(void *priv, unsigned int speed, unsigned int mod { struct plat_stmmacenet_data *plat_dat; struct imx_priv_data *dwmac = priv; - unsigned long rate; + long rate; int err; plat_dat = dwmac->plat_dat; @@ -196,17 +196,8 @@ static void imx_dwmac_fix_speed(void *priv, unsigned int speed, unsigned int mod (plat_dat->mac_interface == PHY_INTERFACE_MODE_MII)) return; - switch (speed) { - case SPEED_1000: - rate = 125000000; - break; - case SPEED_100: - rate = 25000000; - break; - case SPEED_10: - rate = 2500000; - break; - default: + rate = rgmii_clock(speed); + if (rate < 0) { dev_err(dwmac->dev, "invalid speed %u\n", speed); return; } From 8470bfc835154a80774e5ab0e46969f196c0dba1 Mon Sep 17 00:00:00 2001 From: "Jan Petrous (OSS)" Date: Thu, 5 Dec 2024 17:43:04 +0100 Subject: [PATCH 0344/1450] net: dwmac-intel-plat: Use helper rgmii_clock Utilize a new helper function rgmii_clock(). When in, remove dead code in kmb_eth_fix_mac_speed(). Reviewed-by: Andrew Lunn Signed-off-by: Jan Petrous (OSS) Reviewed-by: Russell King (Oracle) Link: https://patch.msgid.link/20241205-upstream_s32cc_gmac-v8-7-ec1d180df815@oss.nxp.com Signed-off-by: Jakub Kicinski --- .../stmicro/stmmac/dwmac-intel-plat.c | 22 ++++--------------- 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel-plat.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel-plat.c index d94f0a150e934..ddee6154d40bc 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel-plat.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel-plat.c @@ -31,27 +31,13 @@ struct intel_dwmac_data { static void kmb_eth_fix_mac_speed(void *priv, unsigned int speed, unsigned int mode) { struct intel_dwmac *dwmac = priv; - unsigned long rate; + long rate; int ret; - rate = clk_get_rate(dwmac->tx_clk); - - switch (speed) { - case SPEED_1000: - rate = 125000000; - break; - - case SPEED_100: - rate = 25000000; - break; - - case SPEED_10: - rate = 2500000; - break; - - default: + rate = rgmii_clock(speed); + if (rate < 0) { dev_err(dwmac->dev, "Invalid speed\n"); - break; + return; } ret = clk_set_rate(dwmac->tx_clk, rate); From 30b4a9b5c335b32a8c8234662b180876a2db173e Mon Sep 17 00:00:00 2001 From: "Jan Petrous (OSS)" Date: Thu, 5 Dec 2024 17:43:05 +0100 Subject: [PATCH 0345/1450] net: dwmac-rk: Use helper rgmii_clock Utilize a new helper function rgmii_clock(). Reviewed-by: Andrew Lunn Reviewed-by: Russell King (Oracle) Signed-off-by: Jan Petrous (OSS) Link: https://patch.msgid.link/20241205-upstream_s32cc_gmac-v8-8-ec1d180df815@oss.nxp.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/stmicro/stmmac/dwmac-rk.c | 30 ++++--------------- 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c index 8cb374668b74c..a4dc89e23a68e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c @@ -1079,20 +1079,11 @@ static void rk3568_set_gmac_speed(struct rk_priv_data *bsp_priv, int speed) { struct clk *clk_mac_speed = bsp_priv->clks[RK_CLK_MAC_SPEED].clk; struct device *dev = &bsp_priv->pdev->dev; - unsigned long rate; + long rate; int ret; - switch (speed) { - case 10: - rate = 2500000; - break; - case 100: - rate = 25000000; - break; - case 1000: - rate = 125000000; - break; - default: + rate = rgmii_clock(speed); + if (rate < 0) { dev_err(dev, "unknown speed value for GMAC speed=%d", speed); return; } @@ -1540,20 +1531,11 @@ static void rv1126_set_rgmii_speed(struct rk_priv_data *bsp_priv, int speed) { struct clk *clk_mac_speed = bsp_priv->clks[RK_CLK_MAC_SPEED].clk; struct device *dev = &bsp_priv->pdev->dev; - unsigned long rate; + long rate; int ret; - switch (speed) { - case 10: - rate = 2500000; - break; - case 100: - rate = 25000000; - break; - case 1000: - rate = 125000000; - break; - default: + rate = rgmii_clock(speed); + if (rate < 0) { dev_err(dev, "unknown speed value for RGMII speed=%d", speed); return; } From b561d717a799241d260a7e5667e8d35de6fac874 Mon Sep 17 00:00:00 2001 From: "Jan Petrous (OSS)" Date: Thu, 5 Dec 2024 17:43:06 +0100 Subject: [PATCH 0346/1450] net: dwmac-starfive: Use helper rgmii_clock Utilize a new helper function rgmii_clock(). Reviewed-by: Andrew Lunn Reviewed-by: Russell King (Oracle) Reviewed-by: Emil Renner Berthing Signed-off-by: Jan Petrous (OSS) Link: https://patch.msgid.link/20241205-upstream_s32cc_gmac-v8-9-ec1d180df815@oss.nxp.com Signed-off-by: Jakub Kicinski --- .../ethernet/stmicro/stmmac/dwmac-starfive.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-starfive.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-starfive.c index 421666279dd38..0a0a363d37309 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-starfive.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-starfive.c @@ -34,24 +34,13 @@ struct starfive_dwmac { static void starfive_dwmac_fix_mac_speed(void *priv, unsigned int speed, unsigned int mode) { struct starfive_dwmac *dwmac = priv; - unsigned long rate; + long rate; int err; - rate = clk_get_rate(dwmac->clk_tx); - - switch (speed) { - case SPEED_1000: - rate = 125000000; - break; - case SPEED_100: - rate = 25000000; - break; - case SPEED_10: - rate = 2500000; - break; - default: + rate = rgmii_clock(speed); + if (rate < 0) { dev_err(dwmac->dev, "invalid speed %u\n", speed); - break; + return; } err = clk_set_rate(dwmac->clk_tx, rate); From 04207d28f46870df113112a4afc42458495837d6 Mon Sep 17 00:00:00 2001 From: "Jan Petrous (OSS)" Date: Thu, 5 Dec 2024 17:43:07 +0100 Subject: [PATCH 0347/1450] net: macb: Use helper rgmii_clock Utilize a new helper function rgmii_clock(). Reviewed-by: Andrew Lunn Reviewed-by: Russell King (Oracle) Signed-off-by: Jan Petrous (OSS) Acked-by: Nicolas Ferre Link: https://patch.msgid.link/20241205-upstream_s32cc_gmac-v8-10-ec1d180df815@oss.nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cadence/macb_main.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index daa416fb1724e..640f500f989d5 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -530,19 +530,9 @@ static void macb_set_tx_clk(struct macb *bp, int speed) if (bp->phy_interface == PHY_INTERFACE_MODE_MII) return; - switch (speed) { - case SPEED_10: - rate = 2500000; - break; - case SPEED_100: - rate = 25000000; - break; - case SPEED_1000: - rate = 125000000; - break; - default: + rate = rgmii_clock(speed); + if (rate < 0) return; - } rate_rounded = clk_round_rate(bp->tx_clk, rate); if (rate_rounded < 0) From fd59bca4d5eaba6cadf78e74b5e72fd8852a7529 Mon Sep 17 00:00:00 2001 From: "Jan Petrous (OSS)" Date: Thu, 5 Dec 2024 17:43:08 +0100 Subject: [PATCH 0348/1450] net: xgene_enet: Use helper rgmii_clock Utilize a new helper function rgmii_clock(). Reviewed-by: Andrew Lunn Reviewed-by: Russell King (Oracle) Signed-off-by: Jan Petrous (OSS) Link: https://patch.msgid.link/20241205-upstream_s32cc_gmac-v8-11-ec1d180df815@oss.nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/apm/xgene/xgene_enet_hw.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c index e641dbbea1e27..b854b6b42d77b 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c @@ -421,18 +421,12 @@ static void xgene_enet_configure_clock(struct xgene_enet_pdata *pdata) if (dev->of_node) { struct clk *parent = clk_get_parent(pdata->clk); + long rate = rgmii_clock(pdata->phy_speed); - switch (pdata->phy_speed) { - case SPEED_10: - clk_set_rate(parent, 2500000); - break; - case SPEED_100: - clk_set_rate(parent, 25000000); - break; - default: - clk_set_rate(parent, 125000000); - break; - } + if (rate < 0) + rate = 125000000; + + clk_set_rate(parent, rate); } #ifdef CONFIG_ACPI else { From 1ead5777550717f77fa70d6342fc467bebc18519 Mon Sep 17 00:00:00 2001 From: "Jan Petrous (OSS)" Date: Thu, 5 Dec 2024 17:43:09 +0100 Subject: [PATCH 0349/1450] net: dwmac-sti: Use helper rgmii_clock Utilize a new helper function rgmii_clock(). Reviewed-by: Andrew Lunn Reviewed-by: Russell King (Oracle) Signed-off-by: Jan Petrous (OSS) Link: https://patch.msgid.link/20241205-upstream_s32cc_gmac-v8-12-ec1d180df815@oss.nxp.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/stmicro/stmmac/dwmac-sti.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c index a6ff02d905a99..eabc4da9e1a98 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c @@ -21,10 +21,7 @@ #include "stmmac_platform.h" -#define DWMAC_125MHZ 125000000 #define DWMAC_50MHZ 50000000 -#define DWMAC_25MHZ 25000000 -#define DWMAC_2_5MHZ 2500000 #define IS_PHY_IF_MODE_RGMII(iface) (iface == PHY_INTERFACE_MODE_RGMII || \ iface == PHY_INTERFACE_MODE_RGMII_ID || \ @@ -140,7 +137,7 @@ static void stih4xx_fix_retime_src(void *priv, u32 spd, unsigned int mode) struct sti_dwmac *dwmac = priv; u32 src = dwmac->tx_retime_src; u32 reg = dwmac->ctrl_reg; - u32 freq = 0; + long freq = 0; if (dwmac->interface == PHY_INTERFACE_MODE_MII) { src = TX_RETIME_SRC_TXCLK; @@ -153,19 +150,14 @@ static void stih4xx_fix_retime_src(void *priv, u32 spd, unsigned int mode) } } else if (IS_PHY_IF_MODE_RGMII(dwmac->interface)) { /* On GiGa clk source can be either ext or from clkgen */ - if (spd == SPEED_1000) { - freq = DWMAC_125MHZ; - } else { + freq = rgmii_clock(spd); + + if (spd != SPEED_1000 && freq > 0) /* Switch to clkgen for these speeds */ src = TX_RETIME_SRC_CLKGEN; - if (spd == SPEED_100) - freq = DWMAC_25MHZ; - else if (spd == SPEED_10) - freq = DWMAC_2_5MHZ; - } } - if (src == TX_RETIME_SRC_CLKGEN && freq) + if (src == TX_RETIME_SRC_CLKGEN && freq > 0) clk_set_rate(dwmac->clk, freq); regmap_update_bits(dwmac->regmap, reg, STIH4XX_RETIME_SRC_MASK, From 91f10e5895209c855edc0f993410f5d82b54e049 Mon Sep 17 00:00:00 2001 From: "Jan Petrous (OSS)" Date: Thu, 5 Dec 2024 17:43:10 +0100 Subject: [PATCH 0350/1450] dt-bindings: net: Add DT bindings for DWMAC on NXP S32G/R SoCs Add basic description for DWMAC ethernet IP on NXP S32G2xx, S32G3xx and S32R45 automotive series SoCs. Reviewed-by: Rob Herring (Arm) Signed-off-by: Jan Petrous (OSS) Link: https://patch.msgid.link/20241205-upstream_s32cc_gmac-v8-13-ec1d180df815@oss.nxp.com Signed-off-by: Jakub Kicinski --- .../bindings/net/nxp,s32-dwmac.yaml | 105 ++++++++++++++++++ .../devicetree/bindings/net/snps,dwmac.yaml | 1 + 2 files changed, 106 insertions(+) create mode 100644 Documentation/devicetree/bindings/net/nxp,s32-dwmac.yaml diff --git a/Documentation/devicetree/bindings/net/nxp,s32-dwmac.yaml b/Documentation/devicetree/bindings/net/nxp,s32-dwmac.yaml new file mode 100644 index 0000000000000..2b8b74c5feec8 --- /dev/null +++ b/Documentation/devicetree/bindings/net/nxp,s32-dwmac.yaml @@ -0,0 +1,105 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +# Copyright 2021-2024 NXP +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/net/nxp,s32-dwmac.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: NXP S32G2xx/S32G3xx/S32R45 GMAC ethernet controller + +maintainers: + - Jan Petrous (OSS) + +description: + This device is a Synopsys DWC IP, integrated on NXP S32G/R SoCs. + The SoC series S32G2xx and S32G3xx feature one DWMAC instance, + the SoC S32R45 has two instances. The devices can use RGMII/RMII/MII + interface over Pinctrl device or the output can be routed + to the embedded SerDes for SGMII connectivity. + +properties: + compatible: + oneOf: + - const: nxp,s32g2-dwmac + - items: + - enum: + - nxp,s32g3-dwmac + - nxp,s32r45-dwmac + - const: nxp,s32g2-dwmac + + reg: + items: + - description: Main GMAC registers + - description: GMAC PHY mode control register + + interrupts: + maxItems: 1 + + interrupt-names: + const: macirq + + clocks: + items: + - description: Main GMAC clock + - description: Transmit clock + - description: Receive clock + - description: PTP reference clock + + clock-names: + items: + - const: stmmaceth + - const: tx + - const: rx + - const: ptp_ref + +required: + - clocks + - clock-names + +allOf: + - $ref: snps,dwmac.yaml# + +unevaluatedProperties: false + +examples: + - | + #include + #include + #include + bus { + #address-cells = <2>; + #size-cells = <2>; + + ethernet@4033c000 { + compatible = "nxp,s32g2-dwmac"; + reg = <0x0 0x4033c000 0x0 0x2000>, /* gmac IP */ + <0x0 0x4007c004 0x0 0x4>; /* GMAC_0_CTRL_STS */ + interrupt-parent = <&gic>; + interrupts = ; + interrupt-names = "macirq"; + snps,mtl-rx-config = <&mtl_rx_setup>; + snps,mtl-tx-config = <&mtl_tx_setup>; + clocks = <&clks 24>, <&clks 17>, <&clks 16>, <&clks 15>; + clock-names = "stmmaceth", "tx", "rx", "ptp_ref"; + phy-mode = "rgmii-id"; + phy-handle = <&phy0>; + + mtl_rx_setup: rx-queues-config { + snps,rx-queues-to-use = <5>; + }; + + mtl_tx_setup: tx-queues-config { + snps,tx-queues-to-use = <5>; + }; + + mdio { + #address-cells = <1>; + #size-cells = <0>; + compatible = "snps,dwmac-mdio"; + + phy0: ethernet-phy@0 { + reg = <0>; + }; + }; + }; + }; diff --git a/Documentation/devicetree/bindings/net/snps,dwmac.yaml b/Documentation/devicetree/bindings/net/snps,dwmac.yaml index eb1f3ae41ab9a..91e75eb3f329b 100644 --- a/Documentation/devicetree/bindings/net/snps,dwmac.yaml +++ b/Documentation/devicetree/bindings/net/snps,dwmac.yaml @@ -67,6 +67,7 @@ properties: - ingenic,x2000-mac - loongson,ls2k-dwmac - loongson,ls7a-dwmac + - nxp,s32g2-dwmac - qcom,qcs404-ethqos - qcom,sa8775p-ethqos - qcom,sc8280xp-ethqos From cd197ac5d661ee2ab36f1578164e276ad947506c Mon Sep 17 00:00:00 2001 From: "Jan Petrous (OSS)" Date: Thu, 5 Dec 2024 17:43:11 +0100 Subject: [PATCH 0351/1450] net: stmmac: dwmac-s32: add basic NXP S32G/S32R glue driver NXP S32G2xx/S32G3xx and S32R45 are automotive grade SoCs that integrate one or two Synopsys DWMAC 5.10/5.20 IPs. The basic driver supports only RGMII interface. Signed-off-by: Jan Petrous (OSS) Link: https://patch.msgid.link/20241205-upstream_s32cc_gmac-v8-14-ec1d180df815@oss.nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/Kconfig | 12 ++ drivers/net/ethernet/stmicro/stmmac/Makefile | 1 + .../net/ethernet/stmicro/stmmac/dwmac-s32.c | 202 ++++++++++++++++++ 3 files changed, 215 insertions(+) create mode 100644 drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c diff --git a/drivers/net/ethernet/stmicro/stmmac/Kconfig b/drivers/net/ethernet/stmicro/stmmac/Kconfig index 6658536a4e171..4cc85a36a1ab6 100644 --- a/drivers/net/ethernet/stmicro/stmmac/Kconfig +++ b/drivers/net/ethernet/stmicro/stmmac/Kconfig @@ -154,6 +154,18 @@ config DWMAC_RZN1 the stmmac device driver. This support can make use of a custom MII converter PCS device. +config DWMAC_S32 + tristate "NXP S32G/S32R GMAC support" + default ARCH_S32 + depends on OF && (ARCH_S32 || COMPILE_TEST) + help + Support for ethernet controller on NXP S32CC SOCs. + + This selects NXP SoC glue layer support for the stmmac + device driver. This driver is used for the S32CC series + SOCs GMAC ethernet controller, ie. S32G2xx, S32G3xx and + S32R45. + config DWMAC_SOCFPGA tristate "SOCFPGA dwmac support" default ARCH_INTEL_SOCFPGA diff --git a/drivers/net/ethernet/stmicro/stmmac/Makefile b/drivers/net/ethernet/stmicro/stmmac/Makefile index 2389fd2613446..b26f0e79c2b32 100644 --- a/drivers/net/ethernet/stmicro/stmmac/Makefile +++ b/drivers/net/ethernet/stmicro/stmmac/Makefile @@ -22,6 +22,7 @@ obj-$(CONFIG_DWMAC_MESON) += dwmac-meson.o dwmac-meson8b.o obj-$(CONFIG_DWMAC_QCOM_ETHQOS) += dwmac-qcom-ethqos.o obj-$(CONFIG_DWMAC_ROCKCHIP) += dwmac-rk.o obj-$(CONFIG_DWMAC_RZN1) += dwmac-rzn1.o +obj-$(CONFIG_DWMAC_S32) += dwmac-s32.o obj-$(CONFIG_DWMAC_SOCFPGA) += dwmac-altr-socfpga.o obj-$(CONFIG_DWMAC_STARFIVE) += dwmac-starfive.o obj-$(CONFIG_DWMAC_STI) += dwmac-sti.o diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c new file mode 100644 index 0000000000000..9cc0e5817416e --- /dev/null +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NXP S32G/R GMAC glue layer + * + * Copyright 2019-2024 NXP + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "stmmac_platform.h" + +#define GMAC_INTF_RATE_125M 125000000 /* 125MHz */ + +/* SoC PHY interface control register */ +#define PHY_INTF_SEL_MII 0x00 +#define PHY_INTF_SEL_SGMII 0x01 +#define PHY_INTF_SEL_RGMII 0x02 +#define PHY_INTF_SEL_RMII 0x08 + +struct s32_priv_data { + void __iomem *ioaddr; + void __iomem *ctrl_sts; + struct device *dev; + phy_interface_t *intf_mode; + struct clk *tx_clk; + struct clk *rx_clk; +}; + +static int s32_gmac_write_phy_intf_select(struct s32_priv_data *gmac) +{ + writel(PHY_INTF_SEL_RGMII, gmac->ctrl_sts); + + dev_dbg(gmac->dev, "PHY mode set to %s\n", phy_modes(*gmac->intf_mode)); + + return 0; +} + +static int s32_gmac_init(struct platform_device *pdev, void *priv) +{ + struct s32_priv_data *gmac = priv; + int ret; + + /* Set initial TX interface clock */ + ret = clk_prepare_enable(gmac->tx_clk); + if (ret) { + dev_err(&pdev->dev, "Can't enable tx clock\n"); + return ret; + } + ret = clk_set_rate(gmac->tx_clk, GMAC_INTF_RATE_125M); + if (ret) { + dev_err(&pdev->dev, "Can't set tx clock\n"); + goto err_tx_disable; + } + + /* Set initial RX interface clock */ + ret = clk_prepare_enable(gmac->rx_clk); + if (ret) { + dev_err(&pdev->dev, "Can't enable rx clock\n"); + goto err_tx_disable; + } + ret = clk_set_rate(gmac->rx_clk, GMAC_INTF_RATE_125M); + if (ret) { + dev_err(&pdev->dev, "Can't set rx clock\n"); + goto err_txrx_disable; + } + + /* Set interface mode */ + ret = s32_gmac_write_phy_intf_select(gmac); + if (ret) { + dev_err(&pdev->dev, "Can't set PHY interface mode\n"); + goto err_txrx_disable; + } + + return 0; + +err_txrx_disable: + clk_disable_unprepare(gmac->rx_clk); +err_tx_disable: + clk_disable_unprepare(gmac->tx_clk); + return ret; +} + +static void s32_gmac_exit(struct platform_device *pdev, void *priv) +{ + struct s32_priv_data *gmac = priv; + + clk_disable_unprepare(gmac->tx_clk); + clk_disable_unprepare(gmac->rx_clk); +} + +static void s32_fix_mac_speed(void *priv, unsigned int speed, unsigned int mode) +{ + struct s32_priv_data *gmac = priv; + long tx_clk_rate; + int ret; + + tx_clk_rate = rgmii_clock(speed); + if (tx_clk_rate < 0) { + dev_err(gmac->dev, "Unsupported/Invalid speed: %d\n", speed); + return; + } + + dev_dbg(gmac->dev, "Set tx clock to %ld Hz\n", tx_clk_rate); + ret = clk_set_rate(gmac->tx_clk, tx_clk_rate); + if (ret) + dev_err(gmac->dev, "Can't set tx clock\n"); +} + +static int s32_dwmac_probe(struct platform_device *pdev) +{ + struct plat_stmmacenet_data *plat; + struct device *dev = &pdev->dev; + struct stmmac_resources res; + struct s32_priv_data *gmac; + int ret; + + gmac = devm_kzalloc(&pdev->dev, sizeof(*gmac), GFP_KERNEL); + if (!gmac) + return -ENOMEM; + + gmac->dev = &pdev->dev; + + ret = stmmac_get_platform_resources(pdev, &res); + if (ret) + return dev_err_probe(dev, ret, + "Failed to get platform resources\n"); + + plat = devm_stmmac_probe_config_dt(pdev, res.mac); + if (IS_ERR(plat)) + return dev_err_probe(dev, PTR_ERR(plat), + "dt configuration failed\n"); + + /* PHY interface mode control reg */ + gmac->ctrl_sts = devm_platform_get_and_ioremap_resource(pdev, 1, NULL); + if (IS_ERR(gmac->ctrl_sts)) + return dev_err_probe(dev, PTR_ERR(gmac->ctrl_sts), + "S32CC config region is missing\n"); + + /* tx clock */ + gmac->tx_clk = devm_clk_get(&pdev->dev, "tx"); + if (IS_ERR(gmac->tx_clk)) + return dev_err_probe(dev, PTR_ERR(gmac->tx_clk), + "tx clock not found\n"); + + /* rx clock */ + gmac->rx_clk = devm_clk_get(&pdev->dev, "rx"); + if (IS_ERR(gmac->rx_clk)) + return dev_err_probe(dev, PTR_ERR(gmac->rx_clk), + "rx clock not found\n"); + + gmac->intf_mode = &plat->phy_interface; + gmac->ioaddr = res.addr; + + /* S32CC core feature set */ + plat->has_gmac4 = true; + plat->pmt = 1; + plat->flags |= STMMAC_FLAG_SPH_DISABLE; + plat->rx_fifo_size = 20480; + plat->tx_fifo_size = 20480; + + plat->init = s32_gmac_init; + plat->exit = s32_gmac_exit; + plat->fix_mac_speed = s32_fix_mac_speed; + + plat->bsp_priv = gmac; + + return stmmac_pltfr_probe(pdev, plat, &res); +} + +static const struct of_device_id s32_dwmac_match[] = { + { .compatible = "nxp,s32g2-dwmac" }, + { } +}; +MODULE_DEVICE_TABLE(of, s32_dwmac_match); + +static struct platform_driver s32_dwmac_driver = { + .probe = s32_dwmac_probe, + .remove = stmmac_pltfr_remove, + .driver = { + .name = "s32-dwmac", + .pm = &stmmac_pltfr_pm_ops, + .of_match_table = s32_dwmac_match, + }, +}; +module_platform_driver(s32_dwmac_driver); + +MODULE_AUTHOR("Jan Petrous (OSS) "); +MODULE_DESCRIPTION("NXP S32G/R common chassis GMAC driver"); +MODULE_LICENSE("GPL"); + From 6bc6234cbd5e9f7a4d8a20aa4d5f0c891e099649 Mon Sep 17 00:00:00 2001 From: "Jan Petrous (OSS)" Date: Thu, 5 Dec 2024 17:43:12 +0100 Subject: [PATCH 0352/1450] MAINTAINERS: Add Jan Petrous as the NXP S32G/R DWMAC driver maintainer Add myself as NXP S32G/R DWMAC Ethernet driver maintainer. Reviewed-by: Andrew Lunn Signed-off-by: Jan Petrous (OSS) Link: https://patch.msgid.link/20241205-upstream_s32cc_gmac-v8-15-ec1d180df815@oss.nxp.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 2d75560e64acd..af35519be3200 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2836,6 +2836,13 @@ S: Maintained F: arch/arm64/boot/dts/freescale/s32g*.dts* F: drivers/pinctrl/nxp/ +ARM/NXP S32G/S32R DWMAC ETHERNET DRIVER +M: Jan Petrous +L: NXP S32 Linux Team +S: Maintained +F: Documentation/devicetree/bindings/net/nxp,s32-dwmac.yaml +F: drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c + ARM/Orion SoC/Technologic Systems TS-78xx platform support M: Alexander Clouter L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) From 919bfa9b2dbf3bc0c478afd4e44445836381dacb Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Tue, 10 Dec 2024 03:25:57 +0000 Subject: [PATCH 0353/1450] cpufreq/amd-pstate: Detect preferred core support before driver registration Booting with amd-pstate on 3rd Generation EPYC system incorrectly enabled ITMT support despite the system not supporting Preferred Core ranking. amd_pstate_init_prefcore() called during amd_pstate*_cpu_init() requires "amd_pstate_prefcore" to be set correctly however the preferred core support is detected only after driver registration which is too late. Swap the function calls around to detect preferred core support before registring the driver via amd_pstate_register_driver(). This ensures amd_pstate*_cpu_init() sees the correct value of "amd_pstate_prefcore" considering the platform support. Fixes: 279f838a61f9 ("x86/amd: Detect preferred cores in amd_get_boost_ratio_numerator()") Fixes: ff2653ded4d9 ("cpufreq/amd-pstate: Move registration after static function call update") Signed-off-by: K Prateek Nayak Acked-by: Mario Limonciello Link: https://lore.kernel.org/r/20241210032557.754-1-kprateek.nayak@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index d7630bab2516f..8b36450bbdf6a 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -1869,18 +1869,18 @@ static int __init amd_pstate_init(void) static_call_update(amd_pstate_update_perf, shmem_update_perf); } - ret = amd_pstate_register_driver(cppc_state); - if (ret) { - pr_err("failed to register with return %d\n", ret); - return ret; - } - if (amd_pstate_prefcore) { ret = amd_detect_prefcore(&amd_pstate_prefcore); if (ret) return ret; } + ret = amd_pstate_register_driver(cppc_state); + if (ret) { + pr_err("failed to register with return %d\n", ret); + return ret; + } + dev_root = bus_get_dev_root(&cpu_subsys); if (dev_root) { ret = sysfs_create_group(&dev_root->kobj, &amd_pstate_global_attr_group); From 8644b48714dca8bf2f42a4ff8311de8efc9bd8c3 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 14 May 2024 10:15:14 +0300 Subject: [PATCH 0354/1450] thunderbolt: Add support for Intel Panther Lake-M/P Intel Panther Lake-M/P has the same integrated Thunderbolt/USB4 controller as Lunar Lake. Add these PCI IDs to the driver list of supported devices. Cc: stable@vger.kernel.org Signed-off-by: Mika Westerberg --- drivers/thunderbolt/nhi.c | 8 ++++++++ drivers/thunderbolt/nhi.h | 4 ++++ 2 files changed, 12 insertions(+) diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c index 1257dd3ce7e6a..f3a2264e012bc 100644 --- a/drivers/thunderbolt/nhi.c +++ b/drivers/thunderbolt/nhi.c @@ -1520,6 +1520,14 @@ static struct pci_device_id nhi_ids[] = { .driver_data = (kernel_ulong_t)&icl_nhi_ops }, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_LNL_NHI1), .driver_data = (kernel_ulong_t)&icl_nhi_ops }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_PTL_M_NHI0), + .driver_data = (kernel_ulong_t)&icl_nhi_ops }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_PTL_M_NHI1), + .driver_data = (kernel_ulong_t)&icl_nhi_ops }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_PTL_P_NHI0), + .driver_data = (kernel_ulong_t)&icl_nhi_ops }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_PTL_P_NHI1), + .driver_data = (kernel_ulong_t)&icl_nhi_ops }, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_BARLOW_RIDGE_HOST_80G_NHI) }, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_BARLOW_RIDGE_HOST_40G_NHI) }, diff --git a/drivers/thunderbolt/nhi.h b/drivers/thunderbolt/nhi.h index 7a07c7c1a9c2c..16744f25a9a06 100644 --- a/drivers/thunderbolt/nhi.h +++ b/drivers/thunderbolt/nhi.h @@ -92,6 +92,10 @@ extern const struct tb_nhi_ops icl_nhi_ops; #define PCI_DEVICE_ID_INTEL_RPL_NHI1 0xa76d #define PCI_DEVICE_ID_INTEL_LNL_NHI0 0xa833 #define PCI_DEVICE_ID_INTEL_LNL_NHI1 0xa834 +#define PCI_DEVICE_ID_INTEL_PTL_M_NHI0 0xe333 +#define PCI_DEVICE_ID_INTEL_PTL_M_NHI1 0xe334 +#define PCI_DEVICE_ID_INTEL_PTL_P_NHI0 0xe433 +#define PCI_DEVICE_ID_INTEL_PTL_P_NHI1 0xe434 #define PCI_CLASS_SERIAL_USB_USB4 0x0c0340 From a4048c83fd87c65657a4acb17d639092d4b6133d Mon Sep 17 00:00:00 2001 From: Anumula Murali Mohan Reddy Date: Tue, 3 Dec 2024 19:30:53 +0530 Subject: [PATCH 0355/1450] RDMA/core: Fix ENODEV error for iWARP test over vlan If traffic is over vlan, cma_validate_port() fails to match net_device ifindex with bound_if_index and results in ENODEV error. As iWARP gid table is static, it contains entry corresponding to only one net device which is either real netdev or vlan netdev for cases like siw attached to a vlan interface. This patch fixes the issue by assigning bound_if_index with net device index, if real net device obtained from bound if index matches with net device retrieved from gid table Fixes: f8ef1be816bf ("RDMA/cma: Avoid GID lookups on iWARP devices") Link: https://lore.kernel.org/all/ZzNgdrjo1kSCGbRz@chelsio.com/ Signed-off-by: Anumula Murali Mohan Reddy Signed-off-by: Potnuri Bharat Teja Link: https://patch.msgid.link/20241203140052.3985-1-anumula@chelsio.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/cma.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 64ace0b968f07..91db10515d747 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -690,6 +690,7 @@ cma_validate_port(struct ib_device *device, u32 port, int bound_if_index = dev_addr->bound_dev_if; int dev_type = dev_addr->dev_type; struct net_device *ndev = NULL; + struct net_device *pdev = NULL; if (!rdma_dev_access_netns(device, id_priv->id.route.addr.dev_addr.net)) goto out; @@ -714,6 +715,21 @@ cma_validate_port(struct ib_device *device, u32 port, rcu_read_lock(); ndev = rcu_dereference(sgid_attr->ndev); + if (ndev->ifindex != bound_if_index) { + pdev = dev_get_by_index_rcu(dev_addr->net, bound_if_index); + if (pdev) { + if (is_vlan_dev(pdev)) { + pdev = vlan_dev_real_dev(pdev); + if (ndev->ifindex == pdev->ifindex) + bound_if_index = pdev->ifindex; + } + if (is_vlan_dev(ndev)) { + pdev = vlan_dev_real_dev(ndev); + if (bound_if_index == pdev->ifindex) + bound_if_index = ndev->ifindex; + } + } + } if (!net_eq(dev_net(ndev), dev_addr->net) || ndev->ifindex != bound_if_index) { rdma_put_gid_attr(sgid_attr); From efb113fc30e7b805f7375d269b93bb4593d11d97 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 15 Nov 2024 17:23:10 +0100 Subject: [PATCH 0356/1450] drm: rework FB_CORE dependency The 'select FB_CORE' statement moved from CONFIG_DRM to DRM_CLIENT_LIB, but there are now configurations that have code calling into fb_core as built-in even though the client_lib itself is a loadable module: x86_64-linux-ld: drivers/gpu/drm/drm_fbdev_shmem.o: in function `drm_fbdev_shmem_driver_fbdev_probe': drm_fbdev_shmem.c:(.text+0x1fc): undefined reference to `fb_deferred_io_init' x86_64-linux-ld: drivers/gpu/drm/drm_fbdev_shmem.o: in function `drm_fbdev_shmem_fb_destroy': drm_fbdev_shmem.c:(.text+0x2e1): undefined reference to `fb_deferred_io_cleanup' In addition to DRM_CLIENT_LIB, the 'select' needs to be at least in two more parts, DRM_KMS_HELPER and DRM_GEM_SHMEM_HELPER, so add those here. v3: - Remove FB_CORE from DRM_KMS_HELPER to avoid circular dependency Fixes: dadd28d4142f ("drm/client: Add client-lib module") Signed-off-by: Arnd Bergmann Reviewed-by: Thomas Zimmermann Signed-off-by: Thomas Zimmermann Link: https://patchwork.freedesktop.org/patch/msgid/20241115162323.3555229-1-arnd@kernel.org --- drivers/gpu/drm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index 5504721007cc1..a0690049b292e 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -372,6 +372,7 @@ config DRM_GEM_DMA_HELPER config DRM_GEM_SHMEM_HELPER tristate depends on DRM && MMU + select FB_CORE if DRM_FBDEV_EMULATION select FB_SYSMEM_HELPERS_DEFERRED if DRM_FBDEV_EMULATION help Choose this if you need the GEM shmem helper functions From c1043cdb019ed4d053d673e62b553a5cea1a287d Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Sat, 7 Dec 2024 21:26:55 -0300 Subject: [PATCH 0357/1450] alienware-wmi: Fix X Series and G Series quirks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Devices that are known to support the WMI thermal interface do not support the legacy LED control interface. Make `.num_zones = 0` and avoid calling alienware_zone_init() if that's the case. Fixes: 9f6c43041552 ("alienware-wmi: added platform profile support") Fixes: 1c1eb70e7d23 ("alienware-wmi: extends the list of supported models") Suggested-by: Armin Wolf Reviewed-by: Armin Wolf Signed-off-by: Kurt Borja Link: https://lore.kernel.org/r/20241208002652.5885-4-kuurtb@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/dell/alienware-wmi.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/platform/x86/dell/alienware-wmi.c b/drivers/platform/x86/dell/alienware-wmi.c index 77465ed9b449b..e69bf9a7b6c84 100644 --- a/drivers/platform/x86/dell/alienware-wmi.c +++ b/drivers/platform/x86/dell/alienware-wmi.c @@ -190,7 +190,7 @@ static struct quirk_entry quirk_asm201 = { }; static struct quirk_entry quirk_g_series = { - .num_zones = 2, + .num_zones = 0, .hdmi_mux = 0, .amplifier = 0, .deepslp = 0, @@ -199,7 +199,7 @@ static struct quirk_entry quirk_g_series = { }; static struct quirk_entry quirk_x_series = { - .num_zones = 2, + .num_zones = 0, .hdmi_mux = 0, .amplifier = 0, .deepslp = 0, @@ -687,6 +687,9 @@ static void alienware_zone_exit(struct platform_device *dev) { u8 zone; + if (!quirks->num_zones) + return; + sysfs_remove_group(&dev->dev.kobj, &zone_attribute_group); led_classdev_unregister(&global_led); if (zone_dev_attrs) { @@ -1229,9 +1232,11 @@ static int __init alienware_wmi_init(void) goto fail_prep_thermal_profile; } - ret = alienware_zone_init(platform_device); - if (ret) - goto fail_prep_zones; + if (quirks->num_zones > 0) { + ret = alienware_zone_init(platform_device); + if (ret) + goto fail_prep_zones; + } return 0; From 54a8cada2f3d7efb4a7920807473d89c442d9c45 Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Sat, 7 Dec 2024 21:30:15 -0300 Subject: [PATCH 0358/1450] alienware-wmi: Adds support to Alienware m16 R1 AMD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds support to Alienware m16 R1 AMD. Tested-by: Cihan Ozakca Signed-off-by: Kurt Borja Reviewed-by: Armin Wolf Link: https://lore.kernel.org/r/20241208003013.6490-3-kuurtb@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/dell/alienware-wmi.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/platform/x86/dell/alienware-wmi.c b/drivers/platform/x86/dell/alienware-wmi.c index e69bf9a7b6c84..341d01d3e3e4c 100644 --- a/drivers/platform/x86/dell/alienware-wmi.c +++ b/drivers/platform/x86/dell/alienware-wmi.c @@ -241,6 +241,15 @@ static const struct dmi_system_id alienware_quirks[] __initconst = { }, .driver_data = &quirk_asm201, }, + { + .callback = dmi_matched, + .ident = "Alienware m16 R1 AMD", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Alienware"), + DMI_MATCH(DMI_PRODUCT_NAME, "Alienware m16 R1 AMD"), + }, + .driver_data = &quirk_x_series, + }, { .callback = dmi_matched, .ident = "Alienware m17 R5", From 9244524d60ddea55f4df54c51200e8fef2032447 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Thu, 28 Nov 2024 09:28:33 +0900 Subject: [PATCH 0359/1450] p2sb: Factor out p2sb_read_from_cache() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To prepare for the following fix, factor out the code to read the P2SB resource from the cache to the new function p2sb_read_from_cache(). Signed-off-by: Shin'ichiro Kawasaki Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20241128002836.373745-2-shinichiro.kawasaki@wdc.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/p2sb.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/drivers/platform/x86/p2sb.c b/drivers/platform/x86/p2sb.c index d51eb0db06264..a685781d1272c 100644 --- a/drivers/platform/x86/p2sb.c +++ b/drivers/platform/x86/p2sb.c @@ -172,6 +172,22 @@ static int p2sb_cache_resources(void) return ret; } +static int p2sb_read_from_cache(struct pci_bus *bus, unsigned int devfn, + struct resource *mem) +{ + struct p2sb_res_cache *cache = &p2sb_resources[PCI_FUNC(devfn)]; + + if (cache->bus_dev_id != bus->dev.id) + return -ENODEV; + + if (!p2sb_valid_resource(&cache->res)) + return -ENOENT; + + memcpy(mem, &cache->res, sizeof(*mem)); + + return 0; +} + /** * p2sb_bar - Get Primary to Sideband (P2SB) bridge device BAR * @bus: PCI bus to communicate with @@ -188,8 +204,6 @@ static int p2sb_cache_resources(void) */ int p2sb_bar(struct pci_bus *bus, unsigned int devfn, struct resource *mem) { - struct p2sb_res_cache *cache; - bus = p2sb_get_bus(bus); if (!bus) return -ENODEV; @@ -197,15 +211,7 @@ int p2sb_bar(struct pci_bus *bus, unsigned int devfn, struct resource *mem) if (!devfn) p2sb_get_devfn(&devfn); - cache = &p2sb_resources[PCI_FUNC(devfn)]; - if (cache->bus_dev_id != bus->dev.id) - return -ENODEV; - - if (!p2sb_valid_resource(&cache->res)) - return -ENOENT; - - memcpy(mem, &cache->res, sizeof(*mem)); - return 0; + return p2sb_read_from_cache(bus, devfn, mem); } EXPORT_SYMBOL_GPL(p2sb_bar); From ae3e6ebc5ab046d434c05c58a3e3f7e94441fec2 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Thu, 28 Nov 2024 09:28:34 +0900 Subject: [PATCH 0360/1450] p2sb: Introduce the global flag p2sb_hidden_by_bios MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To prepare for the following fix, introduce the global flag p2sb_hidden_by_bios. Check if the BIOS hides the P2SB device and store the result in the flag. This allows to refer to the check result across functions. Signed-off-by: Shin'ichiro Kawasaki Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20241128002836.373745-3-shinichiro.kawasaki@wdc.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/p2sb.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/p2sb.c b/drivers/platform/x86/p2sb.c index a685781d1272c..630068e01f7e0 100644 --- a/drivers/platform/x86/p2sb.c +++ b/drivers/platform/x86/p2sb.c @@ -43,6 +43,7 @@ struct p2sb_res_cache { }; static struct p2sb_res_cache p2sb_resources[NR_P2SB_RES_CACHE]; +static bool p2sb_hidden_by_bios; static void p2sb_get_devfn(unsigned int *devfn) { @@ -158,13 +159,14 @@ static int p2sb_cache_resources(void) * Unhide the P2SB device here, if needed. */ pci_bus_read_config_dword(bus, devfn_p2sb, P2SBC, &value); - if (value & P2SBC_HIDE) + p2sb_hidden_by_bios = value & P2SBC_HIDE; + if (p2sb_hidden_by_bios) pci_bus_write_config_dword(bus, devfn_p2sb, P2SBC, 0); ret = p2sb_scan_and_cache(bus, devfn_p2sb); /* Hide the P2SB device, if it was hidden */ - if (value & P2SBC_HIDE) + if (p2sb_hidden_by_bios) pci_bus_write_config_dword(bus, devfn_p2sb, P2SBC, P2SBC_HIDE); pci_unlock_rescan_remove(); From 0286070c74ee48391fc07f7f617460479472d221 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Thu, 28 Nov 2024 09:28:35 +0900 Subject: [PATCH 0361/1450] p2sb: Move P2SB hide and unhide code to p2sb_scan_and_cache() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To prepare for the following fix, move the code to hide and unhide the P2SB device from p2sb_cache_resources() to p2sb_scan_and_cache(). Signed-off-by: Shin'ichiro Kawasaki Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20241128002836.373745-4-shinichiro.kawasaki@wdc.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/p2sb.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/drivers/platform/x86/p2sb.c b/drivers/platform/x86/p2sb.c index 630068e01f7e0..46c108bbcbbac 100644 --- a/drivers/platform/x86/p2sb.c +++ b/drivers/platform/x86/p2sb.c @@ -98,6 +98,14 @@ static void p2sb_scan_and_cache_devfn(struct pci_bus *bus, unsigned int devfn) static int p2sb_scan_and_cache(struct pci_bus *bus, unsigned int devfn) { + /* + * The BIOS prevents the P2SB device from being enumerated by the PCI + * subsystem, so we need to unhide and hide it back to lookup the BAR. + * Unhide the P2SB device here, if needed. + */ + if (p2sb_hidden_by_bios) + pci_bus_write_config_dword(bus, devfn, P2SBC, 0); + /* Scan the P2SB device and cache its BAR0 */ p2sb_scan_and_cache_devfn(bus, devfn); @@ -105,6 +113,10 @@ static int p2sb_scan_and_cache(struct pci_bus *bus, unsigned int devfn) if (devfn == P2SB_DEVFN_GOLDMONT) p2sb_scan_and_cache_devfn(bus, SPI_DEVFN_GOLDMONT); + /* Hide the P2SB device, if it was hidden */ + if (p2sb_hidden_by_bios) + pci_bus_write_config_dword(bus, devfn, P2SBC, P2SBC_HIDE); + if (!p2sb_valid_resource(&p2sb_resources[PCI_FUNC(devfn)].res)) return -ENOENT; @@ -153,22 +165,11 @@ static int p2sb_cache_resources(void) */ pci_lock_rescan_remove(); - /* - * The BIOS prevents the P2SB device from being enumerated by the PCI - * subsystem, so we need to unhide and hide it back to lookup the BAR. - * Unhide the P2SB device here, if needed. - */ pci_bus_read_config_dword(bus, devfn_p2sb, P2SBC, &value); p2sb_hidden_by_bios = value & P2SBC_HIDE; - if (p2sb_hidden_by_bios) - pci_bus_write_config_dword(bus, devfn_p2sb, P2SBC, 0); ret = p2sb_scan_and_cache(bus, devfn_p2sb); - /* Hide the P2SB device, if it was hidden */ - if (p2sb_hidden_by_bios) - pci_bus_write_config_dword(bus, devfn_p2sb, P2SBC, P2SBC_HIDE); - pci_unlock_rescan_remove(); return ret; From 360c400d0f568636c1b98d1d5f9f49aa3d420c70 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Thu, 28 Nov 2024 09:28:36 +0900 Subject: [PATCH 0362/1450] p2sb: Do not scan and remove the P2SB device when it is unhidden MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When drivers access P2SB device resources, it calls p2sb_bar(). Before the commit 5913320eb0b3 ("platform/x86: p2sb: Allow p2sb_bar() calls during PCI device probe"), p2sb_bar() obtained the resources and then called pci_stop_and_remove_bus_device() for clean up. Then the P2SB device disappeared. The commit 5913320eb0b3 introduced the P2SB device resource cache feature in the boot process. During the resource cache, pci_stop_and_remove_bus_device() is called for the P2SB device, then the P2SB device disappears regardless of whether p2sb_bar() is called or not. Such P2SB device disappearance caused a confusion [1]. To avoid the confusion, avoid the pci_stop_and_remove_bus_device() call when the BIOS does not hide the P2SB device. For that purpose, cache the P2SB device resources only if the BIOS hides the P2SB device. Call p2sb_scan_and_cache() only if p2sb_hidden_by_bios is true. This allows removing two branches from p2sb_scan_and_cache(). When p2sb_bar() is called, get the resources from the cache if the P2SB device is hidden. Otherwise, read the resources from the unhidden P2SB device. Reported-by: Daniel Walker (danielwa) Closes: https://lore.kernel.org/lkml/ZzTI+biIUTvFT6NC@goliath/ [1] Fixes: 5913320eb0b3 ("platform/x86: p2sb: Allow p2sb_bar() calls during PCI device probe") Signed-off-by: Shin'ichiro Kawasaki Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20241128002836.373745-5-shinichiro.kawasaki@wdc.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/p2sb.c | 42 +++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/drivers/platform/x86/p2sb.c b/drivers/platform/x86/p2sb.c index 46c108bbcbbac..cbbb0f809704e 100644 --- a/drivers/platform/x86/p2sb.c +++ b/drivers/platform/x86/p2sb.c @@ -101,10 +101,8 @@ static int p2sb_scan_and_cache(struct pci_bus *bus, unsigned int devfn) /* * The BIOS prevents the P2SB device from being enumerated by the PCI * subsystem, so we need to unhide and hide it back to lookup the BAR. - * Unhide the P2SB device here, if needed. */ - if (p2sb_hidden_by_bios) - pci_bus_write_config_dword(bus, devfn, P2SBC, 0); + pci_bus_write_config_dword(bus, devfn, P2SBC, 0); /* Scan the P2SB device and cache its BAR0 */ p2sb_scan_and_cache_devfn(bus, devfn); @@ -113,9 +111,7 @@ static int p2sb_scan_and_cache(struct pci_bus *bus, unsigned int devfn) if (devfn == P2SB_DEVFN_GOLDMONT) p2sb_scan_and_cache_devfn(bus, SPI_DEVFN_GOLDMONT); - /* Hide the P2SB device, if it was hidden */ - if (p2sb_hidden_by_bios) - pci_bus_write_config_dword(bus, devfn, P2SBC, P2SBC_HIDE); + pci_bus_write_config_dword(bus, devfn, P2SBC, P2SBC_HIDE); if (!p2sb_valid_resource(&p2sb_resources[PCI_FUNC(devfn)].res)) return -ENOENT; @@ -142,7 +138,7 @@ static int p2sb_cache_resources(void) u32 value = P2SBC_HIDE; struct pci_bus *bus; u16 class; - int ret; + int ret = 0; /* Get devfn for P2SB device itself */ p2sb_get_devfn(&devfn_p2sb); @@ -168,7 +164,12 @@ static int p2sb_cache_resources(void) pci_bus_read_config_dword(bus, devfn_p2sb, P2SBC, &value); p2sb_hidden_by_bios = value & P2SBC_HIDE; - ret = p2sb_scan_and_cache(bus, devfn_p2sb); + /* + * If the BIOS does not hide the P2SB device then its resources + * are accesilble. Cache them only if the P2SB device is hidden. + */ + if (p2sb_hidden_by_bios) + ret = p2sb_scan_and_cache(bus, devfn_p2sb); pci_unlock_rescan_remove(); @@ -191,6 +192,26 @@ static int p2sb_read_from_cache(struct pci_bus *bus, unsigned int devfn, return 0; } +static int p2sb_read_from_dev(struct pci_bus *bus, unsigned int devfn, + struct resource *mem) +{ + struct pci_dev *pdev; + int ret = 0; + + pdev = pci_get_slot(bus, devfn); + if (!pdev) + return -ENODEV; + + if (p2sb_valid_resource(pci_resource_n(pdev, 0))) + p2sb_read_bar0(pdev, mem); + else + ret = -ENOENT; + + pci_dev_put(pdev); + + return ret; +} + /** * p2sb_bar - Get Primary to Sideband (P2SB) bridge device BAR * @bus: PCI bus to communicate with @@ -214,7 +235,10 @@ int p2sb_bar(struct pci_bus *bus, unsigned int devfn, struct resource *mem) if (!devfn) p2sb_get_devfn(&devfn); - return p2sb_read_from_cache(bus, devfn, mem); + if (p2sb_hidden_by_bios) + return p2sb_read_from_cache(bus, devfn, mem); + + return p2sb_read_from_dev(bus, devfn, mem); } EXPORT_SYMBOL_GPL(p2sb_bar); From 41856638e6c4ed51d8aa9e54f70059d1e357b46e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 29 Nov 2024 17:39:27 +0100 Subject: [PATCH 0363/1450] s390/mm: Fix DirectMap accounting With uncoupling of physical and virtual address spaces population of the identity mapping was changed to use the type POPULATE_IDENTITY instead of POPULATE_DIRECT. This breaks DirectMap accounting: > cat /proc/meminfo DirectMap4k: 55296 kB DirectMap1M: 18446744073709496320 kB Adjust all locations of update_page_count() in vmem.c to use POPULATE_IDENTITY instead of POPULATE_DIRECT as well. With this accounting is correct again: > cat /proc/meminfo DirectMap4k: 54264 kB DirectMap1M: 8334336 kB Fixes: c98d2ecae08f ("s390/mm: Uncouple physical vs virtual address spaces") Cc: stable@vger.kernel.org Reviewed-by: Alexander Gordeev Signed-off-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/boot/vmem.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c index 145035f84a0e3..3fa28db2fe59f 100644 --- a/arch/s390/boot/vmem.c +++ b/arch/s390/boot/vmem.c @@ -306,7 +306,7 @@ static void pgtable_pte_populate(pmd_t *pmd, unsigned long addr, unsigned long e pages++; } } - if (mode == POPULATE_DIRECT) + if (mode == POPULATE_IDENTITY) update_page_count(PG_DIRECT_MAP_4K, pages); } @@ -339,7 +339,7 @@ static void pgtable_pmd_populate(pud_t *pud, unsigned long addr, unsigned long e } pgtable_pte_populate(pmd, addr, next, mode); } - if (mode == POPULATE_DIRECT) + if (mode == POPULATE_IDENTITY) update_page_count(PG_DIRECT_MAP_1M, pages); } @@ -372,7 +372,7 @@ static void pgtable_pud_populate(p4d_t *p4d, unsigned long addr, unsigned long e } pgtable_pmd_populate(pud, addr, next, mode); } - if (mode == POPULATE_DIRECT) + if (mode == POPULATE_IDENTITY) update_page_count(PG_DIRECT_MAP_2G, pages); } From a56335c85b592cb2833db0a71f7112b7d9f0d56b Mon Sep 17 00:00:00 2001 From: Prathamesh Shete Date: Mon, 9 Dec 2024 15:40:09 +0530 Subject: [PATCH 0364/1450] mmc: sdhci-tegra: Remove SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC quirk Value 0 in ADMA length descriptor is interpreted as 65536 on new Tegra chips, remove SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC quirk to make sure max ADMA2 length is 65536. Fixes: 4346b7c7941d ("mmc: tegra: Add Tegra186 support") Cc: stable@vger.kernel.org Signed-off-by: Prathamesh Shete Acked-by: Thierry Reding Acked-by: Adrian Hunter Message-ID: <20241209101009.22710-1-pshete@nvidia.com> Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-tegra.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/mmc/host/sdhci-tegra.c b/drivers/mmc/host/sdhci-tegra.c index 4d402b6018836..b2f5c3f8b8396 100644 --- a/drivers/mmc/host/sdhci-tegra.c +++ b/drivers/mmc/host/sdhci-tegra.c @@ -1525,7 +1525,6 @@ static const struct sdhci_pltfm_data sdhci_tegra186_pdata = { .quirks = SDHCI_QUIRK_BROKEN_TIMEOUT_VAL | SDHCI_QUIRK_SINGLE_POWER_WRITE | SDHCI_QUIRK_NO_HISPD_BIT | - SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC | SDHCI_QUIRK_CAP_CLOCK_BASE_BROKEN, .quirks2 = SDHCI_QUIRK2_PRESET_VALUE_BROKEN | SDHCI_QUIRK2_ISSUE_CMD_DAT_RESET_TOGETHER, From f3d87abe11ed04d1b23a474a212f0e5deeb50892 Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Tue, 3 Dec 2024 11:34:42 +0900 Subject: [PATCH 0365/1450] mmc: mtk-sd: disable wakeup in .remove() and in the error path of .probe() Current implementation leaves pdev->dev as a wakeup source. Add a device_init_wakeup(&pdev->dev, false) call in the .remove() function and in the error path of the .probe() function. Signed-off-by: Joe Hattori Fixes: 527f36f5efa4 ("mmc: mediatek: add support for SDIO eint wakup IRQ") Cc: stable@vger.kernel.org Message-ID: <20241203023442.2434018-1-joe@pf.is.s.u-tokyo.ac.jp> Signed-off-by: Ulf Hansson --- drivers/mmc/host/mtk-sd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/mmc/host/mtk-sd.c b/drivers/mmc/host/mtk-sd.c index efb0d2d5716b9..af445d3f8e2ae 100644 --- a/drivers/mmc/host/mtk-sd.c +++ b/drivers/mmc/host/mtk-sd.c @@ -3070,6 +3070,7 @@ static int msdc_drv_probe(struct platform_device *pdev) msdc_gate_clock(host); platform_set_drvdata(pdev, NULL); release_mem: + device_init_wakeup(&pdev->dev, false); if (host->dma.gpd) dma_free_coherent(&pdev->dev, 2 * sizeof(struct mt_gpdma_desc), @@ -3103,6 +3104,7 @@ static void msdc_drv_remove(struct platform_device *pdev) host->dma.gpd, host->dma.gpd_addr); dma_free_coherent(&pdev->dev, MAX_BD_NUM * sizeof(struct mt_bdma_desc), host->dma.bd, host->dma.bd_addr); + device_init_wakeup(&pdev->dev, false); } static void msdc_save_reg(struct msdc_host *host) From 50a062a7620051c09adacd6d140ebd56881a333b Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 9 Dec 2024 12:52:34 -0600 Subject: [PATCH 0366/1450] cpufreq/amd-pstate: Store the boost numerator as highest perf again commit ad4caad58d91d ("cpufreq: amd-pstate: Merge amd_pstate_highest_perf_set() into amd_get_boost_ratio_numerator()") changed the semantics for highest perf and commit 18d9b52271213 ("cpufreq/amd-pstate: Use nominal perf for limits when boost is disabled") worked around those semantic changes. This however is a confusing result and furthermore makes it awkward to change frequency limits and boost due to the scaling differences. Restore the boost numerator to highest perf again. Suggested-by: Dhananjay Ugwekar Reviewed-by: Gautham R. Shenoy Fixes: ad4caad58d91 ("cpufreq: amd-pstate: Merge amd_pstate_highest_perf_set() into amd_get_boost_ratio_numerator()") Link: https://lore.kernel.org/r/20241209185248.16301-2-mario.limonciello@amd.com Signed-off-by: Mario Limonciello --- Documentation/admin-guide/pm/amd-pstate.rst | 4 +--- drivers/cpufreq/amd-pstate.c | 25 ++++++++++++--------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst index 210a808b74ec2..412423c54f258 100644 --- a/Documentation/admin-guide/pm/amd-pstate.rst +++ b/Documentation/admin-guide/pm/amd-pstate.rst @@ -251,9 +251,7 @@ performance supported in `AMD CPPC Performance Capability `_). In some ASICs, the highest CPPC performance is not the one in the ``_CPC`` table, so we need to expose it to sysfs. If boost is not active, but still supported, this maximum frequency will be larger than the one in -``cpuinfo``. On systems that support preferred core, the driver will have -different values for some cores than others and this will reflect the values -advertised by the platform at bootup. +``cpuinfo``. This attribute is read-only. ``amd_pstate_lowest_nonlinear_freq`` diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 8b36450bbdf6a..ab6fe9c2150c9 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -374,15 +374,19 @@ static inline int amd_pstate_cppc_enable(bool enable) static int msr_init_perf(struct amd_cpudata *cpudata) { - u64 cap1; + u64 cap1, numerator; int ret = rdmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1, &cap1); if (ret) return ret; - WRITE_ONCE(cpudata->highest_perf, AMD_CPPC_HIGHEST_PERF(cap1)); - WRITE_ONCE(cpudata->max_limit_perf, AMD_CPPC_HIGHEST_PERF(cap1)); + ret = amd_get_boost_ratio_numerator(cpudata->cpu, &numerator); + if (ret) + return ret; + + WRITE_ONCE(cpudata->highest_perf, numerator); + WRITE_ONCE(cpudata->max_limit_perf, numerator); WRITE_ONCE(cpudata->nominal_perf, AMD_CPPC_NOMINAL_PERF(cap1)); WRITE_ONCE(cpudata->lowest_nonlinear_perf, AMD_CPPC_LOWNONLIN_PERF(cap1)); WRITE_ONCE(cpudata->lowest_perf, AMD_CPPC_LOWEST_PERF(cap1)); @@ -394,13 +398,18 @@ static int msr_init_perf(struct amd_cpudata *cpudata) static int shmem_init_perf(struct amd_cpudata *cpudata) { struct cppc_perf_caps cppc_perf; + u64 numerator; int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); if (ret) return ret; - WRITE_ONCE(cpudata->highest_perf, cppc_perf.highest_perf); - WRITE_ONCE(cpudata->max_limit_perf, cppc_perf.highest_perf); + ret = amd_get_boost_ratio_numerator(cpudata->cpu, &numerator); + if (ret) + return ret; + + WRITE_ONCE(cpudata->highest_perf, numerator); + WRITE_ONCE(cpudata->max_limit_perf, numerator); WRITE_ONCE(cpudata->nominal_perf, cppc_perf.nominal_perf); WRITE_ONCE(cpudata->lowest_nonlinear_perf, cppc_perf.lowest_nonlinear_perf); @@ -889,7 +898,6 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata) { int ret; u32 min_freq, max_freq; - u64 numerator; u32 nominal_perf, nominal_freq; u32 lowest_nonlinear_perf, lowest_nonlinear_freq; u32 boost_ratio, lowest_nonlinear_ratio; @@ -911,10 +919,7 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata) nominal_perf = READ_ONCE(cpudata->nominal_perf); - ret = amd_get_boost_ratio_numerator(cpudata->cpu, &numerator); - if (ret) - return ret; - boost_ratio = div_u64(numerator << SCHED_CAPACITY_SHIFT, nominal_perf); + boost_ratio = div_u64(cpudata->highest_perf << SCHED_CAPACITY_SHIFT, nominal_perf); max_freq = (nominal_freq * boost_ratio >> SCHED_CAPACITY_SHIFT) * 1000; lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf); From 2993b29b2a98f2bc9d55dfd37ef39f56a2908748 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 9 Dec 2024 12:52:35 -0600 Subject: [PATCH 0367/1450] cpufreq/amd-pstate: Use boost numerator for upper bound of frequencies commit 18d9b5227121 ("cpufreq/amd-pstate: Use nominal perf for limits when boost is disabled") introduced different semantics for min/max limits based upon whether the user turned off boost from sysfs. This however is not necessary when the highest perf value is the boost numerator. Suggested-by: Dhananjay Ugwekar Reviewed-by: Gautham R. Shenoy Fixes: 18d9b5227121 ("cpufreq/amd-pstate: Use nominal perf for limits when boost is disabled") Link: https://lore.kernel.org/r/20241209185248.16301-3-mario.limonciello@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index ab6fe9c2150c9..66e5dfc711c0c 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -570,16 +570,13 @@ static int amd_pstate_verify(struct cpufreq_policy_data *policy_data) static int amd_pstate_update_min_max_limit(struct cpufreq_policy *policy) { - u32 max_limit_perf, min_limit_perf, lowest_perf, max_perf; + u32 max_limit_perf, min_limit_perf, lowest_perf, max_perf, max_freq; struct amd_cpudata *cpudata = policy->driver_data; - if (cpudata->boost_supported && !policy->boost_enabled) - max_perf = READ_ONCE(cpudata->nominal_perf); - else - max_perf = READ_ONCE(cpudata->highest_perf); - - max_limit_perf = div_u64(policy->max * max_perf, policy->cpuinfo.max_freq); - min_limit_perf = div_u64(policy->min * max_perf, policy->cpuinfo.max_freq); + max_perf = READ_ONCE(cpudata->highest_perf); + max_freq = READ_ONCE(cpudata->max_freq); + max_limit_perf = div_u64(policy->max * max_perf, max_freq); + min_limit_perf = div_u64(policy->min * max_perf, max_freq); lowest_perf = READ_ONCE(cpudata->lowest_perf); if (min_limit_perf < lowest_perf) From 5d009e024056ded20c5bb1583146b833b23bbd5a Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 6 Dec 2024 08:52:30 +0800 Subject: [PATCH 0368/1450] of: Fix refcount leakage for OF node returned by __of_get_dma_parent() __of_get_dma_parent() returns OF device node @args.np, but the node's refcount is increased twice, by both of_parse_phandle_with_args() and of_node_get(), so causes refcount leakage for the node. Fix by directly returning the node got by of_parse_phandle_with_args(). Fixes: f83a6e5dea6c ("of: address: Add support for the parent DMA bus") Cc: stable@vger.kernel.org Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20241206-of_core_fix-v1-4-dc28ed56bec3@quicinc.com Signed-off-by: Rob Herring (Arm) --- drivers/of/address.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/of/address.c b/drivers/of/address.c index 5b7ee3ed5296b..c1f1c810e810d 100644 --- a/drivers/of/address.c +++ b/drivers/of/address.c @@ -620,7 +620,7 @@ struct device_node *__of_get_dma_parent(const struct device_node *np) if (ret < 0) return of_get_parent(np); - return of_node_get(args.np); + return args.np; } #endif From fec3edc47d5cfc2dd296a5141df887bf567944db Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Mon, 9 Dec 2024 21:24:59 +0800 Subject: [PATCH 0369/1450] of/irq: Fix interrupt-map cell length check in of_irq_parse_imap_parent() On a malformed interrupt-map property which is shorter than expected by 1 cell, we may read bogus data past the end of the property instead of returning an error in of_irq_parse_imap_parent(). Decrement the remaining length when skipping over the interrupt parent phandle cell. Fixes: 935df1bd40d4 ("of/irq: Factor out parsing of interrupt-map parent phandle+args from of_irq_parse_raw()") Cc: stable@vger.kernel.org Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20241209-of_irq_fix-v1-1-782f1419c8a1@quicinc.com [rh: reword commit msg] Signed-off-by: Rob Herring (Arm) --- drivers/of/irq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/of/irq.c b/drivers/of/irq.c index 67fc0ceaa5f51..43cf60479b9e1 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -111,6 +111,7 @@ const __be32 *of_irq_parse_imap_parent(const __be32 *imap, int len, struct of_ph else np = of_find_node_by_phandle(be32_to_cpup(imap)); imap++; + len--; /* Check if not found */ if (!np) { From 0f7ca6f69354e0c3923bbc28c92d0ecab4d50a3e Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Mon, 9 Dec 2024 21:25:02 +0800 Subject: [PATCH 0370/1450] of/irq: Fix using uninitialized variable @addr_len in API of_irq_parse_one() of_irq_parse_one() may use uninitialized variable @addr_len as shown below: // @addr_len is uninitialized int addr_len; // This operation does not touch @addr_len if it fails. addr = of_get_property(device, "reg", &addr_len); // Use uninitialized @addr_len if the operation fails. if (addr_len > sizeof(addr_buf)) addr_len = sizeof(addr_buf); // Check the operation result here. if (addr) memcpy(addr_buf, addr, addr_len); Fix by initializing @addr_len before the operation. Fixes: b739dffa5d57 ("of/irq: Prevent device address out-of-bounds read in interrupt map walk") Cc: stable@vger.kernel.org Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20241209-of_irq_fix-v1-4-782f1419c8a1@quicinc.com Signed-off-by: Rob Herring (Arm) --- drivers/of/irq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/of/irq.c b/drivers/of/irq.c index 43cf60479b9e1..98b1cf78ecac7 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -355,6 +355,7 @@ int of_irq_parse_one(struct device_node *device, int index, struct of_phandle_ar return of_irq_parse_oldworld(device, index, out_irq); /* Get the reg property (if any) */ + addr_len = 0; addr = of_get_property(device, "reg", &addr_len); /* Prevent out-of-bounds read in case of longer interrupt parent address size */ From da4d8c83358163df9a4addaeba0ef8bcb03b22e8 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 15 Nov 2024 09:00:32 -0800 Subject: [PATCH 0371/1450] cxl/pci: Fix potential bogus return value upon successful probing If cxl_pci_ras_unmask() returns non-zero, cxl_pci_probe() will end up returning that value, instead of zero. Fixes: 248529edc86f ("cxl: add RAS status unmasking for CXL") Reviewed-by: Fan Ni Signed-off-by: Davidlohr Bueso Reviewed-by: Ira Weiny Link: https://patch.msgid.link/20241115170032.108445-1-dave@stgolabs.net Signed-off-by: Dave Jiang --- drivers/cxl/pci.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 0241d1d7133a4..26ab06c9deffb 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -1032,8 +1032,7 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (rc) return rc; - rc = cxl_pci_ras_unmask(pdev); - if (rc) + if (cxl_pci_ras_unmask(pdev)) dev_dbg(&pdev->dev, "No RAS reporting unmasked\n"); pci_save_state(pdev); From 09ceba3a93450b652ae6910b6f65be99885f4437 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Fri, 29 Nov 2024 21:28:25 +0800 Subject: [PATCH 0372/1450] cxl/pci: Check dport->regs.rcd_pcie_cap availability before accessing RCD Upstream Port's PCI Express Capability is a component registers block stored in RCD Upstream Port RCRB. CXL PCI driver helps to map it during the RCD probing, but mapping failure is allowed for component registers blocks in CXL PCI driver. dport->regs.rcd_pcie_cap is used to store the virtual address of the RCD Upstream Port's PCI Express Capability, add a dport->regs.rcd_pcie_cap checking in rcd_pcie_cap_emit() just in case user accesses a invalid address via RCD sysfs. Fixes: c5eaec79fa43 ("cxl/pci: Add sysfs attribute for CXL 1.1 device link status") Signed-off-by: Li Ming Reviewed-by: Alison Schofield Reviewed-by: Dan Williams Link: https://patch.msgid.link/20241129132825.569237-1-ming.li@zohomail.com Signed-off-by: Dave Jiang --- drivers/cxl/pci.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 26ab06c9deffb..6d94ff4a4f1a6 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -836,6 +836,9 @@ static ssize_t rcd_pcie_cap_emit(struct device *dev, u16 offset, char *buf, size if (!root_dev) return -ENXIO; + if (!dport->regs.rcd_pcie_cap) + return -ENXIO; + guard(device)(root_dev); if (!root_dev->driver) return -ENXIO; From 76467a94810c2aa4dd3096903291ac6df30c399e Mon Sep 17 00:00:00 2001 From: Huaisheng Ye Date: Mon, 9 Dec 2024 15:33:02 -0800 Subject: [PATCH 0373/1450] cxl/region: Fix region creation for greater than x2 switches The cxl_port_setup_targets() algorithm fails to identify valid target list ordering in the presence of 4-way and above switches resulting in 'cxl create-region' failures of the form: $ cxl create-region -d decoder0.0 -g 1024 -s 2G -t ram -w 8 -m mem4 mem1 mem6 mem3 mem2 mem5 mem7 mem0 cxl region: create_region: region0: failed to set target7 to mem0 cxl region: cmd_create_region: created 0 regions [kernel debug message] check_last_peer:1213: cxl region0: pci0000:0c:port1: cannot host mem6:decoder7.0 at 2 bus_remove_device:574: bus: 'cxl': remove device region0 QEMU can create this failing topology: ACPI0017:00 [root0] | HB_0 [port1] / \ RP_0 RP_1 | | USP [port2] USP [port3] / / \ \ / / \ \ DSP DSP DSP DSP DSP DSP DSP DSP | | | | | | | | mem4 mem6 mem2 mem7 mem1 mem3 mem5 mem0 Pos: 0 2 4 6 1 3 5 7 HB: Host Bridge RP: Root Port USP: Upstream Port DSP: Downstream Port ...with the following command steps: $ qemu-system-x86_64 -machine q35,cxl=on,accel=tcg \ -smp cpus=8 \ -m 8G \ -hda /home/work/vm-images/centos-stream8-02.qcow2 \ -object memory-backend-ram,size=4G,id=m0 \ -object memory-backend-ram,size=4G,id=m1 \ -object memory-backend-ram,size=2G,id=cxl-mem0 \ -object memory-backend-ram,size=2G,id=cxl-mem1 \ -object memory-backend-ram,size=2G,id=cxl-mem2 \ -object memory-backend-ram,size=2G,id=cxl-mem3 \ -object memory-backend-ram,size=2G,id=cxl-mem4 \ -object memory-backend-ram,size=2G,id=cxl-mem5 \ -object memory-backend-ram,size=2G,id=cxl-mem6 \ -object memory-backend-ram,size=2G,id=cxl-mem7 \ -numa node,memdev=m0,cpus=0-3,nodeid=0 \ -numa node,memdev=m1,cpus=4-7,nodeid=1 \ -netdev user,id=net0,hostfwd=tcp::2222-:22 \ -device virtio-net-pci,netdev=net0 \ -device pxb-cxl,bus_nr=12,bus=pcie.0,id=cxl.1 \ -device cxl-rp,port=0,bus=cxl.1,id=root_port0,chassis=0,slot=0 \ -device cxl-rp,port=1,bus=cxl.1,id=root_port1,chassis=0,slot=1 \ -device cxl-upstream,bus=root_port0,id=us0 \ -device cxl-downstream,port=0,bus=us0,id=swport0,chassis=0,slot=4 \ -device cxl-type3,bus=swport0,volatile-memdev=cxl-mem0,id=cxl-vmem0 \ -device cxl-downstream,port=1,bus=us0,id=swport1,chassis=0,slot=5 \ -device cxl-type3,bus=swport1,volatile-memdev=cxl-mem1,id=cxl-vmem1 \ -device cxl-downstream,port=2,bus=us0,id=swport2,chassis=0,slot=6 \ -device cxl-type3,bus=swport2,volatile-memdev=cxl-mem2,id=cxl-vmem2 \ -device cxl-downstream,port=3,bus=us0,id=swport3,chassis=0,slot=7 \ -device cxl-type3,bus=swport3,volatile-memdev=cxl-mem3,id=cxl-vmem3 \ -device cxl-upstream,bus=root_port1,id=us1 \ -device cxl-downstream,port=4,bus=us1,id=swport4,chassis=0,slot=8 \ -device cxl-type3,bus=swport4,volatile-memdev=cxl-mem4,id=cxl-vmem4 \ -device cxl-downstream,port=5,bus=us1,id=swport5,chassis=0,slot=9 \ -device cxl-type3,bus=swport5,volatile-memdev=cxl-mem5,id=cxl-vmem5 \ -device cxl-downstream,port=6,bus=us1,id=swport6,chassis=0,slot=10 \ -device cxl-type3,bus=swport6,volatile-memdev=cxl-mem6,id=cxl-vmem6 \ -device cxl-downstream,port=7,bus=us1,id=swport7,chassis=0,slot=11 \ -device cxl-type3,bus=swport7,volatile-memdev=cxl-mem7,id=cxl-vmem7 \ -M cxl-fmw.0.targets.0=cxl.1,cxl-fmw.0.size=32G & In Guest OS: $ cxl create-region -d decoder0.0 -g 1024 -s 2G -t ram -w 8 -m mem4 mem1 mem6 mem3 mem2 mem5 mem7 mem0 Fix the method to calculate @distance by iterativeley multiplying the number of targets per switch port. This also follows the algorithm recommended here [1]. Fixes: 27b3f8d13830 ("cxl/region: Program target lists") Link: http://lore.kernel.org/6538824b52349_7258329466@dwillia2-xfh.jf.intel.com.notmuch [1] Signed-off-by: Huaisheng Ye Tested-by: Li Zhijian [djbw: add a comment explaining 'distance'] Signed-off-by: Dan Williams Link: https://patch.msgid.link/173378716722.1270362.9546805175813426729.stgit@dwillia2-xfh.jf.intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/region.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index d778996507984..b98b1ccffd1ca 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -1295,6 +1295,7 @@ static int cxl_port_setup_targets(struct cxl_port *port, struct cxl_region_params *p = &cxlr->params; struct cxl_decoder *cxld = cxl_rr->decoder; struct cxl_switch_decoder *cxlsd; + struct cxl_port *iter = port; u16 eig, peig; u8 eiw, peiw; @@ -1311,16 +1312,26 @@ static int cxl_port_setup_targets(struct cxl_port *port, cxlsd = to_cxl_switch_decoder(&cxld->dev); if (cxl_rr->nr_targets_set) { - int i, distance; + int i, distance = 1; + struct cxl_region_ref *cxl_rr_iter; /* - * Passthrough decoders impose no distance requirements between - * peers + * The "distance" between peer downstream ports represents which + * endpoint positions in the region interleave a given port can + * host. + * + * For example, at the root of a hierarchy the distance is + * always 1 as every index targets a different host-bridge. At + * each subsequent switch level those ports map every Nth region + * position where N is the width of the switch == distance. */ - if (cxl_rr->nr_targets == 1) - distance = 0; - else - distance = p->nr_targets / cxl_rr->nr_targets; + do { + cxl_rr_iter = cxl_rr_load(iter, cxlr); + distance *= cxl_rr_iter->nr_targets; + iter = to_cxl_port(iter->dev.parent); + } while (!is_cxl_root(iter)); + distance *= cxlrd->cxlsd.cxld.interleave_ways; + for (i = 0; i < cxl_rr->nr_targets_set; i++) if (ep->dport == cxlsd->target[i]) { rc = check_last_peer(cxled, ep, cxl_rr, From 2872e21c47c359b902e53faf7e749c8ea682f7f7 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Wed, 4 Dec 2024 16:22:47 +0100 Subject: [PATCH 0374/1450] MAINTAINERS: align Danilo's maintainer entries Some entries use my kernel.org address, while others use my Red Hat one. Since this is a bit of an inconvinience for me, align them to all use the same (kernel.org) address. Acked-by: Dave Airlie Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20241204152248.8644-1-dakr@kernel.org --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 1e930c7a58b13..c669c5bd61e75 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7345,7 +7345,7 @@ F: drivers/gpu/drm/panel/panel-novatek-nt36672a.c DRM DRIVER FOR NVIDIA GEFORCE/QUADRO GPUS M: Karol Herbst M: Lyude Paul -M: Danilo Krummrich +M: Danilo Krummrich L: dri-devel@lists.freedesktop.org L: nouveau@lists.freedesktop.org S: Supported @@ -8922,7 +8922,7 @@ F: include/linux/arm_ffa.h FIRMWARE LOADER (request_firmware) M: Luis Chamberlain M: Russ Weight -M: Danilo Krummrich +M: Danilo Krummrich L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/firmware_class/ From 5595e3613ea768b81f600d37ea6880983339f21a Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 7 Dec 2024 15:18:44 -0600 Subject: [PATCH 0375/1450] dsa: mv88e6xxx: Move available stats into info structure Different families of switches have different statistics available. This information is current hard coded into functions, however this information will also soon be needed when getting statistics from the RMU. Move it into the info structure. Signed-off-by: Andrew Lunn Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241207-v6-13-rc1-net-next-mv88e6xxx-stats-refactor-v1-1-b9960f839846@lunn.ch Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mv88e6xxx/chip.c | 42 +++++++++++++++++++++++++++++--- drivers/net/dsa/mv88e6xxx/chip.h | 1 + 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 3a792f79270d9..794653b53bb5b 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -1289,7 +1289,7 @@ static size_t mv88e6095_stats_get_stat(struct mv88e6xxx_chip *chip, int port, const struct mv88e6xxx_hw_stat *stat, uint64_t *data) { - if (!(stat->type & (STATS_TYPE_BANK0 | STATS_TYPE_PORT))) + if (!(stat->type & chip->info->stats_type)) return 0; *data = _mv88e6xxx_get_ethtool_stat(chip, stat, port, 0, @@ -1301,7 +1301,7 @@ static size_t mv88e6250_stats_get_stat(struct mv88e6xxx_chip *chip, int port, const struct mv88e6xxx_hw_stat *stat, uint64_t *data) { - if (!(stat->type & STATS_TYPE_BANK0)) + if (!(stat->type & chip->info->stats_type)) return 0; *data = _mv88e6xxx_get_ethtool_stat(chip, stat, port, 0, @@ -1313,7 +1313,7 @@ static size_t mv88e6320_stats_get_stat(struct mv88e6xxx_chip *chip, int port, const struct mv88e6xxx_hw_stat *stat, uint64_t *data) { - if (!(stat->type & (STATS_TYPE_BANK0 | STATS_TYPE_BANK1))) + if (!(stat->type & chip->info->stats_type)) return 0; *data = _mv88e6xxx_get_ethtool_stat(chip, stat, port, @@ -1326,7 +1326,7 @@ static size_t mv88e6390_stats_get_stat(struct mv88e6xxx_chip *chip, int port, const struct mv88e6xxx_hw_stat *stat, uint64_t *data) { - if (!(stat->type & (STATS_TYPE_BANK0 | STATS_TYPE_BANK1))) + if (!(stat->type & chip->info->stats_type)) return 0; *data = _mv88e6xxx_get_ethtool_stat(chip, stat, port, @@ -5645,6 +5645,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 9, .g2_irqs = 5, + .stats_type = STATS_TYPE_BANK0, .atu_move_port_mask = 0xf, .dual_chip = true, .ops = &mv88e6250_ops, @@ -5665,6 +5666,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 9, .g2_irqs = 5, + .stats_type = STATS_TYPE_BANK0, .atu_move_port_mask = 0xf, .dual_chip = true, .ops = &mv88e6250_ops, @@ -5687,6 +5689,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 8, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_PORT, .atu_move_port_mask = 0xf, .pvt = true, .multi_chip = true, @@ -5708,6 +5711,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .global2_addr = 0x1c, .age_time_coeff = 15000, .g1_irqs = 8, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_PORT, .atu_move_port_mask = 0xf, .multi_chip = true, .ops = &mv88e6095_ops, @@ -5730,6 +5734,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 8, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_PORT, .atu_move_port_mask = 0xf, .pvt = true, .multi_chip = true, @@ -5754,6 +5759,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 9, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_PORT, .atu_move_port_mask = 0xf, .pvt = true, .multi_chip = true, @@ -5776,6 +5782,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .global2_addr = 0x1c, .age_time_coeff = 15000, .g1_irqs = 9, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_PORT, .atu_move_port_mask = 0xf, .multi_chip = true, .ops = &mv88e6131_ops, @@ -5800,6 +5807,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .atu_move_port_mask = 0x1f, .g1_irqs = 9, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_BANK1, .pvt = true, .multi_chip = true, .edsa_support = MV88E6XXX_EDSA_SUPPORTED, @@ -5823,6 +5831,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 9, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_PORT, .atu_move_port_mask = 0xf, .pvt = true, .multi_chip = true, @@ -5848,6 +5857,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 9, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_PORT, .atu_move_port_mask = 0xf, .pvt = true, .multi_chip = true, @@ -5872,6 +5882,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 9, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_BANK1, .atu_move_port_mask = 0xf, .pvt = true, .multi_chip = true, @@ -5897,6 +5908,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 9, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_PORT, .atu_move_port_mask = 0xf, .pvt = true, .multi_chip = true, @@ -5921,6 +5933,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 9, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_PORT, .atu_move_port_mask = 0xf, .pvt = true, .multi_chip = true, @@ -5946,6 +5959,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 9, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_PORT, .atu_move_port_mask = 0xf, .pvt = true, .multi_chip = true, @@ -5968,6 +5982,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .global2_addr = 0x1c, .age_time_coeff = 15000, .g1_irqs = 8, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_PORT, .atu_move_port_mask = 0xf, .multi_chip = true, .edsa_support = MV88E6XXX_EDSA_SUPPORTED, @@ -5992,6 +6007,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 3750, .g1_irqs = 9, .g2_irqs = 14, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_BANK1, .pvt = true, .multi_chip = true, .atu_move_port_mask = 0x1f, @@ -6016,6 +6032,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 3750, .g1_irqs = 9, .g2_irqs = 14, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_BANK1, .atu_move_port_mask = 0x1f, .pvt = true, .multi_chip = true, @@ -6039,6 +6056,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 3750, .g1_irqs = 9, .g2_irqs = 14, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_BANK1, .atu_move_port_mask = 0x1f, .pvt = true, .multi_chip = true, @@ -6063,6 +6081,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 3750, .g1_irqs = 10, .g2_irqs = 14, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_BANK1, .atu_move_port_mask = 0x1f, .pvt = true, .multi_chip = true, @@ -6087,6 +6106,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 3750, .g1_irqs = 10, .g2_irqs = 14, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_BANK1, .atu_move_port_mask = 0x1f, .pvt = true, .multi_chip = true, @@ -6114,6 +6134,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 9, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0, .atu_move_port_mask = 0xf, .dual_chip = true, .ptp_support = true, @@ -6138,6 +6159,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 9, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_PORT, .atu_move_port_mask = 0xf, .pvt = true, .multi_chip = true, @@ -6161,6 +6183,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 9, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0, .atu_move_port_mask = 0xf, .dual_chip = true, .ptp_support = true, @@ -6184,6 +6207,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 3750, .g1_irqs = 9, .g2_irqs = 14, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_BANK1, .atu_move_port_mask = 0x1f, .pvt = true, .multi_chip = true, @@ -6208,6 +6232,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 8, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_BANK1, .atu_move_port_mask = 0xf, .pvt = true, .multi_chip = true, @@ -6233,6 +6258,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 8, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_BANK1, .atu_move_port_mask = 0xf, .multi_chip = true, .edsa_support = MV88E6XXX_EDSA_SUPPORTED, @@ -6259,6 +6285,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .atu_move_port_mask = 0x1f, .g1_irqs = 9, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_BANK1, .pvt = true, .multi_chip = true, .edsa_support = MV88E6XXX_EDSA_SUPPORTED, @@ -6283,6 +6310,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 9, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_PORT, .atu_move_port_mask = 0xf, .pvt = true, .multi_chip = true, @@ -6307,6 +6335,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 9, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_PORT, .atu_move_port_mask = 0xf, .pvt = true, .multi_chip = true, @@ -6332,6 +6361,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 15000, .g1_irqs = 9, .g2_irqs = 10, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_PORT, .atu_move_port_mask = 0xf, .pvt = true, .multi_chip = true, @@ -6359,6 +6389,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 3750, .g1_irqs = 10, .g2_irqs = 14, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_BANK1, .atu_move_port_mask = 0x1f, .pvt = true, .multi_chip = true, @@ -6383,6 +6414,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 3750, .g1_irqs = 9, .g2_irqs = 14, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_BANK1, .atu_move_port_mask = 0x1f, .pvt = true, .multi_chip = true, @@ -6408,6 +6440,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 3750, .g1_irqs = 9, .g2_irqs = 14, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_BANK1, .atu_move_port_mask = 0x1f, .pvt = true, .multi_chip = true, @@ -6433,6 +6466,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .age_time_coeff = 3750, .g1_irqs = 10, .g2_irqs = 14, + .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_BANK1, .atu_move_port_mask = 0x1f, .pvt = true, .multi_chip = true, diff --git a/drivers/net/dsa/mv88e6xxx/chip.h b/drivers/net/dsa/mv88e6xxx/chip.h index 9fe8e8a7856b2..86bf113c9bfa1 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.h +++ b/drivers/net/dsa/mv88e6xxx/chip.h @@ -144,6 +144,7 @@ struct mv88e6xxx_info { unsigned int age_time_coeff; unsigned int g1_irqs; unsigned int g2_irqs; + int stats_type; bool pvt; /* Mark certain ports as invalid. This is required for example for the From 9a4eef6bf2bee375e94a3647cc11906ed3ee58f7 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 7 Dec 2024 15:18:45 -0600 Subject: [PATCH 0376/1450] dsa: mv88e6xxx: Centralise common statistics check With moving information about available statistics into the info structure, the test becomes identical. Consolidate them into a single test. Signed-off-by: Andrew Lunn Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241207-v6-13-rc1-net-next-mv88e6xxx-stats-refactor-v1-2-b9960f839846@lunn.ch Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mv88e6xxx/chip.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 794653b53bb5b..34708c739b045 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -1289,9 +1289,6 @@ static size_t mv88e6095_stats_get_stat(struct mv88e6xxx_chip *chip, int port, const struct mv88e6xxx_hw_stat *stat, uint64_t *data) { - if (!(stat->type & chip->info->stats_type)) - return 0; - *data = _mv88e6xxx_get_ethtool_stat(chip, stat, port, 0, MV88E6XXX_G1_STATS_OP_HIST_RX); return 1; @@ -1301,9 +1298,6 @@ static size_t mv88e6250_stats_get_stat(struct mv88e6xxx_chip *chip, int port, const struct mv88e6xxx_hw_stat *stat, uint64_t *data) { - if (!(stat->type & chip->info->stats_type)) - return 0; - *data = _mv88e6xxx_get_ethtool_stat(chip, stat, port, 0, MV88E6XXX_G1_STATS_OP_HIST_RX); return 1; @@ -1313,9 +1307,6 @@ static size_t mv88e6320_stats_get_stat(struct mv88e6xxx_chip *chip, int port, const struct mv88e6xxx_hw_stat *stat, uint64_t *data) { - if (!(stat->type & chip->info->stats_type)) - return 0; - *data = _mv88e6xxx_get_ethtool_stat(chip, stat, port, MV88E6XXX_G1_STATS_OP_BANK_1_BIT_9, MV88E6XXX_G1_STATS_OP_HIST_RX); @@ -1326,9 +1317,6 @@ static size_t mv88e6390_stats_get_stat(struct mv88e6xxx_chip *chip, int port, const struct mv88e6xxx_hw_stat *stat, uint64_t *data) { - if (!(stat->type & chip->info->stats_type)) - return 0; - *data = _mv88e6xxx_get_ethtool_stat(chip, stat, port, MV88E6XXX_G1_STATS_OP_BANK_1_BIT_10, 0); @@ -1341,6 +1329,9 @@ static size_t mv88e6xxx_stats_get_stat(struct mv88e6xxx_chip *chip, int port, { int ret = 0; + if (!(stat->type & chip->info->stats_type)) + return 0; + if (chip->info->ops->stats_get_stat) { mv88e6xxx_reg_lock(chip); ret = chip->info->ops->stats_get_stat(chip, port, stat, data); From 46afe345ff181e3b72830cb12f1e11cc837cc58e Mon Sep 17 00:00:00 2001 From: Furong Xu <0x1207@gmail.com> Date: Sun, 8 Dec 2024 15:02:02 +0800 Subject: [PATCH 0377/1450] net: stmmac: Relocate extern declarations in common.h and hwif.h The extern declarations should be in a header file that corresponds to their definition, move these extern declarations to its header file. Some of them have nowhere to go, so move them to hwif.h since they are referenced in hwif.c only. dwmac100_* dwmac1000_* dwmac4_* dwmac410_* dwmac510_* stay in hwif.h, otherwise you will be flooded with name conflicts from dwmac100.h, dwmac1000.h and dwmac4.h if hwif.c try to #include these .h files. Compile tested only. No functional change intended. Suggested-by: Russell King (Oracle) Signed-off-by: Furong Xu <0x1207@gmail.com> Link: https://patch.msgid.link/20241208070202.203931-1-0x1207@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/common.h | 14 -------------- .../net/ethernet/stmicro/stmmac/dwmac4_descs.h | 3 +++ drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h | 5 +++++ drivers/net/ethernet/stmicro/stmmac/hwif.c | 2 ++ drivers/net/ethernet/stmicro/stmmac/hwif.h | 16 +++++++++------- drivers/net/ethernet/stmicro/stmmac/mmc.h | 3 +++ drivers/net/ethernet/stmicro/stmmac/stmmac_est.h | 2 ++ drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h | 3 +++ 8 files changed, 27 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h index 70d601f454811..e25db747a81a5 100644 --- a/drivers/net/ethernet/stmicro/stmmac/common.h +++ b/drivers/net/ethernet/stmicro/stmmac/common.h @@ -545,18 +545,8 @@ struct dma_features { #define STMMAC_VLAN_INSERT 0x2 #define STMMAC_VLAN_REPLACE 0x3 -extern const struct stmmac_desc_ops enh_desc_ops; -extern const struct stmmac_desc_ops ndesc_ops; - struct mac_device_info; -extern const struct stmmac_hwtimestamp stmmac_ptp; -extern const struct stmmac_hwtimestamp dwmac1000_ptp; -extern const struct stmmac_mode_ops dwmac4_ring_mode_ops; - -extern const struct ptp_clock_info stmmac_ptp_clock_ops; -extern const struct ptp_clock_info dwmac1000_ptp_clock_ops; - struct mac_link { u32 caps; u32 speed_mask; @@ -643,8 +633,4 @@ void stmmac_dwmac4_set_mac(void __iomem *ioaddr, bool enable); void dwmac_dma_flush_tx_fifo(void __iomem *ioaddr); -extern const struct stmmac_mode_ops ring_mode_ops; -extern const struct stmmac_mode_ops chain_mode_ops; -extern const struct stmmac_desc_ops dwmac4_desc_ops; - #endif /* __COMMON_H__ */ diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.h index 1ce6f43d545ae..806555976496e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.h @@ -144,4 +144,7 @@ /* TDS3 use for both format (read and write back) */ #define RDES3_OWN BIT(31) +extern const struct stmmac_mode_ops dwmac4_ring_mode_ops; +extern const struct stmmac_desc_ops dwmac4_desc_ops; + #endif /* __DWMAC4_DESCS_H__ */ diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h index a04a790036927..20027d3c25a79 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h @@ -493,4 +493,9 @@ #define XGMAC_RDES3_TSD BIT(6) #define XGMAC_RDES3_TSA BIT(4) +extern const struct stmmac_ops dwxgmac210_ops; +extern const struct stmmac_ops dwxlgmac2_ops; +extern const struct stmmac_dma_ops dwxgmac210_dma_ops; +extern const struct stmmac_desc_ops dwxgmac210_desc_ops; + #endif /* __STMMAC_DWXGMAC2_H__ */ diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.c b/drivers/net/ethernet/stmicro/stmmac/hwif.c index a72d336a83508..4bd79de2e2220 100644 --- a/drivers/net/ethernet/stmicro/stmmac/hwif.c +++ b/drivers/net/ethernet/stmicro/stmmac/hwif.c @@ -9,6 +9,8 @@ #include "stmmac_fpe.h" #include "stmmac_ptp.h" #include "stmmac_est.h" +#include "dwmac4_descs.h" +#include "dwxgmac2.h" static u32 stmmac_get_id(struct stmmac_priv *priv, u32 id_reg) { diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h index 64f8ed67dcc4a..e428c82b7d317 100644 --- a/drivers/net/ethernet/stmicro/stmmac/hwif.h +++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h @@ -665,6 +665,15 @@ struct stmmac_regs_off { u32 est_off; }; +extern const struct stmmac_desc_ops enh_desc_ops; +extern const struct stmmac_desc_ops ndesc_ops; + +extern const struct stmmac_hwtimestamp stmmac_ptp; +extern const struct stmmac_hwtimestamp dwmac1000_ptp; + +extern const struct stmmac_mode_ops ring_mode_ops; +extern const struct stmmac_mode_ops chain_mode_ops; + extern const struct stmmac_ops dwmac100_ops; extern const struct stmmac_dma_ops dwmac100_dma_ops; extern const struct stmmac_ops dwmac1000_ops; @@ -677,13 +686,6 @@ extern const struct stmmac_ops dwmac510_ops; extern const struct stmmac_tc_ops dwmac4_tc_ops; extern const struct stmmac_tc_ops dwmac510_tc_ops; extern const struct stmmac_tc_ops dwxgmac_tc_ops; -extern const struct stmmac_ops dwxgmac210_ops; -extern const struct stmmac_ops dwxlgmac2_ops; -extern const struct stmmac_dma_ops dwxgmac210_dma_ops; -extern const struct stmmac_desc_ops dwxgmac210_desc_ops; -extern const struct stmmac_mmc_ops dwmac_mmc_ops; -extern const struct stmmac_mmc_ops dwxgmac_mmc_ops; -extern const struct stmmac_est_ops dwmac510_est_ops; #define GMAC_VERSION 0x00000020 /* GMAC CORE Version */ #define GMAC4_VERSION 0x00000110 /* GMAC4+ CORE Version */ diff --git a/drivers/net/ethernet/stmicro/stmmac/mmc.h b/drivers/net/ethernet/stmicro/stmmac/mmc.h index 5d1ea3e07459a..1cba39fb2c44c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/mmc.h +++ b/drivers/net/ethernet/stmicro/stmmac/mmc.h @@ -139,4 +139,7 @@ struct stmmac_counters { unsigned int mmc_rx_fpe_fragment_cntr; }; +extern const struct stmmac_mmc_ops dwmac_mmc_ops; +extern const struct stmmac_mmc_ops dwxgmac_mmc_ops; + #endif /* __MMC_H__ */ diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_est.h b/drivers/net/ethernet/stmicro/stmmac/stmmac_est.h index 7a858c566e7e5..d247fa383a6e4 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_est.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_est.h @@ -62,3 +62,5 @@ #define EST_SRWO BIT(0) #define EST_GCL_DATA 0x00000034 + +extern const struct stmmac_est_ops dwmac510_est_ops; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h index 4cc70480ce0f0..3fe0e3a80e80a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h @@ -104,4 +104,7 @@ int dwmac1000_ptp_enable(struct ptp_clock_info *ptp, void dwmac1000_get_ptptime(void __iomem *ptpaddr, u64 *ptp_time); void dwmac1000_timestamp_interrupt(struct stmmac_priv *priv); +extern const struct ptp_clock_info stmmac_ptp_clock_ops; +extern const struct ptp_clock_info dwmac1000_ptp_clock_ops; + #endif /* __STMMAC_PTP_H__ */ From 33035977b464fc15fa1028606a05316f91f14a23 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 9 Dec 2024 00:49:55 +0100 Subject: [PATCH 0378/1450] net: pktgen: Use kthread_create_on_cpu() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the proper API instead of open coding it. Signed-off-by: Frederic Weisbecker Reviewed-by: Eric Dumazet Reviewed-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20241208234955.31910-1-frederic@kernel.org Signed-off-by: Jakub Kicinski --- net/core/pktgen.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 7e23cacbe66e4..ee95dbb0539a0 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3883,17 +3883,14 @@ static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn) list_add_tail(&t->th_list, &pn->pktgen_threads); init_completion(&t->start_done); - p = kthread_create_on_node(pktgen_thread_worker, - t, - cpu_to_node(cpu), - "kpktgend_%d", cpu); + p = kthread_create_on_cpu(pktgen_thread_worker, t, cpu, "kpktgend_%d"); if (IS_ERR(p)) { pr_err("kthread_create_on_node() failed for cpu %d\n", t->cpu); list_del(&t->th_list); kfree(t); return PTR_ERR(p); } - kthread_bind(p, cpu); + t->tsk = p; pe = proc_create_data(t->tsk->comm, 0600, pn->proc_dir, From 6bb6ab852c19442285c50874954da454ae60c6c3 Mon Sep 17 00:00:00 2001 From: Andrew Kreimer Date: Mon, 9 Dec 2024 14:47:30 +0200 Subject: [PATCH 0379/1450] net: hinic: Fix typo in dev_err message There is a typo in dev_err message: fliter -> filter. Fix it via codespell. Signed-off-by: Andrew Kreimer Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241209124804.9789-1-algonell@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/huawei/hinic/hinic_port.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/huawei/hinic/hinic_port.c b/drivers/net/ethernet/huawei/hinic/hinic_port.c index f81a43d2cdfcd..486fb0e20beff 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_port.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_port.c @@ -469,7 +469,7 @@ int hinic_set_vlan_fliter(struct hinic_dev *nic_dev, u32 en) err = HINIC_MGMT_CMD_UNSUPPORTED; } else if (err || !out_size || vlan_filter.status) { dev_err(&pdev->dev, - "Failed to set vlan fliter, err: %d, status: 0x%x, out size: 0x%x\n", + "Failed to set vlan filter, err: %d, status: 0x%x, out size: 0x%x\n", err, vlan_filter.status, out_size); err = -EINVAL; } From 4eb0308d78d3891d7d691f719e262cf908bdcb35 Mon Sep 17 00:00:00 2001 From: Dimitri Fedrau Date: Mon, 9 Dec 2024 18:50:42 +0100 Subject: [PATCH 0380/1450] net: phy: dp83822: Replace DP83822_DEVADDR with MDIO_MMD_VEND2 Instead of using DP83822_DEVADDR which is locally defined use MDIO_MMD_VEND2. Signed-off-by: Dimitri Fedrau Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241209-dp83822-mdio-mmd-vend2-v1-1-4473c7284b94@liebherr.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/dp83822.c | 58 +++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 30 deletions(-) diff --git a/drivers/net/phy/dp83822.c b/drivers/net/phy/dp83822.c index cf8b6d0bfaa98..25ee09c48027c 100644 --- a/drivers/net/phy/dp83822.c +++ b/drivers/net/phy/dp83822.c @@ -22,8 +22,6 @@ #define DP83826C_PHY_ID 0x2000a130 #define DP83826NC_PHY_ID 0x2000a110 -#define DP83822_DEVADDR 0x1f - #define MII_DP83822_CTRL_2 0x0a #define MII_DP83822_PHYSTS 0x10 #define MII_DP83822_PHYSCR 0x11 @@ -159,14 +157,14 @@ static int dp83822_config_wol(struct phy_device *phydev, /* MAC addresses start with byte 5, but stored in mac[0]. * 822 PHYs store bytes 4|5, 2|3, 0|1 */ - phy_write_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_DA1, + phy_write_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_WOL_DA1, (mac[1] << 8) | mac[0]); - phy_write_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_DA2, + phy_write_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_WOL_DA2, (mac[3] << 8) | mac[2]); - phy_write_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_DA3, + phy_write_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_WOL_DA3, (mac[5] << 8) | mac[4]); - value = phy_read_mmd(phydev, DP83822_DEVADDR, + value = phy_read_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_WOL_CFG); if (wol->wolopts & WAKE_MAGIC) value |= DP83822_WOL_MAGIC_EN; @@ -174,13 +172,13 @@ static int dp83822_config_wol(struct phy_device *phydev, value &= ~DP83822_WOL_MAGIC_EN; if (wol->wolopts & WAKE_MAGICSECURE) { - phy_write_mmd(phydev, DP83822_DEVADDR, + phy_write_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_RXSOP1, (wol->sopass[1] << 8) | wol->sopass[0]); - phy_write_mmd(phydev, DP83822_DEVADDR, + phy_write_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_RXSOP2, (wol->sopass[3] << 8) | wol->sopass[2]); - phy_write_mmd(phydev, DP83822_DEVADDR, + phy_write_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_RXSOP3, (wol->sopass[5] << 8) | wol->sopass[4]); value |= DP83822_WOL_SECURE_ON; @@ -194,10 +192,10 @@ static int dp83822_config_wol(struct phy_device *phydev, value |= DP83822_WOL_EN | DP83822_WOL_INDICATION_SEL | DP83822_WOL_CLR_INDICATION; - return phy_write_mmd(phydev, DP83822_DEVADDR, + return phy_write_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_WOL_CFG, value); } else { - return phy_clear_bits_mmd(phydev, DP83822_DEVADDR, + return phy_clear_bits_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_WOL_CFG, DP83822_WOL_EN | DP83822_WOL_MAGIC_EN | @@ -226,23 +224,23 @@ static void dp83822_get_wol(struct phy_device *phydev, wol->supported = (WAKE_MAGIC | WAKE_MAGICSECURE); wol->wolopts = 0; - value = phy_read_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_CFG); + value = phy_read_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_WOL_CFG); if (value & DP83822_WOL_MAGIC_EN) wol->wolopts |= WAKE_MAGIC; if (value & DP83822_WOL_SECURE_ON) { - sopass_val = phy_read_mmd(phydev, DP83822_DEVADDR, + sopass_val = phy_read_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_RXSOP1); wol->sopass[0] = (sopass_val & 0xff); wol->sopass[1] = (sopass_val >> 8); - sopass_val = phy_read_mmd(phydev, DP83822_DEVADDR, + sopass_val = phy_read_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_RXSOP2); wol->sopass[2] = (sopass_val & 0xff); wol->sopass[3] = (sopass_val >> 8); - sopass_val = phy_read_mmd(phydev, DP83822_DEVADDR, + sopass_val = phy_read_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_RXSOP3); wol->sopass[4] = (sopass_val & 0xff); wol->sopass[5] = (sopass_val >> 8); @@ -430,18 +428,18 @@ static int dp83822_config_init(struct phy_device *phydev) if (tx_int_delay <= 0) rgmii_delay |= DP83822_TX_CLK_SHIFT; - err = phy_modify_mmd(phydev, DP83822_DEVADDR, MII_DP83822_RCSR, + err = phy_modify_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_RCSR, DP83822_RX_CLK_SHIFT | DP83822_TX_CLK_SHIFT, rgmii_delay); if (err) return err; - err = phy_set_bits_mmd(phydev, DP83822_DEVADDR, + err = phy_set_bits_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_RCSR, DP83822_RGMII_MODE_EN); if (err) return err; } else { - err = phy_clear_bits_mmd(phydev, DP83822_DEVADDR, + err = phy_clear_bits_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_RCSR, DP83822_RGMII_MODE_EN); if (err) @@ -496,7 +494,7 @@ static int dp83822_config_init(struct phy_device *phydev) return err; if (dp83822->fx_signal_det_low) { - err = phy_set_bits_mmd(phydev, DP83822_DEVADDR, + err = phy_set_bits_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_GENCFG, DP83822_SIG_DET_LOW); if (err) @@ -514,10 +512,10 @@ static int dp8382x_config_rmii_mode(struct phy_device *phydev) if (!device_property_read_string(dev, "ti,rmii-mode", &of_val)) { if (strcmp(of_val, "master") == 0) { - ret = phy_clear_bits_mmd(phydev, DP83822_DEVADDR, MII_DP83822_RCSR, + ret = phy_clear_bits_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_RCSR, DP83822_RMII_MODE_SEL); } else if (strcmp(of_val, "slave") == 0) { - ret = phy_set_bits_mmd(phydev, DP83822_DEVADDR, MII_DP83822_RCSR, + ret = phy_set_bits_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_RCSR, DP83822_RMII_MODE_SEL); } else { phydev_err(phydev, "Invalid value for ti,rmii-mode property (%s)\n", @@ -539,7 +537,7 @@ static int dp83826_config_init(struct phy_device *phydev) int ret; if (phydev->interface == PHY_INTERFACE_MODE_RMII) { - ret = phy_set_bits_mmd(phydev, DP83822_DEVADDR, MII_DP83822_RCSR, + ret = phy_set_bits_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_RCSR, DP83822_RMII_MODE_EN); if (ret) return ret; @@ -548,7 +546,7 @@ static int dp83826_config_init(struct phy_device *phydev) if (ret) return ret; } else { - ret = phy_clear_bits_mmd(phydev, DP83822_DEVADDR, MII_DP83822_RCSR, + ret = phy_clear_bits_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_RCSR, DP83822_RMII_MODE_EN); if (ret) return ret; @@ -560,7 +558,7 @@ static int dp83826_config_init(struct phy_device *phydev) FIELD_GET(DP83826_CFG_DAC_MINUS_MDIX_5_TO_4, dp83822->cfg_dac_minus)); mask = DP83826_VOD_CFG1_MINUS_MDIX_MASK | DP83826_VOD_CFG1_MINUS_MDI_MASK; - ret = phy_modify_mmd(phydev, DP83822_DEVADDR, MII_DP83826_VOD_CFG1, mask, val); + ret = phy_modify_mmd(phydev, MDIO_MMD_VEND2, MII_DP83826_VOD_CFG1, mask, val); if (ret) return ret; @@ -568,7 +566,7 @@ static int dp83826_config_init(struct phy_device *phydev) FIELD_GET(DP83826_CFG_DAC_MINUS_MDIX_3_TO_0, dp83822->cfg_dac_minus)); mask = DP83826_VOD_CFG2_MINUS_MDIX_MASK; - ret = phy_modify_mmd(phydev, DP83822_DEVADDR, MII_DP83826_VOD_CFG2, mask, val); + ret = phy_modify_mmd(phydev, MDIO_MMD_VEND2, MII_DP83826_VOD_CFG2, mask, val); if (ret) return ret; } @@ -577,7 +575,7 @@ static int dp83826_config_init(struct phy_device *phydev) val = FIELD_PREP(DP83826_VOD_CFG2_PLUS_MDIX_MASK, dp83822->cfg_dac_plus) | FIELD_PREP(DP83826_VOD_CFG2_PLUS_MDI_MASK, dp83822->cfg_dac_plus); mask = DP83826_VOD_CFG2_PLUS_MDIX_MASK | DP83826_VOD_CFG2_PLUS_MDI_MASK; - ret = phy_modify_mmd(phydev, DP83822_DEVADDR, MII_DP83826_VOD_CFG2, mask, val); + ret = phy_modify_mmd(phydev, MDIO_MMD_VEND2, MII_DP83826_VOD_CFG2, mask, val); if (ret) return ret; } @@ -673,7 +671,7 @@ static int dp83822_read_straps(struct phy_device *phydev) int fx_enabled, fx_sd_enable; int val; - val = phy_read_mmd(phydev, DP83822_DEVADDR, MII_DP83822_SOR1); + val = phy_read_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_SOR1); if (val < 0) return val; @@ -748,7 +746,7 @@ static int dp83822_suspend(struct phy_device *phydev) { int value; - value = phy_read_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_CFG); + value = phy_read_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_WOL_CFG); if (!(value & DP83822_WOL_EN)) genphy_suspend(phydev); @@ -762,9 +760,9 @@ static int dp83822_resume(struct phy_device *phydev) genphy_resume(phydev); - value = phy_read_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_CFG); + value = phy_read_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_WOL_CFG); - phy_write_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_CFG, value | + phy_write_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_WOL_CFG, value | DP83822_WOL_CLR_INDICATION); return 0; From be325f08c432ae5ac6d6594d163e1899cdf202df Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 9 Dec 2024 10:07:45 +0000 Subject: [PATCH 0381/1450] rtnetlink: add ndo_fdb_dump_context rtnl_fdb_dump() and various ndo_fdb_dump() helpers share a hidden layout of cb->ctx. Before switching rtnl_fdb_dump() to for_each_netdev_dump() in the following patch, make this more explicit. Signed-off-by: Eric Dumazet Reviewed-by: Ido Schimmel Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241209100747.2269613-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- .../ethernet/freescale/dpaa2/dpaa2-switch.c | 3 ++- drivers/net/ethernet/mscc/ocelot_net.c | 3 ++- drivers/net/vxlan/vxlan_core.c | 5 ++-- include/linux/rtnetlink.h | 7 +++++ net/bridge/br_fdb.c | 3 ++- net/core/rtnetlink.c | 26 ++++++++++--------- net/dsa/user.c | 3 ++- 7 files changed, 32 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c index a293b08f36d46..147a93bf9fa91 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c @@ -780,13 +780,14 @@ struct ethsw_dump_ctx { static int dpaa2_switch_fdb_dump_nl(struct fdb_dump_entry *entry, struct ethsw_dump_ctx *dump) { + struct ndo_fdb_dump_context *ctx = (void *)dump->cb->ctx; int is_dynamic = entry->type & DPSW_FDB_ENTRY_DINAMIC; u32 portid = NETLINK_CB(dump->cb->skb).portid; u32 seq = dump->cb->nlh->nlmsg_seq; struct nlmsghdr *nlh; struct ndmsg *ndm; - if (dump->idx < dump->cb->args[2]) + if (dump->idx < ctx->fdb_idx) goto skip; nlh = nlmsg_put(dump->skb, portid, seq, RTM_NEWNEIGH, diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index 558e03301aa8e..8d48468cddd7c 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -758,12 +758,13 @@ static int ocelot_port_fdb_do_dump(const unsigned char *addr, u16 vid, bool is_static, void *data) { struct ocelot_dump_ctx *dump = data; + struct ndo_fdb_dump_context *ctx = (void *)dump->cb->ctx; u32 portid = NETLINK_CB(dump->cb->skb).portid; u32 seq = dump->cb->nlh->nlmsg_seq; struct nlmsghdr *nlh; struct ndmsg *ndm; - if (dump->idx < dump->cb->args[2]) + if (dump->idx < ctx->fdb_idx) goto skip; nlh = nlmsg_put(dump->skb, portid, seq, RTM_NEWNEIGH, diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 43cf672b7b9fc..0c356e0a61ef0 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -1352,6 +1352,7 @@ static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int *idx) { + struct ndo_fdb_dump_context *ctx = (void *)cb->ctx; struct vxlan_dev *vxlan = netdev_priv(dev); unsigned int h; int err = 0; @@ -1364,7 +1365,7 @@ static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct vxlan_rdst *rd; if (rcu_access_pointer(f->nh)) { - if (*idx < cb->args[2]) + if (*idx < ctx->fdb_idx) goto skip_nh; err = vxlan_fdb_info(skb, vxlan, f, NETLINK_CB(cb->skb).portid, @@ -1381,7 +1382,7 @@ static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, } list_for_each_entry_rcu(rd, &f->remotes, list) { - if (*idx < cb->args[2]) + if (*idx < ctx->fdb_idx) goto skip; err = vxlan_fdb_info(skb, vxlan, f, diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 811ce44113f6f..c43cffb014a7c 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -178,6 +178,13 @@ void rtnetlink_init(void); void __rtnl_unlock(void); void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail); +/* Shared by rtnl_fdb_dump() and various ndo_fdb_dump() helpers. */ +struct ndo_fdb_dump_context { + unsigned long s_h; + unsigned long s_idx; + unsigned long fdb_idx; +}; + extern int ndo_dflt_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 82bac2426631b..902694c0ce643 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -955,6 +955,7 @@ int br_fdb_dump(struct sk_buff *skb, struct net_device *filter_dev, int *idx) { + struct ndo_fdb_dump_context *ctx = (void *)cb->ctx; struct net_bridge *br = netdev_priv(dev); struct net_bridge_fdb_entry *f; int err = 0; @@ -970,7 +971,7 @@ int br_fdb_dump(struct sk_buff *skb, rcu_read_lock(); hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { - if (*idx < cb->args[2]) + if (*idx < ctx->fdb_idx) goto skip; if (filter_dev && (!f->dst || f->dst->dev != filter_dev)) { if (filter_dev != dev) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index ab5f201bf0ab4..453cc8bf18fbe 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4762,15 +4762,16 @@ static int nlmsg_populate_fdb(struct sk_buff *skb, int *idx, struct netdev_hw_addr_list *list) { + struct ndo_fdb_dump_context *ctx = (void *)cb->ctx; struct netdev_hw_addr *ha; - int err; u32 portid, seq; + int err; portid = NETLINK_CB(cb->skb).portid; seq = cb->nlh->nlmsg_seq; list_for_each_entry(ha, &list->list, list) { - if (*idx < cb->args[2]) + if (*idx < ctx->fdb_idx) goto skip; err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, 0, @@ -4909,10 +4910,9 @@ static int valid_fdb_dump_legacy(const struct nlmsghdr *nlh, static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) { - struct net_device *dev; - struct net_device *br_dev = NULL; - const struct net_device_ops *ops = NULL; - const struct net_device_ops *cops = NULL; + const struct net_device_ops *ops = NULL, *cops = NULL; + struct ndo_fdb_dump_context *ctx = (void *)cb->ctx; + struct net_device *dev, *br_dev = NULL; struct net *net = sock_net(skb->sk); struct hlist_head *head; int brport_idx = 0; @@ -4922,6 +4922,8 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) int err = 0; int fidx = 0; + NL_ASSERT_CTX_FITS(struct ndo_fdb_dump_context); + if (cb->strict_check) err = valid_fdb_dump_strict(cb->nlh, &br_idx, &brport_idx, cb->extack); @@ -4939,8 +4941,8 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) ops = br_dev->netdev_ops; } - s_h = cb->args[0]; - s_idx = cb->args[1]; + s_h = ctx->s_h; + s_idx = ctx->s_idx; for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; @@ -4992,7 +4994,7 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) cops = NULL; /* reset fdb offset to 0 for rest of the interfaces */ - cb->args[2] = 0; + ctx->fdb_idx = 0; fidx = 0; cont: idx++; @@ -5000,9 +5002,9 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) } out: - cb->args[0] = h; - cb->args[1] = idx; - cb->args[2] = fidx; + ctx->s_h = h; + ctx->s_idx = idx; + ctx->fdb_idx = fidx; return skb->len; } diff --git a/net/dsa/user.c b/net/dsa/user.c index 06c30a9e29ff8..c736c019e2af9 100644 --- a/net/dsa/user.c +++ b/net/dsa/user.c @@ -515,12 +515,13 @@ dsa_user_port_fdb_do_dump(const unsigned char *addr, u16 vid, bool is_static, void *data) { struct dsa_user_dump_ctx *dump = data; + struct ndo_fdb_dump_context *ctx = (void *)dump->cb->ctx; u32 portid = NETLINK_CB(dump->cb->skb).portid; u32 seq = dump->cb->nlh->nlmsg_seq; struct nlmsghdr *nlh; struct ndmsg *ndm; - if (dump->idx < dump->cb->args[2]) + if (dump->idx < ctx->fdb_idx) goto skip; nlh = nlmsg_put(dump->skb, portid, seq, RTM_NEWNEIGH, From 53970a05f799087e2dd2005973609188504e7fcc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 9 Dec 2024 10:07:46 +0000 Subject: [PATCH 0382/1450] rtnetlink: switch rtnl_fdb_dump() to for_each_netdev_dump() This is the last netdev iterator still using net->dev_index_head[]. Convert to modern for_each_netdev_dump() for better scalability, and use common patterns in our stack. Following patch in this series removes the pad field in struct ndo_fdb_dump_context. Signed-off-by: Eric Dumazet Reviewed-by: Ido Schimmel Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241209100747.2269613-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/rtnetlink.h | 4 +- net/core/rtnetlink.c | 92 +++++++++++++++------------------------ 2 files changed, 37 insertions(+), 59 deletions(-) diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index c43cffb014a7c..5546571c25537 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -180,8 +180,8 @@ void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail); /* Shared by rtnl_fdb_dump() and various ndo_fdb_dump() helpers. */ struct ndo_fdb_dump_context { - unsigned long s_h; - unsigned long s_idx; + unsigned long ifindex; + unsigned long pad; unsigned long fdb_idx; }; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 453cc8bf18fbe..8fe252c298a2d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4914,13 +4914,10 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) struct ndo_fdb_dump_context *ctx = (void *)cb->ctx; struct net_device *dev, *br_dev = NULL; struct net *net = sock_net(skb->sk); - struct hlist_head *head; int brport_idx = 0; int br_idx = 0; - int h, s_h; - int idx = 0, s_idx; - int err = 0; int fidx = 0; + int err; NL_ASSERT_CTX_FITS(struct ndo_fdb_dump_context); @@ -4941,69 +4938,50 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) ops = br_dev->netdev_ops; } - s_h = ctx->s_h; - s_idx = ctx->s_idx; - - for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - idx = 0; - head = &net->dev_index_head[h]; - hlist_for_each_entry(dev, head, index_hlist) { - - if (brport_idx && (dev->ifindex != brport_idx)) - continue; - - if (!br_idx) { /* user did not specify a specific bridge */ - if (netif_is_bridge_port(dev)) { - br_dev = netdev_master_upper_dev_get(dev); - cops = br_dev->netdev_ops; - } - } else { - if (dev != br_dev && - !netif_is_bridge_port(dev)) - continue; + for_each_netdev_dump(net, dev, ctx->ifindex) { + if (brport_idx && (dev->ifindex != brport_idx)) + continue; - if (br_dev != netdev_master_upper_dev_get(dev) && - !netif_is_bridge_master(dev)) - continue; - cops = ops; + if (!br_idx) { /* user did not specify a specific bridge */ + if (netif_is_bridge_port(dev)) { + br_dev = netdev_master_upper_dev_get(dev); + cops = br_dev->netdev_ops; } + } else { + if (dev != br_dev && + !netif_is_bridge_port(dev)) + continue; - if (idx < s_idx) - goto cont; + if (br_dev != netdev_master_upper_dev_get(dev) && + !netif_is_bridge_master(dev)) + continue; + cops = ops; + } - if (netif_is_bridge_port(dev)) { - if (cops && cops->ndo_fdb_dump) { - err = cops->ndo_fdb_dump(skb, cb, - br_dev, dev, - &fidx); - if (err == -EMSGSIZE) - goto out; - } + if (netif_is_bridge_port(dev)) { + if (cops && cops->ndo_fdb_dump) { + err = cops->ndo_fdb_dump(skb, cb, br_dev, dev, + &fidx); + if (err == -EMSGSIZE) + break; } + } - if (dev->netdev_ops->ndo_fdb_dump) - err = dev->netdev_ops->ndo_fdb_dump(skb, cb, - dev, NULL, - &fidx); - else - err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, - &fidx); - if (err == -EMSGSIZE) - goto out; + if (dev->netdev_ops->ndo_fdb_dump) + err = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL, + &fidx); + else + err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, &fidx); + if (err == -EMSGSIZE) + break; - cops = NULL; + cops = NULL; - /* reset fdb offset to 0 for rest of the interfaces */ - ctx->fdb_idx = 0; - fidx = 0; -cont: - idx++; - } + /* reset fdb offset to 0 for rest of the interfaces */ + ctx->fdb_idx = 0; + fidx = 0; } -out: - ctx->s_h = h; - ctx->s_idx = idx; ctx->fdb_idx = fidx; return skb->len; From 53a6d8912372fc23ea82cc7a49eb59047aa0a650 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 9 Dec 2024 10:07:47 +0000 Subject: [PATCH 0383/1450] rtnetlink: remove pad field in ndo_fdb_dump_context I chose to remove this field in a separate patch to ease potential bisection, in case one ndo_fdb_dump() is still using the old way (cb->args[2] instead of ctx->fdb_idx) Signed-off-by: Eric Dumazet Reviewed-by: Ido Schimmel Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241209100747.2269613-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/rtnetlink.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 5546571c25537..3b9d132cbc9ef 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -181,7 +181,6 @@ void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail); /* Shared by rtnl_fdb_dump() and various ndo_fdb_dump() helpers. */ struct ndo_fdb_dump_context { unsigned long ifindex; - unsigned long pad; unsigned long fdb_idx; }; From ce864c76ccd69470205f5cb22181bffe23563730 Mon Sep 17 00:00:00 2001 From: Philipp Stanner Date: Fri, 6 Dec 2024 20:57:13 +0100 Subject: [PATCH 0384/1450] net: wwan: t7xx: Replace deprecated PCI functions pcim_iomap_regions() and pcim_iomap_table() have been deprecated by the PCI subsystem. Replace them with pcim_iomap_region(). Additionally, pass the actual driver name to that function to improve debug output. Signed-off-by: Philipp Stanner Reviewed-by: Simon Horman Reviewed-by: Sergey Ryazanov Link: https://patch.msgid.link/20241206195712.182282-2-pstanner@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/wwan/t7xx/t7xx_pci.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/drivers/net/wwan/t7xx/t7xx_pci.c b/drivers/net/wwan/t7xx/t7xx_pci.c index 8381b0dc7acb2..02f2ec7cf4cee 100644 --- a/drivers/net/wwan/t7xx/t7xx_pci.c +++ b/drivers/net/wwan/t7xx/t7xx_pci.c @@ -43,6 +43,8 @@ #include "t7xx_state_monitor.h" #include "t7xx_port_proxy.h" +#define DRIVER_NAME "mtk_t7xx" + #define T7XX_PCI_IREG_BASE 0 #define T7XX_PCI_EREG_BASE 2 @@ -833,6 +835,7 @@ static void t7xx_pci_infracfg_ao_calc(struct t7xx_pci_dev *t7xx_dev) static int t7xx_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct t7xx_pci_dev *t7xx_dev; + void __iomem *iomem; int ret; t7xx_dev = devm_kzalloc(&pdev->dev, sizeof(*t7xx_dev), GFP_KERNEL); @@ -848,12 +851,21 @@ static int t7xx_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) pci_set_master(pdev); - ret = pcim_iomap_regions(pdev, BIT(T7XX_PCI_IREG_BASE) | BIT(T7XX_PCI_EREG_BASE), - pci_name(pdev)); + iomem = pcim_iomap_region(pdev, T7XX_PCI_IREG_BASE, DRIVER_NAME); + ret = PTR_ERR_OR_ZERO(iomem); + if (ret) { + dev_err(&pdev->dev, "Could not request IREG BAR: %d\n", ret); + return -ENOMEM; + } + IREG_BASE(t7xx_dev) = iomem; + + iomem = pcim_iomap_region(pdev, T7XX_PCI_EREG_BASE, DRIVER_NAME); + ret = PTR_ERR_OR_ZERO(iomem); if (ret) { - dev_err(&pdev->dev, "Could not request BARs: %d\n", ret); + dev_err(&pdev->dev, "Could not request EREG BAR: %d\n", ret); return -ENOMEM; } + t7xx_dev->base_addr.pcie_ext_reg_base = iomem; ret = dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)); if (ret) { @@ -867,9 +879,6 @@ static int t7xx_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) return ret; } - IREG_BASE(t7xx_dev) = pcim_iomap_table(pdev)[T7XX_PCI_IREG_BASE]; - t7xx_dev->base_addr.pcie_ext_reg_base = pcim_iomap_table(pdev)[T7XX_PCI_EREG_BASE]; - ret = t7xx_pci_pm_init(t7xx_dev); if (ret) return ret; @@ -937,7 +946,7 @@ static const struct pci_device_id t7xx_pci_table[] = { MODULE_DEVICE_TABLE(pci, t7xx_pci_table); static struct pci_driver t7xx_pci_driver = { - .name = "mtk_t7xx", + .name = DRIVER_NAME, .id_table = t7xx_pci_table, .probe = t7xx_pci_probe, .remove = t7xx_pci_remove, From d354d008255ffdde6f3d4549dd6a06a14ee76619 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 9 Dec 2024 14:07:41 +0100 Subject: [PATCH 0385/1450] net: usb: lan78xx: Add error handling to lan78xx_setup_irq_domain Update `lan78xx_setup_irq_domain` to handle errors in `lan78xx_read_reg`. Return the error code immediately if the read operation fails, ensuring proper error propagation during IRQ domain setup. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241209130751.703182-2-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/usb/lan78xx.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index d5f6367d37148..070b21baffaf6 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -2445,7 +2445,10 @@ static int lan78xx_setup_irq_domain(struct lan78xx_net *dev) mutex_init(&dev->domain_data.irq_lock); - lan78xx_read_reg(dev, INT_EP_CTL, &buf); + ret = lan78xx_read_reg(dev, INT_EP_CTL, &buf); + if (ret < 0) + return ret; + dev->domain_data.irqenable = buf; dev->domain_data.irqchip = &lan78xx_irqchip; From 6f31135894ec96481e2bda93a1da70712f5e57c1 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 9 Dec 2024 14:07:42 +0100 Subject: [PATCH 0386/1450] net: usb: lan78xx: Add error handling to lan78xx_init_mac_address Convert `lan78xx_init_mac_address` to return error codes and handle failures in register read and write operations. Update `lan78xx_reset` to check for errors during MAC address initialization and propagate them appropriately. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241209130751.703182-3-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/usb/lan78xx.c | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 070b21baffaf6..26dc43bac84b5 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -2153,13 +2153,19 @@ static const struct ethtool_ops lan78xx_ethtool_ops = { .get_regs = lan78xx_get_regs, }; -static void lan78xx_init_mac_address(struct lan78xx_net *dev) +static int lan78xx_init_mac_address(struct lan78xx_net *dev) { u32 addr_lo, addr_hi; u8 addr[6]; + int ret; + + ret = lan78xx_read_reg(dev, RX_ADDRL, &addr_lo); + if (ret < 0) + return ret; - lan78xx_read_reg(dev, RX_ADDRL, &addr_lo); - lan78xx_read_reg(dev, RX_ADDRH, &addr_hi); + ret = lan78xx_read_reg(dev, RX_ADDRH, &addr_hi); + if (ret < 0) + return ret; addr[0] = addr_lo & 0xFF; addr[1] = (addr_lo >> 8) & 0xFF; @@ -2192,14 +2198,26 @@ static void lan78xx_init_mac_address(struct lan78xx_net *dev) (addr[2] << 16) | (addr[3] << 24); addr_hi = addr[4] | (addr[5] << 8); - lan78xx_write_reg(dev, RX_ADDRL, addr_lo); - lan78xx_write_reg(dev, RX_ADDRH, addr_hi); + ret = lan78xx_write_reg(dev, RX_ADDRL, addr_lo); + if (ret < 0) + return ret; + + ret = lan78xx_write_reg(dev, RX_ADDRH, addr_hi); + if (ret < 0) + return ret; } - lan78xx_write_reg(dev, MAF_LO(0), addr_lo); - lan78xx_write_reg(dev, MAF_HI(0), addr_hi | MAF_HI_VALID_); + ret = lan78xx_write_reg(dev, MAF_LO(0), addr_lo); + if (ret < 0) + return ret; + + ret = lan78xx_write_reg(dev, MAF_HI(0), addr_hi | MAF_HI_VALID_); + if (ret < 0) + return ret; eth_hw_addr_set(dev->net, addr); + + return 0; } /* MDIO read and write wrappers for phylib */ @@ -2990,7 +3008,9 @@ static int lan78xx_reset(struct lan78xx_net *dev) } } while (buf & HW_CFG_LRST_); - lan78xx_init_mac_address(dev); + ret = lan78xx_init_mac_address(dev); + if (ret < 0) + return ret; /* save DEVID for later usage */ ret = lan78xx_read_reg(dev, ID_REV, &buf); From 9a46956c72cbc5c1725b3ee7de5586e6e8b1b0b7 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 9 Dec 2024 14:07:43 +0100 Subject: [PATCH 0387/1450] net: usb: lan78xx: Add error handling to lan78xx_set_mac_addr Update `lan78xx_set_mac_addr` to handle errors during MAC address register write operations. Ensure that errors are properly propagated to the caller, improving the robustness of MAC address updates. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241209130751.703182-4-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/usb/lan78xx.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 26dc43bac84b5..5d318ff8b33de 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -2763,6 +2763,7 @@ static int lan78xx_set_mac_addr(struct net_device *netdev, void *p) struct lan78xx_net *dev = netdev_priv(netdev); struct sockaddr *addr = p; u32 addr_lo, addr_hi; + int ret; if (netif_running(netdev)) return -EBUSY; @@ -2779,14 +2780,20 @@ static int lan78xx_set_mac_addr(struct net_device *netdev, void *p) addr_hi = netdev->dev_addr[4] | netdev->dev_addr[5] << 8; - lan78xx_write_reg(dev, RX_ADDRL, addr_lo); - lan78xx_write_reg(dev, RX_ADDRH, addr_hi); + ret = lan78xx_write_reg(dev, RX_ADDRL, addr_lo); + if (ret < 0) + return ret; + + ret = lan78xx_write_reg(dev, RX_ADDRH, addr_hi); + if (ret < 0) + return ret; /* Added to support MAC address changes */ - lan78xx_write_reg(dev, MAF_LO(0), addr_lo); - lan78xx_write_reg(dev, MAF_HI(0), addr_hi | MAF_HI_VALID_); + ret = lan78xx_write_reg(dev, MAF_LO(0), addr_lo); + if (ret < 0) + return ret; - return 0; + return lan78xx_write_reg(dev, MAF_HI(0), addr_hi | MAF_HI_VALID_); } /* Enable or disable Rx checksum offload engine */ From 41b774e4f3279a3b3149a36fe27557ecdc72c29c Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 9 Dec 2024 14:07:45 +0100 Subject: [PATCH 0388/1450] net: usb: lan78xx: Simplify lan78xx_update_reg Simplify `lan78xx_update_reg` by directly returning the result of `lan78xx_write_reg`. This eliminates unnecessary checks and improves code readability. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241209130751.703182-6-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/usb/lan78xx.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 5d318ff8b33de..442b6ee2dd461 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -674,11 +674,7 @@ static int lan78xx_update_reg(struct lan78xx_net *dev, u32 reg, u32 mask, buf &= ~mask; buf |= (mask & data); - ret = lan78xx_write_reg(dev, reg, buf); - if (ret < 0) - return ret; - - return 0; + return lan78xx_write_reg(dev, reg, buf); } static int lan78xx_read_stats(struct lan78xx_net *dev, From bf361b18d91e96dee50c5794097a80ff3594725c Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 9 Dec 2024 14:07:46 +0100 Subject: [PATCH 0389/1450] net: usb: lan78xx: Fix return value handling in lan78xx_set_features Update `lan78xx_set_features` to correctly return the result of `lan78xx_write_reg`. This ensures that errors during register writes are propagated to the caller. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241209130751.703182-7-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/usb/lan78xx.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 442b6ee2dd461..71f8176210c93 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -2822,9 +2822,7 @@ static int lan78xx_set_features(struct net_device *netdev, spin_unlock_irqrestore(&pdata->rfe_ctl_lock, flags); - lan78xx_write_reg(dev, RFE_CTL, pdata->rfe_ctl); - - return 0; + return lan78xx_write_reg(dev, RFE_CTL, pdata->rfe_ctl); } static void lan78xx_deferred_vlan_write(struct work_struct *param) From 21fff45a6cc1b5fceeedc5f6e2ccb118d4c19063 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 9 Dec 2024 14:07:49 +0100 Subject: [PATCH 0390/1450] net: usb: lan78xx: Improve error handling in lan78xx_phy_wait_not_busy Update `lan78xx_phy_wait_not_busy` to forward errors from `lan78xx_read_reg` instead of overwriting them with `-EIO`. Replace `-EIO` with `-ETIMEDOUT` for timeout cases, providing more specific and appropriate error codes. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241209130751.703182-10-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/usb/lan78xx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 71f8176210c93..7e7407748426d 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -963,14 +963,14 @@ static int lan78xx_phy_wait_not_busy(struct lan78xx_net *dev) do { ret = lan78xx_read_reg(dev, MII_ACC, &val); - if (unlikely(ret < 0)) - return -EIO; + if (ret < 0) + return ret; if (!(val & MII_ACC_MII_BUSY_)) return 0; } while (!time_after(jiffies, start_time + HZ)); - return -EIO; + return -ETIMEDOUT; } static inline u32 mii_access(int id, int index, int read) From 530f17e6cb3bac67723dd90b289a381cc04a52e8 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 9 Dec 2024 14:07:50 +0100 Subject: [PATCH 0391/1450] net: usb: lan78xx: Rename lan78xx_phy_wait_not_busy to lan78xx_mdiobus_wait_not_busy Rename `lan78xx_phy_wait_not_busy` to `lan78xx_mdiobus_wait_not_busy` for clarity and accuracy, as the function operates on the MII bus rather than a specific PHY. Update all references to reflect the new name. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241209130751.703182-11-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/usb/lan78xx.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 7e7407748426d..4661d131b1904 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -955,7 +955,7 @@ static int lan78xx_flush_rx_fifo(struct lan78xx_net *dev) } /* Loop until the read is completed with timeout called with phy_mutex held */ -static int lan78xx_phy_wait_not_busy(struct lan78xx_net *dev) +static int lan78xx_mdiobus_wait_not_busy(struct lan78xx_net *dev) { unsigned long start_time = jiffies; u32 val; @@ -1604,7 +1604,7 @@ static int lan78xx_mac_reset(struct lan78xx_net *dev) * bus can result in the MAC interface locking up and not * completing register access transactions. */ - ret = lan78xx_phy_wait_not_busy(dev); + ret = lan78xx_mdiobus_wait_not_busy(dev); if (ret < 0) goto done; @@ -2230,7 +2230,7 @@ static int lan78xx_mdiobus_read(struct mii_bus *bus, int phy_id, int idx) mutex_lock(&dev->phy_mutex); /* confirm MII not busy */ - ret = lan78xx_phy_wait_not_busy(dev); + ret = lan78xx_mdiobus_wait_not_busy(dev); if (ret < 0) goto done; @@ -2240,7 +2240,7 @@ static int lan78xx_mdiobus_read(struct mii_bus *bus, int phy_id, int idx) if (ret < 0) goto done; - ret = lan78xx_phy_wait_not_busy(dev); + ret = lan78xx_mdiobus_wait_not_busy(dev); if (ret < 0) goto done; @@ -2271,7 +2271,7 @@ static int lan78xx_mdiobus_write(struct mii_bus *bus, int phy_id, int idx, mutex_lock(&dev->phy_mutex); /* confirm MII not busy */ - ret = lan78xx_phy_wait_not_busy(dev); + ret = lan78xx_mdiobus_wait_not_busy(dev); if (ret < 0) goto done; @@ -2286,7 +2286,7 @@ static int lan78xx_mdiobus_write(struct mii_bus *bus, int phy_id, int idx, if (ret < 0) goto done; - ret = lan78xx_phy_wait_not_busy(dev); + ret = lan78xx_mdiobus_wait_not_busy(dev); if (ret < 0) goto done; From b8f614207b0d5e4abd6df8d5cb3cc11f009d1d93 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 9 Dec 2024 09:29:24 -0600 Subject: [PATCH 0392/1450] scx: Fix maximal BPF selftest prog maximal.bpf.c is still dispatching to and consuming from SCX_DSQ_GLOBAL. Let's have it use its own DSQ to avoid any runtime errors. Signed-off-by: David Vernet Tested-by: Andrea Righi Signed-off-by: Tejun Heo --- tools/testing/selftests/sched_ext/maximal.bpf.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/sched_ext/maximal.bpf.c b/tools/testing/selftests/sched_ext/maximal.bpf.c index 4c005fa718103..430f5e13bf554 100644 --- a/tools/testing/selftests/sched_ext/maximal.bpf.c +++ b/tools/testing/selftests/sched_ext/maximal.bpf.c @@ -12,6 +12,8 @@ char _license[] SEC("license") = "GPL"; +#define DSQ_ID 0 + s32 BPF_STRUCT_OPS(maximal_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) { @@ -20,7 +22,7 @@ s32 BPF_STRUCT_OPS(maximal_select_cpu, struct task_struct *p, s32 prev_cpu, void BPF_STRUCT_OPS(maximal_enqueue, struct task_struct *p, u64 enq_flags) { - scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + scx_bpf_dsq_insert(p, DSQ_ID, SCX_SLICE_DFL, enq_flags); } void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags) @@ -28,7 +30,7 @@ void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags) void BPF_STRUCT_OPS(maximal_dispatch, s32 cpu, struct task_struct *prev) { - scx_bpf_dsq_move_to_local(SCX_DSQ_GLOBAL); + scx_bpf_dsq_move_to_local(DSQ_ID); } void BPF_STRUCT_OPS(maximal_runnable, struct task_struct *p, u64 enq_flags) @@ -123,7 +125,7 @@ void BPF_STRUCT_OPS(maximal_cgroup_set_weight, struct cgroup *cgrp, u32 weight) s32 BPF_STRUCT_OPS_SLEEPABLE(maximal_init) { - return 0; + return scx_bpf_create_dsq(DSQ_ID, -1); } void BPF_STRUCT_OPS(maximal_exit, struct scx_exit_info *info) From 2d2f25405a87cfa270ea7b5bb03a612c1a16020a Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Mon, 9 Dec 2024 18:45:01 +0100 Subject: [PATCH 0393/1450] MAINTAINERS: add self as reviewer for sched_ext Add myself as a reviewer for sched_ext, as I am actively working on this project and would like to help review relevant patches and address any related kernel issues. Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 1e930c7a58b13..41bae8792a77b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20902,6 +20902,7 @@ F: kernel/sched/ SCHEDULER - SCHED_EXT R: Tejun Heo R: David Vernet +R: Andrea Righi L: linux-kernel@vger.kernel.org S: Maintained W: https://github.com/sched-ext/scx From e34f1717ef0632fcec5cb827e5e0e9f223d70c9b Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 9 Dec 2024 10:25:51 -0600 Subject: [PATCH 0394/1450] thunderbolt: Don't display nvm_version unless upgrade supported The read will never succeed if NVM wasn't initialized due to an unknown format. Add a new callback for visibility to only show when supported. Cc: stable@vger.kernel.org Fixes: aef9c693e7e5 ("thunderbolt: Move vendor specific NVM handling into nvm.c") Reported-by: Richard Hughes Closes: https://github.com/fwupd/fwupd/issues/8200 Signed-off-by: Mario Limonciello Signed-off-by: Mika Westerberg --- drivers/thunderbolt/retimer.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/thunderbolt/retimer.c b/drivers/thunderbolt/retimer.c index 89d2919d0193e..eeb64433ebbca 100644 --- a/drivers/thunderbolt/retimer.c +++ b/drivers/thunderbolt/retimer.c @@ -103,6 +103,7 @@ static int tb_retimer_nvm_add(struct tb_retimer *rt) err_nvm: dev_dbg(&rt->dev, "NVM upgrade disabled\n"); + rt->no_nvm_upgrade = true; if (!IS_ERR(nvm)) tb_nvm_free(nvm); @@ -182,8 +183,6 @@ static ssize_t nvm_authenticate_show(struct device *dev, if (!rt->nvm) ret = -EAGAIN; - else if (rt->no_nvm_upgrade) - ret = -EOPNOTSUPP; else ret = sysfs_emit(buf, "%#x\n", rt->auth_status); @@ -323,8 +322,6 @@ static ssize_t nvm_version_show(struct device *dev, if (!rt->nvm) ret = -EAGAIN; - else if (rt->no_nvm_upgrade) - ret = -EOPNOTSUPP; else ret = sysfs_emit(buf, "%x.%x\n", rt->nvm->major, rt->nvm->minor); @@ -342,6 +339,19 @@ static ssize_t vendor_show(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR_RO(vendor); +static umode_t retimer_is_visible(struct kobject *kobj, struct attribute *attr, + int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct tb_retimer *rt = tb_to_retimer(dev); + + if (attr == &dev_attr_nvm_authenticate.attr || + attr == &dev_attr_nvm_version.attr) + return rt->no_nvm_upgrade ? 0 : attr->mode; + + return attr->mode; +} + static struct attribute *retimer_attrs[] = { &dev_attr_device.attr, &dev_attr_nvm_authenticate.attr, @@ -351,6 +361,7 @@ static struct attribute *retimer_attrs[] = { }; static const struct attribute_group retimer_group = { + .is_visible = retimer_is_visible, .attrs = retimer_attrs, }; From fdad4fb7c506bea8b419f70ff2163d99962e8ede Mon Sep 17 00:00:00 2001 From: Daniel Swanemar Date: Mon, 4 Nov 2024 14:42:17 +0100 Subject: [PATCH 0395/1450] USB: serial: option: add TCL IK512 MBIM & ECM Add the following TCL IK512 compositions: 0x0530: Modem + Diag + AT + MBIM T: Bus=04 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 3 Spd=10000 MxCh= 0 D: Ver= 3.20 Cls=00(>ifc ) Sub=00 Prot=00 MxPS= 9 #Cfgs= 1 P: Vendor=1bbb ProdID=0530 Rev=05.04 S: Manufacturer=TCL S: Product=TCL 5G USB Dongle S: SerialNumber=3136b91a C: #Ifs= 5 Cfg#= 1 Atr=80 MxPwr=896mA I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=82(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 1 Cls=02(commc) Sub=0e Prot=00 Driver=cdc_mbim E: Ad=86(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I: If#= 4 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim E: Ad=0f(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=8e(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms 0x0640: ECM + Modem + Diag + AT T: Bus=04 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 4 Spd=10000 MxCh= 0 D: Ver= 3.20 Cls=00(>ifc ) Sub=00 Prot=00 MxPS= 9 #Cfgs= 1 P: Vendor=1bbb ProdID=0640 Rev=05.04 S: Manufacturer=TCL S: Product=TCL 5G USB Dongle S: SerialNumber=3136b91a C: #Ifs= 5 Cfg#= 1 Atr=80 MxPwr=896mA I: If#= 0 Alt= 0 #EPs= 1 Cls=02(commc) Sub=06 Prot=00 Driver=cdc_ether E: Ad=81(I) Atr=03(Int.) MxPS= 16 Ivl=32ms I: If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=cdc_ether E: Ad=0f(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=8e(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=83(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms I: If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=86(I) Atr=03(Int.) MxPS= 10 Ivl=32ms Signed-off-by: Daniel Swanemar Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 9ba5584061c8c..437960002bc3b 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -2385,6 +2385,10 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x60) }, + { USB_DEVICE_INTERFACE_CLASS(0x1bbb, 0x0530, 0xff), /* TCL IK512 MBIM */ + .driver_info = NCTRL(1) }, + { USB_DEVICE_INTERFACE_CLASS(0x1bbb, 0x0640, 0xff), /* TCL IK512 ECM */ + .driver_info = NCTRL(3) }, { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, option_ids); From 724d461e44dfc0815624d2a9792f2f2beb7ee46d Mon Sep 17 00:00:00 2001 From: Michal Hrusecky Date: Tue, 19 Nov 2024 14:00:18 +0100 Subject: [PATCH 0396/1450] USB: serial: option: add MeiG Smart SLM770A Update the USB serial option driver to support MeiG Smart SLM770A. ID 2dee:4d57 Marvell Mobile Composite Device Bus T: Bus=02 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 2 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=ef(misc ) Sub=02 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=2dee ProdID=4d57 Rev= 1.00 S: Manufacturer=Marvell S: Product=Mobile Composite Device Bus C:* #Ifs= 6 Cfg#= 1 Atr=c0 MxPwr=500mA A: FirstIf#= 0 IfCount= 2 Cls=e0(wlcon) Sub=01 Prot=03 I:* If#= 0 Alt= 0 #EPs= 1 Cls=e0(wlcon) Sub=01 Prot=03 Driver=rndis_host E: Ad=87(I) Atr=03(Int.) MxPS= 64 Ivl=4096ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=rndis_host E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=0c(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=0b(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=88(I) Atr=03(Int.) MxPS= 64 Ivl=4096ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=0a(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=89(I) Atr=03(Int.) MxPS= 64 Ivl=4096ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=0f(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=0e(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms Tested successfully connecting to the Internet via rndis interface after dialing via AT commands on If#=3 or If#=4. Not sure of the purpose of the other serial interfaces. Signed-off-by: Michal Hrusecky Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 437960002bc3b..a807101548e7b 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -625,6 +625,8 @@ static void option_instat_callback(struct urb *urb); #define MEIGSMART_PRODUCT_SRM825L 0x4d22 /* MeiG Smart SLM320 based on UNISOC UIS8910 */ #define MEIGSMART_PRODUCT_SLM320 0x4d41 +/* MeiG Smart SLM770A based on ASR1803 */ +#define MEIGSMART_PRODUCT_SLM770A 0x4d57 /* Device flags */ @@ -2382,6 +2384,7 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(UNISOC_VENDOR_ID, TOZED_PRODUCT_LT70C, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(UNISOC_VENDOR_ID, LUAT_PRODUCT_AIR720U, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SLM320, 0xff, 0, 0) }, + { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SLM770A, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x60) }, From aa954ae08262bb5cd6ab18dd56a0b58c1315db8b Mon Sep 17 00:00:00 2001 From: Mank Wang Date: Fri, 22 Nov 2024 09:06:00 +0000 Subject: [PATCH 0397/1450] USB: serial: option: add Netprisma LCUK54 modules for WWAN Ready LCUK54-WRD's pid/vid 0x3731/0x010a 0x3731/0x010c LCUK54-WWD's pid/vid 0x3731/0x010b 0x3731/0x010d Above products use the exact same interface layout and option driver: MBIM + GNSS + DIAG + NMEA + AT + QDSS + DPL T: Bus=01 Lev=01 Prnt=01 Port=01 Cnt=02 Dev#= 5 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=3731 ProdID=0101 Rev= 5.04 S: Manufacturer=NetPrisma S: Product=LCUK54-WRD S: SerialNumber=feeba631 C:* #Ifs= 8 Cfg#= 1 Atr=a0 MxPwr=500mA A: FirstIf#= 0 IfCount= 2 Cls=02(comm.) Sub=0e Prot=00 I:* If#= 0 Alt= 0 #EPs= 1 Cls=02(comm.) Sub=0e Prot=00 Driver=cdc_mbim E: Ad=81(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 0 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim I:* If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim E: Ad=8e(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=0f(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=ff Driver=(none) E: Ad=82(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I:* If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=40 Driver=option E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 5 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=87(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 6 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=70 Driver=(none) E: Ad=88(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 7 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=80 Driver=(none) E: Ad=8f(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms Signed-off-by: Mank Wang [ johan: use lower case hex notation ] Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index a807101548e7b..e897c723b041c 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -2377,6 +2377,18 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0116, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WWD for Golbal EDU */ { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0116, 0xff, 0x00, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0116, 0xff, 0xff, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010a, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WRD for WWAN Ready */ + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010a, 0xff, 0x00, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010a, 0xff, 0xff, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010b, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WWD for WWAN Ready */ + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010b, 0xff, 0x00, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010b, 0xff, 0xff, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010c, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WRD for WWAN Ready */ + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010c, 0xff, 0x00, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010c, 0xff, 0xff, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010d, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WWD for WWAN Ready */ + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010d, 0xff, 0x00, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x010d, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(OPPO_VENDOR_ID, OPPO_PRODUCT_R11, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(SIERRA_VENDOR_ID, SIERRA_PRODUCT_EM9191, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(SIERRA_VENDOR_ID, SIERRA_PRODUCT_EM9191, 0xff, 0xff, 0x40) }, From f07dfa6a1b65034a5c3ba3a555950d972f252757 Mon Sep 17 00:00:00 2001 From: Jack Wu Date: Thu, 28 Nov 2024 10:22:27 +0800 Subject: [PATCH 0398/1450] USB: serial: option: add MediaTek T7XX compositions Add the MediaTek T7XX compositions: T: Bus=03 Lev=01 Prnt=01 Port=05 Cnt=01 Dev#= 74 Spd=480 MxCh= 0 D: Ver= 2.10 Cls=ef(misc ) Sub=02 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=0e8d ProdID=7129 Rev= 0.01 S: Manufacturer=MediaTek Inc. S: Product=USB DATA CARD S: SerialNumber=004402459035402 C:* #Ifs=10 Cfg#= 1 Atr=a0 MxPwr=500mA A: FirstIf#= 0 IfCount= 2 Cls=02(comm.) Sub=0e Prot=00 I:* If#= 0 Alt= 0 #EPs= 1 Cls=02(comm.) Sub=0e Prot=00 Driver=cdc_mbim E: Ad=82(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 0 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim I:* If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=(none) E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 6 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=06(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 7 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=88(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=07(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 8 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=89(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=08(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 9 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=8a(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=09(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms ------------------------------- | If Number | Function | ------------------------------- | 2 | USB AP Log Port | ------------------------------- | 3 | USB AP GNSS Port| ------------------------------- | 4 | USB AP META Port| ------------------------------- | 5 | ADB port | ------------------------------- | 6 | USB MD AT Port | ------------------------------ | 7 | USB MD META Port| ------------------------------- | 8 | USB NTZ Port | ------------------------------- | 9 | USB Debug port | ------------------------------- Signed-off-by: Jack Wu Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index e897c723b041c..dcedb88ad7c13 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -2249,6 +2249,8 @@ static const struct usb_device_id option_ids[] = { .driver_info = NCTRL(2) }, { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, 0x7127, 0xff, 0x00, 0x00), .driver_info = NCTRL(2) | NCTRL(3) | NCTRL(4) }, + { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, 0x7129, 0xff, 0x00, 0x00), /* MediaTek T7XX */ + .driver_info = NCTRL(2) | NCTRL(3) | NCTRL(4) }, { USB_DEVICE(CELLIENT_VENDOR_ID, CELLIENT_PRODUCT_MEN200) }, { USB_DEVICE(CELLIENT_VENDOR_ID, CELLIENT_PRODUCT_MPL200), .driver_info = RSVD(1) | RSVD(4) }, From 8366e64a4454481339e7c56a8ad280161f2e441d Mon Sep 17 00:00:00 2001 From: Daniele Palmas Date: Mon, 9 Dec 2024 16:32:54 +0100 Subject: [PATCH 0399/1450] USB: serial: option: add Telit FE910C04 rmnet compositions Add the following Telit FE910C04 compositions: 0x10c0: rmnet + tty (AT/NMEA) + tty (AT) + tty (diag) T: Bus=02 Lev=01 Prnt=03 Port=06 Cnt=01 Dev#= 13 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=10c0 Rev=05.15 S: Manufacturer=Telit Cinterion S: Product=FE910 S: SerialNumber=f71b8b32 C: #Ifs= 4 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=50 Driver=qmi_wwan E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=03(Int.) MxPS= 8 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=60 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms 0x10c4: rmnet + tty (AT) + tty (AT) + tty (diag) T: Bus=02 Lev=01 Prnt=03 Port=06 Cnt=01 Dev#= 14 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=10c4 Rev=05.15 S: Manufacturer=Telit Cinterion S: Product=FE910 S: SerialNumber=f71b8b32 C: #Ifs= 4 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=50 Driver=qmi_wwan E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=03(Int.) MxPS= 8 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms 0x10c8: rmnet + tty (AT) + tty (diag) + DPL (data packet logging) + adb T: Bus=02 Lev=01 Prnt=03 Port=06 Cnt=01 Dev#= 17 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=10c8 Rev=05.15 S: Manufacturer=Telit Cinterion S: Product=FE910 S: SerialNumber=f71b8b32 C: #Ifs= 5 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=50 Driver=qmi_wwan E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=03(Int.) MxPS= 8 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=80 Driver=(none) E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=(none) E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms Signed-off-by: Daniele Palmas Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index dcedb88ad7c13..64317b390d228 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1397,6 +1397,12 @@ static const struct usb_device_id option_ids[] = { .driver_info = RSVD(0) | NCTRL(2) | RSVD(3) | RSVD(4) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10aa, 0xff), /* Telit FN920C04 (MBIM) */ .driver_info = NCTRL(3) | RSVD(4) | RSVD(5) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10c0, 0xff), /* Telit FE910C04 (rmnet) */ + .driver_info = RSVD(0) | NCTRL(3) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10c4, 0xff), /* Telit FE910C04 (rmnet) */ + .driver_info = RSVD(0) | NCTRL(3) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10c8, 0xff), /* Telit FE910C04 (rmnet) */ + .driver_info = RSVD(0) | NCTRL(2) | RSVD(3) | RSVD(4) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910), .driver_info = NCTRL(0) | RSVD(1) | RSVD(3) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910_DUAL_MODEM), From b44679c63e4d3ac820998b6bd59fba89a72ad3e7 Mon Sep 17 00:00:00 2001 From: Long Li Date: Mon, 9 Dec 2024 19:42:39 +0800 Subject: [PATCH 0400/1450] iomap: pass byte granular end position to iomap_add_to_ioend This is a preparatory patch for fixing zero padding issues in concurrent append write scenarios. In the following patches, we need to obtain byte-granular writeback end position for io_size trimming after EOF handling. Due to concurrent writeback and truncate operations, inode size may shrink. Resampling inode size would force writeback code to handle the newly appeared post-EOF blocks, which is undesirable. As Dave explained in [1]: "Really, the issue is that writeback mappings have to be able to handle the range being mapped suddenly appear to be beyond EOF. This behaviour is a longstanding writeback constraint, and is what iomap_writepage_handle_eof() is attempting to handle. We handle this by only sampling i_size_read() whilst we have the folio locked and can determine the action we should take with that folio (i.e. nothing, partial zeroing, or skip altogether). Once we've made the decision that the folio is within EOF and taken action on it (i.e. moved the folio to writeback state), we cannot then resample the inode size because a truncate may have started and changed the inode size." To avoid resampling inode size after EOF handling, we convert end_pos to byte-granular writeback position and return it from EOF handling function. Since iomap_set_range_dirty() can handle unaligned lengths, this conversion has no impact on it. However, iomap_find_dirty_range() requires aligned start and end range to find dirty blocks within the given range, so the end position needs to be rounded up when passed to it. LINK [1]: https://lore.kernel.org/linux-xfs/Z1Gg0pAa54MoeYME@localhost.localdomain/ Signed-off-by: Long Li Link: https://lore.kernel.org/r/20241209114241.3725722-2-leo.lilong@huawei.com Reviewed-by: Brian Foster Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 955f19e27e47c..bcc7831d03afb 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1774,7 +1774,8 @@ static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos) */ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct folio *folio, - struct inode *inode, loff_t pos, unsigned len) + struct inode *inode, loff_t pos, loff_t end_pos, + unsigned len) { struct iomap_folio_state *ifs = folio->private; size_t poff = offset_in_folio(folio, pos); @@ -1800,8 +1801,8 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct folio *folio, - struct inode *inode, u64 pos, unsigned dirty_len, - unsigned *count) + struct inode *inode, u64 pos, u64 end_pos, + unsigned dirty_len, unsigned *count) { int error; @@ -1826,7 +1827,7 @@ static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc, break; default: error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos, - map_len); + end_pos, map_len); if (!error) (*count)++; break; @@ -1897,11 +1898,11 @@ static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode, * remaining memory is zeroed when mapped, and writes to that * region are not written out to the file. * - * Also adjust the writeback range to skip all blocks entirely - * beyond i_size. + * Also adjust the end_pos to the end of file and skip writeback + * for all blocks entirely beyond i_size. */ folio_zero_segment(folio, poff, folio_size(folio)); - *end_pos = round_up(isize, i_blocksize(inode)); + *end_pos = isize; } return true; @@ -1914,6 +1915,7 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, struct inode *inode = folio->mapping->host; u64 pos = folio_pos(folio); u64 end_pos = pos + folio_size(folio); + u64 end_aligned = 0; unsigned count = 0; int error = 0; u32 rlen; @@ -1955,9 +1957,10 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, /* * Walk through the folio to find dirty areas to write back. */ - while ((rlen = iomap_find_dirty_range(folio, &pos, end_pos))) { + end_aligned = round_up(end_pos, i_blocksize(inode)); + while ((rlen = iomap_find_dirty_range(folio, &pos, end_aligned))) { error = iomap_writepage_map_blocks(wpc, wbc, folio, inode, - pos, rlen, &count); + pos, end_pos, rlen, &count); if (error) break; pos += rlen; From 51d20d1dacbec589d459e11fc88fbca419f84a99 Mon Sep 17 00:00:00 2001 From: Long Li Date: Mon, 9 Dec 2024 19:42:40 +0800 Subject: [PATCH 0401/1450] iomap: fix zero padding data issue in concurrent append writes During concurrent append writes to XFS filesystem, zero padding data may appear in the file after power failure. This happens due to imprecise disk size updates when handling write completion. Consider this scenario with concurrent append writes same file: Thread 1: Thread 2: ------------ ----------- write [A, A+B] update inode size to A+B submit I/O [A, A+BS] write [A+B, A+B+C] update inode size to A+B+C After reboot: 1) with A+B+C < A+BS, the file has zero padding in range [A+B, A+B+C] |< Block Size (BS) >| |DDDDDDDDDDDDDDDD0000000000000000| ^ ^ ^ A A+B A+B+C (EOF) 2) with A+B+C > A+BS, the file has zero padding in range [A+B, A+BS] |< Block Size (BS) >|< Block Size (BS) >| |DDDDDDDDDDDDDDDD0000000000000000|00000000000000000000000000000000| ^ ^ ^ ^ A A+B A+BS A+B+C (EOF) D = Valid Data 0 = Zero Padding The issue stems from disk size being set to min(io_offset + io_size, inode->i_size) at I/O completion. Since io_offset+io_size is block size granularity, it may exceed the actual valid file data size. In the case of concurrent append writes, inode->i_size may be larger than the actual range of valid file data written to disk, leading to inaccurate disk size updates. This patch modifies the meaning of io_size to represent the size of valid data within EOF in an ioend. If the ioend spans beyond i_size, io_size will be trimmed to provide the file with more accurate size information. This is particularly useful for on-disk size updates at completion time. After this change, ioends that span i_size will not grow or merge with other ioends in concurrent scenarios. However, these cases that need growth/merging rarely occur and it seems no noticeable performance impact. Although rounding up io_size could enable ioend growth/merging in these scenarios, we decided to keep the code simple after discussion [1]. Another benefit is that it makes the xfs_ioend_is_append() check more accurate, which can reduce unnecessary end bio callbacks of xfs_end_bio() in certain scenarios, such as repeated writes at the file tail without extending the file size. Link [1]: https://patchwork.kernel.org/project/xfs/patch/20241113091907.56937-1-leo.lilong@huawei.com Fixes: ae259a9c8593 ("fs: introduce iomap infrastructure") # goes further back than this Signed-off-by: Long Li Link: https://lore.kernel.org/r/20241209114241.3725722-3-leo.lilong@huawei.com Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 45 ++++++++++++++++++++++++++++++++++++++++++ include/linux/iomap.h | 2 +- 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index bcc7831d03afb..54dc27d927810 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1794,7 +1794,52 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, if (ifs) atomic_add(len, &ifs->write_bytes_pending); + + /* + * Clamp io_offset and io_size to the incore EOF so that ondisk + * file size updates in the ioend completion are byte-accurate. + * This avoids recovering files with zeroed tail regions when + * writeback races with appending writes: + * + * Thread 1: Thread 2: + * ------------ ----------- + * write [A, A+B] + * update inode size to A+B + * submit I/O [A, A+BS] + * write [A+B, A+B+C] + * update inode size to A+B+C + * + * + * + * After reboot: + * 1) with A+B+C < A+BS, the file has zero padding in range + * [A+B, A+B+C] + * + * |< Block Size (BS) >| + * |DDDDDDDDDDDD0000000000000| + * ^ ^ ^ + * A A+B A+B+C + * (EOF) + * + * 2) with A+B+C > A+BS, the file has zero padding in range + * [A+B, A+BS] + * + * |< Block Size (BS) >|< Block Size (BS) >| + * |DDDDDDDDDDDD0000000000000|00000000000000000000000000| + * ^ ^ ^ ^ + * A A+B A+BS A+B+C + * (EOF) + * + * D = Valid Data + * 0 = Zero Padding + * + * Note that this defeats the ability to chain the ioends of + * appending writes. + */ wpc->ioend->io_size += len; + if (wpc->ioend->io_offset + wpc->ioend->io_size > end_pos) + wpc->ioend->io_size = end_pos - wpc->ioend->io_offset; + wbc_account_cgroup_owner(wbc, folio, len); return 0; } diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 5675af6b740c2..75bf54e76f3b8 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -335,7 +335,7 @@ struct iomap_ioend { u16 io_type; u16 io_flags; /* IOMAP_F_* */ struct inode *io_inode; /* file being written to */ - size_t io_size; /* size of the extent */ + size_t io_size; /* size of data within eof */ loff_t io_offset; /* offset in the file */ sector_t io_sector; /* start sector of ioend */ struct bio io_bio; /* MUST BE LAST! */ From 54d3970548bd9de40f921c95f8c31e1b1b2382cb Mon Sep 17 00:00:00 2001 From: Nikita Yushchenko Date: Mon, 9 Dec 2024 11:24:11 +0500 Subject: [PATCH 0402/1450] net: renesas: rswitch: enable only used MFWD features Currently, rswitch driver does not utilize most of MFWD forwarding and processing features. It only uses port-based forwarding for ETHA ports, and direct descriptor forwarding for GWCA port. Update rswitch_fwd_init() to enable exactly that, and keep everything else disabled. Signed-off-by: Nikita Yushchenko Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Signed-off-by: David S. Miller --- drivers/net/ethernet/renesas/rswitch.c | 30 +++++++++++++++++--------- drivers/net/ethernet/renesas/rswitch.h | 14 ++++++------ 2 files changed, 28 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/renesas/rswitch.c b/drivers/net/ethernet/renesas/rswitch.c index b754cc96e268d..ec52f04163893 100644 --- a/drivers/net/ethernet/renesas/rswitch.c +++ b/drivers/net/ethernet/renesas/rswitch.c @@ -111,25 +111,35 @@ static void rswitch_top_init(struct rswitch_private *priv) /* Forwarding engine block (MFWD) */ static void rswitch_fwd_init(struct rswitch_private *priv) { + u32 all_ports_mask = GENMASK(RSWITCH_NUM_AGENTS - 1, 0); unsigned int i; - /* For ETHA */ - for (i = 0; i < RSWITCH_NUM_PORTS; i++) { - iowrite32(FWPC0_DEFAULT, priv->addr + FWPC0(i)); + /* Start with empty configuration */ + for (i = 0; i < RSWITCH_NUM_AGENTS; i++) { + /* Disable all port features */ + iowrite32(0, priv->addr + FWPC0(i)); + /* Disallow L3 forwarding and direct descriptor forwarding */ + iowrite32(FIELD_PREP(FWCP1_LTHFW, all_ports_mask), + priv->addr + FWPC1(i)); + /* Disallow L2 forwarding */ + iowrite32(FIELD_PREP(FWCP2_LTWFW, all_ports_mask), + priv->addr + FWPC2(i)); + /* Disallow port based forwarding */ iowrite32(0, priv->addr + FWPBFC(i)); } - for (i = 0; i < RSWITCH_NUM_PORTS; i++) { + /* For enabled ETHA ports, setup port based forwarding */ + rswitch_for_each_enabled_port(priv, i) { + /* Port based forwarding from port i to GWCA port */ + rswitch_modify(priv->addr, FWPBFC(i), FWPBFC_PBDV, + FIELD_PREP(FWPBFC_PBDV, BIT(priv->gwca.index))); + /* Within GWCA port, forward to Rx queue for port i */ iowrite32(priv->rdev[i]->rx_queue->index, priv->addr + FWPBFCSDC(GWCA_INDEX, i)); - iowrite32(BIT(priv->gwca.index), priv->addr + FWPBFC(i)); } - /* For GWCA */ - iowrite32(FWPC0_DEFAULT, priv->addr + FWPC0(priv->gwca.index)); - iowrite32(FWPC1_DDE, priv->addr + FWPC1(priv->gwca.index)); - iowrite32(0, priv->addr + FWPBFC(priv->gwca.index)); - iowrite32(GENMASK(RSWITCH_NUM_PORTS - 1, 0), priv->addr + FWPBFC(priv->gwca.index)); + /* For GWCA port, allow direct descriptor forwarding */ + rswitch_modify(priv->addr, FWPC1(priv->gwca.index), FWPC1_DDE, FWPC1_DDE); } /* Gateway CPU agent block (GWCA) */ diff --git a/drivers/net/ethernet/renesas/rswitch.h b/drivers/net/ethernet/renesas/rswitch.h index 303883369b94a..0f06f34db8217 100644 --- a/drivers/net/ethernet/renesas/rswitch.h +++ b/drivers/net/ethernet/renesas/rswitch.h @@ -12,6 +12,7 @@ #define RSWITCH_MAX_NUM_QUEUES 128 +#define RSWITCH_NUM_AGENTS 5 #define RSWITCH_NUM_PORTS 3 #define rswitch_for_each_enabled_port(priv, i) \ for (i = 0; i < RSWITCH_NUM_PORTS; i++) \ @@ -806,6 +807,7 @@ enum rswitch_gwca_mode { #define CABPPFLC_INIT_VALUE 0x00800080 /* MFWD */ +#define FWPC0(i) (FWPC00 + (i) * 0x10) #define FWPC0_LTHTA BIT(0) #define FWPC0_IP4UE BIT(3) #define FWPC0_IP4TE BIT(4) @@ -819,15 +821,15 @@ enum rswitch_gwca_mode { #define FWPC0_MACHMA BIT(27) #define FWPC0_VLANSA BIT(28) -#define FWPC0(i) (FWPC00 + (i) * 0x10) -#define FWPC0_DEFAULT (FWPC0_LTHTA | FWPC0_IP4UE | FWPC0_IP4TE | \ - FWPC0_IP4OE | FWPC0_L2SE | FWPC0_IP4EA | \ - FWPC0_IPDSA | FWPC0_IPHLA | FWPC0_MACSDA | \ - FWPC0_MACHLA | FWPC0_MACHMA | FWPC0_VLANSA) #define FWPC1(i) (FWPC10 + (i) * 0x10) +#define FWCP1_LTHFW GENMASK(16 + (RSWITCH_NUM_AGENTS - 1), 16) #define FWPC1_DDE BIT(0) -#define FWPBFC(i) (FWPBFC0 + (i) * 0x10) +#define FWPC2(i) (FWPC20 + (i) * 0x10) +#define FWCP2_LTWFW GENMASK(16 + (RSWITCH_NUM_AGENTS - 1), 16) + +#define FWPBFC(i) (FWPBFC0 + (i) * 0x10) +#define FWPBFC_PBDV GENMASK(RSWITCH_NUM_AGENTS - 1, 0) #define FWPBFCSDC(j, i) (FWPBFCSDC00 + (i) * 0x10 + (j) * 0x04) From c0b8980e6041afa363361e41fcafd7862721c3ee Mon Sep 17 00:00:00 2001 From: James Chapman Date: Mon, 9 Dec 2024 11:46:07 +0000 Subject: [PATCH 0403/1450] l2tp: Handle eth stats using NETDEV_PCPU_STAT_DSTATS. l2tp_eth uses the TSTATS infrastructure (dev_sw_netstats_*()) for RX and TX packet counters and DEV_STATS_INC for dropped counters. Consolidate that using the DSTATS infrastructure, which can handle both packet counters and packet drops. Statistics that don't fit DSTATS are still updated atomically with DEV_STATS_INC(). This change is inspired by the introduction of DSTATS helpers and their use in other udp tunnel drivers: Link: https://lore.kernel.org/all/cover.1733313925.git.gnault@redhat.com/ Signed-off-by: James Chapman Reviewed-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_eth.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index d692b902e120c..e836910734967 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -73,9 +73,9 @@ static netdev_tx_t l2tp_eth_dev_xmit(struct sk_buff *skb, struct net_device *dev int ret = l2tp_xmit_skb(session, skb); if (likely(ret == NET_XMIT_SUCCESS)) - dev_sw_netstats_tx_add(dev, 1, len); + dev_dstats_tx_add(dev, len); else - DEV_STATS_INC(dev, tx_dropped); + dev_dstats_tx_dropped(dev); return NETDEV_TX_OK; } @@ -84,7 +84,6 @@ static const struct net_device_ops l2tp_eth_netdev_ops = { .ndo_init = l2tp_eth_dev_init, .ndo_uninit = l2tp_eth_dev_uninit, .ndo_start_xmit = l2tp_eth_dev_xmit, - .ndo_get_stats64 = dev_get_tstats64, .ndo_set_mac_address = eth_mac_addr, }; @@ -100,7 +99,7 @@ static void l2tp_eth_dev_setup(struct net_device *dev) dev->lltx = true; dev->netdev_ops = &l2tp_eth_netdev_ops; dev->needs_free_netdev = true; - dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; } static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, int data_len) @@ -128,7 +127,7 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, goto error_rcu; if (dev_forward_skb(dev, skb) == NET_RX_SUCCESS) - dev_sw_netstats_rx_add(dev, data_len); + dev_dstats_rx_add(dev, data_len); else DEV_STATS_INC(dev, rx_errors); From 220326c4650a0ef7db3bfcae903f758555ecb973 Mon Sep 17 00:00:00 2001 From: Huy Minh Date: Tue, 10 Dec 2024 22:45:00 +0700 Subject: [PATCH 0404/1450] platform/x86: touchscreen_dmi: Add info for SARY Tab 3 tablet MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's no info about the OEM behind the tablet, only online stores listing. This tablet uses an Intel Atom x5-Z8300, 4GB of RAM & 64GB of storage. Signed-off-by: Huy Minh Link: https://lore.kernel.org/r/20241210154500.32124-1-buingoc67@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/touchscreen_dmi.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/drivers/platform/x86/touchscreen_dmi.c b/drivers/platform/x86/touchscreen_dmi.c index 0a39f68c641d1..bdc19cd8d3edf 100644 --- a/drivers/platform/x86/touchscreen_dmi.c +++ b/drivers/platform/x86/touchscreen_dmi.c @@ -855,6 +855,23 @@ static const struct ts_dmi_data rwc_nanote_next_data = { .properties = rwc_nanote_next_props, }; +static const struct property_entry sary_tab_3_props[] = { + PROPERTY_ENTRY_U32("touchscreen-size-x", 1730), + PROPERTY_ENTRY_U32("touchscreen-size-y", 1151), + PROPERTY_ENTRY_BOOL("touchscreen-inverted-x"), + PROPERTY_ENTRY_BOOL("touchscreen-inverted-y"), + PROPERTY_ENTRY_BOOL("touchscreen-swapped-x-y"), + PROPERTY_ENTRY_STRING("firmware-name", "gsl1680-sary-tab-3.fw"), + PROPERTY_ENTRY_U32("silead,max-fingers", 10), + PROPERTY_ENTRY_BOOL("silead,home-button"), + { } +}; + +static const struct ts_dmi_data sary_tab_3_data = { + .acpi_name = "MSSL1680:00", + .properties = sary_tab_3_props, +}; + static const struct property_entry schneider_sct101ctm_props[] = { PROPERTY_ENTRY_U32("touchscreen-size-x", 1715), PROPERTY_ENTRY_U32("touchscreen-size-y", 1140), @@ -1615,6 +1632,15 @@ const struct dmi_system_id touchscreen_dmi_table[] = { DMI_MATCH(DMI_BIOS_VERSION, "S8A70R100-V005"), }, }, + { + /* SARY Tab 3 */ + .driver_data = (void *)&sary_tab_3_data, + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "SARY"), + DMI_MATCH(DMI_PRODUCT_NAME, "C210C"), + DMI_MATCH(DMI_PRODUCT_SKU, "TAB3"), + }, + }, { /* Schneider SCT101CTM */ .driver_data = (void *)&schneider_sct101ctm_data, From 6c0a473fc5f89dabbed0af605a09370b533aa856 Mon Sep 17 00:00:00 2001 From: Jithu Joseph Date: Tue, 10 Dec 2024 12:31:52 -0800 Subject: [PATCH 0405/1450] platform/x86/intel/ifs: Add Clearwater Forest to CPU support list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Clearwater Forest (INTEL_ATOM_DARKMONT_X) to the x86 match table of Intel In Field Scan (IFS) driver, enabling IFS functionality on this processor. Signed-off-by: Jithu Joseph Link: https://lore.kernel.org/r/20241210203152.1136463-1-jithu.joseph@intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/ifs/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/intel/ifs/core.c b/drivers/platform/x86/intel/ifs/core.c index bc252b883210a..1ae50702bdb76 100644 --- a/drivers/platform/x86/intel/ifs/core.c +++ b/drivers/platform/x86/intel/ifs/core.c @@ -20,6 +20,7 @@ static const struct x86_cpu_id ifs_cpu_ids[] __initconst = { X86_MATCH(INTEL_GRANITERAPIDS_X, ARRAY_GEN0), X86_MATCH(INTEL_GRANITERAPIDS_D, ARRAY_GEN0), X86_MATCH(INTEL_ATOM_CRESTMONT_X, ARRAY_GEN1), + X86_MATCH(INTEL_ATOM_DARKMONT_X, ARRAY_GEN1), {} }; MODULE_DEVICE_TABLE(x86cpu, ifs_cpu_ids); From 83848e37f6ee80f60b04139fefdfa1bde4aaa826 Mon Sep 17 00:00:00 2001 From: Xi Pardee Date: Tue, 10 Dec 2024 13:26:41 -0800 Subject: [PATCH 0406/1450] platform/x86/intel/vsec: Add support for Panther Lake MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Panther Lake PMT telemetry support. Signed-off-by: Xi Pardee Link: https://lore.kernel.org/r/20241210212646.239211-1-xi.pardee@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/vsec.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/platform/x86/intel/vsec.c b/drivers/platform/x86/intel/vsec.c index 9e0f8e38178c2..e54b6a2a16818 100644 --- a/drivers/platform/x86/intel/vsec.c +++ b/drivers/platform/x86/intel/vsec.c @@ -423,6 +423,7 @@ static const struct intel_vsec_platform_info lnl_info = { #define PCI_DEVICE_ID_INTEL_VSEC_RPL 0xa77d #define PCI_DEVICE_ID_INTEL_VSEC_TGL 0x9a0d #define PCI_DEVICE_ID_INTEL_VSEC_LNL_M 0x647d +#define PCI_DEVICE_ID_INTEL_VSEC_PTL 0xb07d static const struct pci_device_id intel_vsec_pci_ids[] = { { PCI_DEVICE_DATA(INTEL, VSEC_ADL, &tgl_info) }, { PCI_DEVICE_DATA(INTEL, VSEC_DG1, &dg1_info) }, @@ -432,6 +433,7 @@ static const struct pci_device_id intel_vsec_pci_ids[] = { { PCI_DEVICE_DATA(INTEL, VSEC_RPL, &tgl_info) }, { PCI_DEVICE_DATA(INTEL, VSEC_TGL, &tgl_info) }, { PCI_DEVICE_DATA(INTEL, VSEC_LNL_M, &lnl_info) }, + { PCI_DEVICE_DATA(INTEL, VSEC_PTL, &mtl_info) }, { } }; MODULE_DEVICE_TABLE(pci, intel_vsec_pci_ids); From 48ca421268735c60ea0d4c2e19610b224d5c8656 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Wed, 11 Dec 2024 17:39:45 +0900 Subject: [PATCH 0407/1450] MAINTAINERS: add me as reviewer for sched_ext Add me as a reviewer for sched_ext. I have been actively working on the project and would like to help review patches and address related kernel issues/features. Signed-off-by: Changwoo Min Signed-off-by: Tejun Heo --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 41bae8792a77b..cbfa4e97a3ff7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20903,6 +20903,7 @@ SCHEDULER - SCHED_EXT R: Tejun Heo R: David Vernet R: Andrea Righi +R: Changwoo Min L: linux-kernel@vger.kernel.org S: Maintained W: https://github.com/sched-ext/scx From 2dd59fe0e19e1ab955259978082b62e5751924c7 Mon Sep 17 00:00:00 2001 From: Nikita Zhandarovich Date: Fri, 17 May 2024 08:58:00 -0700 Subject: [PATCH 0408/1450] media: dvb-frontends: dib3000mb: fix uninit-value in dib3000_write_reg Syzbot reports [1] an uninitialized value issue found by KMSAN in dib3000_read_reg(). Local u8 rb[2] is used in i2c_transfer() as a read buffer; in case that call fails, the buffer may end up with some undefined values. Since no elaborate error handling is expected in dib3000_write_reg(), simply zero out rb buffer to mitigate the problem. [1] Syzkaller report dvb-usb: bulk message failed: -22 (6/0) ===================================================== BUG: KMSAN: uninit-value in dib3000mb_attach+0x2d8/0x3c0 drivers/media/dvb-frontends/dib3000mb.c:758 dib3000mb_attach+0x2d8/0x3c0 drivers/media/dvb-frontends/dib3000mb.c:758 dibusb_dib3000mb_frontend_attach+0x155/0x2f0 drivers/media/usb/dvb-usb/dibusb-mb.c:31 dvb_usb_adapter_frontend_init+0xed/0x9a0 drivers/media/usb/dvb-usb/dvb-usb-dvb.c:290 dvb_usb_adapter_init drivers/media/usb/dvb-usb/dvb-usb-init.c:90 [inline] dvb_usb_init drivers/media/usb/dvb-usb/dvb-usb-init.c:186 [inline] dvb_usb_device_init+0x25a8/0x3760 drivers/media/usb/dvb-usb/dvb-usb-init.c:310 dibusb_probe+0x46/0x250 drivers/media/usb/dvb-usb/dibusb-mb.c:110 ... Local variable rb created at: dib3000_read_reg+0x86/0x4e0 drivers/media/dvb-frontends/dib3000mb.c:54 dib3000mb_attach+0x123/0x3c0 drivers/media/dvb-frontends/dib3000mb.c:758 ... Fixes: 74340b0a8bc6 ("V4L/DVB (4457): Remove dib3000-common-module") Reported-by: syzbot+c88fc0ebe0d5935c70da@syzkaller.appspotmail.com Signed-off-by: Nikita Zhandarovich Link: https://lore.kernel.org/r/20240517155800.9881-1-n.zhandarovich@fintech.ru Signed-off-by: Mauro Carvalho Chehab --- drivers/media/dvb-frontends/dib3000mb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/dvb-frontends/dib3000mb.c b/drivers/media/dvb-frontends/dib3000mb.c index 822639f11c04f..63bc7b74bc8b4 100644 --- a/drivers/media/dvb-frontends/dib3000mb.c +++ b/drivers/media/dvb-frontends/dib3000mb.c @@ -51,7 +51,7 @@ MODULE_PARM_DESC(debug, "set debugging level (1=info,2=xfer,4=setfe,8=getfe (|-a static int dib3000_read_reg(struct dib3000_state *state, u16 reg) { u8 wb[] = { ((reg >> 8) | 0x80) & 0xff, reg & 0xff }; - u8 rb[2]; + u8 rb[2] = {}; struct i2c_msg msg[] = { { .addr = state->config.demod_address, .flags = 0, .buf = wb, .len = 2 }, { .addr = state->config.demod_address, .flags = I2C_M_RD, .buf = rb, .len = 2 }, From 5fa49dd8e521a42379e5e41fcf2c92edaaec0a8b Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Mon, 9 Dec 2024 17:43:48 +0100 Subject: [PATCH 0409/1450] s390/ipl: Fix never less than zero warning DEFINE_IPL_ATTR_STR_RW() macro produces "unsigned 'len' is never less than zero." warning when sys_vmcmd_on_*_store() callbacks are defined. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202412081614.5uel8F6W-lkp@intel.com/ Fixes: 247576bf624a ("s390/ipl: Do not accept z/VM CP diag X'008' cmds longer than max length") Reviewed-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/kernel/ipl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index edbb52ce3f1ec..7d12a1305fc99 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -270,7 +270,7 @@ static ssize_t sys_##_prefix##_##_name##_store(struct kobject *kobj, \ if (len >= sizeof(_value)) \ return -E2BIG; \ len = strscpy(_value, buf, sizeof(_value)); \ - if (len < 0) \ + if ((ssize_t)len < 0) \ return len; \ strim(_value); \ return len; \ From 18b2093f4598d8ee67a8153badc93f0fa7686b8a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 11 Dec 2024 11:01:51 -1000 Subject: [PATCH 0410/1450] sched_ext: Fix invalid irq restore in scx_ops_bypass() While adding outer irqsave/restore locking, 0e7ffff1b811 ("scx: Fix raciness in scx_ops_bypass()") forgot to convert an inner rq_unlock_irqrestore() to rq_unlock() which could re-enable IRQ prematurely leading to the following warning: raw_local_irq_restore() called with IRQs enabled WARNING: CPU: 1 PID: 96 at kernel/locking/irqflag-debug.c:10 warn_bogus_irq_restore+0x30/0x40 ... Sched_ext: create_dsq (enabling) pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : warn_bogus_irq_restore+0x30/0x40 lr : warn_bogus_irq_restore+0x30/0x40 ... Call trace: warn_bogus_irq_restore+0x30/0x40 (P) warn_bogus_irq_restore+0x30/0x40 (L) scx_ops_bypass+0x224/0x3b8 scx_ops_enable.isra.0+0x2c8/0xaa8 bpf_scx_reg+0x18/0x30 ... irq event stamp: 33739 hardirqs last enabled at (33739): [] scx_ops_bypass+0x174/0x3b8 hardirqs last disabled at (33738): [] _raw_spin_lock_irqsave+0xb4/0xd8 Drop the stray _irqrestore(). Signed-off-by: Tejun Heo Reported-by: Ihor Solodrai Link: http://lkml.kernel.org/r/qC39k3UsonrBYD_SmuxHnZIQLsuuccoCrkiqb_BT7DvH945A1_LZwE4g-5Pu9FcCtqZt4lY1HhIPi0homRuNWxkgo1rgP3bkxa0donw8kV4=@pm.me Fixes: 0e7ffff1b811 ("scx: Fix raciness in scx_ops_bypass()") Cc: stable@vger.kernel.org # v6.12 --- kernel/sched/ext.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7fff1d0454770..98519e6d0dcd0 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4763,7 +4763,7 @@ static void scx_ops_bypass(bool bypass) * sees scx_rq_bypassing() before moving tasks to SCX. */ if (!scx_enabled()) { - rq_unlock_irqrestore(rq, &rf); + rq_unlock(rq, &rf); continue; } From ebefac5647968679f6ef5803e5d35a71997d20fa Mon Sep 17 00:00:00 2001 From: Robert Beckett Date: Tue, 12 Nov 2024 19:50:00 +0000 Subject: [PATCH 0411/1450] nvme-pci: 512 byte aligned dma pool segment quirk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We initially introduced a quick fix limiting the queue depth to 1 as experimentation showed that it fixed data corruption on 64GB steamdecks. Further experimentation revealed corruption only happens when the last PRP data element aligns to the end of the page boundary. The device appears to treat this as a PRP chain to a new list instead of the data element that it actually is. This implementation is in violation of the spec. Encountering this errata with the Linux driver requires the host request a 128k transfer and coincidently be handed the last small pool dma buffer within a page. The QD1 quirk effectly works around this because the last data PRP always was at a 248 byte offset from the page start, so it never appeared at the end of the page, but comes at the expense of throttling IO and wasting the remainder of the PRP page beyond 256 bytes. Also to note, the MDTS on these devices is small enough that the "large" prp pool can hold enough PRP elements to never reach the end, so that pool is not a problem either. Introduce a new quirk to ensure the small pool is always aligned such that the last PRP element can't appear a the end of the page. This comes at the expense of wasting 256 bytes per small pool page allocated. Link: https://lore.kernel.org/linux-nvme/20241113043151.GA20077@lst.de/T/#u Fixes: 83bdfcbdbe5d ("nvme-pci: qdepth 1 quirk") Cc: Paweł Anikiel Signed-off-by: Robert Beckett Signed-off-by: Keith Busch --- drivers/nvme/host/nvme.h | 5 +++++ drivers/nvme/host/pci.c | 9 +++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 611b02c8a8b37..c4bb8dfe1a458 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -173,6 +173,11 @@ enum nvme_quirks { * MSI (but not MSI-X) interrupts are broken and never fire. */ NVME_QUIRK_BROKEN_MSI = (1 << 21), + + /* + * Align dma pool segment size to 512 bytes + */ + NVME_QUIRK_DMAPOOL_ALIGN_512 = (1 << 22), }; /* diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 1a5ba80f1811a..e2634f437f33c 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2834,15 +2834,20 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown) static int nvme_setup_prp_pools(struct nvme_dev *dev) { + size_t small_align = 256; + dev->prp_page_pool = dma_pool_create("prp list page", dev->dev, NVME_CTRL_PAGE_SIZE, NVME_CTRL_PAGE_SIZE, 0); if (!dev->prp_page_pool) return -ENOMEM; + if (dev->ctrl.quirks & NVME_QUIRK_DMAPOOL_ALIGN_512) + small_align = 512; + /* Optimisation for I/Os between 4k and 128k */ dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev, - 256, 256, 0); + 256, small_align, 0); if (!dev->prp_small_pool) { dma_pool_destroy(dev->prp_page_pool); return -ENOMEM; @@ -3607,7 +3612,7 @@ static const struct pci_device_id nvme_id_table[] = { { PCI_VDEVICE(REDHAT, 0x0010), /* Qemu emulated controller */ .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1217, 0x8760), /* O2 Micro 64GB Steam Deck */ - .driver_data = NVME_QUIRK_QDEPTH_ONE }, + .driver_data = NVME_QUIRK_DMAPOOL_ALIGN_512, }, { PCI_DEVICE(0x126f, 0x2262), /* Silicon Motion generic */ .driver_data = NVME_QUIRK_NO_DEEPEST_PS | NVME_QUIRK_BOGUS_NID, }, From 9cb189a882738c1d28b349d4e7c6a1ef9b3d8f87 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 4 Dec 2024 17:26:19 +0100 Subject: [PATCH 0412/1450] udmabuf: fix racy memfd sealing check The current check_memfd_seals() is racy: Since we first do check_memfd_seals() and then udmabuf_pin_folios() without holding any relevant lock across both, F_SEAL_WRITE can be set in between. This is problematic because we can end up holding pins to pages in a write-sealed memfd. Fix it using the inode lock, that's probably the easiest way. In the future, we might want to consider moving this logic into memfd, especially if anyone else wants to use memfd_pin_folios(). Reported-by: Julian Orth Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219106 Closes: https://lore.kernel.org/r/CAG48ez0w8HrFEZtJkfmkVKFDhE5aP7nz=obrimeTgpD+StkV9w@mail.gmail.com Fixes: fbb0de795078 ("Add udmabuf misc device") Cc: stable@vger.kernel.org Signed-off-by: Jann Horn Acked-by: Joel Fernandes (Google) Acked-by: Vivek Kasireddy Signed-off-by: Vivek Kasireddy Link: https://patchwork.freedesktop.org/patch/msgid/20241204-udmabuf-fixes-v2-1-23887289de1c@google.com --- drivers/dma-buf/udmabuf.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c index 8ce1f074c2d32..c1d8c2766d6d3 100644 --- a/drivers/dma-buf/udmabuf.c +++ b/drivers/dma-buf/udmabuf.c @@ -436,14 +436,19 @@ static long udmabuf_create(struct miscdevice *device, goto err; } + /* + * Take the inode lock to protect against concurrent + * memfd_add_seals(), which takes this lock in write mode. + */ + inode_lock_shared(file_inode(memfd)); ret = check_memfd_seals(memfd); - if (ret < 0) { - fput(memfd); - goto err; - } + if (ret) + goto out_unlock; ret = udmabuf_pin_folios(ubuf, memfd, list[i].offset, list[i].size, folios); +out_unlock: + inode_unlock_shared(file_inode(memfd)); fput(memfd); if (ret) goto err; From 0a16e24e34f28210f68195259456c73462518597 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 4 Dec 2024 17:26:20 +0100 Subject: [PATCH 0413/1450] udmabuf: also check for F_SEAL_FUTURE_WRITE When F_SEAL_FUTURE_WRITE was introduced, it was overlooked that udmabuf must reject memfds with this flag, just like ones with F_SEAL_WRITE. Fix it by adding F_SEAL_FUTURE_WRITE to SEALS_DENIED. Fixes: ab3948f58ff8 ("mm/memfd: add an F_SEAL_FUTURE_WRITE seal to memfd") Cc: stable@vger.kernel.org Acked-by: Vivek Kasireddy Signed-off-by: Jann Horn Reviewed-by: Joel Fernandes (Google) Signed-off-by: Vivek Kasireddy Link: https://patchwork.freedesktop.org/patch/msgid/20241204-udmabuf-fixes-v2-2-23887289de1c@google.com --- drivers/dma-buf/udmabuf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c index c1d8c2766d6d3..b330b99fcc761 100644 --- a/drivers/dma-buf/udmabuf.c +++ b/drivers/dma-buf/udmabuf.c @@ -297,7 +297,7 @@ static const struct dma_buf_ops udmabuf_ops = { }; #define SEALS_WANTED (F_SEAL_SHRINK) -#define SEALS_DENIED (F_SEAL_WRITE) +#define SEALS_DENIED (F_SEAL_WRITE|F_SEAL_FUTURE_WRITE) static int check_memfd_seals(struct file *memfd) { From f49856f525acd5bef52ae28b7da2e001bbe7439e Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 4 Dec 2024 17:26:21 +0100 Subject: [PATCH 0414/1450] udmabuf: fix memory leak on last export_udmabuf() error path In export_udmabuf(), if dma_buf_fd() fails because the FD table is full, a dma_buf owning the udmabuf has already been created; but the error handling in udmabuf_create() will tear down the udmabuf without doing anything about the containing dma_buf. This leaves a dma_buf in memory that contains a dangling pointer; though that doesn't seem to lead to anything bad except a memory leak. Fix it by moving the dma_buf_fd() call out of export_udmabuf() so that we can give it different error handling. Note that the shape of this code changed a lot in commit 5e72b2b41a21 ("udmabuf: convert udmabuf driver to use folios"); but the memory leak seems to have existed since the introduction of udmabuf. Fixes: fbb0de795078 ("Add udmabuf misc device") Acked-by: Vivek Kasireddy Signed-off-by: Jann Horn Signed-off-by: Vivek Kasireddy Link: https://patchwork.freedesktop.org/patch/msgid/20241204-udmabuf-fixes-v2-3-23887289de1c@google.com --- drivers/dma-buf/udmabuf.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c index b330b99fcc761..cc7398cc17d67 100644 --- a/drivers/dma-buf/udmabuf.c +++ b/drivers/dma-buf/udmabuf.c @@ -317,12 +317,10 @@ static int check_memfd_seals(struct file *memfd) return 0; } -static int export_udmabuf(struct udmabuf *ubuf, - struct miscdevice *device, - u32 flags) +static struct dma_buf *export_udmabuf(struct udmabuf *ubuf, + struct miscdevice *device) { DEFINE_DMA_BUF_EXPORT_INFO(exp_info); - struct dma_buf *buf; ubuf->device = device; exp_info.ops = &udmabuf_ops; @@ -330,11 +328,7 @@ static int export_udmabuf(struct udmabuf *ubuf, exp_info.priv = ubuf; exp_info.flags = O_RDWR; - buf = dma_buf_export(&exp_info); - if (IS_ERR(buf)) - return PTR_ERR(buf); - - return dma_buf_fd(buf, flags); + return dma_buf_export(&exp_info); } static long udmabuf_pin_folios(struct udmabuf *ubuf, struct file *memfd, @@ -391,6 +385,7 @@ static long udmabuf_create(struct miscdevice *device, struct folio **folios = NULL; pgoff_t pgcnt = 0, pglimit; struct udmabuf *ubuf; + struct dma_buf *dmabuf; long ret = -EINVAL; u32 i, flags; @@ -455,9 +450,20 @@ static long udmabuf_create(struct miscdevice *device, } flags = head->flags & UDMABUF_FLAGS_CLOEXEC ? O_CLOEXEC : 0; - ret = export_udmabuf(ubuf, device, flags); - if (ret < 0) + dmabuf = export_udmabuf(ubuf, device); + if (IS_ERR(dmabuf)) { + ret = PTR_ERR(dmabuf); goto err; + } + /* + * Ownership of ubuf is held by the dmabuf from here. + * If the following dma_buf_fd() fails, dma_buf_put() cleans up both the + * dmabuf and the ubuf (through udmabuf_ops.release). + */ + + ret = dma_buf_fd(dmabuf, flags); + if (ret < 0) + dma_buf_put(dmabuf); kvfree(folios); return ret; From 1015d61570802c7e0794585934d3bc5e09de743e Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Fri, 6 Dec 2024 13:57:10 +0800 Subject: [PATCH 0415/1450] wifi: rtw89: ps: refactor PS flow to support MLO Firmware can only support PS on single one VIF operating in station mode, so argument of PS entry rtw89_enter_lps() should be rtwvif insetad of rtwvif_link. To enter PS under MLO, for each rtwvif, driver sends H2C command to tell firmware which mac_id will enter PS one by one, and afterward asks firmware to enter deep PS. Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241206055716.18598-2-pkshih@realtek.com --- drivers/net/wireless/realtek/rtw89/core.c | 26 ++++++------------ drivers/net/wireless/realtek/rtw89/ps.c | 33 ++++++++++++----------- drivers/net/wireless/realtek/rtw89/ps.h | 4 +-- drivers/net/wireless/realtek/rtw89/wow.c | 6 ++--- 4 files changed, 29 insertions(+), 40 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw89/core.c b/drivers/net/wireless/realtek/rtw89/core.c index 29d0ac502bab2..0519b08262817 100644 --- a/drivers/net/wireless/realtek/rtw89/core.c +++ b/drivers/net/wireless/realtek/rtw89/core.c @@ -2729,10 +2729,6 @@ static enum rtw89_ps_mode rtw89_update_ps_mode(struct rtw89_dev *rtwdev) { const struct rtw89_chip_info *chip = rtwdev->chip; - /* FIXME: Fix __rtw89_enter_ps_mode() to consider MLO cases. */ - if (rtwdev->support_mlo) - return RTW89_PS_MODE_NONE; - if (rtw89_disable_ps_mode || !chip->ps_mode_supported || RTW89_CHK_FW_FEATURE(NO_DEEP_PS, &rtwdev->fw)) return RTW89_PS_MODE_NONE; @@ -3469,21 +3465,10 @@ static bool rtw89_traffic_stats_track(struct rtw89_dev *rtwdev) return tfc_changed; } -static void rtw89_vif_enter_lps(struct rtw89_dev *rtwdev, - struct rtw89_vif_link *rtwvif_link) -{ - if (rtwvif_link->wifi_role != RTW89_WIFI_ROLE_STATION && - rtwvif_link->wifi_role != RTW89_WIFI_ROLE_P2P_CLIENT) - return; - - rtw89_enter_lps(rtwdev, rtwvif_link, true); -} - static void rtw89_enter_lps_track(struct rtw89_dev *rtwdev) { - struct rtw89_vif_link *rtwvif_link; + struct ieee80211_vif *vif; struct rtw89_vif *rtwvif; - unsigned int link_id; rtw89_for_each_rtwvif(rtwdev, rtwvif) { if (rtwvif->tdls_peer) @@ -3495,8 +3480,13 @@ static void rtw89_enter_lps_track(struct rtw89_dev *rtwdev) rtwvif->stats.rx_tfc_lv != RTW89_TFC_IDLE) continue; - rtw89_vif_for_each_link(rtwvif, rtwvif_link, link_id) - rtw89_vif_enter_lps(rtwdev, rtwvif_link); + vif = rtwvif_to_vif(rtwvif); + + if (!(vif->type == NL80211_IFTYPE_STATION || + vif->type == NL80211_IFTYPE_P2P_CLIENT)) + continue; + + rtw89_enter_lps(rtwdev, rtwvif, true); } } diff --git a/drivers/net/wireless/realtek/rtw89/ps.c b/drivers/net/wireless/realtek/rtw89/ps.c index c1c12abc2ea93..5e3a5e3c9776f 100644 --- a/drivers/net/wireless/realtek/rtw89/ps.c +++ b/drivers/net/wireless/realtek/rtw89/ps.c @@ -62,11 +62,8 @@ static void rtw89_ps_power_mode_change(struct rtw89_dev *rtwdev, bool enter) rtw89_mac_power_mode_change(rtwdev, enter); } -void __rtw89_enter_ps_mode(struct rtw89_dev *rtwdev, struct rtw89_vif_link *rtwvif_link) +void __rtw89_enter_ps_mode(struct rtw89_dev *rtwdev) { - if (rtwvif_link->wifi_role == RTW89_WIFI_ROLE_P2P_CLIENT) - return; - if (!rtwdev->ps_mode) return; @@ -85,8 +82,8 @@ void __rtw89_leave_ps_mode(struct rtw89_dev *rtwdev) rtw89_ps_power_mode_change(rtwdev, false); } -static void __rtw89_enter_lps(struct rtw89_dev *rtwdev, - struct rtw89_vif_link *rtwvif_link) +static void __rtw89_enter_lps_link(struct rtw89_dev *rtwdev, + struct rtw89_vif_link *rtwvif_link) { struct rtw89_lps_parm lps_param = { .macid = rtwvif_link->mac_id, @@ -121,17 +118,27 @@ void rtw89_leave_ps_mode(struct rtw89_dev *rtwdev) __rtw89_leave_ps_mode(rtwdev); } -void rtw89_enter_lps(struct rtw89_dev *rtwdev, struct rtw89_vif_link *rtwvif_link, +void rtw89_enter_lps(struct rtw89_dev *rtwdev, struct rtw89_vif *rtwvif, bool ps_mode) { + struct rtw89_vif_link *rtwvif_link; + bool can_ps_mode = true; + unsigned int link_id; + lockdep_assert_held(&rtwdev->mutex); if (test_and_set_bit(RTW89_FLAG_LEISURE_PS, rtwdev->flags)) return; - __rtw89_enter_lps(rtwdev, rtwvif_link); - if (ps_mode) - __rtw89_enter_ps_mode(rtwdev, rtwvif_link); + rtw89_vif_for_each_link(rtwvif, rtwvif_link, link_id) { + __rtw89_enter_lps_link(rtwdev, rtwvif_link); + + if (rtwvif_link->wifi_role == RTW89_WIFI_ROLE_P2P_CLIENT) + can_ps_mode = false; + } + + if (ps_mode && can_ps_mode) + __rtw89_enter_ps_mode(rtwdev); } static void rtw89_leave_lps_vif(struct rtw89_dev *rtwdev, @@ -282,12 +289,6 @@ void rtw89_recalc_lps(struct rtw89_dev *rtwdev) enum rtw89_entity_mode mode; int count = 0; - /* FIXME: Fix rtw89_enter_lps() and __rtw89_enter_ps_mode() - * to take MLO cases into account before doing the following. - */ - if (rtwdev->support_mlo) - goto disable_lps; - mode = rtw89_get_entity_mode(rtwdev); if (mode == RTW89_ENTITY_MODE_MCC) goto disable_lps; diff --git a/drivers/net/wireless/realtek/rtw89/ps.h b/drivers/net/wireless/realtek/rtw89/ps.h index cdd712966b09d..2b88f254a32d0 100644 --- a/drivers/net/wireless/realtek/rtw89/ps.h +++ b/drivers/net/wireless/realtek/rtw89/ps.h @@ -5,11 +5,11 @@ #ifndef __RTW89_PS_H_ #define __RTW89_PS_H_ -void rtw89_enter_lps(struct rtw89_dev *rtwdev, struct rtw89_vif_link *rtwvif_link, +void rtw89_enter_lps(struct rtw89_dev *rtwdev, struct rtw89_vif *rtwvif, bool ps_mode); void rtw89_leave_lps(struct rtw89_dev *rtwdev); void __rtw89_leave_ps_mode(struct rtw89_dev *rtwdev); -void __rtw89_enter_ps_mode(struct rtw89_dev *rtwdev, struct rtw89_vif_link *rtwvif_link); +void __rtw89_enter_ps_mode(struct rtw89_dev *rtwdev); void rtw89_leave_ps_mode(struct rtw89_dev *rtwdev); void rtw89_enter_ips(struct rtw89_dev *rtwdev); void rtw89_leave_ips(struct rtw89_dev *rtwdev); diff --git a/drivers/net/wireless/realtek/rtw89/wow.c b/drivers/net/wireless/realtek/rtw89/wow.c index 1e1dbb20d47ad..01754d031bb43 100644 --- a/drivers/net/wireless/realtek/rtw89/wow.c +++ b/drivers/net/wireless/realtek/rtw89/wow.c @@ -694,9 +694,7 @@ static void rtw89_wow_leave_deep_ps(struct rtw89_dev *rtwdev) static void rtw89_wow_enter_deep_ps(struct rtw89_dev *rtwdev) { - struct rtw89_vif_link *rtwvif_link = rtwdev->wow.rtwvif_link; - - __rtw89_enter_ps_mode(rtwdev, rtwvif_link); + __rtw89_enter_ps_mode(rtwdev); } static void rtw89_wow_enter_ps(struct rtw89_dev *rtwdev) @@ -704,7 +702,7 @@ static void rtw89_wow_enter_ps(struct rtw89_dev *rtwdev) struct rtw89_vif_link *rtwvif_link = rtwdev->wow.rtwvif_link; if (rtw89_wow_mgd_linked(rtwdev)) - rtw89_enter_lps(rtwdev, rtwvif_link, false); + rtw89_enter_lps(rtwdev, rtwvif_link->rtwvif, false); else if (rtw89_wow_no_link(rtwdev)) rtw89_fw_h2c_fwips(rtwdev, rtwvif_link, true); } From 8c86036693a3c7e24008734f01109f14807e7347 Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Fri, 6 Dec 2024 13:57:11 +0800 Subject: [PATCH 0416/1450] wifi: rtw89: ps: refactor channel info to firmware before entering PS In PS mode, firmware needs hardware parameters related to channel info to configure hardware itself. Before entering PS, driver prepares these info to firmware via firmware H2C command. Since firmware only consider PS for single one vif, change the argument of entry function to rtwvif, and only consider first link for this old H2C command that only support legacy. Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241206055716.18598-3-pkshih@realtek.com --- drivers/net/wireless/realtek/rtw89/fw.c | 35 +++++++++++++++++++------ drivers/net/wireless/realtek/rtw89/fw.h | 3 +-- drivers/net/wireless/realtek/rtw89/ps.c | 3 ++- 3 files changed, 30 insertions(+), 11 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw89/fw.c b/drivers/net/wireless/realtek/rtw89/fw.c index c604ea1d39f17..d17c6037c9a65 100644 --- a/drivers/net/wireless/realtek/rtw89/fw.c +++ b/drivers/net/wireless/realtek/rtw89/fw.c @@ -2594,14 +2594,17 @@ int rtw89_fw_h2c_lps_parm(struct rtw89_dev *rtwdev, return ret; } -int rtw89_fw_h2c_lps_ch_info(struct rtw89_dev *rtwdev, struct rtw89_vif_link *rtwvif_link) +int rtw89_fw_h2c_lps_ch_info(struct rtw89_dev *rtwdev, struct rtw89_vif *rtwvif) { - const struct rtw89_chan *chan = rtw89_chan_get(rtwdev, - rtwvif_link->chanctx_idx); const struct rtw89_chip_info *chip = rtwdev->chip; + const struct rtw89_chan *chan; + struct rtw89_vif_link *rtwvif_link; struct rtw89_h2c_lps_ch_info *h2c; u32 len = sizeof(*h2c); + unsigned int link_id; struct sk_buff *skb; + bool no_chan = true; + u8 phy_idx; u32 done; int ret; @@ -2616,11 +2619,27 @@ int rtw89_fw_h2c_lps_ch_info(struct rtw89_dev *rtwdev, struct rtw89_vif_link *rt skb_put(skb, len); h2c = (struct rtw89_h2c_lps_ch_info *)skb->data; - h2c->info[0].central_ch = chan->channel; - h2c->info[0].pri_ch = chan->primary_channel; - h2c->info[0].band = chan->band_type; - h2c->info[0].bw = chan->band_width; - h2c->mlo_dbcc_mode_lps = cpu_to_le32(MLO_2_PLUS_0_1RF); + rtw89_vif_for_each_link(rtwvif, rtwvif_link, link_id) { + phy_idx = rtwvif_link->phy_idx; + if (phy_idx >= ARRAY_SIZE(h2c->info)) + continue; + + chan = rtw89_chan_get(rtwdev, rtwvif_link->chanctx_idx); + no_chan = false; + + h2c->info[phy_idx].central_ch = chan->channel; + h2c->info[phy_idx].pri_ch = chan->primary_channel; + h2c->info[phy_idx].band = chan->band_type; + h2c->info[phy_idx].bw = chan->band_width; + } + + if (no_chan) { + rtw89_err(rtwdev, "no chan for h2c lps_ch_info\n"); + ret = -ENOENT; + goto fail; + } + + h2c->mlo_dbcc_mode_lps = cpu_to_le32(rtwdev->mlo_dbcc_mode); rtw89_h2c_pkt_set_hdr(rtwdev, skb, FWCMD_TYPE_H2C, H2C_CAT_OUTSRC, H2C_CL_OUTSRC_DM, diff --git a/drivers/net/wireless/realtek/rtw89/fw.h b/drivers/net/wireless/realtek/rtw89/fw.h index 95681c390bb84..b38705868caae 100644 --- a/drivers/net/wireless/realtek/rtw89/fw.h +++ b/drivers/net/wireless/realtek/rtw89/fw.h @@ -4638,8 +4638,7 @@ int rtw89_fw_h2c_init_ba_cam_users(struct rtw89_dev *rtwdev, u8 users, int rtw89_fw_h2c_lps_parm(struct rtw89_dev *rtwdev, struct rtw89_lps_parm *lps_param); -int rtw89_fw_h2c_lps_ch_info(struct rtw89_dev *rtwdev, - struct rtw89_vif_link *rtwvif_link); +int rtw89_fw_h2c_lps_ch_info(struct rtw89_dev *rtwdev, struct rtw89_vif *rtwvif); int rtw89_fw_h2c_fwips(struct rtw89_dev *rtwdev, struct rtw89_vif_link *rtwvif_link, bool enable); struct sk_buff *rtw89_fw_h2c_alloc_skb_with_hdr(struct rtw89_dev *rtwdev, u32 len); diff --git a/drivers/net/wireless/realtek/rtw89/ps.c b/drivers/net/wireless/realtek/rtw89/ps.c index 5e3a5e3c9776f..a8b4b9095dc81 100644 --- a/drivers/net/wireless/realtek/rtw89/ps.c +++ b/drivers/net/wireless/realtek/rtw89/ps.c @@ -93,7 +93,6 @@ static void __rtw89_enter_lps_link(struct rtw89_dev *rtwdev, rtw89_btc_ntfy_radio_state(rtwdev, BTC_RFCTRL_FW_CTRL); rtw89_fw_h2c_lps_parm(rtwdev, &lps_param); - rtw89_fw_h2c_lps_ch_info(rtwdev, rtwvif_link); } static void __rtw89_leave_lps(struct rtw89_dev *rtwdev, @@ -137,6 +136,8 @@ void rtw89_enter_lps(struct rtw89_dev *rtwdev, struct rtw89_vif *rtwvif, can_ps_mode = false; } + rtw89_fw_h2c_lps_ch_info(rtwdev, rtwvif); + if (ps_mode && can_ps_mode) __rtw89_enter_ps_mode(rtwdev); } From 5b4ca804792a3128ee56ac74a390358fabba6fa3 Mon Sep 17 00:00:00 2001 From: Eric Huang Date: Fri, 6 Dec 2024 13:57:12 +0800 Subject: [PATCH 0417/1450] wifi: rtw89: ps: update data for firmware and settings for hardware before/after PS For MLO supported IC, send H2C command to firmware before PS with link information for each PHY for MLO to work properly. And re-init hardware settings regarding to RX descriptor information after PS. Signed-off-by: Eric Huang Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241206055716.18598-4-pkshih@realtek.com --- drivers/net/wireless/realtek/rtw89/core.c | 2 + drivers/net/wireless/realtek/rtw89/core.h | 2 + drivers/net/wireless/realtek/rtw89/fw.c | 82 +++++++++++++++++++++++ drivers/net/wireless/realtek/rtw89/fw.h | 18 +++++ drivers/net/wireless/realtek/rtw89/phy.c | 6 ++ drivers/net/wireless/realtek/rtw89/phy.h | 1 + drivers/net/wireless/realtek/rtw89/ps.c | 8 ++- 7 files changed, 118 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/realtek/rtw89/core.c b/drivers/net/wireless/realtek/rtw89/core.c index 0519b08262817..ee6ad185135c3 100644 --- a/drivers/net/wireless/realtek/rtw89/core.c +++ b/drivers/net/wireless/realtek/rtw89/core.c @@ -2189,6 +2189,8 @@ static void rtw89_vif_rx_stats_iter(void *data, u8 *mac, if (phy_ppdu) ewma_rssi_add(&rtwdev->phystat.bcn_rssi, phy_ppdu->rssi_avg); + + pkt_stat->beacon_rate = desc_info->data_rate; } if (!ether_addr_equal(bss_conf->addr, hdr->addr1)) diff --git a/drivers/net/wireless/realtek/rtw89/core.h b/drivers/net/wireless/realtek/rtw89/core.h index 82844e470d1b8..ec2a80af04bb4 100644 --- a/drivers/net/wireless/realtek/rtw89/core.h +++ b/drivers/net/wireless/realtek/rtw89/core.h @@ -4466,6 +4466,7 @@ enum rtw89_fw_feature { RTW89_FW_FEATURE_NO_WOW_CPU_IO_RX, RTW89_FW_FEATURE_NOTIFY_AP_INFO, RTW89_FW_FEATURE_CH_INFO_BE_V0, + RTW89_FW_FEATURE_LPS_CH_INFO, }; struct rtw89_fw_suit { @@ -4835,6 +4836,7 @@ struct rtw89_pkt_drop_params { struct rtw89_pkt_stat { u16 beacon_nr; + u8 beacon_rate; u32 rx_rate_cnt[RTW89_HW_RATE_NR]; }; diff --git a/drivers/net/wireless/realtek/rtw89/fw.c b/drivers/net/wireless/realtek/rtw89/fw.c index d17c6037c9a65..e5f3efe3a7e6e 100644 --- a/drivers/net/wireless/realtek/rtw89/fw.c +++ b/drivers/net/wireless/realtek/rtw89/fw.c @@ -729,6 +729,7 @@ static const struct __fw_feat_cfg fw_feat_tbl[] = { __CFG_FW_FEAT(RTL8922A, ge, 0, 35, 12, 0, BEACON_FILTER), __CFG_FW_FEAT(RTL8922A, ge, 0, 35, 22, 0, WOW_REASON_V1), __CFG_FW_FEAT(RTL8922A, lt, 0, 35, 31, 0, RFK_PRE_NOTIFY_V0), + __CFG_FW_FEAT(RTL8922A, lt, 0, 35, 31, 0, LPS_CH_INFO), __CFG_FW_FEAT(RTL8922A, lt, 0, 35, 42, 0, RFK_RXDCK_V0), __CFG_FW_FEAT(RTL8922A, ge, 0, 35, 46, 0, NOTIFY_AP_INFO), __CFG_FW_FEAT(RTL8922A, lt, 0, 35, 47, 0, CH_INFO_BE_V0), @@ -2664,6 +2665,87 @@ int rtw89_fw_h2c_lps_ch_info(struct rtw89_dev *rtwdev, struct rtw89_vif *rtwvif) return ret; } +int rtw89_fw_h2c_lps_ml_cmn_info(struct rtw89_dev *rtwdev, + struct rtw89_vif *rtwvif) +{ + const struct rtw89_phy_bb_gain_info_be *gain = &rtwdev->bb_gain.be; + struct rtw89_pkt_stat *pkt_stat = &rtwdev->phystat.cur_pkt_stat; + const struct rtw89_chip_info *chip = rtwdev->chip; + struct rtw89_h2c_lps_ml_cmn_info *h2c; + struct rtw89_vif_link *rtwvif_link; + const struct rtw89_chan *chan; + u8 bw_idx = RTW89_BB_BW_20_40; + u32 len = sizeof(*h2c); + unsigned int link_id; + struct sk_buff *skb; + u8 gain_band; + u32 done; + u8 path; + int ret; + int i; + + if (chip->chip_gen != RTW89_CHIP_BE) + return 0; + + skb = rtw89_fw_h2c_alloc_skb_with_hdr(rtwdev, len); + if (!skb) { + rtw89_err(rtwdev, "failed to alloc skb for h2c lps_ml_cmn_info\n"); + return -ENOMEM; + } + skb_put(skb, len); + h2c = (struct rtw89_h2c_lps_ml_cmn_info *)skb->data; + + h2c->fmt_id = 0x1; + + h2c->mlo_dbcc_mode = cpu_to_le32(rtwdev->mlo_dbcc_mode); + + rtw89_vif_for_each_link(rtwvif, rtwvif_link, link_id) { + path = rtwvif_link->phy_idx == RTW89_PHY_1 ? RF_PATH_B : RF_PATH_A; + chan = rtw89_chan_get(rtwdev, rtwvif_link->chanctx_idx); + gain_band = rtw89_subband_to_gain_band_be(chan->subband_type); + + h2c->central_ch[rtwvif_link->phy_idx] = chan->channel; + h2c->pri_ch[rtwvif_link->phy_idx] = chan->primary_channel; + h2c->band[rtwvif_link->phy_idx] = chan->band_type; + h2c->bw[rtwvif_link->phy_idx] = chan->band_width; + if (pkt_stat->beacon_rate < RTW89_HW_RATE_OFDM6) + h2c->bcn_rate_type[rtwvif_link->phy_idx] = 0x1; + else + h2c->bcn_rate_type[rtwvif_link->phy_idx] = 0x2; + + /* Fill BW20 RX gain table for beacon mode */ + for (i = 0; i < TIA_GAIN_NUM; i++) { + h2c->tia_gain[rtwvif_link->phy_idx][i] = + cpu_to_le16(gain->tia_gain[gain_band][bw_idx][path][i]); + } + memcpy(h2c->lna_gain[rtwvif_link->phy_idx], + gain->lna_gain[gain_band][bw_idx][path], + LNA_GAIN_NUM); + } + + rtw89_h2c_pkt_set_hdr(rtwdev, skb, FWCMD_TYPE_H2C, + H2C_CAT_OUTSRC, H2C_CL_OUTSRC_DM, + H2C_FUNC_FW_LPS_ML_CMN_INFO, 0, 0, len); + + rtw89_phy_write32_mask(rtwdev, R_CHK_LPS_STAT, B_CHK_LPS_STAT, 0); + ret = rtw89_h2c_tx(rtwdev, skb, false); + if (ret) { + rtw89_err(rtwdev, "failed to send h2c\n"); + goto fail; + } + + ret = read_poll_timeout(rtw89_phy_read32_mask, done, done, 50, 5000, + true, rtwdev, R_CHK_LPS_STAT, B_CHK_LPS_STAT); + if (ret) + rtw89_warn(rtwdev, "h2c_lps_ml_cmn_info done polling timeout\n"); + + return 0; +fail: + dev_kfree_skb_any(skb); + + return ret; +} + #define H2C_P2P_ACT_LEN 20 int rtw89_fw_h2c_p2p_act(struct rtw89_dev *rtwdev, struct rtw89_vif_link *rtwvif_link, diff --git a/drivers/net/wireless/realtek/rtw89/fw.h b/drivers/net/wireless/realtek/rtw89/fw.h index b38705868caae..2dfc584da7d65 100644 --- a/drivers/net/wireless/realtek/rtw89/fw.h +++ b/drivers/net/wireless/realtek/rtw89/fw.h @@ -1783,6 +1783,21 @@ struct rtw89_h2c_lps_ch_info { __le32 mlo_dbcc_mode_lps; } __packed; +struct rtw89_h2c_lps_ml_cmn_info { + u8 fmt_id; + u8 rsvd0[3]; + __le32 mlo_dbcc_mode; + u8 central_ch[RTW89_PHY_MAX]; + u8 pri_ch[RTW89_PHY_MAX]; + u8 bw[RTW89_PHY_MAX]; + u8 band[RTW89_PHY_MAX]; + u8 bcn_rate_type[RTW89_PHY_MAX]; + u8 rsvd1[2]; + __le16 tia_gain[RTW89_PHY_MAX][TIA_GAIN_NUM]; + u8 lna_gain[RTW89_PHY_MAX][LNA_GAIN_NUM]; + u8 rsvd2[2]; +} __packed; + static inline void RTW89_SET_FWCMD_CPU_EXCEPTION_TYPE(void *cmd, u32 val) { le32p_replace_bits((__le32 *)cmd, val, GENMASK(31, 0)); @@ -4211,6 +4226,7 @@ enum rtw89_mrc_h2c_func { #define H2C_CL_OUTSRC_DM 0x2 #define H2C_FUNC_FW_LPS_CH_INFO 0xb +#define H2C_FUNC_FW_LPS_ML_CMN_INFO 0xe #define H2C_CL_OUTSRC_RF_REG_A 0x8 #define H2C_CL_OUTSRC_RF_REG_B 0x9 @@ -4639,6 +4655,8 @@ int rtw89_fw_h2c_init_ba_cam_users(struct rtw89_dev *rtwdev, u8 users, int rtw89_fw_h2c_lps_parm(struct rtw89_dev *rtwdev, struct rtw89_lps_parm *lps_param); int rtw89_fw_h2c_lps_ch_info(struct rtw89_dev *rtwdev, struct rtw89_vif *rtwvif); +int rtw89_fw_h2c_lps_ml_cmn_info(struct rtw89_dev *rtwdev, + struct rtw89_vif *rtwvif); int rtw89_fw_h2c_fwips(struct rtw89_dev *rtwdev, struct rtw89_vif_link *rtwvif_link, bool enable); struct sk_buff *rtw89_fw_h2c_alloc_skb_with_hdr(struct rtw89_dev *rtwdev, u32 len); diff --git a/drivers/net/wireless/realtek/rtw89/phy.c b/drivers/net/wireless/realtek/rtw89/phy.c index 8d36bf9627323..1d4d3dcce0604 100644 --- a/drivers/net/wireless/realtek/rtw89/phy.c +++ b/drivers/net/wireless/realtek/rtw89/phy.c @@ -6519,6 +6519,12 @@ void rtw89_phy_dm_init(struct rtw89_dev *rtwdev) rtw89_chip_cfg_txrx_path(rtwdev); } +void rtw89_phy_dm_reinit(struct rtw89_dev *rtwdev) +{ + rtw89_phy_env_monitor_init(rtwdev); + rtw89_physts_parsing_init(rtwdev); +} + void rtw89_phy_set_bss_color(struct rtw89_dev *rtwdev, struct rtw89_vif_link *rtwvif_link) { diff --git a/drivers/net/wireless/realtek/rtw89/phy.h b/drivers/net/wireless/realtek/rtw89/phy.h index e6d06f0a6c093..cf33c1655b7a7 100644 --- a/drivers/net/wireless/realtek/rtw89/phy.h +++ b/drivers/net/wireless/realtek/rtw89/phy.h @@ -813,6 +813,7 @@ void rtw89_phy_config_rf_reg_v1(struct rtw89_dev *rtwdev, enum rtw89_rf_path rf_path, void *extra_data); void rtw89_phy_dm_init(struct rtw89_dev *rtwdev); +void rtw89_phy_dm_reinit(struct rtw89_dev *rtwdev); void rtw89_phy_write32_idx(struct rtw89_dev *rtwdev, u32 addr, u32 mask, u32 data, enum rtw89_phy_idx phy_idx); void rtw89_phy_write32_idx_set(struct rtw89_dev *rtwdev, u32 addr, u32 bits, diff --git a/drivers/net/wireless/realtek/rtw89/ps.c b/drivers/net/wireless/realtek/rtw89/ps.c index a8b4b9095dc81..96ea04d90cd36 100644 --- a/drivers/net/wireless/realtek/rtw89/ps.c +++ b/drivers/net/wireless/realtek/rtw89/ps.c @@ -8,6 +8,7 @@ #include "debug.h" #include "fw.h" #include "mac.h" +#include "phy.h" #include "ps.h" #include "reg.h" #include "util.h" @@ -136,7 +137,10 @@ void rtw89_enter_lps(struct rtw89_dev *rtwdev, struct rtw89_vif *rtwvif, can_ps_mode = false; } - rtw89_fw_h2c_lps_ch_info(rtwdev, rtwvif); + if (RTW89_CHK_FW_FEATURE(LPS_CH_INFO, &rtwdev->fw)) + rtw89_fw_h2c_lps_ch_info(rtwdev, rtwvif); + else + rtw89_fw_h2c_lps_ml_cmn_info(rtwdev, rtwvif); if (ps_mode && can_ps_mode) __rtw89_enter_ps_mode(rtwdev); @@ -165,6 +169,8 @@ void rtw89_leave_lps(struct rtw89_dev *rtwdev) __rtw89_leave_ps_mode(rtwdev); + rtw89_phy_dm_reinit(rtwdev); + rtw89_for_each_rtwvif(rtwdev, rtwvif) rtw89_vif_for_each_link(rtwvif, rtwvif_link, link_id) rtw89_leave_lps_vif(rtwdev, rtwvif_link); From f0441c540fe808570c275a5700adc42b2cfd914b Mon Sep 17 00:00:00 2001 From: Kuan-Chung Chen Date: Fri, 6 Dec 2024 13:57:13 +0800 Subject: [PATCH 0418/1450] wifi: rtw89: disable firmware training HE GI and LTF Given the performance trade-off associated with firmware training HE GI/LTF, especially in high attenuation environments, we have decided to utilize a constant value instead. Signed-off-by: Kuan-Chung Chen Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241206055716.18598-5-pkshih@realtek.com --- drivers/net/wireless/realtek/rtw89/core.h | 10 ++++++++++ drivers/net/wireless/realtek/rtw89/phy.c | 24 +++++++++++++++-------- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw89/core.h b/drivers/net/wireless/realtek/rtw89/core.h index ec2a80af04bb4..15967978bf4ae 100644 --- a/drivers/net/wireless/realtek/rtw89/core.h +++ b/drivers/net/wireless/realtek/rtw89/core.h @@ -6875,6 +6875,16 @@ bool rtw89_sta_has_beamformer_cap(struct ieee80211_link_sta *link_sta) return false; } +static inline +bool rtw89_sta_link_has_su_mu_4xhe08(struct ieee80211_link_sta *link_sta) +{ + if (link_sta->he_cap.he_cap_elem.phy_cap_info[7] & + IEEE80211_HE_PHY_CAP7_HE_SU_MU_PPDU_4XLTF_AND_08_US_GI) + return true; + + return false; +} + static inline struct rtw89_fw_suit *rtw89_fw_suit_get(struct rtw89_dev *rtwdev, enum rtw89_fw_type type) { diff --git a/drivers/net/wireless/realtek/rtw89/phy.c b/drivers/net/wireless/realtek/rtw89/phy.c index 1d4d3dcce0604..604ea048c3abc 100644 --- a/drivers/net/wireless/realtek/rtw89/phy.c +++ b/drivers/net/wireless/realtek/rtw89/phy.c @@ -264,16 +264,26 @@ rtw89_ra_mask_eht_rates[4] = {RA_MASK_EHT_1SS_RATES, RA_MASK_EHT_2SS_RATES, static void rtw89_phy_ra_gi_ltf(struct rtw89_dev *rtwdev, struct rtw89_sta_link *rtwsta_link, + struct ieee80211_link_sta *link_sta, const struct rtw89_chan *chan, bool *fix_giltf_en, u8 *fix_giltf) { struct cfg80211_bitrate_mask *mask = &rtwsta_link->mask; u8 band = chan->band_type; enum nl80211_band nl_band = rtw89_hw_to_nl80211_band(band); - u8 he_gi = mask->control[nl_band].he_gi; u8 he_ltf = mask->control[nl_band].he_ltf; + u8 he_gi = mask->control[nl_band].he_gi; - if (!rtwsta_link->use_cfg_mask) + *fix_giltf_en = true; + + if (rtwdev->chip->chip_id == RTL8852C && + chan->band_width == RTW89_CHANNEL_WIDTH_160 && + rtw89_sta_link_has_su_mu_4xhe08(link_sta)) + *fix_giltf = RTW89_GILTF_SGI_4XHE08; + else + *fix_giltf = RTW89_GILTF_2XHE08; + + if (!(rtwsta_link->use_cfg_mask && link_sta->he_cap.has_he)) return; if (he_ltf == 2 && he_gi == 2) { @@ -288,12 +298,7 @@ static void rtw89_phy_ra_gi_ltf(struct rtw89_dev *rtwdev, *fix_giltf = RTW89_GILTF_1XHE16; } else if (he_ltf == 0 && he_gi == 0) { *fix_giltf = RTW89_GILTF_1XHE08; - } else { - *fix_giltf_en = false; - return; } - - *fix_giltf_en = true; } static void rtw89_phy_ra_sta_update(struct rtw89_dev *rtwdev, @@ -326,6 +331,8 @@ static void rtw89_phy_ra_sta_update(struct rtw89_dev *rtwdev, mode |= RTW89_RA_MODE_EHT; ra_mask |= get_eht_ra_mask(link_sta); high_rate_masks = rtw89_ra_mask_eht_rates; + rtw89_phy_ra_gi_ltf(rtwdev, rtwsta_link, link_sta, + chan, &fix_giltf_en, &fix_giltf); } else if (link_sta->he_cap.has_he) { mode |= RTW89_RA_MODE_HE; csi_mode = RTW89_RA_RPT_MODE_HE; @@ -337,7 +344,8 @@ static void rtw89_phy_ra_sta_update(struct rtw89_dev *rtwdev, if (link_sta->he_cap.he_cap_elem.phy_cap_info[1] & IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD) ldpc_en = 1; - rtw89_phy_ra_gi_ltf(rtwdev, rtwsta_link, chan, &fix_giltf_en, &fix_giltf); + rtw89_phy_ra_gi_ltf(rtwdev, rtwsta_link, link_sta, + chan, &fix_giltf_en, &fix_giltf); } else if (link_sta->vht_cap.vht_supported) { u16 mcs_map = le16_to_cpu(link_sta->vht_cap.vht_mcs.rx_mcs_map); From 9ddc6ee0b215783252fdab234661ece8c32e2c61 Mon Sep 17 00:00:00 2001 From: Kuan-Chung Chen Date: Fri, 6 Dec 2024 13:57:14 +0800 Subject: [PATCH 0419/1450] wifi: rtw89: 8852c: disable ER SU when 4x HE-LTF and 0.8 GI capability differ Since hardware only has single one register for HE-LTF setting, to prevent interoperability issues, 8852CE disables ER SU when the AP can handle SU/MU with 4x HE-LTF and 0.8 GI, but does not support ER SU with the same settings. Signed-off-by: Kuan-Chung Chen Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241206055716.18598-6-pkshih@realtek.com --- drivers/net/wireless/realtek/rtw89/core.c | 24 ++++++++++++++++++----- drivers/net/wireless/realtek/rtw89/core.h | 10 ++++++++++ 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw89/core.c b/drivers/net/wireless/realtek/rtw89/core.c index ee6ad185135c3..f848185e2cede 100644 --- a/drivers/net/wireless/realtek/rtw89/core.c +++ b/drivers/net/wireless/realtek/rtw89/core.c @@ -3844,6 +3844,22 @@ int rtw89_core_sta_link_disconnect(struct rtw89_dev *rtwdev, return ret; } +static bool rtw89_sta_link_can_er(struct rtw89_dev *rtwdev, + struct ieee80211_bss_conf *bss_conf, + struct ieee80211_link_sta *link_sta) +{ + if (!bss_conf->he_support || + bss_conf->he_oper.params & IEEE80211_HE_OPERATION_ER_SU_DISABLE) + return false; + + if (rtwdev->chip->chip_id == RTL8852C && + rtw89_sta_link_has_su_mu_4xhe08(link_sta) && + !rtw89_sta_link_has_er_su_4xhe08(link_sta)) + return false; + + return true; +} + int rtw89_core_sta_link_assoc(struct rtw89_dev *rtwdev, struct rtw89_vif_link *rtwvif_link, struct rtw89_sta_link *rtwsta_link) @@ -3854,12 +3870,11 @@ int rtw89_core_sta_link_assoc(struct rtw89_dev *rtwdev, rtwsta_link); const struct rtw89_chan *chan = rtw89_chan_get(rtwdev, rtwvif_link->chanctx_idx); + struct ieee80211_link_sta *link_sta; int ret; if (vif->type == NL80211_IFTYPE_AP || sta->tdls) { if (sta->tdls) { - struct ieee80211_link_sta *link_sta; - rcu_read_lock(); link_sta = rtw89_sta_rcu_dereference_link(rtwsta_link, true); @@ -3910,9 +3925,8 @@ int rtw89_core_sta_link_assoc(struct rtw89_dev *rtwdev, rcu_read_lock(); bss_conf = rtw89_vif_rcu_dereference_link(rtwvif_link, true); - if (bss_conf->he_support && - !(bss_conf->he_oper.params & IEEE80211_HE_OPERATION_ER_SU_DISABLE)) - rtwsta_link->er_cap = true; + link_sta = rtw89_sta_rcu_dereference_link(rtwsta_link, true); + rtwsta_link->er_cap = rtw89_sta_link_can_er(rtwdev, bss_conf, link_sta); rcu_read_unlock(); diff --git a/drivers/net/wireless/realtek/rtw89/core.h b/drivers/net/wireless/realtek/rtw89/core.h index 15967978bf4ae..c2b5eeb4a4f14 100644 --- a/drivers/net/wireless/realtek/rtw89/core.h +++ b/drivers/net/wireless/realtek/rtw89/core.h @@ -6885,6 +6885,16 @@ bool rtw89_sta_link_has_su_mu_4xhe08(struct ieee80211_link_sta *link_sta) return false; } +static inline +bool rtw89_sta_link_has_er_su_4xhe08(struct ieee80211_link_sta *link_sta) +{ + if (link_sta->he_cap.he_cap_elem.phy_cap_info[8] & + IEEE80211_HE_PHY_CAP8_HE_ER_SU_PPDU_4XLTF_AND_08_US_GI) + return true; + + return false; +} + static inline struct rtw89_fw_suit *rtw89_fw_suit_get(struct rtw89_dev *rtwdev, enum rtw89_fw_type type) { From a2854ac3383032310db381b2cfda5b164d8585ec Mon Sep 17 00:00:00 2001 From: Zong-Zhe Yang Date: Fri, 6 Dec 2024 13:57:15 +0800 Subject: [PATCH 0420/1450] wifi: rtw89: regd: update regulatory map to R68-R51 Sync Realtek Channel Plan R68 and Realtek Regulatory R51. Configure 6 GHz field of Realtek regd for the following countries. BO, DO, EG, LS, MZ, NG, OM, ZW, PK, PH, TH, KM, CG, CD, GE, GI, GU, LR, MH, FM, MP, PW, MF, SX, SZ, TZ, VI Besides, add entries for the following countries. CU, SY, SD Signed-off-by: Zong-Zhe Yang Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241206055716.18598-7-pkshih@realtek.com --- drivers/net/wireless/realtek/rtw89/regd.c | 57 ++++++++++++----------- 1 file changed, 30 insertions(+), 27 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw89/regd.c b/drivers/net/wireless/realtek/rtw89/regd.c index cad5189708e79..80b2f74589eb9 100644 --- a/drivers/net/wireless/realtek/rtw89/regd.c +++ b/drivers/net/wireless/realtek/rtw89/regd.c @@ -17,7 +17,7 @@ static const struct rtw89_regd rtw89_ww_regd = static const struct rtw89_regd rtw89_regd_map[] = { COUNTRY_REGD("AR", RTW89_MEXICO, RTW89_MEXICO, RTW89_FCC), - COUNTRY_REGD("BO", RTW89_FCC, RTW89_FCC, RTW89_FCC), + COUNTRY_REGD("BO", RTW89_FCC, RTW89_FCC, RTW89_NA), COUNTRY_REGD("BR", RTW89_FCC, RTW89_FCC, RTW89_FCC), COUNTRY_REGD("CL", RTW89_CHILE, RTW89_CHILE, RTW89_CHILE), COUNTRY_REGD("CO", RTW89_FCC, RTW89_FCC, RTW89_FCC), @@ -35,7 +35,7 @@ static const struct rtw89_regd rtw89_regd_map[] = { COUNTRY_REGD("UY", RTW89_FCC, RTW89_FCC, RTW89_NA), COUNTRY_REGD("VE", RTW89_FCC, RTW89_FCC, RTW89_NA), COUNTRY_REGD("PR", RTW89_FCC, RTW89_FCC, RTW89_NA), - COUNTRY_REGD("DO", RTW89_FCC, RTW89_FCC, RTW89_NA), + COUNTRY_REGD("DO", RTW89_FCC, RTW89_FCC, RTW89_FCC), COUNTRY_REGD("AT", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), COUNTRY_REGD("BE", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), COUNTRY_REGD("CY", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), @@ -72,7 +72,7 @@ static const struct rtw89_regd rtw89_regd_map[] = { COUNTRY_REGD("BA", RTW89_ETSI, RTW89_ETSI, RTW89_NA), COUNTRY_REGD("BG", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), COUNTRY_REGD("HR", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), - COUNTRY_REGD("EG", RTW89_ETSI, RTW89_ETSI, RTW89_NA), + COUNTRY_REGD("EG", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), COUNTRY_REGD("GH", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), COUNTRY_REGD("IQ", RTW89_ETSI, RTW89_ETSI, RTW89_NA), COUNTRY_REGD("IL", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), @@ -82,13 +82,13 @@ static const struct rtw89_regd rtw89_regd_map[] = { COUNTRY_REGD("KW", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), COUNTRY_REGD("KG", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), COUNTRY_REGD("LB", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), - COUNTRY_REGD("LS", RTW89_ETSI, RTW89_ETSI, RTW89_NA), + COUNTRY_REGD("LS", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), COUNTRY_REGD("MK", RTW89_ETSI, RTW89_ETSI, RTW89_NA), COUNTRY_REGD("MA", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), - COUNTRY_REGD("MZ", RTW89_ETSI, RTW89_ETSI, RTW89_NA), + COUNTRY_REGD("MZ", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), COUNTRY_REGD("NA", RTW89_ETSI, RTW89_ETSI, RTW89_NA), - COUNTRY_REGD("NG", RTW89_ETSI, RTW89_ETSI, RTW89_NA), - COUNTRY_REGD("OM", RTW89_ETSI, RTW89_ETSI, RTW89_NA), + COUNTRY_REGD("NG", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), + COUNTRY_REGD("OM", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), COUNTRY_REGD("QA", RTW89_QATAR, RTW89_QATAR, RTW89_QATAR), COUNTRY_REGD("RO", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), COUNTRY_REGD("RU", RTW89_ETSI, RTW89_ETSI, RTW89_NA), @@ -101,7 +101,7 @@ static const struct rtw89_regd rtw89_regd_map[] = { COUNTRY_REGD("UA", RTW89_UKRAINE, RTW89_UKRAINE, RTW89_UKRAINE), COUNTRY_REGD("AE", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), COUNTRY_REGD("YE", RTW89_ETSI, RTW89_ETSI, RTW89_NA), - COUNTRY_REGD("ZW", RTW89_ETSI, RTW89_ETSI, RTW89_NA), + COUNTRY_REGD("ZW", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), COUNTRY_REGD("BD", RTW89_ETSI, RTW89_ETSI, RTW89_NA), COUNTRY_REGD("KH", RTW89_ETSI, RTW89_ETSI, RTW89_NA), COUNTRY_REGD("CN", RTW89_CN, RTW89_CN, RTW89_CN), @@ -110,12 +110,12 @@ static const struct rtw89_regd rtw89_regd_map[] = { COUNTRY_REGD("ID", RTW89_ETSI, RTW89_ETSI, RTW89_NA), COUNTRY_REGD("KR", RTW89_KCC, RTW89_KCC, RTW89_KCC), COUNTRY_REGD("MY", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), - COUNTRY_REGD("PK", RTW89_ETSI, RTW89_ETSI, RTW89_NA), - COUNTRY_REGD("PH", RTW89_ETSI, RTW89_ETSI, RTW89_NA), + COUNTRY_REGD("PK", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), + COUNTRY_REGD("PH", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), COUNTRY_REGD("SG", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), COUNTRY_REGD("LK", RTW89_ETSI, RTW89_ETSI, RTW89_NA), COUNTRY_REGD("TW", RTW89_FCC, RTW89_FCC, RTW89_ETSI), - COUNTRY_REGD("TH", RTW89_ETSI, RTW89_ETSI, RTW89_THAILAND), + COUNTRY_REGD("TH", RTW89_THAILAND, RTW89_THAILAND, RTW89_THAILAND), COUNTRY_REGD("VN", RTW89_ETSI, RTW89_ETSI, RTW89_NA), COUNTRY_REGD("AU", RTW89_ACMA, RTW89_ACMA, RTW89_ACMA), COUNTRY_REGD("NZ", RTW89_ACMA, RTW89_ACMA, RTW89_ACMA), @@ -158,9 +158,9 @@ static const struct rtw89_regd rtw89_regd_map[] = { COUNTRY_REGD("TD", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), COUNTRY_REGD("CX", RTW89_ACMA, RTW89_ACMA, RTW89_NA), COUNTRY_REGD("CC", RTW89_ACMA, RTW89_ACMA, RTW89_NA), - COUNTRY_REGD("KM", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), - COUNTRY_REGD("CG", RTW89_ETSI, RTW89_ETSI, RTW89_NA), - COUNTRY_REGD("CD", RTW89_ETSI, RTW89_ETSI, RTW89_NA), + COUNTRY_REGD("KM", RTW89_ETSI, RTW89_ETSI, RTW89_NA), + COUNTRY_REGD("CG", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), + COUNTRY_REGD("CD", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), COUNTRY_REGD("CK", RTW89_ETSI, RTW89_ETSI, RTW89_NA), COUNTRY_REGD("CI", RTW89_ETSI, RTW89_ETSI, RTW89_NA), COUNTRY_REGD("DJ", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), @@ -176,12 +176,12 @@ static const struct rtw89_regd rtw89_regd_map[] = { COUNTRY_REGD("TF", RTW89_ETSI, RTW89_ETSI, RTW89_NA), COUNTRY_REGD("GA", RTW89_ETSI, RTW89_ETSI, RTW89_NA), COUNTRY_REGD("GM", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), - COUNTRY_REGD("GE", RTW89_ETSI, RTW89_ETSI, RTW89_NA), - COUNTRY_REGD("GI", RTW89_ETSI, RTW89_ETSI, RTW89_NA), + COUNTRY_REGD("GE", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), + COUNTRY_REGD("GI", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), COUNTRY_REGD("GL", RTW89_ETSI, RTW89_ETSI, RTW89_NA), COUNTRY_REGD("GD", RTW89_FCC, RTW89_FCC, RTW89_FCC), COUNTRY_REGD("GP", RTW89_ETSI, RTW89_ETSI, RTW89_NA), - COUNTRY_REGD("GU", RTW89_FCC, RTW89_FCC, RTW89_NA), + COUNTRY_REGD("GU", RTW89_FCC, RTW89_FCC, RTW89_FCC), COUNTRY_REGD("GG", RTW89_ETSI, RTW89_ETSI, RTW89_NA), COUNTRY_REGD("GN", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), COUNTRY_REGD("GW", RTW89_ETSI, RTW89_ETSI, RTW89_NA), @@ -194,19 +194,19 @@ static const struct rtw89_regd rtw89_regd_map[] = { COUNTRY_REGD("KI", RTW89_ETSI, RTW89_ETSI, RTW89_NA), COUNTRY_REGD("XK", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), COUNTRY_REGD("LA", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), - COUNTRY_REGD("LR", RTW89_ETSI, RTW89_ETSI, RTW89_NA), + COUNTRY_REGD("LR", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), COUNTRY_REGD("LY", RTW89_ETSI, RTW89_ETSI, RTW89_NA), COUNTRY_REGD("MO", RTW89_ETSI, RTW89_ETSI, RTW89_NA), COUNTRY_REGD("MG", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), COUNTRY_REGD("MW", RTW89_ETSI, RTW89_ETSI, RTW89_NA), COUNTRY_REGD("MV", RTW89_ETSI, RTW89_ETSI, RTW89_NA), COUNTRY_REGD("ML", RTW89_ETSI, RTW89_ETSI, RTW89_NA), - COUNTRY_REGD("MH", RTW89_FCC, RTW89_FCC, RTW89_NA), + COUNTRY_REGD("MH", RTW89_FCC, RTW89_FCC, RTW89_FCC), COUNTRY_REGD("MQ", RTW89_ETSI, RTW89_ETSI, RTW89_NA), COUNTRY_REGD("MR", RTW89_ETSI, RTW89_ETSI, RTW89_NA), COUNTRY_REGD("MU", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), COUNTRY_REGD("YT", RTW89_ETSI, RTW89_ETSI, RTW89_NA), - COUNTRY_REGD("FM", RTW89_FCC, RTW89_FCC, RTW89_NA), + COUNTRY_REGD("FM", RTW89_FCC, RTW89_FCC, RTW89_FCC), COUNTRY_REGD("MD", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), COUNTRY_REGD("MN", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), COUNTRY_REGD("MS", RTW89_ETSI, RTW89_ETSI, RTW89_NA), @@ -216,15 +216,15 @@ static const struct rtw89_regd rtw89_regd_map[] = { COUNTRY_REGD("NE", RTW89_ETSI, RTW89_ETSI, RTW89_NA), COUNTRY_REGD("NU", RTW89_ACMA, RTW89_ACMA, RTW89_NA), COUNTRY_REGD("NF", RTW89_ACMA, RTW89_ACMA, RTW89_NA), - COUNTRY_REGD("MP", RTW89_FCC, RTW89_FCC, RTW89_NA), - COUNTRY_REGD("PW", RTW89_FCC, RTW89_FCC, RTW89_NA), + COUNTRY_REGD("MP", RTW89_FCC, RTW89_FCC, RTW89_FCC), + COUNTRY_REGD("PW", RTW89_FCC, RTW89_FCC, RTW89_FCC), COUNTRY_REGD("RE", RTW89_ETSI, RTW89_ETSI, RTW89_NA), COUNTRY_REGD("RW", RTW89_ETSI, RTW89_ETSI, RTW89_NA), COUNTRY_REGD("SH", RTW89_ETSI, RTW89_ETSI, RTW89_NA), COUNTRY_REGD("KN", RTW89_FCC, RTW89_FCC, RTW89_FCC), COUNTRY_REGD("LC", RTW89_FCC, RTW89_FCC, RTW89_FCC), - COUNTRY_REGD("MF", RTW89_FCC, RTW89_FCC, RTW89_NA), - COUNTRY_REGD("SX", RTW89_FCC, RTW89_FCC, RTW89_NA), + COUNTRY_REGD("MF", RTW89_FCC, RTW89_FCC, RTW89_FCC), + COUNTRY_REGD("SX", RTW89_FCC, RTW89_FCC, RTW89_FCC), COUNTRY_REGD("PM", RTW89_ETSI, RTW89_ETSI, RTW89_NA), COUNTRY_REGD("VC", RTW89_FCC, RTW89_FCC, RTW89_NA), COUNTRY_REGD("WS", RTW89_FCC, RTW89_FCC, RTW89_NA), @@ -237,9 +237,9 @@ static const struct rtw89_regd rtw89_regd_map[] = { COUNTRY_REGD("GS", RTW89_ETSI, RTW89_ETSI, RTW89_NA), COUNTRY_REGD("SR", RTW89_FCC, RTW89_FCC, RTW89_FCC), COUNTRY_REGD("SJ", RTW89_ETSI, RTW89_ETSI, RTW89_NA), - COUNTRY_REGD("SZ", RTW89_ETSI, RTW89_ETSI, RTW89_NA), + COUNTRY_REGD("SZ", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), COUNTRY_REGD("TJ", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), - COUNTRY_REGD("TZ", RTW89_ETSI, RTW89_ETSI, RTW89_NA), + COUNTRY_REGD("TZ", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), COUNTRY_REGD("TG", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), COUNTRY_REGD("TK", RTW89_ACMA, RTW89_ACMA, RTW89_NA), COUNTRY_REGD("TO", RTW89_ETSI, RTW89_ETSI, RTW89_NA), @@ -247,13 +247,16 @@ static const struct rtw89_regd rtw89_regd_map[] = { COUNTRY_REGD("TC", RTW89_ETSI, RTW89_ETSI, RTW89_NA), COUNTRY_REGD("TV", RTW89_ETSI, RTW89_NA, RTW89_NA), COUNTRY_REGD("UG", RTW89_ETSI, RTW89_ETSI, RTW89_NA), - COUNTRY_REGD("VI", RTW89_FCC, RTW89_FCC, RTW89_NA), + COUNTRY_REGD("VI", RTW89_FCC, RTW89_FCC, RTW89_FCC), COUNTRY_REGD("UZ", RTW89_ETSI, RTW89_ETSI, RTW89_ETSI), COUNTRY_REGD("VU", RTW89_ETSI, RTW89_ETSI, RTW89_NA), COUNTRY_REGD("WF", RTW89_ETSI, RTW89_ETSI, RTW89_NA), COUNTRY_REGD("EH", RTW89_ETSI, RTW89_ETSI, RTW89_NA), COUNTRY_REGD("ZM", RTW89_ETSI, RTW89_ETSI, RTW89_NA), + COUNTRY_REGD("CU", RTW89_ETSI, RTW89_ETSI, RTW89_NA), COUNTRY_REGD("IR", RTW89_ETSI, RTW89_ETSI, RTW89_NA), + COUNTRY_REGD("SY", RTW89_ETSI, RTW89_NA, RTW89_NA), + COUNTRY_REGD("SD", RTW89_ETSI, RTW89_ETSI, RTW89_NA), COUNTRY_REGD("PS", RTW89_ETSI, RTW89_ETSI, RTW89_NA), }; From 9ae817c779df50710218e65593ca8105aa27ff91 Mon Sep 17 00:00:00 2001 From: Chih-Kang Chang Date: Fri, 6 Dec 2024 13:57:16 +0800 Subject: [PATCH 0421/1450] wifi: rtw89: 8922a: update format of RFK pre-notify H2C command v2 The RFK pre-notify H2C command is to tell firmware the channels driver is using. Since the format is changed after 0.35.49.0, update it accordingly. Signed-off-by: Chih-Kang Chang Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241206055716.18598-8-pkshih@realtek.com --- drivers/net/wireless/realtek/rtw89/core.h | 1 + drivers/net/wireless/realtek/rtw89/fw.c | 33 ++++++++++++++++------- drivers/net/wireless/realtek/rtw89/fw.h | 7 ++++- 3 files changed, 31 insertions(+), 10 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw89/core.h b/drivers/net/wireless/realtek/rtw89/core.h index c2b5eeb4a4f14..155538370a898 100644 --- a/drivers/net/wireless/realtek/rtw89/core.h +++ b/drivers/net/wireless/realtek/rtw89/core.h @@ -4462,6 +4462,7 @@ enum rtw89_fw_feature { RTW89_FW_FEATURE_SCAN_OFFLOAD_BE_V0, RTW89_FW_FEATURE_WOW_REASON_V1, RTW89_FW_FEATURE_RFK_PRE_NOTIFY_V0, + RTW89_FW_FEATURE_RFK_PRE_NOTIFY_V1, RTW89_FW_FEATURE_RFK_RXDCK_V0, RTW89_FW_FEATURE_NO_WOW_CPU_IO_RX, RTW89_FW_FEATURE_NOTIFY_AP_INFO, diff --git a/drivers/net/wireless/realtek/rtw89/fw.c b/drivers/net/wireless/realtek/rtw89/fw.c index e5f3efe3a7e6e..90db156857281 100644 --- a/drivers/net/wireless/realtek/rtw89/fw.c +++ b/drivers/net/wireless/realtek/rtw89/fw.c @@ -733,6 +733,7 @@ static const struct __fw_feat_cfg fw_feat_tbl[] = { __CFG_FW_FEAT(RTL8922A, lt, 0, 35, 42, 0, RFK_RXDCK_V0), __CFG_FW_FEAT(RTL8922A, ge, 0, 35, 46, 0, NOTIFY_AP_INFO), __CFG_FW_FEAT(RTL8922A, lt, 0, 35, 47, 0, CH_INFO_BE_V0), + __CFG_FW_FEAT(RTL8922A, lt, 0, 35, 49, 0, RFK_PRE_NOTIFY_V1), }; static void rtw89_fw_iterate_feature_cfg(struct rtw89_fw_info *fw, @@ -5540,7 +5541,9 @@ int rtw89_fw_h2c_rf_pre_ntfy(struct rtw89_dev *rtwdev, enum rtw89_phy_idx phy_idx) { struct rtw89_rfk_mcc_info *rfk_mcc = &rtwdev->rfk_mcc; + struct rtw89_fw_h2c_rfk_pre_info_common *common; struct rtw89_fw_h2c_rfk_pre_info_v0 *h2c_v0; + struct rtw89_fw_h2c_rfk_pre_info_v1 *h2c_v1; struct rtw89_fw_h2c_rfk_pre_info *h2c; u8 tbl_sel[NUM_OF_RTW89_FW_RFK_PATH]; u32 len = sizeof(*h2c); @@ -5550,7 +5553,10 @@ int rtw89_fw_h2c_rf_pre_ntfy(struct rtw89_dev *rtwdev, u32 val32; int ret; - if (RTW89_CHK_FW_FEATURE(RFK_PRE_NOTIFY_V0, &rtwdev->fw)) { + if (RTW89_CHK_FW_FEATURE(RFK_PRE_NOTIFY_V1, &rtwdev->fw)) { + len = sizeof(*h2c_v1); + ver = 1; + } else if (RTW89_CHK_FW_FEATURE(RFK_PRE_NOTIFY_V0, &rtwdev->fw)) { len = sizeof(*h2c_v0); ver = 0; } @@ -5562,17 +5568,18 @@ int rtw89_fw_h2c_rf_pre_ntfy(struct rtw89_dev *rtwdev, } skb_put(skb, len); h2c = (struct rtw89_fw_h2c_rfk_pre_info *)skb->data; + common = &h2c->base_v1.common; - h2c->common.mlo_mode = cpu_to_le32(rtwdev->mlo_dbcc_mode); + common->mlo_mode = cpu_to_le32(rtwdev->mlo_dbcc_mode); BUILD_BUG_ON(NUM_OF_RTW89_FW_RFK_TBL > RTW89_RFK_CHS_NR); BUILD_BUG_ON(ARRAY_SIZE(rfk_mcc->data) < NUM_OF_RTW89_FW_RFK_PATH); for (tbl = 0; tbl < NUM_OF_RTW89_FW_RFK_TBL; tbl++) { for (path = 0; path < NUM_OF_RTW89_FW_RFK_PATH; path++) { - h2c->common.dbcc.ch[path][tbl] = + common->dbcc.ch[path][tbl] = cpu_to_le32(rfk_mcc->data[path].ch[tbl]); - h2c->common.dbcc.band[path][tbl] = + common->dbcc.band[path][tbl] = cpu_to_le32(rfk_mcc->data[path].band[tbl]); } } @@ -5580,13 +5587,19 @@ int rtw89_fw_h2c_rf_pre_ntfy(struct rtw89_dev *rtwdev, for (path = 0; path < NUM_OF_RTW89_FW_RFK_PATH; path++) { tbl_sel[path] = rfk_mcc->data[path].table_idx; - h2c->common.tbl.cur_ch[path] = + common->tbl.cur_ch[path] = cpu_to_le32(rfk_mcc->data[path].ch[tbl_sel[path]]); - h2c->common.tbl.cur_band[path] = + common->tbl.cur_band[path] = cpu_to_le32(rfk_mcc->data[path].band[tbl_sel[path]]); + + if (ver <= 1) + continue; + + h2c->cur_bandwidth[path] = + cpu_to_le32(rfk_mcc->data[path].bw[tbl_sel[path]]); } - h2c->common.phy_idx = cpu_to_le32(phy_idx); + common->phy_idx = cpu_to_le32(phy_idx); if (ver == 0) { /* RFK_PRE_NOTIFY_V0 */ h2c_v0 = (struct rtw89_fw_h2c_rfk_pre_info_v0 *)skb->data; @@ -5612,8 +5625,10 @@ int rtw89_fw_h2c_rf_pre_ntfy(struct rtw89_dev *rtwdev, goto done; } - if (rtw89_is_mlo_1_1(rtwdev)) - h2c->mlo_1_1 = cpu_to_le32(1); + if (rtw89_is_mlo_1_1(rtwdev)) { + h2c_v1 = &h2c->base_v1; + h2c_v1->mlo_1_1 = cpu_to_le32(1); + } done: rtw89_h2c_pkt_set_hdr(rtwdev, skb, FWCMD_TYPE_H2C, H2C_CAT_OUTSRC, H2C_CL_OUTSRC_RF_FW_RFK, diff --git a/drivers/net/wireless/realtek/rtw89/fw.h b/drivers/net/wireless/realtek/rtw89/fw.h index 2dfc584da7d65..a3fe183c2ab08 100644 --- a/drivers/net/wireless/realtek/rtw89/fw.h +++ b/drivers/net/wireless/realtek/rtw89/fw.h @@ -4293,11 +4293,16 @@ struct rtw89_fw_h2c_rfk_pre_info_v0 { } __packed mlo; } __packed; -struct rtw89_fw_h2c_rfk_pre_info { +struct rtw89_fw_h2c_rfk_pre_info_v1 { struct rtw89_fw_h2c_rfk_pre_info_common common; __le32 mlo_1_1; } __packed; +struct rtw89_fw_h2c_rfk_pre_info { + struct rtw89_fw_h2c_rfk_pre_info_v1 base_v1; + __le32 cur_bandwidth[NUM_OF_RTW89_FW_RFK_PATH]; +} __packed; + struct rtw89_h2c_rf_tssi { __le16 len; u8 phy; From 2fdac64c3c35858aa8ac5caa70b232e03456e120 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Fri, 6 Dec 2024 14:37:10 -0300 Subject: [PATCH 0422/1450] wifi: rtlwifi: remove unused check_buddy_priv Commit 2461c7d60f9f ("rtlwifi: Update header file") introduced a global list of private data structures. Later on, commit 26634c4b1868 ("rtlwifi Modify existing bits to match vendor version 2013.02.07") started adding the private data to that list at probe time and added a hook, check_buddy_priv to find the private data from a similar device. However, that function was never used. Besides, though there is a lock for that list, it is never used. And when the probe fails, the private data is never removed from the list. This would cause a second probe to access freed memory. Remove the unused hook, structures and members, which will prevent the potential race condition on the list and its corruption during a second probe when probe fails. Fixes: 26634c4b1868 ("rtlwifi Modify existing bits to match vendor version 2013.02.07") Signed-off-by: Thadeu Lima de Souza Cascardo Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241206173713.3222187-2-cascardo@igalia.com --- drivers/net/wireless/realtek/rtlwifi/base.c | 7 ---- drivers/net/wireless/realtek/rtlwifi/base.h | 1 - drivers/net/wireless/realtek/rtlwifi/pci.c | 44 --------------------- drivers/net/wireless/realtek/rtlwifi/wifi.h | 12 ------ 4 files changed, 64 deletions(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/base.c b/drivers/net/wireless/realtek/rtlwifi/base.c index aab4605de9c47..fd28c7a722d89 100644 --- a/drivers/net/wireless/realtek/rtlwifi/base.c +++ b/drivers/net/wireless/realtek/rtlwifi/base.c @@ -2696,9 +2696,6 @@ MODULE_AUTHOR("Larry Finger "); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Realtek 802.11n PCI wireless core"); -struct rtl_global_var rtl_global_var = {}; -EXPORT_SYMBOL_GPL(rtl_global_var); - static int __init rtl_core_module_init(void) { BUILD_BUG_ON(TX_PWR_BY_RATE_NUM_RATE < TX_PWR_BY_RATE_NUM_SECTION); @@ -2712,10 +2709,6 @@ static int __init rtl_core_module_init(void) /* add debugfs */ rtl_debugfs_add_topdir(); - /* init some global vars */ - INIT_LIST_HEAD(&rtl_global_var.glb_priv_list); - spin_lock_init(&rtl_global_var.glb_list_lock); - return 0; } diff --git a/drivers/net/wireless/realtek/rtlwifi/base.h b/drivers/net/wireless/realtek/rtlwifi/base.h index f081a9a90563f..f3a6a43a42eca 100644 --- a/drivers/net/wireless/realtek/rtlwifi/base.h +++ b/drivers/net/wireless/realtek/rtlwifi/base.h @@ -124,7 +124,6 @@ int rtl_send_smps_action(struct ieee80211_hw *hw, u8 *rtl_find_ie(u8 *data, unsigned int len, u8 ie); void rtl_recognize_peer(struct ieee80211_hw *hw, u8 *data, unsigned int len); u8 rtl_tid_to_ac(u8 tid); -extern struct rtl_global_var rtl_global_var; void rtl_phy_scan_operation_backup(struct ieee80211_hw *hw, u8 operation); #endif diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c b/drivers/net/wireless/realtek/rtlwifi/pci.c index 40fc3c297a8ac..4388066eb9e27 100644 --- a/drivers/net/wireless/realtek/rtlwifi/pci.c +++ b/drivers/net/wireless/realtek/rtlwifi/pci.c @@ -295,46 +295,6 @@ static bool rtl_pci_get_amd_l1_patch(struct ieee80211_hw *hw) return status; } -static bool rtl_pci_check_buddy_priv(struct ieee80211_hw *hw, - struct rtl_priv **buddy_priv) -{ - struct rtl_priv *rtlpriv = rtl_priv(hw); - struct rtl_pci_priv *pcipriv = rtl_pcipriv(hw); - struct rtl_priv *tpriv = NULL, *iter; - struct rtl_pci_priv *tpcipriv = NULL; - - if (!list_empty(&rtlpriv->glb_var->glb_priv_list)) { - list_for_each_entry(iter, &rtlpriv->glb_var->glb_priv_list, - list) { - tpcipriv = (struct rtl_pci_priv *)iter->priv; - rtl_dbg(rtlpriv, COMP_INIT, DBG_LOUD, - "pcipriv->ndis_adapter.funcnumber %x\n", - pcipriv->ndis_adapter.funcnumber); - rtl_dbg(rtlpriv, COMP_INIT, DBG_LOUD, - "tpcipriv->ndis_adapter.funcnumber %x\n", - tpcipriv->ndis_adapter.funcnumber); - - if (pcipriv->ndis_adapter.busnumber == - tpcipriv->ndis_adapter.busnumber && - pcipriv->ndis_adapter.devnumber == - tpcipriv->ndis_adapter.devnumber && - pcipriv->ndis_adapter.funcnumber != - tpcipriv->ndis_adapter.funcnumber) { - tpriv = iter; - break; - } - } - } - - rtl_dbg(rtlpriv, COMP_INIT, DBG_LOUD, - "find_buddy_priv %d\n", tpriv != NULL); - - if (tpriv) - *buddy_priv = tpriv; - - return tpriv != NULL; -} - static void rtl_pci_parse_configuration(struct pci_dev *pdev, struct ieee80211_hw *hw) { @@ -2011,7 +1971,6 @@ static bool _rtl_pci_find_adapter(struct pci_dev *pdev, pcipriv->ndis_adapter.amd_l1_patch); rtl_pci_parse_configuration(pdev, hw); - list_add_tail(&rtlpriv->list, &rtlpriv->glb_var->glb_priv_list); return true; } @@ -2158,7 +2117,6 @@ int rtl_pci_probe(struct pci_dev *pdev, rtlpriv->rtlhal.interface = INTF_PCI; rtlpriv->cfg = (struct rtl_hal_cfg *)(id->driver_data); rtlpriv->intf_ops = &rtl_pci_ops; - rtlpriv->glb_var = &rtl_global_var; rtl_efuse_ops_init(hw); /* MEM map */ @@ -2316,7 +2274,6 @@ void rtl_pci_disconnect(struct pci_dev *pdev) if (rtlpci->using_msi) pci_disable_msi(rtlpci->pdev); - list_del(&rtlpriv->list); if (rtlpriv->io.pci_mem_start != 0) { pci_iounmap(pdev, (void __iomem *)rtlpriv->io.pci_mem_start); pci_release_regions(pdev); @@ -2375,7 +2332,6 @@ EXPORT_SYMBOL(rtl_pci_resume); const struct rtl_intf_ops rtl_pci_ops = { .adapter_start = rtl_pci_start, .adapter_stop = rtl_pci_stop, - .check_buddy_priv = rtl_pci_check_buddy_priv, .adapter_tx = rtl_pci_tx, .flush = rtl_pci_flush, .reset_trx_ring = rtl_pci_reset_trx_ring, diff --git a/drivers/net/wireless/realtek/rtlwifi/wifi.h b/drivers/net/wireless/realtek/rtlwifi/wifi.h index ae6e351bc83c9..f1830ddcdd8c1 100644 --- a/drivers/net/wireless/realtek/rtlwifi/wifi.h +++ b/drivers/net/wireless/realtek/rtlwifi/wifi.h @@ -2270,8 +2270,6 @@ struct rtl_intf_ops { /*com */ int (*adapter_start)(struct ieee80211_hw *hw); void (*adapter_stop)(struct ieee80211_hw *hw); - bool (*check_buddy_priv)(struct ieee80211_hw *hw, - struct rtl_priv **buddy_priv); int (*adapter_tx)(struct ieee80211_hw *hw, struct ieee80211_sta *sta, @@ -2514,14 +2512,6 @@ struct dig_t { u32 rssi_max; }; -struct rtl_global_var { - /* from this list we can get - * other adapter's rtl_priv - */ - struct list_head glb_priv_list; - spinlock_t glb_list_lock; -}; - #define IN_4WAY_TIMEOUT_TIME (30 * MSEC_PER_SEC) /* 30 seconds */ struct rtl_btc_info { @@ -2667,9 +2657,7 @@ struct rtl_scan_list { struct rtl_priv { struct ieee80211_hw *hw; struct completion firmware_loading_complete; - struct list_head list; struct rtl_priv *buddy_priv; - struct rtl_global_var *glb_var; struct rtl_dmsp_ctl dmsp_ctl; struct rtl_locks locks; struct rtl_works works; From d8ece6fc3694657e4886191b32ca1690af11adda Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Fri, 6 Dec 2024 14:37:11 -0300 Subject: [PATCH 0423/1450] wifi: rtlwifi: destroy workqueue at rtl_deinit_core rtl_wq is allocated at rtl_init_core, so it makes more sense to destroy it at rtl_deinit_core. In the case of USB, where _rtl_usb_init does not require anything to be undone, that is fine. But for PCI, rtl_pci_init, which is called after rtl_init_core, needs to deallocate data, but only if it has been called. That means that destroying the workqueue needs to be done whether rtl_pci_init has been called or not. And since rtl_pci_deinit was doing it, it has to be moved out of there. It makes more sense to move it to rtl_deinit_core and have it done in both cases, USB and PCI. Since this is a requirement for a followup memory leak fix, mark this as fixing such memory leak. Fixes: 0c8173385e54 ("rtl8192ce: Add new driver") Signed-off-by: Thadeu Lima de Souza Cascardo Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241206173713.3222187-3-cascardo@igalia.com --- drivers/net/wireless/realtek/rtlwifi/base.c | 6 ++++++ drivers/net/wireless/realtek/rtlwifi/pci.c | 2 -- drivers/net/wireless/realtek/rtlwifi/usb.c | 5 ----- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/base.c b/drivers/net/wireless/realtek/rtlwifi/base.c index fd28c7a722d89..ff61867d142fa 100644 --- a/drivers/net/wireless/realtek/rtlwifi/base.c +++ b/drivers/net/wireless/realtek/rtlwifi/base.c @@ -575,9 +575,15 @@ static void rtl_free_entries_from_ack_queue(struct ieee80211_hw *hw, void rtl_deinit_core(struct ieee80211_hw *hw) { + struct rtl_priv *rtlpriv = rtl_priv(hw); + rtl_c2hcmd_launcher(hw, 0); rtl_free_entries_from_scan_list(hw); rtl_free_entries_from_ack_queue(hw, false); + if (rtlpriv->works.rtl_wq) { + destroy_workqueue(rtlpriv->works.rtl_wq); + rtlpriv->works.rtl_wq = NULL; + } } EXPORT_SYMBOL_GPL(rtl_deinit_core); diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c b/drivers/net/wireless/realtek/rtlwifi/pci.c index 4388066eb9e27..e60ac910e750b 100644 --- a/drivers/net/wireless/realtek/rtlwifi/pci.c +++ b/drivers/net/wireless/realtek/rtlwifi/pci.c @@ -1656,8 +1656,6 @@ static void rtl_pci_deinit(struct ieee80211_hw *hw) synchronize_irq(rtlpci->pdev->irq); tasklet_kill(&rtlpriv->works.irq_tasklet); cancel_work_sync(&rtlpriv->works.lps_change_work); - - destroy_workqueue(rtlpriv->works.rtl_wq); } static int rtl_pci_init(struct ieee80211_hw *hw, struct pci_dev *pdev) diff --git a/drivers/net/wireless/realtek/rtlwifi/usb.c b/drivers/net/wireless/realtek/rtlwifi/usb.c index 0368ecea2e817..f5718e570011e 100644 --- a/drivers/net/wireless/realtek/rtlwifi/usb.c +++ b/drivers/net/wireless/realtek/rtlwifi/usb.c @@ -629,11 +629,6 @@ static void _rtl_usb_cleanup_rx(struct ieee80211_hw *hw) tasklet_kill(&rtlusb->rx_work_tasklet); cancel_work_sync(&rtlpriv->works.lps_change_work); - if (rtlpriv->works.rtl_wq) { - destroy_workqueue(rtlpriv->works.rtl_wq); - rtlpriv->works.rtl_wq = NULL; - } - skb_queue_purge(&rtlusb->rx_queue); while ((urb = usb_get_from_anchor(&rtlusb->rx_cleanup_urbs))) { From e7ceefbfd8d447abc8aca8ab993a942803522c06 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Fri, 6 Dec 2024 14:37:12 -0300 Subject: [PATCH 0424/1450] wifi: rtlwifi: fix memory leaks and invalid access at probe error path Deinitialize at reverse order when probe fails. When init_sw_vars fails, rtl_deinit_core should not be called, specially now that it destroys the rtl_wq workqueue. And call rtl_pci_deinit and deinit_sw_vars, otherwise, memory will be leaked. Remove pci_set_drvdata call as it will already be cleaned up by the core driver code and could lead to memory leaks too. cf. commit 8d450935ae7f ("wireless: rtlwifi: remove unnecessary pci_set_drvdata()") and commit 3d86b93064c7 ("rtlwifi: Fix PCI probe error path orphaned memory"). Fixes: 0c8173385e54 ("rtl8192ce: Add new driver") Signed-off-by: Thadeu Lima de Souza Cascardo Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241206173713.3222187-4-cascardo@igalia.com --- drivers/net/wireless/realtek/rtlwifi/pci.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c b/drivers/net/wireless/realtek/rtlwifi/pci.c index e60ac910e750b..a870117cf12af 100644 --- a/drivers/net/wireless/realtek/rtlwifi/pci.c +++ b/drivers/net/wireless/realtek/rtlwifi/pci.c @@ -2165,7 +2165,7 @@ int rtl_pci_probe(struct pci_dev *pdev, if (rtlpriv->cfg->ops->init_sw_vars(hw)) { pr_err("Can't init_sw_vars\n"); err = -ENODEV; - goto fail3; + goto fail2; } rtl_init_sw_leds(hw); @@ -2183,14 +2183,14 @@ int rtl_pci_probe(struct pci_dev *pdev, err = rtl_pci_init(hw, pdev); if (err) { pr_err("Failed to init PCI\n"); - goto fail3; + goto fail4; } err = ieee80211_register_hw(hw); if (err) { pr_err("Can't register mac80211 hw.\n"); err = -ENODEV; - goto fail3; + goto fail5; } rtlpriv->mac80211.mac80211_registered = 1; @@ -2213,9 +2213,12 @@ int rtl_pci_probe(struct pci_dev *pdev, set_bit(RTL_STATUS_INTERFACE_START, &rtlpriv->status); return 0; -fail3: - pci_set_drvdata(pdev, NULL); +fail5: + rtl_pci_deinit(hw); +fail4: rtl_deinit_core(hw); +fail3: + rtlpriv->cfg->ops->deinit_sw_vars(hw); fail2: if (rtlpriv->io.pci_mem_start != 0) From b59b86c5d08be7d761c04affcbcec8184738c200 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Fri, 6 Dec 2024 14:37:13 -0300 Subject: [PATCH 0425/1450] wifi: rtlwifi: pci: wait for firmware loading before releasing memory At probe error path, the firmware loading work may have already been queued. In such a case, it will try to access memory allocated by the probe function, which is about to be released. In such paths, wait for the firmware worker to finish before releasing memory. Fixes: 3d86b93064c7 ("rtlwifi: Fix PCI probe error path orphaned memory") Signed-off-by: Thadeu Lima de Souza Cascardo Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241206173713.3222187-5-cascardo@igalia.com --- drivers/net/wireless/realtek/rtlwifi/pci.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c b/drivers/net/wireless/realtek/rtlwifi/pci.c index a870117cf12af..0eafc4d125f91 100644 --- a/drivers/net/wireless/realtek/rtlwifi/pci.c +++ b/drivers/net/wireless/realtek/rtlwifi/pci.c @@ -2218,6 +2218,7 @@ int rtl_pci_probe(struct pci_dev *pdev, fail4: rtl_deinit_core(hw); fail3: + wait_for_completion(&rtlpriv->firmware_loading_complete); rtlpriv->cfg->ops->deinit_sw_vars(hw); fail2: From b6c10a19363787ffdffe08049b9e0b71c101d401 Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Mon, 9 Dec 2024 12:20:19 +0800 Subject: [PATCH 0426/1450] wifi: rtw89: 8852c: rfk: refine target channel calculation in _rx_dck_channel_calc() The channel is not possibly 0, so original code is fine. Still want to avoid Coverity warning, so ensure -32 offset for the channel number which is larger than 125 only. Actually, don't change logic at all. Addresses-Coverity-ID: 1628150 ("Overflowed constant") Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241209042020.21290-1-pkshih@realtek.com --- drivers/net/wireless/realtek/rtw89/rtw8852c_rfk.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw89/rtw8852c_rfk.c b/drivers/net/wireless/realtek/rtw89/rtw8852c_rfk.c index bd17c0a1c6846..b92e2ce4f4adc 100644 --- a/drivers/net/wireless/realtek/rtw89/rtw8852c_rfk.c +++ b/drivers/net/wireless/realtek/rtw89/rtw8852c_rfk.c @@ -1769,10 +1769,10 @@ u8 _rx_dck_channel_calc(struct rtw89_dev *rtwdev, const struct rtw89_chan *chan) target_ch = chan->channel - 33; } } else if (chan->band_type == RTW89_BAND_6G) { - if (chan->channel >= 1 && chan->channel <= 125) - target_ch = chan->channel + 32; - else + if (chan->channel > 125) target_ch = chan->channel - 32; + else + target_ch = chan->channel + 32; } else { target_ch = chan->channel; } From 5fdf5e557f06213ef5134d31770c29e77de205cd Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Mon, 9 Dec 2024 12:20:20 +0800 Subject: [PATCH 0427/1450] wifi: rtw89: 8851b: rfk: remove unnecessary assignment of return value of _dpk_dgain_read() The return value of _dpk_dgain_read() is not used afterward, so remove it safely. Addresses-Coverity-ID: 1504753 ("Unused value") Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241209042020.21290-2-pkshih@realtek.com --- drivers/net/wireless/realtek/rtw89/rtw8851b_rfk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/realtek/rtw89/rtw8851b_rfk.c b/drivers/net/wireless/realtek/rtw89/rtw8851b_rfk.c index 364e363542256..f72b3ac6f1492 100644 --- a/drivers/net/wireless/realtek/rtw89/rtw8851b_rfk.c +++ b/drivers/net/wireless/realtek/rtw89/rtw8851b_rfk.c @@ -2199,7 +2199,7 @@ static u8 _dpk_agc(struct rtw89_dev *rtwdev, enum rtw89_phy_idx phy, if (dgain > 0x5fc || dgain < 0x556) { _dpk_one_shot(rtwdev, phy, path, D_SYNC); - dgain = _dpk_dgain_read(rtwdev); + _dpk_dgain_read(rtwdev); } if (agc_cnt == 0) { From 09489812013f9ff3850c3af9900c88012b8c1e5d Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Mon, 9 Dec 2024 12:21:27 +0800 Subject: [PATCH 0428/1450] wifi: rtw89: phy: add dummy C2H event handler for report of TAS power The newer firmware, lik RTL8852C version 0.27.111.0, will notify driver report of TAS (Time Averaged SAR) power by new C2H events. This is to assist in higher accurate calculation of TAS. For now, driver doesn't use the report yet, so add a dummy handler to avoid it throws info like: rtw89_8852ce 0000:03:00.0: c2h class 9 func 6 not support Also add "MAC" and "PHY" to the message to disambiguate the source of C2H event. Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20241209042127.21424-1-pkshih@realtek.com --- drivers/net/wireless/realtek/rtw89/mac.c | 4 ++-- drivers/net/wireless/realtek/rtw89/phy.c | 10 ++++++++-- drivers/net/wireless/realtek/rtw89/phy.h | 1 + 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw89/mac.c b/drivers/net/wireless/realtek/rtw89/mac.c index bb4f58118e05a..c78066fd45048 100644 --- a/drivers/net/wireless/realtek/rtw89/mac.c +++ b/drivers/net/wireless/realtek/rtw89/mac.c @@ -5558,11 +5558,11 @@ void rtw89_mac_c2h_handle(struct rtw89_dev *rtwdev, struct sk_buff *skb, case RTW89_MAC_C2H_CLASS_FWDBG: return; default: - rtw89_info(rtwdev, "c2h class %d not support\n", class); + rtw89_info(rtwdev, "MAC c2h class %d not support\n", class); return; } if (!handler) { - rtw89_info(rtwdev, "c2h class %d func %d not support\n", class, + rtw89_info(rtwdev, "MAC c2h class %d func %d not support\n", class, func); return; } diff --git a/drivers/net/wireless/realtek/rtw89/phy.c b/drivers/net/wireless/realtek/rtw89/phy.c index 604ea048c3abc..4e3754fd18fd4 100644 --- a/drivers/net/wireless/realtek/rtw89/phy.c +++ b/drivers/net/wireless/realtek/rtw89/phy.c @@ -3444,10 +3444,16 @@ rtw89_phy_c2h_rfk_report_state(struct rtw89_dev *rtwdev, struct sk_buff *c2h, u3 (int)(len - sizeof(report->hdr)), &report->state); } +static void +rtw89_phy_c2h_rfk_log_tas_pwr(struct rtw89_dev *rtwdev, struct sk_buff *c2h, u32 len) +{ +} + static void (* const rtw89_phy_c2h_rfk_report_handler[])(struct rtw89_dev *rtwdev, struct sk_buff *c2h, u32 len) = { [RTW89_PHY_C2H_RFK_REPORT_FUNC_STATE] = rtw89_phy_c2h_rfk_report_state, + [RTW89_PHY_C2H_RFK_LOG_TAS_PWR] = rtw89_phy_c2h_rfk_log_tas_pwr, }; bool rtw89_phy_c2h_chk_atomic(struct rtw89_dev *rtwdev, u8 class, u8 func) @@ -3501,11 +3507,11 @@ void rtw89_phy_c2h_handle(struct rtw89_dev *rtwdev, struct sk_buff *skb, return; fallthrough; default: - rtw89_info(rtwdev, "c2h class %d not support\n", class); + rtw89_info(rtwdev, "PHY c2h class %d not support\n", class); return; } if (!handler) { - rtw89_info(rtwdev, "c2h class %d func %d not support\n", class, + rtw89_info(rtwdev, "PHY c2h class %d func %d not support\n", class, func); return; } diff --git a/drivers/net/wireless/realtek/rtw89/phy.h b/drivers/net/wireless/realtek/rtw89/phy.h index cf33c1655b7a7..697ee47fe325c 100644 --- a/drivers/net/wireless/realtek/rtw89/phy.h +++ b/drivers/net/wireless/realtek/rtw89/phy.h @@ -151,6 +151,7 @@ enum rtw89_phy_c2h_rfk_log_func { enum rtw89_phy_c2h_rfk_report_func { RTW89_PHY_C2H_RFK_REPORT_FUNC_STATE = 0, + RTW89_PHY_C2H_RFK_LOG_TAS_PWR = 6, }; enum rtw89_phy_c2h_dm_func { From f87e4f2434430b0f750fbdff4fd0601807571bb2 Mon Sep 17 00:00:00 2001 From: Easwar Hariharan Date: Tue, 10 Dec 2024 22:56:53 +0000 Subject: [PATCH 0429/1450] nfp: Convert timeouts to secs_to_jiffies() Commit b35108a51cf7 ("jiffies: Define secs_to_jiffies()") introduced secs_to_jiffies(). As the value here is a multiple of 1000, use secs_to_jiffies() instead of msecs_to_jiffies to avoid the multiplication. This is converted using scripts/coccinelle/misc/secs_to_jiffies.cocci with the following Coccinelle rules: @@ constant C; @@ - msecs_to_jiffies(C * 1000) + secs_to_jiffies(C) @@ constant C; @@ - msecs_to_jiffies(C * MSEC_PER_SEC) + secs_to_jiffies(C) Signed-off-by: Easwar Hariharan Reviewed-by: Louis Peens Link: https://patch.msgid.link/20241210-converge-secs-to-jiffies-v3-20-59479891e658@linux.microsoft.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 98e098c09c034..abba165738a3f 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -2779,7 +2779,7 @@ static void nfp_net_netdev_init(struct nfp_net *nn) break; } - netdev->watchdog_timeo = msecs_to_jiffies(5 * 1000); + netdev->watchdog_timeo = secs_to_jiffies(5); /* MTU range: 68 - hw-specific max */ netdev->min_mtu = ETH_MIN_MTU; From 67571036635b8136a53b615c6bb57021d982d7da Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Wed, 11 Dec 2024 00:19:27 +0000 Subject: [PATCH 0430/1450] gve: Remove unused gve_adminq_set_mtu The last use of gve_adminq_set_mtu() was removed by commit 37149e9374bf ("gve: Implement packet continuation for RX.") Remove it. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Kalesh AP Reviewed-by: Praveen Kaligineedi Link: https://patch.msgid.link/20241211001927.253161-1-linux@treblig.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/google/gve/gve_adminq.c | 14 -------------- drivers/net/ethernet/google/gve/gve_adminq.h | 1 - 2 files changed, 15 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve_adminq.c b/drivers/net/ethernet/google/gve/gve_adminq.c index 060e0e6749380..aa7d723011d0c 100644 --- a/drivers/net/ethernet/google/gve/gve_adminq.c +++ b/drivers/net/ethernet/google/gve/gve_adminq.c @@ -1128,20 +1128,6 @@ int gve_adminq_unregister_page_list(struct gve_priv *priv, u32 page_list_id) return gve_adminq_execute_cmd(priv, &cmd); } -int gve_adminq_set_mtu(struct gve_priv *priv, u64 mtu) -{ - union gve_adminq_command cmd; - - memset(&cmd, 0, sizeof(cmd)); - cmd.opcode = cpu_to_be32(GVE_ADMINQ_SET_DRIVER_PARAMETER); - cmd.set_driver_param = (struct gve_adminq_set_driver_parameter) { - .parameter_type = cpu_to_be32(GVE_SET_PARAM_MTU), - .parameter_value = cpu_to_be64(mtu), - }; - - return gve_adminq_execute_cmd(priv, &cmd); -} - int gve_adminq_report_stats(struct gve_priv *priv, u64 stats_report_len, dma_addr_t stats_report_addr, u64 interval) { diff --git a/drivers/net/ethernet/google/gve/gve_adminq.h b/drivers/net/ethernet/google/gve/gve_adminq.h index 863683de96942..2282174582753 100644 --- a/drivers/net/ethernet/google/gve/gve_adminq.h +++ b/drivers/net/ethernet/google/gve/gve_adminq.h @@ -612,7 +612,6 @@ int gve_adminq_destroy_rx_queues(struct gve_priv *priv, u32 queue_id); int gve_adminq_register_page_list(struct gve_priv *priv, struct gve_queue_page_list *qpl); int gve_adminq_unregister_page_list(struct gve_priv *priv, u32 page_list_id); -int gve_adminq_set_mtu(struct gve_priv *priv, u64 mtu); int gve_adminq_report_stats(struct gve_priv *priv, u64 stats_report_len, dma_addr_t stats_report_addr, u64 interval); int gve_adminq_verify_driver_compatibility(struct gve_priv *priv, From b82ca90d5512b07be2b9ee28d0e9a775bc23e3e9 Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Wed, 11 Dec 2024 11:54:19 +0530 Subject: [PATCH 0431/1450] cn10k-ipsec: Fix compilation error when CONFIG_XFRM_OFFLOAD disabled Define static branch variable "cn10k_ipsec_sa_enabled" in "otx2_txrx.c". This fixes below compilation error when CONFIG_XFRM_OFFLOAD is disabled. drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.o:(__jump_table+0x8): undefined reference to `cn10k_ipsec_sa_enabled' drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.o:(__jump_table+0x18): undefined reference to `cn10k_ipsec_sa_enabled' drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.o:(__jump_table+0x28): undefined reference to `cn10k_ipsec_sa_enabled' Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202412110505.ZKDzGRMv-lkp@intel.com/ Fixes: 6a77a158848a ("cn10k-ipsec: Process outbound ipsec crypto offload") Signed-off-by: Bharat Bhushan Link: https://patch.msgid.link/20241211062419.2587111-1-bbhushan2@marvell.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c | 2 -- drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c index c333e04daad38..09a5b52682055 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c @@ -14,8 +14,6 @@ #include "otx2_struct.h" #include "cn10k_ipsec.h" -DEFINE_STATIC_KEY_FALSE(cn10k_ipsec_sa_enabled); - static bool is_dev_support_ipsec_offload(struct pci_dev *pdev) { return is_dev_cn10ka_b0(pdev) || is_dev_cn10kb(pdev); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c index 4e0133d1d8921..224cef9389274 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c @@ -27,6 +27,8 @@ */ #define PTP_SYNC_SEC_OFFSET 34 +DEFINE_STATIC_KEY_FALSE(cn10k_ipsec_sa_enabled); + static bool otx2_xdp_rcv_pkt_handler(struct otx2_nic *pfvf, struct bpf_prog *prog, struct nix_cqe_rx_s *cqe, From ae7837bb3d9d0bad1230353bbafb92b3e6ad3941 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Wed, 11 Dec 2024 00:58:02 +0000 Subject: [PATCH 0432/1450] isdn: Remove unused get_Bprotocol4id() get_Bprotocol4id() was added in 2008 in commit 1b2b03f8e514 ("Add mISDN core files") but hasn't been used. Remove it. Signed-off-by: Dr. David Alan Gilbert Link: https://patch.msgid.link/20241211005802.258279-1-linux@treblig.org Signed-off-by: Jakub Kicinski --- drivers/isdn/mISDN/core.c | 14 -------------- drivers/isdn/mISDN/core.h | 1 - 2 files changed, 15 deletions(-) diff --git a/drivers/isdn/mISDN/core.c b/drivers/isdn/mISDN/core.c index e34a7a46754e8..8ec2d4d4f1352 100644 --- a/drivers/isdn/mISDN/core.c +++ b/drivers/isdn/mISDN/core.c @@ -294,20 +294,6 @@ get_Bprotocol4mask(u_int m) return NULL; } -struct Bprotocol * -get_Bprotocol4id(u_int id) -{ - u_int m; - - if (id < ISDN_P_B_START || id > 63) { - printk(KERN_WARNING "%s id not in range %d\n", - __func__, id); - return NULL; - } - m = 1 << (id & ISDN_P_B_MASK); - return get_Bprotocol4mask(m); -} - int mISDN_register_Bprotocol(struct Bprotocol *bp) { diff --git a/drivers/isdn/mISDN/core.h b/drivers/isdn/mISDN/core.h index 42599f49c189d..5617c06de8e4d 100644 --- a/drivers/isdn/mISDN/core.h +++ b/drivers/isdn/mISDN/core.h @@ -55,7 +55,6 @@ extern void __add_layer2(struct mISDNchannel *, struct mISDNstack *); extern u_int get_all_Bprotocols(void); struct Bprotocol *get_Bprotocol4mask(u_int); -struct Bprotocol *get_Bprotocol4id(u_int); extern int mISDN_inittimer(u_int *); extern void mISDN_timer_cleanup(void); From c4117091d029087abde76e6947d43dca8f1db20b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 10 Dec 2024 12:27:10 -0800 Subject: [PATCH 0433/1450] lib: packing: create __pack() and __unpack() variants without error checking A future variant of the API, which works on arrays of packed_field structures, will make most of these checks redundant. The idea will be that we want to perform sanity checks at compile time, not once for every function call. Introduce new variants of pack() and unpack(), which elide the sanity checks, assuming that the input was pre-sanitized. Signed-off-by: Vladimir Oltean Signed-off-by: Jacob Keller Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20241210-packing-pack-fields-and-ice-implementation-v10-1-ee56a47479ac@intel.com Signed-off-by: Jakub Kicinski --- lib/packing.c | 142 +++++++++++++++++++++++++++----------------------- 1 file changed, 78 insertions(+), 64 deletions(-) diff --git a/lib/packing.c b/lib/packing.c index 793942745e34f..f237b8af99f5f 100644 --- a/lib/packing.c +++ b/lib/packing.c @@ -51,64 +51,20 @@ static size_t calculate_box_addr(size_t box, size_t len, u8 quirks) return offset_of_group + offset_in_group; } -/** - * pack - Pack u64 number into bitfield of buffer. - * - * @pbuf: Pointer to a buffer holding the packed value. - * @uval: CPU-readable unpacked value to pack. - * @startbit: The index (in logical notation, compensated for quirks) where - * the packed value starts within pbuf. Must be larger than, or - * equal to, endbit. - * @endbit: The index (in logical notation, compensated for quirks) where - * the packed value ends within pbuf. Must be smaller than, or equal - * to, startbit. - * @pbuflen: The length in bytes of the packed buffer pointed to by @pbuf. - * @quirks: A bit mask of QUIRK_LITTLE_ENDIAN, QUIRK_LSW32_IS_FIRST and - * QUIRK_MSB_ON_THE_RIGHT. - * - * Return: 0 on success, EINVAL or ERANGE if called incorrectly. Assuming - * correct usage, return code may be discarded. The @pbuf memory will - * be modified on success. - */ -int pack(void *pbuf, u64 uval, size_t startbit, size_t endbit, size_t pbuflen, - u8 quirks) +static void __pack(void *pbuf, u64 uval, size_t startbit, size_t endbit, + size_t pbuflen, u8 quirks) { /* Logical byte indices corresponding to the * start and end of the field. */ - int plogical_first_u8, plogical_last_u8, box; - /* width of the field to access in the pbuf */ - u64 value_width; - - /* startbit is expected to be larger than endbit, and both are - * expected to be within the logically addressable range of the buffer. - */ - if (unlikely(startbit < endbit || startbit >= BITS_PER_BYTE * pbuflen)) - /* Invalid function call */ - return -EINVAL; - - value_width = startbit - endbit + 1; - if (unlikely(value_width > 64)) - return -ERANGE; - - /* Check if "uval" fits in "value_width" bits. - * If value_width is 64, the check will fail, but any - * 64-bit uval will surely fit. - */ - if (unlikely(value_width < 64 && uval >= (1ull << value_width))) - /* Cannot store "uval" inside "value_width" bits. - * Truncating "uval" is most certainly not desirable, - * so simply erroring out is appropriate. - */ - return -ERANGE; + int plogical_first_u8 = startbit / BITS_PER_BYTE; + int plogical_last_u8 = endbit / BITS_PER_BYTE; + int box; /* Iterate through an idealistic view of the pbuf as an u64 with * no quirks, u8 by u8 (aligned at u8 boundaries), from high to low * logical bit significance. "box" denotes the current logical u8. */ - plogical_first_u8 = startbit / BITS_PER_BYTE; - plogical_last_u8 = endbit / BITS_PER_BYTE; - for (box = plogical_first_u8; box >= plogical_last_u8; box--) { /* Bit indices into the currently accessed 8-bit box */ size_t box_start_bit, box_end_bit, box_addr; @@ -163,15 +119,13 @@ int pack(void *pbuf, u64 uval, size_t startbit, size_t endbit, size_t pbuflen, ((u8 *)pbuf)[box_addr] &= ~box_mask; ((u8 *)pbuf)[box_addr] |= pval; } - return 0; } -EXPORT_SYMBOL(pack); /** - * unpack - Unpack u64 number from packed buffer. + * pack - Pack u64 number into bitfield of buffer. * * @pbuf: Pointer to a buffer holding the packed value. - * @uval: Pointer to an u64 holding the unpacked value. + * @uval: CPU-readable unpacked value to pack. * @startbit: The index (in logical notation, compensated for quirks) where * the packed value starts within pbuf. Must be larger than, or * equal to, endbit. @@ -183,16 +137,12 @@ EXPORT_SYMBOL(pack); * QUIRK_MSB_ON_THE_RIGHT. * * Return: 0 on success, EINVAL or ERANGE if called incorrectly. Assuming - * correct usage, return code may be discarded. The @uval will be - * modified on success. + * correct usage, return code may be discarded. The @pbuf memory will + * be modified on success. */ -int unpack(const void *pbuf, u64 *uval, size_t startbit, size_t endbit, - size_t pbuflen, u8 quirks) +int pack(void *pbuf, u64 uval, size_t startbit, size_t endbit, size_t pbuflen, + u8 quirks) { - /* Logical byte indices corresponding to the - * start and end of the field. - */ - int plogical_first_u8, plogical_last_u8, box; /* width of the field to access in the pbuf */ u64 value_width; @@ -207,6 +157,33 @@ int unpack(const void *pbuf, u64 *uval, size_t startbit, size_t endbit, if (unlikely(value_width > 64)) return -ERANGE; + /* Check if "uval" fits in "value_width" bits. + * If value_width is 64, the check will fail, but any + * 64-bit uval will surely fit. + */ + if (value_width < 64 && uval >= (1ull << value_width)) + /* Cannot store "uval" inside "value_width" bits. + * Truncating "uval" is most certainly not desirable, + * so simply erroring out is appropriate. + */ + return -ERANGE; + + __pack(pbuf, uval, startbit, endbit, pbuflen, quirks); + + return 0; +} +EXPORT_SYMBOL(pack); + +static void __unpack(const void *pbuf, u64 *uval, size_t startbit, size_t endbit, + size_t pbuflen, u8 quirks) +{ + /* Logical byte indices corresponding to the + * start and end of the field. + */ + int plogical_first_u8 = startbit / BITS_PER_BYTE; + int plogical_last_u8 = endbit / BITS_PER_BYTE; + int box; + /* Initialize parameter */ *uval = 0; @@ -214,9 +191,6 @@ int unpack(const void *pbuf, u64 *uval, size_t startbit, size_t endbit, * no quirks, u8 by u8 (aligned at u8 boundaries), from high to low * logical bit significance. "box" denotes the current logical u8. */ - plogical_first_u8 = startbit / BITS_PER_BYTE; - plogical_last_u8 = endbit / BITS_PER_BYTE; - for (box = plogical_first_u8; box >= plogical_last_u8; box--) { /* Bit indices into the currently accessed 8-bit box */ size_t box_start_bit, box_end_bit, box_addr; @@ -271,6 +245,46 @@ int unpack(const void *pbuf, u64 *uval, size_t startbit, size_t endbit, *uval &= ~proj_mask; *uval |= pval; } +} + +/** + * unpack - Unpack u64 number from packed buffer. + * + * @pbuf: Pointer to a buffer holding the packed value. + * @uval: Pointer to an u64 holding the unpacked value. + * @startbit: The index (in logical notation, compensated for quirks) where + * the packed value starts within pbuf. Must be larger than, or + * equal to, endbit. + * @endbit: The index (in logical notation, compensated for quirks) where + * the packed value ends within pbuf. Must be smaller than, or equal + * to, startbit. + * @pbuflen: The length in bytes of the packed buffer pointed to by @pbuf. + * @quirks: A bit mask of QUIRK_LITTLE_ENDIAN, QUIRK_LSW32_IS_FIRST and + * QUIRK_MSB_ON_THE_RIGHT. + * + * Return: 0 on success, EINVAL or ERANGE if called incorrectly. Assuming + * correct usage, return code may be discarded. The @uval will be + * modified on success. + */ +int unpack(const void *pbuf, u64 *uval, size_t startbit, size_t endbit, + size_t pbuflen, u8 quirks) +{ + /* width of the field to access in the pbuf */ + u64 value_width; + + /* startbit is expected to be larger than endbit, and both are + * expected to be within the logically addressable range of the buffer. + */ + if (startbit < endbit || startbit >= BITS_PER_BYTE * pbuflen) + /* Invalid function call */ + return -EINVAL; + + value_width = startbit - endbit + 1; + if (value_width > 64) + return -ERANGE; + + __unpack(pbuf, uval, startbit, endbit, pbuflen, quirks); + return 0; } EXPORT_SYMBOL(unpack); From 48c2752785ad1730e08a64507f05d0e5d5bc79b8 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 10 Dec 2024 12:27:11 -0800 Subject: [PATCH 0434/1450] lib: packing: demote truncation error in pack() to a warning in __pack() Most of the sanity checks in pack() and unpack() can be covered at compile time. There is only one exception, and that is truncation of the uval during a pack() operation. We'd like the error-less __pack() to catch that condition as well. But at the same time, it is currently the responsibility of consumer drivers (currently just sja1105) to print anything at all when this error occurs, and then discard the return code. We can just print a loud warning in the library code and continue with the truncated __pack() operation. In practice, having the warning is very important, see commit 24deec6b9e4a ("net: dsa: sja1105: disallow C45 transactions on the BASE-TX MDIO bus") where the bug was caught exactly by noticing this print. Add the first print to the packing library, and at the same time remove the print for the same condition from the sja1105 driver, to avoid double printing. Signed-off-by: Vladimir Oltean Signed-off-by: Jacob Keller Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20241210-packing-pack-fields-and-ice-implementation-v10-2-ee56a47479ac@intel.com Signed-off-by: Jakub Kicinski --- .../net/dsa/sja1105/sja1105_static_config.c | 8 ++---- lib/packing.c | 26 +++++++------------ 2 files changed, 12 insertions(+), 22 deletions(-) diff --git a/drivers/net/dsa/sja1105/sja1105_static_config.c b/drivers/net/dsa/sja1105/sja1105_static_config.c index baba204ad62f6..3d790f8c6f4da 100644 --- a/drivers/net/dsa/sja1105/sja1105_static_config.c +++ b/drivers/net/dsa/sja1105/sja1105_static_config.c @@ -26,12 +26,8 @@ void sja1105_pack(void *buf, const u64 *val, int start, int end, size_t len) pr_err("Start bit (%d) expected to be larger than end (%d)\n", start, end); } else if (rc == -ERANGE) { - if ((start - end + 1) > 64) - pr_err("Field %d-%d too large for 64 bits!\n", - start, end); - else - pr_err("Cannot store %llx inside bits %d-%d (would truncate)\n", - *val, start, end); + pr_err("Field %d-%d too large for 64 bits!\n", + start, end); } dump_stack(); } diff --git a/lib/packing.c b/lib/packing.c index f237b8af99f5f..09a2d195b9433 100644 --- a/lib/packing.c +++ b/lib/packing.c @@ -59,8 +59,17 @@ static void __pack(void *pbuf, u64 uval, size_t startbit, size_t endbit, */ int plogical_first_u8 = startbit / BITS_PER_BYTE; int plogical_last_u8 = endbit / BITS_PER_BYTE; + int value_width = startbit - endbit + 1; int box; + /* Check if "uval" fits in "value_width" bits. + * The test only works for value_width < 64, but in the latter case, + * any 64-bit uval will surely fit. + */ + WARN(value_width < 64 && uval >= (1ull << value_width), + "Cannot store 0x%llx inside bits %zu-%zu - will truncate\n", + uval, startbit, endbit); + /* Iterate through an idealistic view of the pbuf as an u64 with * no quirks, u8 by u8 (aligned at u8 boundaries), from high to low * logical bit significance. "box" denotes the current logical u8. @@ -143,9 +152,6 @@ static void __pack(void *pbuf, u64 uval, size_t startbit, size_t endbit, int pack(void *pbuf, u64 uval, size_t startbit, size_t endbit, size_t pbuflen, u8 quirks) { - /* width of the field to access in the pbuf */ - u64 value_width; - /* startbit is expected to be larger than endbit, and both are * expected to be within the logically addressable range of the buffer. */ @@ -153,19 +159,7 @@ int pack(void *pbuf, u64 uval, size_t startbit, size_t endbit, size_t pbuflen, /* Invalid function call */ return -EINVAL; - value_width = startbit - endbit + 1; - if (unlikely(value_width > 64)) - return -ERANGE; - - /* Check if "uval" fits in "value_width" bits. - * If value_width is 64, the check will fail, but any - * 64-bit uval will surely fit. - */ - if (value_width < 64 && uval >= (1ull << value_width)) - /* Cannot store "uval" inside "value_width" bits. - * Truncating "uval" is most certainly not desirable, - * so simply erroring out is appropriate. - */ + if (unlikely(startbit - endbit >= 64)) return -ERANGE; __pack(pbuf, uval, startbit, endbit, pbuflen, quirks); From 41d7ea30494cc0dde3e124a75ce0add93f988ba9 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 10 Dec 2024 12:27:12 -0800 Subject: [PATCH 0435/1450] lib: packing: add pack_fields() and unpack_fields() This is new API which caters to the following requirements: - Pack or unpack a large number of fields to/from a buffer with a small code footprint. The current alternative is to open-code a large number of calls to pack() and unpack(), or to use packing() to reduce that number to half. But packing() is not const-correct. - Use unpacked numbers stored in variables smaller than u64. This reduces the rodata footprint of the stored field arrays. - Perform error checking at compile time, rather than runtime, and return void from the API functions. Because the C preprocessor can't generate variable length code (loops), this is a bit tricky to do with macros. To handle this, implement macros which sanity check the packed field definitions based on their size. Finally, a single macro with a chain of __builtin_choose_expr() is used to select the appropriate macros. We enforce the use of ascending or descending order to avoid O(N^2) scaling when checking for overlap. Note that the macros are written with care to ensure that the compilers can correctly evaluate the resulting code at compile time. In particular, care was taken with avoiding too many nested statement expressions. Nested statement expressions trip up some compilers, especially when passing down variables created in previous statement expressions. There are two key design choices intended to keep the overall macro code size small. First, the definition of each CHECK_PACKED_FIELDS_N macro is implemented recursively, by calling the N-1 macro. This avoids needing the code to repeat multiple times. Second, the CHECK_PACKED_FIELD macro enforces that the fields in the array are sorted in order. This allows checking for overlap only with neighboring fields, rather than the general overlap case where each field would need to be checked against other fields. The overlap checks use the first two fields to determine the order of the remaining fields, thus allowing either ascending or descending order. This enables drivers the flexibility to keep the fields ordered in which ever order most naturally fits their hardware design and its associated documentation. The CHECK_PACKED_FIELDS macro is directly called from within pack_fields and unpack_fields, ensuring that all drivers using the API receive the benefits of the compile-time checks. Users do not need to directly call any of the macros directly. The CHECK_PACKED_FIELDS and its helper macros CHECK_PACKED_FIELDS_(0..50) are generated using a simple C program in scripts/gen_packed_field_checks.c This program can be compiled on demand and executed to generate the macro code in include/linux/packing.h. This will aid in the event that a driver needs more than 50 fields. The generator can be updated with a new size, and used to update the packing.h header file. In practice, the ice driver will need to support 27 fields, and the sja1105 driver will need to support 0 fields. This on-demand generation avoids the need to modify Kbuild. We do not anticipate the maximum number of fields to grow very often. - Reduced rodata footprint for the storage of the packed field arrays. To that end, we have struct packed_field_u8 and packed_field_u16, which define the fields with the associated type. More can be added as needed (unlikely for now). On these types, the same generic pack_fields() and unpack_fields() API can be used, thanks to the new C11 _Generic() selection feature, which can call pack_fields_u8() or pack_fields_16(), depending on the type of the "fields" array - a simplistic form of polymorphism. It is evaluated at compile time which function will actually be called. Over time, packing() is expected to be completely replaced either with pack() or with pack_fields(). Signed-off-by: Vladimir Oltean Co-developed-by: Jacob Keller Signed-off-by: Jacob Keller Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20241210-packing-pack-fields-and-ice-implementation-v10-3-ee56a47479ac@intel.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 + Makefile | 4 + include/linux/packing.h | 425 ++++++++++++++++++++++++++++++ lib/packing.c | 153 +++++++++++ lib/packing_test.c | 61 +++++ scripts/.gitignore | 1 + scripts/Makefile | 2 +- scripts/gen_packed_field_checks.c | 37 +++ 8 files changed, 683 insertions(+), 1 deletion(-) create mode 100644 scripts/gen_packed_field_checks.c diff --git a/MAINTAINERS b/MAINTAINERS index af35519be3200..15cf366c0aecf 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17644,6 +17644,7 @@ F: Documentation/core-api/packing.rst F: include/linux/packing.h F: lib/packing.c F: lib/packing_test.c +F: scripts/gen_packed_field_checks.c PADATA PARALLEL EXECUTION MECHANISM M: Steffen Klassert diff --git a/Makefile b/Makefile index 93ab62cef244f..9a9fd5504ae8f 100644 --- a/Makefile +++ b/Makefile @@ -1367,6 +1367,10 @@ PHONY += scripts_unifdef scripts_unifdef: scripts_basic $(Q)$(MAKE) $(build)=scripts scripts/unifdef +PHONY += scripts_gen_packed_field_checks +scripts_gen_packed_field_checks: scripts_basic + $(Q)$(MAKE) $(build)=scripts scripts/gen_packed_field_checks + # --------------------------------------------------------------------------- # Install diff --git a/include/linux/packing.h b/include/linux/packing.h index 5d36dcd06f604..0589d70bbe043 100644 --- a/include/linux/packing.h +++ b/include/linux/packing.h @@ -8,6 +8,83 @@ #include #include +#define GEN_PACKED_FIELD_STRUCT(__type) \ + struct packed_field_ ## __type { \ + __type startbit; \ + __type endbit; \ + __type offset; \ + __type size; \ + } + +/* struct packed_field_u8. Use with bit offsets < 256, buffers < 32B and + * unpacked structures < 256B. + */ +GEN_PACKED_FIELD_STRUCT(u8); + +/* struct packed_field_u16. Use with bit offsets < 65536, buffers < 8KB and + * unpacked structures < 64KB. + */ +GEN_PACKED_FIELD_STRUCT(u16); + +#define PACKED_FIELD(start, end, struct_name, struct_field) \ +{ \ + (start), \ + (end), \ + offsetof(struct_name, struct_field), \ + sizeof_field(struct_name, struct_field), \ +} + +#define CHECK_PACKED_FIELD_OVERLAP(fields, index1, index2) ({ \ + typeof(&(fields)[0]) __f = (fields); \ + typeof(__f[0]) _f1 = __f[index1]; typeof(__f[0]) _f2 = __f[index2]; \ + const bool _ascending = __f[0].startbit < __f[1].startbit; \ + BUILD_BUG_ON_MSG(_ascending && _f1.startbit >= _f2.startbit, \ + __stringify(fields) " field " __stringify(index2) \ + " breaks ascending order"); \ + BUILD_BUG_ON_MSG(!_ascending && _f1.startbit <= _f2.startbit, \ + __stringify(fields) " field " __stringify(index2) \ + " breaks descending order"); \ + BUILD_BUG_ON_MSG(max(_f1.endbit, _f2.endbit) <= \ + min(_f1.startbit, _f2.startbit), \ + __stringify(fields) " field " __stringify(index2) \ + " overlaps with previous field"); \ +}) + +#define CHECK_PACKED_FIELD(fields, index) ({ \ + typeof(&(fields)[0]) _f = (fields); \ + typeof(_f[0]) __f = _f[index]; \ + BUILD_BUG_ON_MSG(__f.startbit < __f.endbit, \ + __stringify(fields) " field " __stringify(index) \ + " start bit must not be smaller than end bit"); \ + BUILD_BUG_ON_MSG(__f.size != 1 && __f.size != 2 && \ + __f.size != 4 && __f.size != 8, \ + __stringify(fields) " field " __stringify(index) \ + " has unsupported unpacked storage size"); \ + BUILD_BUG_ON_MSG(__f.startbit - __f.endbit >= BITS_PER_BYTE * __f.size, \ + __stringify(fields) " field " __stringify(index) \ + " exceeds unpacked storage size"); \ + __builtin_choose_expr(index != 0, \ + CHECK_PACKED_FIELD_OVERLAP(fields, index - 1, index), \ + 1); \ +}) + +/* Note that the packed fields may be either in ascending or descending order. + * Thus, we must check that both the first and last field wit within the + * packed buffer size. + */ +#define CHECK_PACKED_FIELDS_SIZE(fields, pbuflen) ({ \ + typeof(&(fields)[0]) _f = (fields); \ + typeof(pbuflen) _len = (pbuflen); \ + const size_t num_fields = ARRAY_SIZE(fields); \ + BUILD_BUG_ON_MSG(!__builtin_constant_p(_len), \ + __stringify(fields) " pbuflen " __stringify(pbuflen) \ + " must be a compile time constant"); \ + BUILD_BUG_ON_MSG(_f[0].startbit >= BITS_PER_BYTE * _len, \ + __stringify(fields) " first field exceeds packed buffer size"); \ + BUILD_BUG_ON_MSG(_f[num_fields - 1].startbit >= BITS_PER_BYTE * _len, \ + __stringify(fields) " last field exceeds packed buffer size"); \ +}) + #define QUIRK_MSB_ON_THE_RIGHT BIT(0) #define QUIRK_LITTLE_ENDIAN BIT(1) #define QUIRK_LSW32_IS_FIRST BIT(2) @@ -26,4 +103,352 @@ int pack(void *pbuf, u64 uval, size_t startbit, size_t endbit, size_t pbuflen, int unpack(const void *pbuf, u64 *uval, size_t startbit, size_t endbit, size_t pbuflen, u8 quirks); +void pack_fields_u8(void *pbuf, size_t pbuflen, const void *ustruct, + const struct packed_field_u8 *fields, size_t num_fields, + u8 quirks); + +void pack_fields_u16(void *pbuf, size_t pbuflen, const void *ustruct, + const struct packed_field_u16 *fields, size_t num_fields, + u8 quirks); + +void unpack_fields_u8(const void *pbuf, size_t pbuflen, void *ustruct, + const struct packed_field_u8 *fields, size_t num_fields, + u8 quirks); + +void unpack_fields_u16(const void *pbuf, size_t pbuflen, void *ustruct, + const struct packed_field_u16 *fields, size_t num_fields, + u8 quirks); + +/* Do not hand-edit the following packed field check macros! + * + * They are generated using scripts/gen_packed_field_checks.c, which may be + * built via "make scripts_gen_packed_field_checks". If larger macro sizes are + * needed in the future, please use this program to re-generate the macros and + * insert them here. + */ + +#define CHECK_PACKED_FIELDS_1(fields) \ + CHECK_PACKED_FIELD(fields, 0) + +#define CHECK_PACKED_FIELDS_2(fields) do { \ + CHECK_PACKED_FIELDS_1(fields); \ + CHECK_PACKED_FIELD(fields, 1); \ +} while (0) + +#define CHECK_PACKED_FIELDS_3(fields) do { \ + CHECK_PACKED_FIELDS_2(fields); \ + CHECK_PACKED_FIELD(fields, 2); \ +} while (0) + +#define CHECK_PACKED_FIELDS_4(fields) do { \ + CHECK_PACKED_FIELDS_3(fields); \ + CHECK_PACKED_FIELD(fields, 3); \ +} while (0) + +#define CHECK_PACKED_FIELDS_5(fields) do { \ + CHECK_PACKED_FIELDS_4(fields); \ + CHECK_PACKED_FIELD(fields, 4); \ +} while (0) + +#define CHECK_PACKED_FIELDS_6(fields) do { \ + CHECK_PACKED_FIELDS_5(fields); \ + CHECK_PACKED_FIELD(fields, 5); \ +} while (0) + +#define CHECK_PACKED_FIELDS_7(fields) do { \ + CHECK_PACKED_FIELDS_6(fields); \ + CHECK_PACKED_FIELD(fields, 6); \ +} while (0) + +#define CHECK_PACKED_FIELDS_8(fields) do { \ + CHECK_PACKED_FIELDS_7(fields); \ + CHECK_PACKED_FIELD(fields, 7); \ +} while (0) + +#define CHECK_PACKED_FIELDS_9(fields) do { \ + CHECK_PACKED_FIELDS_8(fields); \ + CHECK_PACKED_FIELD(fields, 8); \ +} while (0) + +#define CHECK_PACKED_FIELDS_10(fields) do { \ + CHECK_PACKED_FIELDS_9(fields); \ + CHECK_PACKED_FIELD(fields, 9); \ +} while (0) + +#define CHECK_PACKED_FIELDS_11(fields) do { \ + CHECK_PACKED_FIELDS_10(fields); \ + CHECK_PACKED_FIELD(fields, 10); \ +} while (0) + +#define CHECK_PACKED_FIELDS_12(fields) do { \ + CHECK_PACKED_FIELDS_11(fields); \ + CHECK_PACKED_FIELD(fields, 11); \ +} while (0) + +#define CHECK_PACKED_FIELDS_13(fields) do { \ + CHECK_PACKED_FIELDS_12(fields); \ + CHECK_PACKED_FIELD(fields, 12); \ +} while (0) + +#define CHECK_PACKED_FIELDS_14(fields) do { \ + CHECK_PACKED_FIELDS_13(fields); \ + CHECK_PACKED_FIELD(fields, 13); \ +} while (0) + +#define CHECK_PACKED_FIELDS_15(fields) do { \ + CHECK_PACKED_FIELDS_14(fields); \ + CHECK_PACKED_FIELD(fields, 14); \ +} while (0) + +#define CHECK_PACKED_FIELDS_16(fields) do { \ + CHECK_PACKED_FIELDS_15(fields); \ + CHECK_PACKED_FIELD(fields, 15); \ +} while (0) + +#define CHECK_PACKED_FIELDS_17(fields) do { \ + CHECK_PACKED_FIELDS_16(fields); \ + CHECK_PACKED_FIELD(fields, 16); \ +} while (0) + +#define CHECK_PACKED_FIELDS_18(fields) do { \ + CHECK_PACKED_FIELDS_17(fields); \ + CHECK_PACKED_FIELD(fields, 17); \ +} while (0) + +#define CHECK_PACKED_FIELDS_19(fields) do { \ + CHECK_PACKED_FIELDS_18(fields); \ + CHECK_PACKED_FIELD(fields, 18); \ +} while (0) + +#define CHECK_PACKED_FIELDS_20(fields) do { \ + CHECK_PACKED_FIELDS_19(fields); \ + CHECK_PACKED_FIELD(fields, 19); \ +} while (0) + +#define CHECK_PACKED_FIELDS_21(fields) do { \ + CHECK_PACKED_FIELDS_20(fields); \ + CHECK_PACKED_FIELD(fields, 20); \ +} while (0) + +#define CHECK_PACKED_FIELDS_22(fields) do { \ + CHECK_PACKED_FIELDS_21(fields); \ + CHECK_PACKED_FIELD(fields, 21); \ +} while (0) + +#define CHECK_PACKED_FIELDS_23(fields) do { \ + CHECK_PACKED_FIELDS_22(fields); \ + CHECK_PACKED_FIELD(fields, 22); \ +} while (0) + +#define CHECK_PACKED_FIELDS_24(fields) do { \ + CHECK_PACKED_FIELDS_23(fields); \ + CHECK_PACKED_FIELD(fields, 23); \ +} while (0) + +#define CHECK_PACKED_FIELDS_25(fields) do { \ + CHECK_PACKED_FIELDS_24(fields); \ + CHECK_PACKED_FIELD(fields, 24); \ +} while (0) + +#define CHECK_PACKED_FIELDS_26(fields) do { \ + CHECK_PACKED_FIELDS_25(fields); \ + CHECK_PACKED_FIELD(fields, 25); \ +} while (0) + +#define CHECK_PACKED_FIELDS_27(fields) do { \ + CHECK_PACKED_FIELDS_26(fields); \ + CHECK_PACKED_FIELD(fields, 26); \ +} while (0) + +#define CHECK_PACKED_FIELDS_28(fields) do { \ + CHECK_PACKED_FIELDS_27(fields); \ + CHECK_PACKED_FIELD(fields, 27); \ +} while (0) + +#define CHECK_PACKED_FIELDS_29(fields) do { \ + CHECK_PACKED_FIELDS_28(fields); \ + CHECK_PACKED_FIELD(fields, 28); \ +} while (0) + +#define CHECK_PACKED_FIELDS_30(fields) do { \ + CHECK_PACKED_FIELDS_29(fields); \ + CHECK_PACKED_FIELD(fields, 29); \ +} while (0) + +#define CHECK_PACKED_FIELDS_31(fields) do { \ + CHECK_PACKED_FIELDS_30(fields); \ + CHECK_PACKED_FIELD(fields, 30); \ +} while (0) + +#define CHECK_PACKED_FIELDS_32(fields) do { \ + CHECK_PACKED_FIELDS_31(fields); \ + CHECK_PACKED_FIELD(fields, 31); \ +} while (0) + +#define CHECK_PACKED_FIELDS_33(fields) do { \ + CHECK_PACKED_FIELDS_32(fields); \ + CHECK_PACKED_FIELD(fields, 32); \ +} while (0) + +#define CHECK_PACKED_FIELDS_34(fields) do { \ + CHECK_PACKED_FIELDS_33(fields); \ + CHECK_PACKED_FIELD(fields, 33); \ +} while (0) + +#define CHECK_PACKED_FIELDS_35(fields) do { \ + CHECK_PACKED_FIELDS_34(fields); \ + CHECK_PACKED_FIELD(fields, 34); \ +} while (0) + +#define CHECK_PACKED_FIELDS_36(fields) do { \ + CHECK_PACKED_FIELDS_35(fields); \ + CHECK_PACKED_FIELD(fields, 35); \ +} while (0) + +#define CHECK_PACKED_FIELDS_37(fields) do { \ + CHECK_PACKED_FIELDS_36(fields); \ + CHECK_PACKED_FIELD(fields, 36); \ +} while (0) + +#define CHECK_PACKED_FIELDS_38(fields) do { \ + CHECK_PACKED_FIELDS_37(fields); \ + CHECK_PACKED_FIELD(fields, 37); \ +} while (0) + +#define CHECK_PACKED_FIELDS_39(fields) do { \ + CHECK_PACKED_FIELDS_38(fields); \ + CHECK_PACKED_FIELD(fields, 38); \ +} while (0) + +#define CHECK_PACKED_FIELDS_40(fields) do { \ + CHECK_PACKED_FIELDS_39(fields); \ + CHECK_PACKED_FIELD(fields, 39); \ +} while (0) + +#define CHECK_PACKED_FIELDS_41(fields) do { \ + CHECK_PACKED_FIELDS_40(fields); \ + CHECK_PACKED_FIELD(fields, 40); \ +} while (0) + +#define CHECK_PACKED_FIELDS_42(fields) do { \ + CHECK_PACKED_FIELDS_41(fields); \ + CHECK_PACKED_FIELD(fields, 41); \ +} while (0) + +#define CHECK_PACKED_FIELDS_43(fields) do { \ + CHECK_PACKED_FIELDS_42(fields); \ + CHECK_PACKED_FIELD(fields, 42); \ +} while (0) + +#define CHECK_PACKED_FIELDS_44(fields) do { \ + CHECK_PACKED_FIELDS_43(fields); \ + CHECK_PACKED_FIELD(fields, 43); \ +} while (0) + +#define CHECK_PACKED_FIELDS_45(fields) do { \ + CHECK_PACKED_FIELDS_44(fields); \ + CHECK_PACKED_FIELD(fields, 44); \ +} while (0) + +#define CHECK_PACKED_FIELDS_46(fields) do { \ + CHECK_PACKED_FIELDS_45(fields); \ + CHECK_PACKED_FIELD(fields, 45); \ +} while (0) + +#define CHECK_PACKED_FIELDS_47(fields) do { \ + CHECK_PACKED_FIELDS_46(fields); \ + CHECK_PACKED_FIELD(fields, 46); \ +} while (0) + +#define CHECK_PACKED_FIELDS_48(fields) do { \ + CHECK_PACKED_FIELDS_47(fields); \ + CHECK_PACKED_FIELD(fields, 47); \ +} while (0) + +#define CHECK_PACKED_FIELDS_49(fields) do { \ + CHECK_PACKED_FIELDS_48(fields); \ + CHECK_PACKED_FIELD(fields, 48); \ +} while (0) + +#define CHECK_PACKED_FIELDS_50(fields) do { \ + CHECK_PACKED_FIELDS_49(fields); \ + CHECK_PACKED_FIELD(fields, 49); \ +} while (0) + +#define CHECK_PACKED_FIELDS(fields) \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 1, ({ CHECK_PACKED_FIELDS_1(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 2, ({ CHECK_PACKED_FIELDS_2(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 3, ({ CHECK_PACKED_FIELDS_3(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 4, ({ CHECK_PACKED_FIELDS_4(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 5, ({ CHECK_PACKED_FIELDS_5(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 6, ({ CHECK_PACKED_FIELDS_6(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 7, ({ CHECK_PACKED_FIELDS_7(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 8, ({ CHECK_PACKED_FIELDS_8(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 9, ({ CHECK_PACKED_FIELDS_9(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 10, ({ CHECK_PACKED_FIELDS_10(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 11, ({ CHECK_PACKED_FIELDS_11(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 12, ({ CHECK_PACKED_FIELDS_12(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 13, ({ CHECK_PACKED_FIELDS_13(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 14, ({ CHECK_PACKED_FIELDS_14(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 15, ({ CHECK_PACKED_FIELDS_15(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 16, ({ CHECK_PACKED_FIELDS_16(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 17, ({ CHECK_PACKED_FIELDS_17(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 18, ({ CHECK_PACKED_FIELDS_18(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 19, ({ CHECK_PACKED_FIELDS_19(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 20, ({ CHECK_PACKED_FIELDS_20(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 21, ({ CHECK_PACKED_FIELDS_21(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 22, ({ CHECK_PACKED_FIELDS_22(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 23, ({ CHECK_PACKED_FIELDS_23(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 24, ({ CHECK_PACKED_FIELDS_24(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 25, ({ CHECK_PACKED_FIELDS_25(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 26, ({ CHECK_PACKED_FIELDS_26(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 27, ({ CHECK_PACKED_FIELDS_27(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 28, ({ CHECK_PACKED_FIELDS_28(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 29, ({ CHECK_PACKED_FIELDS_29(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 30, ({ CHECK_PACKED_FIELDS_30(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 31, ({ CHECK_PACKED_FIELDS_31(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 32, ({ CHECK_PACKED_FIELDS_32(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 33, ({ CHECK_PACKED_FIELDS_33(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 34, ({ CHECK_PACKED_FIELDS_34(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 35, ({ CHECK_PACKED_FIELDS_35(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 36, ({ CHECK_PACKED_FIELDS_36(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 37, ({ CHECK_PACKED_FIELDS_37(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 38, ({ CHECK_PACKED_FIELDS_38(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 39, ({ CHECK_PACKED_FIELDS_39(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 40, ({ CHECK_PACKED_FIELDS_40(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 41, ({ CHECK_PACKED_FIELDS_41(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 42, ({ CHECK_PACKED_FIELDS_42(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 43, ({ CHECK_PACKED_FIELDS_43(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 44, ({ CHECK_PACKED_FIELDS_44(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 45, ({ CHECK_PACKED_FIELDS_45(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 46, ({ CHECK_PACKED_FIELDS_46(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 47, ({ CHECK_PACKED_FIELDS_47(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 48, ({ CHECK_PACKED_FIELDS_48(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 49, ({ CHECK_PACKED_FIELDS_49(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 50, ({ CHECK_PACKED_FIELDS_50(fields); }), \ + ({ BUILD_BUG_ON_MSG(1, "CHECK_PACKED_FIELDS() must be regenerated to support array sizes larger than 50."); }) \ +)))))))))))))))))))))))))))))))))))))))))))))))))) + +/* End of generated content */ + +#define pack_fields(pbuf, pbuflen, ustruct, fields, quirks) \ + ({ \ + CHECK_PACKED_FIELDS(fields); \ + CHECK_PACKED_FIELDS_SIZE((fields), (pbuflen)); \ + _Generic((fields), \ + const struct packed_field_u8 * : pack_fields_u8, \ + const struct packed_field_u16 * : pack_fields_u16 \ + )((pbuf), (pbuflen), (ustruct), (fields), ARRAY_SIZE(fields), (quirks)); \ + }) + +#define unpack_fields(pbuf, pbuflen, ustruct, fields, quirks) \ + ({ \ + CHECK_PACKED_FIELDS(fields); \ + CHECK_PACKED_FIELDS_SIZE((fields), (pbuflen)); \ + _Generic((fields), \ + const struct packed_field_u8 * : unpack_fields_u8, \ + const struct packed_field_u16 * : unpack_fields_u16 \ + )((pbuf), (pbuflen), (ustruct), (fields), ARRAY_SIZE(fields), (quirks)); \ + }) + #endif diff --git a/lib/packing.c b/lib/packing.c index 09a2d195b9433..bb1643d9e64d7 100644 --- a/lib/packing.c +++ b/lib/packing.c @@ -5,10 +5,37 @@ #include #include #include +#include #include #include #include +#define __pack_fields(pbuf, pbuflen, ustruct, fields, num_fields, quirks) \ + ({ \ + for (size_t i = 0; i < (num_fields); i++) { \ + typeof(&(fields)[0]) field = &(fields)[i]; \ + u64 uval; \ + \ + uval = ustruct_field_to_u64(ustruct, field->offset, field->size); \ + \ + __pack(pbuf, uval, field->startbit, field->endbit, \ + pbuflen, quirks); \ + } \ + }) + +#define __unpack_fields(pbuf, pbuflen, ustruct, fields, num_fields, quirks) \ + ({ \ + for (size_t i = 0; i < (num_fields); i++) { \ + typeof(&(fields)[0]) field = &fields[i]; \ + u64 uval; \ + \ + __unpack(pbuf, &uval, field->startbit, field->endbit, \ + pbuflen, quirks); \ + \ + u64_to_ustruct_field(ustruct, field->offset, field->size, uval); \ + } \ + }) + /** * calculate_box_addr - Determine physical location of byte in buffer * @box: Index of byte within buffer seen as a logical big-endian big number @@ -322,4 +349,130 @@ int packing(void *pbuf, u64 *uval, int startbit, int endbit, size_t pbuflen, } EXPORT_SYMBOL(packing); +static u64 ustruct_field_to_u64(const void *ustruct, size_t field_offset, + size_t field_size) +{ + switch (field_size) { + case 1: + return *((u8 *)(ustruct + field_offset)); + case 2: + return *((u16 *)(ustruct + field_offset)); + case 4: + return *((u32 *)(ustruct + field_offset)); + default: + return *((u64 *)(ustruct + field_offset)); + } +} + +static void u64_to_ustruct_field(void *ustruct, size_t field_offset, + size_t field_size, u64 uval) +{ + switch (field_size) { + case 1: + *((u8 *)(ustruct + field_offset)) = uval; + break; + case 2: + *((u16 *)(ustruct + field_offset)) = uval; + break; + case 4: + *((u32 *)(ustruct + field_offset)) = uval; + break; + default: + *((u64 *)(ustruct + field_offset)) = uval; + break; + } +} + +/** + * pack_fields_u8 - Pack array of fields + * + * @pbuf: Pointer to a buffer holding the packed value. + * @pbuflen: The length in bytes of the packed buffer pointed to by @pbuf. + * @ustruct: Pointer to CPU-readable structure holding the unpacked value. + * It is expected (but not checked) that this has the same data type + * as all struct packed_field_u8 definitions. + * @fields: Array of packed_field_u8 field definition. They must not overlap. + * @num_fields: Length of @fields array. + * @quirks: A bit mask of QUIRK_LITTLE_ENDIAN, QUIRK_LSW32_IS_FIRST and + * QUIRK_MSB_ON_THE_RIGHT. + * + * Use the pack_fields() macro instead of calling this directly. + */ +void pack_fields_u8(void *pbuf, size_t pbuflen, const void *ustruct, + const struct packed_field_u8 *fields, size_t num_fields, + u8 quirks) +{ + __pack_fields(pbuf, pbuflen, ustruct, fields, num_fields, quirks); +} +EXPORT_SYMBOL(pack_fields_u8); + +/** + * pack_fields_u16 - Pack array of fields + * + * @pbuf: Pointer to a buffer holding the packed value. + * @pbuflen: The length in bytes of the packed buffer pointed to by @pbuf. + * @ustruct: Pointer to CPU-readable structure holding the unpacked value. + * It is expected (but not checked) that this has the same data type + * as all struct packed_field_u16 definitions. + * @fields: Array of packed_field_u16 field definitions. They must not overlap. + * @num_fields: Length of @fields array. + * @quirks: A bit mask of QUIRK_LITTLE_ENDIAN, QUIRK_LSW32_IS_FIRST and + * QUIRK_MSB_ON_THE_RIGHT. + * + * Use the pack_fields() macro instead of calling this directly. + */ +void pack_fields_u16(void *pbuf, size_t pbuflen, const void *ustruct, + const struct packed_field_u16 *fields, size_t num_fields, + u8 quirks) +{ + __pack_fields(pbuf, pbuflen, ustruct, fields, num_fields, quirks); +} +EXPORT_SYMBOL(pack_fields_u16); + +/** + * unpack_fields_u8 - Unpack array of fields + * + * @pbuf: Pointer to a buffer holding the packed value. + * @pbuflen: The length in bytes of the packed buffer pointed to by @pbuf. + * @ustruct: Pointer to CPU-readable structure holding the unpacked value. + * It is expected (but not checked) that this has the same data type + * as all struct packed_field_u8 definitions. + * @fields: Array of packed_field_u8 field definitions. They must not overlap. + * @num_fields: Length of @fields array. + * @quirks: A bit mask of QUIRK_LITTLE_ENDIAN, QUIRK_LSW32_IS_FIRST and + * QUIRK_MSB_ON_THE_RIGHT. + * + * Use the unpack_fields() macro instead of calling this directly. + */ +void unpack_fields_u8(const void *pbuf, size_t pbuflen, void *ustruct, + const struct packed_field_u8 *fields, size_t num_fields, + u8 quirks) +{ + __unpack_fields(pbuf, pbuflen, ustruct, fields, num_fields, quirks); +} +EXPORT_SYMBOL(unpack_fields_u8); + +/** + * unpack_fields_u16 - Unpack array of fields + * + * @pbuf: Pointer to a buffer holding the packed value. + * @pbuflen: The length in bytes of the packed buffer pointed to by @pbuf. + * @ustruct: Pointer to CPU-readable structure holding the unpacked value. + * It is expected (but not checked) that this has the same data type + * as all struct packed_field_u16 definitions. + * @fields: Array of packed_field_u16 field definitions. They must not overlap. + * @num_fields: Length of @fields array. + * @quirks: A bit mask of QUIRK_LITTLE_ENDIAN, QUIRK_LSW32_IS_FIRST and + * QUIRK_MSB_ON_THE_RIGHT. + * + * Use the unpack_fields() macro instead of calling this directly. + */ +void unpack_fields_u16(const void *pbuf, size_t pbuflen, void *ustruct, + const struct packed_field_u16 *fields, size_t num_fields, + u8 quirks) +{ + __unpack_fields(pbuf, pbuflen, ustruct, fields, num_fields, quirks); +} +EXPORT_SYMBOL(unpack_fields_u16); + MODULE_DESCRIPTION("Generic bitfield packing and unpacking"); diff --git a/lib/packing_test.c b/lib/packing_test.c index b38ea43c03fd8..ce3b83d33b041 100644 --- a/lib/packing_test.c +++ b/lib/packing_test.c @@ -396,9 +396,70 @@ static void packing_test_unpack(struct kunit *test) KUNIT_EXPECT_EQ(test, uval, params->uval); } +#define PACKED_BUF_SIZE 8 + +typedef struct __packed { u8 buf[PACKED_BUF_SIZE]; } packed_buf_t; + +struct test_data { + u32 field3; + u16 field2; + u16 field4; + u16 field6; + u8 field1; + u8 field5; +}; + +static const struct packed_field_u8 test_fields[] = { + PACKED_FIELD(63, 61, struct test_data, field1), + PACKED_FIELD(60, 52, struct test_data, field2), + PACKED_FIELD(51, 28, struct test_data, field3), + PACKED_FIELD(27, 14, struct test_data, field4), + PACKED_FIELD(13, 9, struct test_data, field5), + PACKED_FIELD(8, 0, struct test_data, field6), +}; + +static void packing_test_pack_fields(struct kunit *test) +{ + const struct test_data data = { + .field1 = 0x2, + .field2 = 0x100, + .field3 = 0xF00050, + .field4 = 0x7D3, + .field5 = 0x9, + .field6 = 0x10B, + }; + packed_buf_t expect = { + .buf = { 0x50, 0x0F, 0x00, 0x05, 0x01, 0xF4, 0xD3, 0x0B }, + }; + packed_buf_t buf = {}; + + pack_fields(&buf, sizeof(buf), &data, test_fields, 0); + + KUNIT_EXPECT_MEMEQ(test, &expect, &buf, sizeof(buf)); +} + +static void packing_test_unpack_fields(struct kunit *test) +{ + const packed_buf_t buf = { + .buf = { 0x17, 0x28, 0x10, 0x19, 0x3D, 0xA9, 0x07, 0x9C }, + }; + struct test_data data = {}; + + unpack_fields(&buf, sizeof(buf), &data, test_fields, 0); + + KUNIT_EXPECT_EQ(test, 0, data.field1); + KUNIT_EXPECT_EQ(test, 0x172, data.field2); + KUNIT_EXPECT_EQ(test, 0x810193, data.field3); + KUNIT_EXPECT_EQ(test, 0x36A4, data.field4); + KUNIT_EXPECT_EQ(test, 0x3, data.field5); + KUNIT_EXPECT_EQ(test, 0x19C, data.field6); +} + static struct kunit_case packing_test_cases[] = { KUNIT_CASE_PARAM(packing_test_pack, packing_gen_params), KUNIT_CASE_PARAM(packing_test_unpack, packing_gen_params), + KUNIT_CASE(packing_test_pack_fields), + KUNIT_CASE(packing_test_unpack_fields), {}, }; diff --git a/scripts/.gitignore b/scripts/.gitignore index 3dbb8bb2457bc..c2ef68848da5c 100644 --- a/scripts/.gitignore +++ b/scripts/.gitignore @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only /asn1_compiler +/gen_packed_field_checks /generate_rust_target /insert-sys-cert /kallsyms diff --git a/scripts/Makefile b/scripts/Makefile index 6bcda4b9d0540..546e8175e1c4c 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -47,7 +47,7 @@ HOSTCFLAGS_sorttable.o += -DMCOUNT_SORT_ENABLED endif # The following programs are only built on demand -hostprogs += unifdef +hostprogs += unifdef gen_packed_field_checks # The module linker script is preprocessed on demand targets += module.lds diff --git a/scripts/gen_packed_field_checks.c b/scripts/gen_packed_field_checks.c new file mode 100644 index 0000000000000..60042b7616eef --- /dev/null +++ b/scripts/gen_packed_field_checks.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2024, Intel Corporation +#include +#include + +#define MAX_PACKED_FIELD_SIZE 50 + +int main(int argc, char **argv) +{ + /* The first macro doesn't need a 'do {} while(0)' loop */ + printf("#define CHECK_PACKED_FIELDS_1(fields) \\\n"); + printf("\tCHECK_PACKED_FIELD(fields, 0)\n\n"); + + /* Remaining macros require a do/while loop, and are implemented + * recursively by calling the previous iteration's macro. + */ + for (int i = 2; i <= MAX_PACKED_FIELD_SIZE; i++) { + printf("#define CHECK_PACKED_FIELDS_%d(fields) do { \\\n", i); + printf("\tCHECK_PACKED_FIELDS_%d(fields); \\\n", i - 1); + printf("\tCHECK_PACKED_FIELD(fields, %d); \\\n", i - 1); + printf("} while (0)\n\n"); + } + + printf("#define CHECK_PACKED_FIELDS(fields) \\\n"); + + for (int i = 1; i <= MAX_PACKED_FIELD_SIZE; i++) + printf("\t__builtin_choose_expr(ARRAY_SIZE(fields) == %d, ({ CHECK_PACKED_FIELDS_%d(fields); }), \\\n", + i, i); + + printf("\t({ BUILD_BUG_ON_MSG(1, \"CHECK_PACKED_FIELDS() must be regenerated to support array sizes larger than %d.\"); }) \\\n", + MAX_PACKED_FIELD_SIZE); + + for (int i = 1; i <= MAX_PACKED_FIELD_SIZE; i++) + printf(")"); + + printf("\n"); +} From a9ad2a8dfb436c55607c8038aa926f55a6d6ca8e Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Tue, 10 Dec 2024 12:27:13 -0800 Subject: [PATCH 0436/1450] lib: packing: document recently added APIs Extend the documentation for the packing library, covering the intended use for the recently added APIs. This includes the pack() and unpack() macros, as well as the pack_fields() and unpack_fields() macros. Add a note that the packing() API is now deprecated in favor of pack() and unpack(). For the pack_fields() and unpack_fields() APIs, explain the rationale for when a driver may want to select this API. Provide an example which shows how to define the fields and call the pack_fields() and unpack_fields() macros. Co-developed-by: Vladimir Oltean Signed-off-by: Vladimir Oltean Signed-off-by: Jacob Keller Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20241210-packing-pack-fields-and-ice-implementation-v10-4-ee56a47479ac@intel.com Signed-off-by: Jakub Kicinski --- Documentation/core-api/packing.rst | 118 +++++++++++++++++++++++++++-- 1 file changed, 113 insertions(+), 5 deletions(-) diff --git a/Documentation/core-api/packing.rst b/Documentation/core-api/packing.rst index 821691f23c541..0ce2078c8e13d 100644 --- a/Documentation/core-api/packing.rst +++ b/Documentation/core-api/packing.rst @@ -227,11 +227,119 @@ Intended use Drivers that opt to use this API first need to identify which of the above 3 quirk combinations (for a total of 8) match what the hardware documentation -describes. Then they should wrap the packing() function, creating a new -xxx_packing() that calls it using the proper QUIRK_* one-hot bits set. +describes. + +There are 3 supported usage patterns, detailed below. + +packing() +^^^^^^^^^ + +This API function is deprecated. The packing() function returns an int-encoded error code, which protects the programmer against incorrect API use. The errors are not expected to occur -during runtime, therefore it is reasonable for xxx_packing() to return void -and simply swallow those errors. Optionally it can dump stack or print the -error description. +during runtime, therefore it is reasonable to wrap packing() into a custom +function which returns void and swallows those errors. Optionally it can +dump stack or print the error description. + +.. code-block:: c + + void my_packing(void *buf, u64 *val, int startbit, int endbit, + size_t len, enum packing_op op) + { + int err; + + /* Adjust quirks accordingly */ + err = packing(buf, val, startbit, endbit, len, op, QUIRK_LSW32_IS_FIRST); + if (likely(!err)) + return; + + if (err == -EINVAL) { + pr_err("Start bit (%d) expected to be larger than end (%d)\n", + startbit, endbit); + } else if (err == -ERANGE) { + if ((startbit - endbit + 1) > 64) + pr_err("Field %d-%d too large for 64 bits!\n", + startbit, endbit); + else + pr_err("Cannot store %llx inside bits %d-%d (would truncate)\n", + *val, startbit, endbit); + } + dump_stack(); + } + +pack() and unpack() +^^^^^^^^^^^^^^^^^^^ + +These are const-correct variants of packing(), and eliminate the last "enum +packing_op op" argument. + +Calling pack(...) is equivalent, and preferred, to calling packing(..., PACK). + +Calling unpack(...) is equivalent, and preferred, to calling packing(..., UNPACK). + +pack_fields() and unpack_fields() +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The library exposes optimized functions for the scenario where there are many +fields represented in a buffer, and it encourages consumer drivers to avoid +repetitive calls to pack() and unpack() for each field, but instead use +pack_fields() and unpack_fields(), which reduces the code footprint. + +These APIs use field definitions in arrays of ``struct packed_field_u8`` or +``struct packed_field_u16``, allowing consumer drivers to minimize the size +of these arrays according to their custom requirements. + +The pack_fields() and unpack_fields() API functions are actually macros which +automatically select the appropriate function at compile time, based on the +type of the fields array passed in. + +An additional benefit over pack() and unpack() is that sanity checks on the +field definitions are handled at compile time with ``BUILD_BUG_ON`` rather +than only when the offending code is executed. These functions return void and +wrapping them to handle unexpected errors is not necessary. + +It is recommended, but not required, that you wrap your packed buffer into a +structured type with a fixed size. This generally makes it easier for the +compiler to enforce that the correct size buffer is used. + +Here is an example of how to use the fields APIs: + +.. code-block:: c + + /* Ordering inside the unpacked structure is flexible and can be different + * from the packed buffer. Here, it is optimized to reduce padding. + */ + struct data { + u64 field3; + u32 field4; + u16 field1; + u8 field2; + }; + + #define SIZE 13 + + typdef struct __packed { u8 buf[SIZE]; } packed_buf_t; + + static const struct packed_field_u8 fields[] = { + PACKED_FIELD(100, 90, struct data, field1), + PACKED_FIELD(90, 87, struct data, field2), + PACKED_FIELD(86, 30, struct data, field3), + PACKED_FIELD(29, 0, struct data, field4), + }; + + void unpack_your_data(const packed_buf_t *buf, struct data *unpacked) + { + BUILD_BUG_ON(sizeof(*buf) != SIZE; + + unpack_fields(buf, sizeof(*buf), unpacked, fields, + QUIRK_LITTLE_ENDIAN); + } + + void pack_your_data(const struct data *unpacked, packed_buf_t *buf) + { + BUILD_BUG_ON(sizeof(*buf) != SIZE; + + pack_fields(buf, sizeof(*buf), unpacked, fields, + QUIRK_LITTLE_ENDIAN); + } From aeeaa9f891737cadbb0832c2b552f3dca3b04675 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Tue, 10 Dec 2024 12:27:14 -0800 Subject: [PATCH 0437/1450] ice: remove int_q_state from ice_tlan_ctx The int_q_state field of the ice_tlan_ctx structure represents the internal queue state. However, we never actually need to assign this or read this during normal operation. In fact, trying to unpack it would not be possible as it is larger than a u64. Remove this field from the ice_tlan_ctx structure, and remove its packing field from the ice_tlan_ctx_info array. Signed-off-by: Jacob Keller Reviewed-by: Przemek Kitszel Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20241210-packing-pack-fields-and-ice-implementation-v10-5-ee56a47479ac@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_common.c | 1 - drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h | 1 - 2 files changed, 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index 496d86cbd13fc..e2a4f4897119f 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -1470,7 +1470,6 @@ const struct ice_ctx_ele ice_tlan_ctx_info[] = { ICE_CTX_STORE(ice_tlan_ctx, drop_ena, 1, 165), ICE_CTX_STORE(ice_tlan_ctx, cache_prof_idx, 2, 166), ICE_CTX_STORE(ice_tlan_ctx, pkt_shaper_prof_idx, 3, 168), - ICE_CTX_STORE(ice_tlan_ctx, int_q_state, 122, 171), { 0 } }; diff --git a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h index 611577ebc29d8..0e8ed8c226e68 100644 --- a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h +++ b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h @@ -590,7 +590,6 @@ struct ice_tlan_ctx { u8 drop_ena; u8 cache_prof_idx; u8 pkt_shaper_prof_idx; - u8 int_q_state; /* width not needed - internal - DO NOT WRITE!!! */ }; #endif /* _ICE_LAN_TX_RX_H_ */ From efe39d8b4b9d8175fd7c3610da4c8fa663154a1e Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Tue, 10 Dec 2024 12:27:15 -0800 Subject: [PATCH 0438/1450] ice: use structures to keep track of queue context size The ice Tx and Rx queue context are currently stored as arrays of bytes with defined size (ICE_RXQ_CTX_SZ and ICE_TXQ_CTX_SZ). The packed queue context is often passed to other functions as a simple u8 * pointer, which does not allow tracking the size. This makes the queue context API easy to misuse, as you can pass an arbitrary u8 array or pointer. Introduce wrapper typedefs which use a __packed structure that has the proper fixed size for the Tx and Rx context buffers. This enables the compiler to track the size of the value and ensures that passing the wrong buffer size will be detected by the compiler. The existing APIs do not benefit much from this change, however the wrapping structures will be used to simplify the arguments of new packing functions based on the recently introduced pack_fields API. Co-developed-by: Vladimir Oltean Signed-off-by: Vladimir Oltean Signed-off-by: Jacob Keller Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20241210-packing-pack-fields-and-ice-implementation-v10-6-ee56a47479ac@intel.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/intel/ice/ice_adminq_cmd.h | 11 +++++++-- drivers/net/ethernet/intel/ice/ice_base.c | 2 +- drivers/net/ethernet/intel/ice/ice_common.c | 24 +++++++++---------- .../net/ethernet/intel/ice/ice_lan_tx_rx.h | 2 -- 4 files changed, 21 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h index 1489a8ceec51d..3bf05b135b355 100644 --- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h +++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h @@ -12,6 +12,13 @@ #define ICE_AQC_TOPO_MAX_LEVEL_NUM 0x9 #define ICE_AQ_SET_MAC_FRAME_SIZE_MAX 9728 +#define ICE_RXQ_CTX_SIZE_DWORDS 8 +#define ICE_RXQ_CTX_SZ (ICE_RXQ_CTX_SIZE_DWORDS * sizeof(u32)) +#define ICE_TXQ_CTX_SZ 22 + +typedef struct __packed { u8 buf[ICE_RXQ_CTX_SZ]; } ice_rxq_ctx_buf_t; +typedef struct __packed { u8 buf[ICE_TXQ_CTX_SZ]; } ice_txq_ctx_buf_t; + struct ice_aqc_generic { __le32 param0; __le32 param1; @@ -2084,10 +2091,10 @@ struct ice_aqc_add_txqs_perq { __le16 txq_id; u8 rsvd[2]; __le32 q_teid; - u8 txq_ctx[22]; + ice_txq_ctx_buf_t txq_ctx; u8 rsvd2[2]; struct ice_aqc_txsched_elem info; -}; +} __packed; /* The format of the command buffer for Add Tx LAN Queues (0x0C30) * is an array of the following structs. Please note that the length of diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index 82a9cd4ec7aec..e7aaa06241210 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -910,7 +910,7 @@ ice_vsi_cfg_txq(struct ice_vsi *vsi, struct ice_tx_ring *ring, ice_setup_tx_ctx(ring, &tlan_ctx, pf_q); /* copy context contents into the qg_buf */ qg_buf->txqs[0].txq_id = cpu_to_le16(pf_q); - ice_set_ctx(hw, (u8 *)&tlan_ctx, qg_buf->txqs[0].txq_ctx, + ice_set_ctx(hw, (u8 *)&tlan_ctx, (u8 *)&qg_buf->txqs[0].txq_ctx, ice_tlan_ctx_info); /* init queue specific tail reg. It is referred as diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index e2a4f4897119f..64bf25aab673c 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -1362,29 +1362,27 @@ int ice_reset(struct ice_hw *hw, enum ice_reset_req req) /** * ice_copy_rxq_ctx_to_hw * @hw: pointer to the hardware structure - * @ice_rxq_ctx: pointer to the rxq context + * @rxq_ctx: pointer to the packed Rx queue context * @rxq_index: the index of the Rx queue * * Copies rxq context from dense structure to HW register space */ -static int -ice_copy_rxq_ctx_to_hw(struct ice_hw *hw, u8 *ice_rxq_ctx, u32 rxq_index) +static int ice_copy_rxq_ctx_to_hw(struct ice_hw *hw, + const ice_rxq_ctx_buf_t *rxq_ctx, + u32 rxq_index) { u8 i; - if (!ice_rxq_ctx) - return -EINVAL; - if (rxq_index > QRX_CTRL_MAX_INDEX) return -EINVAL; /* Copy each dword separately to HW */ for (i = 0; i < ICE_RXQ_CTX_SIZE_DWORDS; i++) { - wr32(hw, QRX_CONTEXT(i, rxq_index), - *((u32 *)(ice_rxq_ctx + (i * sizeof(u32))))); + u32 ctx = ((const u32 *)rxq_ctx)[i]; + + wr32(hw, QRX_CONTEXT(i, rxq_index), ctx); - ice_debug(hw, ICE_DBG_QCTX, "qrxdata[%d]: %08X\n", i, - *((u32 *)(ice_rxq_ctx + (i * sizeof(u32))))); + ice_debug(hw, ICE_DBG_QCTX, "qrxdata[%d]: %08X\n", i, ctx); } return 0; @@ -1429,15 +1427,15 @@ static const struct ice_ctx_ele ice_rlan_ctx_info[] = { int ice_write_rxq_ctx(struct ice_hw *hw, struct ice_rlan_ctx *rlan_ctx, u32 rxq_index) { - u8 ctx_buf[ICE_RXQ_CTX_SZ] = { 0 }; + ice_rxq_ctx_buf_t buf = {}; if (!rlan_ctx) return -EINVAL; rlan_ctx->prefena = 1; - ice_set_ctx(hw, (u8 *)rlan_ctx, ctx_buf, ice_rlan_ctx_info); - return ice_copy_rxq_ctx_to_hw(hw, ctx_buf, rxq_index); + ice_set_ctx(hw, (u8 *)rlan_ctx, (u8 *)&buf, ice_rlan_ctx_info); + return ice_copy_rxq_ctx_to_hw(hw, &buf, rxq_index); } /* LAN Tx Queue Context */ diff --git a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h index 0e8ed8c226e68..a76e5b0e7861e 100644 --- a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h +++ b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h @@ -371,8 +371,6 @@ enum ice_rx_flex_desc_status_error_1_bits { ICE_RX_FLEX_DESC_STATUS1_LAST /* this entry must be last!!! */ }; -#define ICE_RXQ_CTX_SIZE_DWORDS 8 -#define ICE_RXQ_CTX_SZ (ICE_RXQ_CTX_SIZE_DWORDS * sizeof(u32)) #define ICE_TX_CMPLTNQ_CTX_SIZE_DWORDS 22 #define ICE_TX_DRBELL_Q_CTX_SIZE_DWORDS 5 #define GLTCLAN_CQ_CNTX(i, CQ) (GLTCLAN_CQ_CNTX0(CQ) + ((i) * 0x0800)) From dc4305be467a6f84f66005cfc58557d56b5ab107 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Tue, 10 Dec 2024 12:27:16 -0800 Subject: [PATCH 0439/1450] ice: use for Tx and Rx queue context data The ice driver needs to write the Tx and Rx queue context when programming Tx and Rx queues. This is currently done using some bespoke custom logic via the ice_set_ctx() and its helper functions, along with bit position definitions in the ice_tlan_ctx_info and ice_rlan_ctx_info structures. This logic does work, but is problematic for several reasons: 1) ice_set_ctx requires a helper function for each byte size being packed, as it uses a separate function to pack u8, u16, u32, and u64 fields. This requires 4 functions which contain near-duplicate logic with the types changed out. 2) The logic in the ice_pack_ctx_word, ice_pack_ctx_dword, and ice_pack_ctx_qword does not handle values which straddle alignment boundaries very well. This requires that several fields in the ice_tlan_ctx_info and ice_rlan_ctx_info be a size larger than their bit size should require. 3) Future support for live migration will require adding unpacking functions to take the packed hardware context and unpack it into the ice_rlan_ctx and ice_tlan_ctx structures. Implementing this would require implementing ice_get_ctx, and its associated helper functions, which essentially doubles the amount of code required. The Linux kernel has had a packing library that can handle this logic since commit 554aae35007e ("lib: Add support for generic packing operations"). The library was recently extended with support for packing or unpacking an array of fields, with a similar structure as the ice_ctx_ele structure. Replace the ice-specific ice_set_ctx() logic with the recently added pack_fields and packed_field_s infrastructure from For API simplicity, the Tx and Rx queue context are programmed using separate ice_pack_txq_ctx() and ice_pack_rxq_ctx(). This avoids needing to export the packed_field_s arrays. The functions can pointers to the appropriate ice_txq_ctx_buf_t and ice_rxq_ctx_buf_t types, ensuring that only buffers of the appropriate size are passed. Signed-off-by: Jacob Keller Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20241210-packing-pack-fields-and-ice-implementation-v10-7-ee56a47479ac@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/Kconfig | 1 + drivers/net/ethernet/intel/ice/ice_base.c | 3 +- drivers/net/ethernet/intel/ice/ice_common.c | 243 +++--------------- drivers/net/ethernet/intel/ice/ice_common.h | 5 +- .../net/ethernet/intel/ice/ice_lan_tx_rx.h | 14 - 5 files changed, 42 insertions(+), 224 deletions(-) diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig index 20bc40eec487a..24ec9a4f1ffa8 100644 --- a/drivers/net/ethernet/intel/Kconfig +++ b/drivers/net/ethernet/intel/Kconfig @@ -292,6 +292,7 @@ config ICE select DIMLIB select LIBIE select NET_DEVLINK + select PACKING select PLDMFW select DPLL help diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index e7aaa06241210..5fe7b5a100202 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -910,8 +910,7 @@ ice_vsi_cfg_txq(struct ice_vsi *vsi, struct ice_tx_ring *ring, ice_setup_tx_ctx(ring, &tlan_ctx, pf_q); /* copy context contents into the qg_buf */ qg_buf->txqs[0].txq_id = cpu_to_le16(pf_q); - ice_set_ctx(hw, (u8 *)&tlan_ctx, (u8 *)&qg_buf->txqs[0].txq_ctx, - ice_tlan_ctx_info); + ice_pack_txq_ctx(&tlan_ctx, &qg_buf->txqs[0].txq_ctx); /* init queue specific tail reg. It is referred as * transmit comm scheduler queue doorbell. diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index 64bf25aab673c..8683c9ac6cedc 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -6,6 +6,7 @@ #include "ice_adminq_cmd.h" #include "ice_flow.h" #include "ice_ptp_hw.h" +#include #define ICE_PF_RESET_WAIT_COUNT 300 #define ICE_MAX_NETLIST_SIZE 10 @@ -1388,9 +1389,12 @@ static int ice_copy_rxq_ctx_to_hw(struct ice_hw *hw, return 0; } +#define ICE_CTX_STORE(struct_name, struct_field, width, lsb) \ + PACKED_FIELD((lsb) + (width) - 1, (lsb), struct struct_name, struct_field) + /* LAN Rx Queue Context */ -static const struct ice_ctx_ele ice_rlan_ctx_info[] = { - /* Field Width LSB */ +static const struct packed_field_u8 ice_rlan_ctx_fields[] = { + /* Field Width LSB */ ICE_CTX_STORE(ice_rlan_ctx, head, 13, 0), ICE_CTX_STORE(ice_rlan_ctx, cpuid, 8, 13), ICE_CTX_STORE(ice_rlan_ctx, base, 57, 32), @@ -1411,9 +1415,23 @@ static const struct ice_ctx_ele ice_rlan_ctx_info[] = { ICE_CTX_STORE(ice_rlan_ctx, tphhead_ena, 1, 196), ICE_CTX_STORE(ice_rlan_ctx, lrxqthresh, 3, 198), ICE_CTX_STORE(ice_rlan_ctx, prefena, 1, 201), - { 0 } }; +/** + * ice_pack_rxq_ctx - Pack Rx queue context into a HW buffer + * @ctx: the Rx queue context to pack + * @buf: the HW buffer to pack into + * + * Pack the Rx queue context from the CPU-friendly unpacked buffer into its + * bit-packed HW layout. + */ +static void ice_pack_rxq_ctx(const struct ice_rlan_ctx *ctx, + ice_rxq_ctx_buf_t *buf) +{ + pack_fields(buf, sizeof(*buf), ctx, ice_rlan_ctx_fields, + QUIRK_LITTLE_ENDIAN | QUIRK_LSW32_IS_FIRST); +} + /** * ice_write_rxq_ctx * @hw: pointer to the hardware structure @@ -1434,12 +1452,13 @@ int ice_write_rxq_ctx(struct ice_hw *hw, struct ice_rlan_ctx *rlan_ctx, rlan_ctx->prefena = 1; - ice_set_ctx(hw, (u8 *)rlan_ctx, (u8 *)&buf, ice_rlan_ctx_info); + ice_pack_rxq_ctx(rlan_ctx, &buf); + return ice_copy_rxq_ctx_to_hw(hw, &buf, rxq_index); } /* LAN Tx Queue Context */ -const struct ice_ctx_ele ice_tlan_ctx_info[] = { +static const struct packed_field_u8 ice_tlan_ctx_fields[] = { /* Field Width LSB */ ICE_CTX_STORE(ice_tlan_ctx, base, 57, 0), ICE_CTX_STORE(ice_tlan_ctx, port_num, 3, 57), @@ -1468,9 +1487,22 @@ const struct ice_ctx_ele ice_tlan_ctx_info[] = { ICE_CTX_STORE(ice_tlan_ctx, drop_ena, 1, 165), ICE_CTX_STORE(ice_tlan_ctx, cache_prof_idx, 2, 166), ICE_CTX_STORE(ice_tlan_ctx, pkt_shaper_prof_idx, 3, 168), - { 0 } }; +/** + * ice_pack_txq_ctx - Pack Tx queue context into a HW buffer + * @ctx: the Tx queue context to pack + * @buf: the HW buffer to pack into + * + * Pack the Tx queue context from the CPU-friendly unpacked buffer into its + * bit-packed HW layout. + */ +void ice_pack_txq_ctx(const struct ice_tlan_ctx *ctx, ice_txq_ctx_buf_t *buf) +{ + pack_fields(buf, sizeof(*buf), ctx, ice_tlan_ctx_fields, + QUIRK_LITTLE_ENDIAN | QUIRK_LSW32_IS_FIRST); +} + /* Sideband Queue command wrappers */ /** @@ -4554,205 +4586,6 @@ ice_aq_add_rdma_qsets(struct ice_hw *hw, u8 num_qset_grps, /* End of FW Admin Queue command wrappers */ -/** - * ice_pack_ctx_byte - write a byte to a packed context structure - * @src_ctx: unpacked source context structure - * @dest_ctx: packed destination context data - * @ce_info: context element description - */ -static void ice_pack_ctx_byte(u8 *src_ctx, u8 *dest_ctx, - const struct ice_ctx_ele *ce_info) -{ - u8 src_byte, dest_byte, mask; - u8 *from, *dest; - u16 shift_width; - - /* copy from the next struct field */ - from = src_ctx + ce_info->offset; - - /* prepare the bits and mask */ - shift_width = ce_info->lsb % 8; - mask = GENMASK(ce_info->width - 1 + shift_width, shift_width); - - src_byte = *from; - src_byte <<= shift_width; - src_byte &= mask; - - /* get the current bits from the target bit string */ - dest = dest_ctx + (ce_info->lsb / 8); - - memcpy(&dest_byte, dest, sizeof(dest_byte)); - - dest_byte &= ~mask; /* get the bits not changing */ - dest_byte |= src_byte; /* add in the new bits */ - - /* put it all back */ - memcpy(dest, &dest_byte, sizeof(dest_byte)); -} - -/** - * ice_pack_ctx_word - write a word to a packed context structure - * @src_ctx: unpacked source context structure - * @dest_ctx: packed destination context data - * @ce_info: context element description - */ -static void ice_pack_ctx_word(u8 *src_ctx, u8 *dest_ctx, - const struct ice_ctx_ele *ce_info) -{ - u16 src_word, mask; - __le16 dest_word; - u8 *from, *dest; - u16 shift_width; - - /* copy from the next struct field */ - from = src_ctx + ce_info->offset; - - /* prepare the bits and mask */ - shift_width = ce_info->lsb % 8; - mask = GENMASK(ce_info->width - 1 + shift_width, shift_width); - - /* don't swizzle the bits until after the mask because the mask bits - * will be in a different bit position on big endian machines - */ - src_word = *(u16 *)from; - src_word <<= shift_width; - src_word &= mask; - - /* get the current bits from the target bit string */ - dest = dest_ctx + (ce_info->lsb / 8); - - memcpy(&dest_word, dest, sizeof(dest_word)); - - dest_word &= ~(cpu_to_le16(mask)); /* get the bits not changing */ - dest_word |= cpu_to_le16(src_word); /* add in the new bits */ - - /* put it all back */ - memcpy(dest, &dest_word, sizeof(dest_word)); -} - -/** - * ice_pack_ctx_dword - write a dword to a packed context structure - * @src_ctx: unpacked source context structure - * @dest_ctx: packed destination context data - * @ce_info: context element description - */ -static void ice_pack_ctx_dword(u8 *src_ctx, u8 *dest_ctx, - const struct ice_ctx_ele *ce_info) -{ - u32 src_dword, mask; - __le32 dest_dword; - u8 *from, *dest; - u16 shift_width; - - /* copy from the next struct field */ - from = src_ctx + ce_info->offset; - - /* prepare the bits and mask */ - shift_width = ce_info->lsb % 8; - mask = GENMASK(ce_info->width - 1 + shift_width, shift_width); - - /* don't swizzle the bits until after the mask because the mask bits - * will be in a different bit position on big endian machines - */ - src_dword = *(u32 *)from; - src_dword <<= shift_width; - src_dword &= mask; - - /* get the current bits from the target bit string */ - dest = dest_ctx + (ce_info->lsb / 8); - - memcpy(&dest_dword, dest, sizeof(dest_dword)); - - dest_dword &= ~(cpu_to_le32(mask)); /* get the bits not changing */ - dest_dword |= cpu_to_le32(src_dword); /* add in the new bits */ - - /* put it all back */ - memcpy(dest, &dest_dword, sizeof(dest_dword)); -} - -/** - * ice_pack_ctx_qword - write a qword to a packed context structure - * @src_ctx: unpacked source context structure - * @dest_ctx: packed destination context data - * @ce_info: context element description - */ -static void ice_pack_ctx_qword(u8 *src_ctx, u8 *dest_ctx, - const struct ice_ctx_ele *ce_info) -{ - u64 src_qword, mask; - __le64 dest_qword; - u8 *from, *dest; - u16 shift_width; - - /* copy from the next struct field */ - from = src_ctx + ce_info->offset; - - /* prepare the bits and mask */ - shift_width = ce_info->lsb % 8; - mask = GENMASK_ULL(ce_info->width - 1 + shift_width, shift_width); - - /* don't swizzle the bits until after the mask because the mask bits - * will be in a different bit position on big endian machines - */ - src_qword = *(u64 *)from; - src_qword <<= shift_width; - src_qword &= mask; - - /* get the current bits from the target bit string */ - dest = dest_ctx + (ce_info->lsb / 8); - - memcpy(&dest_qword, dest, sizeof(dest_qword)); - - dest_qword &= ~(cpu_to_le64(mask)); /* get the bits not changing */ - dest_qword |= cpu_to_le64(src_qword); /* add in the new bits */ - - /* put it all back */ - memcpy(dest, &dest_qword, sizeof(dest_qword)); -} - -/** - * ice_set_ctx - set context bits in packed structure - * @hw: pointer to the hardware structure - * @src_ctx: pointer to a generic non-packed context structure - * @dest_ctx: pointer to memory for the packed structure - * @ce_info: List of Rx context elements - */ -int ice_set_ctx(struct ice_hw *hw, u8 *src_ctx, u8 *dest_ctx, - const struct ice_ctx_ele *ce_info) -{ - int f; - - for (f = 0; ce_info[f].width; f++) { - /* We have to deal with each element of the FW response - * using the correct size so that we are correct regardless - * of the endianness of the machine. - */ - if (ce_info[f].width > (ce_info[f].size_of * BITS_PER_BYTE)) { - ice_debug(hw, ICE_DBG_QCTX, "Field %d width of %d bits larger than size of %d byte(s) ... skipping write\n", - f, ce_info[f].width, ce_info[f].size_of); - continue; - } - switch (ce_info[f].size_of) { - case sizeof(u8): - ice_pack_ctx_byte(src_ctx, dest_ctx, &ce_info[f]); - break; - case sizeof(u16): - ice_pack_ctx_word(src_ctx, dest_ctx, &ce_info[f]); - break; - case sizeof(u32): - ice_pack_ctx_dword(src_ctx, dest_ctx, &ce_info[f]); - break; - case sizeof(u64): - ice_pack_ctx_qword(src_ctx, dest_ctx, &ce_info[f]); - break; - default: - return -EINVAL; - } - } - - return 0; -} - /** * ice_get_lan_q_ctx - get the LAN queue context for the given VSI and TC * @hw: pointer to the HW struct diff --git a/drivers/net/ethernet/intel/ice/ice_common.h b/drivers/net/ethernet/intel/ice/ice_common.h index 27208a60cece5..a68bea3934e35 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.h +++ b/drivers/net/ethernet/intel/ice/ice_common.h @@ -92,9 +92,8 @@ ice_aq_set_rss_key(struct ice_hw *hw, u16 vsi_handle, bool ice_check_sq_alive(struct ice_hw *hw, struct ice_ctl_q_info *cq); int ice_aq_q_shutdown(struct ice_hw *hw, bool unloading); void ice_fill_dflt_direct_cmd_desc(struct ice_aq_desc *desc, u16 opcode); -extern const struct ice_ctx_ele ice_tlan_ctx_info[]; -int ice_set_ctx(struct ice_hw *hw, u8 *src_ctx, u8 *dest_ctx, - const struct ice_ctx_ele *ce_info); + +void ice_pack_txq_ctx(const struct ice_tlan_ctx *ctx, ice_txq_ctx_buf_t *buf); extern struct mutex ice_global_cfg_lock_sw; diff --git a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h index a76e5b0e7861e..31d4a445d640d 100644 --- a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h +++ b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h @@ -408,20 +408,6 @@ struct ice_rlan_ctx { u8 prefena; /* NOTE: normally must be set to 1 at init */ }; -struct ice_ctx_ele { - u16 offset; - u16 size_of; - u16 width; - u16 lsb; -}; - -#define ICE_CTX_STORE(_struct, _ele, _width, _lsb) { \ - .offset = offsetof(struct _struct, _ele), \ - .size_of = sizeof_field(struct _struct, _ele), \ - .width = _width, \ - .lsb = _lsb, \ -} - /* for hsplit_0 field of Rx RLAN context */ enum ice_rlan_ctx_rx_hsplit_0 { ICE_RLAN_RX_HSPLIT_0_NO_SPLIT = 0, From f72588a4267b2211d18328433035b629d24f6f03 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Tue, 10 Dec 2024 12:27:17 -0800 Subject: [PATCH 0440/1450] ice: reduce size of queue context fields The ice_rlan_ctx and ice_tlan_ctx structures have some fields which are intentionally sized larger than necessary relative to the packed sizes the data must fit into. This was done because the original ice_set_ctx() function and its helpers did not correctly handle packing when the packed bits straddled a byte. This is no longer the case with the use of the implementation. Save some bytes in these structures by sizing the variables to the number of bytes the actual bitpacked fields fit into. There are a couple of gaps left in the structure, which is a result of the fields being in the order they appear in the packed bit layout, but where alignment forces some extra gaps. We could fix this, saving ~8 bytes from each structure. However, these structures are not used heavily, and the resulting savings is minimal: $ bloat-o-meter ice-before-reorder.ko ice-after-reorder.ko add/remove: 0/0 grow/shrink: 1/1 up/down: 26/-70 (-44) Function old new delta ice_vsi_cfg_txq 1873 1899 +26 ice_setup_rx_ctx.constprop 1529 1459 -70 Total: Before=1459555, After=1459511, chg -0.00% Thus, the fields are left in the same order as the packed bit layout, despite the gaps this causes. Signed-off-by: Jacob Keller Reviewed-by: Przemek Kitszel Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20241210-packing-pack-fields-and-ice-implementation-v10-8-ee56a47479ac@intel.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/intel/ice/ice_lan_tx_rx.h | 32 ++++++------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h index 31d4a445d640d..1479b45738af1 100644 --- a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h +++ b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h @@ -375,23 +375,17 @@ enum ice_rx_flex_desc_status_error_1_bits { #define ICE_TX_DRBELL_Q_CTX_SIZE_DWORDS 5 #define GLTCLAN_CQ_CNTX(i, CQ) (GLTCLAN_CQ_CNTX0(CQ) + ((i) * 0x0800)) -/* RLAN Rx queue context data - * - * The sizes of the variables may be larger than needed due to crossing byte - * boundaries. If we do not have the width of the variable set to the correct - * size then we could end up shifting bits off the top of the variable when the - * variable is at the top of a byte and crosses over into the next byte. - */ +/* RLAN Rx queue context data */ struct ice_rlan_ctx { u16 head; - u16 cpuid; /* bigger than needed, see above for reason */ + u8 cpuid; #define ICE_RLAN_BASE_S 7 u64 base; u16 qlen; #define ICE_RLAN_CTX_DBUF_S 7 - u16 dbuf; /* bigger than needed, see above for reason */ + u8 dbuf; #define ICE_RLAN_CTX_HBUF_S 6 - u16 hbuf; /* bigger than needed, see above for reason */ + u8 hbuf; u8 dtype; u8 dsize; u8 crcstrip; @@ -399,12 +393,12 @@ struct ice_rlan_ctx { u8 hsplit_0; u8 hsplit_1; u8 showiv; - u32 rxmax; /* bigger than needed, see above for reason */ + u16 rxmax; u8 tphrdesc_ena; u8 tphwdesc_ena; u8 tphdata_ena; u8 tphhead_ena; - u16 lrxqthresh; /* bigger than needed, see above for reason */ + u8 lrxqthresh; u8 prefena; /* NOTE: normally must be set to 1 at init */ }; @@ -535,18 +529,12 @@ enum ice_tx_ctx_desc_eipt_offload { #define ICE_LAN_TXQ_MAX_QGRPS 127 #define ICE_LAN_TXQ_MAX_QDIS 1023 -/* Tx queue context data - * - * The sizes of the variables may be larger than needed due to crossing byte - * boundaries. If we do not have the width of the variable set to the correct - * size then we could end up shifting bits off the top of the variable when the - * variable is at the top of a byte and crosses over into the next byte. - */ +/* Tx queue context data */ struct ice_tlan_ctx { #define ICE_TLAN_CTX_BASE_S 7 u64 base; /* base is defined in 128-byte units */ u8 port_num; - u16 cgd_num; /* bigger than needed, see above for reason */ + u8 cgd_num; u8 pf_num; u16 vmvf_num; u8 vmvf_type; @@ -557,7 +545,7 @@ struct ice_tlan_ctx { u8 tsyn_ena; u8 internal_usage_flag; u8 alt_vlan; - u16 cpuid; /* bigger than needed, see above for reason */ + u8 cpuid; u8 wb_mode; u8 tphrd_desc; u8 tphrd; @@ -566,7 +554,7 @@ struct ice_tlan_ctx { u16 qnum_in_func; u8 itr_notification_mode; u8 adjust_prof_id; - u32 qlen; /* bigger than needed, see above for reason */ + u16 qlen; u8 quanta_prof_idx; u8 tso_ena; u16 tso_qnum; From ac001acc4d353455dd9f7b8e74d2f9c01e83ace2 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Tue, 10 Dec 2024 12:27:18 -0800 Subject: [PATCH 0441/1450] ice: move prefetch enable to ice_setup_rx_ctx The ice_write_rxq_ctx() function is responsible for programming the Rx Queue context into hardware. It receives the configuration in unpacked form via the ice_rlan_ctx structure. This function unconditionally modifies the context to set the prefetch enable bit. This was done by commit c31a5c25bb19 ("ice: Always set prefena when configuring an Rx queue"). Setting this bit makes sense, since prefetching descriptors is almost always the preferred behavior. However, the ice_write_rxq_ctx() function is not the place that actually defines the queue context. We initialize the Rx Queue context in ice_setup_rx_ctx(). It is surprising to have the Rx queue context changed by a function who's responsibility is to program the given context to hardware. Following the principle of least surprise, move the setting of the prefetch enable bit out of ice_write_rxq_ctx() and into the ice_setup_rx_ctx(). Signed-off-by: Jacob Keller Reviewed-by: Przemek Kitszel Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20241210-packing-pack-fields-and-ice-implementation-v10-9-ee56a47479ac@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_base.c | 3 +++ drivers/net/ethernet/intel/ice/ice_common.c | 9 +++------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index 5fe7b5a100202..b2af8e3586f76 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -454,6 +454,9 @@ static int ice_setup_rx_ctx(struct ice_rx_ring *ring) /* Rx queue threshold in units of 64 */ rlan_ctx.lrxqthresh = 1; + /* Enable descriptor prefetch */ + rlan_ctx.prefena = 1; + /* PF acts as uplink for switchdev; set flex descriptor with src_vsi * metadata and flags to allow redirecting to PR netdev */ diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index 8683c9ac6cedc..4c6cc48aaef0c 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -1433,14 +1433,13 @@ static void ice_pack_rxq_ctx(const struct ice_rlan_ctx *ctx, } /** - * ice_write_rxq_ctx + * ice_write_rxq_ctx - Write Rx Queue context to hardware * @hw: pointer to the hardware structure * @rlan_ctx: pointer to the rxq context * @rxq_index: the index of the Rx queue * - * Converts rxq context from sparse to dense structure and then writes - * it to HW register space and enables the hardware to prefetch descriptors - * instead of only fetching them on demand + * Pack the sparse Rx Queue context into dense hardware format and write it + * into the HW register space. */ int ice_write_rxq_ctx(struct ice_hw *hw, struct ice_rlan_ctx *rlan_ctx, u32 rxq_index) @@ -1450,8 +1449,6 @@ int ice_write_rxq_ctx(struct ice_hw *hw, struct ice_rlan_ctx *rlan_ctx, if (!rlan_ctx) return -EINVAL; - rlan_ctx->prefena = 1; - ice_pack_rxq_ctx(rlan_ctx, &buf); return ice_copy_rxq_ctx_to_hw(hw, &buf, rxq_index); From 39be64c34ca303094d11a23fc9b36e73e3adb9dc Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Tue, 10 Dec 2024 12:27:19 -0800 Subject: [PATCH 0442/1450] ice: cleanup Rx queue context programming functions The ice_copy_rxq_ctx_to_hw() and ice_write_rxq_ctx() functions perform some defensive checks which are typically frowned upon by kernel style guidelines. In particular, NULL checks on buffers which point to the stack are discouraged, especially when the functions are static and only called once. Checks of this sort only serve to hide potential programming error, as we will not produce the normal crash dump on a NULL access. In addition, ice_copy_rxq_ctx_to_hw() cannot fail in another way, so could be made void. Future support for VF Live Migration will need to introduce an inverse function for reading Rx queue context from HW registers to unpack it, as well as functions to pack and unpack Tx queue context from HW. Rather than copying these style issues into the new functions, lets first cleanup the existing code. For the ice_copy_rxq_ctx_to_hw() function: * Move the Rx queue index check out of this function. * Convert the function to a void return. * Use a simple int variable instead of a u8 for the for loop index, and initialize it inside the for loop. * Update the function description to better align with kernel doc style. For the ice_write_rxq_ctx() function: * Move the Rx queue index check into this function. * Update the function description with a Returns: to align with kernel doc style. These changes align the existing write functions to current kernel style, and will align with the style of the new functions added when we implement live migration in a future series. Signed-off-by: Jacob Keller Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20241210-packing-pack-fields-and-ice-implementation-v10-10-ee56a47479ac@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_common.c | 28 ++++++++------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index 4c6cc48aaef0c..f89bc6ede3157 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -1361,32 +1361,23 @@ int ice_reset(struct ice_hw *hw, enum ice_reset_req req) } /** - * ice_copy_rxq_ctx_to_hw + * ice_copy_rxq_ctx_to_hw - Copy packed Rx queue context to HW registers * @hw: pointer to the hardware structure * @rxq_ctx: pointer to the packed Rx queue context * @rxq_index: the index of the Rx queue - * - * Copies rxq context from dense structure to HW register space */ -static int ice_copy_rxq_ctx_to_hw(struct ice_hw *hw, - const ice_rxq_ctx_buf_t *rxq_ctx, - u32 rxq_index) +static void ice_copy_rxq_ctx_to_hw(struct ice_hw *hw, + const ice_rxq_ctx_buf_t *rxq_ctx, + u32 rxq_index) { - u8 i; - - if (rxq_index > QRX_CTRL_MAX_INDEX) - return -EINVAL; - /* Copy each dword separately to HW */ - for (i = 0; i < ICE_RXQ_CTX_SIZE_DWORDS; i++) { + for (int i = 0; i < ICE_RXQ_CTX_SIZE_DWORDS; i++) { u32 ctx = ((const u32 *)rxq_ctx)[i]; wr32(hw, QRX_CONTEXT(i, rxq_index), ctx); ice_debug(hw, ICE_DBG_QCTX, "qrxdata[%d]: %08X\n", i, ctx); } - - return 0; } #define ICE_CTX_STORE(struct_name, struct_field, width, lsb) \ @@ -1435,23 +1426,26 @@ static void ice_pack_rxq_ctx(const struct ice_rlan_ctx *ctx, /** * ice_write_rxq_ctx - Write Rx Queue context to hardware * @hw: pointer to the hardware structure - * @rlan_ctx: pointer to the rxq context + * @rlan_ctx: pointer to the unpacked Rx queue context * @rxq_index: the index of the Rx queue * * Pack the sparse Rx Queue context into dense hardware format and write it * into the HW register space. + * + * Return: 0 on success, or -EINVAL if the Rx queue index is invalid. */ int ice_write_rxq_ctx(struct ice_hw *hw, struct ice_rlan_ctx *rlan_ctx, u32 rxq_index) { ice_rxq_ctx_buf_t buf = {}; - if (!rlan_ctx) + if (rxq_index > QRX_CTRL_MAX_INDEX) return -EINVAL; ice_pack_rxq_ctx(rlan_ctx, &buf); + ice_copy_rxq_ctx_to_hw(hw, &buf, rxq_index); - return ice_copy_rxq_ctx_to_hw(hw, &buf, rxq_index); + return 0; } /* LAN Tx Queue Context */ From d51cfd5f4fe01cd3d212703c8fe5dd6886da969c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 10 Dec 2024 18:33:50 +0000 Subject: [PATCH 0443/1450] ipv6: mcast: reduce ipv6_chk_mcast_addr() indentation Add a label and two gotos to shorten lines by two tabulations, to ease code review of following patches. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20241210183352.86530-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/mcast.c | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index b244dbf61d5f3..afe707b6841d1 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1021,29 +1021,31 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, rcu_read_lock(); idev = __in6_dev_get(dev); - if (idev) { - for_each_mc_rcu(idev, mc) { - if (ipv6_addr_equal(&mc->mca_addr, group)) - break; - } - if (mc) { - if (src_addr && !ipv6_addr_any(src_addr)) { - struct ip6_sf_list *psf; + if (!idev) + goto unlock; + for_each_mc_rcu(idev, mc) { + if (ipv6_addr_equal(&mc->mca_addr, group)) + break; + } + if (!mc) + goto unlock; + if (src_addr && !ipv6_addr_any(src_addr)) { + struct ip6_sf_list *psf; - for_each_psf_rcu(mc, psf) { - if (ipv6_addr_equal(&psf->sf_addr, src_addr)) - break; - } - if (psf) - rv = psf->sf_count[MCAST_INCLUDE] || - psf->sf_count[MCAST_EXCLUDE] != - mc->mca_sfcount[MCAST_EXCLUDE]; - else - rv = mc->mca_sfcount[MCAST_EXCLUDE] != 0; - } else - rv = true; /* don't filter unspecified source */ + for_each_psf_rcu(mc, psf) { + if (ipv6_addr_equal(&psf->sf_addr, src_addr)) + break; } + if (psf) + rv = psf->sf_count[MCAST_INCLUDE] || + psf->sf_count[MCAST_EXCLUDE] != + mc->mca_sfcount[MCAST_EXCLUDE]; + else + rv = mc->mca_sfcount[MCAST_EXCLUDE] != 0; + } else { + rv = true; /* don't filter unspecified source */ } +unlock: rcu_read_unlock(); return rv; } From 626962911ad886f2b8e6d6f612289f9c7268b435 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 10 Dec 2024 18:33:51 +0000 Subject: [PATCH 0444/1450] ipv6: mcast: annotate data-races around mc->mca_sfcount[MCAST_EXCLUDE] mc->mca_sfcount[MCAST_EXCLUDE] is read locklessly from ipv6_chk_mcast_addr(). Add READ_ONCE() and WRITE_ONCE() annotations accordingly. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20241210183352.86530-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/mcast.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index afe707b6841d1..09622142b0705 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1039,9 +1039,9 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, if (psf) rv = psf->sf_count[MCAST_INCLUDE] || psf->sf_count[MCAST_EXCLUDE] != - mc->mca_sfcount[MCAST_EXCLUDE]; + READ_ONCE(mc->mca_sfcount[MCAST_EXCLUDE]); else - rv = mc->mca_sfcount[MCAST_EXCLUDE] != 0; + rv = READ_ONCE(mc->mca_sfcount[MCAST_EXCLUDE]) != 0; } else { rv = true; /* don't filter unspecified source */ } @@ -2505,7 +2505,8 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, sf_markstate(pmc); isexclude = pmc->mca_sfmode == MCAST_EXCLUDE; if (!delta) - pmc->mca_sfcount[sfmode]++; + WRITE_ONCE(pmc->mca_sfcount[sfmode], + pmc->mca_sfcount[sfmode] + 1); err = 0; for (i = 0; i < sfcount; i++) { err = ip6_mc_add1_src(pmc, sfmode, &psfsrc[i]); @@ -2516,7 +2517,8 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, int j; if (!delta) - pmc->mca_sfcount[sfmode]--; + WRITE_ONCE(pmc->mca_sfcount[sfmode], + pmc->mca_sfcount[sfmode] - 1); for (j = 0; j < i; j++) ip6_mc_del1_src(pmc, sfmode, &psfsrc[j]); } else if (isexclude != (pmc->mca_sfcount[MCAST_EXCLUDE] != 0)) { @@ -2561,7 +2563,8 @@ static void ip6_mc_clear_src(struct ifmcaddr6 *pmc) RCU_INIT_POINTER(pmc->mca_sources, NULL); pmc->mca_sfmode = MCAST_EXCLUDE; pmc->mca_sfcount[MCAST_INCLUDE] = 0; - pmc->mca_sfcount[MCAST_EXCLUDE] = 1; + /* Paired with the READ_ONCE() from ipv6_chk_mcast_addr() */ + WRITE_ONCE(pmc->mca_sfcount[MCAST_EXCLUDE], 1); } /* called with mc_lock */ From 00bf2032e97691c4b53427b33a85b134324e2a94 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 10 Dec 2024 18:33:52 +0000 Subject: [PATCH 0445/1450] ipv6: mcast: annotate data-race around psf->sf_count[MCAST_XXX] psf->sf_count[MCAST_XXX] fields are read locklessly from ipv6_chk_mcast_addr() and igmp6_mcf_seq_show(). Add READ_ONCE() and WRITE_ONCE() annotations accordingly. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20241210183352.86530-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/mcast.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 09622142b0705..5ca8692d565d5 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1037,8 +1037,8 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, break; } if (psf) - rv = psf->sf_count[MCAST_INCLUDE] || - psf->sf_count[MCAST_EXCLUDE] != + rv = READ_ONCE(psf->sf_count[MCAST_INCLUDE]) || + READ_ONCE(psf->sf_count[MCAST_EXCLUDE]) != READ_ONCE(mc->mca_sfcount[MCAST_EXCLUDE]); else rv = READ_ONCE(mc->mca_sfcount[MCAST_EXCLUDE]) != 0; @@ -2287,7 +2287,7 @@ static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, /* source filter not found, or count wrong => bug */ return -ESRCH; } - psf->sf_count[sfmode]--; + WRITE_ONCE(psf->sf_count[sfmode], psf->sf_count[sfmode] - 1); if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) { struct inet6_dev *idev = pmc->idev; @@ -2393,7 +2393,7 @@ static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode, rcu_assign_pointer(pmc->mca_sources, psf); } } - psf->sf_count[sfmode]++; + WRITE_ONCE(psf->sf_count[sfmode], psf->sf_count[sfmode] + 1); return 0; } @@ -3079,8 +3079,8 @@ static int igmp6_mcf_seq_show(struct seq_file *seq, void *v) state->dev->ifindex, state->dev->name, &state->im->mca_addr, &psf->sf_addr, - psf->sf_count[MCAST_INCLUDE], - psf->sf_count[MCAST_EXCLUDE]); + READ_ONCE(psf->sf_count[MCAST_INCLUDE]), + READ_ONCE(psf->sf_count[MCAST_EXCLUDE])); } return 0; } From 19ce8cd3046587efbd2c6253947be7c22dfccc18 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Mon, 9 Dec 2024 20:38:03 +0100 Subject: [PATCH 0446/1450] tcp: Measure TIME-WAIT reuse delay with millisecond precision Prepare ground for TIME-WAIT socket reuse with subsecond delay. Today the last TS.Recent update timestamp, recorded in seconds and stored tp->ts_recent_stamp and tw->tw_ts_recent_stamp fields, has two purposes. Firstly, it is used to track the age of the last recorded TS.Recent value to detect when that value becomes outdated due to potential wrap-around of the other TCP timestamp clock (RFC 7323, section 5.5). For this purpose a second-based timestamp is completely sufficient as even in the worst case scenario of a peer using a high resolution microsecond timestamp, the wrap-around interval is ~36 minutes long. Secondly, it serves as a threshold value for allowing TIME-WAIT socket reuse. A TIME-WAIT socket can be reused only once the virtual 1 Hz clock, ktime_get_seconds, is past the TS.Recent update timestamp. The purpose behind delaying the TIME-WAIT socket reuse is to wait for the other TCP timestamp clock to tick at least once before reusing the connection. It is only then that the PAWS mechanism for the reopened connection can detect old duplicate segments from the previous connection incarnation (RFC 7323, appendix B.2). In this case using a timestamp with second resolution not only blocks the way toward allowing faster TIME-WAIT reuse after shorter subsecond delay, but also makes it impossible to reliably delay TW reuse by one second. As Eric Dumazet has pointed out [1], due to timestamp rounding, the TW reuse delay will actually be between (0, 1] seconds, and 0.5 seconds on average. We delay TW reuse for one full second only when last TS.Recent update coincides with our virtual 1 Hz clock tick. Considering the above, introduce a dedicated field to store a millisecond timestamp of transition into the TIME-WAIT state. Place it in an existing 4-byte hole inside inet_timewait_sock structure to avoid an additional memory cost. Use the new timestamp to (i) reliably delay TIME-WAIT reuse by one second, and (ii) prepare for configurable subsecond reuse delay in the subsequent change. We assume here that a full one second delay was the original intention in [2] because it accounts for the worst case scenario of the other TCP using the slowest recommended 1 Hz timestamp clock. A more involved alternative would be to change the resolution of the last TS.Recent update timestamp, tw->tw_ts_recent_stamp, to milliseconds. [1] https://lore.kernel.org/netdev/CANn89iKB4GFd8sVzCbRttqw_96o3i2wDhX-3DraQtsceNGYwug@mail.gmail.com/ [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b8439924316d5bcb266d165b93d632a4b4b859af Signed-off-by: Jakub Sitnicki Reviewed-by: Eric Dumazet Reviewed-by: Jason Xing Link: https://patch.msgid.link/20241209-jakub-krn-909-poc-msec-tw-tstamp-v2-1-66aca0eed03e@cloudflare.com Signed-off-by: Jakub Kicinski --- include/net/inet_timewait_sock.h | 4 ++++ net/ipv4/tcp_ipv4.c | 5 +++-- net/ipv4/tcp_minisocks.c | 7 ++++++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 62c0a7e65d6bd..67a3135757809 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -74,6 +74,10 @@ struct inet_timewait_sock { tw_tos : 8; u32 tw_txhash; u32 tw_priority; + /** + * @tw_reuse_stamp: Time of entry into %TCP_TIME_WAIT state in msec. + */ + u32 tw_entry_stamp; struct timer_list tw_timer; struct inet_bind_bucket *tw_tb; struct inet_bind2_bucket *tw_tb2; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a38c8b1f44dbd..3b6ba1d16921e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -120,6 +120,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); struct tcp_sock *tp = tcp_sk(sk); int ts_recent_stamp; + u32 reuse_thresh; if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) reuse = 0; @@ -162,9 +163,9 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) and use initial timestamp retrieved from peer table. */ ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp); + reuse_thresh = READ_ONCE(tw->tw_entry_stamp) + MSEC_PER_SEC; if (ts_recent_stamp && - (!twp || (reuse && time_after32(ktime_get_seconds(), - ts_recent_stamp)))) { + (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) { /* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk * and releasing the bucket lock. */ diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 7121d8573928c..b089b08e96178 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -157,8 +157,11 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, rcv_nxt); if (tmp_opt.saw_tstamp) { + u64 ts = tcp_clock_ms(); + + WRITE_ONCE(tw->tw_entry_stamp, ts); WRITE_ONCE(tcptw->tw_ts_recent_stamp, - ktime_get_seconds()); + div_u64(ts, MSEC_PER_SEC)); WRITE_ONCE(tcptw->tw_ts_recent, tmp_opt.rcv_tsval); } @@ -316,6 +319,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tw->tw_mark = sk->sk_mark; tw->tw_priority = READ_ONCE(sk->sk_priority); tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; + /* refreshed when we enter true TIME-WAIT state */ + tw->tw_entry_stamp = tcp_time_stamp_ms(tp); tcptw->tw_rcv_nxt = tp->rcv_nxt; tcptw->tw_snd_nxt = tp->snd_nxt; tcptw->tw_rcv_wnd = tcp_receive_window(tp); From ca6a6f93867a9763bdf8685c788e2e558d10975f Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Mon, 9 Dec 2024 20:38:04 +0100 Subject: [PATCH 0447/1450] tcp: Add sysctl to configure TIME-WAIT reuse delay Today we have a hardcoded delay of 1 sec before a TIME-WAIT socket can be reused by reopening a connection. This is a safe choice based on an assumption that the other TCP timestamp clock frequency, which is unknown to us, may be as low as 1 Hz (RFC 7323, section 5.4). However, this means that in the presence of short lived connections with an RTT of couple of milliseconds, the time during which a 4-tuple is blocked from reuse can be orders of magnitude longer that the connection lifetime. Combined with a reduced pool of ephemeral ports, when using IP_LOCAL_PORT_RANGE to share an egress IP address between hosts [1], the long TIME-WAIT reuse delay can lead to port exhaustion, where all available 4-tuples are tied up in TIME-WAIT state. Turn the reuse delay into a per-netns setting so that sysadmins can make more aggressive assumptions about remote TCP timestamp clock frequency and shorten the delay in order to allow connections to reincarnate faster. Note that applications can completely bypass the TIME-WAIT delay protection already today by locking the local port with bind() before connecting. Such immediate connection reuse may result in PAWS failing to detect old duplicate segments, leaving us with just the sequence number check as a safety net. This new configurable offers a trade off where the sysadmin can balance between the risk of PAWS detection failing to act versus exhausting ports by having sockets tied up in TIME-WAIT state for too long. [1] https://lpc.events/event/16/contributions/1349/ Signed-off-by: Jakub Sitnicki Reviewed-by: Eric Dumazet Reviewed-by: Jason Xing Link: https://patch.msgid.link/20241209-jakub-krn-909-poc-msec-tw-tstamp-v2-2-66aca0eed03e@cloudflare.com Signed-off-by: Jakub Kicinski --- Documentation/networking/ip-sysctl.rst | 14 ++++++++++++++ .../net_cachelines/netns_ipv4_sysctl.rst | 1 + include/net/netns/ipv4.h | 1 + net/ipv4/sysctl_net_ipv4.c | 10 ++++++++++ net/ipv4/tcp_ipv4.c | 4 +++- 5 files changed, 29 insertions(+), 1 deletion(-) diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index eacf8983e2307..2f2b00295836b 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -1000,6 +1000,20 @@ tcp_tw_reuse - INTEGER Default: 2 +tcp_tw_reuse_delay - UNSIGNED INTEGER + The delay in milliseconds before a TIME-WAIT socket can be reused by a + new connection, if TIME-WAIT socket reuse is enabled. The actual reuse + threshold is within [N, N+1] range, where N is the requested delay in + milliseconds, to ensure the delay interval is never shorter than the + configured value. + + This setting contains an assumption about the other TCP timestamp clock + tick interval. It should not be set to a value lower than the peer's + clock tick for PAWS (Protection Against Wrapped Sequence numbers) + mechanism work correctly for the reused connection. + + Default: 1000 (milliseconds) + tcp_window_scaling - BOOLEAN Enable window scaling as defined in RFC1323. diff --git a/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst b/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst index 629da6dc6d746..de0263302f16d 100644 --- a/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst +++ b/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst @@ -79,6 +79,7 @@ u8 sysctl_tcp_retries1 u8 sysctl_tcp_retries2 u8 sysctl_tcp_orphan_retries u8 sysctl_tcp_tw_reuse timewait_sock_ops +unsigned_int sysctl_tcp_tw_reuse_delay timewait_sock_ops int sysctl_tcp_fin_timeout TCP_LAST_ACK/tcp_rcv_state_process unsigned_int sysctl_tcp_notsent_lowat read_mostly tcp_notsent_lowat/tcp_stream_memory_free u8 sysctl_tcp_sack tcp_syn_options diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 3c014170e0012..46452da352061 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -175,6 +175,7 @@ struct netns_ipv4 { u8 sysctl_tcp_retries2; u8 sysctl_tcp_orphan_retries; u8 sysctl_tcp_tw_reuse; + unsigned int sysctl_tcp_tw_reuse_delay; int sysctl_tcp_fin_timeout; u8 sysctl_tcp_sack; u8 sysctl_tcp_window_scaling; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index a79b2a52ce01e..42cb5dc9cb245 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -45,6 +45,7 @@ static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024; static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX; static int tcp_plb_max_rounds = 31; static int tcp_plb_max_cong_thresh = 256; +static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -1065,6 +1066,15 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, + { + .procname = "tcp_tw_reuse_delay", + .data = &init_net.ipv4.sysctl_tcp_tw_reuse_delay, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = &tcp_tw_reuse_delay_max, + }, { .procname = "tcp_max_syn_backlog", .data = &init_net.ipv4.sysctl_max_syn_backlog, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3b6ba1d16921e..e45222d5fc2e2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -163,7 +163,8 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) and use initial timestamp retrieved from peer table. */ ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp); - reuse_thresh = READ_ONCE(tw->tw_entry_stamp) + MSEC_PER_SEC; + reuse_thresh = READ_ONCE(tw->tw_entry_stamp) + + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay); if (ts_recent_stamp && (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) { /* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk @@ -3458,6 +3459,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; net->ipv4.sysctl_tcp_tw_reuse = 2; + net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC; net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1); From 175dd9079ecbd86d0e10927c442d64519baf5809 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 10 Dec 2024 10:45:37 +0100 Subject: [PATCH 0448/1450] mlxsw: spectrum_flower: Do not allow mixing sample and mirror actions The device does not support multiple mirror actions per rule and the driver rejects such configuration: # tc filter add dev swp1 ingress pref 1 proto ip flower skip_sw action mirred egress mirror dev swp2 action mirred egress mirror dev swp3 Error: mlxsw_spectrum: Multiple mirror actions per rule are not supported. We have an error talking to the kernel Internally, the sample action is implemented by the device by mirroring to the CPU port. Therefore, mixing sample and mirror actions in a single rule does not work correctly and results in the last action effect. Solve by rejecting such misconfiguration: # tc filter add dev swp1 ingress pref 1 proto ip flower skip_sw action mirred egress mirror dev swp2 action sample rate 100 group 1 Error: mlxsw_spectrum: Sample action after mirror action is not supported. We have an error talking to the kernel # tc filter add dev swp1 ingress pref 1 proto ip flower skip_sw action sample rate 100 group 1 action mirred egress mirror dev swp2 Error: mlxsw_spectrum: Mirror action after sample action is not supported. We have an error talking to the kernel Reported-by: Vladyslav Mykhaliuk Signed-off-by: Ido Schimmel Reviewed-by: Amit Cohen Reviewed-by: Jiri Pirko Signed-off-by: Petr Machata Reviewed-by: Michal Swiatkowski Reviewed-by: Kalesh AP Link: https://patch.msgid.link/d6c979914e8706dbe1dedbaf29ffffb0b8d71166.1733822570.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c index f07955b5439f6..6a4a81c63451c 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c @@ -192,6 +192,11 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp, return -EOPNOTSUPP; } + if (sample_act_count) { + NL_SET_ERR_MSG_MOD(extack, "Mirror action after sample action is not supported"); + return -EOPNOTSUPP; + } + err = mlxsw_sp_acl_rulei_act_mirror(mlxsw_sp, rulei, block, out_dev, extack); @@ -265,6 +270,11 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp, return -EOPNOTSUPP; } + if (mirror_act_count) { + NL_SET_ERR_MSG_MOD(extack, "Sample action after mirror action is not supported"); + return -EOPNOTSUPP; + } + err = mlxsw_sp_acl_rulei_act_sample(mlxsw_sp, rulei, block, act->sample.psample_group, From 3fa2540d93d85ad18456dbd29386c737ad3f7e02 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 10 Dec 2024 12:38:26 +0000 Subject: [PATCH 0449/1450] net: fec: use phydev->eee_cfg.tx_lpi_timer Rather than maintaining a private copy of the LPI timer, make use of the LPI timer maintained by phylib. In any case, phylib overwrites the value of tx_lpi_timer set by the driver in phy_ethtool_get_eee(). Note that feb->eee.tx_lpi_timer is initialised to zero, which is just the same with phylib's copy, so there should be no functional change. Signed-off-by: Russell King (Oracle) Tested-by: Wei Fang Link: https://patch.msgid.link/E1tKzVS-006c67-IJ@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/fec.h | 2 -- drivers/net/ethernet/freescale/fec_main.c | 16 ++++++---------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h index 1cca0425d4939..c81f2ea588f26 100644 --- a/drivers/net/ethernet/freescale/fec.h +++ b/drivers/net/ethernet/freescale/fec.h @@ -671,8 +671,6 @@ struct fec_enet_private { unsigned int tx_time_itr; unsigned int itr_clk_rate; - /* tx lpi eee mode */ - struct ethtool_keee eee; unsigned int clk_ref_rate; /* ptp clock period in ns*/ diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 1b55047c0237c..b2daed55bf6c6 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -2045,14 +2045,14 @@ static int fec_enet_us_to_tx_cycle(struct net_device *ndev, int us) return us * (fep->clk_ref_rate / 1000) / 1000; } -static int fec_enet_eee_mode_set(struct net_device *ndev, bool enable) +static int fec_enet_eee_mode_set(struct net_device *ndev, u32 lpi_timer, + bool enable) { struct fec_enet_private *fep = netdev_priv(ndev); - struct ethtool_keee *p = &fep->eee; unsigned int sleep_cycle, wake_cycle; if (enable) { - sleep_cycle = fec_enet_us_to_tx_cycle(ndev, p->tx_lpi_timer); + sleep_cycle = fec_enet_us_to_tx_cycle(ndev, lpi_timer); wake_cycle = sleep_cycle; } else { sleep_cycle = 0; @@ -2105,7 +2105,9 @@ static void fec_enet_adjust_link(struct net_device *ndev) napi_enable(&fep->napi); } if (fep->quirks & FEC_QUIRK_HAS_EEE) - fec_enet_eee_mode_set(ndev, phy_dev->enable_tx_lpi); + fec_enet_eee_mode_set(ndev, + phy_dev->eee_cfg.tx_lpi_timer, + phy_dev->enable_tx_lpi); } else { if (fep->link) { netif_stop_queue(ndev); @@ -3181,7 +3183,6 @@ static int fec_enet_get_eee(struct net_device *ndev, struct ethtool_keee *edata) { struct fec_enet_private *fep = netdev_priv(ndev); - struct ethtool_keee *p = &fep->eee; if (!(fep->quirks & FEC_QUIRK_HAS_EEE)) return -EOPNOTSUPP; @@ -3189,8 +3190,6 @@ fec_enet_get_eee(struct net_device *ndev, struct ethtool_keee *edata) if (!netif_running(ndev)) return -ENETDOWN; - edata->tx_lpi_timer = p->tx_lpi_timer; - return phy_ethtool_get_eee(ndev->phydev, edata); } @@ -3198,7 +3197,6 @@ static int fec_enet_set_eee(struct net_device *ndev, struct ethtool_keee *edata) { struct fec_enet_private *fep = netdev_priv(ndev); - struct ethtool_keee *p = &fep->eee; if (!(fep->quirks & FEC_QUIRK_HAS_EEE)) return -EOPNOTSUPP; @@ -3206,8 +3204,6 @@ fec_enet_set_eee(struct net_device *ndev, struct ethtool_keee *edata) if (!netif_running(ndev)) return -ENETDOWN; - p->tx_lpi_timer = edata->tx_lpi_timer; - return phy_ethtool_set_eee(ndev->phydev, edata); } From 66c366392e55ae07e37699eeacca50f01b0bb879 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 10 Dec 2024 14:18:11 +0000 Subject: [PATCH 0450/1450] net: dsa: remove check for dp->pl in EEE methods When user ports are initialised, a phylink instance is always created, and so dp->pl will always be non-NULL. The EEE methods are only used for user ports, so checking for dp->pl to be NULL makes no sense. No other phylink-calling method implements similar checks in DSA. Remove this unnecessary check. Signed-off-by: Russell King (Oracle) Reviewed-by: Florian Fainelli Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/E1tL13z-006cZ7-BZ@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- net/dsa/user.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/dsa/user.c b/net/dsa/user.c index c736c019e2af9..e1a0b153c3535 100644 --- a/net/dsa/user.c +++ b/net/dsa/user.c @@ -1230,7 +1230,7 @@ static int dsa_user_set_eee(struct net_device *dev, struct ethtool_keee *e) int ret; /* Port's PHY and MAC both need to be EEE capable */ - if (!dev->phydev || !dp->pl) + if (!dev->phydev) return -ENODEV; if (!ds->ops->set_mac_eee) @@ -1250,7 +1250,7 @@ static int dsa_user_get_eee(struct net_device *dev, struct ethtool_keee *e) int ret; /* Port's PHY and MAC both need to be EEE capable */ - if (!dev->phydev || !dp->pl) + if (!dev->phydev) return -ENODEV; if (!ds->ops->get_mac_eee) From 9723a77318b7c0cfd06ea207e52a042f8c815318 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 10 Dec 2024 14:18:16 +0000 Subject: [PATCH 0451/1450] net: dsa: add hook to determine whether EEE is supported Add a hook to determine whether the switch supports EEE. This will return false if the switch does not, or true if it does. If the method is not implemented, we assume (currently) that the switch supports EEE. Signed-off-by: Russell King (Oracle) Reviewed-by: Florian Fainelli Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/E1tL144-006cZD-El@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 1 + net/dsa/user.c | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/include/net/dsa.h b/include/net/dsa.h index 72ae65e7246a7..aaa75bbaa0eac 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -988,6 +988,7 @@ struct dsa_switch_ops { /* * Port's MAC EEE settings */ + bool (*support_eee)(struct dsa_switch *ds, int port); int (*set_mac_eee)(struct dsa_switch *ds, int port, struct ethtool_keee *e); int (*get_mac_eee)(struct dsa_switch *ds, int port, diff --git a/net/dsa/user.c b/net/dsa/user.c index e1a0b153c3535..a743396800104 100644 --- a/net/dsa/user.c +++ b/net/dsa/user.c @@ -1229,6 +1229,10 @@ static int dsa_user_set_eee(struct net_device *dev, struct ethtool_keee *e) struct dsa_switch *ds = dp->ds; int ret; + /* Check whether the switch supports EEE */ + if (ds->ops->support_eee && !ds->ops->support_eee(ds, dp->index)) + return -EOPNOTSUPP; + /* Port's PHY and MAC both need to be EEE capable */ if (!dev->phydev) return -ENODEV; @@ -1249,6 +1253,10 @@ static int dsa_user_get_eee(struct net_device *dev, struct ethtool_keee *e) struct dsa_switch *ds = dp->ds; int ret; + /* Check whether the switch supports EEE */ + if (ds->ops->support_eee && !ds->ops->support_eee(ds, dp->index)) + return -EOPNOTSUPP; + /* Port's PHY and MAC both need to be EEE capable */ if (!dev->phydev) return -ENODEV; From 99379f587278c818777cb4778e2c79c6c1440c65 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 10 Dec 2024 14:18:21 +0000 Subject: [PATCH 0452/1450] net: dsa: provide implementation of .support_eee() Provide a trivial implementation for the .support_eee() method which switch drivers can use to simply indicate that they support EEE on all their user ports. Signed-off-by: Russell King (Oracle) Reviewed-by: Florian Fainelli Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/E1tL149-006cZJ-JJ@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 1 + net/dsa/port.c | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/include/net/dsa.h b/include/net/dsa.h index aaa75bbaa0eac..4aeedb296d67e 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -1384,5 +1384,6 @@ static inline bool dsa_user_dev_check(const struct net_device *dev) netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev); void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up); +bool dsa_supports_eee(struct dsa_switch *ds, int port); #endif diff --git a/net/dsa/port.c b/net/dsa/port.c index ee0aaec4c8e09..5c9d1798e830a 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -1575,6 +1575,22 @@ void dsa_port_set_tag_protocol(struct dsa_port *cpu_dp, cpu_dp->tag_ops = tag_ops; } +/* dsa_supports_eee - indicate that EEE is supported + * @ds: pointer to &struct dsa_switch + * @port: port index + * + * A default implementation for the .support_eee() DSA operations member, + * which drivers can use to indicate that they support EEE on all of their + * user ports. + * + * Returns: true + */ +bool dsa_supports_eee(struct dsa_switch *ds, int port) +{ + return true; +} +EXPORT_SYMBOL_GPL(dsa_supports_eee); + static void dsa_port_phylink_mac_config(struct phylink_config *config, unsigned int mode, const struct phylink_link_state *state) From c86692fc2cb77d94dd8c166c2b9017f196d02a84 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 10 Dec 2024 14:18:26 +0000 Subject: [PATCH 0453/1450] net: dsa: b53/bcm_sf2: implement .support_eee() method Implement the .support_eee() method to indicate that EEE is not supported by two switch variants, rather than making these checks in the .set_mac_eee() and .get_mac_eee() methods. Signed-off-by: Russell King (Oracle) Reviewed-by: Florian Fainelli Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/E1tL14E-006cZU-Nc@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/dsa/b53/b53_common.c | 13 +++++++------ drivers/net/dsa/b53/b53_priv.h | 1 + drivers/net/dsa/bcm_sf2.c | 1 + 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 285785c942b07..0561b60f668f2 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -2224,13 +2224,16 @@ int b53_eee_init(struct dsa_switch *ds, int port, struct phy_device *phy) } EXPORT_SYMBOL(b53_eee_init); -int b53_get_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *e) +bool b53_support_eee(struct dsa_switch *ds, int port) { struct b53_device *dev = ds->priv; - if (is5325(dev) || is5365(dev)) - return -EOPNOTSUPP; + return !is5325(dev) && !is5365(dev); +} +EXPORT_SYMBOL(b53_support_eee); +int b53_get_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *e) +{ return 0; } EXPORT_SYMBOL(b53_get_mac_eee); @@ -2240,9 +2243,6 @@ int b53_set_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *e) struct b53_device *dev = ds->priv; struct ethtool_keee *p = &dev->ports[port].eee; - if (is5325(dev) || is5365(dev)) - return -EOPNOTSUPP; - p->eee_enabled = e->eee_enabled; b53_eee_enable_set(ds, port, e->eee_enabled); @@ -2298,6 +2298,7 @@ static const struct dsa_switch_ops b53_switch_ops = { .phylink_get_caps = b53_phylink_get_caps, .port_enable = b53_enable_port, .port_disable = b53_disable_port, + .support_eee = b53_support_eee, .get_mac_eee = b53_get_mac_eee, .set_mac_eee = b53_set_mac_eee, .port_bridge_join = b53_br_join, diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index 05141176daf50..99e5cfc98ae82 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -384,6 +384,7 @@ int b53_enable_port(struct dsa_switch *ds, int port, struct phy_device *phy); void b53_disable_port(struct dsa_switch *ds, int port); void b53_brcm_hdr_setup(struct dsa_switch *ds, int port); int b53_eee_init(struct dsa_switch *ds, int port, struct phy_device *phy); +bool b53_support_eee(struct dsa_switch *ds, int port); int b53_get_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *e); int b53_set_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *e); diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 43bde1f583ffd..a53fb6191e6b3 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -1232,6 +1232,7 @@ static const struct dsa_switch_ops bcm_sf2_ops = { .set_wol = bcm_sf2_sw_set_wol, .port_enable = bcm_sf2_port_setup, .port_disable = bcm_sf2_port_disable, + .support_eee = b53_support_eee, .get_mac_eee = b53_get_mac_eee, .set_mac_eee = b53_set_mac_eee, .port_bridge_join = b53_br_join, From 7eb4f3d9fe173d71b9f9fad7e426e528fcb59b74 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 10 Dec 2024 14:18:31 +0000 Subject: [PATCH 0454/1450] net: dsa: mt753x: implement .support_eee() method MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement the .support_eee() method by using the generic helper as all user ports support EEE. Signed-off-by: Russell King (Oracle) Reviewed-by: Florian Fainelli Reviewed-by: Arınç ÜNAL Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/E1tL14J-006cZa-Rh@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mt7530.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index 086b8b3d5b40f..9605febd3573c 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -3238,6 +3238,7 @@ const struct dsa_switch_ops mt7530_switch_ops = { .port_mirror_add = mt753x_port_mirror_add, .port_mirror_del = mt753x_port_mirror_del, .phylink_get_caps = mt753x_phylink_get_caps, + .support_eee = dsa_supports_eee, .get_mac_eee = mt753x_get_mac_eee, .set_mac_eee = mt753x_set_mac_eee, .conduit_state_change = mt753x_conduit_state_change, From fe3ef44385b217c49fd1bfcab3c221d34174e1b4 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 10 Dec 2024 14:18:36 +0000 Subject: [PATCH 0455/1450] net: dsa: qca8k: implement .support_eee() method Implement the .support_eee() method by using the generic helper as all user ports support EEE. Signed-off-by: Russell King (Oracle) Reviewed-by: Florian Fainelli Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/E1tL14O-006cZg-VM@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/dsa/qca/qca8k-8xxx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/dsa/qca/qca8k-8xxx.c b/drivers/net/dsa/qca/qca8k-8xxx.c index 59b4a7240b583..ec74e3c2b0e9b 100644 --- a/drivers/net/dsa/qca/qca8k-8xxx.c +++ b/drivers/net/dsa/qca/qca8k-8xxx.c @@ -2016,6 +2016,7 @@ static const struct dsa_switch_ops qca8k_switch_ops = { .get_ethtool_stats = qca8k_get_ethtool_stats, .get_sset_count = qca8k_get_sset_count, .set_ageing_time = qca8k_set_ageing_time, + .support_eee = dsa_supports_eee, .get_mac_eee = qca8k_get_mac_eee, .set_mac_eee = qca8k_set_mac_eee, .port_enable = qca8k_port_enable, From eb3126e720e7629474332417dae256d471727865 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 10 Dec 2024 14:18:42 +0000 Subject: [PATCH 0456/1450] net: dsa: mv88e6xxx: implement .support_eee() method Implement the .support_eee() method by using the generic helper as all user ports support EEE. Signed-off-by: Russell King (Oracle) Reviewed-by: Florian Fainelli Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/E1tL14U-006cZm-2K@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mv88e6xxx/chip.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 34708c739b045..570c8642d3871 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -7099,6 +7099,7 @@ static const struct dsa_switch_ops mv88e6xxx_switch_ops = { .get_sset_count = mv88e6xxx_get_sset_count, .port_max_mtu = mv88e6xxx_get_max_mtu, .port_change_mtu = mv88e6xxx_change_mtu, + .support_eee = dsa_supports_eee, .get_mac_eee = mv88e6xxx_get_mac_eee, .set_mac_eee = mv88e6xxx_set_mac_eee, .get_eeprom_len = mv88e6xxx_get_eeprom_len, From 801fd546c1cad69a00cef0a300e9f293d8e50432 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 10 Dec 2024 14:18:47 +0000 Subject: [PATCH 0457/1450] net: dsa: ksz: implement .support_eee() method Implement the .support_eee() method by reusing the ksz_validate_eee() method as a template, renaming the function, changing the return type and values, and removing it from the ksz_set_mac_eee() and ksz_get_mac_eee() methods. Signed-off-by: Russell King (Oracle) Reviewed-by: Florian Fainelli Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/E1tL14Z-006cZs-6o@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/dsa/microchip/ksz_common.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index f5822c57be32e..94f9aa983ff67 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -3454,12 +3454,12 @@ static int ksz_max_mtu(struct dsa_switch *ds, int port) return -EOPNOTSUPP; } -static int ksz_validate_eee(struct dsa_switch *ds, int port) +static bool ksz_support_eee(struct dsa_switch *ds, int port) { struct ksz_device *dev = ds->priv; if (!dev->info->internal_phy[port]) - return -EOPNOTSUPP; + return false; switch (dev->chip_id) { case KSZ8563_CHIP_ID: @@ -3471,21 +3471,15 @@ static int ksz_validate_eee(struct dsa_switch *ds, int port) case KSZ9896_CHIP_ID: case KSZ9897_CHIP_ID: case LAN9646_CHIP_ID: - return 0; + return true; } - return -EOPNOTSUPP; + return false; } static int ksz_get_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *e) { - int ret; - - ret = ksz_validate_eee(ds, port); - if (ret) - return ret; - /* There is no documented control of Tx LPI configuration. */ e->tx_lpi_enabled = true; @@ -3501,11 +3495,6 @@ static int ksz_set_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *e) { struct ksz_device *dev = ds->priv; - int ret; - - ret = ksz_validate_eee(ds, port); - if (ret) - return ret; if (!e->tx_lpi_enabled) { dev_err(dev->dev, "Disabling EEE Tx LPI is not supported\n"); @@ -4651,6 +4640,7 @@ static const struct dsa_switch_ops ksz_switch_ops = { .cls_flower_add = ksz_cls_flower_add, .cls_flower_del = ksz_cls_flower_del, .port_setup_tc = ksz_setup_tc, + .support_eee = ksz_support_eee, .get_mac_eee = ksz_get_mac_eee, .set_mac_eee = ksz_set_mac_eee, .port_get_default_prio = ksz_port_get_default_prio, From 88325a291a0cd077bf49b889af605e683b5f956e Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 10 Dec 2024 14:18:52 +0000 Subject: [PATCH 0458/1450] net: dsa: require .support_eee() method to be implemented Now that we have updated all drivers, switch DSA to require an implementation of the .support_eee() method for EEE to be usable, rather than defaulting to being permissive when not implemented. Signed-off-by: Russell King (Oracle) Reviewed-by: Florian Fainelli Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/E1tL14e-006cZy-AT@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- net/dsa/user.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/dsa/user.c b/net/dsa/user.c index a743396800104..4a8de48a6f249 100644 --- a/net/dsa/user.c +++ b/net/dsa/user.c @@ -1230,7 +1230,7 @@ static int dsa_user_set_eee(struct net_device *dev, struct ethtool_keee *e) int ret; /* Check whether the switch supports EEE */ - if (ds->ops->support_eee && !ds->ops->support_eee(ds, dp->index)) + if (!ds->ops->support_eee || !ds->ops->support_eee(ds, dp->index)) return -EOPNOTSUPP; /* Port's PHY and MAC both need to be EEE capable */ @@ -1254,7 +1254,7 @@ static int dsa_user_get_eee(struct net_device *dev, struct ethtool_keee *e) int ret; /* Check whether the switch supports EEE */ - if (ds->ops->support_eee && !ds->ops->support_eee(ds, dp->index)) + if (!ds->ops->support_eee || !ds->ops->support_eee(ds, dp->index)) return -EOPNOTSUPP; /* Port's PHY and MAC both need to be EEE capable */ From 4aa567b1df8b88e3d49a1896b0d4467d5bec6c89 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Tue, 10 Dec 2024 10:30:41 -0800 Subject: [PATCH 0459/1450] ionic: add asic codes to firmware interface file Now that the firmware has learned how to properly report the asic type id, add the values to our interface file. The sharp-eyed reviewers will catch that the CAPRI value changed here from 0 to 1. This comes with the FW actually defining it correctly. This is safe for us to change as nothing actually uses that value yet. Signed-off-by: Shannon Nelson Reviewed-by: Jacob Keller Reviewed-by: Kalesh AP Signed-off-by: Paolo Abeni --- drivers/net/ethernet/pensando/ionic/ionic.h | 2 -- drivers/net/ethernet/pensando/ionic/ionic_if.h | 6 +++++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic.h b/drivers/net/ethernet/pensando/ionic/ionic.h index 1c61390677f7a..0639bf56bd3af 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic.h +++ b/drivers/net/ethernet/pensando/ionic/ionic.h @@ -18,8 +18,6 @@ struct ionic_lif; #define PCI_DEVICE_ID_PENSANDO_IONIC_ETH_PF 0x1002 #define PCI_DEVICE_ID_PENSANDO_IONIC_ETH_VF 0x1003 -#define IONIC_ASIC_TYPE_ELBA 2 - #define DEVCMD_TIMEOUT 5 #define IONIC_ADMINQ_TIME_SLICE msecs_to_jiffies(100) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_if.h b/drivers/net/ethernet/pensando/ionic/ionic_if.h index 9c85c0706c6e1..6ea190f1a706c 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_if.h +++ b/drivers/net/ethernet/pensando/ionic/ionic_if.h @@ -3209,7 +3209,11 @@ union ionic_adminq_comp { #define IONIC_BAR0_INTR_CTRL_OFFSET 0x2000 #define IONIC_DEV_CMD_DONE 0x00000001 -#define IONIC_ASIC_TYPE_CAPRI 0 +#define IONIC_ASIC_TYPE_NONE 0 +#define IONIC_ASIC_TYPE_CAPRI 1 +#define IONIC_ASIC_TYPE_ELBA 2 +#define IONIC_ASIC_TYPE_GIGLIO 3 +#define IONIC_ASIC_TYPE_SALINA 4 /** * struct ionic_doorbell - Doorbell register layout From 33ce1d41c133b053ccea5ac4aaaab260d42706b3 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Tue, 10 Dec 2024 10:30:42 -0800 Subject: [PATCH 0460/1450] ionic: Use VLAN_ETH_HLEN when possible Replace when ETH_HLEN and VLAN_HLEN are used together with VLAN_ETH_HLEN since it's the same value and uses 1 define instead of 2. Signed-off-by: Brett Creeley Signed-off-by: Shannon Nelson Reviewed-by: Jacob Keller Reviewed-by: Kalesh AP Signed-off-by: Paolo Abeni --- drivers/net/ethernet/pensando/ionic/ionic_lif.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index 40496587b2b31..052c767a2c757 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -3265,7 +3265,7 @@ int ionic_lif_alloc(struct ionic *ionic) lif->netdev->min_mtu = max_t(unsigned int, ETH_MIN_MTU, le32_to_cpu(lif->identity->eth.min_frame_size)); lif->netdev->max_mtu = - le32_to_cpu(lif->identity->eth.max_frame_size) - ETH_HLEN - VLAN_HLEN; + le32_to_cpu(lif->identity->eth.max_frame_size) - VLAN_ETH_HLEN; lif->neqs = ionic->neqs_per_lif; lif->nxqs = ionic->ntxqs_per_lif; From 7c372bac12b2003a44aa333773001c83bcb07d09 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Tue, 10 Dec 2024 10:30:43 -0800 Subject: [PATCH 0461/1450] ionic: Translate IONIC_RC_ENOSUPP to EOPNOTSUPP Instead of reporting -EINVAL when IONIC_RC_ENOSUPP is returned use the -EOPNOTSUPP value. This aligns better since the FW only returns IONIC_RC_ENOSUPP when operations aren't supported not when invalid values are used. Signed-off-by: Brett Creeley Signed-off-by: Shannon Nelson Reviewed-by: Jacob Keller Reviewed-by: Kalesh AP Signed-off-by: Paolo Abeni --- drivers/net/ethernet/pensando/ionic/ionic_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_main.c b/drivers/net/ethernet/pensando/ionic/ionic_main.c index 0f817c3f92d82..daf1e82cb76b3 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_main.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_main.c @@ -81,8 +81,9 @@ static int ionic_error_to_errno(enum ionic_status_code code) case IONIC_RC_EQTYPE: case IONIC_RC_EQID: case IONIC_RC_EINVAL: - case IONIC_RC_ENOSUPP: return -EINVAL; + case IONIC_RC_ENOSUPP: + return -EOPNOTSUPP; case IONIC_RC_EPERM: return -EPERM; case IONIC_RC_ENOENT: From a8b05dd3389f313b2ba858165a6ded53c274e5fd Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Tue, 10 Dec 2024 10:30:44 -0800 Subject: [PATCH 0462/1450] ionic: add speed defines for 200G and 400G Add higher speed defines to the ionic_if.h API and decode them in the ethtool get_link_ksettings callback. Signed-off-by: Shannon Nelson Reviewed-by: Jacob Keller Signed-off-by: Paolo Abeni --- .../ethernet/pensando/ionic/ionic_ethtool.c | 39 +++++++++++++++++++ .../net/ethernet/pensando/ionic/ionic_if.h | 16 +++++++- 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c index dda22fa4448cf..272317048cb93 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c @@ -158,6 +158,20 @@ static int ionic_get_link_ksettings(struct net_device *netdev, 25000baseCR_Full); copper_seen++; break; + case IONIC_XCVR_PID_QSFP_50G_CR2_FC: + case IONIC_XCVR_PID_QSFP_50G_CR2: + ethtool_link_ksettings_add_link_mode(ks, supported, + 50000baseCR2_Full); + copper_seen++; + break; + case IONIC_XCVR_PID_QSFP_200G_CR4: + ethtool_link_ksettings_add_link_mode(ks, supported, 200000baseCR4_Full); + copper_seen++; + break; + case IONIC_XCVR_PID_QSFP_400G_CR4: + ethtool_link_ksettings_add_link_mode(ks, supported, 400000baseCR4_Full); + copper_seen++; + break; case IONIC_XCVR_PID_SFP_10GBASE_AOC: case IONIC_XCVR_PID_SFP_10GBASE_CU: ethtool_link_ksettings_add_link_mode(ks, supported, @@ -196,6 +210,31 @@ static int ionic_get_link_ksettings(struct net_device *netdev, ethtool_link_ksettings_add_link_mode(ks, supported, 25000baseSR_Full); break; + case IONIC_XCVR_PID_QSFP_200G_AOC: + case IONIC_XCVR_PID_QSFP_200G_SR4: + ethtool_link_ksettings_add_link_mode(ks, supported, + 200000baseSR4_Full); + break; + case IONIC_XCVR_PID_QSFP_200G_FR4: + ethtool_link_ksettings_add_link_mode(ks, supported, + 200000baseLR4_ER4_FR4_Full); + break; + case IONIC_XCVR_PID_QSFP_200G_DR4: + ethtool_link_ksettings_add_link_mode(ks, supported, + 200000baseDR4_Full); + break; + case IONIC_XCVR_PID_QSFP_400G_FR4: + ethtool_link_ksettings_add_link_mode(ks, supported, + 400000baseLR4_ER4_FR4_Full); + break; + case IONIC_XCVR_PID_QSFP_400G_DR4: + ethtool_link_ksettings_add_link_mode(ks, supported, + 400000baseDR4_Full); + break; + case IONIC_XCVR_PID_QSFP_400G_SR4: + ethtool_link_ksettings_add_link_mode(ks, supported, + 400000baseSR4_Full); + break; case IONIC_XCVR_PID_SFP_10GBASE_SR: ethtool_link_ksettings_add_link_mode(ks, supported, 10000baseSR_Full); diff --git a/drivers/net/ethernet/pensando/ionic/ionic_if.h b/drivers/net/ethernet/pensando/ionic/ionic_if.h index 6ea190f1a706c..830c8adbfbee0 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_if.h +++ b/drivers/net/ethernet/pensando/ionic/ionic_if.h @@ -1277,7 +1277,10 @@ enum ionic_xcvr_pid { IONIC_XCVR_PID_SFP_25GBASE_CR_S = 3, IONIC_XCVR_PID_SFP_25GBASE_CR_L = 4, IONIC_XCVR_PID_SFP_25GBASE_CR_N = 5, - + IONIC_XCVR_PID_QSFP_50G_CR2_FC = 6, + IONIC_XCVR_PID_QSFP_50G_CR2 = 7, + IONIC_XCVR_PID_QSFP_200G_CR4 = 8, + IONIC_XCVR_PID_QSFP_400G_CR4 = 9, /* Fiber */ IONIC_XCVR_PID_QSFP_100G_AOC = 50, IONIC_XCVR_PID_QSFP_100G_ACC = 51, @@ -1303,6 +1306,15 @@ enum ionic_xcvr_pid { IONIC_XCVR_PID_SFP_25GBASE_ACC = 71, IONIC_XCVR_PID_SFP_10GBASE_T = 72, IONIC_XCVR_PID_SFP_1000BASE_T = 73, + IONIC_XCVR_PID_QSFP_200G_AOC = 74, + IONIC_XCVR_PID_QSFP_200G_FR4 = 75, + IONIC_XCVR_PID_QSFP_200G_DR4 = 76, + IONIC_XCVR_PID_QSFP_200G_SR4 = 77, + IONIC_XCVR_PID_QSFP_200G_ACC = 78, + IONIC_XCVR_PID_QSFP_400G_FR4 = 79, + IONIC_XCVR_PID_QSFP_400G_DR4 = 80, + IONIC_XCVR_PID_QSFP_400G_SR4 = 81, + IONIC_XCVR_PID_QSFP_400G_VR4 = 82, }; /** @@ -1404,6 +1416,8 @@ struct ionic_xcvr_status { */ union ionic_port_config { struct { +#define IONIC_SPEED_400G 400000 /* 400G in Mbps */ +#define IONIC_SPEED_200G 200000 /* 200G in Mbps */ #define IONIC_SPEED_100G 100000 /* 100G in Mbps */ #define IONIC_SPEED_50G 50000 /* 50G in Mbps */ #define IONIC_SPEED_40G 40000 /* 40G in Mbps */ From a857c841e7ea0f8ac18bcd3944e2e32cfd82efed Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Tue, 10 Dec 2024 10:30:45 -0800 Subject: [PATCH 0463/1450] ionic: add support for QSFP_PLUS_CMIS Teach the driver to recognize and decode the sfp pid SFF8024_ID_QSFP_PLUS_CMIS correctly. Signed-off-by: Shannon Nelson Reviewed-by: Jacob Keller Reviewed-by: Kalesh AP Signed-off-by: Paolo Abeni --- drivers/net/ethernet/pensando/ionic/ionic_ethtool.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c index 272317048cb93..720092b1633ae 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c @@ -968,6 +968,7 @@ static int ionic_get_module_info(struct net_device *netdev, break; case SFF8024_ID_QSFP_8436_8636: case SFF8024_ID_QSFP28_8636: + case SFF8024_ID_QSFP_PLUS_CMIS: modinfo->type = ETH_MODULE_SFF_8436; modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN; break; From a3b16198d3df38aa2fc6de167b919ecb3fae74a6 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 11 Dec 2024 01:35:40 +0200 Subject: [PATCH 0464/1450] selftests: forwarding: add a pvid_change test to bridge_vlan_unaware Historically, DSA drivers have seen problems with the model in which bridge VLANs work, particularly with them being offloaded to switchdev asynchronously relative to when they become active (vlan_filtering=1). This switchdev API peculiarity was papered over by commit 2ea7a679ca2a ("net: dsa: Don't add vlans when vlan filtering is disabled"), which introduced other problems, fixed by commit 54a0ed0df496 ("net: dsa: provide an option for drivers to always receive bridge VLANs") through an opt-in ds->configure_vlan_while_not_filtering bool (which later became an opt-out). The point is that some DSA drivers still skip VLAN configuration while VLAN-unaware, and there is a desire to get rid of that behavior. It's hard to deduce from the wording "at least one corner case" what Andrew saw, but my best guess is that there is a discrepancy of meaning between bridge pvid and hardware port pvid which caused breakage. On one side, the Linux bridge with vlan_filtering=0 is completely VLAN-unaware, and will accept and process a packet the same way irrespective of the VLAN groups on the ports or the bridge itself (there may not even be a pvid, and this makes no difference). On the other hand, DSA switches still do VLAN processing internally, even with vlan_filtering disabled, but they are expected to classify all packets to the port pvid. That pvid shouldn't be confused with the bridge pvid, and there lies the problem. When a switch port is under a VLAN-unaware bridge, the hardware pvid must be explicitly managed by the driver to classify all received packets to it, regardless of bridge VLAN groups. When under a VLAN-aware bridge, the hardware pvid must be synchronized to the bridge port pvid. To do this correctly, the pattern is unfortunately a bit complicated, and involves hooking the pvid change logic into quite a few places (the ones that change the input variables which determine the value to use as hardware pvid for a port). See mv88e6xxx_port_commit_pvid(), sja1105_commit_pvid(), ocelot_port_set_pvid() etc. The point is that not all drivers used to do that, especially in older kernels. If a driver is to blindly program a bridge pvid VLAN received from switchdev while it's VLAN-unaware, this might in turn change the hardware pvid used by a VLAN-unaware bridge port, which might result in packet loss depending which other ports have that pvid too (in that same note, it might also go unnoticed). To capture that condition, it is sufficient to take a VLAN-unaware bridge and change the [VLAN-aware] bridge pvid on a single port, to a VID that isn't present on any other port. This shouldn't have absolutely any effect on packet classification or forwarding. However, broken drivers will take the bait, and change their PVID to 3, causing packet loss. Signed-off-by: Vladimir Oltean Tested-by: Ido Schimmel Link: https://patch.msgid.link/20241210233541.1401837-1-vladimir.oltean@nxp.com Signed-off-by: Paolo Abeni --- .../net/forwarding/bridge_vlan_unaware.sh | 25 ++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/bridge_vlan_unaware.sh b/tools/testing/selftests/net/forwarding/bridge_vlan_unaware.sh index 1c8a26046589f..2b5700b61ffab 100755 --- a/tools/testing/selftests/net/forwarding/bridge_vlan_unaware.sh +++ b/tools/testing/selftests/net/forwarding/bridge_vlan_unaware.sh @@ -1,7 +1,7 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -ALL_TESTS="ping_ipv4 ping_ipv6 learning flooding" +ALL_TESTS="ping_ipv4 ping_ipv6 learning flooding pvid_change" NUM_NETIFS=4 source lib.sh @@ -77,12 +77,16 @@ cleanup() ping_ipv4() { - ping_test $h1 192.0.2.2 + local msg=$1 + + ping_test $h1 192.0.2.2 "$msg" } ping_ipv6() { - ping6_test $h1 2001:db8:1::2 + local msg=$1 + + ping6_test $h1 2001:db8:1::2 "$msg" } learning() @@ -95,6 +99,21 @@ flooding() flood_test $swp2 $h1 $h2 } +pvid_change() +{ + # Test that the changing of the VLAN-aware PVID does not affect + # VLAN-unaware forwarding + bridge vlan add vid 3 dev $swp1 pvid untagged + + ping_ipv4 " with bridge port $swp1 PVID changed" + ping_ipv6 " with bridge port $swp1 PVID changed" + + bridge vlan del vid 3 dev $swp1 + + ping_ipv4 " with bridge port $swp1 PVID deleted" + ping_ipv6 " with bridge port $swp1 PVID deleted" +} + trap cleanup EXIT setup_prepare From 27ef6a9981fe74191849966a6d5e0400a4008ab8 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 11 Dec 2024 10:30:54 +0800 Subject: [PATCH 0465/1450] net/smc: support SMC-R V2 for rdma devices with max_recv_sge equals to 1 For SMC-R V2, llc msg can be larger than SMC_WR_BUF_SIZE, thus every recv wr has 2 sges, the first sge with length SMC_WR_BUF_SIZE is for V1/V2 compatible llc/cdc msg, and the second sge with length SMC_WR_BUF_V2_SIZE-SMC_WR_TX_SIZE is for V2 specific llc msg, like SMC_LLC_DELETE_RKEY and SMC_LLC_ADD_LINK for SMC-R V2. The memory buffer in the second sge is shared by all recv wr in one link and all link in one lgr for saving memory usage purpose. But not all RDMA devices with max_recv_sge greater than 1. Thus SMC-R V2 can not support on such RDMA devices and SMC_CLC_DECL_INTERR fallback happens because of the failure of create qp. This patch introduce the support for SMC-R V2 on RDMA devices with max_recv_sge equals to 1. Every recv wr has only one sge with individual buffer whose size is SMC_WR_BUF_V2_SIZE once the RDMA device's max_recv_sge equals to 1. It may use more memory, but it is better than SMC_CLC_DECL_INTERR fallback. Co-developed-by: Wen Gu Signed-off-by: Wen Gu Signed-off-by: Guangguan Wang Reviewed-by: Wenjia Zhang Signed-off-by: Paolo Abeni --- net/smc/smc_core.c | 5 +++++ net/smc/smc_core.h | 11 ++++++++++- net/smc/smc_ib.c | 3 +-- net/smc/smc_llc.c | 21 +++++++++++++++------ net/smc/smc_wr.c | 42 +++++++++++++++++++++--------------------- 5 files changed, 52 insertions(+), 30 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 500952c2e67ba..ede4d5f3111b5 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -795,9 +795,14 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, if (lgr->smc_version == SMC_V2) { lnk->smcibdev = ini->smcrv2.ib_dev_v2; lnk->ibport = ini->smcrv2.ib_port_v2; + lnk->wr_rx_sge_cnt = lnk->smcibdev->ibdev->attrs.max_recv_sge < 2 ? 1 : 2; + lnk->wr_rx_buflen = smc_link_shared_v2_rxbuf(lnk) ? + SMC_WR_BUF_SIZE : SMC_WR_BUF_V2_SIZE; } else { lnk->smcibdev = ini->ib_dev; lnk->ibport = ini->ib_port; + lnk->wr_rx_sge_cnt = 1; + lnk->wr_rx_buflen = SMC_WR_BUF_SIZE; } get_device(&lnk->smcibdev->ibdev->dev); atomic_inc(&lnk->smcibdev->lnk_cnt); diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 69b54ecd65031..48a1b1dcb576e 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -122,10 +122,14 @@ struct smc_link { } ____cacheline_aligned_in_smp; struct completion tx_ref_comp; - struct smc_wr_buf *wr_rx_bufs; /* WR recv payload buffers */ + u8 *wr_rx_bufs; /* WR recv payload buffers */ struct ib_recv_wr *wr_rx_ibs; /* WR recv meta data */ struct ib_sge *wr_rx_sges; /* WR recv scatter meta data */ /* above three vectors have wr_rx_cnt elements and use the same index */ + int wr_rx_sge_cnt; /* rx sge, V1 is 1, V2 is either 2 or 1 */ + int wr_rx_buflen; /* buffer len for the first sge, len for the + * second sge is lgr shared if rx sge is 2. + */ dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */ dma_addr_t wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/ u64 wr_rx_id; /* seq # of last recv WR */ @@ -506,6 +510,11 @@ static inline bool smc_link_active(struct smc_link *lnk) return lnk->state == SMC_LNK_ACTIVE; } +static inline bool smc_link_shared_v2_rxbuf(struct smc_link *lnk) +{ + return lnk->wr_rx_sge_cnt > 1; +} + static inline void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw) { sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x", diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 9c563cdbea908..53828833a3f7f 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -662,7 +662,6 @@ void smc_ib_destroy_queue_pair(struct smc_link *lnk) /* create a queue pair within the protection domain for a link */ int smc_ib_create_queue_pair(struct smc_link *lnk) { - int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1; struct ib_qp_init_attr qp_attr = { .event_handler = smc_ib_qp_event_handler, .qp_context = lnk, @@ -676,7 +675,7 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) .max_send_wr = SMC_WR_BUF_CNT * 3, .max_recv_wr = SMC_WR_BUF_CNT * 3, .max_send_sge = SMC_IB_MAX_SEND_SGE, - .max_recv_sge = sges_per_buf, + .max_recv_sge = lnk->wr_rx_sge_cnt, .max_inline_data = 0, }, .sq_sig_type = IB_SIGNAL_REQ_WR, diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 018ce8133b026..f865c58c3aa77 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -997,13 +997,14 @@ static int smc_llc_cli_conf_link(struct smc_link *link, } static void smc_llc_save_add_link_rkeys(struct smc_link *link, - struct smc_link *link_new) + struct smc_link *link_new, + u8 *llc_msg) { struct smc_llc_msg_add_link_v2_ext *ext; struct smc_link_group *lgr = link->lgr; int max, i; - ext = (struct smc_llc_msg_add_link_v2_ext *)((u8 *)lgr->wr_rx_buf_v2 + + ext = (struct smc_llc_msg_add_link_v2_ext *)(llc_msg + SMC_WR_TX_SIZE); max = min_t(u8, ext->num_rkeys, SMC_LLC_RKEYS_PER_MSG_V2); down_write(&lgr->rmbs_lock); @@ -1098,7 +1099,9 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) if (rc) goto out_clear_lnk; if (lgr->smc_version == SMC_V2) { - smc_llc_save_add_link_rkeys(link, lnk_new); + u8 *llc_msg = smc_link_shared_v2_rxbuf(link) ? + (u8 *)lgr->wr_rx_buf_v2 : (u8 *)llc; + smc_llc_save_add_link_rkeys(link, lnk_new, llc_msg); } else { rc = smc_llc_cli_rkey_exchange(link, lnk_new); if (rc) { @@ -1498,7 +1501,9 @@ int smc_llc_srv_add_link(struct smc_link *link, if (rc) goto out_err; if (lgr->smc_version == SMC_V2) { - smc_llc_save_add_link_rkeys(link, link_new); + u8 *llc_msg = smc_link_shared_v2_rxbuf(link) ? + (u8 *)lgr->wr_rx_buf_v2 : (u8 *)add_llc; + smc_llc_save_add_link_rkeys(link, link_new, llc_msg); } else { rc = smc_llc_srv_rkey_exchange(link, link_new); if (rc) @@ -1807,8 +1812,12 @@ static void smc_llc_rmt_delete_rkey(struct smc_link_group *lgr) if (lgr->smc_version == SMC_V2) { struct smc_llc_msg_delete_rkey_v2 *llcv2; - memcpy(lgr->wr_rx_buf_v2, llc, sizeof(*llc)); - llcv2 = (struct smc_llc_msg_delete_rkey_v2 *)lgr->wr_rx_buf_v2; + if (smc_link_shared_v2_rxbuf(link)) { + memcpy(lgr->wr_rx_buf_v2, llc, sizeof(*llc)); + llcv2 = (struct smc_llc_msg_delete_rkey_v2 *)lgr->wr_rx_buf_v2; + } else { + llcv2 = (struct smc_llc_msg_delete_rkey_v2 *)llc; + } llcv2->num_inval_rkeys = 0; max = min_t(u8, llcv2->num_rkeys, SMC_LLC_RKEYS_PER_MSG_V2); diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 994c0cd4fddbf..b04a21b8c5111 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -439,7 +439,7 @@ static inline void smc_wr_rx_demultiplex(struct ib_wc *wc) return; /* short message */ temp_wr_id = wc->wr_id; index = do_div(temp_wr_id, link->wr_rx_cnt); - wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[index]; + wr_rx = (struct smc_wr_rx_hdr *)(link->wr_rx_bufs + index * link->wr_rx_buflen); hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) { if (handler->type == wr_rx->type) handler->handler(wc, wr_rx); @@ -555,7 +555,6 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk) static void smc_wr_init_sge(struct smc_link *lnk) { - int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1; bool send_inline = (lnk->qp_attr.cap.max_inline_data > SMC_WR_TX_SIZE); u32 i; @@ -608,13 +607,14 @@ static void smc_wr_init_sge(struct smc_link *lnk) * the larger spillover buffer, allowing easy data mapping. */ for (i = 0; i < lnk->wr_rx_cnt; i++) { - int x = i * sges_per_buf; + int x = i * lnk->wr_rx_sge_cnt; lnk->wr_rx_sges[x].addr = - lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE; - lnk->wr_rx_sges[x].length = SMC_WR_TX_SIZE; + lnk->wr_rx_dma_addr + i * lnk->wr_rx_buflen; + lnk->wr_rx_sges[x].length = smc_link_shared_v2_rxbuf(lnk) ? + SMC_WR_TX_SIZE : lnk->wr_rx_buflen; lnk->wr_rx_sges[x].lkey = lnk->roce_pd->local_dma_lkey; - if (lnk->lgr->smc_version == SMC_V2) { + if (lnk->lgr->smc_version == SMC_V2 && smc_link_shared_v2_rxbuf(lnk)) { lnk->wr_rx_sges[x + 1].addr = lnk->wr_rx_v2_dma_addr + SMC_WR_TX_SIZE; lnk->wr_rx_sges[x + 1].length = @@ -624,7 +624,7 @@ static void smc_wr_init_sge(struct smc_link *lnk) } lnk->wr_rx_ibs[i].next = NULL; lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[x]; - lnk->wr_rx_ibs[i].num_sge = sges_per_buf; + lnk->wr_rx_ibs[i].num_sge = lnk->wr_rx_sge_cnt; } lnk->wr_reg.wr.next = NULL; lnk->wr_reg.wr.num_sge = 0; @@ -655,7 +655,7 @@ void smc_wr_free_link(struct smc_link *lnk) if (lnk->wr_rx_dma_addr) { ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr, - SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, + lnk->wr_rx_buflen * lnk->wr_rx_cnt, DMA_FROM_DEVICE); lnk->wr_rx_dma_addr = 0; } @@ -740,13 +740,11 @@ int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr) int smc_wr_alloc_link_mem(struct smc_link *link) { - int sges_per_buf = link->lgr->smc_version == SMC_V2 ? 2 : 1; - /* allocate link related memory */ link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL); if (!link->wr_tx_bufs) goto no_mem; - link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE, + link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, link->wr_rx_buflen, GFP_KERNEL); if (!link->wr_rx_bufs) goto no_mem_wr_tx_bufs; @@ -774,7 +772,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link) if (!link->wr_tx_sges) goto no_mem_wr_tx_rdma_sges; link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3, - sizeof(link->wr_rx_sges[0]) * sges_per_buf, + sizeof(link->wr_rx_sges[0]) * link->wr_rx_sge_cnt, GFP_KERNEL); if (!link->wr_rx_sges) goto no_mem_wr_tx_sges; @@ -872,7 +870,7 @@ int smc_wr_create_link(struct smc_link *lnk) smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0); lnk->wr_rx_id = 0; lnk->wr_rx_dma_addr = ib_dma_map_single( - ibdev, lnk->wr_rx_bufs, SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, + ibdev, lnk->wr_rx_bufs, lnk->wr_rx_buflen * lnk->wr_rx_cnt, DMA_FROM_DEVICE); if (ib_dma_mapping_error(ibdev, lnk->wr_rx_dma_addr)) { lnk->wr_rx_dma_addr = 0; @@ -880,13 +878,15 @@ int smc_wr_create_link(struct smc_link *lnk) goto out; } if (lnk->lgr->smc_version == SMC_V2) { - lnk->wr_rx_v2_dma_addr = ib_dma_map_single(ibdev, - lnk->lgr->wr_rx_buf_v2, SMC_WR_BUF_V2_SIZE, - DMA_FROM_DEVICE); - if (ib_dma_mapping_error(ibdev, lnk->wr_rx_v2_dma_addr)) { - lnk->wr_rx_v2_dma_addr = 0; - rc = -EIO; - goto dma_unmap; + if (smc_link_shared_v2_rxbuf(lnk)) { + lnk->wr_rx_v2_dma_addr = + ib_dma_map_single(ibdev, lnk->lgr->wr_rx_buf_v2, + SMC_WR_BUF_V2_SIZE, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ibdev, lnk->wr_rx_v2_dma_addr)) { + lnk->wr_rx_v2_dma_addr = 0; + rc = -EIO; + goto dma_unmap; + } } lnk->wr_tx_v2_dma_addr = ib_dma_map_single(ibdev, lnk->lgr->wr_tx_buf_v2, SMC_WR_BUF_V2_SIZE, @@ -935,7 +935,7 @@ int smc_wr_create_link(struct smc_link *lnk) lnk->wr_tx_v2_dma_addr = 0; } ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr, - SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, + lnk->wr_rx_buflen * lnk->wr_rx_cnt, DMA_FROM_DEVICE); lnk->wr_rx_dma_addr = 0; out: From c12b2704a678b8a116eeb03f5b91895b90b4dd6f Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 11 Dec 2024 10:30:55 +0800 Subject: [PATCH 0466/1450] net/smc: support ipv4 mapped ipv6 addr client for smc-r v2 AF_INET6 is not supported for smc-r v2 client before, even if the ipv6 addr is ipv4 mapped. Thus, when using AF_INET6, smc-r connection will fallback to tcp, especially for java applications running smc-r. This patch support ipv4 mapped ipv6 addr client for smc-r v2. Clients using real global ipv6 addr is still not supported yet. Signed-off-by: Guangguan Wang Reviewed-by: Wen Gu Reviewed-by: Dust Li Reviewed-by: D. Wythe Reviewed-by: Wenjia Zhang Reviewed-by: Halil Pasic Signed-off-by: Paolo Abeni --- net/smc/af_smc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 9e6c69d18581c..19ebff1c2579f 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1117,7 +1117,10 @@ static int smc_find_proposal_devices(struct smc_sock *smc, ini->check_smcrv2 = true; ini->smcrv2.saddr = smc->clcsock->sk->sk_rcv_saddr; if (!(ini->smcr_version & SMC_V2) || - smc->clcsock->sk->sk_family != AF_INET || +#if IS_ENABLED(CONFIG_IPV6) + (smc->clcsock->sk->sk_family == AF_INET6 && + !ipv6_addr_v4mapped(&smc->clcsock->sk->sk_v6_rcv_saddr)) || +#endif !smc_clc_ueid_count() || smc_find_rdma_device(smc, ini)) ini->smcr_version &= ~SMC_V2; From 0cff90dec63da908fb16d9ea2872ebbcd2d18e6a Mon Sep 17 00:00:00 2001 From: "T.J. Mercier" Date: Sun, 17 Nov 2024 17:03:25 +0000 Subject: [PATCH 0467/1450] dma-buf: Fix __dma_buf_debugfs_list_del argument for !CONFIG_DEBUG_FS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The arguments for __dma_buf_debugfs_list_del do not match for both the CONFIG_DEBUG_FS case and the !CONFIG_DEBUG_FS case. The !CONFIG_DEBUG_FS case should take a struct dma_buf *, but it's currently struct file *. This can lead to the build error: error: passing argument 1 of ‘__dma_buf_debugfs_list_del’ from incompatible pointer type [-Werror=incompatible-pointer-types] dma-buf.c:63:53: note: expected ‘struct file *’ but argument is of type ‘struct dma_buf *’ 63 | static void __dma_buf_debugfs_list_del(struct file *file) Fixes: bfc7bc539392 ("dma-buf: Do not build debugfs related code when !CONFIG_DEBUG_FS") Signed-off-by: T.J. Mercier Reviewed-by: Tvrtko Ursulin Signed-off-by: Sumit Semwal Link: https://patchwork.freedesktop.org/patch/msgid/20241117170326.1971113-1-tjmercier@google.com --- drivers/dma-buf/dma-buf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 5ad0e9e2e1b93..84bc321348622 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -60,7 +60,7 @@ static void __dma_buf_debugfs_list_add(struct dma_buf *dmabuf) { } -static void __dma_buf_debugfs_list_del(struct file *file) +static void __dma_buf_debugfs_list_del(struct dma_buf *dmabuf) { } #endif From b10a1e5643e505c367c7e16aa6d8a9a0dc07354b Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 3 Dec 2024 15:28:21 +0800 Subject: [PATCH 0468/1450] erofs: fix rare pcluster memory leak after unmounting There may still exist some pcluster with valid reference counts during unmounting. Instead of introducing another synchronization primitive, just try again as unmounting is relatively rare. This approach is similar to z_erofs_cache_invalidate_folio(). It was also reported by syzbot as a UAF due to commit f5ad9f9a603f ("erofs: free pclusters if no cached folio is attached"): BUG: KASAN: slab-use-after-free in do_raw_spin_trylock+0x72/0x1f0 kernel/locking/spinlock_debug.c:123 .. queued_spin_trylock include/asm-generic/qspinlock.h:92 [inline] do_raw_spin_trylock+0x72/0x1f0 kernel/locking/spinlock_debug.c:123 __raw_spin_trylock include/linux/spinlock_api_smp.h:89 [inline] _raw_spin_trylock+0x20/0x80 kernel/locking/spinlock.c:138 spin_trylock include/linux/spinlock.h:361 [inline] z_erofs_put_pcluster fs/erofs/zdata.c:959 [inline] z_erofs_decompress_pcluster fs/erofs/zdata.c:1403 [inline] z_erofs_decompress_queue+0x3798/0x3ef0 fs/erofs/zdata.c:1425 z_erofs_decompressqueue_work+0x99/0xe0 fs/erofs/zdata.c:1437 process_one_work kernel/workqueue.c:3229 [inline] process_scheduled_works+0xa68/0x1840 kernel/workqueue.c:3310 worker_thread+0x870/0xd30 kernel/workqueue.c:3391 kthread+0x2f2/0x390 kernel/kthread.c:389 ret_from_fork+0x4d/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 However, it seems a long outstanding memory leak. Fix it now. Fixes: f5ad9f9a603f ("erofs: free pclusters if no cached folio is attached") Reported-by: syzbot+7ff87b095e7ca0c5ac39@syzkaller.appspotmail.com Closes: https://lore.kernel.org/r/674c1235.050a0220.ad585.0032.GAE@google.com Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20241203072821.1885740-1-hsiangkao@linux.alibaba.com --- fs/erofs/zutil.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c index 75704f58ecfa9..0dd65cefce33e 100644 --- a/fs/erofs/zutil.c +++ b/fs/erofs/zutil.c @@ -230,9 +230,10 @@ void erofs_shrinker_unregister(struct super_block *sb) struct erofs_sb_info *const sbi = EROFS_SB(sb); mutex_lock(&sbi->umount_mutex); - /* clean up all remaining pclusters in memory */ - z_erofs_shrink_scan(sbi, ~0UL); - + while (!xa_empty(&sbi->managed_pslots)) { + z_erofs_shrink_scan(sbi, ~0UL); + cond_resched(); + } spin_lock(&erofs_sb_list_lock); list_del(&sbi->list); spin_unlock(&erofs_sb_list_lock); From 1a2180f6859c73c674809f9f82e36c94084682ba Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 27 Nov 2024 16:52:36 +0800 Subject: [PATCH 0469/1450] erofs: fix PSI memstall accounting Max Kellermann recently reported psi_group_cpu.tasks[NR_MEMSTALL] is incorrect in the 6.11.9 kernel. The root cause appears to be that, since the problematic commit, bio can be NULL, causing psi_memstall_leave() to be skipped in z_erofs_submit_queue(). Reported-by: Max Kellermann Closes: https://lore.kernel.org/r/CAKPOu+8tvSowiJADW2RuKyofL_CSkm_SuyZA7ME5vMLWmL6pqw@mail.gmail.com Fixes: 9e2f9d34dd12 ("erofs: handle overlapped pclusters out of crafted images properly") Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20241127085236.3538334-1-hsiangkao@linux.alibaba.com --- fs/erofs/zdata.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 01f1475054874..19ef4ff2a1345 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1792,9 +1792,9 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, erofs_fscache_submit_bio(bio); else submit_bio(bio); - if (memstall) - psi_memstall_leave(&pflags); } + if (memstall) + psi_memstall_leave(&pflags); /* * although background is preferred, no one is pending for submission. From 6d1917045ef4f584593ad30b3dbb887d95fc331f Mon Sep 17 00:00:00 2001 From: Yue Hu Date: Wed, 11 Dec 2024 16:09:18 +0800 Subject: [PATCH 0470/1450] MAINTAINERS: erofs: update Yue Hu's email address The current email address is no longer valid, use my gmail instead. Signed-off-by: Yue Hu Acked-by: Gao Xiang Acked-by: Chao Yu Link: https://lore.kernel.org/r/20241211080918.8512-1-zbestahu@163.com Signed-off-by: Gao Xiang --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 1e930c7a58b13..ec84534ff87ad 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8451,7 +8451,7 @@ F: include/video/s1d13xxxfb.h EROFS FILE SYSTEM M: Gao Xiang M: Chao Yu -R: Yue Hu +R: Yue Hu R: Jeffle Xu R: Sandeep Dhavale L: linux-erofs@lists.ozlabs.org From e2de3c1bf6a0c99b089bd706a62da8f988918858 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 12 Dec 2024 21:35:01 +0800 Subject: [PATCH 0471/1450] erofs: add erofs_sb_free() helper Unify the common parts of erofs_fc_free() and erofs_kill_sb() as erofs_sb_free(). Thus, fput() in erofs_fc_get_tree() is no longer needed, too. Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20241212133504.2047178-1-hsiangkao@linux.alibaba.com --- fs/erofs/super.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/fs/erofs/super.c b/fs/erofs/super.c index c235a8e4315ea..de8e3ecc6381d 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -703,16 +703,19 @@ static int erofs_fc_get_tree(struct fs_context *fc) GET_TREE_BDEV_QUIET_LOOKUP : 0); #ifdef CONFIG_EROFS_FS_BACKED_BY_FILE if (ret == -ENOTBLK) { + struct file *file; + if (!fc->source) return invalf(fc, "No source specified"); - sbi->fdev = filp_open(fc->source, O_RDONLY | O_LARGEFILE, 0); - if (IS_ERR(sbi->fdev)) - return PTR_ERR(sbi->fdev); + + file = filp_open(fc->source, O_RDONLY | O_LARGEFILE, 0); + if (IS_ERR(file)) + return PTR_ERR(file); + sbi->fdev = file; if (S_ISREG(file_inode(sbi->fdev)->i_mode) && sbi->fdev->f_mapping->a_ops->read_folio) return get_tree_nodev(fc, erofs_fc_fill_super); - fput(sbi->fdev); } #endif return ret; @@ -763,19 +766,24 @@ static void erofs_free_dev_context(struct erofs_dev_context *devs) kfree(devs); } -static void erofs_fc_free(struct fs_context *fc) +static void erofs_sb_free(struct erofs_sb_info *sbi) { - struct erofs_sb_info *sbi = fc->s_fs_info; - - if (!sbi) - return; - erofs_free_dev_context(sbi->devs); kfree(sbi->fsid); kfree(sbi->domain_id); + if (sbi->fdev) + fput(sbi->fdev); kfree(sbi); } +static void erofs_fc_free(struct fs_context *fc) +{ + struct erofs_sb_info *sbi = fc->s_fs_info; + + if (sbi) /* free here if an error occurs before transferring to sb */ + erofs_sb_free(sbi); +} + static const struct fs_context_operations erofs_context_ops = { .parse_param = erofs_fc_parse_param, .get_tree = erofs_fc_get_tree, @@ -813,15 +821,9 @@ static void erofs_kill_sb(struct super_block *sb) kill_anon_super(sb); else kill_block_super(sb); - - erofs_free_dev_context(sbi->devs); fs_put_dax(sbi->dax_dev, NULL); erofs_fscache_unregister_fs(sb); - kfree(sbi->fsid); - kfree(sbi->domain_id); - if (sbi->fdev) - fput(sbi->fdev); - kfree(sbi); + erofs_sb_free(sbi); sb->s_fs_info = NULL; } From 73e456b402faddf354ff587a859121163709ad2d Mon Sep 17 00:00:00 2001 From: Liu Jing Date: Mon, 9 Dec 2024 14:24:25 +0800 Subject: [PATCH 0472/1450] wifi: qtnfmac: fix spelling error in core.h Fix specific spelling error in core.h. Signed-off-by: Liu Jing Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241209062425.4139-1-liujing@cmss.chinamobile.com --- drivers/net/wireless/quantenna/qtnfmac/core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/quantenna/qtnfmac/core.h b/drivers/net/wireless/quantenna/qtnfmac/core.h index b375a4751580d..a377d85c24518 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/core.h +++ b/drivers/net/wireless/quantenna/qtnfmac/core.h @@ -102,7 +102,7 @@ struct qtnf_wmac { struct qtnf_mac_info macinfo; struct qtnf_vif iflist[QTNF_MAX_INTF]; struct cfg80211_scan_request *scan_req; - struct mutex mac_lock; /* lock during wmac speicific ops */ + struct mutex mac_lock; /* lock during wmac specific ops */ struct delayed_work scan_timeout; struct ieee80211_regdomain *rd; struct platform_device *pdev; From a42d71e322a8066dcfa228ce8529bb073c521ae9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Wed, 11 Dec 2024 11:17:09 +0100 Subject: [PATCH 0473/1450] net_sched: sch_cake: Add drop reasons MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add three qdisc-specific drop reasons and use them in sch_cake: 1) SKB_DROP_REASON_QDISC_OVERLIMIT Whenever the total queue limit for a qdisc instance is exceeded and a packet is dropped to make room. 2) SKB_DROP_REASON_QDISC_CONGESTED Whenever a packet is dropped by the qdisc AQM algorithm because congestion is detected. 3) SKB_DROP_REASON_CAKE_FLOOD Whenever a packet is dropped by the flood protection part of the CAKE AQM algorithm (BLUE). Also use the existing SKB_DROP_REASON_QUEUE_PURGE in cake_clear_tin(). Reasons show up as: perf record -a -e skb:kfree_skb sleep 1; perf script iperf3 665 [005] 848.656964: skb:kfree_skb: skbaddr=0xffff98168a333500 rx_sk=(nil) protocol=34525 location=__dev_queue_xmit+0x10f0 reason: QDISC_OVERLIMIT swapper 0 [001] 909.166055: skb:kfree_skb: skbaddr=0xffff98168280cee0 rx_sk=(nil) protocol=34525 location=cake_dequeue+0x5ef reason: QDISC_CONGESTED Reviewed-by: Eric Dumazet Reviewed-by: Jamal Hadi Salim Acked-by: Dave Taht Signed-off-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20241211-cake-drop-reason-v2-1-920afadf4d1b@redhat.com Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 18 +++++++++++++++ net/sched/sch_cake.c | 43 +++++++++++++++++++---------------- 2 files changed, 41 insertions(+), 20 deletions(-) diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index c29282fabae6c..ead4170a1d0a9 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -58,6 +58,9 @@ FN(TC_EGRESS) \ FN(SECURITY_HOOK) \ FN(QDISC_DROP) \ + FN(QDISC_OVERLIMIT) \ + FN(QDISC_CONGESTED) \ + FN(CAKE_FLOOD) \ FN(FQ_BAND_LIMIT) \ FN(FQ_HORIZON_LIMIT) \ FN(FQ_FLOW_LIMIT) \ @@ -314,6 +317,21 @@ enum skb_drop_reason { * failed to enqueue to current qdisc) */ SKB_DROP_REASON_QDISC_DROP, + /** + * @SKB_DROP_REASON_QDISC_OVERLIMIT: dropped by qdisc when a qdisc + * instance exceeds its total buffer size limit. + */ + SKB_DROP_REASON_QDISC_OVERLIMIT, + /** + * @SKB_DROP_REASON_QDISC_CONGESTED: dropped by a qdisc AQM algorithm + * due to congestion. + */ + SKB_DROP_REASON_QDISC_CONGESTED, + /** + * @SKB_DROP_REASON_CAKE_FLOOD: dropped by the flood protection part of + * CAKE qdisc AQM algorithm (BLUE). + */ + SKB_DROP_REASON_CAKE_FLOOD, /** * @SKB_DROP_REASON_FQ_BAND_LIMIT: dropped by fq qdisc when per band * limit is reached. diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 8d8b2db4653c0..deb0925f536dd 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -484,13 +484,14 @@ static bool cobalt_queue_empty(struct cobalt_vars *vars, /* Call this with a freshly dequeued packet for possible congestion marking. * Returns true as an instruction to drop the packet, false for delivery. */ -static bool cobalt_should_drop(struct cobalt_vars *vars, - struct cobalt_params *p, - ktime_t now, - struct sk_buff *skb, - u32 bulk_flows) +static enum skb_drop_reason cobalt_should_drop(struct cobalt_vars *vars, + struct cobalt_params *p, + ktime_t now, + struct sk_buff *skb, + u32 bulk_flows) { - bool next_due, over_target, drop = false; + enum skb_drop_reason reason = SKB_NOT_DROPPED_YET; + bool next_due, over_target; ktime_t schedule; u64 sojourn; @@ -533,7 +534,8 @@ static bool cobalt_should_drop(struct cobalt_vars *vars, if (next_due && vars->dropping) { /* Use ECN mark if possible, otherwise drop */ - drop = !(vars->ecn_marked = INET_ECN_set_ce(skb)); + if (!(vars->ecn_marked = INET_ECN_set_ce(skb))) + reason = SKB_DROP_REASON_QDISC_CONGESTED; vars->count++; if (!vars->count) @@ -556,16 +558,17 @@ static bool cobalt_should_drop(struct cobalt_vars *vars, } /* Simple BLUE implementation. Lack of ECN is deliberate. */ - if (vars->p_drop) - drop |= (get_random_u32() < vars->p_drop); + if (vars->p_drop && reason == SKB_NOT_DROPPED_YET && + get_random_u32() < vars->p_drop) + reason = SKB_DROP_REASON_CAKE_FLOOD; /* Overload the drop_next field as an activity timeout */ if (!vars->count) vars->drop_next = ktime_add_ns(now, p->interval); - else if (ktime_to_ns(schedule) > 0 && !drop) + else if (ktime_to_ns(schedule) > 0 && reason == SKB_NOT_DROPPED_YET) vars->drop_next = now; - return drop; + return reason; } static bool cake_update_flowkeys(struct flow_keys *keys, @@ -1528,12 +1531,11 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) flow->dropped++; b->tin_dropped++; - sch->qstats.drops++; if (q->rate_flags & CAKE_FLAG_INGRESS) cake_advance_shaper(q, b, skb, now, true); - __qdisc_drop(skb, to_free); + qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_OVERLIMIT); sch->q.qlen--; qdisc_tree_reduce_backlog(sch, 1, len); @@ -1926,7 +1928,7 @@ static void cake_clear_tin(struct Qdisc *sch, u16 tin) q->cur_tin = tin; for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++) while (!!(skb = cake_dequeue_one(sch))) - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_QUEUE_PURGE); } static struct sk_buff *cake_dequeue(struct Qdisc *sch) @@ -1934,6 +1936,7 @@ static struct sk_buff *cake_dequeue(struct Qdisc *sch) struct cake_sched_data *q = qdisc_priv(sch); struct cake_tin_data *b = &q->tins[q->cur_tin]; struct cake_host *srchost, *dsthost; + enum skb_drop_reason reason; ktime_t now = ktime_get(); struct cake_flow *flow; struct list_head *head; @@ -2143,12 +2146,12 @@ static struct sk_buff *cake_dequeue(struct Qdisc *sch) goto begin; } + reason = cobalt_should_drop(&flow->cvars, &b->cparams, now, skb, + (b->bulk_flow_count * + !!(q->rate_flags & + CAKE_FLAG_INGRESS))); /* Last packet in queue may be marked, shouldn't be dropped */ - if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb, - (b->bulk_flow_count * - !!(q->rate_flags & - CAKE_FLAG_INGRESS))) || - !flow->head) + if (reason == SKB_NOT_DROPPED_YET || !flow->head) break; /* drop this packet, get another one */ @@ -2162,7 +2165,7 @@ static struct sk_buff *cake_dequeue(struct Qdisc *sch) b->tin_dropped++; qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); qdisc_qstats_drop(sch); - kfree_skb(skb); + kfree_skb_reason(skb, reason); if (q->rate_flags & CAKE_FLAG_INGRESS) goto retry; } From fcc680a647ba77370480fe753664cc10d572b240 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 11 Dec 2024 18:26:38 +0100 Subject: [PATCH 0474/1450] page_pool: allow mixing PPs within one bulk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The main reason for this change was to allow mixing pages from different &page_pools within one &xdp_buff/&xdp_frame. Why not? With stuff like devmem and io_uring zerocopy Rx, it's required to have separate PPs for header buffers and payload buffers. Adjust xdp_return_frame_bulk() and page_pool_put_netmem_bulk(), so that they won't be tied to a particular pool. Let the latter create a separate bulk of pages which's PP is different from the first netmem of the bulk and process it after the main loop. This greatly optimizes xdp_return_frame_bulk(): no more hashtable lookups and forced flushes on PP mismatch. Also make xdp_flush_frame_bulk() inline, as it's just one if + function call + one u32 read, not worth extending the call ladder. Co-developed-by: Toke Høiland-Jørgensen # iterative Signed-off-by: Toke Høiland-Jørgensen Suggested-by: Jakub Kicinski # while (count) Signed-off-by: Alexander Lobakin Link: https://patch.msgid.link/20241211172649.761483-2-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/net/page_pool/types.h | 6 +- include/net/xdp.h | 16 +++-- net/core/page_pool.c | 109 ++++++++++++++++++++++------------ net/core/xdp.c | 29 +-------- 4 files changed, 87 insertions(+), 73 deletions(-) diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index 1ea16b0e9c793..05a8640312713 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -259,8 +259,7 @@ void page_pool_disable_direct_recycling(struct page_pool *pool); void page_pool_destroy(struct page_pool *pool); void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), const struct xdp_mem_info *mem); -void page_pool_put_netmem_bulk(struct page_pool *pool, netmem_ref *data, - u32 count); +void page_pool_put_netmem_bulk(netmem_ref *data, u32 count); #else static inline void page_pool_destroy(struct page_pool *pool) { @@ -272,8 +271,7 @@ static inline void page_pool_use_xdp_mem(struct page_pool *pool, { } -static inline void page_pool_put_netmem_bulk(struct page_pool *pool, - netmem_ref *data, u32 count) +static inline void page_pool_put_netmem_bulk(netmem_ref *data, u32 count) { } #endif diff --git a/include/net/xdp.h b/include/net/xdp.h index f4020b29122fa..9e7eb82235133 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -11,6 +11,8 @@ #include #include /* skb_shared_info */ +#include + /** * DOC: XDP RX-queue information * @@ -193,14 +195,12 @@ xdp_frame_is_frag_pfmemalloc(const struct xdp_frame *frame) #define XDP_BULK_QUEUE_SIZE 16 struct xdp_frame_bulk { int count; - void *xa; netmem_ref q[XDP_BULK_QUEUE_SIZE]; }; static __always_inline void xdp_frame_bulk_init(struct xdp_frame_bulk *bq) { - /* bq->count will be zero'ed when bq->xa gets updated */ - bq->xa = NULL; + bq->count = 0; } static inline struct skb_shared_info * @@ -317,10 +317,18 @@ void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, void xdp_return_frame(struct xdp_frame *xdpf); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf); void xdp_return_buff(struct xdp_buff *xdp); -void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq); void xdp_return_frame_bulk(struct xdp_frame *xdpf, struct xdp_frame_bulk *bq); +static inline void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq) +{ + if (unlikely(!bq->count)) + return; + + page_pool_put_netmem_bulk(bq->q, bq->count); + bq->count = 0; +} + static __always_inline unsigned int xdp_get_frame_len(const struct xdp_frame *xdpf) { diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 4c85b77cfdac9..8292e3edbbfd2 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -839,9 +839,41 @@ void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page, } EXPORT_SYMBOL(page_pool_put_unrefed_page); +static void page_pool_recycle_ring_bulk(struct page_pool *pool, + netmem_ref *bulk, + u32 bulk_len) +{ + bool in_softirq; + u32 i; + + /* Bulk produce into ptr_ring page_pool cache */ + in_softirq = page_pool_producer_lock(pool); + + for (i = 0; i < bulk_len; i++) { + if (__ptr_ring_produce(&pool->ring, (__force void *)bulk[i])) { + /* ring full */ + recycle_stat_inc(pool, ring_full); + break; + } + } + + page_pool_producer_unlock(pool, in_softirq); + recycle_stat_add(pool, ring, i); + + /* Hopefully all pages were returned into ptr_ring */ + if (likely(i == bulk_len)) + return; + + /* + * ptr_ring cache is full, free remaining pages outside producer lock + * since put_page() with refcnt == 1 can be an expensive operation. + */ + for (; i < bulk_len; i++) + page_pool_return_page(pool, bulk[i]); +} + /** * page_pool_put_netmem_bulk() - release references on multiple netmems - * @pool: pool from which pages were allocated * @data: array holding netmem references * @count: number of entries in @data * @@ -854,52 +886,55 @@ EXPORT_SYMBOL(page_pool_put_unrefed_page); * Please note the caller must not use data area after running * page_pool_put_netmem_bulk(), as this function overwrites it. */ -void page_pool_put_netmem_bulk(struct page_pool *pool, netmem_ref *data, - u32 count) +void page_pool_put_netmem_bulk(netmem_ref *data, u32 count) { - int i, bulk_len = 0; - bool allow_direct; - bool in_softirq; - - allow_direct = page_pool_napi_local(pool); + u32 bulk_len = 0; - for (i = 0; i < count; i++) { + for (u32 i = 0; i < count; i++) { netmem_ref netmem = netmem_compound_head(data[i]); - /* It is not the last user for the page frag case */ - if (!page_pool_is_last_ref(netmem)) - continue; - - netmem = __page_pool_put_page(pool, netmem, -1, allow_direct); - /* Approved for bulk recycling in ptr_ring cache */ - if (netmem) + if (page_pool_is_last_ref(netmem)) data[bulk_len++] = netmem; } - if (!bulk_len) - return; - - /* Bulk producer into ptr_ring page_pool cache */ - in_softirq = page_pool_producer_lock(pool); - for (i = 0; i < bulk_len; i++) { - if (__ptr_ring_produce(&pool->ring, (__force void *)data[i])) { - /* ring full */ - recycle_stat_inc(pool, ring_full); - break; + count = bulk_len; + while (count) { + netmem_ref bulk[XDP_BULK_QUEUE_SIZE]; + struct page_pool *pool = NULL; + bool allow_direct; + u32 foreign = 0; + + bulk_len = 0; + + for (u32 i = 0; i < count; i++) { + struct page_pool *netmem_pp; + netmem_ref netmem = data[i]; + + netmem_pp = netmem_get_pp(netmem); + if (unlikely(!pool)) { + pool = netmem_pp; + allow_direct = page_pool_napi_local(pool); + } else if (netmem_pp != pool) { + /* + * If the netmem belongs to a different + * page_pool, save it for another round. + */ + data[foreign++] = netmem; + continue; + } + + netmem = __page_pool_put_page(pool, netmem, -1, + allow_direct); + /* Approved for bulk recycling in ptr_ring cache */ + if (netmem) + bulk[bulk_len++] = netmem; } - } - recycle_stat_add(pool, ring, i); - page_pool_producer_unlock(pool, in_softirq); - /* Hopefully all pages was return into ptr_ring */ - if (likely(i == bulk_len)) - return; + if (bulk_len) + page_pool_recycle_ring_bulk(pool, bulk, bulk_len); - /* ptr_ring cache full, free remaining pages outside producer lock - * since put_page() with refcnt == 1 can be an expensive operation - */ - for (; i < bulk_len; i++) - page_pool_return_page(pool, data[i]); + count = foreign; + } } EXPORT_SYMBOL(page_pool_put_netmem_bulk); diff --git a/net/core/xdp.c b/net/core/xdp.c index 938ad15c98577..56127e8ec85fb 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -511,46 +511,19 @@ EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); * xdp_frame_bulk is usually stored/allocated on the function * call-stack to avoid locking penalties. */ -void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq) -{ - struct xdp_mem_allocator *xa = bq->xa; - - if (unlikely(!xa || !bq->count)) - return; - - page_pool_put_netmem_bulk(xa->page_pool, bq->q, bq->count); - /* bq->xa is not cleared to save lookup, if mem.id same in next bulk */ - bq->count = 0; -} -EXPORT_SYMBOL_GPL(xdp_flush_frame_bulk); /* Must be called with rcu_read_lock held */ void xdp_return_frame_bulk(struct xdp_frame *xdpf, struct xdp_frame_bulk *bq) { - struct xdp_mem_info *mem = &xdpf->mem; - struct xdp_mem_allocator *xa; - - if (mem->type != MEM_TYPE_PAGE_POOL) { + if (xdpf->mem.type != MEM_TYPE_PAGE_POOL) { xdp_return_frame(xdpf); return; } - xa = bq->xa; - if (unlikely(!xa)) { - xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); - bq->count = 0; - bq->xa = xa; - } - if (bq->count == XDP_BULK_QUEUE_SIZE) xdp_flush_frame_bulk(bq); - if (unlikely(mem->id != xa->mem.id)) { - xdp_flush_frame_bulk(bq); - bq->xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); - } - if (unlikely(xdp_frame_has_frags(xdpf))) { struct skb_shared_info *sinfo; int i; From 56d95b0adfa224bb1c67733dbcad30dd8debd39e Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 11 Dec 2024 18:26:39 +0100 Subject: [PATCH 0475/1450] xdp: get rid of xdp_frame::mem.id MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Initially, xdp_frame::mem.id was used to search for the corresponding &page_pool to return the page correctly. However, after that struct page was extended to have a direct pointer to its PP (netmem has it as well), further keeping of this field makes no sense. xdp_return_frame_bulk() still used it to do a lookup, and this leftover is now removed. Remove xdp_frame::mem and replace it with ::mem_type, as only memory type still matters and we need to know it to be able to free the frame correctly. As a cute side effect, we can now make every scalar field in &xdp_frame of 4 byte width, speeding up accesses to them. Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Alexander Lobakin Link: https://patch.msgid.link/20241211172649.761483-3-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/freescale/dpaa/dpaa_eth.c | 2 +- drivers/net/veth.c | 4 +-- include/net/xdp.h | 14 +++++----- kernel/bpf/cpumap.c | 2 +- net/bpf/test_run.c | 4 +-- net/core/filter.c | 12 ++++---- net/core/xdp.c | 28 +++++++++---------- 7 files changed, 33 insertions(+), 33 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index bf5baef5c3e06..4948b4906584e 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -2281,7 +2281,7 @@ static int dpaa_a050385_wa_xdpf(struct dpaa_priv *priv, new_xdpf->len = xdpf->len; new_xdpf->headroom = priv->tx_headroom; new_xdpf->frame_sz = DPAA_BP_RAW_SIZE; - new_xdpf->mem.type = MEM_TYPE_PAGE_ORDER0; + new_xdpf->mem_type = MEM_TYPE_PAGE_ORDER0; /* Release the initial buffer */ xdp_return_frame_rx_napi(xdpf); diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 07ebb800edf17..01251868a9c27 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -634,7 +634,7 @@ static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq, break; case XDP_TX: orig_frame = *frame; - xdp->rxq->mem = frame->mem; + xdp->rxq->mem.type = frame->mem_type; if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) { trace_xdp_exception(rq->dev, xdp_prog, act); frame = &orig_frame; @@ -646,7 +646,7 @@ static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq, goto xdp_xmit; case XDP_REDIRECT: orig_frame = *frame; - xdp->rxq->mem = frame->mem; + xdp->rxq->mem.type = frame->mem_type; if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) { frame = &orig_frame; stats->rx_drops++; diff --git a/include/net/xdp.h b/include/net/xdp.h index 9e7eb82235133..1c260869a3539 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -169,13 +169,13 @@ xdp_get_buff_len(const struct xdp_buff *xdp) struct xdp_frame { void *data; - u16 len; - u16 headroom; + u32 len; + u32 headroom; u32 metasize; /* uses lower 8-bits */ /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, - * while mem info is valid on remote CPU. + * while mem_type is valid on remote CPU. */ - struct xdp_mem_info mem; + enum xdp_mem_type mem_type:32; struct net_device *dev_rx; /* used by cpumap */ u32 frame_sz; u32 flags; /* supported values defined in xdp_buff_flags */ @@ -306,13 +306,13 @@ struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp) if (unlikely(xdp_update_frame_from_buff(xdp, xdp_frame) < 0)) return NULL; - /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */ - xdp_frame->mem = xdp->rxq->mem; + /* rxq only valid until napi_schedule ends, convert to xdp_mem_type */ + xdp_frame->mem_type = xdp->rxq->mem.type; return xdp_frame; } -void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, +void __xdp_return(void *data, enum xdp_mem_type mem_type, bool napi_direct, struct xdp_buff *xdp); void xdp_return_frame(struct xdp_frame *xdpf); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf); diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index a2f46785ac3b3..774accbd4a223 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -190,7 +190,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, int err; rxq.dev = xdpf->dev_rx; - rxq.mem = xdpf->mem; + rxq.mem.type = xdpf->mem_type; /* TODO: report queue_index to xdp_rxq_info */ xdp_convert_frame_to_buff(xdpf, &xdp); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 501ec4249fedc..9ae2a7f1738b3 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -153,7 +153,7 @@ static void xdp_test_run_init_page(netmem_ref netmem, void *arg) new_ctx->data = new_ctx->data_meta + meta_len; xdp_update_frame_from_buff(new_ctx, frm); - frm->mem = new_ctx->rxq->mem; + frm->mem_type = new_ctx->rxq->mem.type; memcpy(&head->orig_ctx, new_ctx, sizeof(head->orig_ctx)); } @@ -246,7 +246,7 @@ static void reset_ctx(struct xdp_page_head *head) head->ctx.data_meta = head->orig_ctx.data_meta; head->ctx.data_end = head->orig_ctx.data_end; xdp_update_frame_from_buff(&head->ctx, head->frame); - head->frame->mem = head->orig_ctx.rxq->mem; + head->frame->mem_type = head->orig_ctx.rxq->mem.type; } static int xdp_recv_frames(struct xdp_frame **frames, int nframes, diff --git a/net/core/filter.c b/net/core/filter.c index fac245065b0a4..6c036708634bb 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4119,13 +4119,13 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) } static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, - struct xdp_mem_info *mem_info, bool release) + enum xdp_mem_type mem_type, bool release) { struct xdp_buff *zc_frag = xsk_buff_get_tail(xdp); if (release) { xsk_buff_del_tail(zc_frag); - __xdp_return(NULL, mem_info, false, zc_frag); + __xdp_return(NULL, mem_type, false, zc_frag); } else { zc_frag->data_end -= shrink; } @@ -4134,18 +4134,18 @@ static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, int shrink) { - struct xdp_mem_info *mem_info = &xdp->rxq->mem; + enum xdp_mem_type mem_type = xdp->rxq->mem.type; bool release = skb_frag_size(frag) == shrink; - if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) { - bpf_xdp_shrink_data_zc(xdp, shrink, mem_info, release); + if (mem_type == MEM_TYPE_XSK_BUFF_POOL) { + bpf_xdp_shrink_data_zc(xdp, shrink, mem_type, release); goto out; } if (release) { struct page *page = skb_frag_page(frag); - __xdp_return(page_address(page), mem_info, false, NULL); + __xdp_return(page_address(page), mem_type, false, NULL); } out: diff --git a/net/core/xdp.c b/net/core/xdp.c index 56127e8ec85fb..d367571c58386 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -430,12 +430,12 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_attach_page_pool); * is used for those calls sites. Thus, allowing for faster recycling * of xdp_frames/pages in those cases. */ -void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, +void __xdp_return(void *data, enum xdp_mem_type mem_type, bool napi_direct, struct xdp_buff *xdp) { struct page *page; - switch (mem->type) { + switch (mem_type) { case MEM_TYPE_PAGE_POOL: page = virt_to_head_page(data); if (napi_direct && xdp_return_frame_no_direct()) @@ -458,7 +458,7 @@ void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, break; default: /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ - WARN(1, "Incorrect XDP memory type (%d) usage", mem->type); + WARN(1, "Incorrect XDP memory type (%d) usage", mem_type); break; } } @@ -475,10 +475,10 @@ void xdp_return_frame(struct xdp_frame *xdpf) for (i = 0; i < sinfo->nr_frags; i++) { struct page *page = skb_frag_page(&sinfo->frags[i]); - __xdp_return(page_address(page), &xdpf->mem, false, NULL); + __xdp_return(page_address(page), xdpf->mem_type, false, NULL); } out: - __xdp_return(xdpf->data, &xdpf->mem, false, NULL); + __xdp_return(xdpf->data, xdpf->mem_type, false, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame); @@ -494,10 +494,10 @@ void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) for (i = 0; i < sinfo->nr_frags; i++) { struct page *page = skb_frag_page(&sinfo->frags[i]); - __xdp_return(page_address(page), &xdpf->mem, true, NULL); + __xdp_return(page_address(page), xdpf->mem_type, true, NULL); } out: - __xdp_return(xdpf->data, &xdpf->mem, true, NULL); + __xdp_return(xdpf->data, xdpf->mem_type, true, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); @@ -516,7 +516,7 @@ EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); void xdp_return_frame_bulk(struct xdp_frame *xdpf, struct xdp_frame_bulk *bq) { - if (xdpf->mem.type != MEM_TYPE_PAGE_POOL) { + if (xdpf->mem_type != MEM_TYPE_PAGE_POOL) { xdp_return_frame(xdpf); return; } @@ -553,10 +553,11 @@ void xdp_return_buff(struct xdp_buff *xdp) for (i = 0; i < sinfo->nr_frags; i++) { struct page *page = skb_frag_page(&sinfo->frags[i]); - __xdp_return(page_address(page), &xdp->rxq->mem, true, xdp); + __xdp_return(page_address(page), xdp->rxq->mem.type, true, + xdp); } out: - __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp); + __xdp_return(xdp->data, xdp->rxq->mem.type, true, xdp); } EXPORT_SYMBOL_GPL(xdp_return_buff); @@ -602,7 +603,7 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp) xdpf->headroom = 0; xdpf->metasize = metasize; xdpf->frame_sz = PAGE_SIZE; - xdpf->mem.type = MEM_TYPE_PAGE_ORDER0; + xdpf->mem_type = MEM_TYPE_PAGE_ORDER0; xsk_buff_free(xdp); return xdpf; @@ -672,7 +673,7 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, * - RX ring dev queue index (skb_record_rx_queue) */ - if (xdpf->mem.type == MEM_TYPE_PAGE_POOL) + if (xdpf->mem_type == MEM_TYPE_PAGE_POOL) skb_mark_for_recycle(skb); /* Allow SKB to reuse area used by xdp_frame */ @@ -719,8 +720,7 @@ struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf) nxdpf = addr; nxdpf->data = addr + headroom; nxdpf->frame_sz = PAGE_SIZE; - nxdpf->mem.type = MEM_TYPE_PAGE_ORDER0; - nxdpf->mem.id = 0; + nxdpf->mem_type = MEM_TYPE_PAGE_ORDER0; return nxdpf; } From 207ff83cecaeaacf0d47c8ccbe927c8354ac1280 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 11 Dec 2024 18:26:40 +0100 Subject: [PATCH 0476/1450] xdp: make __xdp_return() MP-agnostic Currently, __xdp_return() takes pointer to the virtual memory to free a buffer. Apart from that this sometimes provokes redundant data <--> page conversions, taking data pointer effectively prevents lots of XDP code to support non-page-backed buffers, as there's no mapping for the non-host memory (data is always NULL). Just convert it to always take netmem reference. For xdp_return_{buff,frame*}(), this chops off one page_address() per each frag and adds one virt_to_netmem() (same as virt_to_page()) per header buffer. For __xdp_return() itself, it removes one virt_to_page() for MEM_TYPE_PAGE_POOL and another one for MEM_TYPE_PAGE_ORDER0, adding one page_address() for [not really common nowadays] MEM_TYPE_PAGE_SHARED, but the main effect is that the abovementioned functions won't die or memleak anymore if the frame has non-host memory attached and will correctly free those. Signed-off-by: Alexander Lobakin Link: https://patch.msgid.link/20241211172649.761483-4-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/net/xdp.h | 4 ++-- net/core/filter.c | 9 +++------ net/core/xdp.c | 47 +++++++++++++++++++---------------------------- 3 files changed, 24 insertions(+), 36 deletions(-) diff --git a/include/net/xdp.h b/include/net/xdp.h index 1c260869a3539..d2089cfecefd5 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -312,8 +312,8 @@ struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp) return xdp_frame; } -void __xdp_return(void *data, enum xdp_mem_type mem_type, bool napi_direct, - struct xdp_buff *xdp); +void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type, + bool napi_direct, struct xdp_buff *xdp); void xdp_return_frame(struct xdp_frame *xdpf); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf); void xdp_return_buff(struct xdp_buff *xdp); diff --git a/net/core/filter.c b/net/core/filter.c index 6c036708634bb..5fea874025d3a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4125,7 +4125,7 @@ static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, if (release) { xsk_buff_del_tail(zc_frag); - __xdp_return(NULL, mem_type, false, zc_frag); + __xdp_return(0, mem_type, false, zc_frag); } else { zc_frag->data_end -= shrink; } @@ -4142,11 +4142,8 @@ static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, goto out; } - if (release) { - struct page *page = skb_frag_page(frag); - - __xdp_return(page_address(page), mem_type, false, NULL); - } + if (release) + __xdp_return(skb_frag_netmem(frag), mem_type, false, NULL); out: return release; diff --git a/net/core/xdp.c b/net/core/xdp.c index d367571c58386..f1165a35411bc 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -430,27 +430,25 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_attach_page_pool); * is used for those calls sites. Thus, allowing for faster recycling * of xdp_frames/pages in those cases. */ -void __xdp_return(void *data, enum xdp_mem_type mem_type, bool napi_direct, - struct xdp_buff *xdp) +void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type, + bool napi_direct, struct xdp_buff *xdp) { - struct page *page; - switch (mem_type) { case MEM_TYPE_PAGE_POOL: - page = virt_to_head_page(data); + netmem = netmem_compound_head(netmem); if (napi_direct && xdp_return_frame_no_direct()) napi_direct = false; /* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) * as mem->type knows this a page_pool page */ - page_pool_put_full_page(page->pp, page, napi_direct); + page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, + napi_direct); break; case MEM_TYPE_PAGE_SHARED: - page_frag_free(data); + page_frag_free(__netmem_address(netmem)); break; case MEM_TYPE_PAGE_ORDER0: - page = virt_to_page(data); /* Assumes order0 page*/ - put_page(page); + put_page(__netmem_to_page(netmem)); break; case MEM_TYPE_XSK_BUFF_POOL: /* NB! Only valid from an xdp_buff! */ @@ -466,38 +464,34 @@ void __xdp_return(void *data, enum xdp_mem_type mem_type, bool napi_direct, void xdp_return_frame(struct xdp_frame *xdpf) { struct skb_shared_info *sinfo; - int i; if (likely(!xdp_frame_has_frags(xdpf))) goto out; sinfo = xdp_get_shared_info_from_frame(xdpf); - for (i = 0; i < sinfo->nr_frags; i++) { - struct page *page = skb_frag_page(&sinfo->frags[i]); + for (u32 i = 0; i < sinfo->nr_frags; i++) + __xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type, + false, NULL); - __xdp_return(page_address(page), xdpf->mem_type, false, NULL); - } out: - __xdp_return(xdpf->data, xdpf->mem_type, false, NULL); + __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, false, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) { struct skb_shared_info *sinfo; - int i; if (likely(!xdp_frame_has_frags(xdpf))) goto out; sinfo = xdp_get_shared_info_from_frame(xdpf); - for (i = 0; i < sinfo->nr_frags; i++) { - struct page *page = skb_frag_page(&sinfo->frags[i]); + for (u32 i = 0; i < sinfo->nr_frags; i++) + __xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type, + true, NULL); - __xdp_return(page_address(page), xdpf->mem_type, true, NULL); - } out: - __xdp_return(xdpf->data, xdpf->mem_type, true, NULL); + __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, true, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); @@ -544,20 +538,17 @@ EXPORT_SYMBOL_GPL(xdp_return_frame_bulk); void xdp_return_buff(struct xdp_buff *xdp) { struct skb_shared_info *sinfo; - int i; if (likely(!xdp_buff_has_frags(xdp))) goto out; sinfo = xdp_get_shared_info_from_buff(xdp); - for (i = 0; i < sinfo->nr_frags; i++) { - struct page *page = skb_frag_page(&sinfo->frags[i]); + for (u32 i = 0; i < sinfo->nr_frags; i++) + __xdp_return(skb_frag_netmem(&sinfo->frags[i]), + xdp->rxq->mem.type, true, xdp); - __xdp_return(page_address(page), xdp->rxq->mem.type, true, - xdp); - } out: - __xdp_return(xdp->data, xdp->rxq->mem.type, true, xdp); + __xdp_return(virt_to_netmem(xdp->data), xdp->rxq->mem.type, true, xdp); } EXPORT_SYMBOL_GPL(xdp_return_buff); From 0dffdb3b3366c932fb7d210f5032476c552f7000 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 11 Dec 2024 18:26:47 +0100 Subject: [PATCH 0477/1450] skbuff: allow 2-4-argument skb_frag_dma_map() skb_frag_dma_map(dev, frag, 0, skb_frag_size(frag), DMA_TO_DEVICE) is repeated across dozens of drivers and really wants a shorthand. Add a macro which will count args and handle all possible number from 2 to 5. Semantics: skb_frag_dma_map(dev, frag) -> __skb_frag_dma_map(dev, frag, 0, skb_frag_size(frag), DMA_TO_DEVICE) skb_frag_dma_map(dev, frag, offset) -> __skb_frag_dma_map(dev, frag, offset, skb_frag_size(frag) - offset, DMA_TO_DEVICE) skb_frag_dma_map(dev, frag, offset, size) -> __skb_frag_dma_map(dev, frag, offset, size, DMA_TO_DEVICE) skb_frag_dma_map(dev, frag, offset, size, dir) -> __skb_frag_dma_map(dev, frag, offset, size, dir) No object code size changes for the existing callers. Users passing less arguments also won't have bigger size comparing to the full equivalent call. Signed-off-by: Alexander Lobakin Link: https://patch.msgid.link/20241211172649.761483-11-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 69624b394cd94..b2509cd0b9306 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3674,7 +3674,7 @@ static inline void skb_frag_page_copy(skb_frag_t *fragto, bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio); /** - * skb_frag_dma_map - maps a paged fragment via the DMA API + * __skb_frag_dma_map - maps a paged fragment via the DMA API * @dev: the device to map the fragment to * @frag: the paged fragment to map * @offset: the offset within the fragment (starting at the @@ -3684,15 +3684,36 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio); * * Maps the page associated with @frag to @device. */ -static inline dma_addr_t skb_frag_dma_map(struct device *dev, - const skb_frag_t *frag, - size_t offset, size_t size, - enum dma_data_direction dir) +static inline dma_addr_t __skb_frag_dma_map(struct device *dev, + const skb_frag_t *frag, + size_t offset, size_t size, + enum dma_data_direction dir) { return dma_map_page(dev, skb_frag_page(frag), skb_frag_off(frag) + offset, size, dir); } +#define skb_frag_dma_map(dev, frag, ...) \ + CONCATENATE(_skb_frag_dma_map, \ + COUNT_ARGS(__VA_ARGS__))(dev, frag, ##__VA_ARGS__) + +#define __skb_frag_dma_map1(dev, frag, offset, uf, uo) ({ \ + const skb_frag_t *uf = (frag); \ + size_t uo = (offset); \ + \ + __skb_frag_dma_map(dev, uf, uo, skb_frag_size(uf) - uo, \ + DMA_TO_DEVICE); \ +}) +#define _skb_frag_dma_map1(dev, frag, offset) \ + __skb_frag_dma_map1(dev, frag, offset, __UNIQUE_ID(frag_), \ + __UNIQUE_ID(offset_)) +#define _skb_frag_dma_map0(dev, frag) \ + _skb_frag_dma_map1(dev, frag, 0) +#define _skb_frag_dma_map2(dev, frag, offset, size) \ + __skb_frag_dma_map(dev, frag, offset, size, DMA_TO_DEVICE) +#define _skb_frag_dma_map3(dev, frag, offset, size, dir) \ + __skb_frag_dma_map(dev, frag, offset, size, dir) + static inline struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) { From 91a152cbb49c26609d217cf2f116d46143b9b8be Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Wed, 11 Dec 2024 21:20:28 +0000 Subject: [PATCH 0478/1450] net: page_pool: rename page_pool_alloc_netmem to *_netmems page_pool_alloc_netmem (without an s) was the mirror of page_pool_alloc_pages (with an s), which was confusing. Rename to page_pool_alloc_netmems so it's the mirror of page_pool_alloc_pages. Signed-off-by: Mina Almasry Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20241211212033.1684197-2-almasrymina@google.com Signed-off-by: Jakub Kicinski --- include/net/page_pool/types.h | 2 +- net/core/page_pool.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index 05a8640312713..3270c92841b49 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -242,7 +242,7 @@ struct page_pool { }; struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp); -netmem_ref page_pool_alloc_netmem(struct page_pool *pool, gfp_t gfp); +netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp); struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset, unsigned int size, gfp_t gfp); netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool, diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 8292e3edbbfd2..7a17af286a9e1 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -574,7 +574,7 @@ static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool, /* For using page_pool replace: alloc_pages() API calls, but provide * synchronization guarantee for allocation side. */ -netmem_ref page_pool_alloc_netmem(struct page_pool *pool, gfp_t gfp) +netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp) { netmem_ref netmem; @@ -590,11 +590,11 @@ netmem_ref page_pool_alloc_netmem(struct page_pool *pool, gfp_t gfp) netmem = __page_pool_alloc_pages_slow(pool, gfp); return netmem; } -EXPORT_SYMBOL(page_pool_alloc_netmem); +EXPORT_SYMBOL(page_pool_alloc_netmems); struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) { - return netmem_to_page(page_pool_alloc_netmem(pool, gfp)); + return netmem_to_page(page_pool_alloc_netmems(pool, gfp)); } EXPORT_SYMBOL(page_pool_alloc_pages); ALLOW_ERROR_INJECTION(page_pool_alloc_pages, NULL); @@ -992,7 +992,7 @@ netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool, } if (!netmem) { - netmem = page_pool_alloc_netmem(pool, gfp); + netmem = page_pool_alloc_netmems(pool, gfp); if (unlikely(!netmem)) { pool->frag_page = 0; return 0; From 8156c310499a34c8f42b2e2b7360abb805683bbe Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Wed, 11 Dec 2024 21:20:29 +0000 Subject: [PATCH 0479/1450] net: page_pool: create page_pool_alloc_netmem Create page_pool_alloc_netmem to be the mirror of page_pool_alloc. This enables drivers that want currently use page_pool_alloc to transition to netmem by converting the call sites to page_pool_alloc_netmem. Signed-off-by: Mina Almasry Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20241211212033.1684197-3-almasrymina@google.com Signed-off-by: Jakub Kicinski --- include/net/page_pool/helpers.h | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index 26caa2c209125..95af7f0b029e4 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -115,22 +115,22 @@ static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool, return page_pool_alloc_frag(pool, offset, size, gfp); } -static inline struct page *page_pool_alloc(struct page_pool *pool, - unsigned int *offset, - unsigned int *size, gfp_t gfp) +static inline netmem_ref page_pool_alloc_netmem(struct page_pool *pool, + unsigned int *offset, + unsigned int *size, gfp_t gfp) { unsigned int max_size = PAGE_SIZE << pool->p.order; - struct page *page; + netmem_ref netmem; if ((*size << 1) > max_size) { *size = max_size; *offset = 0; - return page_pool_alloc_pages(pool, gfp); + return page_pool_alloc_netmems(pool, gfp); } - page = page_pool_alloc_frag(pool, offset, *size, gfp); - if (unlikely(!page)) - return NULL; + netmem = page_pool_alloc_frag_netmem(pool, offset, *size, gfp); + if (unlikely(!netmem)) + return 0; /* There is very likely not enough space for another fragment, so append * the remaining size to the current fragment to avoid truesize @@ -141,7 +141,14 @@ static inline struct page *page_pool_alloc(struct page_pool *pool, pool->frag_offset = max_size; } - return page; + return netmem; +} + +static inline struct page *page_pool_alloc(struct page_pool *pool, + unsigned int *offset, + unsigned int *size, gfp_t gfp) +{ + return netmem_to_page(page_pool_alloc_netmem(pool, offset, size, gfp)); } /** From b400f4b87430c105d92550cee5a72aea01fdf3d6 Mon Sep 17 00:00:00 2001 From: Samiullah Khawaja Date: Wed, 11 Dec 2024 21:20:30 +0000 Subject: [PATCH 0480/1450] page_pool: Set `dma_sync` to false for devmem memory provider Move the `dma_map` and `dma_sync` checks to `page_pool_init` to make them generic. Set dma_sync to false for devmem memory provider because the dma_sync APIs should not be used for dma_buf backed devmem memory provider. Cc: Jason Gunthorpe Signed-off-by: Samiullah Khawaja Signed-off-by: Mina Almasry Link: https://patch.msgid.link/20241211212033.1684197-4-almasrymina@google.com Signed-off-by: Jakub Kicinski --- net/core/devmem.c | 9 ++++----- net/core/page_pool.c | 3 +++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/net/core/devmem.c b/net/core/devmem.c index 11b91c12ee113..3ebdeed2bf188 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -331,11 +331,10 @@ int mp_dmabuf_devmem_init(struct page_pool *pool) if (!binding) return -EINVAL; - if (!pool->dma_map) - return -EOPNOTSUPP; - - if (pool->dma_sync) - return -EOPNOTSUPP; + /* dma-buf dma addresses do not need and should not be used with + * dma_sync_for_cpu/device. Force disable dma_sync. + */ + pool->dma_sync = false; if (pool->p.order != 0) return -E2BIG; diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 7a17af286a9e1..275a7fd209d7a 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -287,6 +287,9 @@ static int page_pool_init(struct page_pool *pool, } if (pool->mp_priv) { + if (!pool->dma_map || !pool->dma_sync) + return -EOPNOTSUPP; + err = mp_dmabuf_devmem_init(pool); if (err) { pr_warn("%s() mem-provider init failed %d\n", __func__, From 7dba339faae991a23c54f7b93a58798c58f8c16f Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Wed, 11 Dec 2024 21:20:31 +0000 Subject: [PATCH 0481/1450] page_pool: disable sync for cpu for dmabuf memory provider dmabuf dma-addresses should not be dma_sync'd for CPU/device. Typically its the driver responsibility to dma_sync for CPU, but the driver should not dma_sync for CPU if the netmem is actually coming from a dmabuf memory provider. The page_pool already exposes a helper for dma_sync_for_cpu: page_pool_dma_sync_for_cpu. Upgrade this existing helper to handle netmem, and have it skip dma_sync if the memory is from a dmabuf memory provider. Drivers should migrate to using this helper when adding support for netmem. Also minimize the impact on the dma syncing performance for pages. Special case the dma-sync path for pages to not go through the overhead checks for dma-syncing and conversion to netmem. Cc: Alexander Lobakin Cc: Jason Gunthorpe Signed-off-by: Mina Almasry Link: https://patch.msgid.link/20241211212033.1684197-5-almasrymina@google.com Signed-off-by: Jakub Kicinski --- include/net/page_pool/helpers.h | 35 ++++++++++++++++++++++++++++----- include/net/page_pool/types.h | 3 ++- net/core/devmem.c | 1 + net/core/page_pool.c | 1 + 4 files changed, 34 insertions(+), 6 deletions(-) diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index 95af7f0b029e4..e555921e52339 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -422,7 +422,21 @@ static inline dma_addr_t page_pool_get_dma_addr_netmem(netmem_ref netmem) */ static inline dma_addr_t page_pool_get_dma_addr(const struct page *page) { - return page_pool_get_dma_addr_netmem(page_to_netmem((struct page *)page)); + dma_addr_t ret = page->dma_addr; + + if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) + ret <<= PAGE_SHIFT; + + return ret; +} + +static inline void __page_pool_dma_sync_for_cpu(const struct page_pool *pool, + const dma_addr_t dma_addr, + u32 offset, u32 dma_sync_size) +{ + dma_sync_single_range_for_cpu(pool->p.dev, dma_addr, + offset + pool->p.offset, dma_sync_size, + page_pool_get_dma_dir(pool)); } /** @@ -441,10 +455,21 @@ static inline void page_pool_dma_sync_for_cpu(const struct page_pool *pool, const struct page *page, u32 offset, u32 dma_sync_size) { - dma_sync_single_range_for_cpu(pool->p.dev, - page_pool_get_dma_addr(page), - offset + pool->p.offset, dma_sync_size, - page_pool_get_dma_dir(pool)); + __page_pool_dma_sync_for_cpu(pool, page_pool_get_dma_addr(page), offset, + dma_sync_size); +} + +static inline void +page_pool_dma_sync_netmem_for_cpu(const struct page_pool *pool, + const netmem_ref netmem, u32 offset, + u32 dma_sync_size) +{ + if (!pool->dma_sync_for_cpu) + return; + + __page_pool_dma_sync_for_cpu(pool, + page_pool_get_dma_addr_netmem(netmem), + offset, dma_sync_size); } static inline bool page_pool_put(struct page_pool *pool) diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index 3270c92841b49..ed4cd114180ae 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -164,7 +164,8 @@ struct page_pool { bool has_init_callback:1; /* slow::init_callback is set */ bool dma_map:1; /* Perform DMA mapping */ - bool dma_sync:1; /* Perform DMA sync */ + bool dma_sync:1; /* Perform DMA sync for device */ + bool dma_sync_for_cpu:1; /* Perform DMA sync for cpu */ #ifdef CONFIG_PAGE_POOL_STATS bool system:1; /* This is a global percpu pool */ #endif diff --git a/net/core/devmem.c b/net/core/devmem.c index 3ebdeed2bf188..0b6ed7525b22a 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -335,6 +335,7 @@ int mp_dmabuf_devmem_init(struct page_pool *pool) * dma_sync_for_cpu/device. Force disable dma_sync. */ pool->dma_sync = false; + pool->dma_sync_for_cpu = false; if (pool->p.order != 0) return -E2BIG; diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 275a7fd209d7a..e07ad73159550 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -201,6 +201,7 @@ static int page_pool_init(struct page_pool *pool, memcpy(&pool->slow, ¶ms->slow, sizeof(pool->slow)); pool->cpuid = cpuid; + pool->dma_sync_for_cpu = true; /* Validate only known flags were used */ if (pool->slow.flags & ~PP_FLAG_ALL) From f9244fb55f37356f75c739c57323d9422d7aa0f8 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 7 Nov 2024 16:17:00 +0100 Subject: [PATCH 0482/1450] xen/netfront: fix crash when removing device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When removing a netfront device directly after a suspend/resume cycle it might happen that the queues have not been setup again, causing a crash during the attempt to stop the queues another time. Fix that by checking the queues are existing before trying to stop them. This is XSA-465 / CVE-2024-53240. Reported-by: Marek Marczykowski-Górecki Fixes: d50b7914fae0 ("xen-netfront: Fix NULL sring after live migration") Signed-off-by: Juergen Gross --- drivers/net/xen-netfront.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 4265c1cd0ff71..63fe51d0e64db 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -867,7 +867,7 @@ static netdev_tx_t xennet_start_xmit(struct sk_buff *skb, struct net_device *dev static int xennet_close(struct net_device *dev) { struct netfront_info *np = netdev_priv(dev); - unsigned int num_queues = dev->real_num_tx_queues; + unsigned int num_queues = np->queues ? dev->real_num_tx_queues : 0; unsigned int i; struct netfront_queue *queue; netif_tx_stop_all_queues(np->netdev); @@ -882,6 +882,9 @@ static void xennet_destroy_queues(struct netfront_info *info) { unsigned int i; + if (!info->queues) + return; + for (i = 0; i < info->netdev->real_num_tx_queues; i++) { struct netfront_queue *queue = &info->queues[i]; From efbcd61d9bebb771c836a3b8bfced8165633db7c Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 17 Oct 2024 08:29:48 +0200 Subject: [PATCH 0483/1450] x86: make get_cpu_vendor() accessible from Xen code In order to be able to differentiate between AMD and Intel based systems for very early hypercalls without having to rely on the Xen hypercall page, make get_cpu_vendor() non-static. Refactor early_cpu_init() for the same reason by splitting out the loop initializing cpu_devs() into an externally callable function. This is part of XSA-466 / CVE-2024-53241. Reported-by: Andrew Cooper Signed-off-by: Juergen Gross --- arch/x86/include/asm/processor.h | 2 ++ arch/x86/kernel/cpu/common.c | 38 ++++++++++++++++++-------------- 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c0975815980c8..20e6009381ed6 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -230,6 +230,8 @@ static inline unsigned long long l1tf_pfn_limit(void) return BIT_ULL(boot_cpu_data.x86_cache_bits - 1 - PAGE_SHIFT); } +void init_cpu_devs(void); +void get_cpu_vendor(struct cpuinfo_x86 *c); extern void early_cpu_init(void); extern void identify_secondary_cpu(struct cpuinfo_x86 *); extern void print_cpu_info(struct cpuinfo_x86 *); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index a5c28975c608e..3e9037690814b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -867,7 +867,7 @@ static void cpu_detect_tlb(struct cpuinfo_x86 *c) tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]); } -static void get_cpu_vendor(struct cpuinfo_x86 *c) +void get_cpu_vendor(struct cpuinfo_x86 *c) { char *v = c->x86_vendor_id; int i; @@ -1649,15 +1649,11 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) detect_nopl(); } -void __init early_cpu_init(void) +void __init init_cpu_devs(void) { const struct cpu_dev *const *cdev; int count = 0; -#ifdef CONFIG_PROCESSOR_SELECT - pr_info("KERNEL supported cpus:\n"); -#endif - for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { const struct cpu_dev *cpudev = *cdev; @@ -1665,20 +1661,30 @@ void __init early_cpu_init(void) break; cpu_devs[count] = cpudev; count++; + } +} +void __init early_cpu_init(void) +{ #ifdef CONFIG_PROCESSOR_SELECT - { - unsigned int j; - - for (j = 0; j < 2; j++) { - if (!cpudev->c_ident[j]) - continue; - pr_info(" %s %s\n", cpudev->c_vendor, - cpudev->c_ident[j]); - } - } + unsigned int i, j; + + pr_info("KERNEL supported cpus:\n"); #endif + + init_cpu_devs(); + +#ifdef CONFIG_PROCESSOR_SELECT + for (i = 0; i < X86_VENDOR_NUM && cpu_devs[i]; i++) { + for (j = 0; j < 2; j++) { + if (!cpu_devs[i]->c_ident[j]) + continue; + pr_info(" %s %s\n", cpu_devs[i]->c_vendor, + cpu_devs[i]->c_ident[j]); + } } +#endif + early_identify_cpu(&boot_cpu_data); } From dda014ba59331dee4f3b773a020e109932f4bd24 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 29 Nov 2024 15:47:49 +0100 Subject: [PATCH 0484/1450] objtool/x86: allow syscall instruction The syscall instruction is used in Xen PV mode for doing hypercalls. Allow syscall to be used in the kernel in case it is tagged with an unwind hint for objtool. This is part of XSA-466 / CVE-2024-53241. Reported-by: Andrew Cooper Signed-off-by: Juergen Gross Co-developed-by: Peter Zijlstra --- tools/objtool/check.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 4ce176ad411fb..76060da755b5c 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -3820,9 +3820,12 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, break; case INSN_CONTEXT_SWITCH: - if (func && (!next_insn || !next_insn->hint)) { - WARN_INSN(insn, "unsupported instruction in callable function"); - return 1; + if (func) { + if (!next_insn || !next_insn->hint) { + WARN_INSN(insn, "unsupported instruction in callable function"); + return 1; + } + break; } return 0; From 0ef8047b737d7480a5d4c46d956e97c190f13050 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 29 Nov 2024 16:15:54 +0100 Subject: [PATCH 0485/1450] x86/static-call: provide a way to do very early static-call updates Add static_call_update_early() for updating static-call targets in very early boot. This will be needed for support of Xen guest type specific hypercall functions. This is part of XSA-466 / CVE-2024-53241. Reported-by: Andrew Cooper Signed-off-by: Juergen Gross Co-developed-by: Peter Zijlstra Co-developed-by: Josh Poimboeuf --- arch/x86/include/asm/static_call.h | 15 ++++++++++++ arch/x86/include/asm/sync_core.h | 6 ++--- arch/x86/kernel/static_call.c | 9 ++++++++ include/linux/compiler.h | 37 +++++++++++++++++++++--------- include/linux/static_call.h | 1 + kernel/static_call_inline.c | 2 +- 6 files changed, 55 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/static_call.h b/arch/x86/include/asm/static_call.h index 125c407e2abe6..41502bd2afd64 100644 --- a/arch/x86/include/asm/static_call.h +++ b/arch/x86/include/asm/static_call.h @@ -65,4 +65,19 @@ extern bool __static_call_fixup(void *tramp, u8 op, void *dest); +extern void __static_call_update_early(void *tramp, void *func); + +#define static_call_update_early(name, _func) \ +({ \ + typeof(&STATIC_CALL_TRAMP(name)) __F = (_func); \ + if (static_call_initialized) { \ + __static_call_update(&STATIC_CALL_KEY(name), \ + STATIC_CALL_TRAMP_ADDR(name), __F);\ + } else { \ + WRITE_ONCE(STATIC_CALL_KEY(name).func, _func); \ + __static_call_update_early(STATIC_CALL_TRAMP_ADDR(name),\ + __F); \ + } \ +}) + #endif /* _ASM_STATIC_CALL_H */ diff --git a/arch/x86/include/asm/sync_core.h b/arch/x86/include/asm/sync_core.h index ab7382f92aff2..96bda43538ee7 100644 --- a/arch/x86/include/asm/sync_core.h +++ b/arch/x86/include/asm/sync_core.h @@ -8,7 +8,7 @@ #include #ifdef CONFIG_X86_32 -static inline void iret_to_self(void) +static __always_inline void iret_to_self(void) { asm volatile ( "pushfl\n\t" @@ -19,7 +19,7 @@ static inline void iret_to_self(void) : ASM_CALL_CONSTRAINT : : "memory"); } #else -static inline void iret_to_self(void) +static __always_inline void iret_to_self(void) { unsigned int tmp; @@ -55,7 +55,7 @@ static inline void iret_to_self(void) * Like all of Linux's memory ordering operations, this is a * compiler barrier as well. */ -static inline void sync_core(void) +static __always_inline void sync_core(void) { /* * The SERIALIZE instruction is the most straightforward way to diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index 4eefaac64c6cb..9eed0c144dad5 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -172,6 +172,15 @@ void arch_static_call_transform(void *site, void *tramp, void *func, bool tail) } EXPORT_SYMBOL_GPL(arch_static_call_transform); +noinstr void __static_call_update_early(void *tramp, void *func) +{ + BUG_ON(system_state != SYSTEM_BOOTING); + BUG_ON(!early_boot_irqs_disabled); + BUG_ON(static_call_initialized); + __text_gen_insn(tramp, JMP32_INSN_OPCODE, tramp, func, JMP32_INSN_SIZE); + sync_core(); +} + #ifdef CONFIG_MITIGATION_RETHUNK /* * This is called by apply_returns() to fix up static call trampolines, diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 469a64dd6495f..240c632c5b957 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -216,28 +216,43 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #endif /* __KERNEL__ */ +/** + * offset_to_ptr - convert a relative memory offset to an absolute pointer + * @off: the address of the 32-bit offset value + */ +static inline void *offset_to_ptr(const int *off) +{ + return (void *)((unsigned long)off + *off); +} + +#endif /* __ASSEMBLY__ */ + +#ifdef CONFIG_64BIT +#define ARCH_SEL(a,b) a +#else +#define ARCH_SEL(a,b) b +#endif + /* * Force the compiler to emit 'sym' as a symbol, so that we can reference * it from inline assembler. Necessary in case 'sym' could be inlined * otherwise, or eliminated entirely due to lack of references that are * visible to the compiler. */ -#define ___ADDRESSABLE(sym, __attrs) \ - static void * __used __attrs \ +#define ___ADDRESSABLE(sym, __attrs) \ + static void * __used __attrs \ __UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)(uintptr_t)&sym; + #define __ADDRESSABLE(sym) \ ___ADDRESSABLE(sym, __section(".discard.addressable")) -/** - * offset_to_ptr - convert a relative memory offset to an absolute pointer - * @off: the address of the 32-bit offset value - */ -static inline void *offset_to_ptr(const int *off) -{ - return (void *)((unsigned long)off + *off); -} +#define __ADDRESSABLE_ASM(sym) \ + .pushsection .discard.addressable,"aw"; \ + .align ARCH_SEL(8,4); \ + ARCH_SEL(.quad, .long) __stringify(sym); \ + .popsection; -#endif /* __ASSEMBLY__ */ +#define __ADDRESSABLE_ASM_STR(sym) __stringify(__ADDRESSABLE_ASM(sym)) #ifdef __CHECKER__ #define __BUILD_BUG_ON_ZERO_MSG(e, msg) (0) diff --git a/include/linux/static_call.h b/include/linux/static_call.h index 141e6b176a1b3..785980af89729 100644 --- a/include/linux/static_call.h +++ b/include/linux/static_call.h @@ -138,6 +138,7 @@ #ifdef CONFIG_HAVE_STATIC_CALL #include +extern int static_call_initialized; /* * Either @site or @tramp can be NULL. */ diff --git a/kernel/static_call_inline.c b/kernel/static_call_inline.c index 5259cda486d05..bb7d066a7c397 100644 --- a/kernel/static_call_inline.c +++ b/kernel/static_call_inline.c @@ -15,7 +15,7 @@ extern struct static_call_site __start_static_call_sites[], extern struct static_call_tramp_key __start_static_call_tramp_key[], __stop_static_call_tramp_key[]; -static int static_call_initialized; +int static_call_initialized; /* * Must be called before early_initcall() to be effective. From a2796dff62d6c6bfc5fbebdf2bee0d5ac0438906 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 16 Oct 2024 10:40:26 +0200 Subject: [PATCH 0486/1450] x86/xen: don't do PV iret hypercall through hypercall page Instead of jumping to the Xen hypercall page for doing the iret hypercall, directly code the required sequence in xen-asm.S. This is done in preparation of no longer using hypercall page at all, as it has shown to cause problems with speculation mitigations. This is part of XSA-466 / CVE-2024-53241. Reported-by: Andrew Cooper Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich --- arch/x86/xen/xen-asm.S | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 83189cf5cdce9..ca6edfe4c14b1 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -176,7 +176,6 @@ SYM_CODE_START(xen_early_idt_handler_array) SYM_CODE_END(xen_early_idt_handler_array) __FINIT -hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 /* * Xen64 iret frame: * @@ -186,17 +185,28 @@ hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 * cs * rip <-- standard iret frame * - * flags + * flags <-- xen_iret must push from here on * - * rcx } - * r11 }<-- pushed by hypercall page - * rsp->rax } + * rcx + * r11 + * rsp->rax */ +.macro xen_hypercall_iret + pushq $0 /* Flags */ + push %rcx + push %r11 + push %rax + mov $__HYPERVISOR_iret, %eax + syscall /* Do the IRET. */ +#ifdef CONFIG_MITIGATION_SLS + int3 +#endif +.endm + SYM_CODE_START(xen_iret) UNWIND_HINT_UNDEFINED ANNOTATE_NOENDBR - pushq $0 - jmp hypercall_iret + xen_hypercall_iret SYM_CODE_END(xen_iret) /* @@ -301,8 +311,7 @@ SYM_CODE_START(xen_entry_SYSENTER_compat) ENDBR lea 16(%rsp), %rsp /* strip %rcx, %r11 */ mov $-ENOSYS, %rax - pushq $0 - jmp hypercall_iret + xen_hypercall_iret SYM_CODE_END(xen_entry_SYSENTER_compat) SYM_CODE_END(xen_entry_SYSCALL_compat) From e1e1af9148dc4c866eda3fb59cd6ec3c7ea34b1d Mon Sep 17 00:00:00 2001 From: Zhang Zekun Date: Fri, 25 Oct 2024 15:34:08 +0800 Subject: [PATCH 0487/1450] drm/panel: himax-hx83102: Add a check to prevent NULL pointer dereference drm_mode_duplicate() could return NULL due to lack of memory, which will then call NULL pointer dereference. Add a check to prevent it. Fixes: 0ef94554dc40 ("drm/panel: himax-hx83102: Break out as separate driver") Signed-off-by: Zhang Zekun Reviewed-by: Neil Armstrong Link: https://lore.kernel.org/r/20241025073408.27481-3-zhangzekun11@huawei.com Signed-off-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20241025073408.27481-3-zhangzekun11@huawei.com --- drivers/gpu/drm/panel/panel-himax-hx83102.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/panel/panel-himax-hx83102.c b/drivers/gpu/drm/panel/panel-himax-hx83102.c index 8b48bba181316..3644a7544b935 100644 --- a/drivers/gpu/drm/panel/panel-himax-hx83102.c +++ b/drivers/gpu/drm/panel/panel-himax-hx83102.c @@ -565,6 +565,8 @@ static int hx83102_get_modes(struct drm_panel *panel, struct drm_display_mode *mode; mode = drm_mode_duplicate(connector->dev, m); + if (!mode) + return -ENOMEM; mode->type = DRM_MODE_TYPE_DRIVER | DRM_MODE_TYPE_PREFERRED; drm_mode_set_name(mode); From f8fd0968eff52cf092c0d517d17507ea2f6e5ea5 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 29 Oct 2024 20:39:57 +0800 Subject: [PATCH 0488/1450] drm/panel: novatek-nt35950: fix return value check in nt35950_probe() mipi_dsi_device_register_full() never returns NULL pointer, it will return ERR_PTR() when it fails, so replace the check with IS_ERR(). Fixes: 623a3531e9cf ("drm/panel: Add driver for Novatek NT35950 DSI DriverIC panels") Signed-off-by: Yang Yingliang Reviewed-by: Neil Armstrong Link: https://lore.kernel.org/r/20241029123957.1588-1-yangyingliang@huaweicloud.com Signed-off-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20241029123957.1588-1-yangyingliang@huaweicloud.com --- drivers/gpu/drm/panel/panel-novatek-nt35950.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/panel/panel-novatek-nt35950.c b/drivers/gpu/drm/panel/panel-novatek-nt35950.c index b036208f93560..08b22b592ab04 100644 --- a/drivers/gpu/drm/panel/panel-novatek-nt35950.c +++ b/drivers/gpu/drm/panel/panel-novatek-nt35950.c @@ -481,9 +481,9 @@ static int nt35950_probe(struct mipi_dsi_device *dsi) return dev_err_probe(dev, -EPROBE_DEFER, "Cannot get secondary DSI host\n"); nt->dsi[1] = mipi_dsi_device_register_full(dsi_r_host, info); - if (!nt->dsi[1]) { + if (IS_ERR(nt->dsi[1])) { dev_err(dev, "Cannot get secondary DSI node\n"); - return -ENODEV; + return PTR_ERR(nt->dsi[1]); } num_dsis++; } From 406dd4c7984a457567ca652455d5efad81983f02 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Sun, 24 Nov 2024 23:48:07 +0100 Subject: [PATCH 0489/1450] drm/panel: st7701: Add prepare_prev_first flag to drm_panel The DSI host must be enabled for the panel to be initialized in prepare(). Set the prepare_prev_first flag to guarantee this. This fixes the panel operation on NXP i.MX8MP SoC / Samsung DSIM DSI host. Fixes: 849b2e3ff969 ("drm/panel: Add Sitronix ST7701 panel driver") Signed-off-by: Marek Vasut Reviewed-by: Jessica Zhang Link: https://lore.kernel.org/r/20241124224812.150263-1-marex@denx.de Signed-off-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20241124224812.150263-1-marex@denx.de --- drivers/gpu/drm/panel/panel-sitronix-st7701.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/panel/panel-sitronix-st7701.c b/drivers/gpu/drm/panel/panel-sitronix-st7701.c index eef03d04e0cd2..1f72ef7ca74c9 100644 --- a/drivers/gpu/drm/panel/panel-sitronix-st7701.c +++ b/drivers/gpu/drm/panel/panel-sitronix-st7701.c @@ -1177,6 +1177,7 @@ static int st7701_probe(struct device *dev, int connector_type) return dev_err_probe(dev, ret, "Failed to get orientation\n"); drm_panel_init(&st7701->panel, dev, &st7701_funcs, connector_type); + st7701->panel.prepare_prev_first = true; /** * Once sleep out has been issued, ST7701 IC required to wait 120ms From d2bd3fcb825725a59c8880070b1206b1710922bd Mon Sep 17 00:00:00 2001 From: Michael Trimarchi Date: Thu, 5 Dec 2024 17:29:58 +0100 Subject: [PATCH 0490/1450] drm/panel: synaptics-r63353: Fix regulator unbalance The shutdown function can be called when the display is already unprepared. For example during reboot this trigger a kernel backlog. Calling the drm_panel_unprepare, allow us to avoid to trigger the kernel warning. Fixes: 2e87bad7cd33 ("drm/panel: Add Synaptics R63353 panel driver") Tested-by: Dario Binacchi Signed-off-by: Michael Trimarchi Signed-off-by: Dario Binacchi Reviewed-by: Neil Armstrong Reviewed-by: Jessica Zhang Link: https://lore.kernel.org/r/20241205163002.1804784-1-dario.binacchi@amarulasolutions.com Signed-off-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20241205163002.1804784-1-dario.binacchi@amarulasolutions.com --- drivers/gpu/drm/panel/panel-synaptics-r63353.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/panel/panel-synaptics-r63353.c b/drivers/gpu/drm/panel/panel-synaptics-r63353.c index 169c629746c71..17349825543fe 100644 --- a/drivers/gpu/drm/panel/panel-synaptics-r63353.c +++ b/drivers/gpu/drm/panel/panel-synaptics-r63353.c @@ -325,7 +325,7 @@ static void r63353_panel_shutdown(struct mipi_dsi_device *dsi) { struct r63353_panel *rpanel = mipi_dsi_get_drvdata(dsi); - r63353_panel_unprepare(&rpanel->base); + drm_panel_unprepare(&rpanel->base); } static const struct r63353_desc sharp_ls068b3sx02_data = { From 0e8c52091633b354b12d0c29a27a22077584c111 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Thu, 12 Dec 2024 13:29:42 +0200 Subject: [PATCH 0491/1450] wifi: iwlwifi: fix CRF name for Bz We had BE201 hard coded. Look at the RF_ID and decide based on its value. Fixes: 6795a37161fb ("wifi: iwlwifi: Print a specific device name.") Signed-off-by: Emmanuel Grumbach Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20241212132940.b9eebda1ca60.I36791a134ed5e538e059418eb6520761da97b44c@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/cfg/bz.c | 1 + .../net/wireless/intel/iwlwifi/iwl-config.h | 1 + drivers/net/wireless/intel/iwlwifi/pcie/drv.c | 37 ++++++++++++++++++- 3 files changed, 38 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/cfg/bz.c b/drivers/net/wireless/intel/iwlwifi/cfg/bz.c index cd1fe8490ae56..1c43f283ac4ab 100644 --- a/drivers/net/wireless/intel/iwlwifi/cfg/bz.c +++ b/drivers/net/wireless/intel/iwlwifi/cfg/bz.c @@ -161,6 +161,7 @@ const struct iwl_cfg_trans_params iwl_gl_trans_cfg = { const char iwl_bz_name[] = "Intel(R) TBD Bz device"; const char iwl_fm_name[] = "Intel(R) Wi-Fi 7 BE201 320MHz"; +const char iwl_wh_name[] = "Intel(R) Wi-Fi 7 BE211 320MHz"; const char iwl_gl_name[] = "Intel(R) Wi-Fi 7 BE200 320MHz"; const char iwl_mtp_name[] = "Intel(R) Wi-Fi 7 BE202 160MHz"; diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-config.h b/drivers/net/wireless/intel/iwlwifi/iwl-config.h index 34c91deca57b1..17721bb47e251 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-config.h +++ b/drivers/net/wireless/intel/iwlwifi/iwl-config.h @@ -545,6 +545,7 @@ extern const char iwl_ax231_name[]; extern const char iwl_ax411_name[]; extern const char iwl_bz_name[]; extern const char iwl_fm_name[]; +extern const char iwl_wh_name[]; extern const char iwl_gl_name[]; extern const char iwl_mtp_name[]; extern const char iwl_sc_name[]; diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c index 805fb249a0c6a..8fb2aa2822421 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c @@ -1106,19 +1106,54 @@ VISIBLE_IF_IWLWIFI_KUNIT const struct iwl_dev_info iwl_dev_info_table[] = { iwlax210_2ax_cfg_so_jf_b0, iwl9462_name), /* Bz */ -/* FIXME: need to change the naming according to the actual CRF */ _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_BZ, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_HR2, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + iwl_cfg_bz, iwl_ax201_name), + + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_BZ, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_GF, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + iwl_cfg_bz, iwl_ax211_name), + + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_BZ, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_FM, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, iwl_cfg_bz, iwl_fm_name), + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_BZ, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_WH, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + iwl_cfg_bz, iwl_wh_name), + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_BZ_W, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_HR2, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + iwl_cfg_bz, iwl_ax201_name), + + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_BZ_W, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_GF, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + iwl_cfg_bz, iwl_ax211_name), + + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_BZ_W, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_FM, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, iwl_cfg_bz, iwl_fm_name), + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_BZ_W, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_WH, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + iwl_cfg_bz, iwl_wh_name), + /* Ga (Gl) */ _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_GL, IWL_CFG_ANY, From aa21f333c86c8a09d39189de87abb0153d338190 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 11 Dec 2024 13:11:17 +0100 Subject: [PATCH 0492/1450] fs: fix is_mnt_ns_file() Commit 1fa08aece425 ("nsfs: convert to path_from_stashed() helper") reused nsfs dentry's d_fsdata, which no longer contains a pointer to proc_ns_operations. Fix the remaining use in is_mnt_ns_file(). Fixes: 1fa08aece425 ("nsfs: convert to path_from_stashed() helper") Cc: stable@vger.kernel.org # v6.9 Signed-off-by: Miklos Szeredi Link: https://lore.kernel.org/r/20241211121118.85268-1-mszeredi@redhat.com Acked-by: Al Viro Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/namespace.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 23e81c2a1e3fe..6eec7794f707b 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2055,9 +2055,15 @@ SYSCALL_DEFINE1(oldumount, char __user *, name) static bool is_mnt_ns_file(struct dentry *dentry) { + struct ns_common *ns; + /* Is this a proxy for a mount namespace? */ - return dentry->d_op == &ns_dentry_operations && - dentry->d_fsdata == &mntns_operations; + if (dentry->d_op != &ns_dentry_operations) + return false; + + ns = d_inode(dentry)->i_private; + + return ns->ops == &mntns_operations; } struct ns_common *from_mnt_ns(struct mnt_namespace *mnt) From b83accfec0811421df065f820e73ca8df7f6439a Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Thu, 12 Dec 2024 08:27:05 -0800 Subject: [PATCH 0493/1450] MAINTAINERS: wifi: ath: add Jeff Johnson as maintainer The "ATHEROS ATH GENERIC UTILITIES" entry shares the same git tree as the ATH10K, ATH11K, and ATH12K entries which I already maintain, so add me to that entry as well. Signed-off-by: Jeff Johnson Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241212-ath-maintainer-v1-1-7ea5e86780a8@kernel.org --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index e6e71b05710ba..158740a571b33 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3608,6 +3608,7 @@ F: drivers/phy/qualcomm/phy-ath79-usb.c ATHEROS ATH GENERIC UTILITIES M: Kalle Valo +M: Jeff Johnson L: linux-wireless@vger.kernel.org S: Supported F: drivers/net/wireless/ath/* From 3f4a0948c3524ae50f166dbc6572a3296b014e62 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Tue, 10 Dec 2024 10:04:41 +0300 Subject: [PATCH 0494/1450] wifi: brcmsmac: add gain range check to wlc_phy_iqcal_gainparams_nphy() In 'wlc_phy_iqcal_gainparams_nphy()', add gain range check to WARN() instead of possible out-of-bounds 'tbl_iqcal_gainparams_nphy' access. Compile tested only. Found by Linux Verification Center (linuxtesting.org) with SVACE. Signed-off-by: Dmitry Antipov Acked-by: Arend van Spriel Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241210070441.836362-1-dmantipov@yandex.ru --- drivers/net/wireless/broadcom/brcm80211/brcmsmac/phy/phy_n.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/phy/phy_n.c b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/phy/phy_n.c index d69879e1bd870..d362c4337616b 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/phy/phy_n.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/phy/phy_n.c @@ -23423,6 +23423,9 @@ wlc_phy_iqcal_gainparams_nphy(struct brcms_phy *pi, u16 core_no, break; } + if (WARN_ON(k == NPHY_IQCAL_NUMGAINS)) + return; + params->txgm = tbl_iqcal_gainparams_nphy[band_idx][k][1]; params->pga = tbl_iqcal_gainparams_nphy[band_idx][k][2]; params->pad = tbl_iqcal_gainparams_nphy[band_idx][k][3]; From b05d30c2b6df7e2172b18bf1baee9b202f9c6b53 Mon Sep 17 00:00:00 2001 From: Marcel Hamer Date: Wed, 11 Dec 2024 14:36:18 +0100 Subject: [PATCH 0495/1450] wifi: brcmfmac: add missing header include for brcmf_dbg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Including the fwil.h header file can lead to a build error: drivers/net/wireless/broadcom/brcm80211/brcmfmac/fwil.h: \ In function ‘brcmf_fil_cmd_int_set’: drivers/net/wireless/broadcom/brcm80211/brcmfmac/fwil.h:90:9: error: implicit \ declaration of function ‘brcmf_dbg’ [-Werror=implicit-function-declaration] 90 | brcmf_dbg(FIL, "ifidx=%d, cmd=%d, value=%d\n", ifp->ifidx, cmd, data); | ^~~~~~~~~ The error is often avoided because the debug.h header file is included before the fwil.h header file. This makes sure the header include order is irrelevant by explicitly adding the debug.h header. Fixes: 31343230abb1 ("wifi: brcmfmac: export firmware interface functions") Signed-off-by: Marcel Hamer Acked-by: Arend van Spriel Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241211133618.2014083-1-marcel.hamer@windriver.com --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/fwil.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/fwil.h b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/fwil.h index 31e080e4da669..ab3d6cfcb02bd 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/fwil.h +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/fwil.h @@ -6,6 +6,8 @@ #ifndef _fwil_h_ #define _fwil_h_ +#include "debug.h" + /******************************************************************************* * Dongle command codes that are interpreted by firmware ******************************************************************************/ From f2893c0804d86230ffb8f1c8703fdbb18648abc8 Mon Sep 17 00:00:00 2001 From: Ming-Hung Tsai Date: Thu, 5 Dec 2024 19:41:51 +0800 Subject: [PATCH 0496/1450] dm array: fix releasing a faulty array block twice in dm_array_cursor_end When dm_bm_read_lock() fails due to locking or checksum errors, it releases the faulty block implicitly while leaving an invalid output pointer behind. The caller of dm_bm_read_lock() should not operate on this invalid dm_block pointer, or it will lead to undefined result. For example, the dm_array_cursor incorrectly caches the invalid pointer on reading a faulty array block, causing a double release in dm_array_cursor_end(), then hitting the BUG_ON in dm-bufio cache_put(). Reproduce steps: 1. initialize a cache device dmsetup create cmeta --table "0 8192 linear /dev/sdc 0" dmsetup create cdata --table "0 65536 linear /dev/sdc 8192" dmsetup create corig --table "0 524288 linear /dev/sdc $262144" dd if=/dev/zero of=/dev/mapper/cmeta bs=4k count=1 dmsetup create cache --table "0 524288 cache /dev/mapper/cmeta \ /dev/mapper/cdata /dev/mapper/corig 128 2 metadata2 writethrough smq 0" 2. wipe the second array block offline dmsteup remove cache cmeta cdata corig mapping_root=$(dd if=/dev/sdc bs=1c count=8 skip=192 \ 2>/dev/null | hexdump -e '1/8 "%u\n"') ablock=$(dd if=/dev/sdc bs=1c count=8 skip=$((4096*mapping_root+2056)) \ 2>/dev/null | hexdump -e '1/8 "%u\n"') dd if=/dev/zero of=/dev/sdc bs=4k count=1 seek=$ablock 3. try reopen the cache device dmsetup create cmeta --table "0 8192 linear /dev/sdc 0" dmsetup create cdata --table "0 65536 linear /dev/sdc 8192" dmsetup create corig --table "0 524288 linear /dev/sdc $262144" dmsetup create cache --table "0 524288 cache /dev/mapper/cmeta \ /dev/mapper/cdata /dev/mapper/corig 128 2 metadata2 writethrough smq 0" Kernel logs: (snip) device-mapper: array: array_block_check failed: blocknr 0 != wanted 10 device-mapper: block manager: array validator check failed for block 10 device-mapper: array: get_ablock failed device-mapper: cache metadata: dm_array_cursor_next for mapping failed ------------[ cut here ]------------ kernel BUG at drivers/md/dm-bufio.c:638! Fix by setting the cached block pointer to NULL on errors. In addition to the reproducer described above, this fix can be verified using the "array_cursor/damaged" test in dm-unit: dm-unit run /pdata/array_cursor/damaged --kernel-dir Signed-off-by: Ming-Hung Tsai Fixes: fdd1315aa5f0 ("dm array: introduce cursor api") Reviewed-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/persistent-data/dm-array.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c index 157c9bd2fed74..4866ff56125f5 100644 --- a/drivers/md/persistent-data/dm-array.c +++ b/drivers/md/persistent-data/dm-array.c @@ -917,23 +917,27 @@ static int load_ablock(struct dm_array_cursor *c) if (c->block) unlock_ablock(c->info, c->block); - c->block = NULL; - c->ab = NULL; c->index = 0; r = dm_btree_cursor_get_value(&c->cursor, &key, &value_le); if (r) { DMERR("dm_btree_cursor_get_value failed"); - dm_btree_cursor_end(&c->cursor); + goto out; } else { r = get_ablock(c->info, le64_to_cpu(value_le), &c->block, &c->ab); if (r) { DMERR("get_ablock failed"); - dm_btree_cursor_end(&c->cursor); + goto out; } } + return 0; + +out: + dm_btree_cursor_end(&c->cursor); + c->block = NULL; + c->ab = NULL; return r; } From 626f128ee9c4133b1cfce4be2b34a1508949370e Mon Sep 17 00:00:00 2001 From: Ming-Hung Tsai Date: Thu, 5 Dec 2024 19:41:52 +0800 Subject: [PATCH 0497/1450] dm array: fix unreleased btree blocks on closing a faulty array cursor The cached block pointer in dm_array_cursor might be NULL if it reaches an unreadable array block, or the array is empty. Therefore, dm_array_cursor_end() should call dm_btree_cursor_end() unconditionally, to prevent leaving unreleased btree blocks. This fix can be verified using the "array_cursor/iterate/empty" test in dm-unit: dm-unit run /pdata/array_cursor/iterate/empty --kernel-dir Signed-off-by: Ming-Hung Tsai Fixes: fdd1315aa5f0 ("dm array: introduce cursor api") Reviewed-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/persistent-data/dm-array.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c index 4866ff56125f5..0850dfdffc8ce 100644 --- a/drivers/md/persistent-data/dm-array.c +++ b/drivers/md/persistent-data/dm-array.c @@ -960,10 +960,10 @@ EXPORT_SYMBOL_GPL(dm_array_cursor_begin); void dm_array_cursor_end(struct dm_array_cursor *c) { - if (c->block) { + if (c->block) unlock_ablock(c->info, c->block); - dm_btree_cursor_end(&c->cursor); - } + + dm_btree_cursor_end(&c->cursor); } EXPORT_SYMBOL_GPL(dm_array_cursor_end); From 0bb1968da2737ba68fd63857d1af2b301a18d3bf Mon Sep 17 00:00:00 2001 From: Ming-Hung Tsai Date: Thu, 5 Dec 2024 19:41:53 +0800 Subject: [PATCH 0498/1450] dm array: fix cursor index when skipping across block boundaries dm_array_cursor_skip() seeks to the target position by loading array blocks iteratively until the specified number of entries to skip is reached. When seeking across block boundaries, it uses dm_array_cursor_next() to step into the next block. dm_array_cursor_skip() must first move the cursor index to the end of the current block; otherwise, the cursor position could incorrectly remain in the same block, causing the actual number of skipped entries to be much smaller than expected. This bug affects cache resizing in v2 metadata and could lead to data loss if the fast device is shrunk during the first-time resume. For example: 1. create a cache metadata consists of 32768 blocks, with a dirty block assigned to the second bitmap block. cache_restore v1.0 is required. cat <> cmeta.xml EOF dmsetup create cmeta --table "0 8192 linear /dev/sdc 0" cache_restore -i cmeta.xml -o /dev/mapper/cmeta --metadata-version=2 2. bring up the cache while attempt to discard all the blocks belonging to the second bitmap block (block# 32576 to 32767). The last command is expected to fail, but it actually succeeds. dmsetup create cdata --table "0 2084864 linear /dev/sdc 8192" dmsetup create corig --table "0 65536 linear /dev/sdc 2105344" dmsetup create cache --table "0 65536 cache /dev/mapper/cmeta \ /dev/mapper/cdata /dev/mapper/corig 64 2 metadata2 writeback smq \ 2 migration_threshold 0" In addition to the reproducer described above, this fix can be verified using the "array_cursor/skip" tests in dm-unit: dm-unit run /pdata/array_cursor/skip/ --kernel-dir Signed-off-by: Ming-Hung Tsai Fixes: 9b696229aa7d ("dm persistent data: add cursor skip functions to the cursor APIs") Reviewed-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/persistent-data/dm-array.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c index 0850dfdffc8ce..8f8792e558063 100644 --- a/drivers/md/persistent-data/dm-array.c +++ b/drivers/md/persistent-data/dm-array.c @@ -1003,6 +1003,7 @@ int dm_array_cursor_skip(struct dm_array_cursor *c, uint32_t count) } count -= remaining; + c->index += (remaining - 1); r = dm_array_cursor_next(c); } while (!r); From 8b55f8818900c99dd4f55a59a103f5b29e41eb2c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 18 Oct 2024 15:14:42 +0000 Subject: [PATCH 0499/1450] media: mediatek: vcodec: mark vdec_vp9_slice_map_counts_eob_coef noinline With KASAN enabled, clang fails to optimize the inline version of vdec_vp9_slice_map_counts_eob_coef() properly, leading to kilobytes of temporary values spilled to the stack: drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c:1526:12: error: stack frame size (2160) exceeds limit (2048) in 'vdec_vp9_slice_update_prob' [-Werror,-Wframe-larger-than] This seems to affect all versions of clang including the latest (clang-20), but the degree of stack overhead is different per release. Marking the function as noinline_for_stack is harmless here and avoids the problem completely. Signed-off-by: Arnd Bergmann Reviewed-by: Nathan Chancellor Signed-off-by: Sebastian Fricke Signed-off-by: Mauro Carvalho Chehab --- .../mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c index eea709d938209..47c302745c1de 100644 --- a/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c +++ b/drivers/media/platform/mediatek/vcodec/decoder/vdec/vdec_vp9_req_lat_if.c @@ -1188,7 +1188,8 @@ static int vdec_vp9_slice_setup_lat(struct vdec_vp9_slice_instance *instance, return ret; } -static +/* clang stack usage explodes if this is inlined */ +static noinline_for_stack void vdec_vp9_slice_map_counts_eob_coef(unsigned int i, unsigned int j, unsigned int k, struct vdec_vp9_slice_frame_counts *counts, struct v4l2_vp9_frame_symbol_counts *counts_helper) From 080b2e7b5e9ad23343e4b11f0751e4c724a78958 Mon Sep 17 00:00:00 2001 From: Krzysztof Karas Date: Thu, 12 Dec 2024 11:00:41 +0000 Subject: [PATCH 0500/1450] drm/display: use ERR_PTR on DP tunnel manager creation fail Instead of returning a generic NULL on error from drm_dp_tunnel_mgr_create(), use error pointers with informative codes to align the function with stub that is executed when CONFIG_DRM_DISPLAY_DP_TUNNEL is unset. This will also trigger IS_ERR() in current caller (intel_dp_tunnerl_mgr_init()) instead of bypassing it via NULL pointer. v2: use error codes inside drm_dp_tunnel_mgr_create() instead of handling on caller's side (Michal, Imre) v3: fixup commit message and add "CC"/"Fixes" lines (Andi), mention aligning function code with stub Fixes: 91888b5b1ad2 ("drm/i915/dp: Add support for DP tunnel BW allocation") Cc: Imre Deak Cc: # v6.9+ Signed-off-by: Krzysztof Karas Reviewed-by: Andi Shyti Signed-off-by: Imre Deak Link: https://patchwork.freedesktop.org/patch/msgid/7q4fpnmmztmchczjewgm6igy55qt6jsm7tfd4fl4ucfq6yg2oy@q4lxtsu6445c --- drivers/gpu/drm/display/drm_dp_tunnel.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/display/drm_dp_tunnel.c b/drivers/gpu/drm/display/drm_dp_tunnel.c index 48b2df120086c..90fe07a89260e 100644 --- a/drivers/gpu/drm/display/drm_dp_tunnel.c +++ b/drivers/gpu/drm/display/drm_dp_tunnel.c @@ -1896,8 +1896,8 @@ static void destroy_mgr(struct drm_dp_tunnel_mgr *mgr) * * Creates a DP tunnel manager for @dev. * - * Returns a pointer to the tunnel manager if created successfully or NULL in - * case of an error. + * Returns a pointer to the tunnel manager if created successfully or error + * pointer in case of failure. */ struct drm_dp_tunnel_mgr * drm_dp_tunnel_mgr_create(struct drm_device *dev, int max_group_count) @@ -1907,7 +1907,7 @@ drm_dp_tunnel_mgr_create(struct drm_device *dev, int max_group_count) mgr = kzalloc(sizeof(*mgr), GFP_KERNEL); if (!mgr) - return NULL; + return ERR_PTR(-ENOMEM); mgr->dev = dev; init_waitqueue_head(&mgr->bw_req_queue); @@ -1916,7 +1916,7 @@ drm_dp_tunnel_mgr_create(struct drm_device *dev, int max_group_count) if (!mgr->groups) { kfree(mgr); - return NULL; + return ERR_PTR(-ENOMEM); } #ifdef CONFIG_DRM_DISPLAY_DP_TUNNEL_STATE_DEBUG @@ -1927,7 +1927,7 @@ drm_dp_tunnel_mgr_create(struct drm_device *dev, int max_group_count) if (!init_group(mgr, &mgr->groups[i])) { destroy_mgr(mgr); - return NULL; + return ERR_PTR(-ENOMEM); } mgr->group_count++; From 9398332f23fab10c5ec57c168b44e72997d6318e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Fri, 29 Nov 2024 06:26:28 +0200 Subject: [PATCH 0501/1450] drm/modes: Avoid divide by zero harder in drm_mode_vrefresh() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drm_mode_vrefresh() is trying to avoid divide by zero by checking whether htotal or vtotal are zero. But we may still end up with a div-by-zero of vtotal*htotal*... Cc: stable@vger.kernel.org Reported-by: syzbot+622bba18029bcde672e1@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=622bba18029bcde672e1 Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20241129042629.18280-2-ville.syrjala@linux.intel.com Reviewed-by: Jani Nikula --- drivers/gpu/drm/drm_modes.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/drm_modes.c b/drivers/gpu/drm/drm_modes.c index 6ba167a334613..71573b85d9242 100644 --- a/drivers/gpu/drm/drm_modes.c +++ b/drivers/gpu/drm/drm_modes.c @@ -1287,14 +1287,11 @@ EXPORT_SYMBOL(drm_mode_set_name); */ int drm_mode_vrefresh(const struct drm_display_mode *mode) { - unsigned int num, den; + unsigned int num = 1, den = 1; if (mode->htotal == 0 || mode->vtotal == 0) return 0; - num = mode->clock; - den = mode->htotal * mode->vtotal; - if (mode->flags & DRM_MODE_FLAG_INTERLACE) num *= 2; if (mode->flags & DRM_MODE_FLAG_DBLSCAN) @@ -1302,6 +1299,12 @@ int drm_mode_vrefresh(const struct drm_display_mode *mode) if (mode->vscan > 1) den *= mode->vscan; + if (check_mul_overflow(mode->clock, num, &num)) + return 0; + + if (check_mul_overflow(mode->htotal * mode->vtotal, den, &den)) + return 0; + return DIV_ROUND_CLOSEST_ULL(mul_u32_u32(num, 1000), den); } EXPORT_SYMBOL(drm_mode_vrefresh); From 9bc5c9515b4817e994579b21c32c033cbb3b0e6c Mon Sep 17 00:00:00 2001 From: Furong Xu <0x1207@gmail.com> Date: Thu, 12 Dec 2024 11:33:25 +0800 Subject: [PATCH 0502/1450] net: stmmac: Drop redundant dwxgmac_tc_ops variable dwmac510_tc_ops and dwxgmac_tc_ops are completely identical, keep dwmac510_tc_ops to provide better backward compatibility. Signed-off-by: Furong Xu <0x1207@gmail.com> Reviewed-by: Michal Swiatkowski Reviewed-by: Simon Horman Reviewed-by: Mateusz Polchlopek Link: https://patch.msgid.link/20241212033325.282817-1-0x1207@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/hwif.c | 4 ++-- drivers/net/ethernet/stmicro/stmmac/hwif.h | 1 - drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c | 11 ----------- 3 files changed, 2 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.c b/drivers/net/ethernet/stmicro/stmmac/hwif.c index 4bd79de2e2220..31bdbab9a46ca 100644 --- a/drivers/net/ethernet/stmicro/stmmac/hwif.c +++ b/drivers/net/ethernet/stmicro/stmmac/hwif.c @@ -267,7 +267,7 @@ static const struct stmmac_hwif_entry { .hwtimestamp = &stmmac_ptp, .ptp = &stmmac_ptp_clock_ops, .mode = NULL, - .tc = &dwxgmac_tc_ops, + .tc = &dwmac510_tc_ops, .mmc = &dwxgmac_mmc_ops, .est = &dwmac510_est_ops, .setup = dwxgmac2_setup, @@ -290,7 +290,7 @@ static const struct stmmac_hwif_entry { .hwtimestamp = &stmmac_ptp, .ptp = &stmmac_ptp_clock_ops, .mode = NULL, - .tc = &dwxgmac_tc_ops, + .tc = &dwmac510_tc_ops, .mmc = &dwxgmac_mmc_ops, .est = &dwmac510_est_ops, .setup = dwxlgmac2_setup, diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h index e428c82b7d317..2f7295b6c1c54 100644 --- a/drivers/net/ethernet/stmicro/stmmac/hwif.h +++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h @@ -685,7 +685,6 @@ extern const struct stmmac_dma_ops dwmac410_dma_ops; extern const struct stmmac_ops dwmac510_ops; extern const struct stmmac_tc_ops dwmac4_tc_ops; extern const struct stmmac_tc_ops dwmac510_tc_ops; -extern const struct stmmac_tc_ops dwxgmac_tc_ops; #define GMAC_VERSION 0x00000020 /* GMAC CORE Version */ #define GMAC4_VERSION 0x00000110 /* GMAC4+ CORE Version */ diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c index 6a79e6a111ed9..694d6ee143819 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c @@ -1284,14 +1284,3 @@ const struct stmmac_tc_ops dwmac510_tc_ops = { .query_caps = tc_query_caps, .setup_mqprio = tc_setup_dwmac510_mqprio, }; - -const struct stmmac_tc_ops dwxgmac_tc_ops = { - .init = tc_init, - .setup_cls_u32 = tc_setup_cls_u32, - .setup_cbs = tc_setup_cbs, - .setup_cls = tc_setup_cls, - .setup_taprio = tc_setup_taprio, - .setup_etf = tc_setup_etf, - .query_caps = tc_query_caps, - .setup_mqprio = tc_setup_dwmac510_mqprio, -}; From 429fde2d81bcef0ebab002215358955704586457 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 12 Dec 2024 22:22:47 +0000 Subject: [PATCH 0503/1450] net: tun: fix tun_napi_alloc_frags() syzbot reported the following crash [1] Issue came with the blamed commit. Instead of going through all the iov components, we keep using the first one and end up with a malformed skb. [1] kernel BUG at net/core/skbuff.c:2849 ! Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 0 UID: 0 PID: 6230 Comm: syz-executor132 Not tainted 6.13.0-rc1-syzkaller-00407-g96b6fcc0ee41 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/25/2024 RIP: 0010:__pskb_pull_tail+0x1568/0x1570 net/core/skbuff.c:2848 Code: 38 c1 0f 8c 32 f1 ff ff 4c 89 f7 e8 92 96 74 f8 e9 25 f1 ff ff e8 e8 ae 09 f8 48 8b 5c 24 08 e9 eb fb ff ff e8 d9 ae 09 f8 90 <0f> 0b 66 0f 1f 44 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 RSP: 0018:ffffc90004cbef30 EFLAGS: 00010293 RAX: ffffffff8995c347 RBX: 00000000fffffff2 RCX: ffff88802cf45a00 RDX: 0000000000000000 RSI: 00000000fffffff2 RDI: 0000000000000000 RBP: ffff88807df0c06a R08: ffffffff8995b084 R09: 1ffff1100fbe185c R10: dffffc0000000000 R11: ffffed100fbe185d R12: ffff888076e85d50 R13: ffff888076e85c80 R14: ffff888076e85cf4 R15: ffff888076e85c80 FS: 00007f0dca6ea6c0(0000) GS:ffff8880b8600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f0dca6ead58 CR3: 00000000119da000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: skb_cow_data+0x2da/0xcb0 net/core/skbuff.c:5284 tipc_aead_decrypt net/tipc/crypto.c:894 [inline] tipc_crypto_rcv+0x402/0x24e0 net/tipc/crypto.c:1844 tipc_rcv+0x57e/0x12a0 net/tipc/node.c:2109 tipc_l2_rcv_msg+0x2bd/0x450 net/tipc/bearer.c:668 __netif_receive_skb_list_ptype net/core/dev.c:5720 [inline] __netif_receive_skb_list_core+0x8b7/0x980 net/core/dev.c:5762 __netif_receive_skb_list net/core/dev.c:5814 [inline] netif_receive_skb_list_internal+0xa51/0xe30 net/core/dev.c:5905 gro_normal_list include/net/gro.h:515 [inline] napi_complete_done+0x2b5/0x870 net/core/dev.c:6256 napi_complete include/linux/netdevice.h:567 [inline] tun_get_user+0x2ea0/0x4890 drivers/net/tun.c:1982 tun_chr_write_iter+0x10d/0x1f0 drivers/net/tun.c:2057 do_iter_readv_writev+0x600/0x880 vfs_writev+0x376/0xba0 fs/read_write.c:1050 do_writev+0x1b6/0x360 fs/read_write.c:1096 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: de4f5fed3f23 ("iov_iter: add iter_iovec() helper") Reported-by: syzbot+4f66250f6663c0c1d67e@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/675b61aa.050a0220.599f4.00bb.GAE@google.com/T/#u Cc: stable@vger.kernel.org Signed-off-by: Eric Dumazet Reviewed-by: Joe Damato Reviewed-by: Jens Axboe Acked-by: Willem de Bruijn Acked-by: Michael S. Tsirkin Link: https://patch.msgid.link/20241212222247.724674-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/tun.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index d7a865ef370b6..e816aaba8e5f2 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1481,7 +1481,7 @@ static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile, skb->truesize += skb->data_len; for (i = 1; i < it->nr_segs; i++) { - const struct iovec *iov = iter_iov(it); + const struct iovec *iov = iter_iov(it) + i; size_t fragsz = iov->iov_len; struct page *page; void *frag; From 2c2b61d2138f472e50b5531ec0cb4a1485837e21 Mon Sep 17 00:00:00 2001 From: Yuyang Huang Date: Wed, 11 Dec 2024 17:22:41 +0900 Subject: [PATCH 0504/1450] netlink: add IGMP/MLD join/leave notifications MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change introduces netlink notifications for multicast address changes. The following features are included: * Addition and deletion of multicast addresses are reported using RTM_NEWMULTICAST and RTM_DELMULTICAST messages with AF_INET and AF_INET6. * Two new notification groups: RTNLGRP_IPV4_MCADDR and RTNLGRP_IPV6_MCADDR are introduced for receiving these events. This change allows user space applications (e.g., ip monitor) to efficiently track multicast group memberships by listening for netlink events. Previously, applications relied on inefficient polling of procfs, introducing delays. With netlink notifications, applications receive realtime updates on multicast group membership changes, enabling more precise metrics collection and system monitoring.  This change also unlocks the potential for implementing a wide range of sophisticated multicast related features in user space by allowing applications to combine kernel provided multicast address information with user space data and communicate decisions back to the kernel for more fine grained control. This mechanism can be used for various purposes, including multicast filtering, IGMP/MLD offload, and IGMP/MLD snooping. Cc: Maciej Żenczykowski Cc: Lorenzo Colitti Co-developed-by: Patrick Ruddy Signed-off-by: Patrick Ruddy Link: https://lore.kernel.org/r/20180906091056.21109-1-pruddy@vyatta.att-mail.com Signed-off-by: Yuyang Huang Signed-off-by: David S. Miller --- include/linux/igmp.h | 2 ++ include/net/addrconf.h | 21 +++++++++++ include/uapi/linux/rtnetlink.h | 10 +++++- net/ipv4/igmp.c | 64 ++++++++++++++++++++++++++++++++++ net/ipv6/addrconf.c | 29 +++++---------- net/ipv6/mcast.c | 39 +++++++++++++++++++++ 6 files changed, 144 insertions(+), 21 deletions(-) diff --git a/include/linux/igmp.h b/include/linux/igmp.h index 5171231f70a8b..073b30a9b8508 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -87,6 +87,8 @@ struct ip_mc_list { char loaded; unsigned char gsquery; /* check source marks? */ unsigned char crcount; + unsigned long mca_cstamp; + unsigned long mca_tstamp; struct rcu_head rcu; }; diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 363dd63babe72..58337898fa21d 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -88,6 +88,23 @@ struct ifa6_config { u16 scope; }; +enum addr_type_t { + UNICAST_ADDR, + MULTICAST_ADDR, + ANYCAST_ADDR, +}; + +struct inet6_fill_args { + u32 portid; + u32 seq; + int event; + unsigned int flags; + int netnsid; + int ifindex; + enum addr_type_t type; + bool force_rt_scope_universe; +}; + int addrconf_init(void); void addrconf_cleanup(void); @@ -525,4 +542,8 @@ int if6_proc_init(void); void if6_proc_exit(void); #endif +int inet6_fill_ifmcaddr(struct sk_buff *skb, + const struct ifmcaddr6 *ifmca, + struct inet6_fill_args *args); + #endif diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index db7254d52d935..eccc0e7dcb7db 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -93,7 +93,11 @@ enum { RTM_NEWPREFIX = 52, #define RTM_NEWPREFIX RTM_NEWPREFIX - RTM_GETMULTICAST = 58, + RTM_NEWMULTICAST = 56, +#define RTM_NEWMULTICAST RTM_NEWMULTICAST + RTM_DELMULTICAST, +#define RTM_DELMULTICAST RTM_DELMULTICAST + RTM_GETMULTICAST, #define RTM_GETMULTICAST RTM_GETMULTICAST RTM_GETANYCAST = 62, @@ -774,6 +778,10 @@ enum rtnetlink_groups { #define RTNLGRP_TUNNEL RTNLGRP_TUNNEL RTNLGRP_STATS, #define RTNLGRP_STATS RTNLGRP_STATS + RTNLGRP_IPV4_MCADDR, +#define RTNLGRP_IPV4_MCADDR RTNLGRP_IPV4_MCADDR + RTNLGRP_IPV6_MCADDR, +#define RTNLGRP_IPV6_MCADDR RTNLGRP_IPV6_MCADDR __RTNLGRP_MAX }; #define RTNLGRP_MAX (__RTNLGRP_MAX - 1) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 6a238398acc9d..8a370ef37d3f6 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -88,6 +88,8 @@ #include #include +#include +#include #include #include #include @@ -1430,6 +1432,63 @@ static void ip_mc_hash_remove(struct in_device *in_dev, *mc_hash = im->next_hash; } +static int inet_fill_ifmcaddr(struct sk_buff *skb, struct net_device *dev, + const struct ip_mc_list *im, int event) +{ + struct ifa_cacheinfo ci; + struct ifaddrmsg *ifm; + struct nlmsghdr *nlh; + + nlh = nlmsg_put(skb, 0, 0, event, sizeof(struct ifaddrmsg), 0); + if (!nlh) + return -EMSGSIZE; + + ifm = nlmsg_data(nlh); + ifm->ifa_family = AF_INET; + ifm->ifa_prefixlen = 32; + ifm->ifa_flags = IFA_F_PERMANENT; + ifm->ifa_scope = RT_SCOPE_UNIVERSE; + ifm->ifa_index = dev->ifindex; + + ci.cstamp = (READ_ONCE(im->mca_cstamp) - INITIAL_JIFFIES) * 100UL / HZ; + ci.tstamp = ci.cstamp; + ci.ifa_prefered = INFINITY_LIFE_TIME; + ci.ifa_valid = INFINITY_LIFE_TIME; + + if (nla_put_in_addr(skb, IFA_MULTICAST, im->multiaddr) < 0 || + nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci) < 0) { + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; + } + + nlmsg_end(skb, nlh); + return 0; +} + +static void inet_ifmcaddr_notify(struct net_device *dev, + const struct ip_mc_list *im, int event) +{ + struct net *net = dev_net(dev); + struct sk_buff *skb; + int err = -ENOMEM; + + skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + + nla_total_size(sizeof(__be32)), GFP_ATOMIC); + if (!skb) + goto error; + + err = inet_fill_ifmcaddr(skb, dev, im, event); + if (err < 0) { + WARN_ON_ONCE(err == -EMSGSIZE); + nlmsg_free(skb); + goto error; + } + + rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MCADDR, NULL, GFP_ATOMIC); + return; +error: + rtnl_set_sk_err(net, RTNLGRP_IPV4_MCADDR, err); +} /* * A socket has joined a multicast group on device dev. @@ -1473,6 +1532,8 @@ static void ____ip_mc_inc_group(struct in_device *in_dev, __be32 addr, im->interface = in_dev; in_dev_hold(in_dev); im->multiaddr = addr; + im->mca_cstamp = jiffies; + im->mca_tstamp = im->mca_cstamp; /* initial mode is (EX, empty) */ im->sfmode = mode; im->sfcount[mode] = 1; @@ -1492,6 +1553,7 @@ static void ____ip_mc_inc_group(struct in_device *in_dev, __be32 addr, igmpv3_del_delrec(in_dev, im); #endif igmp_group_added(im); + inet_ifmcaddr_notify(in_dev->dev, im, RTM_NEWMULTICAST); if (!in_dev->dead) ip_rt_multicast_event(in_dev); out: @@ -1705,6 +1767,8 @@ void __ip_mc_dec_group(struct in_device *in_dev, __be32 addr, gfp_t gfp) *ip = i->next_rcu; in_dev->mc_count--; __igmp_group_dropped(i, gfp); + inet_ifmcaddr_notify(in_dev->dev, i, + RTM_DELMULTICAST); ip_mc_clear_src(i); if (!in_dev->dead) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 0e765466d7f79..2e2684886953d 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5127,22 +5127,6 @@ static inline int inet6_ifaddr_msgsize(void) + nla_total_size(4) /* IFA_RT_PRIORITY */; } -enum addr_type_t { - UNICAST_ADDR, - MULTICAST_ADDR, - ANYCAST_ADDR, -}; - -struct inet6_fill_args { - u32 portid; - u32 seq; - int event; - unsigned int flags; - int netnsid; - int ifindex; - enum addr_type_t type; -}; - static int inet6_fill_ifaddr(struct sk_buff *skb, const struct inet6_ifaddr *ifa, struct inet6_fill_args *args) @@ -5221,15 +5205,16 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, return -EMSGSIZE; } -static int inet6_fill_ifmcaddr(struct sk_buff *skb, - const struct ifmcaddr6 *ifmca, - struct inet6_fill_args *args) +int inet6_fill_ifmcaddr(struct sk_buff *skb, + const struct ifmcaddr6 *ifmca, + struct inet6_fill_args *args) { int ifindex = ifmca->idev->dev->ifindex; u8 scope = RT_SCOPE_UNIVERSE; struct nlmsghdr *nlh; - if (ipv6_addr_scope(&ifmca->mca_addr) & IFA_SITE) + if (!args->force_rt_scope_universe && + ipv6_addr_scope(&ifmca->mca_addr) & IFA_SITE) scope = RT_SCOPE_SITE; nlh = nlmsg_put(skb, args->portid, args->seq, args->event, @@ -5254,6 +5239,7 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, nlmsg_end(skb, nlh); return 0; } +EXPORT_SYMBOL(inet6_fill_ifmcaddr); static int inet6_fill_ifacaddr(struct sk_buff *skb, const struct ifacaddr6 *ifaca, @@ -5418,6 +5404,7 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, .flags = NLM_F_MULTI, .netnsid = -1, .type = type, + .force_rt_scope_universe = false, }; struct { unsigned long ifindex; @@ -5546,6 +5533,7 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh, .event = RTM_NEWADDR, .flags = 0, .netnsid = -1, + .force_rt_scope_universe = false, }; struct ifaddrmsg *ifm; struct nlattr *tb[IFA_MAX+1]; @@ -5617,6 +5605,7 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) .event = event, .flags = 0, .netnsid = -1, + .force_rt_scope_universe = false, }; int err = -ENOBUFS; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 5ca8692d565d5..587831c148dee 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -33,8 +33,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -47,6 +49,7 @@ #include #include +#include #include #include @@ -901,6 +904,39 @@ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev, return mc; } +static void inet6_ifmcaddr_notify(struct net_device *dev, + const struct ifmcaddr6 *ifmca, int event) +{ + struct inet6_fill_args fillargs = { + .portid = 0, + .seq = 0, + .event = event, + .flags = 0, + .netnsid = -1, + .force_rt_scope_universe = true, + }; + struct net *net = dev_net(dev); + struct sk_buff *skb; + int err = -ENOMEM; + + skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + + nla_total_size(16), GFP_ATOMIC); + if (!skb) + goto error; + + err = inet6_fill_ifmcaddr(skb, ifmca, &fillargs); + if (err < 0) { + WARN_ON_ONCE(err == -EMSGSIZE); + nlmsg_free(skb); + goto error; + } + + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MCADDR, NULL, GFP_ATOMIC); + return; +error: + rtnl_set_sk_err(net, RTNLGRP_IPV6_MCADDR, err); +} + /* * device multicast group inc (add if not found) */ @@ -948,6 +984,7 @@ static int __ipv6_dev_mc_inc(struct net_device *dev, mld_del_delrec(idev, mc); igmp6_group_added(mc); + inet6_ifmcaddr_notify(dev, mc, RTM_NEWMULTICAST); mutex_unlock(&idev->mc_lock); ma_put(mc); return 0; @@ -977,6 +1014,8 @@ int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr) *map = ma->next; igmp6_group_dropped(ma); + inet6_ifmcaddr_notify(idev->dev, ma, + RTM_DELMULTICAST); ip6_mc_clear_src(ma); mutex_unlock(&idev->mc_lock); From 2b33eb8f1b3e8c2f87cfdbc8cc117f6bdfabc6ec Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 11 Dec 2024 17:21:16 +0800 Subject: [PATCH 0505/1450] net/smc: protect link down work from execute after lgr freed link down work may be scheduled before lgr freed but execute after lgr freed, which may result in crash. So it is need to hold a reference before shedule link down work, and put the reference after work executed or canceled. The relevant crash call stack as follows: list_del corruption. prev->next should be ffffb638c9c0fe20, but was 0000000000000000 ------------[ cut here ]------------ kernel BUG at lib/list_debug.c:51! invalid opcode: 0000 [#1] SMP NOPTI CPU: 6 PID: 978112 Comm: kworker/6:119 Kdump: loaded Tainted: G #1 Hardware name: Alibaba Cloud Alibaba Cloud ECS, BIOS 2221b89 04/01/2014 Workqueue: events smc_link_down_work [smc] RIP: 0010:__list_del_entry_valid.cold+0x31/0x47 RSP: 0018:ffffb638c9c0fdd8 EFLAGS: 00010086 RAX: 0000000000000054 RBX: ffff942fb75e5128 RCX: 0000000000000000 RDX: ffff943520930aa0 RSI: ffff94352091fc80 RDI: ffff94352091fc80 RBP: 0000000000000000 R08: 0000000000000000 R09: ffffb638c9c0fc38 R10: ffffb638c9c0fc30 R11: ffffffffa015eb28 R12: 0000000000000002 R13: ffffb638c9c0fe20 R14: 0000000000000001 R15: ffff942f9cd051c0 FS: 0000000000000000(0000) GS:ffff943520900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f4f25214000 CR3: 000000025fbae004 CR4: 00000000007706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: rwsem_down_write_slowpath+0x17e/0x470 smc_link_down_work+0x3c/0x60 [smc] process_one_work+0x1ac/0x350 worker_thread+0x49/0x2f0 ? rescuer_thread+0x360/0x360 kthread+0x118/0x140 ? __kthread_bind_mask+0x60/0x60 ret_from_fork+0x1f/0x30 Fixes: 541afa10c126 ("net/smc: add smcr_port_err() and smcr_link_down() processing") Signed-off-by: Guangguan Wang Reviewed-by: Tony Lu Signed-off-by: David S. Miller --- net/smc/smc_core.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 500952c2e67ba..3b125d348b4ae 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1818,7 +1818,9 @@ void smcr_link_down_cond_sched(struct smc_link *lnk) { if (smc_link_downing(&lnk->state)) { trace_smcr_link_down(lnk, __builtin_return_address(0)); - schedule_work(&lnk->link_down_wrk); + smcr_link_hold(lnk); /* smcr_link_put in link_down_wrk */ + if (!schedule_work(&lnk->link_down_wrk)) + smcr_link_put(lnk); } } @@ -1850,11 +1852,14 @@ static void smc_link_down_work(struct work_struct *work) struct smc_link_group *lgr = link->lgr; if (list_empty(&lgr->list)) - return; + goto out; wake_up_all(&lgr->llc_msg_waiter); down_write(&lgr->llc_conf_mutex); smcr_link_down(link); up_write(&lgr->llc_conf_mutex); + +out: + smcr_link_put(link); /* smcr_link_hold by schedulers of link_down_work */ } static int smc_vlan_by_tcpsk_walk(struct net_device *lower_dev, From 679e9ddcf90dbdf98aaaa71a492454654b627bcb Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 11 Dec 2024 17:21:17 +0800 Subject: [PATCH 0506/1450] net/smc: check sndbuf_space again after NOSPACE flag is set in smc_poll When application sending data more than sndbuf_space, there have chances application will sleep in epoll_wait, and will never be wakeup again. This is caused by a race between smc_poll and smc_cdc_tx_handler. application tasklet smc_tx_sendmsg(len > sndbuf_space) | epoll_wait for EPOLL_OUT,timeout=0 | smc_poll | if (!smc->conn.sndbuf_space) | | smc_cdc_tx_handler | atomic_add sndbuf_space | smc_tx_sndbuf_nonfull | if (!test_bit SOCK_NOSPACE) | do not sk_write_space; set_bit SOCK_NOSPACE; | return mask=0; | Application will sleep in epoll_wait as smc_poll returns 0. And smc_cdc_tx_handler will not call sk_write_space because the SOCK_NOSPACE has not be set. If there is no inflight cdc msg, sk_write_space will not be called any more, and application will sleep in epoll_wait forever. So check sndbuf_space again after NOSPACE flag is set to break the race. Fixes: 8dce2786a290 ("net/smc: smc_poll improvements") Signed-off-by: Guangguan Wang Suggested-by: Paolo Abeni Signed-off-by: David S. Miller --- net/smc/af_smc.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 9e6c69d18581c..92448f2c362cb 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2881,6 +2881,13 @@ __poll_t smc_poll(struct file *file, struct socket *sock, } else { sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + + if (sk->sk_state != SMC_INIT) { + /* Race breaker the same way as tcp_poll(). */ + smp_mb__after_atomic(); + if (atomic_read(&smc->conn.sndbuf_space)) + mask |= EPOLLOUT | EPOLLWRNORM; + } } if (atomic_read(&smc->conn.bytes_to_rcv)) mask |= EPOLLIN | EPOLLRDNORM; From a29e220d3c8edbf0e1beb0f028878a4a85966556 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 11 Dec 2024 17:21:18 +0800 Subject: [PATCH 0507/1450] net/smc: check iparea_offset and ipv6_prefixes_cnt when receiving proposal msg When receiving proposal msg in server, the field iparea_offset and the field ipv6_prefixes_cnt in proposal msg are from the remote client and can not be fully trusted. Especially the field iparea_offset, once exceed the max value, there has the chance to access wrong address, and crash may happen. This patch checks iparea_offset and ipv6_prefixes_cnt before using them. Fixes: e7b7a64a8493 ("smc: support variable CLC proposal messages") Signed-off-by: Guangguan Wang Reviewed-by: Wen Gu Reviewed-by: D. Wythe Signed-off-by: David S. Miller --- net/smc/af_smc.c | 6 +++++- net/smc/smc_clc.c | 4 ++++ net/smc/smc_clc.h | 6 +++++- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 92448f2c362cb..9a74c9693f09e 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2032,6 +2032,8 @@ static int smc_listen_prfx_check(struct smc_sock *new_smc, if (pclc->hdr.typev1 == SMC_TYPE_N) return 0; pclc_prfx = smc_clc_proposal_get_prefix(pclc); + if (!pclc_prfx) + return -EPROTO; if (smc_clc_prfx_match(newclcsock, pclc_prfx)) return SMC_CLC_DECL_DIFFPREFIX; @@ -2221,7 +2223,9 @@ static void smc_find_ism_v1_device_serv(struct smc_sock *new_smc, int rc = 0; /* check if ISM V1 is available */ - if (!(ini->smcd_version & SMC_V1) || !smcd_indicated(ini->smc_type_v1)) + if (!(ini->smcd_version & SMC_V1) || + !smcd_indicated(ini->smc_type_v1) || + !pclc_smcd) goto not_found; ini->is_smcd = true; /* prepare ISM check */ ini->ism_peer_gid[0].gid = ntohll(pclc_smcd->ism.gid); diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 33fa787c28ebb..66a43b97eede0 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -354,6 +354,10 @@ static bool smc_clc_msg_prop_valid(struct smc_clc_msg_proposal *pclc) v2_ext = smc_get_clc_v2_ext(pclc); pclc_prfx = smc_clc_proposal_get_prefix(pclc); + if (!pclc_prfx || + pclc_prfx->ipv6_prefixes_cnt > SMC_CLC_MAX_V6_PREFIX) + return false; + if (hdr->version == SMC_V1) { if (hdr->typev1 == SMC_TYPE_N) return false; diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 5fd6f5b8ef034..ac8de6a177fac 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -336,8 +336,12 @@ struct smc_clc_msg_decline_v2 { /* clc decline message */ static inline struct smc_clc_msg_proposal_prefix * smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc) { + u16 offset = ntohs(pclc->iparea_offset); + + if (offset > sizeof(struct smc_clc_msg_smcd)) + return NULL; return (struct smc_clc_msg_proposal_prefix *) - ((u8 *)pclc + sizeof(*pclc) + ntohs(pclc->iparea_offset)); + ((u8 *)pclc + sizeof(*pclc) + offset); } static inline bool smcr_indicated(int smc_type) From 7863c9f3d24ba49dbead7e03dfbe40deb5888fdf Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 11 Dec 2024 17:21:19 +0800 Subject: [PATCH 0508/1450] net/smc: check v2_ext_offset/eid_cnt/ism_gid_cnt when receiving proposal msg When receiving proposal msg in server, the fields v2_ext_offset/ eid_cnt/ism_gid_cnt in proposal msg are from the remote client and can not be fully trusted. Especially the field v2_ext_offset, once exceed the max value, there has the chance to access wrong address, and crash may happen. This patch checks the fields v2_ext_offset/eid_cnt/ism_gid_cnt before using them. Fixes: 8c3dca341aea ("net/smc: build and send V2 CLC proposal") Signed-off-by: Guangguan Wang Reviewed-by: Wen Gu Reviewed-by: D. Wythe Signed-off-by: David S. Miller --- net/smc/af_smc.c | 3 ++- net/smc/smc_clc.c | 8 +++++++- net/smc/smc_clc.h | 8 +++++++- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 9a74c9693f09e..5d96f9de5b5df 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2276,7 +2276,8 @@ static void smc_find_rdma_v2_device_serv(struct smc_sock *new_smc, goto not_found; smc_v2_ext = smc_get_clc_v2_ext(pclc); - if (!smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, NULL, NULL)) + if (!smc_v2_ext || + !smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, NULL, NULL)) goto not_found; /* prepare RDMA check */ diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 66a43b97eede0..f721d03efcbd7 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -352,7 +352,6 @@ static bool smc_clc_msg_prop_valid(struct smc_clc_msg_proposal *pclc) struct smc_clc_msg_hdr *hdr = &pclc->hdr; struct smc_clc_v2_extension *v2_ext; - v2_ext = smc_get_clc_v2_ext(pclc); pclc_prfx = smc_clc_proposal_get_prefix(pclc); if (!pclc_prfx || pclc_prfx->ipv6_prefixes_cnt > SMC_CLC_MAX_V6_PREFIX) @@ -369,6 +368,13 @@ static bool smc_clc_msg_prop_valid(struct smc_clc_msg_proposal *pclc) sizeof(struct smc_clc_msg_trail)) return false; } else { + v2_ext = smc_get_clc_v2_ext(pclc); + if ((hdr->typev2 != SMC_TYPE_N && + (!v2_ext || v2_ext->hdr.eid_cnt > SMC_CLC_MAX_UEID)) || + (smcd_indicated(hdr->typev2) && + v2_ext->hdr.ism_gid_cnt > SMCD_CLC_MAX_V2_GID_ENTRIES)) + return false; + if (ntohs(hdr->length) != sizeof(*pclc) + sizeof(struct smc_clc_msg_smcd) + diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index ac8de6a177fac..23afa4df862ed 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -380,8 +380,14 @@ static inline struct smc_clc_v2_extension * smc_get_clc_v2_ext(struct smc_clc_msg_proposal *prop) { struct smc_clc_msg_smcd *prop_smcd = smc_get_clc_msg_smcd(prop); + u16 max_offset; - if (!prop_smcd || !ntohs(prop_smcd->v2_ext_offset)) + max_offset = offsetof(struct smc_clc_msg_proposal_area, pclc_v2_ext) - + offsetof(struct smc_clc_msg_proposal_area, pclc_smcd) - + offsetofend(struct smc_clc_msg_smcd, v2_ext_offset); + + if (!prop_smcd || !ntohs(prop_smcd->v2_ext_offset) || + ntohs(prop_smcd->v2_ext_offset) > max_offset) return NULL; return (struct smc_clc_v2_extension *) From 9ab332deb671d8f7e66d82a2ff2b3f715bc3a4ad Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 11 Dec 2024 17:21:20 +0800 Subject: [PATCH 0509/1450] net/smc: check smcd_v2_ext_offset when receiving proposal msg When receiving proposal msg in server, the field smcd_v2_ext_offset in proposal msg is from the remote client and can not be fully trusted. Once the value of smcd_v2_ext_offset exceed the max value, there has the chance to access wrong address, and crash may happen. This patch checks the value of smcd_v2_ext_offset before using it. Fixes: 5c21c4ccafe8 ("net/smc: determine accepted ISM devices") Signed-off-by: Guangguan Wang Reviewed-by: Wen Gu Reviewed-by: D. Wythe Signed-off-by: David S. Miller --- net/smc/af_smc.c | 2 ++ net/smc/smc_clc.h | 8 +++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 5d96f9de5b5df..6cc7b846cff1b 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2147,6 +2147,8 @@ static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, pclc_smcd = smc_get_clc_msg_smcd(pclc); smc_v2_ext = smc_get_clc_v2_ext(pclc); smcd_v2_ext = smc_get_clc_smcd_v2_ext(smc_v2_ext); + if (!pclc_smcd || !smc_v2_ext || !smcd_v2_ext) + goto not_found; mutex_lock(&smcd_dev_list.mutex); if (pclc_smcd->ism.chid) { diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 23afa4df862ed..7672899254108 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -400,9 +400,15 @@ smc_get_clc_v2_ext(struct smc_clc_msg_proposal *prop) static inline struct smc_clc_smcd_v2_extension * smc_get_clc_smcd_v2_ext(struct smc_clc_v2_extension *prop_v2ext) { + u16 max_offset = offsetof(struct smc_clc_msg_proposal_area, pclc_smcd_v2_ext) - + offsetof(struct smc_clc_msg_proposal_area, pclc_v2_ext) - + offsetof(struct smc_clc_v2_extension, hdr) - + offsetofend(struct smc_clnt_opts_area_hdr, smcd_v2_ext_offset); + if (!prop_v2ext) return NULL; - if (!ntohs(prop_v2ext->hdr.smcd_v2_ext_offset)) + if (!ntohs(prop_v2ext->hdr.smcd_v2_ext_offset) || + ntohs(prop_v2ext->hdr.smcd_v2_ext_offset) > max_offset) return NULL; return (struct smc_clc_smcd_v2_extension *) From c5b8ee5022a19464783058dc6042e8eefa34e8cd Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 11 Dec 2024 17:21:21 +0800 Subject: [PATCH 0510/1450] net/smc: check return value of sock_recvmsg when draining clc data When receiving clc msg, the field length in smc_clc_msg_hdr indicates the length of msg should be received from network and the value should not be fully trusted as it is from the network. Once the value of length exceeds the value of buflen in function smc_clc_wait_msg it may run into deadloop when trying to drain the remaining data exceeding buflen. This patch checks the return value of sock_recvmsg when draining data in case of deadloop in draining. Fixes: fb4f79264c0f ("net/smc: tolerate future SMCD versions") Signed-off-by: Guangguan Wang Reviewed-by: Wen Gu Reviewed-by: D. Wythe Signed-off-by: David S. Miller --- net/smc/smc_clc.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index f721d03efcbd7..521f5df80e10c 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -774,6 +774,11 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, SMC_CLC_RECV_BUF_LEN : datlen; iov_iter_kvec(&msg.msg_iter, ITER_DEST, &vec, 1, recvlen); len = sock_recvmsg(smc->clcsock, &msg, krflags); + if (len < recvlen) { + smc->sk.sk_err = EPROTO; + reason_code = -EPROTO; + goto out; + } datlen -= len; } if (clcm->type == SMC_CLC_DECLINE) { From a2d8af57452e60ff93a3525704788ad566433070 Mon Sep 17 00:00:00 2001 From: Dimitri Fedrau Date: Thu, 12 Dec 2024 09:44:06 +0100 Subject: [PATCH 0511/1450] dt-bindings: net: dp83822: Add support for GPIO2 clock output The GPIO2 pin on the DP83822 can be configured as clock output. Add binding to support this feature. Signed-off-by: Dimitri Fedrau Reviewed-by: Krzysztof Kozlowski Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- .../devicetree/bindings/net/ti,dp83822.yaml | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/Documentation/devicetree/bindings/net/ti,dp83822.yaml b/Documentation/devicetree/bindings/net/ti,dp83822.yaml index 784866ea392b2..50c24248df266 100644 --- a/Documentation/devicetree/bindings/net/ti,dp83822.yaml +++ b/Documentation/devicetree/bindings/net/ti,dp83822.yaml @@ -96,6 +96,32 @@ properties: - master - slave + ti,gpio2-clk-out: + description: | + DP83822 PHY only. + The GPIO2 pin on the DP83822 can be configured as clock output. When + omitted, the PHY's default will be left as is. + + - 'mac-if': In MII mode the clock frequency is 25-MHz, in RMII Mode the + clock frequency is 50-MHz and in RGMII Mode the clock frequency is + 25-MHz. + - 'xi': XI clock(pass-through clock from XI pin). + - 'int-ref': Internal reference clock 25-MHz. + - 'rmii-master-mode-ref': RMII master mode reference clock 50-MHz. RMII + master mode reference clock is identical to MAC IF clock in RMII master + mode. + - 'free-running': Free running clock 125-MHz. + - 'recovered': Recovered clock is a 125-MHz recovered clock from a + connected link partner. + $ref: /schemas/types.yaml#/definitions/string + enum: + - mac-if + - xi + - int-ref + - rmii-master-mode-ref + - free-running + - recovered + required: - reg @@ -110,6 +136,7 @@ examples: reg = <0>; rx-internal-delay-ps = <1>; tx-internal-delay-ps = <1>; + ti,gpio2-clk-out = "xi"; }; }; From 53e3b540952c14aa190233c173ba56d2987aa527 Mon Sep 17 00:00:00 2001 From: Dimitri Fedrau Date: Thu, 12 Dec 2024 09:44:07 +0100 Subject: [PATCH 0512/1450] net: phy: dp83822: Add support for GPIO2 clock output The GPIO2 pin on the DP83822 can be configured as clock output. Add support for configuration via DT. Signed-off-by: Dimitri Fedrau Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/dp83822.c | 48 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/drivers/net/phy/dp83822.c b/drivers/net/phy/dp83822.c index 25ee09c48027c..334c17a68edd7 100644 --- a/drivers/net/phy/dp83822.c +++ b/drivers/net/phy/dp83822.c @@ -30,6 +30,7 @@ #define MII_DP83822_FCSCR 0x14 #define MII_DP83822_RCSR 0x17 #define MII_DP83822_RESET_CTRL 0x1f +#define MII_DP83822_IOCTRL2 0x463 #define MII_DP83822_GENCFG 0x465 #define MII_DP83822_SOR1 0x467 @@ -104,6 +105,18 @@ #define DP83822_RX_CLK_SHIFT BIT(12) #define DP83822_TX_CLK_SHIFT BIT(11) +/* IOCTRL2 bits */ +#define DP83822_IOCTRL2_GPIO2_CLK_SRC GENMASK(6, 4) +#define DP83822_IOCTRL2_GPIO2_CTRL GENMASK(2, 0) +#define DP83822_IOCTRL2_GPIO2_CTRL_CLK_REF GENMASK(1, 0) + +#define DP83822_CLK_SRC_MAC_IF 0x0 +#define DP83822_CLK_SRC_XI 0x1 +#define DP83822_CLK_SRC_INT_REF 0x2 +#define DP83822_CLK_SRC_RMII_MASTER_MODE_REF 0x4 +#define DP83822_CLK_SRC_FREE_RUNNING 0x6 +#define DP83822_CLK_SRC_RECOVERED 0x7 + /* SOR1 mode */ #define DP83822_STRAP_MODE1 0 #define DP83822_STRAP_MODE2 BIT(0) @@ -139,6 +152,8 @@ struct dp83822_private { u8 cfg_dac_minus; u8 cfg_dac_plus; struct ethtool_wolinfo wol; + bool set_gpio2_clk_out; + u32 gpio2_clk_out; }; static int dp83822_config_wol(struct phy_device *phydev, @@ -413,6 +428,15 @@ static int dp83822_config_init(struct phy_device *phydev) int err = 0; int bmcr; + if (dp83822->set_gpio2_clk_out) + phy_modify_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_IOCTRL2, + DP83822_IOCTRL2_GPIO2_CTRL | + DP83822_IOCTRL2_GPIO2_CLK_SRC, + FIELD_PREP(DP83822_IOCTRL2_GPIO2_CTRL, + DP83822_IOCTRL2_GPIO2_CTRL_CLK_REF) | + FIELD_PREP(DP83822_IOCTRL2_GPIO2_CLK_SRC, + dp83822->gpio2_clk_out)); + if (phy_interface_is_rgmii(phydev)) { rx_int_delay = phy_get_internal_delay(phydev, dev, NULL, 0, true); @@ -611,6 +635,7 @@ static int dp83822_of_init(struct phy_device *phydev) { struct dp83822_private *dp83822 = phydev->priv; struct device *dev = &phydev->mdio.dev; + const char *of_val; /* Signal detection for the PHY is only enabled if the FX_EN and the * SD_EN pins are strapped. Signal detection can only enabled if FX_EN @@ -623,6 +648,29 @@ static int dp83822_of_init(struct phy_device *phydev) dp83822->fx_enabled = device_property_present(dev, "ti,fiber-mode"); + if (!device_property_read_string(dev, "ti,gpio2-clk-out", &of_val)) { + if (strcmp(of_val, "mac-if") == 0) { + dp83822->gpio2_clk_out = DP83822_CLK_SRC_MAC_IF; + } else if (strcmp(of_val, "xi") == 0) { + dp83822->gpio2_clk_out = DP83822_CLK_SRC_XI; + } else if (strcmp(of_val, "int-ref") == 0) { + dp83822->gpio2_clk_out = DP83822_CLK_SRC_INT_REF; + } else if (strcmp(of_val, "rmii-master-mode-ref") == 0) { + dp83822->gpio2_clk_out = DP83822_CLK_SRC_RMII_MASTER_MODE_REF; + } else if (strcmp(of_val, "free-running") == 0) { + dp83822->gpio2_clk_out = DP83822_CLK_SRC_FREE_RUNNING; + } else if (strcmp(of_val, "recovered") == 0) { + dp83822->gpio2_clk_out = DP83822_CLK_SRC_RECOVERED; + } else { + phydev_err(phydev, + "Invalid value for ti,gpio2-clk-out property (%s)\n", + of_val); + return -EINVAL; + } + + dp83822->set_gpio2_clk_out = true; + } + return 0; } From 2d5df3a680ffdaf606baa10636bdb1daf757832e Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 12 Dec 2024 18:55:45 +0200 Subject: [PATCH 0513/1450] net: mscc: ocelot: fix incorrect IFH SRC_PORT field in ocelot_ifh_set_basic() Packets injected by the CPU should have a SRC_PORT field equal to the CPU port module index in the Analyzer block (ocelot->num_phys_ports). The blamed commit copied the ocelot_ifh_set_basic() call incorrectly from ocelot_xmit_common() in net/dsa/tag_ocelot.c. Instead of calling with "x", it calls with BIT_ULL(x), but the field is not a port mask, but rather a single port index. [ side note: this is the technical debt of code duplication :( ] The error used to be silent and doesn't appear to have other user-visible manifestations, but with new changes in the packing library, it now fails loudly as follows: ------------[ cut here ]------------ Cannot store 0x40 inside bits 46-43 - will truncate sja1105 spi2.0: xmit timed out WARNING: CPU: 1 PID: 102 at lib/packing.c:98 __pack+0x90/0x198 sja1105 spi2.0: timed out polling for tstamp CPU: 1 UID: 0 PID: 102 Comm: felix_xmit Tainted: G W N 6.13.0-rc1-00372-gf706b85d972d-dirty #2605 Call trace: __pack+0x90/0x198 (P) __pack+0x90/0x198 (L) packing+0x78/0x98 ocelot_ifh_set_basic+0x260/0x368 ocelot_port_inject_frame+0xa8/0x250 felix_port_deferred_xmit+0x14c/0x258 kthread_worker_fn+0x134/0x350 kthread+0x114/0x138 The code path pertains to the ocelot switchdev driver and to the felix secondary DSA tag protocol, ocelot-8021q. Here seen with ocelot-8021q. The messenger (packing) is not really to blame, so fix the original commit instead. Fixes: e1b9e80236c5 ("net: mscc: ocelot: fix QoS class for injected packets with "ocelot-8021q"") Signed-off-by: Vladimir Oltean Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241212165546.879567-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mscc/ocelot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 3d72aa7b13050..ef93df5208871 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -1432,7 +1432,7 @@ void ocelot_ifh_set_basic(void *ifh, struct ocelot *ocelot, int port, memset(ifh, 0, OCELOT_TAG_LEN); ocelot_ifh_set_bypass(ifh, 1); - ocelot_ifh_set_src(ifh, BIT_ULL(ocelot->num_phys_ports)); + ocelot_ifh_set_src(ifh, ocelot->num_phys_ports); ocelot_ifh_set_dest(ifh, BIT_ULL(port)); ocelot_ifh_set_qos_class(ifh, qos_class); ocelot_ifh_set_tag_type(ifh, tag_type); From 329365dc46b8cedb9c4fd5cfb80b29cb85b84c39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20G=C3=BCntner?= Date: Thu, 12 Dec 2024 17:19:11 +0100 Subject: [PATCH 0514/1450] ipv4: output metric as unsigned int MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit adding a route metric greater than 0x7fff_ffff leads to an unintended wrap when printing the underlying u32 as an unsigned int (`%d`) thus incorrectly rendering the metric as negative. Formatting using `%u` corrects the issue. Signed-off-by: Maximilian Güntner Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241212161911.51598-1-code@mguentner.de Signed-off-by: Jakub Kicinski --- net/ipv4/fib_trie.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 161f5526b86c6..d6411ac810961 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -2999,7 +2999,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%s\t%08X\t%08X\t%04X\t%d\t%u\t" - "%d\t%08X\t%d\t%u\t%u", + "%u\t%08X\t%d\t%u\t%u", nhc->nhc_dev ? nhc->nhc_dev->name : "*", prefix, gw, flags, 0, 0, fi->fib_priority, @@ -3011,7 +3011,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) } else { seq_printf(seq, "*\t%08X\t%08X\t%04X\t%d\t%u\t" - "%d\t%08X\t%d\t%u\t%u", + "%u\t%08X\t%d\t%u\t%u", prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0); } From ee76746387f6233bdfa93d7406990f923641568f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 Dec 2024 17:25:18 +0000 Subject: [PATCH 0515/1450] netdevsim: prevent bad user input in nsim_dev_health_break_write() If either a zero count or a large one is provided, kernel can crash. Fixes: 82c93a87bf8b ("netdevsim: implement couple of testing devlink health reporters") Reported-by: syzbot+ea40e4294e58b0292f74@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/675c6862.050a0220.37aaf.00b1.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Jiri Pirko Reviewed-by: Joe Damato Link: https://patch.msgid.link/20241213172518.2415666-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/netdevsim/health.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/netdevsim/health.c b/drivers/net/netdevsim/health.c index 70e8bdf34be90..688f05316b5e1 100644 --- a/drivers/net/netdevsim/health.c +++ b/drivers/net/netdevsim/health.c @@ -149,6 +149,8 @@ static ssize_t nsim_dev_health_break_write(struct file *file, char *break_msg; int err; + if (count == 0 || count > PAGE_SIZE) + return -EINVAL; break_msg = memdup_user_nul(data, count); if (IS_ERR(break_msg)) return PTR_ERR(break_msg); From 663ad7481f068057f6f692c5368c47150e855370 Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Fri, 13 Dec 2024 13:07:11 +0000 Subject: [PATCH 0516/1450] tools/net/ynl: fix sub-message key lookup for nested attributes Use the correct attribute space for sub-message key lookup in nested attributes when adding attributes. This fixes rt_link where the "kind" key and "data" sub-message are nested attributes in "linkinfo". For example: ./tools/net/ynl/cli.py \ --create \ --spec Documentation/netlink/specs/rt_link.yaml \ --do newlink \ --json '{"link": 99, "linkinfo": { "kind": "vlan", "data": {"id": 4 } } }' Signed-off-by: Donald Hunter Fixes: ab463c4342d1 ("tools/net/ynl: Add support for encoding sub-messages") Link: https://patch.msgid.link/20241213130711.40267-1-donald.hunter@gmail.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/lib/ynl.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/lib/ynl.py index 01ec01a90e763..eea29359a899a 100644 --- a/tools/net/ynl/lib/ynl.py +++ b/tools/net/ynl/lib/ynl.py @@ -556,10 +556,10 @@ def _add_attr(self, space, name, value, search_attrs): if attr["type"] == 'nest': nl_type |= Netlink.NLA_F_NESTED attr_payload = b'' - sub_attrs = SpaceAttrs(self.attr_sets[space], value, search_attrs) + sub_space = attr['nested-attributes'] + sub_attrs = SpaceAttrs(self.attr_sets[sub_space], value, search_attrs) for subname, subvalue in value.items(): - attr_payload += self._add_attr(attr['nested-attributes'], - subname, subvalue, sub_attrs) + attr_payload += self._add_attr(sub_space, subname, subvalue, sub_attrs) elif attr["type"] == 'flag': if not value: # If value is absent or false then skip attribute creation. From 36e32b33d81152e1911a70750e0fe5c7621797ba Mon Sep 17 00:00:00 2001 From: Shay Agroskin Date: Thu, 12 Dec 2024 13:59:08 +0200 Subject: [PATCH 0517/1450] net: ena: Fix incorrect indentation The assignment was accidentally aligned to the string one line before. This was raised by the kernel bot. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202412101739.umNl7yYu-lkp@intel.com/ Signed-off-by: David Arinzon Signed-off-by: Shay Agroskin Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241212115910.2485851-1-shayagr@amazon.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index 63c8a2328142d..c1295dfad0d07 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -74,7 +74,7 @@ static void ena_tx_timeout(struct net_device *dev, unsigned int txqueue) if (threshold < time_since_last_napi && napi_scheduled) { netdev_err(dev, "napi handler hasn't been called for a long time but is scheduled\n"); - reset_reason = ENA_REGS_RESET_SUSPECTED_POLL_STARVATION; + reset_reason = ENA_REGS_RESET_SUSPECTED_POLL_STARVATION; } schedule_reset: /* Change the state of the device to trigger reset From ffcbfb5f9779637792547356a4fb8b0cbf645fa9 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 12 Dec 2024 16:08:34 +0200 Subject: [PATCH 0518/1450] net: phylink: improve phylink_sfp_config_phy() error message with missing PHY driver It seems that phylink does not support driving PHYs in SFP modules using the Generic PHY or Generic Clause 45 PHY driver. I've come to this conclusion after analyzing these facts: - sfp_sm_probe_phy(), who is our caller here, first calls phy_device_register() and then sfp_add_phy() -> ... -> phylink_sfp_connect_phy(). - phydev->supported is populated by phy_probe() - phy_probe() is usually called synchronously from phy_device_register() via phy_bus_match(), if a precise device driver is found for the PHY. In that case, phydev->supported has a good chance of being set to a non-zero mask. - There is an exceptional case for the PHYs for which phy_bus_match() didn't find a driver. Those devices sit for a while without a driver, then phy_attach_direct() force-binds the genphy_c45_driver or genphy_driver to them. Again, this triggers phy_probe() and renders a good chance of phydev->supported being populated, assuming compatibility with genphy_read_abilities() or genphy_c45_pma_read_abilities(). - phylink_sfp_config_phy() does not support the exceptional case of retrieving phydev->supported from the Generic PHY driver, due to its code flow. It expects the phydev->supported mask to already be non-empty, because it first calls phylink_validate() on it, and only calls phylink_attach_phy() if that succeeds. Thus, phylink_attach_phy() -> phy_attach_direct() has no chance of running. It is not my wish to change the state of affairs by altering the code flow, but merely to document the limitation rather than have the current unspecific error: [ 61.800079] mv88e6085 d0032004.mdio-mii:12 sfp: validation with support 00,00000000,00000000,00000000 failed: -EINVAL [ 61.820743] sfp sfp: sfp_add_phy failed: -EINVAL On the premise that an empty phydev->supported is going to make phylink_validate() fail anyway, and that this is caused by a missing PHY driver, it would be more informative to single out that case, undercut the entire phylink_sfp_config_phy() call, including phylink_validate(), and print a more specific message for this common gotcha: [ 37.076403] mv88e6085 d0032004.mdio-mii:12 sfp: PHY i2c:sfp:16 (id 0x01410cc2) has no driver loaded [ 37.089157] mv88e6085 d0032004.mdio-mii:12 sfp: Drivers which handle known common cases: CONFIG_BCM84881_PHY, CONFIG_MARVELL_PHY [ 37.108047] sfp sfp: sfp_add_phy failed: -EINVAL Link: https://lore.kernel.org/netdev/20241113144229.3ff4bgsalvj7spb7@skbuf/ Signed-off-by: Vladimir Oltean Reviewed-by: Russell King (Oracle) Link: https://patch.msgid.link/20241212140834.278894-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phylink.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 95fbc363f9a68..6d50c2fdb190e 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -3623,6 +3623,13 @@ static int phylink_sfp_connect_phy(void *upstream, struct phy_device *phy) { struct phylink *pl = upstream; + if (!phy->drv) { + phylink_err(pl, "PHY %s (id 0x%.8lx) has no driver loaded\n", + phydev_name(phy), (unsigned long)phy->phy_id); + phylink_err(pl, "Drivers which handle known common cases: CONFIG_BCM84881_PHY, CONFIG_MARVELL_PHY\n"); + return -EINVAL; + } + /* * This is the new way of dealing with flow control for PHYs, * as described by Timur Tabi in commit 529ed1275263 ("net: phy: From 0193eebbb1fcade01331f9d7cc24e57fc28a577d Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 12 Dec 2024 10:11:43 +0100 Subject: [PATCH 0519/1450] ethernet: Make OA_TC6 config symbol invisible Commit aa58bec064ab1622 ("net: ethernet: oa_tc6: implement register write operation") introduced a library that implements the OPEN Alliance TC6 10BASE-T1x MAC-PHY Serial Interface protocol for supporting 10BASE-T1x MAC-PHYs. There is no need to ask the user about enabling this library, as all drivers that use it select the OA_TC6 symbol. Hence make the symbol invisible, unless when compile-testing. Signed-off-by: Geert Uytterhoeven Reviewed-by: Simon Horman Link: https://patch.msgid.link/3b600550745af10ab7d7c3526353931c1d39f641.1733994552.git.geert+renesas@glider.be Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/Kconfig b/drivers/net/ethernet/Kconfig index 9a542e3c9b05d..977b42bc1e8c1 100644 --- a/drivers/net/ethernet/Kconfig +++ b/drivers/net/ethernet/Kconfig @@ -159,7 +159,7 @@ config ETHOC Say Y here if you want to use the OpenCores 10/100 Mbps Ethernet MAC. config OA_TC6 - tristate "OPEN Alliance TC6 10BASE-T1x MAC-PHY support" + tristate "OPEN Alliance TC6 10BASE-T1x MAC-PHY support" if COMPILE_TEST depends on SPI select PHYLIB help From a63bb695396641d91201b9226b09652c1a647ff4 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Thu, 12 Dec 2024 13:20:42 -0800 Subject: [PATCH 0520/1450] ionic: remove the unused nb_work Remove the empty and unused nb_work and associated ionic_lif_notify_work() function. v2: separated from previous net patch Link: https://lore.kernel.org/netdev/20241210174828.69525-2-shannon.nelson@amd.com/ Signed-off-by: Brett Creeley Signed-off-by: Shannon Nelson Reviewed-by: Jacob Keller Reviewed-by: Kalesh AP Link: https://patch.msgid.link/20241212212042.9348-1-shannon.nelson@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/pensando/ionic/ionic.h | 1 - drivers/net/ethernet/pensando/ionic/ionic_lif.c | 7 ------- 2 files changed, 8 deletions(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic.h b/drivers/net/ethernet/pensando/ionic/ionic.h index 0639bf56bd3af..04f00ea94230a 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic.h +++ b/drivers/net/ethernet/pensando/ionic/ionic.h @@ -57,7 +57,6 @@ struct ionic { DECLARE_BITMAP(intrs, IONIC_INTR_CTRL_REGS_MAX); cpumask_var_t *affinity_masks; struct delayed_work doorbell_check_dwork; - struct work_struct nb_work; struct notifier_block nb; struct rw_semaphore vf_op_lock; /* lock for VF operations */ struct ionic_vf *vfs; diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index 052c767a2c757..05fb46effe0df 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -3804,10 +3804,6 @@ int ionic_lif_init(struct ionic_lif *lif) return err; } -static void ionic_lif_notify_work(struct work_struct *ws) -{ -} - static void ionic_lif_set_netdev_info(struct ionic_lif *lif) { struct ionic_admin_ctx ctx = { @@ -3858,8 +3854,6 @@ int ionic_lif_register(struct ionic_lif *lif) ionic_lif_register_phc(lif); - INIT_WORK(&lif->ionic->nb_work, ionic_lif_notify_work); - lif->ionic->nb.notifier_call = ionic_lif_notify; err = register_netdevice_notifier(&lif->ionic->nb); @@ -3885,7 +3879,6 @@ void ionic_lif_unregister(struct ionic_lif *lif) { if (lif->ionic->nb.notifier_call) { unregister_netdevice_notifier(&lif->ionic->nb); - cancel_work_sync(&lif->ionic->nb_work); lif->ionic->nb.notifier_call = NULL; } From 410cd938511ff18a13bea39e1af80e4821dca14a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Dec 2024 09:32:18 +0100 Subject: [PATCH 0521/1450] octeontx2-af: fix build regression without CONFIG_DCB When DCB is disabled, the pfc_en struct member cannot be accessed: drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c: In function 'otx2_is_pfc_enabled': drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c:22:48: error: 'struct otx2_nic' has no member named 'pfc_en' 22 | return IS_ENABLED(CONFIG_DCB) && !!pfvf->pfc_en; | ^~ drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c: In function 'otx2_nix_config_bp': drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c:1755:33: error: 'IEEE_8021QAZ_MAX_TCS' undeclared (first use in this function) 1755 | req->chan_cnt = IEEE_8021QAZ_MAX_TCS; | ^~~~~~~~~~~~~~~~~~~~ Move the member out of the #ifdef block to avoid putting back another check in the source file and add the missing include file unconditionally. Fixes: a7ef63dbd588 ("octeontx2-af: Disable backpressure between CPT and NIX") Signed-off-by: Arnd Bergmann Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241213083228.2645757-1-arnd@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c | 1 + drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c index bf56888e7fe7d..2b49bfec78692 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include "otx2_reg.h" diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h index 44d737a0dd092..65814e3dc93f5 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h @@ -505,9 +505,9 @@ struct otx2_nic { /* Devlink */ struct otx2_devlink *dl; -#ifdef CONFIG_DCB /* PFC */ u8 pfc_en; +#ifdef CONFIG_DCB u8 *queue_to_pfc_map; u16 pfc_schq_list[NIX_TXSCH_LVL_CNT][MAX_TXSCHQ_PER_FUNC]; bool pfc_alloc_status[NIX_PF_PFC_PRIO_MAX]; From a35d00d5512accd337510fa4de756b743d331a87 Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Fri, 13 Dec 2024 11:08:27 +0000 Subject: [PATCH 0522/1450] netlink: specs: add uint, sint to netlink-raw schema Add uint, sint to the list of attr types in the netlink-raw schema. This fixes the rt_link spec which had a uint attr added in commit f858cc9eed5b ("net: add IFLA_MAX_PACING_OFFLOAD_HORIZON device attribute") Signed-off-by: Donald Hunter Link: https://patch.msgid.link/20241213110827.32250-1-donald.hunter@gmail.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/netlink-raw.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/netlink/netlink-raw.yaml b/Documentation/netlink/netlink-raw.yaml index 914aa1c0a2736..1b0772c8e333a 100644 --- a/Documentation/netlink/netlink-raw.yaml +++ b/Documentation/netlink/netlink-raw.yaml @@ -221,7 +221,7 @@ properties: type: &attr-type description: The netlink attribute type enum: [ unused, pad, flag, binary, bitfield32, - u8, u16, u32, u64, s8, s16, s32, s64, + uint, sint, u8, u16, u32, u64, s8, s16, s32, s64, string, nest, indexed-array, nest-type-value, sub-message ] doc: From 734ff310d38cfdc27a1b3eac9fa83ff754356ae7 Mon Sep 17 00:00:00 2001 From: Easwar Hariharan Date: Thu, 12 Dec 2024 17:33:01 +0000 Subject: [PATCH 0523/1450] gve: Convert timeouts to secs_to_jiffies() Commit b35108a51cf7 ("jiffies: Define secs_to_jiffies()") introduced secs_to_jiffies(). As the value here is a multiple of 1000, use secs_to_jiffies() instead of msecs_to_jiffies to avoid the multiplication. This is converted using scripts/coccinelle/misc/secs_to_jiffies.cocci with the following Coccinelle rules: @@ constant C; @@ - msecs_to_jiffies(C * 1000) + secs_to_jiffies(C) @@ constant C; @@ - msecs_to_jiffies(C * MSEC_PER_SEC) + secs_to_jiffies(C) Signed-off-by: Easwar Hariharan Reviewed-by: Praveen Kaligineedi Link: https://patch.msgid.link/20241212-netdev-converge-secs-to-jiffies-v4-1-6dac97a6d6ab@linux.microsoft.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/google/gve/gve_tx_dqo.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve_tx_dqo.c b/drivers/net/ethernet/google/gve/gve_tx_dqo.c index f879426cb5523..394debc62268a 100644 --- a/drivers/net/ethernet/google/gve/gve_tx_dqo.c +++ b/drivers/net/ethernet/google/gve/gve_tx_dqo.c @@ -1146,8 +1146,7 @@ static void gve_handle_miss_completion(struct gve_priv *priv, /* jiffies can wraparound but time comparisons can handle overflows. */ pending_packet->timeout_jiffies = jiffies + - msecs_to_jiffies(GVE_REINJECT_COMPL_TIMEOUT * - MSEC_PER_SEC); + secs_to_jiffies(GVE_REINJECT_COMPL_TIMEOUT); add_to_list(tx, &tx->dqo_compl.miss_completions, pending_packet); *bytes += pending_packet->skb->len; @@ -1191,8 +1190,7 @@ static void remove_miss_completions(struct gve_priv *priv, pending_packet->state = GVE_PACKET_STATE_TIMED_OUT_COMPL; pending_packet->timeout_jiffies = jiffies + - msecs_to_jiffies(GVE_DEALLOCATE_COMPL_TIMEOUT * - MSEC_PER_SEC); + secs_to_jiffies(GVE_DEALLOCATE_COMPL_TIMEOUT); /* Maintain pending packet in another list so the packet can be * unallocated at a later time. */ From dcacb364772eb463bde225176086bd7738b7102f Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 13 Dec 2024 20:09:11 +0800 Subject: [PATCH 0524/1450] net: wan: framer: Simplify API framer_provider_simple_of_xlate() implementation Simplify framer_provider_simple_of_xlate() implementation by API class_find_device_by_of_node(). Also correct comments to mark its parameter @dev as unused instead of @args in passing. Cc: Greg Kroah-Hartman Signed-off-by: Zijun Hu Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241213-net_fix-v2-1-6d06130d630f@quicinc.com Signed-off-by: Jakub Kicinski --- drivers/net/wan/framer/framer-core.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/drivers/net/wan/framer/framer-core.c b/drivers/net/wan/framer/framer-core.c index f547c22e26ac2..58f5143359dfd 100644 --- a/drivers/net/wan/framer/framer-core.c +++ b/drivers/net/wan/framer/framer-core.c @@ -732,8 +732,8 @@ EXPORT_SYMBOL_GPL(devm_framer_create); /** * framer_provider_simple_of_xlate() - returns the framer instance from framer provider - * @dev: the framer provider device - * @args: of_phandle_args (not used here) + * @dev: the framer provider device (not used here) + * @args: of_phandle_args * * Intended to be used by framer provider for the common case where #framer-cells is * 0. For other cases where #framer-cells is greater than '0', the framer provider @@ -743,21 +743,14 @@ EXPORT_SYMBOL_GPL(devm_framer_create); struct framer *framer_provider_simple_of_xlate(struct device *dev, const struct of_phandle_args *args) { - struct class_dev_iter iter; - struct framer *framer; - - class_dev_iter_init(&iter, &framer_class, NULL, NULL); - while ((dev = class_dev_iter_next(&iter))) { - framer = dev_to_framer(dev); - if (args->np != framer->dev.of_node) - continue; + struct device *target_dev; - class_dev_iter_exit(&iter); - return framer; - } + target_dev = class_find_device_by_of_node(&framer_class, args->np); + if (!target_dev) + return ERR_PTR(-ENODEV); - class_dev_iter_exit(&iter); - return ERR_PTR(-ENODEV); + put_device(target_dev); + return dev_to_framer(target_dev); } EXPORT_SYMBOL_GPL(framer_provider_simple_of_xlate); From e7b4083b90b7213902124d13fd1ed808360e32b1 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 13 Dec 2024 20:52:52 +0100 Subject: [PATCH 0525/1450] mptcp: add mptcp_userspace_pm_lookup_addr helper Like __lookup_addr() helper in pm_netlink.c, a new helper mptcp_userspace_pm_lookup_addr() is also defined in pm_userspace.c. It looks up the corresponding mptcp_pm_addr_entry address in userspace_pm_local_addr_list through the passed "addr" parameter and returns the found address entry. This helper can be used in mptcp_userspace_pm_delete_local_addr(), mptcp_userspace_pm_set_flags(), mptcp_userspace_pm_get_local_id() and mptcp_userspace_pm_is_backup() to simplify the code. Please note that with this change now list_for_each_entry() is used in mptcp_userspace_pm_append_new_local_addr(), not list_for_each_entry_safe(), but that's OK to do so because mptcp_userspace_pm_lookup_addr() only returns an entry from the list, the list hasn't been modified here. Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241213-net-next-mptcp-pm-misc-cleanup-v1-1-ddb6d00109a8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_userspace.c | 71 ++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 35 deletions(-) diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index e35178f5205fa..3664f3c1572e2 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -26,6 +26,19 @@ void mptcp_free_local_addr_list(struct mptcp_sock *msk) } } +static struct mptcp_pm_addr_entry * +mptcp_userspace_pm_lookup_addr(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr) +{ + struct mptcp_pm_addr_entry *entry; + + list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { + if (mptcp_addresses_equal(&entry->addr, addr, false)) + return entry; + } + return NULL; +} + static int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, struct mptcp_pm_addr_entry *entry, bool needs_id) @@ -90,22 +103,20 @@ static int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, static int mptcp_userspace_pm_delete_local_addr(struct mptcp_sock *msk, struct mptcp_pm_addr_entry *addr) { - struct mptcp_pm_addr_entry *entry, *tmp; struct sock *sk = (struct sock *)msk; + struct mptcp_pm_addr_entry *entry; - list_for_each_entry_safe(entry, tmp, &msk->pm.userspace_pm_local_addr_list, list) { - if (mptcp_addresses_equal(&entry->addr, &addr->addr, false)) { - /* TODO: a refcount is needed because the entry can - * be used multiple times (e.g. fullmesh mode). - */ - list_del_rcu(&entry->list); - sock_kfree_s(sk, entry, sizeof(*entry)); - msk->pm.local_addr_used--; - return 0; - } - } - - return -EINVAL; + entry = mptcp_userspace_pm_lookup_addr(msk, &addr->addr); + if (!entry) + return -EINVAL; + + /* TODO: a refcount is needed because the entry can + * be used multiple times (e.g. fullmesh mode). + */ + list_del_rcu(&entry->list); + sock_kfree_s(sk, entry, sizeof(*entry)); + msk->pm.local_addr_used--; + return 0; } static struct mptcp_pm_addr_entry * @@ -123,17 +134,12 @@ mptcp_userspace_pm_lookup_addr_by_id(struct mptcp_sock *msk, unsigned int id) int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc) { - struct mptcp_pm_addr_entry *entry = NULL, *e, new_entry; + struct mptcp_pm_addr_entry *entry = NULL, new_entry; __be16 msk_sport = ((struct inet_sock *) inet_sk((struct sock *)msk))->inet_sport; spin_lock_bh(&msk->pm.lock); - list_for_each_entry(e, &msk->pm.userspace_pm_local_addr_list, list) { - if (mptcp_addresses_equal(&e->addr, skc, false)) { - entry = e; - break; - } - } + entry = mptcp_userspace_pm_lookup_addr(msk, skc); spin_unlock_bh(&msk->pm.lock); if (entry) return entry->addr.id; @@ -153,15 +159,11 @@ bool mptcp_userspace_pm_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc) { struct mptcp_pm_addr_entry *entry; - bool backup = false; + bool backup; spin_lock_bh(&msk->pm.lock); - list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { - if (mptcp_addresses_equal(&entry->addr, skc, false)) { - backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); - break; - } - } + entry = mptcp_userspace_pm_lookup_addr(msk, skc); + backup = entry && !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); spin_unlock_bh(&msk->pm.lock); return backup; @@ -606,13 +608,12 @@ int mptcp_userspace_pm_set_flags(struct sk_buff *skb, struct genl_info *info) bkup = 1; spin_lock_bh(&msk->pm.lock); - list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { - if (mptcp_addresses_equal(&entry->addr, &loc.addr, false)) { - if (bkup) - entry->flags |= MPTCP_PM_ADDR_FLAG_BACKUP; - else - entry->flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP; - } + entry = mptcp_userspace_pm_lookup_addr(msk, &loc.addr); + if (entry) { + if (bkup) + entry->flags |= MPTCP_PM_ADDR_FLAG_BACKUP; + else + entry->flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP; } spin_unlock_bh(&msk->pm.lock); From a28717d8414e965a3ce6c83f744aa1c70ac8722f Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 13 Dec 2024 20:52:53 +0100 Subject: [PATCH 0526/1450] mptcp: add mptcp_for_each_userspace_pm_addr macro Similar to mptcp_for_each_subflow() macro, this patch adds a new macro mptcp_for_each_userspace_pm_addr() for userspace PM to iterate over the address entries on the local address list userspace_pm_local_addr_list of the mptcp socket. This patch doesn't change the behaviour of the code, just refactoring. Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241213-net-next-mptcp-pm-misc-cleanup-v1-2-ddb6d00109a8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_userspace.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index 3664f3c1572e2..6a27fab238f15 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -8,6 +8,10 @@ #include "mib.h" #include "mptcp_pm_gen.h" +#define mptcp_for_each_userspace_pm_addr(__msk, __entry) \ + list_for_each_entry(__entry, \ + &((__msk)->pm.userspace_pm_local_addr_list), list) + void mptcp_free_local_addr_list(struct mptcp_sock *msk) { struct mptcp_pm_addr_entry *entry, *tmp; @@ -32,7 +36,7 @@ mptcp_userspace_pm_lookup_addr(struct mptcp_sock *msk, { struct mptcp_pm_addr_entry *entry; - list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { + mptcp_for_each_userspace_pm_addr(msk, entry) { if (mptcp_addresses_equal(&entry->addr, addr, false)) return entry; } @@ -54,7 +58,7 @@ static int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, bitmap_zero(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); spin_lock_bh(&msk->pm.lock); - list_for_each_entry(e, &msk->pm.userspace_pm_local_addr_list, list) { + mptcp_for_each_userspace_pm_addr(msk, e) { addr_match = mptcp_addresses_equal(&e->addr, &entry->addr, true); if (addr_match && entry->addr.id == 0 && needs_id) entry->addr.id = e->addr.id; @@ -124,7 +128,7 @@ mptcp_userspace_pm_lookup_addr_by_id(struct mptcp_sock *msk, unsigned int id) { struct mptcp_pm_addr_entry *entry; - list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { + mptcp_for_each_userspace_pm_addr(msk, entry) { if (entry->addr.id == id) return entry; } @@ -659,7 +663,7 @@ int mptcp_userspace_pm_dump_addr(struct sk_buff *msg, lock_sock(sk); spin_lock_bh(&msk->pm.lock); - list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { + mptcp_for_each_userspace_pm_addr(msk, entry) { if (test_bit(entry->addr.id, bitmap->map)) continue; From 6a389c8ceeb75cf3c523ebf652a90958267c7b13 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 13 Dec 2024 20:52:54 +0100 Subject: [PATCH 0527/1450] mptcp: add mptcp_userspace_pm_get_sock helper Each userspace pm netlink function uses nla_get_u32() to get the msk token value, then pass it to mptcp_token_get_sock() to get the msk. Finally check whether userspace PM is selected on this msk. It makes sense to wrap them into a helper, named mptcp_userspace_pm_get_sock(), to do this. This patch doesn't change the behaviour of the code, just refactoring. Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241213-net-next-mptcp-pm-misc-cleanup-v1-3-ddb6d00109a8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_userspace.c | 144 +++++++++++++-------------------------- 1 file changed, 47 insertions(+), 97 deletions(-) diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index 6a27fab238f15..afb04343e74d2 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -173,36 +173,50 @@ bool mptcp_userspace_pm_is_backup(struct mptcp_sock *msk, return backup; } -int mptcp_pm_nl_announce_doit(struct sk_buff *skb, struct genl_info *info) +static struct mptcp_sock *mptcp_userspace_pm_get_sock(const struct genl_info *info) { struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; + struct mptcp_sock *msk; + + if (!token) { + GENL_SET_ERR_MSG(info, "missing required token"); + return NULL; + } + + msk = mptcp_token_get_sock(genl_info_net(info), nla_get_u32(token)); + if (!msk) { + NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + return NULL; + } + + if (!mptcp_pm_is_userspace(msk)) { + GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); + sock_put((struct sock *)msk); + return NULL; + } + + return msk; +} + +int mptcp_pm_nl_announce_doit(struct sk_buff *skb, struct genl_info *info) +{ struct nlattr *addr = info->attrs[MPTCP_PM_ATTR_ADDR]; struct mptcp_pm_addr_entry addr_val; struct mptcp_sock *msk; int err = -EINVAL; struct sock *sk; - u32 token_val; - if (!addr || !token) { - GENL_SET_ERR_MSG(info, "missing required inputs"); + if (!addr) { + GENL_SET_ERR_MSG(info, "missing required address"); return err; } - token_val = nla_get_u32(token); - - msk = mptcp_token_get_sock(sock_net(skb->sk), token_val); - if (!msk) { - NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + msk = mptcp_userspace_pm_get_sock(info); + if (!msk) return err; - } sk = (struct sock *)msk; - if (!mptcp_pm_is_userspace(msk)) { - GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); - goto announce_err; - } - err = mptcp_pm_parse_entry(addr, info, true, &addr_val); if (err < 0) { GENL_SET_ERR_MSG(info, "error parsing local address"); @@ -275,7 +289,6 @@ static int mptcp_userspace_pm_remove_id_zero_address(struct mptcp_sock *msk, int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info) { - struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; struct nlattr *id = info->attrs[MPTCP_PM_ATTR_LOC_ID]; struct mptcp_pm_addr_entry *match; struct mptcp_pm_addr_entry *entry; @@ -283,30 +296,21 @@ int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info) LIST_HEAD(free_list); int err = -EINVAL; struct sock *sk; - u32 token_val; u8 id_val; - if (!id || !token) { - GENL_SET_ERR_MSG(info, "missing required inputs"); + if (!id) { + GENL_SET_ERR_MSG(info, "missing required ID"); return err; } id_val = nla_get_u8(id); - token_val = nla_get_u32(token); - msk = mptcp_token_get_sock(sock_net(skb->sk), token_val); - if (!msk) { - NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + msk = mptcp_userspace_pm_get_sock(info); + if (!msk) return err; - } sk = (struct sock *)msk; - if (!mptcp_pm_is_userspace(msk)) { - GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); - goto out; - } - if (id_val == 0) { err = mptcp_userspace_pm_remove_id_zero_address(msk, info); goto out; @@ -343,7 +347,6 @@ int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info) int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info) { struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; - struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR]; struct mptcp_pm_addr_entry entry = { 0 }; struct mptcp_addr_info addr_r; @@ -351,28 +354,18 @@ int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info) struct mptcp_sock *msk; int err = -EINVAL; struct sock *sk; - u32 token_val; - if (!laddr || !raddr || !token) { - GENL_SET_ERR_MSG(info, "missing required inputs"); + if (!laddr || !raddr) { + GENL_SET_ERR_MSG(info, "missing required address(es)"); return err; } - token_val = nla_get_u32(token); - - msk = mptcp_token_get_sock(genl_info_net(info), token_val); - if (!msk) { - NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + msk = mptcp_userspace_pm_get_sock(info); + if (!msk) return err; - } sk = (struct sock *)msk; - if (!mptcp_pm_is_userspace(msk)) { - GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); - goto create_err; - } - err = mptcp_pm_parse_entry(laddr, info, true, &entry); if (err < 0) { NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr"); @@ -475,35 +468,24 @@ static struct sock *mptcp_nl_find_ssk(struct mptcp_sock *msk, int mptcp_pm_nl_subflow_destroy_doit(struct sk_buff *skb, struct genl_info *info) { struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; - struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR]; struct mptcp_addr_info addr_l; struct mptcp_addr_info addr_r; struct mptcp_sock *msk; struct sock *sk, *ssk; int err = -EINVAL; - u32 token_val; - if (!laddr || !raddr || !token) { - GENL_SET_ERR_MSG(info, "missing required inputs"); + if (!laddr || !raddr) { + GENL_SET_ERR_MSG(info, "missing required address(es)"); return err; } - token_val = nla_get_u32(token); - - msk = mptcp_token_get_sock(genl_info_net(info), token_val); - if (!msk) { - NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + msk = mptcp_userspace_pm_get_sock(info); + if (!msk) return err; - } sk = (struct sock *)msk; - if (!mptcp_pm_is_userspace(msk)) { - GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); - goto destroy_err; - } - err = mptcp_pm_parse_addr(laddr, info, &addr_l); if (err < 0) { NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr"); @@ -566,31 +548,19 @@ int mptcp_userspace_pm_set_flags(struct sk_buff *skb, struct genl_info *info) struct mptcp_pm_addr_entry loc = { .addr = { .family = AF_UNSPEC }, }; struct mptcp_pm_addr_entry rem = { .addr = { .family = AF_UNSPEC }, }; struct nlattr *attr_rem = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; - struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; - struct net *net = sock_net(skb->sk); struct mptcp_pm_addr_entry *entry; struct mptcp_sock *msk; int ret = -EINVAL; struct sock *sk; - u32 token_val; u8 bkup = 0; - token_val = nla_get_u32(token); - - msk = mptcp_token_get_sock(net, token_val); - if (!msk) { - NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + msk = mptcp_userspace_pm_get_sock(info); + if (!msk) return ret; - } sk = (struct sock *)msk; - if (!mptcp_pm_is_userspace(msk)) { - GENL_SET_ERR_MSG(info, "userspace PM not selected"); - goto set_flags_err; - } - ret = mptcp_pm_parse_entry(attr, info, false, &loc); if (ret < 0) goto set_flags_err; @@ -637,30 +607,20 @@ int mptcp_userspace_pm_dump_addr(struct sk_buff *msg, DECLARE_BITMAP(map, MPTCP_PM_MAX_ADDR_ID + 1); } *bitmap; const struct genl_info *info = genl_info_dump(cb); - struct net *net = sock_net(msg->sk); struct mptcp_pm_addr_entry *entry; struct mptcp_sock *msk; - struct nlattr *token; int ret = -EINVAL; struct sock *sk; void *hdr; bitmap = (struct id_bitmap *)cb->ctx; - token = info->attrs[MPTCP_PM_ATTR_TOKEN]; - msk = mptcp_token_get_sock(net, nla_get_u32(token)); - if (!msk) { - NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + msk = mptcp_userspace_pm_get_sock(info); + if (!msk) return ret; - } sk = (struct sock *)msk; - if (!mptcp_pm_is_userspace(msk)) { - GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); - goto out; - } - lock_sock(sk); spin_lock_bh(&msk->pm.lock); mptcp_for_each_userspace_pm_addr(msk, entry) { @@ -685,7 +645,6 @@ int mptcp_userspace_pm_dump_addr(struct sk_buff *msg, release_sock(sk); ret = msg->len; -out: sock_put(sk); return ret; } @@ -694,28 +653,19 @@ int mptcp_userspace_pm_get_addr(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; - struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; struct mptcp_pm_addr_entry addr, *entry; - struct net *net = sock_net(skb->sk); struct mptcp_sock *msk; struct sk_buff *msg; int ret = -EINVAL; struct sock *sk; void *reply; - msk = mptcp_token_get_sock(net, nla_get_u32(token)); - if (!msk) { - NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + msk = mptcp_userspace_pm_get_sock(info); + if (!msk) return ret; - } sk = (struct sock *)msk; - if (!mptcp_pm_is_userspace(msk)) { - GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); - goto out; - } - ret = mptcp_pm_parse_entry(attr, info, false, &addr); if (ret < 0) goto out; From 8008e77e07418a2a43235c2288430602b0d4c8da Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 13 Dec 2024 20:52:55 +0100 Subject: [PATCH 0528/1450] mptcp: move mptcp_pm_remove_addrs into pm_userspace Since mptcp_pm_remove_addrs() is only called from the userspace PM, this patch moves it into pm_userspace.c. For this, lookup_subflow_by_saddr() and remove_anno_list_by_saddr() helpers need to be exported in protocol.h. Also add "mptcp_" prefix for these helpers. Here, mptcp_pm_remove_addrs() is not changed to a static function because it will be used in BPF Path Manager. This patch doesn't change the behaviour of the code, just refactoring. Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241213-net-next-mptcp-pm-misc-cleanup-v1-4-ddb6d00109a8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 46 +++++++--------------------------------- net/mptcp/pm_userspace.c | 28 ++++++++++++++++++++++++ net/mptcp/protocol.h | 4 ++++ 3 files changed, 40 insertions(+), 38 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 7a0f7998376a5..98ac73938bd81 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -107,8 +107,8 @@ static void remote_address(const struct sock_common *skc, #endif } -static bool lookup_subflow_by_saddr(const struct list_head *list, - const struct mptcp_addr_info *saddr) +bool mptcp_lookup_subflow_by_saddr(const struct list_head *list, + const struct mptcp_addr_info *saddr) { struct mptcp_subflow_context *subflow; struct mptcp_addr_info cur; @@ -1447,8 +1447,8 @@ int mptcp_pm_nl_add_addr_doit(struct sk_buff *skb, struct genl_info *info) return ret; } -static bool remove_anno_list_by_saddr(struct mptcp_sock *msk, - const struct mptcp_addr_info *addr) +bool mptcp_remove_anno_list_by_saddr(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr) { struct mptcp_pm_add_entry *entry; @@ -1476,7 +1476,7 @@ static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk, list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr); - ret = remove_anno_list_by_saddr(msk, addr); + ret = mptcp_remove_anno_list_by_saddr(msk, addr); if (ret || force) { spin_lock_bh(&msk->pm.lock); if (ret) { @@ -1520,7 +1520,7 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, } lock_sock(sk); - remove_subflow = lookup_subflow_by_saddr(&msk->conn_list, addr); + remove_subflow = mptcp_lookup_subflow_by_saddr(&msk->conn_list, addr); mptcp_pm_remove_anno_addr(msk, addr, remove_subflow && !(entry->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)); @@ -1633,36 +1633,6 @@ int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info) return ret; } -/* Called from the userspace PM only */ -void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list) -{ - struct mptcp_rm_list alist = { .nr = 0 }; - struct mptcp_pm_addr_entry *entry; - int anno_nr = 0; - - list_for_each_entry(entry, rm_list, list) { - if (alist.nr >= MPTCP_RM_IDS_MAX) - break; - - /* only delete if either announced or matching a subflow */ - if (remove_anno_list_by_saddr(msk, &entry->addr)) - anno_nr++; - else if (!lookup_subflow_by_saddr(&msk->conn_list, - &entry->addr)) - continue; - - alist.ids[alist.nr++] = entry->addr.id; - } - - if (alist.nr) { - spin_lock_bh(&msk->pm.lock); - msk->pm.add_addr_signaled -= anno_nr; - mptcp_pm_remove_addr(msk, &alist); - spin_unlock_bh(&msk->pm.lock); - } -} - -/* Called from the in-kernel PM only */ static void mptcp_pm_flush_addrs_and_subflows(struct mptcp_sock *msk, struct list_head *rm_list) { @@ -1671,11 +1641,11 @@ static void mptcp_pm_flush_addrs_and_subflows(struct mptcp_sock *msk, list_for_each_entry(entry, rm_list, list) { if (slist.nr < MPTCP_RM_IDS_MAX && - lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) + mptcp_lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) slist.ids[slist.nr++] = mptcp_endp_get_local_id(msk, &entry->addr); if (alist.nr < MPTCP_RM_IDS_MAX && - remove_anno_list_by_saddr(msk, &entry->addr)) + mptcp_remove_anno_list_by_saddr(msk, &entry->addr)) alist.ids[alist.nr++] = mptcp_endp_get_local_id(msk, &entry->addr); } diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index afb04343e74d2..cac4b4a7b1e58 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -287,6 +287,34 @@ static int mptcp_userspace_pm_remove_id_zero_address(struct mptcp_sock *msk, return err; } +void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list) +{ + struct mptcp_rm_list alist = { .nr = 0 }; + struct mptcp_pm_addr_entry *entry; + int anno_nr = 0; + + list_for_each_entry(entry, rm_list, list) { + if (alist.nr >= MPTCP_RM_IDS_MAX) + break; + + /* only delete if either announced or matching a subflow */ + if (mptcp_remove_anno_list_by_saddr(msk, &entry->addr)) + anno_nr++; + else if (!mptcp_lookup_subflow_by_saddr(&msk->conn_list, + &entry->addr)) + continue; + + alist.ids[alist.nr++] = entry->addr.id; + } + + if (alist.nr) { + spin_lock_bh(&msk->pm.lock); + msk->pm.add_addr_signaled -= anno_nr; + mptcp_pm_remove_addr(msk, &alist); + spin_unlock_bh(&msk->pm.lock); + } +} + int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info) { struct nlattr *id = info->attrs[MPTCP_PM_ATTR_LOC_ID]; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index a93e661ef5c43..5ba67cb601e02 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1027,6 +1027,10 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk, struct mptcp_pm_add_entry * mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk, const struct mptcp_addr_info *addr); +bool mptcp_lookup_subflow_by_saddr(const struct list_head *list, + const struct mptcp_addr_info *saddr); +bool mptcp_remove_anno_list_by_saddr(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr); int mptcp_pm_set_flags(struct sk_buff *skb, struct genl_info *info); int mptcp_pm_nl_set_flags(struct sk_buff *skb, struct genl_info *info); int mptcp_userspace_pm_set_flags(struct sk_buff *skb, struct genl_info *info); From 88d0973163711a5313ddd479a1ff543b5ac93d51 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 13 Dec 2024 20:52:56 +0100 Subject: [PATCH 0529/1450] mptcp: drop free_list for deleting entries mptcp_pm_remove_addrs() actually only deletes one address, which does not match its name. This patch renames it to mptcp_pm_remove_addr_entry() and changes the parameter "rm_list" to "entry". With the help of mptcp_pm_remove_addr_entry(), it's no longer necessary to move the entry to be deleted to free_list and then traverse the list to delete the entry, which is not allowed in BPF. The entry can be directly deleted through list_del_rcu() and sock_kfree_s() now. Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241213-net-next-mptcp-pm-misc-cleanup-v1-5-ddb6d00109a8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_userspace.c | 42 ++++++++++++++-------------------------- net/mptcp/protocol.h | 3 ++- 2 files changed, 17 insertions(+), 28 deletions(-) diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index cac4b4a7b1e58..7689ea987be35 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -287,41 +287,31 @@ static int mptcp_userspace_pm_remove_id_zero_address(struct mptcp_sock *msk, return err; } -void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list) +void mptcp_pm_remove_addr_entry(struct mptcp_sock *msk, + struct mptcp_pm_addr_entry *entry) { struct mptcp_rm_list alist = { .nr = 0 }; - struct mptcp_pm_addr_entry *entry; int anno_nr = 0; - list_for_each_entry(entry, rm_list, list) { - if (alist.nr >= MPTCP_RM_IDS_MAX) - break; - - /* only delete if either announced or matching a subflow */ - if (mptcp_remove_anno_list_by_saddr(msk, &entry->addr)) - anno_nr++; - else if (!mptcp_lookup_subflow_by_saddr(&msk->conn_list, - &entry->addr)) - continue; + /* only delete if either announced or matching a subflow */ + if (mptcp_remove_anno_list_by_saddr(msk, &entry->addr)) + anno_nr++; + else if (!mptcp_lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) + return; - alist.ids[alist.nr++] = entry->addr.id; - } + alist.ids[alist.nr++] = entry->addr.id; - if (alist.nr) { - spin_lock_bh(&msk->pm.lock); - msk->pm.add_addr_signaled -= anno_nr; - mptcp_pm_remove_addr(msk, &alist); - spin_unlock_bh(&msk->pm.lock); - } + spin_lock_bh(&msk->pm.lock); + msk->pm.add_addr_signaled -= anno_nr; + mptcp_pm_remove_addr(msk, &alist); + spin_unlock_bh(&msk->pm.lock); } int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info) { struct nlattr *id = info->attrs[MPTCP_PM_ATTR_LOC_ID]; struct mptcp_pm_addr_entry *match; - struct mptcp_pm_addr_entry *entry; struct mptcp_sock *msk; - LIST_HEAD(free_list); int err = -EINVAL; struct sock *sk; u8 id_val; @@ -355,16 +345,14 @@ int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info) goto out; } - list_move(&match->list, &free_list); + list_del_rcu(&match->list); spin_unlock_bh(&msk->pm.lock); - mptcp_pm_remove_addrs(msk, &free_list); + mptcp_pm_remove_addr_entry(msk, match); release_sock(sk); - list_for_each_entry_safe(match, entry, &free_list, list) { - sock_kfree_s(sk, match, sizeof(*match)); - } + sock_kfree_s(sk, match, sizeof(*match)); err = 0; out: diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 5ba67cb601e02..cd5132fe7d220 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1038,7 +1038,8 @@ int mptcp_pm_announce_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool echo); int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); -void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list); +void mptcp_pm_remove_addr_entry(struct mptcp_sock *msk, + struct mptcp_pm_addr_entry *entry); void mptcp_free_local_addr_list(struct mptcp_sock *msk); From 1c670b39cec7603893e7d0169578409dccf63e94 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 13 Dec 2024 20:52:57 +0100 Subject: [PATCH 0530/1450] mptcp: change local addr type of subflow_destroy Generally, in the path manager interfaces, the local address is defined as an mptcp_pm_addr_entry type address, while the remote address is defined as an mptcp_addr_info type one: (struct mptcp_pm_addr_entry *local, struct mptcp_addr_info *remote) But subflow_destroy() interface uses two mptcp_addr_info type parameters. This patch changes the first one to mptcp_pm_addr_entry type and use helper mptcp_pm_parse_entry() to parse it instead of using mptcp_pm_parse_addr(). This patch doesn't change the behaviour of the code, just refactoring. Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241213-net-next-mptcp-pm-misc-cleanup-v1-6-ddb6d00109a8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_userspace.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index 7689ea987be35..1d5b77e0a722d 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -485,7 +485,7 @@ int mptcp_pm_nl_subflow_destroy_doit(struct sk_buff *skb, struct genl_info *info { struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR]; - struct mptcp_addr_info addr_l; + struct mptcp_pm_addr_entry addr_l; struct mptcp_addr_info addr_r; struct mptcp_sock *msk; struct sock *sk, *ssk; @@ -502,7 +502,7 @@ int mptcp_pm_nl_subflow_destroy_doit(struct sk_buff *skb, struct genl_info *info sk = (struct sock *)msk; - err = mptcp_pm_parse_addr(laddr, info, &addr_l); + err = mptcp_pm_parse_entry(laddr, info, true, &addr_l); if (err < 0) { NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr"); goto destroy_err; @@ -515,35 +515,34 @@ int mptcp_pm_nl_subflow_destroy_doit(struct sk_buff *skb, struct genl_info *info } #if IS_ENABLED(CONFIG_MPTCP_IPV6) - if (addr_l.family == AF_INET && ipv6_addr_v4mapped(&addr_r.addr6)) { - ipv6_addr_set_v4mapped(addr_l.addr.s_addr, &addr_l.addr6); - addr_l.family = AF_INET6; + if (addr_l.addr.family == AF_INET && ipv6_addr_v4mapped(&addr_r.addr6)) { + ipv6_addr_set_v4mapped(addr_l.addr.addr.s_addr, &addr_l.addr.addr6); + addr_l.addr.family = AF_INET6; } - if (addr_r.family == AF_INET && ipv6_addr_v4mapped(&addr_l.addr6)) { - ipv6_addr_set_v4mapped(addr_r.addr.s_addr, &addr_r.addr6); + if (addr_r.family == AF_INET && ipv6_addr_v4mapped(&addr_l.addr.addr6)) { + ipv6_addr_set_v4mapped(addr_r.addr.s_addr, &addr_l.addr.addr6); addr_r.family = AF_INET6; } #endif - if (addr_l.family != addr_r.family) { + if (addr_l.addr.family != addr_r.family) { GENL_SET_ERR_MSG(info, "address families do not match"); err = -EINVAL; goto destroy_err; } - if (!addr_l.port || !addr_r.port) { + if (!addr_l.addr.port || !addr_r.port) { GENL_SET_ERR_MSG(info, "missing local or remote port"); err = -EINVAL; goto destroy_err; } lock_sock(sk); - ssk = mptcp_nl_find_ssk(msk, &addr_l, &addr_r); + ssk = mptcp_nl_find_ssk(msk, &addr_l.addr, &addr_r); if (ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); - struct mptcp_pm_addr_entry entry = { .addr = addr_l }; spin_lock_bh(&msk->pm.lock); - mptcp_userspace_pm_delete_local_addr(msk, &entry); + mptcp_userspace_pm_delete_local_addr(msk, &addr_l); spin_unlock_bh(&msk->pm.lock); mptcp_subflow_shutdown(sk, ssk, RCV_SHUTDOWN | SEND_SHUTDOWN); mptcp_close_ssk(sk, ssk, subflow); From 5409fd6fec680d59111708ba337b09c1d36db0a8 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 13 Dec 2024 20:52:58 +0100 Subject: [PATCH 0531/1450] mptcp: drop useless "err = 0" in subflow_destroy Upon successful return, mptcp_pm_parse_addr() returns 0. There is no need to set "err = 0" after this. So after mptcp_nl_find_ssk() returns, just need to set "err = -ESRCH", then release and free msk socket if it returns NULL. Also, no need to define the variable "subflow" in subflow_destroy(), use mptcp_subflow_ctx(ssk) directly. This patch doesn't change the behaviour of the code, just refactoring. Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241213-net-next-mptcp-pm-misc-cleanup-v1-7-ddb6d00109a8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_userspace.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index 1d5b77e0a722d..740a10d669f85 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -538,19 +538,18 @@ int mptcp_pm_nl_subflow_destroy_doit(struct sk_buff *skb, struct genl_info *info lock_sock(sk); ssk = mptcp_nl_find_ssk(msk, &addr_l.addr, &addr_r); - if (ssk) { - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); - - spin_lock_bh(&msk->pm.lock); - mptcp_userspace_pm_delete_local_addr(msk, &addr_l); - spin_unlock_bh(&msk->pm.lock); - mptcp_subflow_shutdown(sk, ssk, RCV_SHUTDOWN | SEND_SHUTDOWN); - mptcp_close_ssk(sk, ssk, subflow); - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RMSUBFLOW); - err = 0; - } else { + if (!ssk) { err = -ESRCH; + goto release_sock; } + + spin_lock_bh(&msk->pm.lock); + mptcp_userspace_pm_delete_local_addr(msk, &addr_l); + spin_unlock_bh(&msk->pm.lock); + mptcp_subflow_shutdown(sk, ssk, RCV_SHUTDOWN | SEND_SHUTDOWN); + mptcp_close_ssk(sk, ssk, mptcp_subflow_ctx(ssk)); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RMSUBFLOW); +release_sock: release_sock(sk); destroy_err: From 9590d32e090ea2751e131ae5273859ca22f5ac14 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Thu, 12 Dec 2024 13:31:55 -0800 Subject: [PATCH 0532/1450] ionic: Fix netdev notifier unregister on failure If register_netdev() fails, then the driver leaks the netdev notifier. Fix this by calling ionic_lif_unregister() on register_netdev() failure. This will also call ionic_lif_unregister_phc() if it has already been registered. Fixes: 30b87ab4c0b3 ("ionic: remove lif list concept") Signed-off-by: Brett Creeley Signed-off-by: Shannon Nelson Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20241212213157.12212-2-shannon.nelson@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/pensando/ionic/ionic_lif.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index 40496587b2b31..3d3f936779f7d 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -3869,8 +3869,8 @@ int ionic_lif_register(struct ionic_lif *lif) /* only register LIF0 for now */ err = register_netdev(lif->netdev); if (err) { - dev_err(lif->ionic->dev, "Cannot register net device, aborting\n"); - ionic_lif_unregister_phc(lif); + dev_err(lif->ionic->dev, "Cannot register net device: %d, aborting\n", err); + ionic_lif_unregister(lif); return err; } From 746e6ae2e202b062b9deee7bd86d94937997ecd7 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Thu, 12 Dec 2024 13:31:56 -0800 Subject: [PATCH 0533/1450] ionic: no double destroy workqueue There are some FW error handling paths that can cause us to try to destroy the workqueue more than once, so let's be sure we're checking for that. The case where this popped up was in an AER event where the handlers got called in such a way that ionic_reset_prepare() and thus ionic_dev_teardown() got called twice in a row. The second time through the workqueue was already destroyed, and destroy_workqueue() choked on the bad wq pointer. We didn't hit this in AER handler testing before because at that time we weren't using a private workqueue. Later we replaced the use of the system workqueue with our own private workqueue but hadn't rerun the AER handler testing since then. Fixes: 9e25450da700 ("ionic: add private workqueue per-device") Signed-off-by: Shannon Nelson Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20241212213157.12212-3-shannon.nelson@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/pensando/ionic/ionic_dev.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_dev.c b/drivers/net/ethernet/pensando/ionic/ionic_dev.c index 9e42d599840de..57edcde9e6f8c 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_dev.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_dev.c @@ -277,7 +277,10 @@ void ionic_dev_teardown(struct ionic *ionic) idev->phy_cmb_pages = 0; idev->cmb_npages = 0; - destroy_workqueue(ionic->wq); + if (ionic->wq) { + destroy_workqueue(ionic->wq); + ionic->wq = NULL; + } mutex_destroy(&idev->cmb_inuse_lock); } From b096d62ba1323391b2db98b7704e2468cf3b1588 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Thu, 12 Dec 2024 13:31:57 -0800 Subject: [PATCH 0534/1450] ionic: use ee->offset when returning sprom data Some calls into ionic_get_module_eeprom() don't use a single full buffer size, but instead multiple calls with an offset. Teach our driver to use the offset correctly so we can respond appropriately to the caller. Fixes: 4d03e00a2140 ("ionic: Add initial ethtool support") Signed-off-by: Shannon Nelson Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20241212213157.12212-4-shannon.nelson@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/pensando/ionic/ionic_ethtool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c index dda22fa4448cf..9b7f78b6cdb1e 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c @@ -961,8 +961,8 @@ static int ionic_get_module_eeprom(struct net_device *netdev, len = min_t(u32, sizeof(xcvr->sprom), ee->len); do { - memcpy(data, xcvr->sprom, len); - memcpy(tbuf, xcvr->sprom, len); + memcpy(data, &xcvr->sprom[ee->offset], len); + memcpy(tbuf, &xcvr->sprom[ee->offset], len); /* Let's make sure we got a consistent copy */ if (!memcmp(data, tbuf, len)) From 282da38b465395c930687974627c24f47ddce5ff Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Tue, 10 Dec 2024 12:35:34 +0100 Subject: [PATCH 0535/1450] s390/mm: Consider KMSAN modules metadata for paging levels The calculation determining whether to use three- or four-level paging didn't account for KMSAN modules metadata. Include this metadata in the virtual memory size calculation to ensure correct paging mode selection and avoiding potentially unnecessary physical memory size limitations. Fixes: 65ca73f9fb36 ("s390/mm: define KMSAN metadata for vmalloc and modules") Acked-by: Heiko Carstens Reviewed-by: Alexander Gordeev Reviewed-by: Ilya Leoshkevich Signed-off-by: Vasily Gorbik Signed-off-by: Alexander Gordeev --- arch/s390/boot/startup.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index abe6e6c0ab986..6087d38c72351 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -234,6 +234,8 @@ static unsigned long get_vmem_size(unsigned long identity_size, vsize = round_up(SZ_2G + max_mappable, rte_size) + round_up(vmemmap_size, rte_size) + FIXMAP_SIZE + MODULES_LEN + KASLR_LEN; + if (IS_ENABLED(CONFIG_KMSAN)) + vsize += MODULES_LEN * 2; return size_add(vsize, vmalloc_size); } From 922b4b955a03d19fea98938f33ef0e62d01f5159 Mon Sep 17 00:00:00 2001 From: Nikita Yushchenko Date: Thu, 12 Dec 2024 11:25:58 +0500 Subject: [PATCH 0536/1450] net: renesas: rswitch: rework ts tags management The existing linked list based implementation of how ts tags are assigned and managed is unsafe against concurrency and corner cases: - element addition in tx processing can race against element removal in ts queue completion, - element removal in ts queue completion can race against element removal in device close, - if a large number of frames gets added to tx queue without ts queue completions in between, elements with duplicate tag values can get added. Use a different implementation, based on per-port used tags bitmaps and saved skb arrays. Safety for addition in tx processing vs removal in ts completion is provided by: tag = find_first_zero_bit(...); smp_mb(); ts_skb[tag]> set_bit(...); vs ts_skb[tag]> smp_mb(); clear_bit(...); Safety for removal in ts completion vs removal in device close is provided by using atomic read-and-clear for rdev->ts_skb[tag]: ts_skb = xchg(&rdev->ts_skb[tag], NULL); if (ts_skb) Fixes: 33f5d733b589 ("net: renesas: rswitch: Improve TX timestamp accuracy") Signed-off-by: Nikita Yushchenko Link: https://patch.msgid.link/20241212062558.436455-1-nikita.yoush@cogentembedded.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/renesas/rswitch.c | 74 ++++++++++++++------------ drivers/net/ethernet/renesas/rswitch.h | 13 ++--- 2 files changed, 42 insertions(+), 45 deletions(-) diff --git a/drivers/net/ethernet/renesas/rswitch.c b/drivers/net/ethernet/renesas/rswitch.c index dbbbf024e7abc..9ac6e2aad18f6 100644 --- a/drivers/net/ethernet/renesas/rswitch.c +++ b/drivers/net/ethernet/renesas/rswitch.c @@ -547,7 +547,6 @@ static int rswitch_gwca_ts_queue_alloc(struct rswitch_private *priv) desc = &gq->ts_ring[gq->ring_size]; desc->desc.die_dt = DT_LINKFIX; rswitch_desc_set_dptr(&desc->desc, gq->ring_dma); - INIT_LIST_HEAD(&priv->gwca.ts_info_list); return 0; } @@ -1003,9 +1002,10 @@ static int rswitch_gwca_request_irqs(struct rswitch_private *priv) static void rswitch_ts(struct rswitch_private *priv) { struct rswitch_gwca_queue *gq = &priv->gwca.ts_queue; - struct rswitch_gwca_ts_info *ts_info, *ts_info2; struct skb_shared_hwtstamps shhwtstamps; struct rswitch_ts_desc *desc; + struct rswitch_device *rdev; + struct sk_buff *ts_skb; struct timespec64 ts; unsigned int num; u32 tag, port; @@ -1015,23 +1015,28 @@ static void rswitch_ts(struct rswitch_private *priv) dma_rmb(); port = TS_DESC_DPN(__le32_to_cpu(desc->desc.dptrl)); - tag = TS_DESC_TSUN(__le32_to_cpu(desc->desc.dptrl)); - - list_for_each_entry_safe(ts_info, ts_info2, &priv->gwca.ts_info_list, list) { - if (!(ts_info->port == port && ts_info->tag == tag)) - continue; - - memset(&shhwtstamps, 0, sizeof(shhwtstamps)); - ts.tv_sec = __le32_to_cpu(desc->ts_sec); - ts.tv_nsec = __le32_to_cpu(desc->ts_nsec & cpu_to_le32(0x3fffffff)); - shhwtstamps.hwtstamp = timespec64_to_ktime(ts); - skb_tstamp_tx(ts_info->skb, &shhwtstamps); - dev_consume_skb_irq(ts_info->skb); - list_del(&ts_info->list); - kfree(ts_info); - break; - } + if (unlikely(port >= RSWITCH_NUM_PORTS)) + goto next; + rdev = priv->rdev[port]; + tag = TS_DESC_TSUN(__le32_to_cpu(desc->desc.dptrl)); + if (unlikely(tag >= TS_TAGS_PER_PORT)) + goto next; + ts_skb = xchg(&rdev->ts_skb[tag], NULL); + smp_mb(); /* order rdev->ts_skb[] read before bitmap update */ + clear_bit(tag, rdev->ts_skb_used); + + if (unlikely(!ts_skb)) + goto next; + + memset(&shhwtstamps, 0, sizeof(shhwtstamps)); + ts.tv_sec = __le32_to_cpu(desc->ts_sec); + ts.tv_nsec = __le32_to_cpu(desc->ts_nsec & cpu_to_le32(0x3fffffff)); + shhwtstamps.hwtstamp = timespec64_to_ktime(ts); + skb_tstamp_tx(ts_skb, &shhwtstamps); + dev_consume_skb_irq(ts_skb); + +next: gq->cur = rswitch_next_queue_index(gq, true, 1); desc = &gq->ts_ring[gq->cur]; } @@ -1576,8 +1581,9 @@ static int rswitch_open(struct net_device *ndev) static int rswitch_stop(struct net_device *ndev) { struct rswitch_device *rdev = netdev_priv(ndev); - struct rswitch_gwca_ts_info *ts_info, *ts_info2; + struct sk_buff *ts_skb; unsigned long flags; + unsigned int tag; netif_tx_stop_all_queues(ndev); @@ -1594,12 +1600,13 @@ static int rswitch_stop(struct net_device *ndev) if (bitmap_empty(rdev->priv->opened_ports, RSWITCH_NUM_PORTS)) iowrite32(GWCA_TS_IRQ_BIT, rdev->priv->addr + GWTSDID); - list_for_each_entry_safe(ts_info, ts_info2, &rdev->priv->gwca.ts_info_list, list) { - if (ts_info->port != rdev->port) - continue; - dev_kfree_skb_irq(ts_info->skb); - list_del(&ts_info->list); - kfree(ts_info); + for (tag = find_first_bit(rdev->ts_skb_used, TS_TAGS_PER_PORT); + tag < TS_TAGS_PER_PORT; + tag = find_next_bit(rdev->ts_skb_used, TS_TAGS_PER_PORT, tag + 1)) { + ts_skb = xchg(&rdev->ts_skb[tag], NULL); + clear_bit(tag, rdev->ts_skb_used); + if (ts_skb) + dev_kfree_skb(ts_skb); } return 0; @@ -1612,20 +1619,17 @@ static bool rswitch_ext_desc_set_info1(struct rswitch_device *rdev, desc->info1 = cpu_to_le64(INFO1_DV(BIT(rdev->etha->index)) | INFO1_IPV(GWCA_IPV_NUM) | INFO1_FMT); if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) { - struct rswitch_gwca_ts_info *ts_info; + unsigned int tag; - ts_info = kzalloc(sizeof(*ts_info), GFP_ATOMIC); - if (!ts_info) + tag = find_first_zero_bit(rdev->ts_skb_used, TS_TAGS_PER_PORT); + if (tag == TS_TAGS_PER_PORT) return false; + smp_mb(); /* order bitmap read before rdev->ts_skb[] write */ + rdev->ts_skb[tag] = skb_get(skb); + set_bit(tag, rdev->ts_skb_used); skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; - rdev->ts_tag++; - desc->info1 |= cpu_to_le64(INFO1_TSUN(rdev->ts_tag) | INFO1_TXC); - - ts_info->skb = skb_get(skb); - ts_info->port = rdev->port; - ts_info->tag = rdev->ts_tag; - list_add_tail(&ts_info->list, &rdev->priv->gwca.ts_info_list); + desc->info1 |= cpu_to_le64(INFO1_TSUN(tag) | INFO1_TXC); skb_tx_timestamp(skb); } diff --git a/drivers/net/ethernet/renesas/rswitch.h b/drivers/net/ethernet/renesas/rswitch.h index e020800dcc570..d8d4ed7d7f8b6 100644 --- a/drivers/net/ethernet/renesas/rswitch.h +++ b/drivers/net/ethernet/renesas/rswitch.h @@ -972,14 +972,6 @@ struct rswitch_gwca_queue { }; }; -struct rswitch_gwca_ts_info { - struct sk_buff *skb; - struct list_head list; - - int port; - u8 tag; -}; - #define RSWITCH_NUM_IRQ_REGS (RSWITCH_MAX_NUM_QUEUES / BITS_PER_TYPE(u32)) struct rswitch_gwca { unsigned int index; @@ -989,7 +981,6 @@ struct rswitch_gwca { struct rswitch_gwca_queue *queues; int num_queues; struct rswitch_gwca_queue ts_queue; - struct list_head ts_info_list; DECLARE_BITMAP(used, RSWITCH_MAX_NUM_QUEUES); u32 tx_irq_bits[RSWITCH_NUM_IRQ_REGS]; u32 rx_irq_bits[RSWITCH_NUM_IRQ_REGS]; @@ -997,6 +988,7 @@ struct rswitch_gwca { }; #define NUM_QUEUES_PER_NDEV 2 +#define TS_TAGS_PER_PORT 256 struct rswitch_device { struct rswitch_private *priv; struct net_device *ndev; @@ -1004,7 +996,8 @@ struct rswitch_device { void __iomem *addr; struct rswitch_gwca_queue *tx_queue; struct rswitch_gwca_queue *rx_queue; - u8 ts_tag; + struct sk_buff *ts_skb[TS_TAGS_PER_PORT]; + DECLARE_BITMAP(ts_skb_used, TS_TAGS_PER_PORT); bool disabled; int port; From 900f83cf376bdaf798b6f5dcb2eae0c822e908b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thi=C3=A9baud=20Weksteen?= Date: Thu, 5 Dec 2024 12:09:19 +1100 Subject: [PATCH 0537/1450] selinux: ignore unknown extended permissions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When evaluating extended permissions, ignore unknown permissions instead of calling BUG(). This commit ensures that future permissions can be added without interfering with older kernels. Cc: stable@vger.kernel.org Fixes: fa1aa143ac4a ("selinux: extended permissions for ioctls") Signed-off-by: Thiébaud Weksteen Signed-off-by: Paul Moore --- security/selinux/ss/services.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index 971c45d576ba1..3d5c563cfc4c8 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -979,7 +979,10 @@ void services_compute_xperms_decision(struct extended_perms_decision *xpermd, return; break; default: - BUG(); + pr_warn_once( + "SELinux: unknown extended permission (%u) will be ignored\n", + node->datum.u.xperms->specified); + return; } if (node->key.specified == AVTAB_XPERMS_ALLOWED) { @@ -998,7 +1001,8 @@ void services_compute_xperms_decision(struct extended_perms_decision *xpermd, &node->datum.u.xperms->perms, xpermd->dontaudit); } else { - BUG(); + pr_warn_once("SELinux: unknown specified key (%u)\n", + node->key.specified); } } From 83c47d9e0ce79b5d7c0b21b9f35402dbde0fa15c Mon Sep 17 00:00:00 2001 From: Marios Makassikis Date: Sat, 14 Dec 2024 12:16:45 +0900 Subject: [PATCH 0538/1450] ksmbd: count all requests in req_running counter This changes the semantics of req_running to count all in-flight requests on a given connection, rather than the number of elements in the conn->request list. The latter is used only in smb2_cancel, and the counter is not used Signed-off-by: Marios Makassikis Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/connection.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index c14dd72e1b301..be9656d524a4c 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -120,8 +120,8 @@ void ksmbd_conn_enqueue_request(struct ksmbd_work *work) if (conn->ops->get_cmd_val(work) != SMB2_CANCEL_HE) requests_queue = &conn->requests; + atomic_inc(&conn->req_running); if (requests_queue) { - atomic_inc(&conn->req_running); spin_lock(&conn->request_lock); list_add_tail(&work->request_entry, requests_queue); spin_unlock(&conn->request_lock); @@ -132,11 +132,12 @@ void ksmbd_conn_try_dequeue_request(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; + atomic_dec(&conn->req_running); + if (list_empty(&work->request_entry) && list_empty(&work->async_request_entry)) return; - atomic_dec(&conn->req_running); spin_lock(&conn->request_lock); list_del_init(&work->request_entry); spin_unlock(&conn->request_lock); From 43fb7bce8866e793275c4f9f25af6a37745f3416 Mon Sep 17 00:00:00 2001 From: Marios Makassikis Date: Sat, 14 Dec 2024 12:17:23 +0900 Subject: [PATCH 0539/1450] ksmbd: fix broken transfers when exceeding max simultaneous operations Since commit 0a77d947f599 ("ksmbd: check outstanding simultaneous SMB operations"), ksmbd enforces a maximum number of simultaneous operations for a connection. The problem is that reaching the limit causes ksmbd to close the socket, and the client has no indication that it should have slowed down. This behaviour can be reproduced by setting "smb2 max credits = 128" (or lower), and transferring a large file (25GB). smbclient fails as below: $ smbclient //192.168.1.254/testshare -U user%pass smb: \> put file.bin cli_push returned NT_STATUS_USER_SESSION_DELETED putting file file.bin as \file.bin smb2cli_req_compound_submit: Insufficient credits. 0 available, 1 needed NT_STATUS_INTERNAL_ERROR closing remote file \file.bin smb: \> smb2cli_req_compound_submit: Insufficient credits. 0 available, 1 needed Windows clients fail with 0x8007003b (with smaller files even). Fix this by delaying reading from the socket until there's room to allocate a request. This effectively applies backpressure on the client, so the transfer completes, albeit at a slower rate. Fixes: 0a77d947f599 ("ksmbd: check outstanding simultaneous SMB operations") Signed-off-by: Marios Makassikis Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/connection.c | 13 +++++++++++-- fs/smb/server/connection.h | 1 - fs/smb/server/server.c | 7 +------ fs/smb/server/server.h | 1 + fs/smb/server/transport_ipc.c | 5 ++++- 5 files changed, 17 insertions(+), 10 deletions(-) diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index be9656d524a4c..f8a40f65db6ae 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -70,7 +70,6 @@ struct ksmbd_conn *ksmbd_conn_alloc(void) atomic_set(&conn->req_running, 0); atomic_set(&conn->r_count, 0); atomic_set(&conn->refcnt, 1); - atomic_set(&conn->mux_smb_requests, 0); conn->total_credits = 1; conn->outstanding_credits = 0; @@ -133,6 +132,8 @@ void ksmbd_conn_try_dequeue_request(struct ksmbd_work *work) struct ksmbd_conn *conn = work->conn; atomic_dec(&conn->req_running); + if (waitqueue_active(&conn->req_running_q)) + wake_up(&conn->req_running_q); if (list_empty(&work->request_entry) && list_empty(&work->async_request_entry)) @@ -309,7 +310,7 @@ int ksmbd_conn_handler_loop(void *p) { struct ksmbd_conn *conn = (struct ksmbd_conn *)p; struct ksmbd_transport *t = conn->transport; - unsigned int pdu_size, max_allowed_pdu_size; + unsigned int pdu_size, max_allowed_pdu_size, max_req; char hdr_buf[4] = {0,}; int size; @@ -319,6 +320,7 @@ int ksmbd_conn_handler_loop(void *p) if (t->ops->prepare && t->ops->prepare(t)) goto out; + max_req = server_conf.max_inflight_req; conn->last_active = jiffies; set_freezable(); while (ksmbd_conn_alive(conn)) { @@ -328,6 +330,13 @@ int ksmbd_conn_handler_loop(void *p) kvfree(conn->request_buf); conn->request_buf = NULL; +recheck: + if (atomic_read(&conn->req_running) + 1 > max_req) { + wait_event_interruptible(conn->req_running_q, + atomic_read(&conn->req_running) < max_req); + goto recheck; + } + size = t->ops->read(t, hdr_buf, sizeof(hdr_buf), -1); if (size != sizeof(hdr_buf)) break; diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h index 8ddd5a3c7bafb..b379ae4fdcdff 100644 --- a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -107,7 +107,6 @@ struct ksmbd_conn { __le16 signing_algorithm; bool binding; atomic_t refcnt; - atomic_t mux_smb_requests; }; struct ksmbd_conn_ops { diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c index 3ba95bd8edeb2..601e7fcbcf1e6 100644 --- a/fs/smb/server/server.c +++ b/fs/smb/server/server.c @@ -270,7 +270,6 @@ static void handle_ksmbd_work(struct work_struct *wk) ksmbd_conn_try_dequeue_request(work); ksmbd_free_work_struct(work); - atomic_dec(&conn->mux_smb_requests); /* * Checking waitqueue to dropping pending requests on * disconnection. waitqueue_active is safe because it @@ -300,11 +299,6 @@ static int queue_ksmbd_work(struct ksmbd_conn *conn) if (err) return 0; - if (atomic_inc_return(&conn->mux_smb_requests) >= conn->vals->max_credits) { - atomic_dec_return(&conn->mux_smb_requests); - return -ENOSPC; - } - work = ksmbd_alloc_work_struct(); if (!work) { pr_err("allocation for work failed\n"); @@ -367,6 +361,7 @@ static int server_conf_init(void) server_conf.auth_mechs |= KSMBD_AUTH_KRB5 | KSMBD_AUTH_MSKRB5; #endif + server_conf.max_inflight_req = SMB2_MAX_CREDITS; return 0; } diff --git a/fs/smb/server/server.h b/fs/smb/server/server.h index 4fc529335271f..94187628ff089 100644 --- a/fs/smb/server/server.h +++ b/fs/smb/server/server.h @@ -42,6 +42,7 @@ struct ksmbd_server_config { struct smb_sid domain_sid; unsigned int auth_mechs; unsigned int max_connections; + unsigned int max_inflight_req; char *conf[SERVER_CONF_WORK_GROUP + 1]; struct task_struct *dh_task; diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c index 48cda3350e5a2..befaf42b84cc3 100644 --- a/fs/smb/server/transport_ipc.c +++ b/fs/smb/server/transport_ipc.c @@ -319,8 +319,11 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req) init_smb2_max_write_size(req->smb2_max_write); if (req->smb2_max_trans) init_smb2_max_trans_size(req->smb2_max_trans); - if (req->smb2_max_credits) + if (req->smb2_max_credits) { init_smb2_max_credits(req->smb2_max_credits); + server_conf.max_inflight_req = + req->smb2_max_credits; + } if (req->smbd_max_io_size) init_smbd_max_io_size(req->smbd_max_io_size); From fe4ed2f09b492e3507615a053814daa8fafdecb1 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 14 Dec 2024 12:19:03 +0900 Subject: [PATCH 0540/1450] ksmbd: conn lock to serialize smb2 negotiate If client send parallel smb2 negotiate request on same connection, ksmbd_conn can be racy. smb2 negotiate handling that are not performance-related can be serialized with conn lock. Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 803b35b89513e..23e21845f9286 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -1097,6 +1097,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work) return rc; } + ksmbd_conn_lock(conn); smb2_buf_len = get_rfc1002_len(work->request_buf); smb2_neg_size = offsetof(struct smb2_negotiate_req, Dialects); if (smb2_neg_size > smb2_buf_len) { @@ -1247,6 +1248,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work) ksmbd_conn_set_need_negotiate(conn); err_out: + ksmbd_conn_unlock(conn); if (rc) rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES; From aeb3ec99026979287266e4b5a1194789c1488c1a Mon Sep 17 00:00:00 2001 From: Rongwei Liu Date: Fri, 13 Dec 2024 00:13:20 +0200 Subject: [PATCH 0541/1450] net/mlx5: Add device cap abs_native_port_num When the abs_native_port_num is set, the native_port_num reported by the device may not be continuous and bigger than the num_lag_ports. Signed-off-by: Rongwei Liu Reviewed-by: Shay Drory Reviewed-by: Saeed Mahameed Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241212221329.961628-2-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 5451ff1d43560..43b3cb4bf8d1c 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1599,7 +1599,8 @@ enum { struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_0[0x6]; u8 page_request_disable[0x1]; - u8 reserved_at_7[0x9]; + u8 abs_native_port_num[0x1]; + u8 reserved_at_8[0x8]; u8 shared_object_to_user_object_allowed[0x1]; u8 reserved_at_13[0xe]; u8 vhca_resource_manager[0x1]; From 24740385cb0d6d22ab7fa7adf36546d5b3cdcf73 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Fri, 15 Nov 2024 11:54:40 +0200 Subject: [PATCH 0542/1450] thunderbolt: Improve redrive mode handling When USB-C monitor is connected directly to Intel Barlow Ridge host, it goes into "redrive" mode that basically routes the DisplayPort signals directly from the GPU to the USB-C monitor without any tunneling needed. However, the host router must be powered on for this to work. Aaron reported that there are a couple of cases where this will not work with the current code: - Booting with USB-C monitor plugged in. - Plugging in USB-C monitor when the host router is in sleep state (runtime suspended). - Plugging in USB-C device while the system is in system sleep state. In all these cases once the host router is runtime suspended the picture on the connected USB-C display disappears too. This is certainly not what the user expected. For this reason improve the redrive mode handling to keep the host router from runtime suspending when detect that any of the above cases is happening. Fixes: a75e0684efe5 ("thunderbolt: Keep the domain powered when USB4 port is in redrive mode") Reported-by: Aaron Rainbolt Closes: https://lore.kernel.org/linux-usb/20241009220118.70bfedd0@kf-ir16/ Cc: stable@vger.kernel.org Signed-off-by: Mika Westerberg --- drivers/thunderbolt/tb.c | 41 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/drivers/thunderbolt/tb.c b/drivers/thunderbolt/tb.c index 4f777788e9179..a7c6919fbf978 100644 --- a/drivers/thunderbolt/tb.c +++ b/drivers/thunderbolt/tb.c @@ -2059,6 +2059,37 @@ static void tb_exit_redrive(struct tb_port *port) } } +static void tb_switch_enter_redrive(struct tb_switch *sw) +{ + struct tb_port *port; + + tb_switch_for_each_port(sw, port) + tb_enter_redrive(port); +} + +/* + * Called during system and runtime suspend to forcefully exit redrive + * mode without querying whether the resource is available. + */ +static void tb_switch_exit_redrive(struct tb_switch *sw) +{ + struct tb_port *port; + + if (!(sw->quirks & QUIRK_KEEP_POWER_IN_DP_REDRIVE)) + return; + + tb_switch_for_each_port(sw, port) { + if (!tb_port_is_dpin(port)) + continue; + + if (port->redrive) { + port->redrive = false; + pm_runtime_put(&sw->dev); + tb_port_dbg(port, "exit redrive mode\n"); + } + } +} + static void tb_dp_resource_unavailable(struct tb *tb, struct tb_port *port) { struct tb_port *in, *out; @@ -2909,6 +2940,7 @@ static int tb_start(struct tb *tb, bool reset) tb_create_usb3_tunnels(tb->root_switch); /* Add DP IN resources for the root switch */ tb_add_dp_resources(tb->root_switch); + tb_switch_enter_redrive(tb->root_switch); /* Make the discovered switches available to the userspace */ device_for_each_child(&tb->root_switch->dev, NULL, tb_scan_finalize_switch); @@ -2924,6 +2956,7 @@ static int tb_suspend_noirq(struct tb *tb) tb_dbg(tb, "suspending...\n"); tb_disconnect_and_release_dp(tb); + tb_switch_exit_redrive(tb->root_switch); tb_switch_suspend(tb->root_switch, false); tcm->hotplug_active = false; /* signal tb_handle_hotplug to quit */ tb_dbg(tb, "suspend finished\n"); @@ -3016,6 +3049,7 @@ static int tb_resume_noirq(struct tb *tb) tb_dbg(tb, "tunnels restarted, sleeping for 100ms\n"); msleep(100); } + tb_switch_enter_redrive(tb->root_switch); /* Allow tb_handle_hotplug to progress events */ tcm->hotplug_active = true; tb_dbg(tb, "resume finished\n"); @@ -3079,6 +3113,12 @@ static int tb_runtime_suspend(struct tb *tb) struct tb_cm *tcm = tb_priv(tb); mutex_lock(&tb->lock); + /* + * The below call only releases DP resources to allow exiting and + * re-entering redrive mode. + */ + tb_disconnect_and_release_dp(tb); + tb_switch_exit_redrive(tb->root_switch); tb_switch_suspend(tb->root_switch, true); tcm->hotplug_active = false; mutex_unlock(&tb->lock); @@ -3110,6 +3150,7 @@ static int tb_runtime_resume(struct tb *tb) tb_restore_children(tb->root_switch); list_for_each_entry_safe(tunnel, n, &tcm->tunnel_list, list) tb_tunnel_restart(tunnel); + tb_switch_enter_redrive(tb->root_switch); tcm->hotplug_active = true; mutex_unlock(&tb->lock); From a60b990798eb17433d0283788280422b1bd94b18 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 14 Dec 2024 12:50:18 +0100 Subject: [PATCH 0543/1450] PCI/MSI: Handle lack of irqdomain gracefully Alexandre observed a warning emitted from pci_msi_setup_msi_irqs() on a RISCV platform which does not provide PCI/MSI support: WARNING: CPU: 1 PID: 1 at drivers/pci/msi/msi.h:121 pci_msi_setup_msi_irqs+0x2c/0x32 __pci_enable_msix_range+0x30c/0x596 pci_msi_setup_msi_irqs+0x2c/0x32 pci_alloc_irq_vectors_affinity+0xb8/0xe2 RISCV uses hierarchical interrupt domains and correctly does not implement the legacy fallback. The warning triggers from the legacy fallback stub. That warning is bogus as the PCI/MSI layer knows whether a PCI/MSI parent domain is associated with the device or not. There is a check for MSI-X, which has a legacy assumption. But that legacy fallback assumption is only valid when legacy support is enabled, but otherwise the check should simply return -ENOTSUPP. Loongarch tripped over the same problem and blindly enabled legacy support without implementing the legacy fallbacks. There are weak implementations which return an error, so the problem was papered over. Correct pci_msi_domain_supports() to evaluate the legacy mode and add the missing supported check into the MSI enable path to complete it. Fixes: d2a463b29741 ("PCI/MSI: Reject multi-MSI early") Reported-by: Alexandre Ghiti Signed-off-by: Thomas Gleixner Tested-by: Alexandre Ghiti Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/87ed2a8ow5.ffs@tglx --- drivers/pci/msi/irqdomain.c | 7 +++++-- drivers/pci/msi/msi.c | 4 ++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/pci/msi/irqdomain.c b/drivers/pci/msi/irqdomain.c index 569125726b3e1..d7ba8795d60f8 100644 --- a/drivers/pci/msi/irqdomain.c +++ b/drivers/pci/msi/irqdomain.c @@ -350,8 +350,11 @@ bool pci_msi_domain_supports(struct pci_dev *pdev, unsigned int feature_mask, domain = dev_get_msi_domain(&pdev->dev); - if (!domain || !irq_domain_is_hierarchy(domain)) - return mode == ALLOW_LEGACY; + if (!domain || !irq_domain_is_hierarchy(domain)) { + if (IS_ENABLED(CONFIG_PCI_MSI_ARCH_FALLBACKS)) + return mode == ALLOW_LEGACY; + return false; + } if (!irq_domain_is_msi_parent(domain)) { /* diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c index 3a45879d85db9..2f647cac4cae3 100644 --- a/drivers/pci/msi/msi.c +++ b/drivers/pci/msi/msi.c @@ -433,6 +433,10 @@ int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec, if (WARN_ON_ONCE(dev->msi_enabled)) return -EINVAL; + /* Test for the availability of MSI support */ + if (!pci_msi_domain_supports(dev, 0, ALLOW_LEGACY)) + return -ENOTSUPP; + nvec = pci_msi_vec_count(dev); if (nvec < 0) return nvec; From 88438444fdddd0244c8b2697713adcca3e71599e Mon Sep 17 00:00:00 2001 From: Venkata Prasad Potturu Date: Fri, 13 Dec 2024 11:41:46 +0530 Subject: [PATCH 0544/1450] ASoC: amd: ps: Fix for enabling DMIC on acp63 platform via _DSD entry Add condition check to register ACP PDM sound card by reading _WOV acpi entry. Fixes: 0386d765f27a ("ASoC: amd: ps: refactor acp device configuration read logic") Signed-off-by: Venkata Prasad Potturu Link: https://patch.msgid.link/20241213061147.1060451-1-venkataprasad.potturu@amd.com Signed-off-by: Mark Brown --- sound/soc/amd/ps/pci-ps.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/sound/soc/amd/ps/pci-ps.c b/sound/soc/amd/ps/pci-ps.c index 823a69bf778b9..4575326d06352 100644 --- a/sound/soc/amd/ps/pci-ps.c +++ b/sound/soc/amd/ps/pci-ps.c @@ -375,11 +375,18 @@ static int get_acp63_device_config(struct pci_dev *pci, struct acp63_dev_data *a { struct acpi_device *pdm_dev; const union acpi_object *obj; + acpi_handle handle; + acpi_integer dmic_status; u32 config; bool is_dmic_dev = false; bool is_sdw_dev = false; + bool wov_en, dmic_en; int ret; + /* IF WOV entry not found, enable dmic based on acp-audio-device-type entry*/ + wov_en = true; + dmic_en = false; + config = readl(acp_data->acp63_base + ACP_PIN_CONFIG); switch (config) { case ACP_CONFIG_4: @@ -412,10 +419,18 @@ static int get_acp63_device_config(struct pci_dev *pci, struct acp63_dev_data *a if (!acpi_dev_get_property(pdm_dev, "acp-audio-device-type", ACPI_TYPE_INTEGER, &obj) && obj->integer.value == ACP_DMIC_DEV) - is_dmic_dev = true; + dmic_en = true; } + + handle = ACPI_HANDLE(&pci->dev); + ret = acpi_evaluate_integer(handle, "_WOV", NULL, &dmic_status); + if (!ACPI_FAILURE(ret)) + wov_en = dmic_status; } + if (dmic_en && wov_en) + is_dmic_dev = true; + if (acp_data->is_sdw_config) { ret = acp_scan_sdw_devices(&pci->dev, ACP63_SDW_ADDR); if (!ret && acp_data->info.link_mask) From 0471b1093e3a5d702ba2bf5987c35ee0e2336855 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 12 Dec 2024 16:36:04 +0100 Subject: [PATCH 0545/1450] tls: block decryption when a rekey is pending When a TLS handshake record carrying a KeyUpdate message is received, all subsequent records will be encrypted with a new key. We need to stop decrypting incoming records with the old key, and wait until userspace provides a new key. Make a note of this in the RX context just after decrypting that record, and stop recvmsg/splice calls with EKEYEXPIRED until the new key is available. key_update_pending can't be combined with the existing bitfield, because we will read it locklessly in ->poll. Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- include/net/tls.h | 3 +++ net/tls/tls_sw.c | 35 ++++++++++++++++++++++++++++++++++- 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/include/net/tls.h b/include/net/tls.h index 61fef28801140..857340338b694 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -59,6 +59,8 @@ struct tls_rec; #define TLS_CRYPTO_INFO_READY(info) ((info)->cipher_type) +#define TLS_HANDSHAKE_KEYUPDATE 24 /* rfc8446 B.3: Key update */ + #define TLS_AAD_SPACE_SIZE 13 #define TLS_MAX_IV_SIZE 16 @@ -130,6 +132,7 @@ struct tls_sw_context_rx { u8 async_capable:1; u8 zc_capable:1; u8 reader_contended:1; + bool key_update_pending; struct tls_strparser strp; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index bbf26cc4f6ee2..3dcf8ee60fea2 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1314,6 +1314,10 @@ tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock, int ret = 0; long timeo; + /* a rekey is pending, let userspace deal with it */ + if (unlikely(ctx->key_update_pending)) + return -EKEYEXPIRED; + timeo = sock_rcvtimeo(sk, nonblock); while (!tls_strp_msg_ready(ctx)) { @@ -1720,6 +1724,34 @@ tls_decrypt_device(struct sock *sk, struct msghdr *msg, return 1; } +static int tls_check_pending_rekey(struct tls_context *ctx, struct sk_buff *skb) +{ + const struct strp_msg *rxm = strp_msg(skb); + const struct tls_msg *tlm = tls_msg(skb); + char hs_type; + int err; + + if (likely(tlm->control != TLS_RECORD_TYPE_HANDSHAKE)) + return 0; + + if (rxm->full_len < 1) + return 0; + + err = skb_copy_bits(skb, rxm->offset, &hs_type, 1); + if (err < 0) { + DEBUG_NET_WARN_ON_ONCE(1); + return err; + } + + if (hs_type == TLS_HANDSHAKE_KEYUPDATE) { + struct tls_sw_context_rx *rx_ctx = ctx->priv_ctx_rx; + + WRITE_ONCE(rx_ctx->key_update_pending, true); + } + + return 0; +} + static int tls_rx_one_record(struct sock *sk, struct msghdr *msg, struct tls_decrypt_arg *darg) { @@ -1739,7 +1771,7 @@ static int tls_rx_one_record(struct sock *sk, struct msghdr *msg, rxm->full_len -= prot->overhead_size; tls_advance_record_sn(sk, prot, &tls_ctx->rx); - return 0; + return tls_check_pending_rekey(tls_ctx, darg->skb); } int decrypt_skb(struct sock *sk, struct scatterlist *sgout) @@ -2719,6 +2751,7 @@ int tls_set_sw_offload(struct sock *sk, int tx) crypto_info = &ctx->crypto_recv.info; cctx = &ctx->rx; aead = &sw_ctx_rx->aead_recv; + sw_ctx_rx->key_update_pending = false; } cipher_desc = get_cipher_desc(crypto_info->cipher_type); From 47069594e67e882ec5c1d8d374f6aab037511509 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 12 Dec 2024 16:36:05 +0100 Subject: [PATCH 0546/1450] tls: implement rekey for TLS1.3 This adds the possibility to change the key and IV when using TLS1.3. Changing the cipher or TLS version is not supported. Once we have updated the RX key, we can unblock the receive side. If the rekey fails, the context is unmodified and userspace is free to retry the update or close the socket. This change only affects tls_sw, since 1.3 offload isn't supported. Signed-off-by: Sabrina Dubroca Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/tls/tls.h | 3 +- net/tls/tls_device.c | 2 +- net/tls/tls_main.c | 46 ++++++++++++++----- net/tls/tls_sw.c | 105 +++++++++++++++++++++++++++++-------------- 4 files changed, 108 insertions(+), 48 deletions(-) diff --git a/net/tls/tls.h b/net/tls/tls.h index e5e47452308ab..774859b63f0de 100644 --- a/net/tls/tls.h +++ b/net/tls/tls.h @@ -145,7 +145,8 @@ void tls_err_abort(struct sock *sk, int err); int init_prot_info(struct tls_prot_info *prot, const struct tls_crypto_info *crypto_info, const struct tls_cipher_desc *cipher_desc); -int tls_set_sw_offload(struct sock *sk, int tx); +int tls_set_sw_offload(struct sock *sk, int tx, + struct tls_crypto_info *new_crypto_info); void tls_update_rx_zc_capable(struct tls_context *tls_ctx); void tls_sw_strparser_arm(struct sock *sk, struct tls_context *ctx); void tls_sw_strparser_done(struct tls_context *tls_ctx); diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index dc063c2c7950e..e50b6e71df13b 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -1227,7 +1227,7 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) context->resync_nh_reset = 1; ctx->priv_ctx_rx = context; - rc = tls_set_sw_offload(sk, 0); + rc = tls_set_sw_offload(sk, 0, NULL); if (rc) goto release_ctx; diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 6b4b9f2749a6f..68b5735dafc19 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -423,9 +423,10 @@ static __poll_t tls_sk_poll(struct file *file, struct socket *sock, ctx = tls_sw_ctx_rx(tls_ctx); psock = sk_psock_get(sk); - if (skb_queue_empty_lockless(&ctx->rx_list) && - !tls_strp_msg_ready(ctx) && - sk_psock_queue_empty(psock)) + if ((skb_queue_empty_lockless(&ctx->rx_list) && + !tls_strp_msg_ready(ctx) && + sk_psock_queue_empty(psock)) || + READ_ONCE(ctx->key_update_pending)) mask &= ~(EPOLLIN | EPOLLRDNORM); if (psock) @@ -612,11 +613,13 @@ static int validate_crypto_info(const struct tls_crypto_info *crypto_info, static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval, unsigned int optlen, int tx) { - struct tls_crypto_info *crypto_info; - struct tls_crypto_info *alt_crypto_info; + struct tls_crypto_info *crypto_info, *alt_crypto_info; + struct tls_crypto_info *old_crypto_info = NULL; struct tls_context *ctx = tls_get_ctx(sk); const struct tls_cipher_desc *cipher_desc; union tls_crypto_context *crypto_ctx; + union tls_crypto_context tmp = {}; + bool update = false; int rc = 0; int conf; @@ -633,9 +636,18 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval, crypto_info = &crypto_ctx->info; - /* Currently we don't support set crypto info more than one time */ - if (TLS_CRYPTO_INFO_READY(crypto_info)) - return -EBUSY; + if (TLS_CRYPTO_INFO_READY(crypto_info)) { + /* Currently we only support setting crypto info more + * than one time for TLS 1.3 + */ + if (crypto_info->version != TLS_1_3_VERSION) + return -EBUSY; + + update = true; + old_crypto_info = crypto_info; + crypto_info = &tmp.info; + crypto_ctx = &tmp; + } rc = copy_from_sockptr(crypto_info, optval, sizeof(*crypto_info)); if (rc) { @@ -643,7 +655,14 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval, goto err_crypto_info; } - rc = validate_crypto_info(crypto_info, alt_crypto_info); + if (update) { + /* Ensure that TLS version and ciphers are not modified */ + if (crypto_info->version != old_crypto_info->version || + crypto_info->cipher_type != old_crypto_info->cipher_type) + rc = -EINVAL; + } else { + rc = validate_crypto_info(crypto_info, alt_crypto_info); + } if (rc) goto err_crypto_info; @@ -673,7 +692,8 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval, TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSTXDEVICE); TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXDEVICE); } else { - rc = tls_set_sw_offload(sk, 1); + rc = tls_set_sw_offload(sk, 1, + update ? crypto_info : NULL); if (rc) goto err_crypto_info; TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSTXSW); @@ -687,14 +707,16 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval, TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXDEVICE); TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXDEVICE); } else { - rc = tls_set_sw_offload(sk, 0); + rc = tls_set_sw_offload(sk, 0, + update ? crypto_info : NULL); if (rc) goto err_crypto_info; TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXSW); TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXSW); conf = TLS_SW; } - tls_sw_strparser_arm(sk, ctx); + if (!update) + tls_sw_strparser_arm(sk, ctx); } if (tx) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 3dcf8ee60fea2..9e5aff5bab98e 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2716,12 +2716,22 @@ int init_prot_info(struct tls_prot_info *prot, return 0; } -int tls_set_sw_offload(struct sock *sk, int tx) +static void tls_finish_key_update(struct sock *sk, struct tls_context *tls_ctx) { + struct tls_sw_context_rx *ctx = tls_ctx->priv_ctx_rx; + + WRITE_ONCE(ctx->key_update_pending, false); + /* wake-up pre-existing poll() */ + ctx->saved_data_ready(sk); +} + +int tls_set_sw_offload(struct sock *sk, int tx, + struct tls_crypto_info *new_crypto_info) +{ + struct tls_crypto_info *crypto_info, *src_crypto_info; struct tls_sw_context_tx *sw_ctx_tx = NULL; struct tls_sw_context_rx *sw_ctx_rx = NULL; const struct tls_cipher_desc *cipher_desc; - struct tls_crypto_info *crypto_info; char *iv, *rec_seq, *key, *salt; struct cipher_context *cctx; struct tls_prot_info *prot; @@ -2733,45 +2743,47 @@ int tls_set_sw_offload(struct sock *sk, int tx) ctx = tls_get_ctx(sk); prot = &ctx->prot_info; - if (tx) { - ctx->priv_ctx_tx = init_ctx_tx(ctx, sk); - if (!ctx->priv_ctx_tx) - return -ENOMEM; + /* new_crypto_info != NULL means rekey */ + if (!new_crypto_info) { + if (tx) { + ctx->priv_ctx_tx = init_ctx_tx(ctx, sk); + if (!ctx->priv_ctx_tx) + return -ENOMEM; + } else { + ctx->priv_ctx_rx = init_ctx_rx(ctx); + if (!ctx->priv_ctx_rx) + return -ENOMEM; + } + } + if (tx) { sw_ctx_tx = ctx->priv_ctx_tx; crypto_info = &ctx->crypto_send.info; cctx = &ctx->tx; aead = &sw_ctx_tx->aead_send; } else { - ctx->priv_ctx_rx = init_ctx_rx(ctx); - if (!ctx->priv_ctx_rx) - return -ENOMEM; - sw_ctx_rx = ctx->priv_ctx_rx; crypto_info = &ctx->crypto_recv.info; cctx = &ctx->rx; aead = &sw_ctx_rx->aead_recv; - sw_ctx_rx->key_update_pending = false; } - cipher_desc = get_cipher_desc(crypto_info->cipher_type); + src_crypto_info = new_crypto_info ?: crypto_info; + + cipher_desc = get_cipher_desc(src_crypto_info->cipher_type); if (!cipher_desc) { rc = -EINVAL; goto free_priv; } - rc = init_prot_info(prot, crypto_info, cipher_desc); + rc = init_prot_info(prot, src_crypto_info, cipher_desc); if (rc) goto free_priv; - iv = crypto_info_iv(crypto_info, cipher_desc); - key = crypto_info_key(crypto_info, cipher_desc); - salt = crypto_info_salt(crypto_info, cipher_desc); - rec_seq = crypto_info_rec_seq(crypto_info, cipher_desc); - - memcpy(cctx->iv, salt, cipher_desc->salt); - memcpy(cctx->iv + cipher_desc->salt, iv, cipher_desc->iv); - memcpy(cctx->rec_seq, rec_seq, cipher_desc->rec_seq); + iv = crypto_info_iv(src_crypto_info, cipher_desc); + key = crypto_info_key(src_crypto_info, cipher_desc); + salt = crypto_info_salt(src_crypto_info, cipher_desc); + rec_seq = crypto_info_rec_seq(src_crypto_info, cipher_desc); if (!*aead) { *aead = crypto_alloc_aead(cipher_desc->cipher_name, 0, 0); @@ -2784,20 +2796,30 @@ int tls_set_sw_offload(struct sock *sk, int tx) ctx->push_pending_record = tls_sw_push_pending_record; + /* setkey is the last operation that could fail during a + * rekey. if it succeeds, we can start modifying the + * context. + */ rc = crypto_aead_setkey(*aead, key, cipher_desc->key); - if (rc) - goto free_aead; + if (rc) { + if (new_crypto_info) + goto out; + else + goto free_aead; + } - rc = crypto_aead_setauthsize(*aead, prot->tag_size); - if (rc) - goto free_aead; + if (!new_crypto_info) { + rc = crypto_aead_setauthsize(*aead, prot->tag_size); + if (rc) + goto free_aead; + } - if (sw_ctx_rx) { + if (!tx && !new_crypto_info) { tfm = crypto_aead_tfm(sw_ctx_rx->aead_recv); tls_update_rx_zc_capable(ctx); sw_ctx_rx->async_capable = - crypto_info->version != TLS_1_3_VERSION && + src_crypto_info->version != TLS_1_3_VERSION && !!(tfm->__crt_alg->cra_flags & CRYPTO_ALG_ASYNC); rc = tls_strp_init(&sw_ctx_rx->strp, sk); @@ -2805,18 +2827,33 @@ int tls_set_sw_offload(struct sock *sk, int tx) goto free_aead; } + memcpy(cctx->iv, salt, cipher_desc->salt); + memcpy(cctx->iv + cipher_desc->salt, iv, cipher_desc->iv); + memcpy(cctx->rec_seq, rec_seq, cipher_desc->rec_seq); + + if (new_crypto_info) { + unsafe_memcpy(crypto_info, new_crypto_info, + cipher_desc->crypto_info, + /* size was checked in do_tls_setsockopt_conf */); + memzero_explicit(new_crypto_info, cipher_desc->crypto_info); + if (!tx) + tls_finish_key_update(sk, ctx); + } + goto out; free_aead: crypto_free_aead(*aead); *aead = NULL; free_priv: - if (tx) { - kfree(ctx->priv_ctx_tx); - ctx->priv_ctx_tx = NULL; - } else { - kfree(ctx->priv_ctx_rx); - ctx->priv_ctx_rx = NULL; + if (!new_crypto_info) { + if (tx) { + kfree(ctx->priv_ctx_tx); + ctx->priv_ctx_tx = NULL; + } else { + kfree(ctx->priv_ctx_rx); + ctx->priv_ctx_rx = NULL; + } } out: return rc; From 510128b30f2db1600172e9aaec44f66db3c16e15 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 12 Dec 2024 16:36:06 +0100 Subject: [PATCH 0547/1450] tls: add counters for rekey This introduces 5 counters to keep track of key updates: Tls{Rx,Tx}Rekey{Ok,Error} and TlsRxRekeyReceived. Suggested-by: Jakub Kicinski Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 5 +++++ net/tls/tls_main.c | 27 ++++++++++++++++++++++----- net/tls/tls_proc.c | 5 +++++ net/tls/tls_sw.c | 6 ++++-- 4 files changed, 36 insertions(+), 7 deletions(-) diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index adf5fd78dd503..51da2e00112de 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -358,6 +358,11 @@ enum LINUX_MIB_TLSRXDEVICERESYNC, /* TlsRxDeviceResync */ LINUX_MIB_TLSDECRYPTRETRY, /* TlsDecryptRetry */ LINUX_MIB_TLSRXNOPADVIOL, /* TlsRxNoPadViolation */ + LINUX_MIB_TLSRXREKEYOK, /* TlsRxRekeyOk */ + LINUX_MIB_TLSRXREKEYERROR, /* TlsRxRekeyError */ + LINUX_MIB_TLSTXREKEYOK, /* TlsTxRekeyOk */ + LINUX_MIB_TLSTXREKEYERROR, /* TlsTxRekeyError */ + LINUX_MIB_TLSRXREKEYRECEIVED, /* TlsRxRekeyReceived */ __LINUX_MIB_TLSMAX }; diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 68b5735dafc19..9ee5a83c5b400 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -640,8 +640,11 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval, /* Currently we only support setting crypto info more * than one time for TLS 1.3 */ - if (crypto_info->version != TLS_1_3_VERSION) + if (crypto_info->version != TLS_1_3_VERSION) { + TLS_INC_STATS(sock_net(sk), tx ? LINUX_MIB_TLSTXREKEYERROR + : LINUX_MIB_TLSRXREKEYERROR); return -EBUSY; + } update = true; old_crypto_info = crypto_info; @@ -696,8 +699,13 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval, update ? crypto_info : NULL); if (rc) goto err_crypto_info; - TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSTXSW); - TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXSW); + + if (update) { + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSTXREKEYOK); + } else { + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSTXSW); + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXSW); + } conf = TLS_SW; } } else { @@ -711,8 +719,13 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval, update ? crypto_info : NULL); if (rc) goto err_crypto_info; - TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXSW); - TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXSW); + + if (update) { + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXREKEYOK); + } else { + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXSW); + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXSW); + } conf = TLS_SW; } if (!update) @@ -735,6 +748,10 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval, return 0; err_crypto_info: + if (update) { + TLS_INC_STATS(sock_net(sk), tx ? LINUX_MIB_TLSTXREKEYERROR + : LINUX_MIB_TLSRXREKEYERROR); + } memzero_explicit(crypto_ctx, sizeof(*crypto_ctx)); return rc; } diff --git a/net/tls/tls_proc.c b/net/tls/tls_proc.c index 68982728f6209..367666aa07b88 100644 --- a/net/tls/tls_proc.c +++ b/net/tls/tls_proc.c @@ -22,6 +22,11 @@ static const struct snmp_mib tls_mib_list[] = { SNMP_MIB_ITEM("TlsRxDeviceResync", LINUX_MIB_TLSRXDEVICERESYNC), SNMP_MIB_ITEM("TlsDecryptRetry", LINUX_MIB_TLSDECRYPTRETRY), SNMP_MIB_ITEM("TlsRxNoPadViolation", LINUX_MIB_TLSRXNOPADVIOL), + SNMP_MIB_ITEM("TlsRxRekeyOk", LINUX_MIB_TLSRXREKEYOK), + SNMP_MIB_ITEM("TlsRxRekeyError", LINUX_MIB_TLSRXREKEYERROR), + SNMP_MIB_ITEM("TlsTxRekeyOk", LINUX_MIB_TLSTXREKEYOK), + SNMP_MIB_ITEM("TlsTxRekeyError", LINUX_MIB_TLSTXREKEYERROR), + SNMP_MIB_ITEM("TlsRxRekeyReceived", LINUX_MIB_TLSRXREKEYRECEIVED), SNMP_MIB_SENTINEL }; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 9e5aff5bab98e..47550d4858193 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1724,7 +1724,8 @@ tls_decrypt_device(struct sock *sk, struct msghdr *msg, return 1; } -static int tls_check_pending_rekey(struct tls_context *ctx, struct sk_buff *skb) +static int tls_check_pending_rekey(struct sock *sk, struct tls_context *ctx, + struct sk_buff *skb) { const struct strp_msg *rxm = strp_msg(skb); const struct tls_msg *tlm = tls_msg(skb); @@ -1747,6 +1748,7 @@ static int tls_check_pending_rekey(struct tls_context *ctx, struct sk_buff *skb) struct tls_sw_context_rx *rx_ctx = ctx->priv_ctx_rx; WRITE_ONCE(rx_ctx->key_update_pending, true); + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXREKEYRECEIVED); } return 0; @@ -1771,7 +1773,7 @@ static int tls_rx_one_record(struct sock *sk, struct msghdr *msg, rxm->full_len -= prot->overhead_size; tls_advance_record_sn(sk, prot, &tls_ctx->rx); - return tls_check_pending_rekey(tls_ctx, darg->skb); + return tls_check_pending_rekey(sk, tls_ctx, darg->skb); } int decrypt_skb(struct sock *sk, struct scatterlist *sgout) From 5aa97a43d042fffa8bd0f0bc2723f3574310686e Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 12 Dec 2024 16:36:07 +0100 Subject: [PATCH 0548/1450] docs: tls: document TLS1.3 key updates Document the kernel's behavior and userspace expectations. Suggested-by: Jakub Kicinski Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- Documentation/networking/tls.rst | 36 ++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/Documentation/networking/tls.rst b/Documentation/networking/tls.rst index 658ed3a71e1b5..c7904a1bc1674 100644 --- a/Documentation/networking/tls.rst +++ b/Documentation/networking/tls.rst @@ -200,6 +200,32 @@ received without a cmsg buffer set. recv will never return data from mixed types of TLS records. +TLS 1.3 Key Updates +------------------- + +In TLS 1.3, KeyUpdate handshake messages signal that the sender is +updating its TX key. Any message sent after a KeyUpdate will be +encrypted using the new key. The userspace library can pass the new +key to the kernel using the TLS_TX and TLS_RX socket options, as for +the initial keys. TLS version and cipher cannot be changed. + +To prevent attempting to decrypt incoming records using the wrong key, +decryption will be paused when a KeyUpdate message is received by the +kernel, until the new key has been provided using the TLS_RX socket +option. Any read occurring after the KeyUpdate has been read and +before the new key is provided will fail with EKEYEXPIRED. poll() will +not report any read events from the socket until the new key is +provided. There is no pausing on the transmit side. + +Userspace should make sure that the crypto_info provided has been set +properly. In particular, the kernel will not check for key/nonce +reuse. + +The number of successful and failed key updates is tracked in the +``TlsTxRekeyOk``, ``TlsRxRekeyOk``, ``TlsTxRekeyError``, +``TlsRxRekeyError`` statistics. The ``TlsRxRekeyReceived`` statistic +counts KeyUpdate handshake messages that have been received. + Integrating in to userspace TLS library --------------------------------------- @@ -286,3 +312,13 @@ TLS implementation exposes the following per-namespace statistics - ``TlsRxNoPadViolation`` - number of data RX records which had to be re-decrypted due to ``TLS_RX_EXPECT_NO_PAD`` mis-prediction. + +- ``TlsTxRekeyOk``, ``TlsRxRekeyOk`` - + number of successful rekeys on existing sessions for TX and RX + +- ``TlsTxRekeyError``, ``TlsRxRekeyError`` - + number of failed rekeys on existing sessions for TX and RX + +- ``TlsRxRekeyReceived`` - + number of received KeyUpdate handshake messages, requiring userspace + to provide a new RX key From b2e584aa3c710802600b690f34a56fb526aebf2f Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 12 Dec 2024 16:36:08 +0100 Subject: [PATCH 0549/1450] selftests: tls: add key_generation argument to tls_crypto_info_init This allows us to generate different keys, so that we can test that rekey is using the correct one. Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- tools/testing/selftests/net/tls.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index 1a706d03bb6bb..b1f52d2bb0965 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -44,9 +44,11 @@ struct tls_crypto_info_keys { }; static void tls_crypto_info_init(uint16_t tls_version, uint16_t cipher_type, - struct tls_crypto_info_keys *tls12) + struct tls_crypto_info_keys *tls12, + char key_generation) { - memset(tls12, 0, sizeof(*tls12)); + memset(tls12, key_generation, sizeof(*tls12)); + memset(tls12, 0, sizeof(struct tls_crypto_info)); switch (cipher_type) { case TLS_CIPHER_CHACHA20_POLY1305: @@ -275,7 +277,7 @@ TEST_F(tls_basic, recseq_wrap) if (self->notls) SKIP(return, "no TLS support"); - tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_GCM_128, &tls12); + tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_GCM_128, &tls12, 0); memset(&tls12.aes128.rec_seq, 0xff, sizeof(tls12.aes128.rec_seq)); ASSERT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0); @@ -391,7 +393,7 @@ FIXTURE_SETUP(tls) SKIP(return, "Unsupported cipher in FIPS mode"); tls_crypto_info_init(variant->tls_version, variant->cipher_type, - &tls12); + &tls12, 0); ulp_sock_pair(_metadata, &self->fd, &self->cfd, &self->notls); @@ -1175,7 +1177,7 @@ TEST_F(tls, bidir) struct tls_crypto_info_keys tls12; tls_crypto_info_init(variant->tls_version, variant->cipher_type, - &tls12); + &tls12, 0); ret = setsockopt(self->fd, SOL_TLS, TLS_RX, &tls12, tls12.len); @@ -1614,7 +1616,7 @@ TEST_F(tls, getsockopt) EXPECT_EQ(get.crypto_info.cipher_type, variant->cipher_type); /* get the full crypto_info */ - tls_crypto_info_init(variant->tls_version, variant->cipher_type, &expect); + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &expect, 0); len = expect.len; memrnd(&get, sizeof(get)); EXPECT_EQ(getsockopt(self->fd, SOL_TLS, TLS_TX, &get, &len), 0); @@ -1696,7 +1698,7 @@ FIXTURE_SETUP(tls_err) int ret; tls_crypto_info_init(variant->tls_version, TLS_CIPHER_AES_GCM_128, - &tls12); + &tls12, 0); ulp_sock_pair(_metadata, &self->fd, &self->cfd, &self->notls); ulp_sock_pair(_metadata, &self->fd2, &self->cfd2, &self->notls); @@ -2118,7 +2120,7 @@ TEST(tls_v6ops) { int sfd, ret, fd; socklen_t len, len2; - tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_GCM_128, &tls12); + tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_GCM_128, &tls12, 0); addr.sin6_family = AF_INET6; addr.sin6_addr = in6addr_any; @@ -2177,7 +2179,7 @@ TEST(prequeue) { len = sizeof(addr); memrnd(buf, sizeof(buf)); - tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_GCM_256, &tls12); + tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_GCM_256, &tls12, 0); addr.sin_family = AF_INET; addr.sin_addr.s_addr = htonl(INADDR_ANY); From 555f0edb9ff043196655a5b7cc65f67dfd05b530 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 12 Dec 2024 16:36:09 +0100 Subject: [PATCH 0550/1450] selftests: tls: add rekey tests Test the kernel's ability to: - update the key (but not the version or cipher), only for TLS1.3 - pause decryption after receiving a KeyUpdate message, until a new RX key has been provided - reflect the pause/non-readable socket in poll() Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- tools/testing/selftests/net/tls.c | 458 ++++++++++++++++++++++++++++++ 1 file changed, 458 insertions(+) diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index b1f52d2bb0965..9a85f93c33d86 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -1670,6 +1670,464 @@ TEST_F(tls, recv_efault) EXPECT_EQ(memcmp(rec2, recv_mem + 9, ret - 9), 0); } +#define TLS_RECORD_TYPE_HANDSHAKE 0x16 +/* key_update, length 1, update_not_requested */ +static const char key_update_msg[] = "\x18\x00\x00\x01\x00"; +static void tls_send_keyupdate(struct __test_metadata *_metadata, int fd) +{ + size_t len = sizeof(key_update_msg); + + EXPECT_EQ(tls_send_cmsg(fd, TLS_RECORD_TYPE_HANDSHAKE, + (char *)key_update_msg, len, 0), + len); +} + +static void tls_recv_keyupdate(struct __test_metadata *_metadata, int fd, int flags) +{ + char buf[100]; + + EXPECT_EQ(tls_recv_cmsg(_metadata, fd, TLS_RECORD_TYPE_HANDSHAKE, buf, sizeof(buf), flags), + sizeof(key_update_msg)); + EXPECT_EQ(memcmp(buf, key_update_msg, sizeof(key_update_msg)), 0); +} + +/* set the key to 0 then 1 for RX, immediately to 1 for TX */ +TEST_F(tls_basic, rekey_rx) +{ + struct tls_crypto_info_keys tls12_0, tls12_1; + char const *test_str = "test_message"; + int send_len = strlen(test_str) + 1; + char buf[20]; + int ret; + + if (self->notls) + return; + + tls_crypto_info_init(TLS_1_3_VERSION, TLS_CIPHER_AES_GCM_128, + &tls12_0, 0); + tls_crypto_info_init(TLS_1_3_VERSION, TLS_CIPHER_AES_GCM_128, + &tls12_1, 1); + + ret = setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12_1, tls12_1.len); + ASSERT_EQ(ret, 0); + + ret = setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12_0, tls12_0.len); + ASSERT_EQ(ret, 0); + + ret = setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12_1, tls12_1.len); + EXPECT_EQ(ret, 0); + + EXPECT_EQ(send(self->fd, test_str, send_len, 0), send_len); + EXPECT_EQ(recv(self->cfd, buf, send_len, 0), send_len); + EXPECT_EQ(memcmp(buf, test_str, send_len), 0); +} + +/* set the key to 0 then 1 for TX, immediately to 1 for RX */ +TEST_F(tls_basic, rekey_tx) +{ + struct tls_crypto_info_keys tls12_0, tls12_1; + char const *test_str = "test_message"; + int send_len = strlen(test_str) + 1; + char buf[20]; + int ret; + + if (self->notls) + return; + + tls_crypto_info_init(TLS_1_3_VERSION, TLS_CIPHER_AES_GCM_128, + &tls12_0, 0); + tls_crypto_info_init(TLS_1_3_VERSION, TLS_CIPHER_AES_GCM_128, + &tls12_1, 1); + + ret = setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12_0, tls12_0.len); + ASSERT_EQ(ret, 0); + + ret = setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12_1, tls12_1.len); + ASSERT_EQ(ret, 0); + + ret = setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12_1, tls12_1.len); + EXPECT_EQ(ret, 0); + + EXPECT_EQ(send(self->fd, test_str, send_len, 0), send_len); + EXPECT_EQ(recv(self->cfd, buf, send_len, 0), send_len); + EXPECT_EQ(memcmp(buf, test_str, send_len), 0); +} + +TEST_F(tls, rekey) +{ + char const *test_str_1 = "test_message_before_rekey"; + char const *test_str_2 = "test_message_after_rekey"; + struct tls_crypto_info_keys tls12; + int send_len; + char buf[100]; + + if (variant->tls_version != TLS_1_3_VERSION) + return; + + /* initial send/recv */ + send_len = strlen(test_str_1) + 1; + EXPECT_EQ(send(self->fd, test_str_1, send_len, 0), send_len); + EXPECT_EQ(recv(self->cfd, buf, send_len, 0), send_len); + EXPECT_EQ(memcmp(buf, test_str_1, send_len), 0); + + /* update TX key */ + tls_send_keyupdate(_metadata, self->fd); + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1); + EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0); + + /* send after rekey */ + send_len = strlen(test_str_2) + 1; + EXPECT_EQ(send(self->fd, test_str_2, send_len, 0), send_len); + + /* can't receive the KeyUpdate without a control message */ + EXPECT_EQ(recv(self->cfd, buf, send_len, 0), -1); + + /* get KeyUpdate */ + tls_recv_keyupdate(_metadata, self->cfd, 0); + + /* recv blocking -> -EKEYEXPIRED */ + EXPECT_EQ(recv(self->cfd, buf, sizeof(buf), 0), -1); + EXPECT_EQ(errno, EKEYEXPIRED); + + /* recv non-blocking -> -EKEYEXPIRED */ + EXPECT_EQ(recv(self->cfd, buf, sizeof(buf), MSG_DONTWAIT), -1); + EXPECT_EQ(errno, EKEYEXPIRED); + + /* update RX key */ + EXPECT_EQ(setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len), 0); + + /* recv after rekey */ + EXPECT_NE(recv(self->cfd, buf, send_len, 0), -1); + EXPECT_EQ(memcmp(buf, test_str_2, send_len), 0); +} + +TEST_F(tls, rekey_fail) +{ + char const *test_str_1 = "test_message_before_rekey"; + char const *test_str_2 = "test_message_after_rekey"; + struct tls_crypto_info_keys tls12; + int send_len; + char buf[100]; + + /* initial send/recv */ + send_len = strlen(test_str_1) + 1; + EXPECT_EQ(send(self->fd, test_str_1, send_len, 0), send_len); + EXPECT_EQ(recv(self->cfd, buf, send_len, 0), send_len); + EXPECT_EQ(memcmp(buf, test_str_1, send_len), 0); + + /* update TX key */ + tls_send_keyupdate(_metadata, self->fd); + + if (variant->tls_version != TLS_1_3_VERSION) { + /* just check that rekey is not supported and return */ + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1); + EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), -1); + EXPECT_EQ(errno, EBUSY); + return; + } + + /* successful update */ + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1); + EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0); + + /* invalid update: change of version */ + tls_crypto_info_init(TLS_1_2_VERSION, variant->cipher_type, &tls12, 1); + EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), -1); + EXPECT_EQ(errno, EINVAL); + + /* invalid update (RX socket): change of version */ + tls_crypto_info_init(TLS_1_2_VERSION, variant->cipher_type, &tls12, 1); + EXPECT_EQ(setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len), -1); + EXPECT_EQ(errno, EINVAL); + + /* invalid update: change of cipher */ + if (variant->cipher_type == TLS_CIPHER_AES_GCM_256) + tls_crypto_info_init(variant->tls_version, TLS_CIPHER_CHACHA20_POLY1305, &tls12, 1); + else + tls_crypto_info_init(variant->tls_version, TLS_CIPHER_AES_GCM_256, &tls12, 1); + EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), -1); + EXPECT_EQ(errno, EINVAL); + + /* send after rekey, the invalid updates shouldn't have an effect */ + send_len = strlen(test_str_2) + 1; + EXPECT_EQ(send(self->fd, test_str_2, send_len, 0), send_len); + + /* can't receive the KeyUpdate without a control message */ + EXPECT_EQ(recv(self->cfd, buf, send_len, 0), -1); + + /* get KeyUpdate */ + tls_recv_keyupdate(_metadata, self->cfd, 0); + + /* recv blocking -> -EKEYEXPIRED */ + EXPECT_EQ(recv(self->cfd, buf, sizeof(buf), 0), -1); + EXPECT_EQ(errno, EKEYEXPIRED); + + /* recv non-blocking -> -EKEYEXPIRED */ + EXPECT_EQ(recv(self->cfd, buf, sizeof(buf), MSG_DONTWAIT), -1); + EXPECT_EQ(errno, EKEYEXPIRED); + + /* update RX key */ + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1); + EXPECT_EQ(setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len), 0); + + /* recv after rekey */ + EXPECT_NE(recv(self->cfd, buf, send_len, 0), -1); + EXPECT_EQ(memcmp(buf, test_str_2, send_len), 0); +} + +TEST_F(tls, rekey_peek) +{ + char const *test_str_1 = "test_message_before_rekey"; + struct tls_crypto_info_keys tls12; + int send_len; + char buf[100]; + + if (variant->tls_version != TLS_1_3_VERSION) + return; + + send_len = strlen(test_str_1) + 1; + EXPECT_EQ(send(self->fd, test_str_1, send_len, 0), send_len); + + /* update TX key */ + tls_send_keyupdate(_metadata, self->fd); + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1); + EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0); + + EXPECT_EQ(recv(self->cfd, buf, sizeof(buf), MSG_PEEK), send_len); + EXPECT_EQ(memcmp(buf, test_str_1, send_len), 0); + + EXPECT_EQ(recv(self->cfd, buf, send_len, 0), send_len); + EXPECT_EQ(memcmp(buf, test_str_1, send_len), 0); + + /* can't receive the KeyUpdate without a control message */ + EXPECT_EQ(recv(self->cfd, buf, send_len, MSG_PEEK), -1); + + /* peek KeyUpdate */ + tls_recv_keyupdate(_metadata, self->cfd, MSG_PEEK); + + /* get KeyUpdate */ + tls_recv_keyupdate(_metadata, self->cfd, 0); + + /* update RX key */ + EXPECT_EQ(setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len), 0); +} + +TEST_F(tls, splice_rekey) +{ + int send_len = TLS_PAYLOAD_MAX_LEN / 2; + char mem_send[TLS_PAYLOAD_MAX_LEN]; + char mem_recv[TLS_PAYLOAD_MAX_LEN]; + struct tls_crypto_info_keys tls12; + int p[2]; + + if (variant->tls_version != TLS_1_3_VERSION) + return; + + memrnd(mem_send, sizeof(mem_send)); + + ASSERT_GE(pipe(p), 0); + EXPECT_EQ(send(self->fd, mem_send, send_len, 0), send_len); + + /* update TX key */ + tls_send_keyupdate(_metadata, self->fd); + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1); + EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0); + + EXPECT_EQ(send(self->fd, mem_send, send_len, 0), send_len); + + EXPECT_EQ(splice(self->cfd, NULL, p[1], NULL, TLS_PAYLOAD_MAX_LEN, 0), send_len); + EXPECT_EQ(read(p[0], mem_recv, send_len), send_len); + EXPECT_EQ(memcmp(mem_send, mem_recv, send_len), 0); + + /* can't splice the KeyUpdate */ + EXPECT_EQ(splice(self->cfd, NULL, p[1], NULL, TLS_PAYLOAD_MAX_LEN, 0), -1); + EXPECT_EQ(errno, EINVAL); + + /* peek KeyUpdate */ + tls_recv_keyupdate(_metadata, self->cfd, MSG_PEEK); + + /* get KeyUpdate */ + tls_recv_keyupdate(_metadata, self->cfd, 0); + + /* can't splice before updating the key */ + EXPECT_EQ(splice(self->cfd, NULL, p[1], NULL, TLS_PAYLOAD_MAX_LEN, 0), -1); + EXPECT_EQ(errno, EKEYEXPIRED); + + /* update RX key */ + EXPECT_EQ(setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len), 0); + + EXPECT_EQ(splice(self->cfd, NULL, p[1], NULL, TLS_PAYLOAD_MAX_LEN, 0), send_len); + EXPECT_EQ(read(p[0], mem_recv, send_len), send_len); + EXPECT_EQ(memcmp(mem_send, mem_recv, send_len), 0); +} + +TEST_F(tls, rekey_peek_splice) +{ + char const *test_str_1 = "test_message_before_rekey"; + struct tls_crypto_info_keys tls12; + int send_len; + char buf[100]; + char mem_recv[TLS_PAYLOAD_MAX_LEN]; + int p[2]; + + if (variant->tls_version != TLS_1_3_VERSION) + return; + + ASSERT_GE(pipe(p), 0); + + send_len = strlen(test_str_1) + 1; + EXPECT_EQ(send(self->fd, test_str_1, send_len, 0), send_len); + + /* update TX key */ + tls_send_keyupdate(_metadata, self->fd); + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1); + EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0); + + EXPECT_EQ(recv(self->cfd, buf, sizeof(buf), MSG_PEEK), send_len); + EXPECT_EQ(memcmp(buf, test_str_1, send_len), 0); + + EXPECT_EQ(splice(self->cfd, NULL, p[1], NULL, TLS_PAYLOAD_MAX_LEN, 0), send_len); + EXPECT_EQ(read(p[0], mem_recv, send_len), send_len); + EXPECT_EQ(memcmp(mem_recv, test_str_1, send_len), 0); +} + +TEST_F(tls, rekey_getsockopt) +{ + struct tls_crypto_info_keys tls12; + struct tls_crypto_info_keys tls12_get; + socklen_t len; + + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 0); + + len = tls12.len; + EXPECT_EQ(getsockopt(self->fd, SOL_TLS, TLS_TX, &tls12_get, &len), 0); + EXPECT_EQ(len, tls12.len); + EXPECT_EQ(memcmp(&tls12_get, &tls12, tls12.len), 0); + + len = tls12.len; + EXPECT_EQ(getsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12_get, &len), 0); + EXPECT_EQ(len, tls12.len); + EXPECT_EQ(memcmp(&tls12_get, &tls12, tls12.len), 0); + + if (variant->tls_version != TLS_1_3_VERSION) + return; + + tls_send_keyupdate(_metadata, self->fd); + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1); + EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0); + + tls_recv_keyupdate(_metadata, self->cfd, 0); + EXPECT_EQ(setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len), 0); + + len = tls12.len; + EXPECT_EQ(getsockopt(self->fd, SOL_TLS, TLS_TX, &tls12_get, &len), 0); + EXPECT_EQ(len, tls12.len); + EXPECT_EQ(memcmp(&tls12_get, &tls12, tls12.len), 0); + + len = tls12.len; + EXPECT_EQ(getsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12_get, &len), 0); + EXPECT_EQ(len, tls12.len); + EXPECT_EQ(memcmp(&tls12_get, &tls12, tls12.len), 0); +} + +TEST_F(tls, rekey_poll_pending) +{ + char const *test_str = "test_message_after_rekey"; + struct tls_crypto_info_keys tls12; + struct pollfd pfd = { }; + int send_len; + int ret; + + if (variant->tls_version != TLS_1_3_VERSION) + return; + + /* update TX key */ + tls_send_keyupdate(_metadata, self->fd); + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1); + EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0); + + /* get KeyUpdate */ + tls_recv_keyupdate(_metadata, self->cfd, 0); + + /* send immediately after rekey */ + send_len = strlen(test_str) + 1; + EXPECT_EQ(send(self->fd, test_str, send_len, 0), send_len); + + /* key hasn't been updated, expect cfd to be non-readable */ + pfd.fd = self->cfd; + pfd.events = POLLIN; + EXPECT_EQ(poll(&pfd, 1, 0), 0); + + ret = fork(); + ASSERT_GE(ret, 0); + + if (ret) { + int pid2, status; + + /* wait before installing the new key */ + sleep(1); + + /* update RX key while poll() is sleeping */ + EXPECT_EQ(setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len), 0); + + pid2 = wait(&status); + EXPECT_EQ(pid2, ret); + EXPECT_EQ(status, 0); + } else { + pfd.fd = self->cfd; + pfd.events = POLLIN; + EXPECT_EQ(poll(&pfd, 1, 5000), 1); + + exit(!__test_passed(_metadata)); + } +} + +TEST_F(tls, rekey_poll_delay) +{ + char const *test_str = "test_message_after_rekey"; + struct tls_crypto_info_keys tls12; + struct pollfd pfd = { }; + int send_len; + int ret; + + if (variant->tls_version != TLS_1_3_VERSION) + return; + + /* update TX key */ + tls_send_keyupdate(_metadata, self->fd); + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12, 1); + EXPECT_EQ(setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12, tls12.len), 0); + + /* get KeyUpdate */ + tls_recv_keyupdate(_metadata, self->cfd, 0); + + ret = fork(); + ASSERT_GE(ret, 0); + + if (ret) { + int pid2, status; + + /* wait before installing the new key */ + sleep(1); + + /* update RX key while poll() is sleeping */ + EXPECT_EQ(setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len), 0); + + sleep(1); + send_len = strlen(test_str) + 1; + EXPECT_EQ(send(self->fd, test_str, send_len, 0), send_len); + + pid2 = wait(&status); + EXPECT_EQ(pid2, ret); + EXPECT_EQ(status, 0); + } else { + pfd.fd = self->cfd; + pfd.events = POLLIN; + EXPECT_EQ(poll(&pfd, 1, 5000), 1); + exit(!__test_passed(_metadata)); + } +} + FIXTURE(tls_err) { int fd, cfd; From 5e51e50e2324c9374d06ab05e3d7d09123e1114f Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Thu, 12 Dec 2024 18:06:41 +0100 Subject: [PATCH 0551/1450] net: Make dev_get_hwtstamp_phylib accessible Make the dev_get_hwtstamp_phylib function accessible in prevision to use it from ethtool to read the hwtstamp current configuration. Reviewed-by: Florian Fainelli Reviewed-by: Jacob Keller Signed-off-by: Kory Maincent Signed-off-by: David S. Miller --- net/core/dev.h | 2 ++ net/core/dev_ioctl.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/net/core/dev.h b/net/core/dev.h index d043dee25a685..357543cbde656 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -310,5 +310,7 @@ static inline void dev_xmit_recursion_dec(void) int dev_set_hwtstamp_phylib(struct net_device *dev, struct kernel_hwtstamp_config *cfg, struct netlink_ext_ack *extack); +int dev_get_hwtstamp_phylib(struct net_device *dev, + struct kernel_hwtstamp_config *cfg); #endif diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 46d43b9504713..67cf68817f232 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -266,8 +266,8 @@ static int dev_eth_ioctl(struct net_device *dev, * -EOPNOTSUPP for phylib for now, which is still more accurate than letting * the netdev handle the GET request. */ -static int dev_get_hwtstamp_phylib(struct net_device *dev, - struct kernel_hwtstamp_config *cfg) +int dev_get_hwtstamp_phylib(struct net_device *dev, + struct kernel_hwtstamp_config *cfg) { if (phy_is_default_hwtstamp(dev->phydev)) return phy_hwtstamp_get(dev->phydev, cfg); From b18fe47c0c093cc429f7e4d7694fdf0fc362aaf5 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Thu, 12 Dec 2024 18:06:42 +0100 Subject: [PATCH 0552/1450] net: Make net_hwtstamp_validate accessible Make the net_hwtstamp_validate function accessible in prevision to use it from ethtool to validate the hwtstamp configuration before setting it. Reviewed-by: Florian Fainelli Reviewed-by: Jacob Keller Signed-off-by: Kory Maincent Signed-off-by: David S. Miller --- net/core/dev.h | 1 + net/core/dev_ioctl.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/dev.h b/net/core/dev.h index 357543cbde656..aa91eed55a404 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -312,5 +312,6 @@ int dev_set_hwtstamp_phylib(struct net_device *dev, struct netlink_ext_ack *extack); int dev_get_hwtstamp_phylib(struct net_device *dev, struct kernel_hwtstamp_config *cfg); +int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg); #endif diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 67cf68817f232..1f09930fca26d 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -184,7 +184,7 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm return err; } -static int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg) +int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg) { enum hwtstamp_tx_types tx_type; enum hwtstamp_rx_filters rx_filter; From 35f7cad1743e04bf2944a2aadb6b6a42adc57bca Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Thu, 12 Dec 2024 18:06:43 +0100 Subject: [PATCH 0553/1450] net: Add the possibility to support a selected hwtstamp in netdevice Introduce the description of a hwtstamp provider, mainly defined with a the hwtstamp source and the phydev pointer. Add a hwtstamp provider description within the netdev structure to allow saving the hwtstamp we want to use. This prepares for future support of an ethtool netlink command to select the desired hwtstamp provider. By default, the old API that does not support hwtstamp selectability is used, meaning the hwtstamp provider pointer is unset. Signed-off-by: Kory Maincent Signed-off-by: David S. Miller --- drivers/net/phy/phy_device.c | 10 +++++++ include/linux/net_tstamp.h | 29 ++++++++++++++++++ include/linux/netdevice.h | 4 +++ include/uapi/linux/net_tstamp.h | 11 +++++++ net/core/dev_ioctl.c | 41 ++++++++++++++++++++++++-- net/core/timestamping.c | 52 +++++++++++++++++++++++++++++---- 6 files changed, 140 insertions(+), 7 deletions(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index b26bb33cd1d48..1a908af4175b2 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -1998,6 +1999,15 @@ void phy_detach(struct phy_device *phydev) phy_suspend(phydev); if (dev) { + struct hwtstamp_provider *hwprov; + + hwprov = rtnl_dereference(dev->hwprov); + /* Disable timestamp if it is the one selected */ + if (hwprov && hwprov->phydev == phydev) { + rcu_assign_pointer(dev->hwprov, NULL); + kfree_rcu(hwprov, rcu_head); + } + phydev->attached_dev->phydev = NULL; phydev->attached_dev = NULL; phy_link_topo_del_phy(dev, phydev); diff --git a/include/linux/net_tstamp.h b/include/linux/net_tstamp.h index 662074b08c94c..ff0758e88ea10 100644 --- a/include/linux/net_tstamp.h +++ b/include/linux/net_tstamp.h @@ -19,6 +19,33 @@ enum hwtstamp_source { HWTSTAMP_SOURCE_PHYLIB, }; +/** + * struct hwtstamp_provider_desc - hwtstamp provider description + * + * @index: index of the hwtstamp provider. + * @qualifier: hwtstamp provider qualifier. + */ +struct hwtstamp_provider_desc { + int index; + enum hwtstamp_provider_qualifier qualifier; +}; + +/** + * struct hwtstamp_provider - hwtstamp provider object + * + * @rcu_head: RCU callback used to free the struct. + * @source: source of the hwtstamp provider. + * @phydev: pointer of the phydev source in case a PTP coming from phylib + * @desc: hwtstamp provider description. + */ + +struct hwtstamp_provider { + struct rcu_head rcu_head; + enum hwtstamp_source source; + struct phy_device *phydev; + struct hwtstamp_provider_desc desc; +}; + /** * struct kernel_hwtstamp_config - Kernel copy of struct hwtstamp_config * @@ -31,6 +58,7 @@ enum hwtstamp_source { * copied the ioctl request back to user space * @source: indication whether timestamps should come from the netdev or from * an attached phylib PHY + * @qualifier: qualifier of the hwtstamp provider * * Prefer using this structure for in-kernel processing of hardware * timestamping configuration, over the inextensible struct hwtstamp_config @@ -43,6 +71,7 @@ struct kernel_hwtstamp_config { struct ifreq *ifr; bool copied_to_user; enum hwtstamp_source source; + enum hwtstamp_provider_qualifier qualifier; }; static inline void hwtstamp_config_to_kernel(struct kernel_hwtstamp_config *kernel_cfg, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d917949bba03f..2593019ad5b16 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -82,6 +82,7 @@ struct xdp_metadata_ops; struct xdp_md; struct ethtool_netdev_state; struct phy_link_topology; +struct hwtstamp_provider; typedef u32 xdp_features_t; @@ -2045,6 +2046,7 @@ enum netdev_reg_state { * * @neighbours: List heads pointing to this device's neighbours' * dev_list, one per address-family. + * @hwprov: Tracks which PTP performs hardware packet time stamping. * * FIXME: cleanup struct net_device such that network protocol info * moves out. @@ -2457,6 +2459,8 @@ struct net_device { struct hlist_head neighbours[NEIGH_NR_TABLES]; + struct hwtstamp_provider __rcu *hwprov; + u8 priv[] ____cacheline_aligned __counted_by(priv_len); } ____cacheline_aligned; diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index 858339d1c1c4c..55b0ab51096c1 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -13,6 +13,17 @@ #include #include /* for SO_TIMESTAMPING */ +/* + * Possible type of hwtstamp provider. Mainly "precise" the default one + * is for IEEE 1588 quality and "approx" is for NICs DMA point. + */ +enum hwtstamp_provider_qualifier { + HWTSTAMP_PROVIDER_QUALIFIER_PRECISE, + HWTSTAMP_PROVIDER_QUALIFIER_APPROX, + + HWTSTAMP_PROVIDER_QUALIFIER_CNT, +}; + /* SO_TIMESTAMPING flags */ enum { SOF_TIMESTAMPING_TX_HARDWARE = (1<<0), diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 1f09930fca26d..087a57b7e4fab 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -269,6 +270,21 @@ static int dev_eth_ioctl(struct net_device *dev, int dev_get_hwtstamp_phylib(struct net_device *dev, struct kernel_hwtstamp_config *cfg) { + struct hwtstamp_provider *hwprov; + + hwprov = rtnl_dereference(dev->hwprov); + if (hwprov) { + cfg->qualifier = hwprov->desc.qualifier; + if (hwprov->source == HWTSTAMP_SOURCE_PHYLIB && + hwprov->phydev) + return phy_hwtstamp_get(hwprov->phydev, cfg); + + if (hwprov->source == HWTSTAMP_SOURCE_NETDEV) + return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg); + + return -EOPNOTSUPP; + } + if (phy_is_default_hwtstamp(dev->phydev)) return phy_hwtstamp_get(dev->phydev, cfg); @@ -324,11 +340,32 @@ int dev_set_hwtstamp_phylib(struct net_device *dev, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; - bool phy_ts = phy_is_default_hwtstamp(dev->phydev); struct kernel_hwtstamp_config old_cfg = {}; + struct hwtstamp_provider *hwprov; + struct phy_device *phydev; bool changed = false; + bool phy_ts; int err; + hwprov = rtnl_dereference(dev->hwprov); + if (hwprov) { + if (hwprov->source == HWTSTAMP_SOURCE_PHYLIB && + hwprov->phydev) { + phy_ts = true; + phydev = hwprov->phydev; + } else if (hwprov->source == HWTSTAMP_SOURCE_NETDEV) { + phy_ts = false; + } else { + return -EOPNOTSUPP; + } + + cfg->qualifier = hwprov->desc.qualifier; + } else { + phy_ts = phy_is_default_hwtstamp(dev->phydev); + if (phy_ts) + phydev = dev->phydev; + } + cfg->source = phy_ts ? HWTSTAMP_SOURCE_PHYLIB : HWTSTAMP_SOURCE_NETDEV; if (phy_ts && dev->see_all_hwtstamp_requests) { @@ -350,7 +387,7 @@ int dev_set_hwtstamp_phylib(struct net_device *dev, changed = kernel_hwtstamp_config_changed(&old_cfg, cfg); if (phy_ts) { - err = phy_hwtstamp_set(dev->phydev, cfg, extack); + err = phy_hwtstamp_set(phydev, cfg, extack); if (err) { if (changed) ops->ndo_hwtstamp_set(dev, &old_cfg, NULL); diff --git a/net/core/timestamping.c b/net/core/timestamping.c index 3717fb152ecc4..a50a7ef49ae89 100644 --- a/net/core/timestamping.c +++ b/net/core/timestamping.c @@ -9,6 +9,7 @@ #include #include #include +#include static unsigned int classify(const struct sk_buff *skb) { @@ -21,19 +22,39 @@ static unsigned int classify(const struct sk_buff *skb) void skb_clone_tx_timestamp(struct sk_buff *skb) { + struct hwtstamp_provider *hwprov; struct mii_timestamper *mii_ts; + struct phy_device *phydev; struct sk_buff *clone; unsigned int type; - if (!skb->sk || !skb->dev || - !phy_is_default_hwtstamp(skb->dev->phydev)) + if (!skb->sk || !skb->dev) return; + rcu_read_lock(); + hwprov = rcu_dereference(skb->dev->hwprov); + if (hwprov) { + if (hwprov->source != HWTSTAMP_SOURCE_PHYLIB || + !hwprov->phydev) { + rcu_read_unlock(); + return; + } + + phydev = hwprov->phydev; + } else { + phydev = skb->dev->phydev; + if (!phy_is_default_hwtstamp(phydev)) { + rcu_read_unlock(); + return; + } + } + rcu_read_unlock(); + type = classify(skb); if (type == PTP_CLASS_NONE) return; - mii_ts = skb->dev->phydev->mii_ts; + mii_ts = phydev->mii_ts; if (likely(mii_ts->txtstamp)) { clone = skb_clone_sk(skb); if (!clone) @@ -45,12 +66,33 @@ EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp); bool skb_defer_rx_timestamp(struct sk_buff *skb) { + struct hwtstamp_provider *hwprov; struct mii_timestamper *mii_ts; + struct phy_device *phydev; unsigned int type; - if (!skb->dev || !phy_is_default_hwtstamp(skb->dev->phydev)) + if (!skb->dev) return false; + rcu_read_lock(); + hwprov = rcu_dereference(skb->dev->hwprov); + if (hwprov) { + if (hwprov->source != HWTSTAMP_SOURCE_PHYLIB || + !hwprov->phydev) { + rcu_read_unlock(); + return false; + } + + phydev = hwprov->phydev; + } else { + phydev = skb->dev->phydev; + if (!phy_is_default_hwtstamp(phydev)) { + rcu_read_unlock(); + return false; + } + } + rcu_read_unlock(); + if (skb_headroom(skb) < ETH_HLEN) return false; @@ -63,7 +105,7 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb) if (type == PTP_CLASS_NONE) return false; - mii_ts = skb->dev->phydev->mii_ts; + mii_ts = phydev->mii_ts; if (likely(mii_ts->rxtstamp)) return mii_ts->rxtstamp(mii_ts, skb, type); From b9e3f7dc9ed95daeb83cfa45b821cacaa01aa906 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Thu, 12 Dec 2024 18:06:44 +0100 Subject: [PATCH 0554/1450] net: ethtool: tsinfo: Enhance tsinfo to support several hwtstamp by net topology Either the MAC or the PHY can provide hwtstamp, so we should be able to read the tsinfo for any hwtstamp provider. Enhance 'get' command to retrieve tsinfo of hwtstamp providers within a network topology. Add support for a specific dump command to retrieve all hwtstamp providers within the network topology, with added functionality for filtered dump to target a single interface. Signed-off-by: Kory Maincent Signed-off-by: David S. Miller --- Documentation/netlink/specs/ethtool.yaml | 20 + Documentation/networking/ethtool-netlink.rst | 7 +- include/linux/ethtool.h | 4 + .../uapi/linux/ethtool_netlink_generated.h | 10 + net/ethtool/common.c | 141 ++++++- net/ethtool/common.h | 13 + net/ethtool/netlink.c | 6 +- net/ethtool/netlink.h | 5 +- net/ethtool/ts.h | 20 + net/ethtool/tsinfo.c | 358 +++++++++++++++++- 10 files changed, 563 insertions(+), 21 deletions(-) create mode 100644 net/ethtool/ts.h diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index c7634e957d9c4..4082816e5f3d8 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -836,6 +836,20 @@ attribute-sets: - name: tx-err type: uint + - + name: ts-hwtstamp-provider + attr-cnt-name: __ethtool-a-ts-hwtstamp-provider-cnt + attributes: + - + name: unspec + type: unused + value: 0 + - + name: index + type: u32 + - + name: qualifier + type: u32 - name: tsinfo attr-cnt-name: __ethtool-a-tsinfo-cnt @@ -867,6 +881,10 @@ attribute-sets: name: stats type: nest nested-attributes: ts-stat + - + name: hwtstamp-provider + type: nest + nested-attributes: ts-hwtstamp-provider - name: cable-result attr-cnt-name: __ethtool-a-cable-result-cnt @@ -1912,6 +1930,7 @@ operations: request: attributes: - header + - hwtstamp-provider reply: attributes: - header @@ -1920,6 +1939,7 @@ operations: - rx-filters - phc-index - stats + - hwtstamp-provider dump: *tsinfo-get-op - name: cable-test-act diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index b25926071ece0..c585e2f0ddfae 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -1245,9 +1245,10 @@ Gets timestamping information like ``ETHTOOL_GET_TS_INFO`` ioctl request. Request contents: - ===================================== ====== ========================== - ``ETHTOOL_A_TSINFO_HEADER`` nested request header - ===================================== ====== ========================== + ======================================== ====== ============================ + ``ETHTOOL_A_TSINFO_HEADER`` nested request header + ``ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER`` nested PTP hw clock provider + ======================================== ====== ============================ Kernel response contents: diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index e217c6321ed0d..f711bfd75c4dc 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -711,6 +711,7 @@ struct ethtool_rxfh_param { * @cmd: command number = %ETHTOOL_GET_TS_INFO * @so_timestamping: bit mask of the sum of the supported SO_TIMESTAMPING flags * @phc_index: device index of the associated PHC, or -1 if there is none + * @phc_qualifier: qualifier of the associated PHC * @tx_types: bit mask of the supported hwtstamp_tx_types enumeration values * @rx_filters: bit mask of the supported hwtstamp_rx_filters enumeration values */ @@ -718,6 +719,7 @@ struct kernel_ethtool_ts_info { u32 cmd; u32 so_timestamping; int phc_index; + enum hwtstamp_provider_qualifier phc_qualifier; enum hwtstamp_tx_types tx_types; enum hwtstamp_rx_filters rx_filters; }; @@ -749,6 +751,7 @@ struct kernel_ethtool_ts_info { * @rss_context argument to @create_rxfh_context and friends. * @supported_coalesce_params: supported types of interrupt coalescing. * @supported_ring_params: supported ring params. + * @supported_hwtstamp_qualifiers: bitfield of supported hwtstamp qualifier. * @get_drvinfo: Report driver/device information. Modern drivers no * longer have to implement this callback. Most fields are * correctly filled in by the core using system information, or @@ -966,6 +969,7 @@ struct ethtool_ops { u32 rxfh_max_num_contexts; u32 supported_coalesce_params; u32 supported_ring_params; + u32 supported_hwtstamp_qualifiers; void (*get_drvinfo)(struct net_device *, struct ethtool_drvinfo *); int (*get_regs_len)(struct net_device *); void (*get_regs)(struct net_device *, struct ethtool_regs *, void *); diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index b58f352fe4f2a..df289dde0f61b 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -385,6 +385,15 @@ enum { ETHTOOL_A_TS_STAT_MAX = (__ETHTOOL_A_TS_STAT_CNT - 1) }; +enum { + ETHTOOL_A_TS_HWTSTAMP_PROVIDER_UNSPEC, + ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX, + ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER, + + __ETHTOOL_A_TS_HWTSTAMP_PROVIDER_CNT, + ETHTOOL_A_TS_HWTSTAMP_PROVIDER_MAX = (__ETHTOOL_A_TS_HWTSTAMP_PROVIDER_CNT - 1) +}; + enum { ETHTOOL_A_TSINFO_UNSPEC, ETHTOOL_A_TSINFO_HEADER, @@ -393,6 +402,7 @@ enum { ETHTOOL_A_TSINFO_RX_FILTERS, ETHTOOL_A_TSINFO_PHC_INDEX, ETHTOOL_A_TSINFO_STATS, + ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER, __ETHTOOL_A_TSINFO_CNT, ETHTOOL_A_TSINFO_MAX = (__ETHTOOL_A_TSINFO_CNT - 1) diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 05ce4f8080b35..666db40bcfdaf 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -5,9 +5,12 @@ #include #include #include +#include #include "netlink.h" #include "common.h" +#include "../core/dev.h" + const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { [NETIF_F_SG_BIT] = "tx-scatter-gather", @@ -763,20 +766,91 @@ int ethtool_check_ops(const struct ethtool_ops *ops) return 0; } -int __ethtool_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *info) +static void ethtool_init_tsinfo(struct kernel_ethtool_ts_info *info) { - const struct ethtool_ops *ops = dev->ethtool_ops; - struct phy_device *phydev = dev->phydev; - int err = 0; - memset(info, 0, sizeof(*info)); info->cmd = ETHTOOL_GET_TS_INFO; info->phc_index = -1; +} + +int ethtool_net_get_ts_info_by_phc(struct net_device *dev, + struct kernel_ethtool_ts_info *info, + struct hwtstamp_provider_desc *hwprov_desc) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + int err; + + if (!ops->get_ts_info) + return -ENODEV; + + /* Does ptp comes from netdev */ + ethtool_init_tsinfo(info); + info->phc_qualifier = hwprov_desc->qualifier; + err = ops->get_ts_info(dev, info); + if (err) + return err; + + if (info->phc_index == hwprov_desc->index && + net_support_hwtstamp_qualifier(dev, hwprov_desc->qualifier)) + return 0; + + return -ENODEV; +} + +int +ethtool_phy_get_ts_info_by_phc(struct net_device *dev, + struct kernel_ethtool_ts_info *info, + struct hwtstamp_provider_desc *hwprov_desc) +{ + int err; + + /* Only precise qualifier is supported in phydev */ + if (hwprov_desc->qualifier != HWTSTAMP_PROVIDER_QUALIFIER_PRECISE) + return -ENODEV; + + /* Look in the phy topology */ + if (dev->link_topo) { + struct phy_device_node *pdn; + unsigned long phy_index; + + xa_for_each(&dev->link_topo->phys, phy_index, pdn) { + if (!phy_has_tsinfo(pdn->phy)) + continue; + + ethtool_init_tsinfo(info); + err = phy_ts_info(pdn->phy, info); + if (err) + return err; + + if (info->phc_index == hwprov_desc->index) + return 0; + } + return -ENODEV; + } + + /* Look on the dev->phydev */ + if (phy_has_tsinfo(dev->phydev)) { + ethtool_init_tsinfo(info); + err = phy_ts_info(dev->phydev, info); + if (err) + return err; + + if (info->phc_index == hwprov_desc->index) + return 0; + } + + return -ENODEV; +} + +int ethtool_get_ts_info_by_phc(struct net_device *dev, + struct kernel_ethtool_ts_info *info, + struct hwtstamp_provider_desc *hwprov_desc) +{ + int err; - if (phy_is_default_hwtstamp(phydev) && phy_has_tsinfo(phydev)) - err = phy_ts_info(phydev, info); - else if (ops->get_ts_info) - err = ops->get_ts_info(dev, info); + err = ethtool_net_get_ts_info_by_phc(dev, info, hwprov_desc); + if (err == -ENODEV) + err = ethtool_phy_get_ts_info_by_phc(dev, info, hwprov_desc); info->so_timestamping |= SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE; @@ -784,6 +858,55 @@ int __ethtool_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info return err; } +int __ethtool_get_ts_info(struct net_device *dev, + struct kernel_ethtool_ts_info *info) +{ + struct hwtstamp_provider *hwprov; + + hwprov = rtnl_dereference(dev->hwprov); + /* No provider specified, use default behavior */ + if (!hwprov) { + const struct ethtool_ops *ops = dev->ethtool_ops; + struct phy_device *phydev = dev->phydev; + int err = 0; + + ethtool_init_tsinfo(info); + if (phy_is_default_hwtstamp(phydev) && + phy_has_tsinfo(phydev)) + err = phy_ts_info(phydev, info); + else if (ops->get_ts_info) + err = ops->get_ts_info(dev, info); + + info->so_timestamping |= SOF_TIMESTAMPING_RX_SOFTWARE | + SOF_TIMESTAMPING_SOFTWARE; + + return err; + } + + return ethtool_get_ts_info_by_phc(dev, info, &hwprov->desc); +} + +bool net_support_hwtstamp_qualifier(struct net_device *dev, + enum hwtstamp_provider_qualifier qualifier) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + + if (!ops) + return false; + + /* Return true with precise qualifier and with NIC without + * qualifier description to not break the old behavior. + */ + if (!ops->supported_hwtstamp_qualifiers && + qualifier == HWTSTAMP_PROVIDER_QUALIFIER_PRECISE) + return true; + + if (ops->supported_hwtstamp_qualifiers & BIT(qualifier)) + return true; + + return false; +} + int ethtool_get_phc_vclocks(struct net_device *dev, int **vclock_index) { struct kernel_ethtool_ts_info info = { }; diff --git a/net/ethtool/common.h b/net/ethtool/common.h index 4a2de3ce7354f..f5119204c8ff6 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -21,6 +21,7 @@ struct link_mode_info { }; struct genl_info; +struct hwtstamp_provider_desc; extern const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]; @@ -49,6 +50,18 @@ int ethtool_check_max_channel(struct net_device *dev, struct genl_info *info); int ethtool_check_rss_ctx_busy(struct net_device *dev, u32 rss_context); int __ethtool_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *info); +int ethtool_get_ts_info_by_phc(struct net_device *dev, + struct kernel_ethtool_ts_info *info, + struct hwtstamp_provider_desc *hwprov_desc); +int ethtool_net_get_ts_info_by_phc(struct net_device *dev, + struct kernel_ethtool_ts_info *info, + struct hwtstamp_provider_desc *hwprov_desc); +int +ethtool_phy_get_ts_info_by_phc(struct net_device *dev, + struct kernel_ethtool_ts_info *info, + struct hwtstamp_provider_desc *hwprov_desc); +bool net_support_hwtstamp_qualifier(struct net_device *dev, + enum hwtstamp_provider_qualifier qualifier); extern const struct ethtool_phy_ops *ethtool_phy_ops; extern const struct ethtool_pse_ops *ethtool_pse_ops; diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index e3f0ef6b851bb..6ae1d91f36e7e 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -1074,9 +1074,9 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_TSINFO_GET, .doit = ethnl_default_doit, - .start = ethnl_default_start, - .dumpit = ethnl_default_dumpit, - .done = ethnl_default_done, + .start = ethnl_tsinfo_start, + .dumpit = ethnl_tsinfo_dumpit, + .done = ethnl_tsinfo_done, .policy = ethnl_tsinfo_get_policy, .maxattr = ARRAY_SIZE(ethnl_tsinfo_get_policy) - 1, }, diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 203b08eb6c6f6..960cda13e4fc2 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -464,7 +464,7 @@ extern const struct nla_policy ethnl_pause_get_policy[ETHTOOL_A_PAUSE_STATS_SRC extern const struct nla_policy ethnl_pause_set_policy[ETHTOOL_A_PAUSE_TX + 1]; extern const struct nla_policy ethnl_eee_get_policy[ETHTOOL_A_EEE_HEADER + 1]; extern const struct nla_policy ethnl_eee_set_policy[ETHTOOL_A_EEE_TX_LPI_TIMER + 1]; -extern const struct nla_policy ethnl_tsinfo_get_policy[ETHTOOL_A_TSINFO_HEADER + 1]; +extern const struct nla_policy ethnl_tsinfo_get_policy[ETHTOOL_A_TSINFO_MAX + 1]; extern const struct nla_policy ethnl_cable_test_act_policy[ETHTOOL_A_CABLE_TEST_HEADER + 1]; extern const struct nla_policy ethnl_cable_test_tdr_act_policy[ETHTOOL_A_CABLE_TEST_TDR_CFG + 1]; extern const struct nla_policy ethnl_tunnel_info_get_policy[ETHTOOL_A_TUNNEL_INFO_HEADER + 1]; @@ -499,6 +499,9 @@ int ethnl_phy_start(struct netlink_callback *cb); int ethnl_phy_doit(struct sk_buff *skb, struct genl_info *info); int ethnl_phy_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int ethnl_phy_done(struct netlink_callback *cb); +int ethnl_tsinfo_start(struct netlink_callback *cb); +int ethnl_tsinfo_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int ethnl_tsinfo_done(struct netlink_callback *cb); extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN]; diff --git a/net/ethtool/ts.h b/net/ethtool/ts.h new file mode 100644 index 0000000000000..d901a879a6710 --- /dev/null +++ b/net/ethtool/ts.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef _NET_ETHTOOL_TS_H +#define _NET_ETHTOOL_TS_H + +#include "netlink.h" + +static const struct nla_policy +ethnl_ts_hwtst_prov_policy[ETHTOOL_A_TS_HWTSTAMP_PROVIDER_MAX + 1] = { + [ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX] = { .type = NLA_U32 }, + [ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER] = + NLA_POLICY_MAX(NLA_U32, HWTSTAMP_PROVIDER_QUALIFIER_CNT - 1) +}; + +int ts_parse_hwtst_provider(const struct nlattr *nest, + struct hwtstamp_provider_desc *hwprov_desc, + struct netlink_ext_ack *extack, + bool *mod); + +#endif /* _NET_ETHTOOL_TS_H */ diff --git a/net/ethtool/tsinfo.c b/net/ethtool/tsinfo.c index 03d12d6f79ca0..7e495a41aeec5 100644 --- a/net/ethtool/tsinfo.c +++ b/net/ethtool/tsinfo.c @@ -1,13 +1,18 @@ // SPDX-License-Identifier: GPL-2.0-only #include +#include +#include +#include #include "netlink.h" #include "common.h" #include "bitset.h" +#include "ts.h" struct tsinfo_req_info { struct ethnl_req_info base; + struct hwtstamp_provider_desc hwprov_desc; }; struct tsinfo_reply_data { @@ -16,34 +21,96 @@ struct tsinfo_reply_data { struct ethtool_ts_stats stats; }; +#define TSINFO_REQINFO(__req_base) \ + container_of(__req_base, struct tsinfo_req_info, base) + #define TSINFO_REPDATA(__reply_base) \ container_of(__reply_base, struct tsinfo_reply_data, base) #define ETHTOOL_TS_STAT_CNT \ (__ETHTOOL_A_TS_STAT_CNT - (ETHTOOL_A_TS_STAT_UNSPEC + 1)) -const struct nla_policy ethnl_tsinfo_get_policy[] = { +const struct nla_policy ethnl_tsinfo_get_policy[ETHTOOL_A_TSINFO_MAX + 1] = { [ETHTOOL_A_TSINFO_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_stats), + [ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER] = + NLA_POLICY_NESTED(ethnl_ts_hwtst_prov_policy), }; +int ts_parse_hwtst_provider(const struct nlattr *nest, + struct hwtstamp_provider_desc *hwprov_desc, + struct netlink_ext_ack *extack, + bool *mod) +{ + struct nlattr *tb[ARRAY_SIZE(ethnl_ts_hwtst_prov_policy)]; + int ret; + + ret = nla_parse_nested(tb, + ARRAY_SIZE(ethnl_ts_hwtst_prov_policy) - 1, + nest, + ethnl_ts_hwtst_prov_policy, extack); + if (ret < 0) + return ret; + + if (NL_REQ_ATTR_CHECK(extack, nest, tb, + ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX) || + NL_REQ_ATTR_CHECK(extack, nest, tb, + ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER)) + return -EINVAL; + + ethnl_update_u32(&hwprov_desc->index, + tb[ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX], + mod); + ethnl_update_u32(&hwprov_desc->qualifier, + tb[ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER], + mod); + + return 0; +} + +static int +tsinfo_parse_request(struct ethnl_req_info *req_base, struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct tsinfo_req_info *req = TSINFO_REQINFO(req_base); + bool mod = false; + + req->hwprov_desc.index = -1; + + if (!tb[ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER]) + return 0; + + return ts_parse_hwtst_provider(tb[ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER], + &req->hwprov_desc, extack, &mod); +} + static int tsinfo_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct tsinfo_reply_data *data = TSINFO_REPDATA(reply_base); + struct tsinfo_req_info *req = TSINFO_REQINFO(req_base); struct net_device *dev = reply_base->dev; int ret; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; + + if (req->hwprov_desc.index != -1) { + ret = ethtool_get_ts_info_by_phc(dev, &data->ts_info, + &req->hwprov_desc); + ethnl_ops_complete(dev); + return ret; + } + if (req_base->flags & ETHTOOL_FLAG_STATS) { ethtool_stats_init((u64 *)&data->stats, sizeof(data->stats) / sizeof(u64)); if (dev->ethtool_ops->get_ts_stats) dev->ethtool_ops->get_ts_stats(dev, &data->stats); } + ret = __ethtool_get_ts_info(dev, &data->ts_info); ethnl_ops_complete(dev); @@ -87,8 +154,11 @@ static int tsinfo_reply_size(const struct ethnl_req_info *req_base, return ret; len += ret; /* _TSINFO_RX_FILTERS */ } - if (ts_info->phc_index >= 0) + if (ts_info->phc_index >= 0) { len += nla_total_size(sizeof(u32)); /* _TSINFO_PHC_INDEX */ + /* _TSINFO_HWTSTAMP_PROVIDER */ + len += nla_total_size(0) + 2 * nla_total_size(sizeof(u32)); + } if (req_base->flags & ETHTOOL_FLAG_STATS) len += nla_total_size(0) + /* _TSINFO_STATS */ nla_total_size_64bit(sizeof(u64)) * ETHTOOL_TS_STAT_CNT; @@ -163,9 +233,29 @@ static int tsinfo_fill_reply(struct sk_buff *skb, if (ret < 0) return ret; } - if (ts_info->phc_index >= 0 && - nla_put_u32(skb, ETHTOOL_A_TSINFO_PHC_INDEX, ts_info->phc_index)) - return -EMSGSIZE; + if (ts_info->phc_index >= 0) { + struct nlattr *nest; + + ret = nla_put_u32(skb, ETHTOOL_A_TSINFO_PHC_INDEX, + ts_info->phc_index); + if (ret) + return -EMSGSIZE; + + nest = nla_nest_start(skb, ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u32(skb, ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX, + ts_info->phc_index) || + nla_put_u32(skb, + ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER, + ts_info->phc_qualifier)) { + nla_nest_cancel(skb, nest); + return -EMSGSIZE; + } + + nla_nest_end(skb, nest); + } if (req_base->flags & ETHTOOL_FLAG_STATS && tsinfo_put_stats(skb, &data->stats)) return -EMSGSIZE; @@ -173,6 +263,263 @@ static int tsinfo_fill_reply(struct sk_buff *skb, return 0; } +struct ethnl_tsinfo_dump_ctx { + struct tsinfo_req_info *req_info; + struct tsinfo_reply_data *reply_data; + unsigned long pos_ifindex; + bool netdev_dump_done; + unsigned long pos_phyindex; + enum hwtstamp_provider_qualifier pos_phcqualifier; +}; + +static void *ethnl_tsinfo_prepare_dump(struct sk_buff *skb, + struct net_device *dev, + struct tsinfo_reply_data *reply_data, + struct netlink_callback *cb) +{ + struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx; + void *ehdr = NULL; + + ehdr = ethnl_dump_put(skb, cb, + ETHTOOL_MSG_TSINFO_GET_REPLY); + if (!ehdr) + return ERR_PTR(-EMSGSIZE); + + reply_data = ctx->reply_data; + memset(reply_data, 0, sizeof(*reply_data)); + reply_data->base.dev = dev; + memset(&reply_data->ts_info, 0, sizeof(reply_data->ts_info)); + + return ehdr; +} + +static int ethnl_tsinfo_end_dump(struct sk_buff *skb, + struct net_device *dev, + struct tsinfo_req_info *req_info, + struct tsinfo_reply_data *reply_data, + void *ehdr) +{ + int ret; + + reply_data->ts_info.so_timestamping |= SOF_TIMESTAMPING_RX_SOFTWARE | + SOF_TIMESTAMPING_SOFTWARE; + + ret = ethnl_fill_reply_header(skb, dev, ETHTOOL_A_TSINFO_HEADER); + if (ret < 0) + return ret; + + ret = tsinfo_fill_reply(skb, &req_info->base, &reply_data->base); + if (ret < 0) + return ret; + + reply_data->base.dev = NULL; + genlmsg_end(skb, ehdr); + + return ret; +} + +static int ethnl_tsinfo_dump_one_phydev(struct sk_buff *skb, + struct net_device *dev, + struct phy_device *phydev, + struct netlink_callback *cb) +{ + struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx; + struct tsinfo_reply_data *reply_data; + struct tsinfo_req_info *req_info; + void *ehdr = NULL; + int ret = 0; + + if (!phy_has_tsinfo(phydev)) + return -EOPNOTSUPP; + + reply_data = ctx->reply_data; + req_info = ctx->req_info; + ehdr = ethnl_tsinfo_prepare_dump(skb, dev, reply_data, cb); + if (IS_ERR(ehdr)) + return PTR_ERR(ehdr); + + ret = phy_ts_info(phydev, &reply_data->ts_info); + if (ret < 0) + goto err; + + ret = ethnl_tsinfo_end_dump(skb, dev, req_info, reply_data, ehdr); + if (ret < 0) + goto err; + + return ret; +err: + genlmsg_cancel(skb, ehdr); + return ret; +} + +static int ethnl_tsinfo_dump_one_netdev(struct sk_buff *skb, + struct net_device *dev, + struct netlink_callback *cb) +{ + struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx; + const struct ethtool_ops *ops = dev->ethtool_ops; + struct tsinfo_reply_data *reply_data; + struct tsinfo_req_info *req_info; + void *ehdr = NULL; + int ret = 0; + + if (!ops->get_ts_info) + return -EOPNOTSUPP; + + reply_data = ctx->reply_data; + req_info = ctx->req_info; + for (; ctx->pos_phcqualifier < HWTSTAMP_PROVIDER_QUALIFIER_CNT; + ctx->pos_phcqualifier++) { + if (!net_support_hwtstamp_qualifier(dev, + ctx->pos_phcqualifier)) + continue; + + ehdr = ethnl_tsinfo_prepare_dump(skb, dev, reply_data, cb); + if (IS_ERR(ehdr)) { + ret = PTR_ERR(ehdr); + goto err; + } + + reply_data->ts_info.phc_qualifier = ctx->pos_phcqualifier; + ret = ops->get_ts_info(dev, &reply_data->ts_info); + if (ret < 0) + goto err; + + ret = ethnl_tsinfo_end_dump(skb, dev, req_info, reply_data, + ehdr); + if (ret < 0) + goto err; + } + + return ret; + +err: + genlmsg_cancel(skb, ehdr); + return ret; +} + +static int ethnl_tsinfo_dump_one_net_topo(struct sk_buff *skb, + struct net_device *dev, + struct netlink_callback *cb) +{ + struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx; + struct phy_device_node *pdn; + int ret = 0; + + if (!ctx->netdev_dump_done) { + ret = ethnl_tsinfo_dump_one_netdev(skb, dev, cb); + if (ret < 0 && ret != -EOPNOTSUPP) + return ret; + ctx->netdev_dump_done = true; + } + + if (!dev->link_topo) { + if (phy_has_tsinfo(dev->phydev)) { + ret = ethnl_tsinfo_dump_one_phydev(skb, dev, + dev->phydev, cb); + if (ret < 0 && ret != -EOPNOTSUPP) + return ret; + } + + return 0; + } + + xa_for_each_start(&dev->link_topo->phys, ctx->pos_phyindex, pdn, + ctx->pos_phyindex) { + if (phy_has_tsinfo(pdn->phy)) { + ret = ethnl_tsinfo_dump_one_phydev(skb, dev, + pdn->phy, cb); + if (ret < 0 && ret != -EOPNOTSUPP) + return ret; + } + } + + return ret; +} + +int ethnl_tsinfo_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx; + struct net *net = sock_net(skb->sk); + struct net_device *dev; + int ret = 0; + + rtnl_lock(); + if (ctx->req_info->base.dev) { + ret = ethnl_tsinfo_dump_one_net_topo(skb, + ctx->req_info->base.dev, + cb); + } else { + for_each_netdev_dump(net, dev, ctx->pos_ifindex) { + ret = ethnl_tsinfo_dump_one_net_topo(skb, dev, cb); + if (ret < 0 && ret != -EOPNOTSUPP) + break; + ctx->pos_phyindex = 0; + ctx->netdev_dump_done = false; + ctx->pos_phcqualifier = HWTSTAMP_PROVIDER_QUALIFIER_PRECISE; + } + } + rtnl_unlock(); + + return ret; +} + +int ethnl_tsinfo_start(struct netlink_callback *cb) +{ + const struct genl_dumpit_info *info = genl_dumpit_info(cb); + struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx; + struct nlattr **tb = info->info.attrs; + struct tsinfo_reply_data *reply_data; + struct tsinfo_req_info *req_info; + int ret; + + BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); + + req_info = kzalloc(sizeof(*req_info), GFP_KERNEL); + if (!req_info) + return -ENOMEM; + reply_data = kzalloc(sizeof(*reply_data), GFP_KERNEL); + if (!reply_data) { + ret = -ENOMEM; + goto free_req_info; + } + + ret = ethnl_parse_header_dev_get(&req_info->base, + tb[ETHTOOL_A_TSINFO_HEADER], + sock_net(cb->skb->sk), cb->extack, + false); + if (ret < 0) + goto free_reply_data; + + ctx->req_info = req_info; + ctx->reply_data = reply_data; + ctx->pos_ifindex = 0; + ctx->pos_phyindex = 0; + ctx->netdev_dump_done = false; + ctx->pos_phcqualifier = HWTSTAMP_PROVIDER_QUALIFIER_PRECISE; + + return 0; + +free_reply_data: + kfree(reply_data); +free_req_info: + kfree(req_info); + + return ret; +} + +int ethnl_tsinfo_done(struct netlink_callback *cb) +{ + struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx; + struct tsinfo_req_info *req_info = ctx->req_info; + + ethnl_parse_header_dev_put(&req_info->base); + kfree(ctx->reply_data); + kfree(ctx->req_info); + + return 0; +} + const struct ethnl_request_ops ethnl_tsinfo_request_ops = { .request_cmd = ETHTOOL_MSG_TSINFO_GET, .reply_cmd = ETHTOOL_MSG_TSINFO_GET_REPLY, @@ -180,6 +527,7 @@ const struct ethnl_request_ops ethnl_tsinfo_request_ops = { .req_info_size = sizeof(struct tsinfo_req_info), .reply_data_size = sizeof(struct tsinfo_reply_data), + .parse_request = tsinfo_parse_request, .prepare_data = tsinfo_prepare_data, .reply_size = tsinfo_reply_size, .fill_reply = tsinfo_fill_reply, From 6e9e2eed4f39d52edf5fd006409d211facf49f6b Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Thu, 12 Dec 2024 18:06:45 +0100 Subject: [PATCH 0555/1450] net: ethtool: Add support for tsconfig command to get/set hwtstamp config Introduce support for ETHTOOL_MSG_TSCONFIG_GET/SET ethtool netlink socket to read and configure hwtstamp configuration of a PHC provider. Note that simultaneous hwtstamp isn't supported; configuring a new one disables the previous setting. Signed-off-by: Kory Maincent Signed-off-by: David S. Miller --- Documentation/netlink/specs/ethtool.yaml | 56 +++ Documentation/networking/ethtool-netlink.rst | 75 +++ Documentation/networking/timestamping.rst | 38 +- .../uapi/linux/ethtool_netlink_generated.h | 16 + net/ethtool/Makefile | 2 +- net/ethtool/common.c | 27 +- net/ethtool/common.h | 2 +- net/ethtool/netlink.c | 18 + net/ethtool/netlink.h | 3 + net/ethtool/tsconfig.c | 444 ++++++++++++++++++ 10 files changed, 655 insertions(+), 26 deletions(-) create mode 100644 net/ethtool/tsconfig.c diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 4082816e5f3d8..60f85fbf41561 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -1489,6 +1489,33 @@ attribute-sets: - name: downstream-sfp-name type: string + - + name: tsconfig + attr-cnt-name: __ethtool-a-tsconfig-cnt + attributes: + - + name: unspec + type: unused + value: 0 + - + name: header + type: nest + nested-attributes: header + - + name: hwtstamp-provider + type: nest + nested-attributes: ts-hwtstamp-provider + - + name: tx-types + type: nest + nested-attributes: bitset + - + name: rx-filters + type: nest + nested-attributes: bitset + - + name: hwtstamp-flags + type: u32 operations: enum-model: directional @@ -2314,3 +2341,32 @@ operations: name: phy-ntf doc: Notification for change in PHY devices. notify: phy-get + - + name: tsconfig-get + doc: Get hwtstamp config. + + attribute-set: tsconfig + + do: &tsconfig-get-op + request: + attributes: + - header + reply: + attributes: &tsconfig + - header + - hwtstamp-provider + - tx-types + - rx-filters + - hwtstamp-flags + dump: *tsconfig-get-op + - + name: tsconfig-set + doc: Set hwtstamp config. + + attribute-set: tsconfig + + do: + request: + attributes: *tsconfig + reply: + attributes: *tsconfig diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index c585e2f0ddfae..a7ba6368a4d52 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -237,6 +237,8 @@ Userspace to kernel: ``ETHTOOL_MSG_MM_SET`` set MAC merge layer parameters ``ETHTOOL_MSG_MODULE_FW_FLASH_ACT`` flash transceiver module firmware ``ETHTOOL_MSG_PHY_GET`` get Ethernet PHY information + ``ETHTOOL_MSG_TSCONFIG_GET`` get hw timestamping configuration + ``ETHTOOL_MSG_TSCONFIG_SET`` set hw timestamping configuration ===================================== ================================= Kernel to userspace: @@ -286,6 +288,8 @@ Kernel to userspace: ``ETHTOOL_MSG_MODULE_FW_FLASH_NTF`` transceiver module flash updates ``ETHTOOL_MSG_PHY_GET_REPLY`` Ethernet PHY information ``ETHTOOL_MSG_PHY_NTF`` Ethernet PHY information change + ``ETHTOOL_MSG_TSCONFIG_GET_REPLY`` hw timestamping configuration + ``ETHTOOL_MSG_TSCONFIG_SET_REPLY`` new hw timestamping configuration ======================================== ================================= ``GET`` requests are sent by userspace applications to retrieve device @@ -2244,6 +2248,75 @@ Kernel response contents: When ``ETHTOOL_A_PHY_UPSTREAM_TYPE`` is PHY_UPSTREAM_PHY, the PHY's parent is another PHY. +TSCONFIG_GET +============ + +Retrieves the information about the current hardware timestamping source and +configuration. + +It is similar to the deprecated ``SIOCGHWTSTAMP`` ioctl request. + +Request contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_TSCONFIG_HEADER`` nested request header + ==================================== ====== ========================== + +Kernel response contents: + + ======================================== ====== ============================ + ``ETHTOOL_A_TSCONFIG_HEADER`` nested request header + ``ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER`` nested PTP hw clock provider + ``ETHTOOL_A_TSCONFIG_TX_TYPES`` bitset hwtstamp Tx type + ``ETHTOOL_A_TSCONFIG_RX_FILTERS`` bitset hwtstamp Rx filter + ``ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS`` u32 hwtstamp flags + ======================================== ====== ============================ + +When set the ``ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER`` attribute identifies the +source of the hw timestamping provider. It is composed by +``ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX`` attribute which describe the index of +the PTP device and ``ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER`` which describe +the qualifier of the timestamp. + +When set the ``ETHTOOL_A_TSCONFIG_TX_TYPES``, ``ETHTOOL_A_TSCONFIG_RX_FILTERS`` +and the ``ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS`` attributes identify the Tx +type, the Rx filter and the flags configured for the current hw timestamping +provider. The attributes are propagated to the driver through the following +structure: + +.. kernel-doc:: include/linux/net_tstamp.h + :identifiers: kernel_hwtstamp_config + +TSCONFIG_SET +============ + +Set the information about the current hardware timestamping source and +configuration. + +It is similar to the deprecated ``SIOCSHWTSTAMP`` ioctl request. + +Request contents: + + ======================================== ====== ============================ + ``ETHTOOL_A_TSCONFIG_HEADER`` nested request header + ``ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER`` nested PTP hw clock provider + ``ETHTOOL_A_TSCONFIG_TX_TYPES`` bitset hwtstamp Tx type + ``ETHTOOL_A_TSCONFIG_RX_FILTERS`` bitset hwtstamp Rx filter + ``ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS`` u32 hwtstamp flags + ======================================== ====== ============================ + +Kernel response contents: + + ======================================== ====== ============================ + ``ETHTOOL_A_TSCONFIG_HEADER`` nested request header + ``ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER`` nested PTP hw clock provider + ``ETHTOOL_A_TSCONFIG_TX_TYPES`` bitset hwtstamp Tx type + ``ETHTOOL_A_TSCONFIG_RX_FILTERS`` bitset hwtstamp Rx filter + ``ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS`` u32 hwtstamp flags + ======================================== ====== ============================ + +For a description of each attribute, see ``TSCONFIG_GET``. + Request translation =================== @@ -2352,4 +2425,6 @@ are netlink only. n/a ``ETHTOOL_MSG_MM_SET`` n/a ``ETHTOOL_MSG_MODULE_FW_FLASH_ACT`` n/a ``ETHTOOL_MSG_PHY_GET`` + ``SIOCGHWTSTAMP`` ``ETHTOOL_MSG_TSCONFIG_GET`` + ``SIOCSHWTSTAMP`` ``ETHTOOL_MSG_TSCONFIG_SET`` =================================== ===================================== diff --git a/Documentation/networking/timestamping.rst b/Documentation/networking/timestamping.rst index b37bfbfc7d797..61ef9da10e284 100644 --- a/Documentation/networking/timestamping.rst +++ b/Documentation/networking/timestamping.rst @@ -525,8 +525,8 @@ implicitly defined. ts[0] holds a software timestamp if set, ts[1] is again deprecated and ts[2] holds a hardware timestamp if set. -3. Hardware Timestamping configuration: SIOCSHWTSTAMP and SIOCGHWTSTAMP -======================================================================= +3. Hardware Timestamping configuration: ETHTOOL_MSG_TSCONFIG_SET/GET +==================================================================== Hardware time stamping must also be initialized for each device driver that is expected to do hardware time stamping. The parameter is defined in @@ -539,12 +539,14 @@ include/uapi/linux/net_tstamp.h as:: }; Desired behavior is passed into the kernel and to a specific device by -calling ioctl(SIOCSHWTSTAMP) with a pointer to a struct ifreq whose -ifr_data points to a struct hwtstamp_config. The tx_type and -rx_filter are hints to the driver what it is expected to do. If -the requested fine-grained filtering for incoming packets is not -supported, the driver may time stamp more than just the requested types -of packets. +calling the tsconfig netlink socket ``ETHTOOL_MSG_TSCONFIG_SET``. +The ``ETHTOOL_A_TSCONFIG_TX_TYPES``, ``ETHTOOL_A_TSCONFIG_RX_FILTERS`` and +``ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS`` netlink attributes are then used to set +the struct hwtstamp_config accordingly. + +The ``ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER`` netlink nested attribute is used +to select the source of the hardware time stamping. It is composed of an index +for the device source and a qualifier for the type of time stamping. Drivers are free to use a more permissive configuration than the requested configuration. It is expected that drivers should only implement directly the @@ -563,9 +565,16 @@ Only a processes with admin rights may change the configuration. User space is responsible to ensure that multiple processes don't interfere with each other and that the settings are reset. -Any process can read the actual configuration by passing this -structure to ioctl(SIOCGHWTSTAMP) in the same way. However, this has -not been implemented in all drivers. +Any process can read the actual configuration by requesting tsconfig netlink +socket ``ETHTOOL_MSG_TSCONFIG_GET``. + +The legacy configuration is the use of the ioctl(SIOCSHWTSTAMP) with a pointer +to a struct ifreq whose ifr_data points to a struct hwtstamp_config. +The tx_type and rx_filter are hints to the driver what it is expected to do. +If the requested fine-grained filtering for incoming packets is not +supported, the driver may time stamp more than just the requested types +of packets. ioctl(SIOCGHWTSTAMP) is used in the same way as the +ioctl(SIOCSHWTSTAMP). However, this has not been implemented in all drivers. :: @@ -610,9 +619,10 @@ not been implemented in all drivers. -------------------------------------------------------- A driver which supports hardware time stamping must support the -SIOCSHWTSTAMP ioctl and update the supplied struct hwtstamp_config with -the actual values as described in the section on SIOCSHWTSTAMP. It -should also support SIOCGHWTSTAMP. +ndo_hwtstamp_set NDO or the legacy SIOCSHWTSTAMP ioctl and update the +supplied struct hwtstamp_config with the actual values as described in +the section on SIOCSHWTSTAMP. It should also support ndo_hwtstamp_get or +the legacy SIOCGHWTSTAMP. Time stamps for received packets must be stored in the skb. To get a pointer to the shared time stamp structure of the skb call skb_hwtstamps(). Then diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index df289dde0f61b..43993a2d68e56 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -694,6 +694,18 @@ enum { ETHTOOL_A_PHY_MAX = (__ETHTOOL_A_PHY_CNT - 1) }; +enum { + ETHTOOL_A_TSCONFIG_UNSPEC, + ETHTOOL_A_TSCONFIG_HEADER, + ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER, + ETHTOOL_A_TSCONFIG_TX_TYPES, + ETHTOOL_A_TSCONFIG_RX_FILTERS, + ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS, + + __ETHTOOL_A_TSCONFIG_CNT, + ETHTOOL_A_TSCONFIG_MAX = (__ETHTOOL_A_TSCONFIG_CNT - 1) +}; + enum { ETHTOOL_MSG_USER_NONE = 0, ETHTOOL_MSG_STRSET_GET = 1, @@ -741,6 +753,8 @@ enum { ETHTOOL_MSG_MM_SET, ETHTOOL_MSG_MODULE_FW_FLASH_ACT, ETHTOOL_MSG_PHY_GET, + ETHTOOL_MSG_TSCONFIG_GET, + ETHTOOL_MSG_TSCONFIG_SET, __ETHTOOL_MSG_USER_CNT, ETHTOOL_MSG_USER_MAX = (__ETHTOOL_MSG_USER_CNT - 1) @@ -794,6 +808,8 @@ enum { ETHTOOL_MSG_MODULE_FW_FLASH_NTF, ETHTOOL_MSG_PHY_GET_REPLY, ETHTOOL_MSG_PHY_NTF, + ETHTOOL_MSG_TSCONFIG_GET_REPLY, + ETHTOOL_MSG_TSCONFIG_SET_REPLY, __ETHTOOL_MSG_KERNEL_CNT, ETHTOOL_MSG_KERNEL_MAX = (__ETHTOOL_MSG_KERNEL_CNT - 1) diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 9b540644ba31f..a1490c4afe6bc 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -9,4 +9,4 @@ ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o rss.o \ channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \ tunnels.o fec.o eeprom.o stats.o phc_vclocks.o mm.o \ module.o cmis_fw_update.o cmis_cdb.o pse-pd.o plca.o mm.o \ - phy.o + phy.o tsconfig.o diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 666db40bcfdaf..02f941f667dd5 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -797,7 +797,7 @@ int ethtool_net_get_ts_info_by_phc(struct net_device *dev, return -ENODEV; } -int +struct phy_device * ethtool_phy_get_ts_info_by_phc(struct net_device *dev, struct kernel_ethtool_ts_info *info, struct hwtstamp_provider_desc *hwprov_desc) @@ -806,7 +806,7 @@ ethtool_phy_get_ts_info_by_phc(struct net_device *dev, /* Only precise qualifier is supported in phydev */ if (hwprov_desc->qualifier != HWTSTAMP_PROVIDER_QUALIFIER_PRECISE) - return -ENODEV; + return ERR_PTR(-ENODEV); /* Look in the phy topology */ if (dev->link_topo) { @@ -820,12 +820,12 @@ ethtool_phy_get_ts_info_by_phc(struct net_device *dev, ethtool_init_tsinfo(info); err = phy_ts_info(pdn->phy, info); if (err) - return err; + return ERR_PTR(err); if (info->phc_index == hwprov_desc->index) - return 0; + return pdn->phy; } - return -ENODEV; + return ERR_PTR(-ENODEV); } /* Look on the dev->phydev */ @@ -833,13 +833,13 @@ ethtool_phy_get_ts_info_by_phc(struct net_device *dev, ethtool_init_tsinfo(info); err = phy_ts_info(dev->phydev, info); if (err) - return err; + return ERR_PTR(err); if (info->phc_index == hwprov_desc->index) - return 0; + return dev->phydev; } - return -ENODEV; + return ERR_PTR(-ENODEV); } int ethtool_get_ts_info_by_phc(struct net_device *dev, @@ -849,8 +849,15 @@ int ethtool_get_ts_info_by_phc(struct net_device *dev, int err; err = ethtool_net_get_ts_info_by_phc(dev, info, hwprov_desc); - if (err == -ENODEV) - err = ethtool_phy_get_ts_info_by_phc(dev, info, hwprov_desc); + if (err == -ENODEV) { + struct phy_device *phy; + + phy = ethtool_phy_get_ts_info_by_phc(dev, info, hwprov_desc); + if (IS_ERR(phy)) + err = PTR_ERR(phy); + else + err = 0; + } info->so_timestamping |= SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE; diff --git a/net/ethtool/common.h b/net/ethtool/common.h index f5119204c8ff6..850eadde4bfcc 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -56,7 +56,7 @@ int ethtool_get_ts_info_by_phc(struct net_device *dev, int ethtool_net_get_ts_info_by_phc(struct net_device *dev, struct kernel_ethtool_ts_info *info, struct hwtstamp_provider_desc *hwprov_desc); -int +struct phy_device * ethtool_phy_get_ts_info_by_phc(struct net_device *dev, struct kernel_ethtool_ts_info *info, struct hwtstamp_provider_desc *hwprov_desc); diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 6ae1d91f36e7e..849c98e637c65 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -394,6 +394,8 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_PLCA_GET_STATUS] = ðnl_plca_status_request_ops, [ETHTOOL_MSG_MM_GET] = ðnl_mm_request_ops, [ETHTOOL_MSG_MM_SET] = ðnl_mm_request_ops, + [ETHTOOL_MSG_TSCONFIG_GET] = ðnl_tsconfig_request_ops, + [ETHTOOL_MSG_TSCONFIG_SET] = ðnl_tsconfig_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) @@ -1243,6 +1245,22 @@ static const struct genl_ops ethtool_genl_ops[] = { .policy = ethnl_phy_get_policy, .maxattr = ARRAY_SIZE(ethnl_phy_get_policy) - 1, }, + { + .cmd = ETHTOOL_MSG_TSCONFIG_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + .policy = ethnl_tsconfig_get_policy, + .maxattr = ARRAY_SIZE(ethnl_tsconfig_get_policy) - 1, + }, + { + .cmd = ETHTOOL_MSG_TSCONFIG_SET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_default_set_doit, + .policy = ethnl_tsconfig_set_policy, + .maxattr = ARRAY_SIZE(ethnl_tsconfig_set_policy) - 1, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 960cda13e4fc2..0a09298fff92b 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -435,6 +435,7 @@ extern const struct ethnl_request_ops ethnl_plca_cfg_request_ops; extern const struct ethnl_request_ops ethnl_plca_status_request_ops; extern const struct ethnl_request_ops ethnl_mm_request_ops; extern const struct ethnl_request_ops ethnl_phy_request_ops; +extern const struct ethnl_request_ops ethnl_tsconfig_request_ops; extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1]; @@ -485,6 +486,8 @@ extern const struct nla_policy ethnl_mm_get_policy[ETHTOOL_A_MM_HEADER + 1]; extern const struct nla_policy ethnl_mm_set_policy[ETHTOOL_A_MM_MAX + 1]; extern const struct nla_policy ethnl_module_fw_flash_act_policy[ETHTOOL_A_MODULE_FW_FLASH_PASSWORD + 1]; extern const struct nla_policy ethnl_phy_get_policy[ETHTOOL_A_PHY_HEADER + 1]; +extern const struct nla_policy ethnl_tsconfig_get_policy[ETHTOOL_A_TSCONFIG_HEADER + 1]; +extern const struct nla_policy ethnl_tsconfig_set_policy[ETHTOOL_A_TSCONFIG_MAX + 1]; int ethnl_set_features(struct sk_buff *skb, struct genl_info *info); int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info); diff --git a/net/ethtool/tsconfig.c b/net/ethtool/tsconfig.c new file mode 100644 index 0000000000000..9188e088fb2f9 --- /dev/null +++ b/net/ethtool/tsconfig.c @@ -0,0 +1,444 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include + +#include "netlink.h" +#include "common.h" +#include "bitset.h" +#include "../core/dev.h" +#include "ts.h" + +struct tsconfig_req_info { + struct ethnl_req_info base; +}; + +struct tsconfig_reply_data { + struct ethnl_reply_data base; + struct hwtstamp_provider_desc hwprov_desc; + struct { + u32 tx_type; + u32 rx_filter; + u32 flags; + } hwtst_config; +}; + +#define TSCONFIG_REPDATA(__reply_base) \ + container_of(__reply_base, struct tsconfig_reply_data, base) + +const struct nla_policy ethnl_tsconfig_get_policy[ETHTOOL_A_TSCONFIG_HEADER + 1] = { + [ETHTOOL_A_TSCONFIG_HEADER] = + NLA_POLICY_NESTED(ethnl_header_policy), +}; + +static int tsconfig_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + const struct genl_info *info) +{ + struct tsconfig_reply_data *data = TSCONFIG_REPDATA(reply_base); + struct hwtstamp_provider *hwprov = NULL; + struct net_device *dev = reply_base->dev; + struct kernel_hwtstamp_config cfg = {}; + int ret; + + if (!dev->netdev_ops->ndo_hwtstamp_get) + return -EOPNOTSUPP; + + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + + ret = dev_get_hwtstamp_phylib(dev, &cfg); + if (ret) + goto out; + + data->hwtst_config.tx_type = BIT(cfg.tx_type); + data->hwtst_config.rx_filter = BIT(cfg.rx_filter); + data->hwtst_config.flags = BIT(cfg.flags); + + data->hwprov_desc.index = -1; + hwprov = rtnl_dereference(dev->hwprov); + if (hwprov) { + data->hwprov_desc.index = hwprov->desc.index; + data->hwprov_desc.qualifier = hwprov->desc.qualifier; + } else { + struct kernel_ethtool_ts_info ts_info = {}; + + ts_info.phc_index = -1; + ret = __ethtool_get_ts_info(dev, &ts_info); + if (ret) + goto out; + + if (ts_info.phc_index == -1) + return -ENODEV; + + data->hwprov_desc.index = ts_info.phc_index; + data->hwprov_desc.qualifier = ts_info.phc_qualifier; + } + +out: + ethnl_ops_complete(dev); + return ret; +} + +static int tsconfig_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct tsconfig_reply_data *data = TSCONFIG_REPDATA(reply_base); + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + int len = 0; + int ret; + + BUILD_BUG_ON(__HWTSTAMP_TX_CNT > 32); + BUILD_BUG_ON(__HWTSTAMP_FILTER_CNT > 32); + + if (data->hwtst_config.flags) + /* _TSCONFIG_HWTSTAMP_FLAGS */ + len += nla_total_size(sizeof(u32)); + + if (data->hwtst_config.tx_type) { + ret = ethnl_bitset32_size(&data->hwtst_config.tx_type, + NULL, __HWTSTAMP_TX_CNT, + ts_tx_type_names, compact); + if (ret < 0) + return ret; + len += ret; /* _TSCONFIG_TX_TYPES */ + } + if (data->hwtst_config.rx_filter) { + ret = ethnl_bitset32_size(&data->hwtst_config.rx_filter, + NULL, __HWTSTAMP_FILTER_CNT, + ts_rx_filter_names, compact); + if (ret < 0) + return ret; + len += ret; /* _TSCONFIG_RX_FILTERS */ + } + + if (data->hwprov_desc.index >= 0) + /* _TSCONFIG_HWTSTAMP_PROVIDER */ + len += nla_total_size(0) + + 2 * nla_total_size(sizeof(u32)); + + return len; +} + +static int tsconfig_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct tsconfig_reply_data *data = TSCONFIG_REPDATA(reply_base); + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + int ret; + + if (data->hwtst_config.flags) { + ret = nla_put_u32(skb, ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS, + data->hwtst_config.flags); + if (ret < 0) + return ret; + } + + if (data->hwtst_config.tx_type) { + ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSCONFIG_TX_TYPES, + &data->hwtst_config.tx_type, NULL, + __HWTSTAMP_TX_CNT, + ts_tx_type_names, compact); + if (ret < 0) + return ret; + } + + if (data->hwtst_config.rx_filter) { + ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSCONFIG_RX_FILTERS, + &data->hwtst_config.rx_filter, + NULL, __HWTSTAMP_FILTER_CNT, + ts_rx_filter_names, compact); + if (ret < 0) + return ret; + } + + if (data->hwprov_desc.index >= 0) { + struct nlattr *nest; + + nest = nla_nest_start(skb, ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u32(skb, ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX, + data->hwprov_desc.index) || + nla_put_u32(skb, + ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER, + data->hwprov_desc.qualifier)) { + nla_nest_cancel(skb, nest); + return -EMSGSIZE; + } + + nla_nest_end(skb, nest); + } + return 0; +} + +/* TSCONFIG_SET */ +const struct nla_policy ethnl_tsconfig_set_policy[ETHTOOL_A_TSCONFIG_MAX + 1] = { + [ETHTOOL_A_TSCONFIG_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), + [ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER] = + NLA_POLICY_NESTED(ethnl_ts_hwtst_prov_policy), + [ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS] = { .type = NLA_U32 }, + [ETHTOOL_A_TSCONFIG_RX_FILTERS] = { .type = NLA_NESTED }, + [ETHTOOL_A_TSCONFIG_TX_TYPES] = { .type = NLA_NESTED }, +}; + +static int tsconfig_send_reply(struct net_device *dev, struct genl_info *info) +{ + struct tsconfig_reply_data *reply_data; + struct tsconfig_req_info *req_info; + struct sk_buff *rskb; + void *reply_payload; + int reply_len = 0; + int ret; + + req_info = kzalloc(sizeof(*req_info), GFP_KERNEL); + if (!req_info) + return -ENOMEM; + reply_data = kmalloc(sizeof(*reply_data), GFP_KERNEL); + if (!reply_data) { + kfree(req_info); + return -ENOMEM; + } + + ASSERT_RTNL(); + reply_data->base.dev = dev; + ret = tsconfig_prepare_data(&req_info->base, &reply_data->base, info); + if (ret < 0) + goto err_cleanup; + + ret = tsconfig_reply_size(&req_info->base, &reply_data->base); + if (ret < 0) + goto err_cleanup; + + reply_len = ret + ethnl_reply_header_size(); + rskb = ethnl_reply_init(reply_len, dev, ETHTOOL_MSG_TSCONFIG_SET_REPLY, + ETHTOOL_A_TSCONFIG_HEADER, info, &reply_payload); + if (!rskb) + goto err_cleanup; + + ret = tsconfig_fill_reply(rskb, &req_info->base, &reply_data->base); + if (ret < 0) + goto err_cleanup; + + genlmsg_end(rskb, reply_payload); + ret = genlmsg_reply(rskb, info); + +err_cleanup: + kfree(reply_data); + kfree(req_info); + return ret; +} + +static int ethnl_set_tsconfig_validate(struct ethnl_req_info *req_base, + struct genl_info *info) +{ + const struct net_device_ops *ops = req_base->dev->netdev_ops; + + if (!ops->ndo_hwtstamp_set || !ops->ndo_hwtstamp_get) + return -EOPNOTSUPP; + + return 1; +} + +static struct hwtstamp_provider * +tsconfig_set_hwprov_from_desc(struct net_device *dev, + struct genl_info *info, + struct hwtstamp_provider_desc *hwprov_desc) +{ + struct kernel_ethtool_ts_info ts_info; + struct hwtstamp_provider *hwprov; + struct nlattr **tb = info->attrs; + struct phy_device *phy = NULL; + enum hwtstamp_source source; + int ret; + + ret = ethtool_net_get_ts_info_by_phc(dev, &ts_info, hwprov_desc); + if (!ret) { + /* Found */ + source = HWTSTAMP_SOURCE_NETDEV; + } else { + phy = ethtool_phy_get_ts_info_by_phc(dev, &ts_info, hwprov_desc); + if (IS_ERR(phy)) { + if (PTR_ERR(phy) == -ENODEV) + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER], + "phc not in this net device topology"); + return ERR_CAST(phy); + } + + source = HWTSTAMP_SOURCE_PHYLIB; + } + + hwprov = kzalloc(sizeof(*hwprov), GFP_KERNEL); + if (!hwprov) + return ERR_PTR(-ENOMEM); + + hwprov->desc.index = hwprov_desc->index; + hwprov->desc.qualifier = hwprov_desc->qualifier; + hwprov->source = source; + hwprov->phydev = phy; + + return hwprov; +} + +static int ethnl_set_tsconfig(struct ethnl_req_info *req_base, + struct genl_info *info) +{ + struct kernel_hwtstamp_config hwtst_config = {0}; + bool hwprov_mod = false, config_mod = false; + struct hwtstamp_provider *hwprov = NULL; + struct net_device *dev = req_base->dev; + struct nlattr **tb = info->attrs; + int ret; + + BUILD_BUG_ON(__HWTSTAMP_TX_CNT >= 32); + BUILD_BUG_ON(__HWTSTAMP_FILTER_CNT >= 32); + + if (!netif_device_present(dev)) + return -ENODEV; + + if (tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER]) { + struct hwtstamp_provider_desc __hwprov_desc = {.index = -1}; + struct hwtstamp_provider *__hwprov; + + __hwprov = rtnl_dereference(dev->hwprov); + if (__hwprov) { + __hwprov_desc.index = __hwprov->desc.index; + __hwprov_desc.qualifier = __hwprov->desc.qualifier; + } + + ret = ts_parse_hwtst_provider(tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER], + &__hwprov_desc, info->extack, + &hwprov_mod); + if (ret < 0) + return ret; + + if (hwprov_mod) { + hwprov = tsconfig_set_hwprov_from_desc(dev, info, + &__hwprov_desc); + if (IS_ERR(hwprov)) + return PTR_ERR(hwprov); + } + } + + /* Get current hwtstamp config if we are not changing the + * hwtstamp source. It will be zeroed in the other case. + */ + if (!hwprov_mod) { + ret = dev_get_hwtstamp_phylib(dev, &hwtst_config); + if (ret < 0 && ret != -EOPNOTSUPP) + goto err_free_hwprov; + } + + /* Get the hwtstamp config from netlink */ + if (tb[ETHTOOL_A_TSCONFIG_TX_TYPES]) { + u32 req_tx_type; + + req_tx_type = BIT(hwtst_config.tx_type); + ret = ethnl_update_bitset32(&req_tx_type, + __HWTSTAMP_TX_CNT, + tb[ETHTOOL_A_TSCONFIG_TX_TYPES], + ts_tx_type_names, info->extack, + &config_mod); + if (ret < 0) + goto err_free_hwprov; + + /* Select only one tx type at a time */ + if (ffs(req_tx_type) != fls(req_tx_type)) { + ret = -EINVAL; + goto err_free_hwprov; + } + + hwtst_config.tx_type = ffs(req_tx_type) - 1; + } + + if (tb[ETHTOOL_A_TSCONFIG_RX_FILTERS]) { + u32 req_rx_filter; + + req_rx_filter = BIT(hwtst_config.rx_filter); + ret = ethnl_update_bitset32(&req_rx_filter, + __HWTSTAMP_FILTER_CNT, + tb[ETHTOOL_A_TSCONFIG_RX_FILTERS], + ts_rx_filter_names, info->extack, + &config_mod); + if (ret < 0) + goto err_free_hwprov; + + /* Select only one rx filter at a time */ + if (ffs(req_rx_filter) != fls(req_rx_filter)) { + ret = -EINVAL; + goto err_free_hwprov; + } + + hwtst_config.rx_filter = ffs(req_rx_filter) - 1; + } + + if (tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS]) { + ethnl_update_u32(&hwtst_config.flags, + tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS], + &config_mod); + } + + ret = net_hwtstamp_validate(&hwtst_config); + if (ret) + goto err_free_hwprov; + + if (hwprov_mod) { + struct kernel_hwtstamp_config zero_config = {0}; + struct hwtstamp_provider *__hwprov; + + /* Disable current time stamping if we try to enable + * another one + */ + ret = dev_set_hwtstamp_phylib(dev, &zero_config, info->extack); + if (ret < 0) + goto err_free_hwprov; + + /* Change the selected hwtstamp source */ + __hwprov = rcu_replace_pointer_rtnl(dev->hwprov, hwprov); + if (__hwprov) + kfree_rcu(__hwprov, rcu_head); + } + + if (config_mod) { + ret = dev_set_hwtstamp_phylib(dev, &hwtst_config, + info->extack); + if (ret < 0) + return ret; + } + + if (hwprov_mod || config_mod) { + ret = tsconfig_send_reply(dev, info); + if (ret && ret != -EOPNOTSUPP) { + NL_SET_ERR_MSG(info->extack, + "error while reading the new configuration set"); + return ret; + } + } + + /* tsconfig has no notification */ + return 0; + +err_free_hwprov: + kfree(hwprov); + + return ret; +} + +const struct ethnl_request_ops ethnl_tsconfig_request_ops = { + .request_cmd = ETHTOOL_MSG_TSCONFIG_GET, + .reply_cmd = ETHTOOL_MSG_TSCONFIG_GET_REPLY, + .hdr_attr = ETHTOOL_A_TSCONFIG_HEADER, + .req_info_size = sizeof(struct tsconfig_req_info), + .reply_data_size = sizeof(struct tsconfig_reply_data), + + .prepare_data = tsconfig_prepare_data, + .reply_size = tsconfig_reply_size, + .fill_reply = tsconfig_fill_reply, + + .set_validate = ethnl_set_tsconfig_validate, + .set = ethnl_set_tsconfig, +}; From 7b00af2c5414dc01e0718deef7ead81102867636 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 16 Dec 2024 20:53:08 +0800 Subject: [PATCH 0556/1450] erofs: use `struct erofs_device_info` for the primary device Instead of just listing each one directly in `struct erofs_sb_info` except that we still use `sb->s_bdev` for the primary block device. Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20241216125310.930933-2-hsiangkao@linux.alibaba.com --- fs/erofs/data.c | 12 ++++-------- fs/erofs/fscache.c | 6 +++--- fs/erofs/internal.h | 8 ++------ fs/erofs/super.c | 27 +++++++++++++-------------- 4 files changed, 22 insertions(+), 31 deletions(-) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 1c49f8962021f..622017c659587 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -56,10 +56,10 @@ void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb) buf->file = NULL; if (erofs_is_fileio_mode(sbi)) { - buf->file = sbi->fdev; /* some fs like FUSE needs it */ + buf->file = sbi->dif0.file; /* some fs like FUSE needs it */ buf->mapping = buf->file->f_mapping; } else if (erofs_is_fscache_mode(sb)) - buf->mapping = sbi->s_fscache->inode->i_mapping; + buf->mapping = sbi->dif0.fscache->inode->i_mapping; else buf->mapping = sb->s_bdev->bd_mapping; } @@ -201,12 +201,8 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) erofs_off_t startoff, length; int id; - map->m_bdev = sb->s_bdev; - map->m_daxdev = EROFS_SB(sb)->dax_dev; - map->m_dax_part_off = EROFS_SB(sb)->dax_part_off; - map->m_fscache = EROFS_SB(sb)->s_fscache; - map->m_fp = EROFS_SB(sb)->fdev; - + erofs_fill_from_devinfo(map, &EROFS_SB(sb)->dif0); + map->m_bdev = sb->s_bdev; /* use s_bdev for the primary device */ if (map->m_deviceid) { down_read(&devs->rwsem); dif = idr_find(&devs->tree, map->m_deviceid - 1); diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index fda16eedafb57..ce7e38c827198 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -657,7 +657,7 @@ int erofs_fscache_register_fs(struct super_block *sb) if (IS_ERR(fscache)) return PTR_ERR(fscache); - sbi->s_fscache = fscache; + sbi->dif0.fscache = fscache; return 0; } @@ -665,14 +665,14 @@ void erofs_fscache_unregister_fs(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); - erofs_fscache_unregister_cookie(sbi->s_fscache); + erofs_fscache_unregister_cookie(sbi->dif0.fscache); if (sbi->domain) erofs_fscache_domain_put(sbi->domain); else fscache_relinquish_volume(sbi->volume, NULL, false); - sbi->s_fscache = NULL; + sbi->dif0.fscache = NULL; sbi->volume = NULL; sbi->domain = NULL; } diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 1c847c30a918f..3e8d71d516f42 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -107,6 +107,7 @@ struct erofs_xattr_prefix_item { }; struct erofs_sb_info { + struct erofs_device_info dif0; struct erofs_mount_opts opt; /* options */ #ifdef CONFIG_EROFS_FS_ZIP /* list for all registered superblocks, mainly for shrinker */ @@ -124,13 +125,9 @@ struct erofs_sb_info { struct erofs_sb_lz4_info lz4; #endif /* CONFIG_EROFS_FS_ZIP */ - struct file *fdev; struct inode *packed_inode; struct erofs_dev_context *devs; - struct dax_device *dax_dev; - u64 dax_part_off; u64 total_blocks; - u32 primarydevice_blocks; u32 meta_blkaddr; #ifdef CONFIG_EROFS_FS_XATTR @@ -166,7 +163,6 @@ struct erofs_sb_info { /* fscache support */ struct fscache_volume *volume; - struct erofs_fscache *s_fscache; struct erofs_domain *domain; char *fsid; char *domain_id; @@ -187,7 +183,7 @@ struct erofs_sb_info { static inline bool erofs_is_fileio_mode(struct erofs_sb_info *sbi) { - return IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) && sbi->fdev; + return IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) && sbi->dif0.file; } static inline bool erofs_is_fscache_mode(struct super_block *sb) diff --git a/fs/erofs/super.c b/fs/erofs/super.c index de8e3ecc6381d..9044907354e12 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -203,7 +203,7 @@ static int erofs_scan_devices(struct super_block *sb, struct erofs_device_info *dif; int id, err = 0; - sbi->total_blocks = sbi->primarydevice_blocks; + sbi->total_blocks = sbi->dif0.blocks; if (!erofs_sb_has_device_table(sbi)) ondisk_extradevs = 0; else @@ -307,7 +307,7 @@ static int erofs_read_superblock(struct super_block *sb) sbi->sb_size); goto out; } - sbi->primarydevice_blocks = le32_to_cpu(dsb->blocks); + sbi->dif0.blocks = le32_to_cpu(dsb->blocks); sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr); #ifdef CONFIG_EROFS_FS_XATTR sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr); @@ -602,9 +602,8 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) return -EINVAL; } - sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev, - &sbi->dax_part_off, - NULL, NULL); + sbi->dif0.dax_dev = fs_dax_get_by_bdev(sb->s_bdev, + &sbi->dif0.dax_part_off, NULL, NULL); } err = erofs_read_superblock(sb); @@ -627,7 +626,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) } if (test_opt(&sbi->opt, DAX_ALWAYS)) { - if (!sbi->dax_dev) { + if (!sbi->dif0.dax_dev) { errorfc(fc, "DAX unsupported by block device. Turning off DAX."); clear_opt(&sbi->opt, DAX_ALWAYS); } else if (sbi->blkszbits != PAGE_SHIFT) { @@ -707,14 +706,13 @@ static int erofs_fc_get_tree(struct fs_context *fc) if (!fc->source) return invalf(fc, "No source specified"); - file = filp_open(fc->source, O_RDONLY | O_LARGEFILE, 0); if (IS_ERR(file)) return PTR_ERR(file); - sbi->fdev = file; + sbi->dif0.file = file; - if (S_ISREG(file_inode(sbi->fdev)->i_mode) && - sbi->fdev->f_mapping->a_ops->read_folio) + if (S_ISREG(file_inode(sbi->dif0.file)->i_mode) && + sbi->dif0.file->f_mapping->a_ops->read_folio) return get_tree_nodev(fc, erofs_fc_fill_super); } #endif @@ -771,8 +769,8 @@ static void erofs_sb_free(struct erofs_sb_info *sbi) erofs_free_dev_context(sbi->devs); kfree(sbi->fsid); kfree(sbi->domain_id); - if (sbi->fdev) - fput(sbi->fdev); + if (sbi->dif0.file) + fput(sbi->dif0.file); kfree(sbi); } @@ -817,11 +815,12 @@ static void erofs_kill_sb(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); - if ((IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) || sbi->fdev) + if ((IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) || + sbi->dif0.file) kill_anon_super(sb); else kill_block_super(sb); - fs_put_dax(sbi->dax_dev, NULL); + fs_put_dax(sbi->dif0.dax_dev, NULL); erofs_fscache_unregister_fs(sb); erofs_sb_free(sbi); sb->s_fs_info = NULL; From f8d920a402aec3482931cb5f1539ed438740fc49 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Fri, 13 Dec 2024 07:54:01 +0800 Subject: [PATCH 0557/1450] erofs: reference `struct erofs_device_info` for erofs_map_dev Record `m_sb` and `m_dif` to replace `m_fscache`, `m_daxdev`, `m_fp` and `m_dax_part_off` in order to simplify the codebase. Note that `m_bdev` is still left since it can be assigned from `sb->s_bdev` directly. Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20241212235401.2857246-1-hsiangkao@linux.alibaba.com --- fs/erofs/data.c | 26 ++++++++++---------------- fs/erofs/fileio.c | 2 +- fs/erofs/fscache.c | 4 ++-- fs/erofs/internal.h | 6 ++---- 4 files changed, 15 insertions(+), 23 deletions(-) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 622017c659587..0cd6b5c4df985 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -179,19 +179,13 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) } static void erofs_fill_from_devinfo(struct erofs_map_dev *map, - struct erofs_device_info *dif) + struct super_block *sb, struct erofs_device_info *dif) { + map->m_sb = sb; + map->m_dif = dif; map->m_bdev = NULL; - map->m_fp = NULL; - if (dif->file) { - if (S_ISBLK(file_inode(dif->file)->i_mode)) - map->m_bdev = file_bdev(dif->file); - else - map->m_fp = dif->file; - } - map->m_daxdev = dif->dax_dev; - map->m_dax_part_off = dif->dax_part_off; - map->m_fscache = dif->fscache; + if (dif->file && S_ISBLK(file_inode(dif->file)->i_mode)) + map->m_bdev = file_bdev(dif->file); } int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) @@ -201,7 +195,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) erofs_off_t startoff, length; int id; - erofs_fill_from_devinfo(map, &EROFS_SB(sb)->dif0); + erofs_fill_from_devinfo(map, sb, &EROFS_SB(sb)->dif0); map->m_bdev = sb->s_bdev; /* use s_bdev for the primary device */ if (map->m_deviceid) { down_read(&devs->rwsem); @@ -215,7 +209,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) up_read(&devs->rwsem); return 0; } - erofs_fill_from_devinfo(map, dif); + erofs_fill_from_devinfo(map, sb, dif); up_read(&devs->rwsem); } else if (devs->extra_devices && !devs->flatdev) { down_read(&devs->rwsem); @@ -228,7 +222,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) if (map->m_pa >= startoff && map->m_pa < startoff + length) { map->m_pa -= startoff; - erofs_fill_from_devinfo(map, dif); + erofs_fill_from_devinfo(map, sb, dif); break; } } @@ -298,7 +292,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->offset = map.m_la; if (flags & IOMAP_DAX) - iomap->dax_dev = mdev.m_daxdev; + iomap->dax_dev = mdev.m_dif->dax_dev; else iomap->bdev = mdev.m_bdev; iomap->length = map.m_llen; @@ -327,7 +321,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->type = IOMAP_MAPPED; iomap->addr = mdev.m_pa; if (flags & IOMAP_DAX) - iomap->addr += mdev.m_dax_part_off; + iomap->addr += mdev.m_dif->dax_part_off; } return 0; } diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c index 3af96b1e2c2aa..a61b8faec6518 100644 --- a/fs/erofs/fileio.c +++ b/fs/erofs/fileio.c @@ -67,7 +67,7 @@ static struct erofs_fileio_rq *erofs_fileio_rq_alloc(struct erofs_map_dev *mdev) GFP_KERNEL | __GFP_NOFAIL); bio_init(&rq->bio, NULL, rq->bvecs, BIO_MAX_VECS, REQ_OP_READ); - rq->iocb.ki_filp = mdev->m_fp; + rq->iocb.ki_filp = mdev->m_dif->file; return rq; } diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index ce7e38c827198..ce3d8737df85d 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -198,7 +198,7 @@ struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev) io = kmalloc(sizeof(*io), GFP_KERNEL | __GFP_NOFAIL); bio_init(&io->bio, NULL, io->bvecs, BIO_MAX_VECS, REQ_OP_READ); - io->io.private = mdev->m_fscache->cookie; + io->io.private = mdev->m_dif->fscache->cookie; io->io.end_io = erofs_fscache_bio_endio; refcount_set(&io->io.ref, 1); return &io->bio; @@ -316,7 +316,7 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_rq *req) if (!io) return -ENOMEM; iov_iter_xarray(&io->iter, ITER_DEST, &mapping->i_pages, pos, count); - ret = erofs_fscache_read_io_async(mdev.m_fscache->cookie, + ret = erofs_fscache_read_io_async(mdev.m_dif->fscache->cookie, mdev.m_pa + (pos - map.m_la), io); erofs_fscache_req_io_put(io); diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 3e8d71d516f42..7cc8e1be04e8f 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -353,11 +353,9 @@ enum { }; struct erofs_map_dev { - struct erofs_fscache *m_fscache; + struct super_block *m_sb; + struct erofs_device_info *m_dif; struct block_device *m_bdev; - struct dax_device *m_daxdev; - struct file *m_fp; - u64 m_dax_part_off; erofs_off_t m_pa; unsigned int m_deviceid; From 6422cde1b0d5a31b206b263417c1c2b3c80fe82c Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 12 Dec 2024 21:43:36 +0800 Subject: [PATCH 0558/1450] erofs: use buffered I/O for file-backed mounts by default For many use cases (e.g. container images are just fetched from remote), performance will be impacted if underlay page cache is up-to-date but direct i/o flushes dirty pages first. Instead, let's use buffered I/O by default to keep in sync with loop devices and add a (re)mount option to explicitly give a try to use direct I/O if supported by the underlying files. The container startup time is improved as below: [workload] docker.io/library/workpress:latest unpack 1st run non-1st runs EROFS snapshotter buffered I/O file 4.586404265s 0.308s 0.198s EROFS snapshotter direct I/O file 4.581742849s 2.238s 0.222s EROFS snapshotter loop 4.596023152s 0.346s 0.201s Overlayfs snapshotter 5.382851037s 0.206s 0.214s Fixes: fb176750266a ("erofs: add file-backed mount support") Cc: Derek McGowan Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20241212134336.2059899-1-hsiangkao@linux.alibaba.com --- fs/erofs/fileio.c | 7 +++++-- fs/erofs/internal.h | 1 + fs/erofs/super.c | 23 +++++++++++++++-------- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c index a61b8faec6518..33f8539dda4ae 100644 --- a/fs/erofs/fileio.c +++ b/fs/erofs/fileio.c @@ -9,6 +9,7 @@ struct erofs_fileio_rq { struct bio_vec bvecs[BIO_MAX_VECS]; struct bio bio; struct kiocb iocb; + struct super_block *sb; }; struct erofs_fileio { @@ -52,8 +53,9 @@ static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq) rq->iocb.ki_pos = rq->bio.bi_iter.bi_sector << SECTOR_SHIFT; rq->iocb.ki_ioprio = get_current_ioprio(); rq->iocb.ki_complete = erofs_fileio_ki_complete; - rq->iocb.ki_flags = (rq->iocb.ki_filp->f_mode & FMODE_CAN_ODIRECT) ? - IOCB_DIRECT : 0; + if (test_opt(&EROFS_SB(rq->sb)->opt, DIRECT_IO) && + rq->iocb.ki_filp->f_mode & FMODE_CAN_ODIRECT) + rq->iocb.ki_flags = IOCB_DIRECT; iov_iter_bvec(&iter, ITER_DEST, rq->bvecs, rq->bio.bi_vcnt, rq->bio.bi_iter.bi_size); ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter); @@ -68,6 +70,7 @@ static struct erofs_fileio_rq *erofs_fileio_rq_alloc(struct erofs_map_dev *mdev) bio_init(&rq->bio, NULL, rq->bvecs, BIO_MAX_VECS, REQ_OP_READ); rq->iocb.ki_filp = mdev->m_dif->file; + rq->sb = mdev->m_sb; return rq; } diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 7cc8e1be04e8f..686d835eb533a 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -176,6 +176,7 @@ struct erofs_sb_info { #define EROFS_MOUNT_POSIX_ACL 0x00000020 #define EROFS_MOUNT_DAX_ALWAYS 0x00000040 #define EROFS_MOUNT_DAX_NEVER 0x00000080 +#define EROFS_MOUNT_DIRECT_IO 0x00000100 #define clear_opt(opt, option) ((opt)->mount_opt &= ~EROFS_MOUNT_##option) #define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option) diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 9044907354e12..f5956474bfdea 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -364,14 +364,8 @@ static void erofs_default_options(struct erofs_sb_info *sbi) } enum { - Opt_user_xattr, - Opt_acl, - Opt_cache_strategy, - Opt_dax, - Opt_dax_enum, - Opt_device, - Opt_fsid, - Opt_domain_id, + Opt_user_xattr, Opt_acl, Opt_cache_strategy, Opt_dax, Opt_dax_enum, + Opt_device, Opt_fsid, Opt_domain_id, Opt_directio, Opt_err }; @@ -398,6 +392,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = { fsparam_string("device", Opt_device), fsparam_string("fsid", Opt_fsid), fsparam_string("domain_id", Opt_domain_id), + fsparam_flag_no("directio", Opt_directio), {} }; @@ -511,6 +506,16 @@ static int erofs_fc_parse_param(struct fs_context *fc, errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name); break; #endif + case Opt_directio: +#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE + if (result.boolean) + set_opt(&sbi->opt, DIRECT_IO); + else + clear_opt(&sbi->opt, DIRECT_IO); +#else + errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name); +#endif + break; default: return -ENOPARAM; } @@ -948,6 +953,8 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",dax=always"); if (test_opt(opt, DAX_NEVER)) seq_puts(seq, ",dax=never"); + if (erofs_is_fileio_mode(sbi) && test_opt(opt, DIRECT_IO)) + seq_puts(seq, ",directio"); #ifdef CONFIG_EROFS_FS_ONDEMAND if (sbi->fsid) seq_printf(seq, ",fsid=%s", sbi->fsid); From 38651476e46e088598354510502c383e932e2297 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Wed, 11 Dec 2024 14:09:27 +0530 Subject: [PATCH 0559/1450] RDMA/bnxt_re: Fix the check for 9060 condition The check for 9060 condition should only be made for legacy chips. Fixes: 9152e0b722b2 ("RDMA/bnxt_re: HW workarounds for handling specific conditions") Reviewed-by: Kashyap Desai Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241211083931.968831-2-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 72f35070f671c..093bfb748cdfd 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -2669,10 +2669,12 @@ static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq, bnxt_qplib_add_flush_qp(qp); } else { /* Before we complete, do WA 9060 */ - if (do_wa9060(qp, cq, cq_cons, sq->swq_last, - cqe_sq_cons)) { - *lib_qp = qp; - goto out; + if (!bnxt_qplib_is_chip_gen_p5_p7(qp->cctx)) { + if (do_wa9060(qp, cq, cq_cons, sq->swq_last, + cqe_sq_cons)) { + *lib_qp = qp; + goto out; + } } if (swq->flags & SQ_SEND_FLAGS_SIGNAL_COMP) { cqe->status = CQ_REQ_STATUS_OK; From 798653a0ee30d3cd495099282751c0f248614ae7 Mon Sep 17 00:00:00 2001 From: Saravanan Vajravel Date: Wed, 11 Dec 2024 14:09:28 +0530 Subject: [PATCH 0560/1450] RDMA/bnxt_re: Add check for path mtu in modify_qp When RDMA app configures path MTU, add a check in modify_qp verb to make sure that it doesn't go beyond interface MTU. If this check fails, driver will fail the modify_qp verb. Fixes: 1ac5a4047975 ("RDMA/bnxt_re: Add bnxt_re RoCE driver") Reviewed-by: Kalesh AP Signed-off-by: Saravanan Vajravel Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241211083931.968831-3-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 26 +++++++++++++----------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 215074c0860b8..a609e1635a3d4 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -2162,18 +2162,20 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, } } - if (qp_attr_mask & IB_QP_PATH_MTU) { - qp->qplib_qp.modify_flags |= - CMDQ_MODIFY_QP_MODIFY_MASK_PATH_MTU; - qp->qplib_qp.path_mtu = __from_ib_mtu(qp_attr->path_mtu); - qp->qplib_qp.mtu = ib_mtu_enum_to_int(qp_attr->path_mtu); - } else if (qp_attr->qp_state == IB_QPS_RTR) { - qp->qplib_qp.modify_flags |= - CMDQ_MODIFY_QP_MODIFY_MASK_PATH_MTU; - qp->qplib_qp.path_mtu = - __from_ib_mtu(iboe_get_mtu(rdev->netdev->mtu)); - qp->qplib_qp.mtu = - ib_mtu_enum_to_int(iboe_get_mtu(rdev->netdev->mtu)); + if (qp_attr->qp_state == IB_QPS_RTR) { + enum ib_mtu qpmtu; + + qpmtu = iboe_get_mtu(rdev->netdev->mtu); + if (qp_attr_mask & IB_QP_PATH_MTU) { + if (ib_mtu_enum_to_int(qp_attr->path_mtu) > + ib_mtu_enum_to_int(qpmtu)) + return -EINVAL; + qpmtu = qp_attr->path_mtu; + } + + qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_PATH_MTU; + qp->qplib_qp.path_mtu = __from_ib_mtu(qpmtu); + qp->qplib_qp.mtu = ib_mtu_enum_to_int(qpmtu); } if (qp_attr_mask & IB_QP_TIMEOUT) { From da2132e683954e7ddda3cd674e866a847b7389eb Mon Sep 17 00:00:00 2001 From: Damodharam Ammepalli Date: Wed, 11 Dec 2024 14:09:29 +0530 Subject: [PATCH 0561/1450] RDMA/bnxt_re: Fix setting mandatory attributes for modify_qp Firmware expects "min_rnr_timer" as a mandatory attribute in MODIFY_QP command during the RTR-RTS transition. This needs to be enforced by the driver which is missing while setting bnxt_set_mandatory_attributes that sends these flags as part of modify_qp optimization. Fixes: 82c32d219272 ("RDMA/bnxt_re: Add support for optimized modify QP") Reviewed-by: Rukhsana Ansari Reviewed-by: Kalesh AP Signed-off-by: Damodharam Ammepalli Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241211083931.968831-4-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 13 +++++++++++-- drivers/infiniband/hw/bnxt_re/qplib_res.h | 5 +++++ drivers/infiniband/hw/bnxt_re/roce_hsi.h | 1 + 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 093bfb748cdfd..5169804e6f127 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -1285,7 +1285,8 @@ static void __filter_modify_flags(struct bnxt_qplib_qp *qp) } } -static void bnxt_set_mandatory_attributes(struct bnxt_qplib_qp *qp, +static void bnxt_set_mandatory_attributes(struct bnxt_qplib_res *res, + struct bnxt_qplib_qp *qp, struct cmdq_modify_qp *req) { u32 mandatory_flags = 0; @@ -1300,6 +1301,14 @@ static void bnxt_set_mandatory_attributes(struct bnxt_qplib_qp *qp, mandatory_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_PKEY; } + if (_is_min_rnr_in_rtr_rts_mandatory(res->dattr->dev_cap_flags2) && + (qp->cur_qp_state == CMDQ_MODIFY_QP_NEW_STATE_RTR && + qp->state == CMDQ_MODIFY_QP_NEW_STATE_RTS)) { + if (qp->type == CMDQ_MODIFY_QP_QP_TYPE_RC) + mandatory_flags |= + CMDQ_MODIFY_QP_MODIFY_MASK_MIN_RNR_TIMER; + } + if (qp->type == CMDQ_MODIFY_QP_QP_TYPE_UD || qp->type == CMDQ_MODIFY_QP_QP_TYPE_GSI) mandatory_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_QKEY; @@ -1340,7 +1349,7 @@ int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) /* Set mandatory attributes for INIT -> RTR and RTR -> RTS transition */ if (_is_optimize_modify_qp_supported(res->dattr->dev_cap_flags2) && is_optimized_state_transition(qp)) - bnxt_set_mandatory_attributes(qp, &req); + bnxt_set_mandatory_attributes(res, qp, &req); } bmask = qp->modify_flags; req.modify_mask = cpu_to_le32(qp->modify_flags); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h index 21fb148713a6f..cbfc49a1a56d7 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h @@ -584,6 +584,11 @@ static inline bool _is_optimize_modify_qp_supported(u16 dev_cap_ext_flags2) return dev_cap_ext_flags2 & CREQ_QUERY_FUNC_RESP_SB_OPTIMIZE_MODIFY_QP_SUPPORTED; } +static inline bool _is_min_rnr_in_rtr_rts_mandatory(u16 dev_cap_ext_flags2) +{ + return !!(dev_cap_ext_flags2 & CREQ_QUERY_FUNC_RESP_SB_MIN_RNR_RTR_RTS_OPT_SUPPORTED); +} + static inline bool _is_cq_coalescing_supported(u16 dev_cap_ext_flags2) { return dev_cap_ext_flags2 & CREQ_QUERY_FUNC_RESP_SB_CQ_COALESCING_SUPPORTED; diff --git a/drivers/infiniband/hw/bnxt_re/roce_hsi.h b/drivers/infiniband/hw/bnxt_re/roce_hsi.h index a98fc9c2313e7..0ee60fdc18b33 100644 --- a/drivers/infiniband/hw/bnxt_re/roce_hsi.h +++ b/drivers/infiniband/hw/bnxt_re/roce_hsi.h @@ -2215,6 +2215,7 @@ struct creq_query_func_resp_sb { #define CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_IQM_MSN_TABLE (0x2UL << 4) #define CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_LAST \ CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_IQM_MSN_TABLE + #define CREQ_QUERY_FUNC_RESP_SB_MIN_RNR_RTR_RTS_OPT_SUPPORTED 0x1000UL __le16 max_xp_qp_size; __le16 create_qp_batch_size; __le16 destroy_qp_batch_size; From 34db8ec931b84d1426423f263b1927539e73b397 Mon Sep 17 00:00:00 2001 From: Hongguang Gao Date: Wed, 11 Dec 2024 14:09:30 +0530 Subject: [PATCH 0562/1450] RDMA/bnxt_re: Fix to export port num to ib_query_qp Current driver implementation doesn't populate the port_num field in query_qp. Adding the code to convert internal firmware port id to ibv defined port number and export it. Reviewed-by: Saravanan Vajravel Reviewed-by: Kalesh AP Signed-off-by: Hongguang Gao Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241211083931.968831-5-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 1 + drivers/infiniband/hw/bnxt_re/ib_verbs.h | 4 ++++ drivers/infiniband/hw/bnxt_re/qplib_fp.c | 1 + drivers/infiniband/hw/bnxt_re/qplib_fp.h | 1 + 4 files changed, 7 insertions(+) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index a609e1635a3d4..bcb7cfc63d094 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -2325,6 +2325,7 @@ int bnxt_re_query_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, qp_attr->retry_cnt = qplib_qp->retry_cnt; qp_attr->rnr_retry = qplib_qp->rnr_retry; qp_attr->min_rnr_timer = qplib_qp->min_rnr_timer; + qp_attr->port_num = __to_ib_port_num(qplib_qp->port_id); qp_attr->rq_psn = qplib_qp->rq.psn; qp_attr->max_rd_atomic = qplib_qp->max_rd_atomic; qp_attr->sq_psn = qplib_qp->sq.psn; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index ac59f1d73b152..fbb16a411d6a3 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -268,6 +268,10 @@ void bnxt_re_dealloc_ucontext(struct ib_ucontext *context); int bnxt_re_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); void bnxt_re_mmap_free(struct rdma_user_mmap_entry *rdma_entry); +static inline u32 __to_ib_port_num(u16 port_id) +{ + return (u32)port_id + 1; +} unsigned long bnxt_re_lock_cqs(struct bnxt_re_qp *qp); void bnxt_re_unlock_cqs(struct bnxt_re_qp *qp, unsigned long flags); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 5169804e6f127..d8a2a929bbe32 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -1532,6 +1532,7 @@ int bnxt_qplib_query_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) qp->dest_qpn = le32_to_cpu(sb->dest_qp_id); memcpy(qp->smac, sb->src_mac, 6); qp->vlan_id = le16_to_cpu(sb->vlan_pcp_vlan_dei_vlan_id); + qp->port_id = le16_to_cpu(sb->port_id); bail: dma_free_coherent(&rcfw->pdev->dev, sbuf.size, sbuf.sb, sbuf.dma_addr); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h index 19e279871f107..0660101b5310e 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -298,6 +298,7 @@ struct bnxt_qplib_qp { u32 dest_qpn; u8 smac[6]; u16 vlan_id; + u16 port_id; u8 nw_type; struct bnxt_qplib_ah ah; From 7179fe0074a3c962e43a9e51169304c4911989ed Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Wed, 11 Dec 2024 14:09:31 +0530 Subject: [PATCH 0563/1450] RDMA/bnxt_re: Fix reporting hw_ver in query_device Driver currently populates subsystem_device id in the "hw_ver" field of ib_attr structure in query_device. Updated to populate PCI revision ID. Fixes: 1ac5a4047975 ("RDMA/bnxt_re: Add bnxt_re RoCE driver") Reviewed-by: Preethi G Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241211083931.968831-6-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index bcb7cfc63d094..e3d26bd6de05a 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -199,7 +199,7 @@ int bnxt_re_query_device(struct ib_device *ibdev, ib_attr->vendor_id = rdev->en_dev->pdev->vendor; ib_attr->vendor_part_id = rdev->en_dev->pdev->device; - ib_attr->hw_ver = rdev->en_dev->pdev->subsystem_device; + ib_attr->hw_ver = rdev->en_dev->pdev->revision; ib_attr->max_qp = dev_attr->max_qp; ib_attr->max_qp_wr = dev_attr->max_qp_wqes; ib_attr->device_cap_flags = From 7c449ef0fdce540bfb235a2d93e7184864c3388b Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Mon, 16 Dec 2024 22:08:20 +0800 Subject: [PATCH 0564/1450] ASoC: Intel: sof_sdw: Fix DMI match for Lenovo 21Q6 and 21Q7 Update the DMI match for a Lenovo laptop to the new DMI identifier. This laptop ships with a different DMI identifier to what was expected, and now has two identifiers. Signed-off-by: Richard Fitzgerald Fixes: 83c062ae81e8 ("ASoC: Intel: sof_sdw: Add quirks for some new Lenovo laptops") Signed-off-by: Bard Liao Link: https://patch.msgid.link/20241216140821.153670-2-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/intel/boards/sof_sdw.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/sound/soc/intel/boards/sof_sdw.c b/sound/soc/intel/boards/sof_sdw.c index 810be7c949a5c..e20ab6bfa5dda 100644 --- a/sound/soc/intel/boards/sof_sdw.c +++ b/sound/soc/intel/boards/sof_sdw.c @@ -641,9 +641,17 @@ static const struct dmi_system_id sof_sdw_quirk_table[] = { .callback = sof_sdw_quirk_cb, .matches = { DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), - DMI_EXACT_MATCH(DMI_PRODUCT_SKU, "233B") + DMI_MATCH(DMI_PRODUCT_NAME, "21Q6") }, - .driver_data = (void *)(SOC_SDW_SIDECAR_AMPS), + .driver_data = (void *)(SOC_SDW_SIDECAR_AMPS | SOC_SDW_CODEC_MIC), + }, + { + .callback = sof_sdw_quirk_cb, + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "21Q7") + }, + .driver_data = (void *)(SOC_SDW_SIDECAR_AMPS | SOC_SDW_CODEC_MIC), }, /* ArrowLake devices */ From ba7d47a54bf23a7201bdd2978e16b04fc1cb1f6e Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Mon, 16 Dec 2024 22:08:21 +0800 Subject: [PATCH 0565/1450] ASoC: Intel: sof_sdw: Fix DMI match for Lenovo 21QA and 21QB Update the DMI match for a Lenovo laptop to the new DMI identifier. This laptop ships with a different DMI identifier to what was expected, and now has two identifiers. Signed-off-by: Richard Fitzgerald Fixes: ea657f6b24e1 ("ASoC: Intel: sof_sdw: Add quirk for cs42l43 system using host DMICs") Signed-off-by: Bard Liao Link: https://patch.msgid.link/20241216140821.153670-3-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/intel/boards/sof_sdw.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/sound/soc/intel/boards/sof_sdw.c b/sound/soc/intel/boards/sof_sdw.c index e20ab6bfa5dda..667103027f7ef 100644 --- a/sound/soc/intel/boards/sof_sdw.c +++ b/sound/soc/intel/boards/sof_sdw.c @@ -632,7 +632,16 @@ static const struct dmi_system_id sof_sdw_quirk_table[] = { .callback = sof_sdw_quirk_cb, .matches = { DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), - DMI_EXACT_MATCH(DMI_PRODUCT_SKU, "233C") + DMI_MATCH(DMI_PRODUCT_NAME, "21QB") + }, + /* Note this quirk excludes the CODEC mic */ + .driver_data = (void *)(SOC_SDW_CODEC_MIC), + }, + { + .callback = sof_sdw_quirk_cb, + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "21QA") }, /* Note this quirk excludes the CODEC mic */ .driver_data = (void *)(SOC_SDW_CODEC_MIC), From 6f4a0fd03ce856c6d9811429b9969b4f27e2eaee Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Wed, 11 Dec 2024 11:54:02 +0800 Subject: [PATCH 0566/1450] ASoC: dt-bindings: realtek,rt5645: Fix CPVDD voltage comment Both the ALC5645 and ALC5650 datasheets specify a recommended voltage of 1.8V for CPVDD, not 3.5V. Fix the comment. Cc: Matthias Brugger Fixes: 26aa19174f0d ("ASoC: dt-bindings: rt5645: add suppliers") Fixes: 83d43ab0a1cb ("ASoC: dt-bindings: realtek,rt5645: Convert to dtschema") Signed-off-by: Chen-Yu Tsai Acked-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20241211035403.4157760-1-wenst@chromium.org Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/sound/realtek,rt5645.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/sound/realtek,rt5645.yaml b/Documentation/devicetree/bindings/sound/realtek,rt5645.yaml index 13f09f1bc8003..0a698798c22be 100644 --- a/Documentation/devicetree/bindings/sound/realtek,rt5645.yaml +++ b/Documentation/devicetree/bindings/sound/realtek,rt5645.yaml @@ -51,7 +51,7 @@ properties: description: Power supply for AVDD, providing 1.8V. cpvdd-supply: - description: Power supply for CPVDD, providing 3.5V. + description: Power supply for CPVDD, providing 1.8V. hp-detect-gpios: description: From 65c8c78cc74d5bcbc43f1f785a004796a2d78360 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 12 Dec 2024 21:13:10 +0100 Subject: [PATCH 0567/1450] thermal/thresholds: Fix uapi header macros leading to a compilation error The macros giving the direction of the crossing thresholds use the BIT macro which is not exported to the userspace. Consequently when an userspace program includes the header, it fails to compile. Replace the macros by their litteral to allow the compilation of userspace program using this header. Fixes: 445936f9e258 ("thermal: core: Add user thresholds support") Signed-off-by: Daniel Lezcano Link: https://patch.msgid.link/20241212201311.4143196-1-daniel.lezcano@linaro.org [ rjw: Add Fixes: ] Signed-off-by: Rafael J. Wysocki --- include/uapi/linux/thermal.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/thermal.h b/include/uapi/linux/thermal.h index ba8604bdf2069..349718c271ebf 100644 --- a/include/uapi/linux/thermal.h +++ b/include/uapi/linux/thermal.h @@ -3,8 +3,8 @@ #define _UAPI_LINUX_THERMAL_H #define THERMAL_NAME_LENGTH 20 -#define THERMAL_THRESHOLD_WAY_UP BIT(0) -#define THERMAL_THRESHOLD_WAY_DOWN BIT(1) +#define THERMAL_THRESHOLD_WAY_UP 0x1 +#define THERMAL_THRESHOLD_WAY_DOWN 0x2 enum thermal_device_mode { THERMAL_DEVICE_DISABLED = 0, From cc252bb592638e0f7aea40d580186c36d89526b8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 11 Dec 2024 13:53:35 -0500 Subject: [PATCH 0568/1450] fgraph: Still initialize idle shadow stacks when starting A bug was discovered where the idle shadow stacks were not initialized for offline CPUs when starting function graph tracer, and when they came online they were not traced due to the missing shadow stack. To fix this, the idle task shadow stack initialization was moved to using the CPU hotplug callbacks. But it removed the initialization when the function graph was enabled. The problem here is that the hotplug callbacks are called when the CPUs come online, but the idle shadow stack initialization only happens if function graph is currently active. This caused the online CPUs to not get their shadow stack initialized. The idle shadow stack initialization still needs to be done when the function graph is registered, as they will not be allocated if function graph is not registered. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241211135335.094ba282@batman.local.home Fixes: 2c02f7375e65 ("fgraph: Use CPU hotplug mechanism to initialize idle shadow stacks") Reported-by: Linus Walleij Tested-by: Linus Walleij Closes: https://lore.kernel.org/all/CACRpkdaTBrHwRbbrphVy-=SeDz6MSsXhTKypOtLrTQ+DgGAOcQ@mail.gmail.com/ Signed-off-by: Steven Rostedt (Google) --- kernel/trace/fgraph.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 0bf78517b5d4c..ddedcb50917f4 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -1215,7 +1215,7 @@ void fgraph_update_pid_func(void) static int start_graph_tracing(void) { unsigned long **ret_stack_list; - int ret; + int ret, cpu; ret_stack_list = kcalloc(FTRACE_RETSTACK_ALLOC_SIZE, sizeof(*ret_stack_list), GFP_KERNEL); @@ -1223,6 +1223,12 @@ static int start_graph_tracing(void) if (!ret_stack_list) return -ENOMEM; + /* The cpu_boot init_task->ret_stack will never be freed */ + for_each_online_cpu(cpu) { + if (!idle_task(cpu)->ret_stack) + ftrace_graph_init_idle_task(idle_task(cpu), cpu); + } + do { ret = alloc_retstack_tasklist(ret_stack_list); } while (ret == -EAGAIN); From 8dccbecbb9692a96cf477eb826352a7c556a31e2 Mon Sep 17 00:00:00 2001 From: Bastien Curutchet Date: Fri, 13 Dec 2024 16:06:20 +0100 Subject: [PATCH 0569/1450] selftests/bpf: test_xdp_meta: Rename BPF sections SEC("t") and SEC("x") can't be loaded by the __load() helper. Rename these sections SEC("tc") and SEC("xdp") so they can be interpreted by the __load() helper in upcoming patch. Update the test_xdp_meta.sh to fit these new names. Signed-off-by: Bastien Curutchet Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20241213-xdp_meta-v2-1-634582725b90@bootlin.com --- tools/testing/selftests/bpf/progs/test_xdp_meta.c | 4 ++-- tools/testing/selftests/bpf/test_xdp_meta.sh | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_xdp_meta.c b/tools/testing/selftests/bpf/progs/test_xdp_meta.c index a7c4a7d49fe6b..fe2d71ae0e717 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_meta.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_meta.c @@ -8,7 +8,7 @@ #define round_up(x, y) ((((x) - 1) | __round_mask(x, y)) + 1) #define ctx_ptr(ctx, mem) (void *)(unsigned long)ctx->mem -SEC("t") +SEC("tc") int ing_cls(struct __sk_buff *ctx) { __u8 *data, *data_meta, *data_end; @@ -28,7 +28,7 @@ int ing_cls(struct __sk_buff *ctx) return diff ? TC_ACT_SHOT : TC_ACT_OK; } -SEC("x") +SEC("xdp") int ing_xdp(struct xdp_md *ctx) { __u8 *data, *data_meta, *data_end; diff --git a/tools/testing/selftests/bpf/test_xdp_meta.sh b/tools/testing/selftests/bpf/test_xdp_meta.sh index 2740322c1878b..6039b92f10949 100755 --- a/tools/testing/selftests/bpf/test_xdp_meta.sh +++ b/tools/testing/selftests/bpf/test_xdp_meta.sh @@ -43,11 +43,11 @@ ip netns exec ${NS2} ip addr add 10.1.1.22/24 dev veth2 ip netns exec ${NS1} tc qdisc add dev veth1 clsact ip netns exec ${NS2} tc qdisc add dev veth2 clsact -ip netns exec ${NS1} tc filter add dev veth1 ingress bpf da obj ${BPF_FILE} sec t -ip netns exec ${NS2} tc filter add dev veth2 ingress bpf da obj ${BPF_FILE} sec t +ip netns exec ${NS1} tc filter add dev veth1 ingress bpf da obj ${BPF_FILE} sec tc +ip netns exec ${NS2} tc filter add dev veth2 ingress bpf da obj ${BPF_FILE} sec tc -ip netns exec ${NS1} ip link set dev veth1 xdp obj ${BPF_FILE} sec x -ip netns exec ${NS2} ip link set dev veth2 xdp obj ${BPF_FILE} sec x +ip netns exec ${NS1} ip link set dev veth1 xdp obj ${BPF_FILE} sec xdp +ip netns exec ${NS2} ip link set dev veth2 xdp obj ${BPF_FILE} sec xdp ip netns exec ${NS1} ip link set dev veth1 up ip netns exec ${NS2} ip link set dev veth2 up From df539cefb0abbd16be9fbcc6ec46a5a35495800f Mon Sep 17 00:00:00 2001 From: Bastien Curutchet Date: Fri, 13 Dec 2024 16:06:21 +0100 Subject: [PATCH 0570/1450] selftests/bpf: Migrate test_xdp_meta.sh into xdp_context_test_run.c test_xdp_meta.sh can't be used by the BPF CI. Migrate test_xdp_meta.sh in a new test case in xdp_context_test_run.c. It uses the same BPF programs located in progs/test_xdp_meta.c and the same network topology. Remove test_xdp_meta.sh and its Makefile entry. Signed-off-by: Bastien Curutchet Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20241213-xdp_meta-v2-2-634582725b90@bootlin.com --- tools/testing/selftests/bpf/Makefile | 1 - .../bpf/prog_tests/xdp_context_test_run.c | 87 +++++++++++++++++++ tools/testing/selftests/bpf/test_xdp_meta.sh | 58 ------------- 3 files changed, 87 insertions(+), 59 deletions(-) delete mode 100755 tools/testing/selftests/bpf/test_xdp_meta.sh diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 6ad3b1ba1920c..772bfc6b63fa6 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -129,7 +129,6 @@ TEST_FILES = xsk_prereqs.sh $(wildcard progs/btf_dump_test_case_*.c) TEST_PROGS := test_kmod.sh \ test_xdp_redirect.sh \ test_xdp_redirect_multi.sh \ - test_xdp_meta.sh \ test_tunnel.sh \ test_lwt_seg6local.sh \ test_lirc_mode2.sh \ diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c index e6a783c7f5db9..937da9b7532a5 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c @@ -2,6 +2,14 @@ #include #include #include "test_xdp_context_test_run.skel.h" +#include "test_xdp_meta.skel.h" + +#define TX_ADDR "10.0.0.1" +#define RX_ADDR "10.0.0.2" +#define RX_NAME "veth0" +#define TX_NAME "veth1" +#define TX_NETNS "xdp_context_tx" +#define RX_NETNS "xdp_context_rx" void test_xdp_context_error(int prog_fd, struct bpf_test_run_opts opts, __u32 data_meta, __u32 data, __u32 data_end, @@ -103,3 +111,82 @@ void test_xdp_context_test_run(void) test_xdp_context_test_run__destroy(skel); } + +void test_xdp_context_functional(void) +{ + LIBBPF_OPTS(bpf_tc_hook, tc_hook, .attach_point = BPF_TC_INGRESS); + LIBBPF_OPTS(bpf_tc_opts, tc_opts, .handle = 1, .priority = 1); + struct netns_obj *rx_ns = NULL, *tx_ns = NULL; + struct bpf_program *tc_prog, *xdp_prog; + struct test_xdp_meta *skel = NULL; + struct nstoken *nstoken = NULL; + int rx_ifindex; + int ret; + + tx_ns = netns_new(TX_NETNS, false); + if (!ASSERT_OK_PTR(tx_ns, "create tx_ns")) + return; + + rx_ns = netns_new(RX_NETNS, false); + if (!ASSERT_OK_PTR(rx_ns, "create rx_ns")) + goto close; + + SYS(close, "ip link add " RX_NAME " netns " RX_NETNS + " type veth peer name " TX_NAME " netns " TX_NETNS); + + nstoken = open_netns(RX_NETNS); + if (!ASSERT_OK_PTR(nstoken, "setns rx_ns")) + goto close; + + SYS(close, "ip addr add " RX_ADDR "/24 dev " RX_NAME); + SYS(close, "ip link set dev " RX_NAME " up"); + + skel = test_xdp_meta__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open and load skeleton")) + goto close; + + rx_ifindex = if_nametoindex(RX_NAME); + if (!ASSERT_GE(rx_ifindex, 0, "if_nametoindex rx")) + goto close; + + tc_hook.ifindex = rx_ifindex; + ret = bpf_tc_hook_create(&tc_hook); + if (!ASSERT_OK(ret, "bpf_tc_hook_create")) + goto close; + + tc_prog = bpf_object__find_program_by_name(skel->obj, "ing_cls"); + if (!ASSERT_OK_PTR(tc_prog, "open ing_cls prog")) + goto close; + + tc_opts.prog_fd = bpf_program__fd(tc_prog); + ret = bpf_tc_attach(&tc_hook, &tc_opts); + if (!ASSERT_OK(ret, "bpf_tc_attach")) + goto close; + + xdp_prog = bpf_object__find_program_by_name(skel->obj, "ing_xdp"); + if (!ASSERT_OK_PTR(xdp_prog, "open ing_xdp prog")) + goto close; + + ret = bpf_xdp_attach(rx_ifindex, + bpf_program__fd(xdp_prog), + 0, NULL); + if (!ASSERT_GE(ret, 0, "bpf_xdp_attach")) + goto close; + + close_netns(nstoken); + + nstoken = open_netns(TX_NETNS); + if (!ASSERT_OK_PTR(nstoken, "setns tx_ns")) + goto close; + + SYS(close, "ip addr add " TX_ADDR "/24 dev " TX_NAME); + SYS(close, "ip link set dev " TX_NAME " up"); + ASSERT_OK(SYS_NOFAIL("ping -c 1 " RX_ADDR), "ping"); + +close: + close_netns(nstoken); + test_xdp_meta__destroy(skel); + netns_free(rx_ns); + netns_free(tx_ns); +} + diff --git a/tools/testing/selftests/bpf/test_xdp_meta.sh b/tools/testing/selftests/bpf/test_xdp_meta.sh deleted file mode 100755 index 6039b92f10949..0000000000000 --- a/tools/testing/selftests/bpf/test_xdp_meta.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/bin/sh - -BPF_FILE="test_xdp_meta.bpf.o" -# Kselftest framework requirement - SKIP code is 4. -readonly KSFT_SKIP=4 -readonly NS1="ns1-$(mktemp -u XXXXXX)" -readonly NS2="ns2-$(mktemp -u XXXXXX)" - -cleanup() -{ - if [ "$?" = "0" ]; then - echo "selftests: test_xdp_meta [PASS]"; - else - echo "selftests: test_xdp_meta [FAILED]"; - fi - - set +e - ip link del veth1 2> /dev/null - ip netns del ${NS1} 2> /dev/null - ip netns del ${NS2} 2> /dev/null -} - -ip link set dev lo xdp off 2>/dev/null > /dev/null -if [ $? -ne 0 ];then - echo "selftests: [SKIP] Could not run test without the ip xdp support" - exit $KSFT_SKIP -fi -set -e - -ip netns add ${NS1} -ip netns add ${NS2} - -trap cleanup 0 2 3 6 9 - -ip link add veth1 type veth peer name veth2 - -ip link set veth1 netns ${NS1} -ip link set veth2 netns ${NS2} - -ip netns exec ${NS1} ip addr add 10.1.1.11/24 dev veth1 -ip netns exec ${NS2} ip addr add 10.1.1.22/24 dev veth2 - -ip netns exec ${NS1} tc qdisc add dev veth1 clsact -ip netns exec ${NS2} tc qdisc add dev veth2 clsact - -ip netns exec ${NS1} tc filter add dev veth1 ingress bpf da obj ${BPF_FILE} sec tc -ip netns exec ${NS2} tc filter add dev veth2 ingress bpf da obj ${BPF_FILE} sec tc - -ip netns exec ${NS1} ip link set dev veth1 xdp obj ${BPF_FILE} sec xdp -ip netns exec ${NS2} ip link set dev veth2 xdp obj ${BPF_FILE} sec xdp - -ip netns exec ${NS1} ip link set dev veth1 up -ip netns exec ${NS2} ip link set dev veth2 up - -ip netns exec ${NS1} ping -c 1 10.1.1.22 -ip netns exec ${NS2} ping -c 1 10.1.1.11 - -exit 0 From 166438a432d76c68d3f0da60667248f3c2303d6c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Dec 2024 16:46:33 -0500 Subject: [PATCH 0571/1450] ftrace: Do not find "true_parent" if HAVE_DYNAMIC_FTRACE_WITH_ARGS is not set When function tracing and function graph tracing are both enabled (in different instances) the "parent" of some of the function tracing events is "return_to_handler" which is the trampoline used by function graph tracing. To fix this, ftrace_get_true_parent_ip() was introduced that returns the "true" parent ip instead of the trampoline. To do this, the ftrace_regs_get_stack_pointer() is used, which uses kernel_stack_pointer(). The problem is that microblaze does not implement kerenl_stack_pointer() so when function graph tracing is enabled, the build fails. But microblaze also does not enabled HAVE_DYNAMIC_FTRACE_WITH_ARGS. That option has to be enabled by the architecture to reliably get the values from the fregs parameter passed in. When that config is not set, the architecture can also pass in NULL, which is not tested for in that function and could cause the kernel to crash. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Mark Rutland Cc: Al Viro Cc: Michal Simek Cc: Jeff Xie Link: https://lore.kernel.org/20241216164633.6df18e87@gandalf.local.home Fixes: 60b1f578b578 ("ftrace: Get the true parent ip for function tracer") Reported-by: Al Viro Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_functions.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 74c353164ca12..d358c9935164d 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -176,7 +176,8 @@ static void function_trace_start(struct trace_array *tr) tracing_reset_online_cpus(&tr->array_buffer); } -#ifdef CONFIG_FUNCTION_GRAPH_TRACER +/* fregs are guaranteed not to be NULL if HAVE_DYNAMIC_FTRACE_WITH_ARGS is set */ +#if defined(CONFIG_FUNCTION_GRAPH_TRACER) && defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS) static __always_inline unsigned long function_get_true_parent_ip(unsigned long parent_ip, struct ftrace_regs *fregs) { From d6fd6f8280f0257ba93f16900a0d3d3912f32c79 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Thu, 5 Dec 2024 16:49:51 +0100 Subject: [PATCH 0572/1450] ceph: fix memory leaks in __ceph_sync_read() In two `break` statements, the call to ceph_release_page_vector() was missing, leaking the allocation from ceph_alloc_page_vector(). Instead of adding the missing ceph_release_page_vector() calls, the Ceph maintainers preferred to transfer page ownership to the `ceph_osd_request` by passing `own_pages=true` to osd_req_op_extent_osd_data_pages(). This requires postponing the ceph_osdc_put_request() call until after the block that accesses the `pages`. Cc: stable@vger.kernel.org Fixes: 03bc06c7b0bd ("ceph: add new mount option to enable sparse reads") Fixes: f0fe1e54cfcf ("ceph: plumb in decryption during reads") Signed-off-by: Max Kellermann Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 4b8d59ebda009..ce342a5d4b8b0 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1127,7 +1127,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, osd_req_op_extent_osd_data_pages(req, 0, pages, read_len, offset_in_page(read_off), - false, false); + false, true); op = &req->r_ops[0]; if (sparse) { @@ -1186,8 +1186,6 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, ret = min_t(ssize_t, fret, len); } - ceph_osdc_put_request(req); - /* Short read but not EOF? Zero out the remainder. */ if (ret >= 0 && ret < len && (off + ret < i_size)) { int zlen = min(len - ret, i_size - off - ret); @@ -1221,7 +1219,8 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, break; } } - ceph_release_page_vector(pages, num_pages); + + ceph_osdc_put_request(req); if (ret < 0) { if (ret == -EBLOCKLISTED) From 550f7ca98ee028a606aa75705a7e77b1bd11720f Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 18 Nov 2024 23:28:28 +0100 Subject: [PATCH 0573/1450] ceph: give up on paths longer than PATH_MAX MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the full path to be built by ceph_mdsc_build_path() happens to be longer than PATH_MAX, then this function will enter an endless (retry) loop, effectively blocking the whole task. Most of the machine becomes unusable, making this a very simple and effective DoS vulnerability. I cannot imagine why this retry was ever implemented, but it seems rather useless and harmful to me. Let's remove it and fail with ENAMETOOLONG instead. Cc: stable@vger.kernel.org Reported-by: Dario Weißer Signed-off-by: Max Kellermann Reviewed-by: Alex Markuze Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 219a2cc2bf3cc..785fe489ef4b8 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2800,12 +2800,11 @@ char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry, if (pos < 0) { /* - * A rename didn't occur, but somehow we didn't end up where - * we thought we would. Throw a warning and try again. + * The path is longer than PATH_MAX and this function + * cannot ever succeed. Creating paths that long is + * possible with Ceph, but Linux cannot use them. */ - pr_warn_client(cl, "did not end path lookup where expected (pos = %d)\n", - pos); - goto retry; + return ERR_PTR(-ENAMETOOLONG); } *pbase = base; From 12eb22a5a609421b380c3c6ca887474fb2089b2c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 20 Nov 2024 16:43:51 +0100 Subject: [PATCH 0574/1450] ceph: validate snapdirname option length when mounting It becomes a path component, so it shouldn't exceed NAME_MAX characters. This was hardened in commit c152737be22b ("ceph: Use strscpy() instead of strcpy() in __get_snap_name()"), but no actual check was put in place. Cc: stable@vger.kernel.org Signed-off-by: Ilya Dryomov Reviewed-by: Alex Markuze --- fs/ceph/super.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index de03cd6eb86ee..4344e1f118069 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -431,6 +431,8 @@ static int ceph_parse_mount_param(struct fs_context *fc, switch (token) { case Opt_snapdirname: + if (strlen(param->string) > NAME_MAX) + return invalfc(fc, "snapdirname too long"); kfree(fsopt->snapdir_name); fsopt->snapdir_name = param->string; param->string = NULL; From 9abee475803fab6ad59d4f4fc59c6a75374a7d9d Mon Sep 17 00:00:00 2001 From: Alex Markuze Date: Wed, 27 Nov 2024 15:34:10 +0200 Subject: [PATCH 0575/1450] ceph: improve error handling and short/overflow-read logic in __ceph_sync_read() This patch refines the read logic in __ceph_sync_read() to ensure more predictable and efficient behavior in various edge cases. - Return early if the requested read length is zero or if the file size (`i_size`) is zero. - Initialize the index variable (`idx`) where needed and reorder some code to ensure it is always set before use. - Improve error handling by checking for negative return values earlier. - Remove redundant encrypted file checks after failures. Only attempt filesystem-level decryption if the read succeeded. - Simplify leftover calculations to correctly handle cases where the read extends beyond the end of the file or stops short. This can be hit by continuously reading a file while, on another client, we keep truncating and writing new data into it. - This resolves multiple issues caused by integer and consequent buffer overflow (`pages` array being accessed beyond `num_pages`): - https://tracker.ceph.com/issues/67524 - https://tracker.ceph.com/issues/68980 - https://tracker.ceph.com/issues/68981 Cc: stable@vger.kernel.org Fixes: 1065da21e5df ("ceph: stop copying to iter at EOF on sync reads") Reported-by: Luis Henriques (SUSE) Signed-off-by: Alex Markuze Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index ce342a5d4b8b0..8e0400d461a2a 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1066,7 +1066,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, if (ceph_inode_is_shutdown(inode)) return -EIO; - if (!len) + if (!len || !i_size) return 0; /* * flush any page cache pages in this range. this @@ -1086,7 +1086,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, int num_pages; size_t page_off; bool more; - int idx; + int idx = 0; size_t left; struct ceph_osd_req_op *op; u64 read_off = off; @@ -1160,7 +1160,14 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, else if (ret == -ENOENT) ret = 0; - if (ret > 0 && IS_ENCRYPTED(inode)) { + if (ret < 0) { + ceph_osdc_put_request(req); + if (ret == -EBLOCKLISTED) + fsc->blocklisted = true; + break; + } + + if (IS_ENCRYPTED(inode)) { int fret; fret = ceph_fscrypt_decrypt_extents(inode, pages, @@ -1187,7 +1194,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, } /* Short read but not EOF? Zero out the remainder. */ - if (ret >= 0 && ret < len && (off + ret < i_size)) { + if (ret < len && (off + ret < i_size)) { int zlen = min(len - ret, i_size - off - ret); int zoff = page_off + ret; @@ -1197,13 +1204,11 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, ret += zlen; } - idx = 0; - if (ret <= 0) - left = 0; - else if (off + ret > i_size) - left = i_size - off; + if (off + ret > i_size) + left = (i_size > off) ? i_size - off : 0; else left = ret; + while (left > 0) { size_t plen, copied; @@ -1222,12 +1227,6 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, ceph_osdc_put_request(req); - if (ret < 0) { - if (ret == -EBLOCKLISTED) - fsc->blocklisted = true; - break; - } - if (off >= i_size || !more) break; } From 66e0c4f91461d17d48071695271c824620bed4ef Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 6 Dec 2024 17:32:59 +0100 Subject: [PATCH 0576/1450] ceph: fix memory leak in ceph_direct_read_write() The bvecs array which is allocated in iter_get_bvecs_alloc() is leaked and pages remain pinned if ceph_alloc_sparse_ext_map() fails. There is no need to delay the allocation of sparse_ext map until after the bvecs array is set up, so fix this by moving sparse_ext allocation a bit earlier. Also, make a similar adjustment in __ceph_sync_read() for consistency (a leak of the same kind in __ceph_sync_read() has been addressed differently). Cc: stable@vger.kernel.org Fixes: 03bc06c7b0bd ("ceph: add new mount option to enable sparse reads") Signed-off-by: Ilya Dryomov Reviewed-by: Alex Markuze --- fs/ceph/file.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 8e0400d461a2a..67468d88f1390 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1116,6 +1116,16 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, len = read_off + read_len - off; more = len < iov_iter_count(to); + op = &req->r_ops[0]; + if (sparse) { + extent_cnt = __ceph_sparse_read_ext_count(inode, read_len); + ret = ceph_alloc_sparse_ext_map(op, extent_cnt); + if (ret) { + ceph_osdc_put_request(req); + break; + } + } + num_pages = calc_pages_for(read_off, read_len); page_off = offset_in_page(off); pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); @@ -1129,16 +1139,6 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, offset_in_page(read_off), false, true); - op = &req->r_ops[0]; - if (sparse) { - extent_cnt = __ceph_sparse_read_ext_count(inode, read_len); - ret = ceph_alloc_sparse_ext_map(op, extent_cnt); - if (ret) { - ceph_osdc_put_request(req); - break; - } - } - ceph_osdc_start_request(osdc, req); ret = ceph_osdc_wait_request(osdc, req); @@ -1551,6 +1551,16 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, break; } + op = &req->r_ops[0]; + if (sparse) { + extent_cnt = __ceph_sparse_read_ext_count(inode, size); + ret = ceph_alloc_sparse_ext_map(op, extent_cnt); + if (ret) { + ceph_osdc_put_request(req); + break; + } + } + len = iter_get_bvecs_alloc(iter, size, &bvecs, &num_pages); if (len < 0) { ceph_osdc_put_request(req); @@ -1560,6 +1570,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, if (len != size) osd_req_op_extent_update(req, 0, len); + osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len); + /* * To simplify error handling, allow AIO when IO within i_size * or IO can be satisfied by single OSD request. @@ -1591,17 +1603,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, req->r_mtime = mtime; } - osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len); - op = &req->r_ops[0]; - if (sparse) { - extent_cnt = __ceph_sparse_read_ext_count(inode, size); - ret = ceph_alloc_sparse_ext_map(op, extent_cnt); - if (ret) { - ceph_osdc_put_request(req); - break; - } - } - if (aio_req) { aio_req->total_len += len; aio_req->num_reqs++; From 18d44c5d062b97b97bb0162d9742440518958dc1 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sat, 7 Dec 2024 17:33:25 +0100 Subject: [PATCH 0577/1450] ceph: allocate sparse_ext map only for sparse reads If mounted with sparseread option, ceph_direct_read_write() ends up making an unnecessarily allocation for O_DIRECT writes. Fixes: 03bc06c7b0bd ("ceph: add new mount option to enable sparse reads") Signed-off-by: Ilya Dryomov Reviewed-by: Alex Markuze --- fs/ceph/file.c | 2 +- net/ceph/osd_client.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 67468d88f1390..851d70200c6b8 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1552,7 +1552,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, } op = &req->r_ops[0]; - if (sparse) { + if (!write && sparse) { extent_cnt = __ceph_sparse_read_ext_count(inode, size); ret = ceph_alloc_sparse_ext_map(op, extent_cnt); if (ret) { diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 9b1168eb77abb..b24afec241382 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1173,6 +1173,8 @@ EXPORT_SYMBOL(ceph_osdc_new_request); int __ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op, int cnt) { + WARN_ON(op->op != CEPH_OSD_OP_SPARSE_READ); + op->extent.sparse_ext_cnt = cnt; op->extent.sparse_ext = kmalloc_array(cnt, sizeof(*op->extent.sparse_ext), From 74d7e038fd072635d21e4734e3223378e09168d3 Mon Sep 17 00:00:00 2001 From: Murad Masimov Date: Mon, 16 Dec 2024 20:36:46 +0300 Subject: [PATCH 0578/1450] hwmon: (tmp513) Fix interpretation of values of Shunt Voltage and Limit Registers The values returned by the driver after processing the contents of the Shunt Voltage Register and the Shunt Limit Registers do not correspond to the TMP512/TMP513 specifications. A raw register value is converted to a signed integer value by a sign extension in accordance with the algorithm provided in the specification, but due to the off-by-one error in the sign bit index, the result is incorrect. Moreover, the PGA shift calculated with the tmp51x_get_pga_shift function is relevant only to the Shunt Voltage Register, but is also applied to the Shunt Limit Registers. According to the TMP512 and TMP513 datasheets, the Shunt Voltage Register (04h) is 13 to 16 bit two's complement integer value, depending on the PGA setting. The Shunt Positive (0Ch) and Negative (0Dh) Limit Registers are 16-bit two's complement integer values. Below are some examples: * Shunt Voltage Register If PGA = 8, and regval = 1000 0011 0000 0000, then the decimal value must be -32000, but the value calculated by the driver will be 33536. * Shunt Limit Register If regval = 1000 0011 0000 0000, then the decimal value must be -32000, but the value calculated by the driver will be 768, if PGA = 1. Fix sign bit index, and also correct misleading comment describing the tmp51x_get_pga_shift function. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 59dfa75e5d82 ("hwmon: Add driver for Texas Instruments TMP512/513 sensor chips.") Signed-off-by: Murad Masimov Link: https://lore.kernel.org/r/20241216173648.526-2-m.masimov@maxima.ru [groeck: Fixed description and multi-line alignments] Signed-off-by: Guenter Roeck --- drivers/hwmon/tmp513.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/hwmon/tmp513.c b/drivers/hwmon/tmp513.c index 926d28cd3fab5..d87fcea3ef243 100644 --- a/drivers/hwmon/tmp513.c +++ b/drivers/hwmon/tmp513.c @@ -182,7 +182,7 @@ struct tmp51x_data { struct regmap *regmap; }; -// Set the shift based on the gain 8=4, 4=3, 2=2, 1=1 +// Set the shift based on the gain: 8 -> 1, 4 -> 2, 2 -> 3, 1 -> 4 static inline u8 tmp51x_get_pga_shift(struct tmp51x_data *data) { return 5 - ffs(data->pga_gain); @@ -204,7 +204,9 @@ static int tmp51x_get_value(struct tmp51x_data *data, u8 reg, u8 pos, * 2's complement number shifted by one to four depending * on the pga gain setting. 1lsb = 10uV */ - *val = sign_extend32(regval, 17 - tmp51x_get_pga_shift(data)); + *val = sign_extend32(regval, + reg == TMP51X_SHUNT_CURRENT_RESULT ? + 16 - tmp51x_get_pga_shift(data) : 15); *val = DIV_ROUND_CLOSEST(*val * 10 * MILLI, data->shunt_uohms); break; case TMP51X_BUS_VOLTAGE_RESULT: From da1d0e6ba211baf6747db74c07700caddfd8a179 Mon Sep 17 00:00:00 2001 From: Murad Masimov Date: Mon, 16 Dec 2024 20:36:47 +0300 Subject: [PATCH 0579/1450] hwmon: (tmp513) Fix Current Register value interpretation The value returned by the driver after processing the contents of the Current Register does not correspond to the TMP512/TMP513 specifications. A raw register value is converted to a signed integer value by a sign extension in accordance with the algorithm provided in the specification, but due to the off-by-one error in the sign bit index, the result is incorrect. Moreover, negative values will be reported as large positive due to missing sign extension from u32 to long. According to the TMP512 and TMP513 datasheets, the Current Register (07h) is a 16-bit two's complement integer value. E.g., if regval = 1000 0011 0000 0000, then the value must be (-32000 * lsb), but the driver will return (33536 * lsb). Fix off-by-one bug, and also cast data->curr_lsb_ua (which is of type u32) to long to prevent incorrect cast for negative values. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 59dfa75e5d82 ("hwmon: Add driver for Texas Instruments TMP512/513 sensor chips.") Signed-off-by: Murad Masimov Link: https://lore.kernel.org/r/20241216173648.526-3-m.masimov@maxima.ru [groeck: Fixed description line length] Signed-off-by: Guenter Roeck --- drivers/hwmon/tmp513.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/tmp513.c b/drivers/hwmon/tmp513.c index d87fcea3ef243..2846b1cc515d7 100644 --- a/drivers/hwmon/tmp513.c +++ b/drivers/hwmon/tmp513.c @@ -222,7 +222,7 @@ static int tmp51x_get_value(struct tmp51x_data *data, u8 reg, u8 pos, break; case TMP51X_BUS_CURRENT_RESULT: // Current = (ShuntVoltage * CalibrationRegister) / 4096 - *val = sign_extend32(regval, 16) * data->curr_lsb_ua; + *val = sign_extend32(regval, 15) * (long)data->curr_lsb_ua; *val = DIV_ROUND_CLOSEST(*val, MILLI); break; case TMP51X_LOCAL_TEMP_RESULT: From dd471e25770e7e632f736b90db1e2080b2171668 Mon Sep 17 00:00:00 2001 From: Murad Masimov Date: Mon, 16 Dec 2024 20:36:48 +0300 Subject: [PATCH 0580/1450] hwmon: (tmp513) Fix interpretation of values of Temperature Result and Limit Registers The values returned by the driver after processing the contents of the Temperature Result and the Temperature Limit Registers do not correspond to the TMP512/TMP513 specifications. A raw register value is converted to a signed integer value by a sign extension in accordance with the algorithm provided in the specification, but due to the off-by-one error in the sign bit index, the result is incorrect. According to the TMP512 and TMP513 datasheets, the Temperature Result (08h to 0Bh) and Limit (11h to 14h) Registers are 13-bit two's complement integer values, shifted left by 3 bits. The value is scaled by 0.0625 degrees Celsius per bit. E.g., if regval = 1 1110 0111 0000 000, the output should be -25 degrees, but the driver will return +487 degrees. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 59dfa75e5d82 ("hwmon: Add driver for Texas Instruments TMP512/513 sensor chips.") Signed-off-by: Murad Masimov Link: https://lore.kernel.org/r/20241216173648.526-4-m.masimov@maxima.ru [groeck: fixed description line length] Signed-off-by: Guenter Roeck --- drivers/hwmon/tmp513.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/tmp513.c b/drivers/hwmon/tmp513.c index 2846b1cc515d7..1c2cb12071b80 100644 --- a/drivers/hwmon/tmp513.c +++ b/drivers/hwmon/tmp513.c @@ -234,7 +234,7 @@ static int tmp51x_get_value(struct tmp51x_data *data, u8 reg, u8 pos, case TMP51X_REMOTE_TEMP_LIMIT_2: case TMP513_REMOTE_TEMP_LIMIT_3: // 1lsb = 0.0625 degrees centigrade - *val = sign_extend32(regval, 16) >> TMP51X_TEMP_SHIFT; + *val = sign_extend32(regval, 15) >> TMP51X_TEMP_SHIFT; *val = DIV_ROUND_CLOSEST(*val * 625, 10); break; case TMP51X_N_FACTOR_AND_HYST_1: From 239d87327dcd361b0098038995f8908f3296864f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 12 Dec 2024 17:28:06 -0800 Subject: [PATCH 0581/1450] fortify: Hide run-time copy size from value range tracking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GCC performs value range tracking for variables as a way to provide better diagnostics. One place this is regularly seen is with warnings associated with bounds-checking, e.g. -Wstringop-overflow, -Wstringop-overread, -Warray-bounds, etc. In order to keep the signal-to-noise ratio high, warnings aren't emitted when a value range spans the entire value range representable by a given variable. For example: unsigned int len; char dst[8]; ... memcpy(dst, src, len); If len's value is unknown, it has the full "unsigned int" range of [0, UINT_MAX], and GCC's compile-time bounds checks against memcpy() will be ignored. However, when a code path has been able to narrow the range: if (len > 16) return; memcpy(dst, src, len); Then the range will be updated for the execution path. Above, len is now [0, 16] when reading memcpy(), so depending on other optimizations, we might see a -Wstringop-overflow warning like: error: '__builtin_memcpy' writing between 9 and 16 bytes into region of size 8 [-Werror=stringop-overflow] When building with CONFIG_FORTIFY_SOURCE, the fortified run-time bounds checking can appear to narrow value ranges of lengths for memcpy(), depending on how the compiler constructs the execution paths during optimization passes, due to the checks against the field sizes. For example: if (p_size_field != SIZE_MAX && p_size != p_size_field && p_size_field < size) As intentionally designed, these checks only affect the kernel warnings emitted at run-time and do not block the potentially overflowing memcpy(), so GCC thinks it needs to produce a warning about the resulting value range that might be reaching the memcpy(). We have seen this manifest a few times now, with the most recent being with cpumasks: In function ‘bitmap_copy’, inlined from ‘cpumask_copy’ at ./include/linux/cpumask.h:839:2, inlined from ‘__padata_set_cpumasks’ at kernel/padata.c:730:2: ./include/linux/fortify-string.h:114:33: error: ‘__builtin_memcpy’ reading between 257 and 536870904 bytes from a region of size 256 [-Werror=stringop-overread] 114 | #define __underlying_memcpy __builtin_memcpy | ^ ./include/linux/fortify-string.h:633:9: note: in expansion of macro ‘__underlying_memcpy’ 633 | __underlying_##op(p, q, __fortify_size); \ | ^~~~~~~~~~~~~ ./include/linux/fortify-string.h:678:26: note: in expansion of macro ‘__fortify_memcpy_chk’ 678 | #define memcpy(p, q, s) __fortify_memcpy_chk(p, q, s, \ | ^~~~~~~~~~~~~~~~~~~~ ./include/linux/bitmap.h:259:17: note: in expansion of macro ‘memcpy’ 259 | memcpy(dst, src, len); | ^~~~~~ kernel/padata.c: In function ‘__padata_set_cpumasks’: kernel/padata.c:713:48: note: source object ‘pcpumask’ of size [0, 256] 713 | cpumask_var_t pcpumask, | ~~~~~~~~~~~~~~^~~~~~~~ This warning is _not_ emitted when CONFIG_FORTIFY_SOURCE is disabled, and with the recent -fdiagnostics-details we can confirm the origin of the warning is due to FORTIFY's bounds checking: ../include/linux/bitmap.h:259:17: note: in expansion of macro 'memcpy' 259 | memcpy(dst, src, len); | ^~~~~~ '__padata_set_cpumasks': events 1-2 ../include/linux/fortify-string.h:613:36: 612 | if (p_size_field != SIZE_MAX && | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 613 | p_size != p_size_field && p_size_field < size) | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~ | | | (1) when the condition is evaluated to false | (2) when the condition is evaluated to true '__padata_set_cpumasks': event 3 114 | #define __underlying_memcpy __builtin_memcpy | ^ | | | (3) out of array bounds here Note that the cpumask warning started appearing since bitmap functions were recently marked __always_inline in commit ed8cd2b3bd9f ("bitmap: Switch from inline to __always_inline"), which allowed GCC to gain visibility into the variables as they passed through the FORTIFY implementation. In order to silence these false positives but keep otherwise deterministic compile-time warnings intact, hide the length variable from GCC with OPTIMIZE_HIDE_VAR() before calling the builtin memcpy. Additionally add a comment about why all the macro args have copies with const storage. Reported-by: "Thomas Weißschuh" Closes: https://lore.kernel.org/all/db7190c8-d17f-4a0d-bc2f-5903c79f36c2@t-8ch.de/ Reported-by: Nilay Shroff Closes: https://lore.kernel.org/all/20241112124127.1666300-1-nilay@linux.ibm.com/ Tested-by: Nilay Shroff Acked-by: Yury Norov Acked-by: Greg Kroah-Hartman Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 0d99bf11d260a..e4ce1cae03bf7 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -616,6 +616,12 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, return false; } +/* + * To work around what seems to be an optimizer bug, the macro arguments + * need to have const copies or the values end up changed by the time they + * reach fortify_warn_once(). See commit 6f7630b1b5bc ("fortify: Capture + * __bos() results in const temp vars") for more details. + */ #define __fortify_memcpy_chk(p, q, size, p_size, q_size, \ p_size_field, q_size_field, op) ({ \ const size_t __fortify_size = (size_t)(size); \ @@ -623,6 +629,8 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, const size_t __q_size = (q_size); \ const size_t __p_size_field = (p_size_field); \ const size_t __q_size_field = (q_size_field); \ + /* Keep a mutable version of the size for the final copy. */ \ + size_t __copy_size = __fortify_size; \ fortify_warn_once(fortify_memcpy_chk(__fortify_size, __p_size, \ __q_size, __p_size_field, \ __q_size_field, FORTIFY_FUNC_ ##op), \ @@ -630,7 +638,11 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, __fortify_size, \ "field \"" #p "\" at " FILE_LINE, \ __p_size_field); \ - __underlying_##op(p, q, __fortify_size); \ + /* Hide only the run-time size from value range tracking to */ \ + /* silence compile-time false positive bounds warnings. */ \ + if (!__builtin_constant_p(__copy_size)) \ + OPTIMIZER_HIDE_VAR(__copy_size); \ + __underlying_##op(p, q, __copy_size); \ }) /* From 0efbca0fec7d1665e19fa008ba95daab71f76f4d Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Mon, 9 Dec 2024 12:06:39 +0100 Subject: [PATCH 0582/1450] nios2: Use str_yes_no() helper in show_cpuinfo() Remove hard-coded strings by using the str_yes_no() helper function. Signed-off-by: Thorsten Blum Signed-off-by: Dinh Nguyen --- arch/nios2/kernel/cpuinfo.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/nios2/kernel/cpuinfo.c b/arch/nios2/kernel/cpuinfo.c index 338849c430a52..7b1e8f9128e9b 100644 --- a/arch/nios2/kernel/cpuinfo.c +++ b/arch/nios2/kernel/cpuinfo.c @@ -143,11 +143,11 @@ static int show_cpuinfo(struct seq_file *m, void *v) " DIV:\t\t%s\n" " BMX:\t\t%s\n" " CDX:\t\t%s\n", - cpuinfo.has_mul ? "yes" : "no", - cpuinfo.has_mulx ? "yes" : "no", - cpuinfo.has_div ? "yes" : "no", - cpuinfo.has_bmx ? "yes" : "no", - cpuinfo.has_cdx ? "yes" : "no"); + str_yes_no(cpuinfo.has_mul), + str_yes_no(cpuinfo.has_mulx), + str_yes_no(cpuinfo.has_div), + str_yes_no(cpuinfo.has_bmx), + str_yes_no(cpuinfo.has_cdx)); seq_printf(m, "Icache:\t\t%ukB, line length: %u\n", From b1f3a2f5a742c1e939a73031bd31b9e557a2d77d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 13 Dec 2024 07:22:40 -0800 Subject: [PATCH 0583/1450] netdev: fix repeated netlink messages in queue dump The context is supposed to record the next queue to dump, not last dumped. If the dump doesn't fit we will restart from the already-dumped queue, duplicating the message. Before this fix and with the selftest improvements later in this series we see: # ./run_kselftest.sh -t drivers/net:queues.py timeout set to 45 selftests: drivers/net: queues.py KTAP version 1 1..2 # Check| At /root/ksft-net-drv/drivers/net/./queues.py, line 32, in get_queues: # Check| ksft_eq(queues, expected) # Check failed 102 != 100 # Check| At /root/ksft-net-drv/drivers/net/./queues.py, line 32, in get_queues: # Check| ksft_eq(queues, expected) # Check failed 101 != 100 not ok 1 queues.get_queues ok 2 queues.addremove_queues # Totals: pass:1 fail:1 xfail:0 xpass:0 skip:0 error:0 not ok 1 selftests: drivers/net: queues.py # exit=1 With the fix: # ./ksft-net-drv/run_kselftest.sh -t drivers/net:queues.py timeout set to 45 selftests: drivers/net: queues.py KTAP version 1 1..2 ok 1 queues.get_queues ok 2 queues.addremove_queues # Totals: pass:2 fail:0 xfail:0 xpass:0 skip:0 error:0 Fixes: 6b6171db7fc8 ("netdev-genl: Add netlink framework functions for queue") Reviewed-by: Joe Damato Link: https://patch.msgid.link/20241213152244.3080955-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/netdev-genl.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 9527dd46e4dc3..9f086b1906194 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -488,24 +488,21 @@ netdev_nl_queue_dump_one(struct net_device *netdev, struct sk_buff *rsp, struct netdev_nl_dump_ctx *ctx) { int err = 0; - int i; if (!(netdev->flags & IFF_UP)) return err; - for (i = ctx->rxq_idx; i < netdev->real_num_rx_queues;) { - err = netdev_nl_queue_fill_one(rsp, netdev, i, + for (; ctx->rxq_idx < netdev->real_num_rx_queues; ctx->rxq_idx++) { + err = netdev_nl_queue_fill_one(rsp, netdev, ctx->rxq_idx, NETDEV_QUEUE_TYPE_RX, info); if (err) return err; - ctx->rxq_idx = i++; } - for (i = ctx->txq_idx; i < netdev->real_num_tx_queues;) { - err = netdev_nl_queue_fill_one(rsp, netdev, i, + for (; ctx->txq_idx < netdev->real_num_tx_queues; ctx->txq_idx++) { + err = netdev_nl_queue_fill_one(rsp, netdev, ctx->txq_idx, NETDEV_QUEUE_TYPE_TX, info); if (err) return err; - ctx->txq_idx = i++; } return err; From ecc391a541573da46b7ccc188105efedd40aef1b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 13 Dec 2024 07:22:41 -0800 Subject: [PATCH 0584/1450] netdev: fix repeated netlink messages in queue stats The context is supposed to record the next queue to dump, not last dumped. If the dump doesn't fit we will restart from the already-dumped queue, duplicating the message. Before this fix and with the selftest improvements later in this series we see: # ./run_kselftest.sh -t drivers/net:stats.py timeout set to 45 selftests: drivers/net: stats.py KTAP version 1 1..5 ok 1 stats.check_pause ok 2 stats.check_fec ok 3 stats.pkt_byte_sum # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 125, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), len(set(queues[qtype])), # Check failed 45 != 44 repeated queue keys # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 127, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), max(queues[qtype]) + 1, # Check failed 45 != 44 missing queue keys # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 125, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), len(set(queues[qtype])), # Check failed 45 != 44 repeated queue keys # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 127, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), max(queues[qtype]) + 1, # Check failed 45 != 44 missing queue keys # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 125, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), len(set(queues[qtype])), # Check failed 103 != 100 repeated queue keys # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 127, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), max(queues[qtype]) + 1, # Check failed 103 != 100 missing queue keys # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 125, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), len(set(queues[qtype])), # Check failed 102 != 100 repeated queue keys # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 127, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), max(queues[qtype]) + 1, # Check failed 102 != 100 missing queue keys not ok 4 stats.qstat_by_ifindex ok 5 stats.check_down # Totals: pass:4 fail:1 xfail:0 xpass:0 skip:0 error:0 With the fix: # ./ksft-net-drv/run_kselftest.sh -t drivers/net:stats.py timeout set to 45 selftests: drivers/net: stats.py KTAP version 1 1..5 ok 1 stats.check_pause ok 2 stats.check_fec ok 3 stats.pkt_byte_sum ok 4 stats.qstat_by_ifindex ok 5 stats.check_down # Totals: pass:5 fail:0 xfail:0 xpass:0 skip:0 error:0 Fixes: ab63a2387cb9 ("netdev: add per-queue statistics") Reviewed-by: Joe Damato Link: https://patch.msgid.link/20241213152244.3080955-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/netdev-genl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 9f086b1906194..1be8c7c21d19d 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -668,7 +668,7 @@ netdev_nl_stats_by_queue(struct net_device *netdev, struct sk_buff *rsp, i, info); if (err) return err; - ctx->rxq_idx = i++; + ctx->rxq_idx = ++i; } i = ctx->txq_idx; while (ops->get_queue_stats_tx && i < netdev->real_num_tx_queues) { @@ -676,7 +676,7 @@ netdev_nl_stats_by_queue(struct net_device *netdev, struct sk_buff *rsp, i, info); if (err) return err; - ctx->txq_idx = i++; + ctx->txq_idx = ++i; } ctx->rxq_idx = 0; From 0518863407b8dcc7070fdbc1c015046d66777e78 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 13 Dec 2024 07:22:42 -0800 Subject: [PATCH 0585/1450] selftests: net: support setting recv_size in YNL recv_size parameter allows constraining the buffer size for dumps. It's useful in testing kernel handling of dump continuation, IOW testing dumps which span multiple skbs. Let the tests set this parameter when initializing the YNL family. Keep the normal default, we don't want tests to unintentionally behave very differently than normal code. Reviewed-by: Joe Damato Reviewed-by: Petr Machata Link: https://patch.msgid.link/20241213152244.3080955-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib/py/ynl.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/net/lib/py/ynl.py b/tools/testing/selftests/net/lib/py/ynl.py index a0d689d58c573..076a7e8dc3ebe 100644 --- a/tools/testing/selftests/net/lib/py/ynl.py +++ b/tools/testing/selftests/net/lib/py/ynl.py @@ -32,23 +32,23 @@ # Set schema='' to avoid jsonschema validation, it's slow # class EthtoolFamily(YnlFamily): - def __init__(self): + def __init__(self, recv_size=0): super().__init__((SPEC_PATH / Path('ethtool.yaml')).as_posix(), - schema='') + schema='', recv_size=recv_size) class RtnlFamily(YnlFamily): - def __init__(self): + def __init__(self, recv_size=0): super().__init__((SPEC_PATH / Path('rt_link.yaml')).as_posix(), - schema='') + schema='', recv_size=recv_size) class NetdevFamily(YnlFamily): - def __init__(self): + def __init__(self, recv_size=0): super().__init__((SPEC_PATH / Path('netdev.yaml')).as_posix(), - schema='') + schema='', recv_size=recv_size) class NetshaperFamily(YnlFamily): - def __init__(self): + def __init__(self, recv_size=0): super().__init__((SPEC_PATH / Path('net_shaper.yaml')).as_posix(), - schema='') + schema='', recv_size=recv_size) From 1234810b1649e9d781aeafd4b23fb1fcfbf95d8f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 13 Dec 2024 07:22:43 -0800 Subject: [PATCH 0586/1450] selftests: net-drv: queues: sanity check netlink dumps This test already catches a netlink bug fixed by this series, but only when running on HW with many queues. Make sure the netdevsim instance created has a lot of queues, and constrain the size of the recv_buffer used by netlink. While at it test both rx and tx queues. Reviewed-by: Joe Damato Reviewed-by: Petr Machata Link: https://patch.msgid.link/20241213152244.3080955-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/queues.py | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/drivers/net/queues.py b/tools/testing/selftests/drivers/net/queues.py index 30f29096e27c2..9c5473abbd784 100755 --- a/tools/testing/selftests/drivers/net/queues.py +++ b/tools/testing/selftests/drivers/net/queues.py @@ -8,25 +8,28 @@ import glob -def sys_get_queues(ifname) -> int: - folders = glob.glob(f'/sys/class/net/{ifname}/queues/rx-*') +def sys_get_queues(ifname, qtype='rx') -> int: + folders = glob.glob(f'/sys/class/net/{ifname}/queues/{qtype}-*') return len(folders) -def nl_get_queues(cfg, nl): +def nl_get_queues(cfg, nl, qtype='rx'): queues = nl.queue_get({'ifindex': cfg.ifindex}, dump=True) if queues: - return len([q for q in queues if q['type'] == 'rx']) + return len([q for q in queues if q['type'] == qtype]) return None def get_queues(cfg, nl) -> None: - queues = nl_get_queues(cfg, nl) - if not queues: - raise KsftSkipEx('queue-get not supported by device') + snl = NetdevFamily(recv_size=4096) - expected = sys_get_queues(cfg.dev['ifname']) - ksft_eq(queues, expected) + for qtype in ['rx', 'tx']: + queues = nl_get_queues(cfg, snl, qtype) + if not queues: + raise KsftSkipEx('queue-get not supported by device') + + expected = sys_get_queues(cfg.dev['ifname'], qtype) + ksft_eq(queues, expected) def addremove_queues(cfg, nl) -> None: @@ -57,7 +60,7 @@ def addremove_queues(cfg, nl) -> None: def main() -> None: - with NetDrvEnv(__file__, queue_count=3) as cfg: + with NetDrvEnv(__file__, queue_count=100) as cfg: ksft_run([get_queues, addremove_queues], args=(cfg, NetdevFamily())) ksft_exit() From 5712e323d4c3ad03bba4d28f83e80593171ac3f1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 13 Dec 2024 07:22:44 -0800 Subject: [PATCH 0587/1450] selftests: net-drv: stats: sanity check netlink dumps Sanity check netlink dumps, to make sure dumps don't have repeated entries or gaps in IDs. Reviewed-by: Petr Machata Link: https://patch.msgid.link/20241213152244.3080955-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/stats.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/drivers/net/stats.py b/tools/testing/selftests/drivers/net/stats.py index 63e3c045a3b25..031ac9def6c01 100755 --- a/tools/testing/selftests/drivers/net/stats.py +++ b/tools/testing/selftests/drivers/net/stats.py @@ -110,6 +110,23 @@ def qstat_by_ifindex(cfg) -> None: ksft_ge(triple[1][key], triple[0][key], comment="bad key: " + key) ksft_ge(triple[2][key], triple[1][key], comment="bad key: " + key) + # Sanity check the dumps + queues = NetdevFamily(recv_size=4096).qstats_get({"scope": "queue"}, dump=True) + # Reformat the output into {ifindex: {rx: [id, id, ...], tx: [id, id, ...]}} + parsed = {} + for entry in queues: + ifindex = entry["ifindex"] + if ifindex not in parsed: + parsed[ifindex] = {"rx":[], "tx": []} + parsed[ifindex][entry["queue-type"]].append(entry['queue-id']) + # Now, validate + for ifindex, queues in parsed.items(): + for qtype in ['rx', 'tx']: + ksft_eq(len(queues[qtype]), len(set(queues[qtype])), + comment="repeated queue keys") + ksft_eq(len(queues[qtype]), max(queues[qtype]) + 1, + comment="missing queue keys") + # Test invalid dumps # 0 is invalid with ksft_raises(NlError) as cm: @@ -158,7 +175,7 @@ def check_down(cfg) -> None: def main() -> None: - with NetDrvEnv(__file__) as cfg: + with NetDrvEnv(__file__, queue_count=100) as cfg: ksft_run([check_pause, check_fec, pkt_byte_sum, qstat_by_ifindex, check_down], args=(cfg, )) From d920270a6dbf756384b125ce39c17666a7c0c9f4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 12 Dec 2024 20:58:15 +0000 Subject: [PATCH 0588/1450] rxrpc: Disable IRQ, not BH, to take the lock for ->attend_link Use spin_lock_irq(), not spin_lock_bh() to take the lock when accessing the ->attend_link() to stop a delay in the I/O thread due to an interrupt being taken in the app thread whilst that holds the lock and vice versa. Fixes: a2ea9a907260 ("rxrpc: Use irq-disabling spinlocks between app and I/O thread") Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/2870146.1734037095@warthog.procyon.org.uk Signed-off-by: Jakub Kicinski --- net/rxrpc/io_thread.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index 2925c7fc82cfb..64f8d77b87312 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -508,9 +508,9 @@ int rxrpc_io_thread(void *data) while ((conn = list_first_entry_or_null(&conn_attend_q, struct rxrpc_connection, attend_link))) { - spin_lock_bh(&local->lock); + spin_lock_irq(&local->lock); list_del_init(&conn->attend_link); - spin_unlock_bh(&local->lock); + spin_unlock_irq(&local->lock); rxrpc_input_conn_event(conn, NULL); rxrpc_put_connection(conn, rxrpc_conn_put_poke); } @@ -527,9 +527,9 @@ int rxrpc_io_thread(void *data) while ((call = list_first_entry_or_null(&call_attend_q, struct rxrpc_call, attend_link))) { - spin_lock_bh(&local->lock); + spin_lock_irq(&local->lock); list_del_init(&call->attend_link); - spin_unlock_bh(&local->lock); + spin_unlock_irq(&local->lock); trace_rxrpc_call_poked(call); rxrpc_input_call_event(call); rxrpc_put_call(call, rxrpc_call_put_poke); From ae4f899894792c436d792c17d3f3e6a2affb787f Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 12 Dec 2024 21:04:22 +0000 Subject: [PATCH 0589/1450] rxrpc: Fix ability to add more data to a call once MSG_MORE deasserted When userspace is adding data to an RPC call for transmission, it must pass MSG_MORE to sendmsg() if it intends to add more data in future calls to sendmsg(). Calling sendmsg() without MSG_MORE being asserted closes the transmission phase of the call (assuming sendmsg() adds all the data presented) and further attempts to add more data should be rejected. However, this is no longer the case. The change of call state that was previously the guard got bumped over to the I/O thread, which leaves a window for a repeat sendmsg() to insert more data. This previously went unnoticed, but the more recent patch that changed the structures behind the Tx queue added a warning: WARNING: CPU: 3 PID: 6639 at net/rxrpc/sendmsg.c:296 rxrpc_send_data+0x3f2/0x860 and rejected the additional data, returning error EPROTO. Fix this by adding a guard flag to the call, setting the flag when we queue the final packet and then rejecting further attempts to add data with EPROTO. Fixes: 2d689424b618 ("rxrpc: Move call state changes from sendmsg to I/O thread") Reported-by: syzbot+ff11be94dfcd7a5af8da@syzkaller.appspotmail.com Closes: https://lore.kernel.org/r/6757fb68.050a0220.2477f.005f.GAE@google.com/ Signed-off-by: David Howells Tested-by: syzbot+ff11be94dfcd7a5af8da@syzkaller.appspotmail.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/2870480.1734037462@warthog.procyon.org.uk Signed-off-by: Jakub Kicinski --- net/rxrpc/ar-internal.h | 1 + net/rxrpc/sendmsg.c | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 0c0a3c89dba32..718193df9d2e2 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -571,6 +571,7 @@ enum rxrpc_call_flag { RXRPC_CALL_RX_LAST, /* Received the last packet (at rxtx_top) */ RXRPC_CALL_TX_LAST, /* Last packet in Tx buffer (at rxtx_top) */ RXRPC_CALL_TX_ALL_ACKED, /* Last packet has been hard-acked */ + RXRPC_CALL_TX_NO_MORE, /* No more data to transmit (MSG_MORE deasserted) */ RXRPC_CALL_SEND_PING, /* A ping will need to be sent */ RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */ RXRPC_CALL_BEGAN_RX_TIMER, /* We began the expect_rx_by timer */ diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index c4c8b718cafaa..0e8da909d4f2f 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -266,6 +266,7 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, /* Order send_top after the queue->next pointer and txb content. */ smp_store_release(&call->send_top, seq); if (last) { + set_bit(RXRPC_CALL_TX_NO_MORE, &call->flags); rxrpc_notify_end_tx(rx, call, notify_end_tx); call->send_queue = NULL; } @@ -329,6 +330,13 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, bool more = msg->msg_flags & MSG_MORE; int ret, copied = 0; + if (test_bit(RXRPC_CALL_TX_NO_MORE, &call->flags)) { + trace_rxrpc_abort(call->debug_id, rxrpc_sendmsg_late_send, + call->cid, call->call_id, call->rx_consumed, + 0, -EPROTO); + return -EPROTO; + } + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); ret = rxrpc_wait_to_be_connected(call, &timeo); From fbbd84af6ba70334335bdeba3ae536cf751c14c6 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 13 Dec 2024 12:47:27 +0300 Subject: [PATCH 0590/1450] chelsio/chtls: prevent potential integer overflow on 32bit The "gl->tot_len" variable is controlled by the user. It comes from process_responses(). On 32bit systems, the "gl->tot_len + sizeof(struct cpl_pass_accept_req) + sizeof(struct rss_header)" addition could have an integer wrapping bug. Use size_add() to prevent this. Fixes: a08943947873 ("crypto: chtls - Register chtls with net tls") Cc: stable@vger.kernel.org Signed-off-by: Dan Carpenter Reviewed-by: Simon Horman Link: https://patch.msgid.link/c6bfb23c-2db2-4e1b-b8ab-ba3925c82ef5@stanley.mountain Signed-off-by: Jakub Kicinski --- .../net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c index 96fd31d75dfd9..daa1ebaef5112 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c @@ -346,8 +346,9 @@ static struct sk_buff *copy_gl_to_skb_pkt(const struct pkt_gl *gl, * driver. Once driver synthesizes cpl_pass_accept_req the skb will go * through the regular cpl_pass_accept_req processing in TOM. */ - skb = alloc_skb(gl->tot_len + sizeof(struct cpl_pass_accept_req) - - pktshift, GFP_ATOMIC); + skb = alloc_skb(size_add(gl->tot_len, + sizeof(struct cpl_pass_accept_req)) - + pktshift, GFP_ATOMIC); if (unlikely(!skb)) return NULL; __skb_put(skb, gl->tot_len + sizeof(struct cpl_pass_accept_req) From 4fe205539c46ab0add34675ab037f49caa30607c Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Fri, 13 Dec 2024 11:25:50 +0000 Subject: [PATCH 0591/1450] netlink: specs: add phys-binding attr to rt_link spec Add the missing phys-binding attr to the mctp-attrs in the rt_link spec. This fixes commit 580db513b4a9 ("net: mctp: Expose transport binding identifier via IFLA attribute"). Note that enum mctp_phys_binding is not currently uapi, but perhaps it should be? Signed-off-by: Donald Hunter Link: https://patch.msgid.link/20241213112551.33557-1-donald.hunter@gmail.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt_link.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/netlink/specs/rt_link.yaml b/Documentation/netlink/specs/rt_link.yaml index 9ffa13b77dcf1..96465376d6fed 100644 --- a/Documentation/netlink/specs/rt_link.yaml +++ b/Documentation/netlink/specs/rt_link.yaml @@ -2086,6 +2086,9 @@ attribute-sets: - name: mctp-net type: u32 + - + name: phys-binding + type: u8 - name: stats-attrs name-prefix: ifla-stats- From 77ec16be758ea65de641833149963bec39f311da Mon Sep 17 00:00:00 2001 From: Anna Emese Nyiri Date: Fri, 13 Dec 2024 09:44:54 +0100 Subject: [PATCH 0592/1450] sock: Introduce sk_set_prio_allowed helper function Simplify priority setting permissions with the 'sk_set_prio_allowed' function, centralizing the validation logic. This change is made in anticipation of a second caller in a following patch. No functional changes. Reviewed-by: Willem de Bruijn Reviewed-by: Eric Dumazet Suggested-by: Willem de Bruijn Signed-off-by: Anna Emese Nyiri Link: https://patch.msgid.link/20241213084457.45120-2-annaemesenyiri@gmail.com Signed-off-by: Jakub Kicinski --- net/core/sock.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index 74729d20cd009..9016f984d44eb 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -454,6 +454,13 @@ static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, return 0; } +static bool sk_set_prio_allowed(const struct sock *sk, int val) +{ + return ((val >= TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) || + sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || + sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)); +} + static bool sock_needs_netstamp(const struct sock *sk) { switch (sk->sk_family) { @@ -1193,9 +1200,7 @@ int sk_setsockopt(struct sock *sk, int level, int optname, /* handle options which do not require locking the socket. */ switch (optname) { case SO_PRIORITY: - if ((val >= 0 && val <= 6) || - sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || - sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + if (sk_set_prio_allowed(sk, val)) { sock_set_priority(sk, val); return 0; } From a32f3e9d1ed146f81162702605d65447a319eb76 Mon Sep 17 00:00:00 2001 From: Anna Emese Nyiri Date: Fri, 13 Dec 2024 09:44:55 +0100 Subject: [PATCH 0593/1450] sock: support SO_PRIORITY cmsg The Linux socket API currently allows setting SO_PRIORITY at the socket level, applying a uniform priority to all packets sent through that socket. The exception to this is IP_TOS, when the priority value is calculated during the handling of ancillary data, as implemented in commit f02db315b8d8 ("ipv4: IP_TOS and IP_TTL can be specified as ancillary data"). However, this is a computed value, and there is currently no mechanism to set a custom priority via control messages prior to this patch. According to this patch, if SO_PRIORITY is specified as ancillary data, the packet is sent with the priority value set through sockc->priority, overriding the socket-level values set via the traditional setsockopt() method. This is analogous to the existing support for SO_MARK, as implemented in commit c6af0c227a22 ("ip: support SO_MARK cmsg"). If both cmsg SO_PRIORITY and IP_TOS are passed, then the one that takes precedence is the last one in the cmsg list. This patch has the side effect that raw_send_hdrinc now interprets cmsg IP_TOS. Reviewed-by: Willem de Bruijn Suggested-by: Ferenc Fejes Signed-off-by: Anna Emese Nyiri Link: https://patch.msgid.link/20241213084457.45120-3-annaemesenyiri@gmail.com Signed-off-by: Jakub Kicinski --- include/net/inet_sock.h | 2 +- include/net/ip.h | 2 +- include/net/sock.h | 4 +++- net/can/raw.c | 2 +- net/core/sock.c | 7 +++++++ net/ipv4/ip_output.c | 4 ++-- net/ipv4/ip_sockglue.c | 2 +- net/ipv4/raw.c | 2 +- net/ipv6/ip6_output.c | 3 ++- net/ipv6/ping.c | 1 + net/ipv6/raw.c | 3 ++- net/ipv6/udp.c | 1 + net/packet/af_packet.c | 2 +- 13 files changed, 24 insertions(+), 11 deletions(-) diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 56d8bc5593d3d..3ccbad881d74f 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -172,7 +172,7 @@ struct inet_cork { u8 tx_flags; __u8 ttl; __s16 tos; - char priority; + u32 priority; __u16 gso_size; u32 ts_opt_id; u64 transmit_time; diff --git a/include/net/ip.h b/include/net/ip.h index 0e548c1f2a0ec..9f5e33e371fcd 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -81,7 +81,6 @@ struct ipcm_cookie { __u8 protocol; __u8 ttl; __s16 tos; - char priority; __u16 gso_size; }; @@ -96,6 +95,7 @@ static inline void ipcm_init_sk(struct ipcm_cookie *ipcm, ipcm_init(ipcm); ipcm->sockc.mark = READ_ONCE(inet->sk.sk_mark); + ipcm->sockc.priority = READ_ONCE(inet->sk.sk_priority); ipcm->sockc.tsflags = READ_ONCE(inet->sk.sk_tsflags); ipcm->oif = READ_ONCE(inet->sk.sk_bound_dev_if); ipcm->addr = inet->inet_saddr; diff --git a/include/net/sock.h b/include/net/sock.h index 7464e9f9f47c5..316a34d6c48b7 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1814,13 +1814,15 @@ struct sockcm_cookie { u32 mark; u32 tsflags; u32 ts_opt_id; + u32 priority; }; static inline void sockcm_init(struct sockcm_cookie *sockc, const struct sock *sk) { *sockc = (struct sockcm_cookie) { - .tsflags = READ_ONCE(sk->sk_tsflags) + .tsflags = READ_ONCE(sk->sk_tsflags), + .priority = READ_ONCE(sk->sk_priority), }; } diff --git a/net/can/raw.c b/net/can/raw.c index 255c0a8f39d6b..46e8ed9d64da3 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -962,7 +962,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) } skb->dev = dev; - skb->priority = READ_ONCE(sk->sk_priority); + skb->priority = sockc.priority; skb->mark = READ_ONCE(sk->sk_mark); skb->tstamp = sockc.transmit_time; diff --git a/net/core/sock.c b/net/core/sock.c index 9016f984d44eb..a3d9941c1d325 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2947,6 +2947,13 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, case SCM_RIGHTS: case SCM_CREDENTIALS: break; + case SO_PRIORITY: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg))) + return -EPERM; + sockc->priority = *(u32 *)CMSG_DATA(cmsg); + break; default: return -EINVAL; } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index a59204a8d8505..f45a083f2c13b 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1333,7 +1333,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, cork->ttl = ipc->ttl; cork->tos = ipc->tos; cork->mark = ipc->sockc.mark; - cork->priority = ipc->priority; + cork->priority = ipc->sockc.priority; cork->transmit_time = ipc->sockc.transmit_time; cork->tx_flags = 0; sock_tx_timestamp(sk, &ipc->sockc, &cork->tx_flags); @@ -1470,7 +1470,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, ip_options_build(skb, opt, cork->addr, rt); } - skb->priority = (cork->tos != -1) ? cork->priority: READ_ONCE(sk->sk_priority); + skb->priority = cork->priority; skb->mark = cork->mark; if (sk_is_tcp(sk)) skb_set_delivery_time(skb, cork->transmit_time, SKB_CLOCK_MONOTONIC); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index cf377377b52d8..f6a03b418ddec 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -315,7 +315,7 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, if (val < 0 || val > 255) return -EINVAL; ipc->tos = val; - ipc->priority = rt_tos2priority(ipc->tos); + ipc->sockc.priority = rt_tos2priority(ipc->tos); break; case IP_PROTOCOL: if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 0e9e01967ec94..4304a68d1db05 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -358,7 +358,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, skb_reserve(skb, hlen); skb->protocol = htons(ETH_P_IP); - skb->priority = READ_ONCE(sk->sk_priority); + skb->priority = sockc->priority; skb->mark = sockc->mark; skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, sk->sk_clockid); skb_dst_set(skb, &rt->dst); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 3d672dea9f562..993106876604b 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1401,6 +1401,7 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, cork->base.gso_size = ipc6->gso_size; cork->base.tx_flags = 0; cork->base.mark = ipc6->sockc.mark; + cork->base.priority = ipc6->sockc.priority; sock_tx_timestamp(sk, &ipc6->sockc, &cork->base.tx_flags); if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) { cork->base.flags |= IPCORK_TS_OPT_ID; @@ -1942,7 +1943,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, hdr->saddr = fl6->saddr; hdr->daddr = *final_dst; - skb->priority = READ_ONCE(sk->sk_priority); + skb->priority = cork->base.priority; skb->mark = cork->base.mark; if (sk_is_tcp(sk)) skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC); diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 88b3fcacd4f94..46b8adf6e7f8e 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -119,6 +119,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) return -EINVAL; ipcm6_init_sk(&ipc6, sk); + ipc6.sockc.priority = READ_ONCE(sk->sk_priority); ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); ipc6.sockc.mark = READ_ONCE(sk->sk_mark); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 8476a3944a884..a45aba090aa41 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -619,7 +619,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, skb_reserve(skb, hlen); skb->protocol = htons(ETH_P_IPV6); - skb->priority = READ_ONCE(sk->sk_priority); + skb->priority = sockc->priority; skb->mark = sockc->mark; skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, sk->sk_clockid); @@ -780,6 +780,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipcm6_init(&ipc6); ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); ipc6.sockc.mark = fl6.flowi6_mark; + ipc6.sockc.priority = READ_ONCE(sk->sk_priority); if (sin6) { if (addr_len < SIN6_LEN_RFC2133) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index d766fd798ecf9..7c14c449804cb 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1448,6 +1448,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc6.gso_size = READ_ONCE(up->gso_size); ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); ipc6.sockc.mark = READ_ONCE(sk->sk_mark); + ipc6.sockc.priority = READ_ONCE(sk->sk_priority); /* destination address check */ if (sin6) { diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 886c0dd47b662..f8d87d622699d 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3126,7 +3126,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) skb->protocol = proto; skb->dev = dev; - skb->priority = READ_ONCE(sk->sk_priority); + skb->priority = sockc.priority; skb->mark = sockc.mark; skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid); From cda7d5abe089cc8bd6d623cd6577627d8125d155 Mon Sep 17 00:00:00 2001 From: Anna Emese Nyiri Date: Fri, 13 Dec 2024 09:44:56 +0100 Subject: [PATCH 0594/1450] selftests: net: test SO_PRIORITY ancillary data with cmsg_sender Extend cmsg_sender.c with a new option '-Q' to send SO_PRIORITY ancillary data. cmsg_so_priority.sh script added to validate SO_PRIORITY behavior by creating VLAN device with egress QoS mapping and testing packet priorities using flower filters. Verify that packets with different priorities are correctly matched and counted by filters for multiple protocols and IP versions. Reviewed-by: Willem de Bruijn Acked-by: Willem de Bruijn Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Suggested-by: Ido Schimmel Signed-off-by: Anna Emese Nyiri Link: https://patch.msgid.link/20241213084457.45120-4-annaemesenyiri@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/Makefile | 1 + tools/testing/selftests/net/cmsg_sender.c | 11 +- .../testing/selftests/net/cmsg_so_priority.sh | 151 ++++++++++++++++++ 3 files changed, 162 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/net/cmsg_so_priority.sh diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index cb2fc601de66b..f09bd96cc978b 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -32,6 +32,7 @@ TEST_PROGS += ioam6.sh TEST_PROGS += gro.sh TEST_PROGS += gre_gso.sh TEST_PROGS += cmsg_so_mark.sh +TEST_PROGS += cmsg_so_priority.sh TEST_PROGS += cmsg_time.sh cmsg_ipv6.sh TEST_PROGS += netns-name.sh TEST_PROGS += nl_netdev.py diff --git a/tools/testing/selftests/net/cmsg_sender.c b/tools/testing/selftests/net/cmsg_sender.c index 876c2db02a632..bc314382e4e1b 100644 --- a/tools/testing/selftests/net/cmsg_sender.c +++ b/tools/testing/selftests/net/cmsg_sender.c @@ -59,6 +59,7 @@ struct options { unsigned int proto; } sock; struct option_cmsg_u32 mark; + struct option_cmsg_u32 priority; struct { bool ena; unsigned int delay; @@ -97,6 +98,8 @@ static void __attribute__((noreturn)) cs_usage(const char *bin) "\n" "\t\t-m val Set SO_MARK with given value\n" "\t\t-M val Set SO_MARK via setsockopt\n" + "\t\t-P val Set SO_PRIORITY via setsockopt\n" + "\t\t-Q val Set SO_PRIORITY via cmsg\n" "\t\t-d val Set SO_TXTIME with given delay (usec)\n" "\t\t-t Enable time stamp reporting\n" "\t\t-f val Set don't fragment via cmsg\n" @@ -115,7 +118,7 @@ static void cs_parse_args(int argc, char *argv[]) { int o; - while ((o = getopt(argc, argv, "46sS:p:P:m:M:n:d:tf:F:c:C:l:L:H:")) != -1) { + while ((o = getopt(argc, argv, "46sS:p:P:m:M:n:d:tf:F:c:C:l:L:H:Q:")) != -1) { switch (o) { case 's': opt.silent_send = true; @@ -148,6 +151,10 @@ static void cs_parse_args(int argc, char *argv[]) opt.mark.ena = true; opt.mark.val = atoi(optarg); break; + case 'Q': + opt.priority.ena = true; + opt.priority.val = atoi(optarg); + break; case 'M': opt.sockopt.mark = atoi(optarg); break; @@ -252,6 +259,8 @@ cs_write_cmsg(int fd, struct msghdr *msg, char *cbuf, size_t cbuf_sz) ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len, SOL_SOCKET, SO_MARK, &opt.mark); + ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len, + SOL_SOCKET, SO_PRIORITY, &opt.priority); ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len, SOL_IPV6, IPV6_DONTFRAG, &opt.v6.dontfrag); ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len, diff --git a/tools/testing/selftests/net/cmsg_so_priority.sh b/tools/testing/selftests/net/cmsg_so_priority.sh new file mode 100755 index 0000000000000..ee07d86532625 --- /dev/null +++ b/tools/testing/selftests/net/cmsg_so_priority.sh @@ -0,0 +1,151 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source lib.sh + +readonly KSFT_SKIP=4 + +IP4=192.0.2.1/24 +TGT4=192.0.2.2 +TGT4_RAW=192.0.2.3 +IP6=2001:db8::1/64 +TGT6=2001:db8::2 +TGT6_RAW=2001:db8::3 +PORT=1234 +TOTAL_TESTS=0 +FAILED_TESTS=0 + +if ! command -v jq &> /dev/null; then + echo "SKIP cmsg_so_priroity.sh test: jq is not installed." >&2 + exit "$KSFT_SKIP" +fi + +check_result() { + ((TOTAL_TESTS++)) + if [ "$1" -ne 0 ]; then + ((FAILED_TESTS++)) + fi +} + +cleanup() +{ + cleanup_ns $NS +} + +trap cleanup EXIT + +setup_ns NS + +create_filter() { + local handle=$1 + local vlan_prio=$2 + local ip_type=$3 + local proto=$4 + local dst_ip=$5 + local ip_proto + + if [[ "$proto" == "u" ]]; then + ip_proto="udp" + elif [[ "$ip_type" == "ipv4" && "$proto" == "i" ]]; then + ip_proto="icmp" + elif [[ "$ip_type" == "ipv6" && "$proto" == "i" ]]; then + ip_proto="icmpv6" + fi + + tc -n $NS filter add dev dummy1 \ + egress pref 1 handle "$handle" proto 802.1q \ + flower vlan_prio "$vlan_prio" vlan_ethtype "$ip_type" \ + dst_ip "$dst_ip" ${ip_proto:+ip_proto $ip_proto} \ + action pass +} + +ip -n $NS link set dev lo up +ip -n $NS link add name dummy1 up type dummy + +ip -n $NS link add link dummy1 name dummy1.10 up type vlan id 10 \ + egress-qos-map 0:0 1:1 2:2 3:3 4:4 5:5 6:6 7:7 + +ip -n $NS address add $IP4 dev dummy1.10 +ip -n $NS address add $IP6 dev dummy1.10 nodad + +ip netns exec $NS sysctl -wq net.ipv4.ping_group_range='0 2147483647' + +ip -n $NS neigh add $TGT4 lladdr 00:11:22:33:44:55 nud permanent \ + dev dummy1.10 +ip -n $NS neigh add $TGT6 lladdr 00:11:22:33:44:55 nud permanent \ + dev dummy1.10 +ip -n $NS neigh add $TGT4_RAW lladdr 00:11:22:33:44:66 nud permanent \ + dev dummy1.10 +ip -n $NS neigh add $TGT6_RAW lladdr 00:11:22:33:44:66 nud permanent \ + dev dummy1.10 + +tc -n $NS qdisc add dev dummy1 clsact + +FILTER_COUNTER=10 + +for i in 4 6; do + for proto in u i r; do + echo "Test IPV$i, prot: $proto" + for priority in {0..7}; do + if [[ $i == 4 && $proto == "r" ]]; then + TGT=$TGT4_RAW + elif [[ $i == 6 && $proto == "r" ]]; then + TGT=$TGT6_RAW + elif [ $i == 4 ]; then + TGT=$TGT4 + else + TGT=$TGT6 + fi + + handle="${FILTER_COUNTER}${priority}" + + create_filter $handle $priority ipv$i $proto $TGT + + pkts=$(tc -n $NS -j -s filter show dev dummy1 egress \ + | jq ".[] | select(.options.handle == ${handle}) | \ + .options.actions[0].stats.packets") + + if [[ $pkts == 0 ]]; then + check_result 0 + else + echo "prio $priority: expected 0, got $pkts" + check_result 1 + fi + + ip netns exec $NS ./cmsg_sender -$i -Q $priority \ + -p $proto $TGT $PORT + + pkts=$(tc -n $NS -j -s filter show dev dummy1 egress \ + | jq ".[] | select(.options.handle == ${handle}) | \ + .options.actions[0].stats.packets") + if [[ $pkts == 1 ]]; then + check_result 0 + else + echo "prio $priority -Q: expected 1, got $pkts" + check_result 1 + fi + + ip netns exec $NS ./cmsg_sender -$i -P $priority \ + -p $proto $TGT $PORT + + pkts=$(tc -n $NS -j -s filter show dev dummy1 egress \ + | jq ".[] | select(.options.handle == ${handle}) | \ + .options.actions[0].stats.packets") + if [[ $pkts == 2 ]]; then + check_result 0 + else + echo "prio $priority -P: expected 2, got $pkts" + check_result 1 + fi + done + FILTER_COUNTER=$((FILTER_COUNTER + 10)) + done +done + +if [ $FAILED_TESTS -ne 0 ]; then + echo "FAIL - $FAILED_TESTS/$TOTAL_TESTS tests failed" + exit 1 +else + echo "OK - All $TOTAL_TESTS tests passed" + exit 0 +fi From e45469e594b255ef8d750ed5576698743450d2ac Mon Sep 17 00:00:00 2001 From: Anna Emese Nyiri Date: Fri, 13 Dec 2024 09:44:57 +0100 Subject: [PATCH 0595/1450] sock: Introduce SO_RCVPRIORITY socket option Add new socket option, SO_RCVPRIORITY, to include SO_PRIORITY in the ancillary data returned by recvmsg(). This is analogous to the existing support for SO_RCVMARK, as implemented in commit 6fd1d51cfa253 ("net: SO_RCVMARK socket option for SO_MARK with recvmsg()"). Reviewed-by: Willem de Bruijn Suggested-by: Ferenc Fejes Signed-off-by: Anna Emese Nyiri Link: https://patch.msgid.link/20241213084457.45120-5-annaemesenyiri@gmail.com Signed-off-by: Jakub Kicinski --- arch/alpha/include/uapi/asm/socket.h | 2 ++ arch/mips/include/uapi/asm/socket.h | 2 ++ arch/parisc/include/uapi/asm/socket.h | 2 ++ arch/sparc/include/uapi/asm/socket.h | 2 ++ include/net/sock.h | 4 +++- include/uapi/asm-generic/socket.h | 2 ++ net/core/sock.c | 8 ++++++++ net/socket.c | 11 +++++++++++ tools/include/uapi/asm-generic/socket.h | 2 ++ 9 files changed, 34 insertions(+), 1 deletion(-) diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 302507bf9b5dd..3df5f2dd4c0fa 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -148,6 +148,8 @@ #define SCM_TS_OPT_ID 81 +#define SO_RCVPRIORITY 82 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index d118d47315801..22fa8f19924a7 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -159,6 +159,8 @@ #define SCM_TS_OPT_ID 81 +#define SO_RCVPRIORITY 82 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index d268d69bfcd27..aa9cd4b951fe5 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -140,6 +140,8 @@ #define SCM_TS_OPT_ID 0x404C +#define SO_RCVPRIORITY 0x404D + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 113cd9f353e35..5b464a5686640 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -141,6 +141,8 @@ #define SCM_TS_OPT_ID 0x005a +#define SO_RCVPRIORITY 0x005b + #if !defined(__KERNEL__) diff --git a/include/net/sock.h b/include/net/sock.h index 316a34d6c48b7..d4bdd3286e03f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -953,6 +953,7 @@ enum sock_flags { SOCK_XDP, /* XDP is attached */ SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */ SOCK_RCVMARK, /* Receive SO_MARK ancillary data with packet */ + SOCK_RCVPRIORITY, /* Receive SO_PRIORITY ancillary data with packet */ }; #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) @@ -2660,7 +2661,8 @@ static inline void sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, { #define FLAGS_RECV_CMSGS ((1UL << SOCK_RXQ_OVFL) | \ (1UL << SOCK_RCVTSTAMP) | \ - (1UL << SOCK_RCVMARK)) + (1UL << SOCK_RCVMARK) |\ + (1UL << SOCK_RCVPRIORITY)) #define TSFLAGS_ANY (SOF_TIMESTAMPING_SOFTWARE | \ SOF_TIMESTAMPING_RAW_HARDWARE) diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index deacfd6dd197f..aa5016ff3d911 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -143,6 +143,8 @@ #define SCM_TS_OPT_ID 81 +#define SO_RCVPRIORITY 82 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) diff --git a/net/core/sock.c b/net/core/sock.c index a3d9941c1d325..e7bcc8952248f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1519,6 +1519,10 @@ int sk_setsockopt(struct sock *sk, int level, int optname, sock_valbool_flag(sk, SOCK_RCVMARK, valbool); break; + case SO_RCVPRIORITY: + sock_valbool_flag(sk, SOCK_RCVPRIORITY, valbool); + break; + case SO_RXQ_OVFL: sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); break; @@ -1947,6 +1951,10 @@ int sk_getsockopt(struct sock *sk, int level, int optname, v.val = sock_flag(sk, SOCK_RCVMARK); break; + case SO_RCVPRIORITY: + v.val = sock_flag(sk, SOCK_RCVPRIORITY); + break; + case SO_RXQ_OVFL: v.val = sock_flag(sk, SOCK_RXQ_OVFL); break; diff --git a/net/socket.c b/net/socket.c index 9a117248f18f1..16402b8be5a7f 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1008,12 +1008,23 @@ static void sock_recv_mark(struct msghdr *msg, struct sock *sk, } } +static void sock_recv_priority(struct msghdr *msg, struct sock *sk, + struct sk_buff *skb) +{ + if (sock_flag(sk, SOCK_RCVPRIORITY) && skb) { + __u32 priority = skb->priority; + + put_cmsg(msg, SOL_SOCKET, SO_PRIORITY, sizeof(__u32), &priority); + } +} + void __sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { sock_recv_timestamp(msg, sk, skb); sock_recv_drops(msg, sk, skb); sock_recv_mark(msg, sk, skb); + sock_recv_priority(msg, sk, skb); } EXPORT_SYMBOL_GPL(__sock_recv_cmsgs); diff --git a/tools/include/uapi/asm-generic/socket.h b/tools/include/uapi/asm-generic/socket.h index 281df9139d2b9..ffff554a52300 100644 --- a/tools/include/uapi/asm-generic/socket.h +++ b/tools/include/uapi/asm-generic/socket.h @@ -126,6 +126,8 @@ #define SCM_TS_OPT_ID 78 +#define SO_RCVPRIORITY 79 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) From e78c20f327bd94dabac68b98218dff069a8780f0 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 13 Dec 2024 13:36:57 +0100 Subject: [PATCH 0596/1450] team: Fix feature exposure when no ports are present Small follow-up to align this to an equivalent behavior as the bond driver. The change in 3625920b62c3 ("teaming: fix vlan_features computing") removed the netdevice vlan_features when there is no team port attached, yet it leaves the full set of enc_features intact. Instead, leave the default features as pre 3625920b62c3, and recompute once we do have ports attached. Also, similarly as in bonding case, call the netdev_base_features() helper on the enc_features. Fixes: 3625920b62c3 ("teaming: fix vlan_features computing") Signed-off-by: Daniel Borkmann Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20241213123657.401868-1-daniel@iogearbox.net Signed-off-by: Jakub Kicinski --- drivers/net/team/team_core.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/team/team_core.c b/drivers/net/team/team_core.c index 69ea2c3c76bfc..c7690adec8db7 100644 --- a/drivers/net/team/team_core.c +++ b/drivers/net/team/team_core.c @@ -998,9 +998,13 @@ static void __team_compute_features(struct team *team) unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; + rcu_read_lock(); + if (list_empty(&team->port_list)) + goto done; + vlan_features = netdev_base_features(vlan_features); + enc_features = netdev_base_features(enc_features); - rcu_read_lock(); list_for_each_entry_rcu(port, &team->port_list, list) { vlan_features = netdev_increment_features(vlan_features, port->dev->vlan_features, @@ -1010,11 +1014,11 @@ static void __team_compute_features(struct team *team) port->dev->hw_enc_features, TEAM_ENC_FEATURES); - dst_release_flag &= port->dev->priv_flags; if (port->dev->hard_header_len > max_hard_header_len) max_hard_header_len = port->dev->hard_header_len; } +done: rcu_read_unlock(); team->dev->vlan_features = vlan_features; From 7203d10e93b6e6e1d19481ef7907de6a9133a467 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 13 Dec 2024 17:28:11 +0300 Subject: [PATCH 0597/1450] net: hinic: Fix cleanup in create_rxqs/txqs() There is a check for NULL at the start of create_txqs() and create_rxqs() which tess if "nic_dev->txqs" is non-NULL. The intention is that if the device is already open and the queues are already created then we don't create them a second time. However, the bug is that if we have an error in the create_txqs() then the pointer doesn't get set back to NULL. The NULL check at the start of the function will say that it's already open when it's not and the device can't be used. Set ->txqs back to NULL on cleanup on error. Fixes: c3e79baf1b03 ("net-next/hinic: Add logical Txq and Rxq") Signed-off-by: Dan Carpenter Reviewed-by: Simon Horman Link: https://patch.msgid.link/0cc98faf-a0ed-4565-a55b-0fa2734bc205@stanley.mountain Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/huawei/hinic/hinic_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/huawei/hinic/hinic_main.c b/drivers/net/ethernet/huawei/hinic/hinic_main.c index 890f213da8d18..ae1f523d6841b 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_main.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_main.c @@ -172,6 +172,7 @@ static int create_txqs(struct hinic_dev *nic_dev) hinic_sq_dbgfs_uninit(nic_dev); devm_kfree(&netdev->dev, nic_dev->txqs); + nic_dev->txqs = NULL; return err; } @@ -268,6 +269,7 @@ static int create_rxqs(struct hinic_dev *nic_dev) hinic_rq_dbgfs_uninit(nic_dev); devm_kfree(&netdev->dev, nic_dev->rxqs); + nic_dev->rxqs = NULL; return err; } From b299ea0069284186b0d3d54aebe87f0d195d457a Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 13 Dec 2024 20:01:41 +0100 Subject: [PATCH 0598/1450] r8169: adjust version numbering for RTL8126 Adjust version numbering for RTL8126, so that it doesn't overlap with new RTL8125 versions. Signed-off-by: Heiner Kallweit Reviewed-by: Simon Horman Link: https://patch.msgid.link/6a354364-20e9-48ad-a198-468264288757@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169.h | 4 +- drivers/net/ethernet/realtek/r8169_main.c | 62 +++++++++---------- .../net/ethernet/realtek/r8169_phy_config.c | 4 +- 3 files changed, 35 insertions(+), 35 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169.h b/drivers/net/ethernet/realtek/r8169.h index 8904aae41acae..00d74e76c6f2e 100644 --- a/drivers/net/ethernet/realtek/r8169.h +++ b/drivers/net/ethernet/realtek/r8169.h @@ -69,8 +69,8 @@ enum mac_version { RTL_GIGA_MAC_VER_61, RTL_GIGA_MAC_VER_63, RTL_GIGA_MAC_VER_64, - RTL_GIGA_MAC_VER_65, - RTL_GIGA_MAC_VER_66, + RTL_GIGA_MAC_VER_70, + RTL_GIGA_MAC_VER_71, RTL_GIGA_MAC_NONE }; diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 6934bdee2a910..d153fa5598d5a 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -140,8 +140,8 @@ static const struct { /* reserve 62 for CFG_METHOD_4 in the vendor driver */ [RTL_GIGA_MAC_VER_63] = {"RTL8125B", FIRMWARE_8125B_2}, [RTL_GIGA_MAC_VER_64] = {"RTL8125D", FIRMWARE_8125D_1}, - [RTL_GIGA_MAC_VER_65] = {"RTL8126A", FIRMWARE_8126A_2}, - [RTL_GIGA_MAC_VER_66] = {"RTL8126A", FIRMWARE_8126A_3}, + [RTL_GIGA_MAC_VER_70] = {"RTL8126A", FIRMWARE_8126A_2}, + [RTL_GIGA_MAC_VER_71] = {"RTL8126A", FIRMWARE_8126A_3}, }; static const struct pci_device_id rtl8169_pci_tbl[] = { @@ -1228,7 +1228,7 @@ static void rtl_writephy(struct rtl8169_private *tp, int location, int val) case RTL_GIGA_MAC_VER_31: r8168dp_2_mdio_write(tp, location, val); break; - case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_66: + case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_71: r8168g_mdio_write(tp, location, val); break; default: @@ -1243,7 +1243,7 @@ static int rtl_readphy(struct rtl8169_private *tp, int location) case RTL_GIGA_MAC_VER_28: case RTL_GIGA_MAC_VER_31: return r8168dp_2_mdio_read(tp, location); - case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_66: + case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_71: return r8168g_mdio_read(tp, location); default: return r8169_mdio_read(tp, location); @@ -1574,7 +1574,7 @@ static void __rtl8169_set_wol(struct rtl8169_private *tp, u32 wolopts) break; case RTL_GIGA_MAC_VER_34: case RTL_GIGA_MAC_VER_37: - case RTL_GIGA_MAC_VER_39 ... RTL_GIGA_MAC_VER_66: + case RTL_GIGA_MAC_VER_39 ... RTL_GIGA_MAC_VER_71: r8169_mod_reg8_cond(tp, Config2, PME_SIGNAL, wolopts); break; default: @@ -2047,7 +2047,7 @@ static void rtl_set_eee_txidle_timer(struct rtl8169_private *tp) tp->tx_lpi_timer = timer_val; r8168_mac_ocp_write(tp, 0xe048, timer_val); break; - case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_66: + case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_71: tp->tx_lpi_timer = timer_val; RTL_W16(tp, EEE_TXIDLE_TIMER_8125, timer_val); break; @@ -2255,8 +2255,8 @@ static enum mac_version rtl8169_get_mac_version(u16 xid, bool gmii) enum mac_version ver; } mac_info[] = { /* 8126A family. */ - { 0x7cf, 0x64a, RTL_GIGA_MAC_VER_66 }, - { 0x7cf, 0x649, RTL_GIGA_MAC_VER_65 }, + { 0x7cf, 0x64a, RTL_GIGA_MAC_VER_71 }, + { 0x7cf, 0x649, RTL_GIGA_MAC_VER_70 }, /* 8125D family. */ { 0x7cf, 0x688, RTL_GIGA_MAC_VER_64 }, @@ -2526,7 +2526,7 @@ static void rtl_init_rxcfg(struct rtl8169_private *tp) case RTL_GIGA_MAC_VER_61: RTL_W32(tp, RxConfig, RX_FETCH_DFLT_8125 | RX_DMA_BURST); break; - case RTL_GIGA_MAC_VER_63 ... RTL_GIGA_MAC_VER_66: + case RTL_GIGA_MAC_VER_63 ... RTL_GIGA_MAC_VER_71: RTL_W32(tp, RxConfig, RX_FETCH_DFLT_8125 | RX_DMA_BURST | RX_PAUSE_SLOT_ON); break; @@ -2658,7 +2658,7 @@ static void rtl_wait_txrx_fifo_empty(struct rtl8169_private *tp) case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_61: rtl_loop_wait_high(tp, &rtl_rxtx_empty_cond, 100, 42); break; - case RTL_GIGA_MAC_VER_63 ... RTL_GIGA_MAC_VER_66: + case RTL_GIGA_MAC_VER_63 ... RTL_GIGA_MAC_VER_71: RTL_W8(tp, ChipCmd, RTL_R8(tp, ChipCmd) | StopReq); rtl_loop_wait_high(tp, &rtl_rxtx_empty_cond, 100, 42); rtl_loop_wait_high(tp, &rtl_rxtx_empty_cond_2, 100, 42); @@ -2901,7 +2901,7 @@ static void rtl_enable_exit_l1(struct rtl8169_private *tp) case RTL_GIGA_MAC_VER_37 ... RTL_GIGA_MAC_VER_38: rtl_eri_set_bits(tp, 0xd4, 0x0c00); break; - case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_66: + case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_71: r8168_mac_ocp_modify(tp, 0xc0ac, 0, 0x1f80); break; default: @@ -2915,7 +2915,7 @@ static void rtl_disable_exit_l1(struct rtl8169_private *tp) case RTL_GIGA_MAC_VER_34 ... RTL_GIGA_MAC_VER_38: rtl_eri_clear_bits(tp, 0xd4, 0x1f00); break; - case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_66: + case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_71: r8168_mac_ocp_modify(tp, 0xc0ac, 0x1f80, 0); break; default: @@ -2941,8 +2941,8 @@ static void rtl_hw_aspm_clkreq_enable(struct rtl8169_private *tp, bool enable) rtl_mod_config5(tp, 0, ASPM_en); switch (tp->mac_version) { - case RTL_GIGA_MAC_VER_65: - case RTL_GIGA_MAC_VER_66: + case RTL_GIGA_MAC_VER_70: + case RTL_GIGA_MAC_VER_71: val8 = RTL_R8(tp, INT_CFG0_8125) | INT_CFG0_CLKREQEN; RTL_W8(tp, INT_CFG0_8125, val8); break; @@ -2953,7 +2953,7 @@ static void rtl_hw_aspm_clkreq_enable(struct rtl8169_private *tp, bool enable) switch (tp->mac_version) { case RTL_GIGA_MAC_VER_46 ... RTL_GIGA_MAC_VER_48: - case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_66: + case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_71: /* reset ephy tx/rx disable timer */ r8168_mac_ocp_modify(tp, 0xe094, 0xff00, 0); /* chip can trigger L1.2 */ @@ -2965,7 +2965,7 @@ static void rtl_hw_aspm_clkreq_enable(struct rtl8169_private *tp, bool enable) } else { switch (tp->mac_version) { case RTL_GIGA_MAC_VER_46 ... RTL_GIGA_MAC_VER_48: - case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_66: + case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_71: r8168_mac_ocp_modify(tp, 0xe092, 0x00ff, 0); break; default: @@ -2973,8 +2973,8 @@ static void rtl_hw_aspm_clkreq_enable(struct rtl8169_private *tp, bool enable) } switch (tp->mac_version) { - case RTL_GIGA_MAC_VER_65: - case RTL_GIGA_MAC_VER_66: + case RTL_GIGA_MAC_VER_70: + case RTL_GIGA_MAC_VER_71: val8 = RTL_R8(tp, INT_CFG0_8125) & ~INT_CFG0_CLKREQEN; RTL_W8(tp, INT_CFG0_8125, val8); break; @@ -3694,12 +3694,12 @@ static void rtl_hw_start_8125_common(struct rtl8169_private *tp) /* disable new tx descriptor format */ r8168_mac_ocp_modify(tp, 0xeb58, 0x0001, 0x0000); - if (tp->mac_version == RTL_GIGA_MAC_VER_65 || - tp->mac_version == RTL_GIGA_MAC_VER_66) + if (tp->mac_version == RTL_GIGA_MAC_VER_70 || + tp->mac_version == RTL_GIGA_MAC_VER_71) RTL_W8(tp, 0xD8, RTL_R8(tp, 0xD8) & ~0x02); - if (tp->mac_version == RTL_GIGA_MAC_VER_65 || - tp->mac_version == RTL_GIGA_MAC_VER_66) + if (tp->mac_version == RTL_GIGA_MAC_VER_70 || + tp->mac_version == RTL_GIGA_MAC_VER_71) r8168_mac_ocp_modify(tp, 0xe614, 0x0700, 0x0400); else if (tp->mac_version == RTL_GIGA_MAC_VER_63) r8168_mac_ocp_modify(tp, 0xe614, 0x0700, 0x0200); @@ -3717,8 +3717,8 @@ static void rtl_hw_start_8125_common(struct rtl8169_private *tp) r8168_mac_ocp_modify(tp, 0xe056, 0x00f0, 0x0030); r8168_mac_ocp_modify(tp, 0xe040, 0x1000, 0x0000); r8168_mac_ocp_modify(tp, 0xea1c, 0x0003, 0x0001); - if (tp->mac_version == RTL_GIGA_MAC_VER_65 || - tp->mac_version == RTL_GIGA_MAC_VER_66) + if (tp->mac_version == RTL_GIGA_MAC_VER_70 || + tp->mac_version == RTL_GIGA_MAC_VER_71) r8168_mac_ocp_modify(tp, 0xea1c, 0x0300, 0x0000); else r8168_mac_ocp_modify(tp, 0xea1c, 0x0004, 0x0000); @@ -3837,8 +3837,8 @@ static void rtl_hw_config(struct rtl8169_private *tp) [RTL_GIGA_MAC_VER_61] = rtl_hw_start_8125a_2, [RTL_GIGA_MAC_VER_63] = rtl_hw_start_8125b, [RTL_GIGA_MAC_VER_64] = rtl_hw_start_8125d, - [RTL_GIGA_MAC_VER_65] = rtl_hw_start_8126a, - [RTL_GIGA_MAC_VER_66] = rtl_hw_start_8126a, + [RTL_GIGA_MAC_VER_70] = rtl_hw_start_8126a, + [RTL_GIGA_MAC_VER_71] = rtl_hw_start_8126a, }; if (hw_configs[tp->mac_version]) @@ -3859,8 +3859,8 @@ static void rtl_hw_start_8125(struct rtl8169_private *tp) RTL_W32(tp, i, 0); break; case RTL_GIGA_MAC_VER_63: - case RTL_GIGA_MAC_VER_65: - case RTL_GIGA_MAC_VER_66: + case RTL_GIGA_MAC_VER_70: + case RTL_GIGA_MAC_VER_71: for (i = 0xa00; i < 0xa80; i += 4) RTL_W32(tp, i, 0); RTL_W16(tp, INT_CFG1_8125, 0x0000); @@ -4092,7 +4092,7 @@ static void rtl8169_cleanup(struct rtl8169_private *tp) RTL_W8(tp, ChipCmd, RTL_R8(tp, ChipCmd) | StopReq); rtl_loop_wait_high(tp, &rtl_txcfg_empty_cond, 100, 666); break; - case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_66: + case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_71: rtl_enable_rxdvgate(tp); fsleep(2000); break; @@ -4249,7 +4249,7 @@ static unsigned int rtl_quirk_packet_padto(struct rtl8169_private *tp, switch (tp->mac_version) { case RTL_GIGA_MAC_VER_34: - case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_66: + case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_71: padto = max_t(unsigned int, padto, ETH_ZLEN); break; default: @@ -5267,7 +5267,7 @@ static void rtl_hw_initialize(struct rtl8169_private *tp) case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_48: rtl_hw_init_8168g(tp); break; - case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_66: + case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_71: rtl_hw_init_8125(tp); break; default: diff --git a/drivers/net/ethernet/realtek/r8169_phy_config.c b/drivers/net/ethernet/realtek/r8169_phy_config.c index b28b30390e84d..bc498ea780342 100644 --- a/drivers/net/ethernet/realtek/r8169_phy_config.c +++ b/drivers/net/ethernet/realtek/r8169_phy_config.c @@ -1162,8 +1162,8 @@ void r8169_hw_phy_config(struct rtl8169_private *tp, struct phy_device *phydev, [RTL_GIGA_MAC_VER_61] = rtl8125a_2_hw_phy_config, [RTL_GIGA_MAC_VER_63] = rtl8125b_hw_phy_config, [RTL_GIGA_MAC_VER_64] = rtl8125d_hw_phy_config, - [RTL_GIGA_MAC_VER_65] = rtl8126a_hw_phy_config, - [RTL_GIGA_MAC_VER_66] = rtl8126a_hw_phy_config, + [RTL_GIGA_MAC_VER_70] = rtl8126a_hw_phy_config, + [RTL_GIGA_MAC_VER_71] = rtl8126a_hw_phy_config, }; if (phy_configs[ver]) From b3593df26ab19f114d613693fa8a92ab202803d0 Mon Sep 17 00:00:00 2001 From: ChunHao Lin Date: Fri, 13 Dec 2024 20:02:58 +0100 Subject: [PATCH 0599/1450] r8169: add support for RTL8125D rev.b Add support for RTL8125D rev.b. Its XID is 0x689. It is basically based on the one with XID 0x688, but with different firmware file. Signed-off-by: ChunHao Lin [hkallweit1@gmail.com: rebased after adjusted version numbering] Signed-off-by: Heiner Kallweit Reviewed-by: Simon Horman Link: https://patch.msgid.link/75e5e9ec-d01f-43ac-b0f4-e7456baf18d1@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169.h | 1 + drivers/net/ethernet/realtek/r8169_main.c | 6 ++++++ drivers/net/ethernet/realtek/r8169_phy_config.c | 1 + 3 files changed, 8 insertions(+) diff --git a/drivers/net/ethernet/realtek/r8169.h b/drivers/net/ethernet/realtek/r8169.h index 00d74e76c6f2e..e0817f2a311ad 100644 --- a/drivers/net/ethernet/realtek/r8169.h +++ b/drivers/net/ethernet/realtek/r8169.h @@ -69,6 +69,7 @@ enum mac_version { RTL_GIGA_MAC_VER_61, RTL_GIGA_MAC_VER_63, RTL_GIGA_MAC_VER_64, + RTL_GIGA_MAC_VER_65, RTL_GIGA_MAC_VER_70, RTL_GIGA_MAC_VER_71, RTL_GIGA_MAC_NONE diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index d153fa5598d5a..5724f650f9c6a 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -57,6 +57,7 @@ #define FIRMWARE_8125A_3 "rtl_nic/rtl8125a-3.fw" #define FIRMWARE_8125B_2 "rtl_nic/rtl8125b-2.fw" #define FIRMWARE_8125D_1 "rtl_nic/rtl8125d-1.fw" +#define FIRMWARE_8125D_2 "rtl_nic/rtl8125d-2.fw" #define FIRMWARE_8126A_2 "rtl_nic/rtl8126a-2.fw" #define FIRMWARE_8126A_3 "rtl_nic/rtl8126a-3.fw" @@ -140,6 +141,7 @@ static const struct { /* reserve 62 for CFG_METHOD_4 in the vendor driver */ [RTL_GIGA_MAC_VER_63] = {"RTL8125B", FIRMWARE_8125B_2}, [RTL_GIGA_MAC_VER_64] = {"RTL8125D", FIRMWARE_8125D_1}, + [RTL_GIGA_MAC_VER_65] = {"RTL8125D", FIRMWARE_8125D_2}, [RTL_GIGA_MAC_VER_70] = {"RTL8126A", FIRMWARE_8126A_2}, [RTL_GIGA_MAC_VER_71] = {"RTL8126A", FIRMWARE_8126A_3}, }; @@ -706,6 +708,7 @@ MODULE_FIRMWARE(FIRMWARE_8107E_2); MODULE_FIRMWARE(FIRMWARE_8125A_3); MODULE_FIRMWARE(FIRMWARE_8125B_2); MODULE_FIRMWARE(FIRMWARE_8125D_1); +MODULE_FIRMWARE(FIRMWARE_8125D_2); MODULE_FIRMWARE(FIRMWARE_8126A_2); MODULE_FIRMWARE(FIRMWARE_8126A_3); @@ -2259,6 +2262,7 @@ static enum mac_version rtl8169_get_mac_version(u16 xid, bool gmii) { 0x7cf, 0x649, RTL_GIGA_MAC_VER_70 }, /* 8125D family. */ + { 0x7cf, 0x689, RTL_GIGA_MAC_VER_65 }, { 0x7cf, 0x688, RTL_GIGA_MAC_VER_64 }, /* 8125B family. */ @@ -3837,6 +3841,7 @@ static void rtl_hw_config(struct rtl8169_private *tp) [RTL_GIGA_MAC_VER_61] = rtl_hw_start_8125a_2, [RTL_GIGA_MAC_VER_63] = rtl_hw_start_8125b, [RTL_GIGA_MAC_VER_64] = rtl_hw_start_8125d, + [RTL_GIGA_MAC_VER_65] = rtl_hw_start_8125d, [RTL_GIGA_MAC_VER_70] = rtl_hw_start_8126a, [RTL_GIGA_MAC_VER_71] = rtl_hw_start_8126a, }; @@ -3855,6 +3860,7 @@ static void rtl_hw_start_8125(struct rtl8169_private *tp) switch (tp->mac_version) { case RTL_GIGA_MAC_VER_61: case RTL_GIGA_MAC_VER_64: + case RTL_GIGA_MAC_VER_65: for (i = 0xa00; i < 0xb00; i += 4) RTL_W32(tp, i, 0); break; diff --git a/drivers/net/ethernet/realtek/r8169_phy_config.c b/drivers/net/ethernet/realtek/r8169_phy_config.c index bc498ea780342..968c8a2185a47 100644 --- a/drivers/net/ethernet/realtek/r8169_phy_config.c +++ b/drivers/net/ethernet/realtek/r8169_phy_config.c @@ -1162,6 +1162,7 @@ void r8169_hw_phy_config(struct rtl8169_private *tp, struct phy_device *phydev, [RTL_GIGA_MAC_VER_61] = rtl8125a_2_hw_phy_config, [RTL_GIGA_MAC_VER_63] = rtl8125b_hw_phy_config, [RTL_GIGA_MAC_VER_64] = rtl8125d_hw_phy_config, + [RTL_GIGA_MAC_VER_65] = rtl8125d_hw_phy_config, [RTL_GIGA_MAC_VER_70] = rtl8126a_hw_phy_config, [RTL_GIGA_MAC_VER_71] = rtl8126a_hw_phy_config, }; From b4845bb6383821a9516ce30af3a27dc873e37fd4 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 17 Oct 2024 11:00:52 +0200 Subject: [PATCH 0600/1450] x86/xen: add central hypercall functions Add generic hypercall functions usable for all normal (i.e. not iret) hypercalls. Depending on the guest type and the processor vendor different functions need to be used due to the to be used instruction for entering the hypervisor: - PV guests need to use syscall - HVM/PVH guests on Intel need to use vmcall - HVM/PVH guests on AMD and Hygon need to use vmmcall As PVH guests need to issue hypercalls very early during boot, there is a 4th hypercall function needed for HVM/PVH which can be used on Intel and AMD processors. It will check the vendor type and then set the Intel or AMD specific function to use via static_call(). This is part of XSA-466 / CVE-2024-53241. Reported-by: Andrew Cooper Signed-off-by: Juergen Gross Co-developed-by: Peter Zijlstra --- arch/x86/include/asm/xen/hypercall.h | 3 + arch/x86/xen/enlighten.c | 65 ++++++++++++++++++++++ arch/x86/xen/enlighten_hvm.c | 4 ++ arch/x86/xen/enlighten_pv.c | 4 +- arch/x86/xen/xen-asm.S | 23 ++++++++ arch/x86/xen/xen-head.S | 83 ++++++++++++++++++++++++++++ arch/x86/xen/xen-ops.h | 9 +++ 7 files changed, 190 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index a2dd24947eb85..6b4dd4de08a6d 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -88,6 +88,9 @@ struct xen_dm_op_buf; extern struct { char _entry[32]; } hypercall_page[]; +void xen_hypercall_func(void); +DECLARE_STATIC_CALL(xen_hypercall, xen_hypercall_func); + #define __HYPERCALL "call hypercall_page+%c[offset]" #define __HYPERCALL_ENTRY(x) \ [offset] "i" (__HYPERVISOR_##x * sizeof(hypercall_page[0])) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 84e5adbd0925c..1887435af2fbf 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -23,6 +24,9 @@ EXPORT_SYMBOL_GPL(hypercall_page); +DEFINE_STATIC_CALL(xen_hypercall, xen_hypercall_hvm); +EXPORT_STATIC_CALL_TRAMP(xen_hypercall); + /* * Pointer to the xen_vcpu_info structure or * &HYPERVISOR_shared_info->vcpu_info[cpu]. See xen_hvm_init_shared_info @@ -68,6 +72,67 @@ EXPORT_SYMBOL(xen_start_flags); */ struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info; +static __ref void xen_get_vendor(void) +{ + init_cpu_devs(); + cpu_detect(&boot_cpu_data); + get_cpu_vendor(&boot_cpu_data); +} + +void xen_hypercall_setfunc(void) +{ + if (static_call_query(xen_hypercall) != xen_hypercall_hvm) + return; + + if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)) + static_call_update(xen_hypercall, xen_hypercall_amd); + else + static_call_update(xen_hypercall, xen_hypercall_intel); +} + +/* + * Evaluate processor vendor in order to select the correct hypercall + * function for HVM/PVH guests. + * Might be called very early in boot before vendor has been set by + * early_cpu_init(). + */ +noinstr void *__xen_hypercall_setfunc(void) +{ + void (*func)(void); + + /* + * Xen is supported only on CPUs with CPUID, so testing for + * X86_FEATURE_CPUID is a test for early_cpu_init() having been + * run. + * + * Note that __xen_hypercall_setfunc() is noinstr only due to a nasty + * dependency chain: it is being called via the xen_hypercall static + * call when running as a PVH or HVM guest. Hypercalls need to be + * noinstr due to PV guests using hypercalls in noinstr code. So we + * can safely tag the function body as "instrumentation ok", since + * the PV guest requirement is not of interest here (xen_get_vendor() + * calls noinstr functions, and static_call_update_early() might do + * so, too). + */ + instrumentation_begin(); + + if (!boot_cpu_has(X86_FEATURE_CPUID)) + xen_get_vendor(); + + if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)) + func = xen_hypercall_amd; + else + func = xen_hypercall_intel; + + static_call_update_early(xen_hypercall, func); + + instrumentation_end(); + + return func; +} + static int xen_cpu_up_online(unsigned int cpu) { xen_init_lock_cpu(cpu); diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 24d2957a4726d..973a74fc966a4 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -300,6 +300,10 @@ static uint32_t __init xen_platform_hvm(void) if (xen_pv_domain()) return 0; + /* Set correct hypercall function. */ + if (xen_domain) + xen_hypercall_setfunc(); + if (xen_pvh_domain() && nopv) { /* Guest booting via the Xen-PVH boot entry goes here */ pr_info("\"nopv\" parameter is ignored in PVH guest\n"); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index d6818c6cafda1..a8eb7e0c473cf 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1341,6 +1341,9 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si) xen_domain_type = XEN_PV_DOMAIN; xen_start_flags = xen_start_info->flags; + /* Interrupts are guaranteed to be off initially. */ + early_boot_irqs_disabled = true; + static_call_update_early(xen_hypercall, xen_hypercall_pv); xen_setup_features(); @@ -1431,7 +1434,6 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si) WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_pv, xen_cpu_dead_pv)); local_irq_disable(); - early_boot_irqs_disabled = true; xen_raw_console_write("mapping kernel into physical memory\n"); xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index ca6edfe4c14b1..b518f36d1ca2e 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -20,9 +20,32 @@ #include #include +#include #include <../entry/calling.h> .pushsection .noinstr.text, "ax" +/* + * PV hypercall interface to the hypervisor. + * + * Called via inline asm(), so better preserve %rcx and %r11. + * + * Input: + * %eax: hypercall number + * %rdi, %rsi, %rdx, %r10, %r8: args 1..5 for the hypercall + * Output: %rax + */ +SYM_FUNC_START(xen_hypercall_pv) + ANNOTATE_NOENDBR + push %rcx + push %r11 + UNWIND_HINT_SAVE + syscall + UNWIND_HINT_RESTORE + pop %r11 + pop %rcx + RET +SYM_FUNC_END(xen_hypercall_pv) + /* * Disabling events is simply a matter of making the event mask * non-zero. diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 7f6c69dbb8165..c173ba6740e93 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -6,9 +6,11 @@ #include #include +#include #include #include +#include #include #include #include @@ -87,6 +89,87 @@ SYM_CODE_END(xen_cpu_bringup_again) #endif #endif + .pushsection .noinstr.text, "ax" +/* + * Xen hypercall interface to the hypervisor. + * + * Input: + * %eax: hypercall number + * 32-bit: + * %ebx, %ecx, %edx, %esi, %edi: args 1..5 for the hypercall + * 64-bit: + * %rdi, %rsi, %rdx, %r10, %r8: args 1..5 for the hypercall + * Output: %[er]ax + */ +SYM_FUNC_START(xen_hypercall_hvm) + ENDBR + FRAME_BEGIN + /* Save all relevant registers (caller save and arguments). */ +#ifdef CONFIG_X86_32 + push %eax + push %ebx + push %ecx + push %edx + push %esi + push %edi +#else + push %rax + push %rcx + push %rdx + push %rdi + push %rsi + push %r11 + push %r10 + push %r9 + push %r8 +#ifdef CONFIG_FRAME_POINTER + pushq $0 /* Dummy push for stack alignment. */ +#endif +#endif + /* Set the vendor specific function. */ + call __xen_hypercall_setfunc + /* Set ZF = 1 if AMD, Restore saved registers. */ +#ifdef CONFIG_X86_32 + lea xen_hypercall_amd, %ebx + cmp %eax, %ebx + pop %edi + pop %esi + pop %edx + pop %ecx + pop %ebx + pop %eax +#else + lea xen_hypercall_amd(%rip), %rbx + cmp %rax, %rbx +#ifdef CONFIG_FRAME_POINTER + pop %rax /* Dummy pop. */ +#endif + pop %r8 + pop %r9 + pop %r10 + pop %r11 + pop %rsi + pop %rdi + pop %rdx + pop %rcx + pop %rax +#endif + /* Use correct hypercall function. */ + jz xen_hypercall_amd + jmp xen_hypercall_intel +SYM_FUNC_END(xen_hypercall_hvm) + +SYM_FUNC_START(xen_hypercall_amd) + vmmcall + RET +SYM_FUNC_END(xen_hypercall_amd) + +SYM_FUNC_START(xen_hypercall_intel) + vmcall + RET +SYM_FUNC_END(xen_hypercall_intel) + .popsection + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index e1b782e823e6b..63c13a2ccf556 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -326,4 +326,13 @@ static inline void xen_smp_intr_free_pv(unsigned int cpu) {} static inline void xen_smp_count_cpus(void) { } #endif /* CONFIG_SMP */ +#ifdef CONFIG_XEN_PV +void xen_hypercall_pv(void); +#endif +void xen_hypercall_hvm(void); +void xen_hypercall_amd(void); +void xen_hypercall_intel(void); +void xen_hypercall_setfunc(void); +void *__xen_hypercall_setfunc(void); + #endif /* XEN_OPS_H */ From b1c2cb86f4a7861480ad54bb9a58df3cbebf8e92 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 17 Oct 2024 14:47:13 +0200 Subject: [PATCH 0601/1450] x86/xen: use new hypercall functions instead of hypercall page Call the Xen hypervisor via the new xen_hypercall_func static-call instead of the hypercall page. This is part of XSA-466 / CVE-2024-53241. Reported-by: Andrew Cooper Signed-off-by: Juergen Gross Co-developed-by: Peter Zijlstra Co-developed-by: Josh Poimboeuf --- arch/x86/include/asm/xen/hypercall.h | 33 +++++++++++++++++----------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 6b4dd4de08a6d..7d5f8ad667741 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -39,9 +39,11 @@ #include #include #include +#include #include +#include #include #include #include @@ -91,9 +93,17 @@ extern struct { char _entry[32]; } hypercall_page[]; void xen_hypercall_func(void); DECLARE_STATIC_CALL(xen_hypercall, xen_hypercall_func); -#define __HYPERCALL "call hypercall_page+%c[offset]" -#define __HYPERCALL_ENTRY(x) \ - [offset] "i" (__HYPERVISOR_##x * sizeof(hypercall_page[0])) +#ifdef MODULE +#define __ADDRESSABLE_xen_hypercall +#else +#define __ADDRESSABLE_xen_hypercall __ADDRESSABLE_ASM_STR(__SCK__xen_hypercall) +#endif + +#define __HYPERCALL \ + __ADDRESSABLE_xen_hypercall \ + "call __SCT__xen_hypercall" + +#define __HYPERCALL_ENTRY(x) "a" (x) #ifdef CONFIG_X86_32 #define __HYPERCALL_RETREG "eax" @@ -151,7 +161,7 @@ DECLARE_STATIC_CALL(xen_hypercall, xen_hypercall_func); __HYPERCALL_0ARG(); \ asm volatile (__HYPERCALL \ : __HYPERCALL_0PARAM \ - : __HYPERCALL_ENTRY(name) \ + : __HYPERCALL_ENTRY(__HYPERVISOR_ ## name) \ : __HYPERCALL_CLOBBER0); \ (type)__res; \ }) @@ -162,7 +172,7 @@ DECLARE_STATIC_CALL(xen_hypercall, xen_hypercall_func); __HYPERCALL_1ARG(a1); \ asm volatile (__HYPERCALL \ : __HYPERCALL_1PARAM \ - : __HYPERCALL_ENTRY(name) \ + : __HYPERCALL_ENTRY(__HYPERVISOR_ ## name) \ : __HYPERCALL_CLOBBER1); \ (type)__res; \ }) @@ -173,7 +183,7 @@ DECLARE_STATIC_CALL(xen_hypercall, xen_hypercall_func); __HYPERCALL_2ARG(a1, a2); \ asm volatile (__HYPERCALL \ : __HYPERCALL_2PARAM \ - : __HYPERCALL_ENTRY(name) \ + : __HYPERCALL_ENTRY(__HYPERVISOR_ ## name) \ : __HYPERCALL_CLOBBER2); \ (type)__res; \ }) @@ -184,7 +194,7 @@ DECLARE_STATIC_CALL(xen_hypercall, xen_hypercall_func); __HYPERCALL_3ARG(a1, a2, a3); \ asm volatile (__HYPERCALL \ : __HYPERCALL_3PARAM \ - : __HYPERCALL_ENTRY(name) \ + : __HYPERCALL_ENTRY(__HYPERVISOR_ ## name) \ : __HYPERCALL_CLOBBER3); \ (type)__res; \ }) @@ -195,7 +205,7 @@ DECLARE_STATIC_CALL(xen_hypercall, xen_hypercall_func); __HYPERCALL_4ARG(a1, a2, a3, a4); \ asm volatile (__HYPERCALL \ : __HYPERCALL_4PARAM \ - : __HYPERCALL_ENTRY(name) \ + : __HYPERCALL_ENTRY(__HYPERVISOR_ ## name) \ : __HYPERCALL_CLOBBER4); \ (type)__res; \ }) @@ -209,12 +219,9 @@ xen_single_call(unsigned int call, __HYPERCALL_DECLS; __HYPERCALL_5ARG(a1, a2, a3, a4, a5); - if (call >= PAGE_SIZE / sizeof(hypercall_page[0])) - return -EINVAL; - - asm volatile(CALL_NOSPEC + asm volatile(__HYPERCALL : __HYPERCALL_5PARAM - : [thunk_target] "a" (&hypercall_page[call]) + : __HYPERCALL_ENTRY(call) : __HYPERCALL_CLOBBER5); return (long)__res; From 7fa0da5373685e7ed249af3fa317ab1e1ba8b0a6 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 17 Oct 2024 15:27:31 +0200 Subject: [PATCH 0602/1450] x86/xen: remove hypercall page The hypercall page is no longer needed. It can be removed, as from the Xen perspective it is optional. But, from Linux's perspective, it removes naked RET instructions that escape the speculative protections that Call Depth Tracking and/or Untrain Ret are trying to achieve. This is part of XSA-466 / CVE-2024-53241. Reported-by: Andrew Cooper Signed-off-by: Juergen Gross Reviewed-by: Andrew Cooper Reviewed-by: Jan Beulich --- arch/x86/include/asm/xen/hypercall.h | 2 -- arch/x86/kernel/callthunks.c | 5 ----- arch/x86/kernel/vmlinux.lds.S | 4 ---- arch/x86/xen/enlighten.c | 2 -- arch/x86/xen/enlighten_hvm.c | 9 +-------- arch/x86/xen/enlighten_pvh.c | 7 ------- arch/x86/xen/xen-head.S | 24 ------------------------ 7 files changed, 1 insertion(+), 52 deletions(-) diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 7d5f8ad667741..97771b9d33af3 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -88,8 +88,6 @@ struct xen_dm_op_buf; * there aren't more than 5 arguments...) */ -extern struct { char _entry[32]; } hypercall_page[]; - void xen_hypercall_func(void); DECLARE_STATIC_CALL(xen_hypercall, xen_hypercall_func); diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index 4656474567533..f17d166078823 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -142,11 +142,6 @@ static bool skip_addr(void *dest) if (dest >= (void *)relocate_kernel && dest < (void*)relocate_kernel + KEXEC_CONTROL_CODE_MAX_SIZE) return true; -#endif -#ifdef CONFIG_XEN - if (dest >= (void *)hypercall_page && - dest < (void*)hypercall_page + PAGE_SIZE) - return true; #endif return false; } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index fab3ac9a4574a..6a17396c8174e 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -519,14 +519,10 @@ INIT_PER_CPU(irq_stack_backing_store); * linker will never mark as relocatable. (Using just ABSOLUTE() is not * sufficient for that). */ -#ifdef CONFIG_XEN #ifdef CONFIG_XEN_PV xen_elfnote_entry_value = ABSOLUTE(xen_elfnote_entry) + ABSOLUTE(startup_xen); #endif -xen_elfnote_hypercall_page_value = - ABSOLUTE(xen_elfnote_hypercall_page) + ABSOLUTE(hypercall_page); -#endif #ifdef CONFIG_PVH xen_elfnote_phys32_entry_value = ABSOLUTE(xen_elfnote_phys32_entry) + ABSOLUTE(pvh_start_xen - LOAD_OFFSET); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 1887435af2fbf..43dcd8c7badc0 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -22,8 +22,6 @@ #include "xen-ops.h" -EXPORT_SYMBOL_GPL(hypercall_page); - DEFINE_STATIC_CALL(xen_hypercall, xen_hypercall_hvm); EXPORT_STATIC_CALL_TRAMP(xen_hypercall); diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 973a74fc966a4..fe57ff85d004b 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -106,15 +106,8 @@ static void __init init_hvm_pv_info(void) /* PVH set up hypercall page in xen_prepare_pvh(). */ if (xen_pvh_domain()) pv_info.name = "Xen PVH"; - else { - u64 pfn; - uint32_t msr; - + else pv_info.name = "Xen HVM"; - msr = cpuid_ebx(base + 2); - pfn = __pa(hypercall_page); - wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); - } xen_setup_features(); diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c index bf68c329fc013..0e3d930bcb89e 100644 --- a/arch/x86/xen/enlighten_pvh.c +++ b/arch/x86/xen/enlighten_pvh.c @@ -129,17 +129,10 @@ static void __init pvh_arch_setup(void) void __init xen_pvh_init(struct boot_params *boot_params) { - u32 msr; - u64 pfn; - xen_pvh = 1; xen_domain_type = XEN_HVM_DOMAIN; xen_start_flags = pvh_start_info.flags; - msr = cpuid_ebx(xen_cpuid_base() + 2); - pfn = __pa(hypercall_page); - wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); - x86_init.oem.arch_setup = pvh_arch_setup; x86_init.oem.banner = xen_banner; diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index c173ba6740e93..9252652afe596 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -22,28 +22,6 @@ #include #include -.pushsection .noinstr.text, "ax" - .balign PAGE_SIZE -SYM_CODE_START(hypercall_page) - .rept (PAGE_SIZE / 32) - UNWIND_HINT_FUNC - ANNOTATE_NOENDBR - ANNOTATE_UNRET_SAFE - ret - /* - * Xen will write the hypercall page, and sort out ENDBR. - */ - .skip 31, 0xcc - .endr - -#define HYPERCALL(n) \ - .equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \ - .type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32 -#include -#undef HYPERCALL -SYM_CODE_END(hypercall_page) -.popsection - #ifdef CONFIG_XEN_PV __INIT SYM_CODE_START(startup_xen) @@ -199,8 +177,6 @@ SYM_FUNC_END(xen_hypercall_intel) #else # define FEATURES_DOM0 0 #endif - ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .globl xen_elfnote_hypercall_page; - xen_elfnote_hypercall_page: _ASM_PTR xen_elfnote_hypercall_page_value - .) ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, .long FEATURES_PV | FEATURES_PVH | FEATURES_DOM0) ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") From 94901b7a74d82bfd30420f1d9d00898278fdc8bf Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Thu, 12 Dec 2024 22:00:15 +0900 Subject: [PATCH 0603/1450] rust: net::phy fix module autoloading The alias symbol name was renamed. Adjust module_phy_driver macro to create the proper symbol name to fix module autoloading. Fixes: 054a9cd395a7 ("modpost: rename alias symbol for MODULE_DEVICE_TABLE()") Signed-off-by: FUJITA Tomonori Link: https://patch.msgid.link/20241212130015.238863-1-fujita.tomonori@gmail.com Signed-off-by: Paolo Abeni --- rust/kernel/net/phy.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rust/kernel/net/phy.rs b/rust/kernel/net/phy.rs index b89c681d97c01..2fbfb6a94c11e 100644 --- a/rust/kernel/net/phy.rs +++ b/rust/kernel/net/phy.rs @@ -860,7 +860,7 @@ impl DeviceMask { /// ]; /// #[cfg(MODULE)] /// #[no_mangle] -/// static __mod_mdio__phydev_device_table: [::kernel::bindings::mdio_device_id; 2] = _DEVICE_TABLE; +/// static __mod_device_table__mdio__phydev: [::kernel::bindings::mdio_device_id; 2] = _DEVICE_TABLE; /// ``` #[macro_export] macro_rules! module_phy_driver { @@ -883,7 +883,7 @@ macro_rules! module_phy_driver { #[cfg(MODULE)] #[no_mangle] - static __mod_mdio__phydev_device_table: [$crate::bindings::mdio_device_id; + static __mod_device_table__mdio__phydev: [$crate::bindings::mdio_device_id; $crate::module_phy_driver!(@count_devices $($dev),+) + 1] = _DEVICE_TABLE; }; From abcc2ddae5f82aa6cfca162e3db643dd33f0a2e8 Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Wed, 27 Nov 2024 09:40:04 -0800 Subject: [PATCH 0604/1450] i915/guc: Reset engine utilization buffer before registration On GT reset, we store total busyness counts for all engines and re-register the utilization buffer with GuC. At that time we should reset the buffer, so that we don't get spurious busyness counts on subsequent queries. To repro this issue, run igt@perf_pmu@busy-hang followed by igt@perf_pmu@most-busy-idle-check-all for a couple iterations. Fixes: 77cdd054dd2c ("drm/i915/pmu: Connect engine busyness stats from GuC to pmu") Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: John Harrison Link: https://patchwork.freedesktop.org/patch/msgid/20241127174006.190128-2-umesh.nerlige.ramappa@intel.com (cherry picked from commit abd318237fa6556c1e5225529af145ef15d5ff0d) Signed-off-by: Tvrtko Ursulin --- .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 9ede6f240d793..b1d0c66e166ff 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -1243,6 +1243,21 @@ static void __get_engine_usage_record(struct intel_engine_cs *engine, } while (++i < 6); } +static void __set_engine_usage_record(struct intel_engine_cs *engine, + u32 last_in, u32 id, u32 total) +{ + struct iosys_map rec_map = intel_guc_engine_usage_record_map(engine); + +#define record_write(map_, field_, val_) \ + iosys_map_wr_field(map_, 0, struct guc_engine_usage_record, field_, val_) + + record_write(&rec_map, last_switch_in_stamp, last_in); + record_write(&rec_map, current_context_index, id); + record_write(&rec_map, total_runtime, total); + +#undef record_write +} + static void guc_update_engine_gt_clks(struct intel_engine_cs *engine) { struct intel_engine_guc_stats *stats = &engine->stats.guc; @@ -1543,6 +1558,9 @@ static void guc_timestamp_ping(struct work_struct *wrk) static int guc_action_enable_usage_stats(struct intel_guc *guc) { + struct intel_gt *gt = guc_to_gt(guc); + struct intel_engine_cs *engine; + enum intel_engine_id id; u32 offset = intel_guc_engine_usage_offset(guc); u32 action[] = { INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF, @@ -1550,6 +1568,9 @@ static int guc_action_enable_usage_stats(struct intel_guc *guc) 0, }; + for_each_engine(engine, gt, id) + __set_engine_usage_record(engine, 0, 0xffffffff, 0); + return intel_guc_send(guc, action, ARRAY_SIZE(action)); } From 59a0b46788d58fdcee8d2f6b4e619d264a1799bf Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Wed, 27 Nov 2024 09:40:05 -0800 Subject: [PATCH 0605/1450] i915/guc: Ensure busyness counter increases motonically Active busyness of an engine is calculated using gt timestamp and the context switch in time. While capturing the gt timestamp, it's possible that the context switches out. This race could result in an active busyness value that is greater than the actual context runtime value by a small amount. This leads to a negative delta and throws off busyness calculations for the user. If a subsequent count is smaller than the previous one, just return the previous one, since we expect the busyness to catch up. Fixes: 77cdd054dd2c ("drm/i915/pmu: Connect engine busyness stats from GuC to pmu") Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: John Harrison Link: https://patchwork.freedesktop.org/patch/msgid/20241127174006.190128-3-umesh.nerlige.ramappa@intel.com (cherry picked from commit cf907f6d294217985e9dafd9985dce874e04ca37) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/gt/intel_engine_types.h | 5 +++++ drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 5 ++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h index ba55c059063db..fe1f85e5dda33 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h @@ -343,6 +343,11 @@ struct intel_engine_guc_stats { * @start_gt_clk: GT clock time of last idle to active transition. */ u64 start_gt_clk; + + /** + * @total: The last value of total returned + */ + u64 total; }; union intel_engine_tlb_inv_reg { diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index b1d0c66e166ff..9dcf76d440d83 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -1378,9 +1378,12 @@ static ktime_t guc_engine_busyness(struct intel_engine_cs *engine, ktime_t *now) total += intel_gt_clock_interval_to_ns(gt, clk); } + if (total > stats->total) + stats->total = total; + spin_unlock_irqrestore(&guc->timestamp.lock, flags); - return ns_to_ktime(total); + return ns_to_ktime(stats->total); } static void guc_enable_busyness_worker(struct intel_guc *guc) From 1622ed27d26ab4c234476be746aa55bcd39159dd Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Wed, 27 Nov 2024 09:40:06 -0800 Subject: [PATCH 0606/1450] i915/guc: Accumulate active runtime on gt reset On gt reset, if a context is running, then accumulate it's active time into the busyness counter since there will be no chance for the context to switch out and update it's run time. v2: Move comment right above the if (John) Fixes: 77cdd054dd2c ("drm/i915/pmu: Connect engine busyness stats from GuC to pmu") Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: John Harrison Link: https://patchwork.freedesktop.org/patch/msgid/20241127174006.190128-4-umesh.nerlige.ramappa@intel.com (cherry picked from commit 7ed047da59cfa1acb558b95169d347acc8d85da1) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 9dcf76d440d83..c0bd730383f21 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -1449,8 +1449,21 @@ static void __reset_guc_busyness_stats(struct intel_guc *guc) guc_update_pm_timestamp(guc, &unused); for_each_engine(engine, gt, id) { + struct intel_engine_guc_stats *stats = &engine->stats.guc; + guc_update_engine_gt_clks(engine); - engine->stats.guc.prev_total = 0; + + /* + * If resetting a running context, accumulate the active + * time as well since there will be no context switch. + */ + if (stats->running) { + u64 clk = guc->timestamp.gt_stamp - stats->start_gt_clk; + + stats->total_gt_clks += clk; + } + stats->prev_total = 0; + stats->running = 0; } spin_unlock_irqrestore(&guc->timestamp.lock, flags); From e21ebe51af688eb98fd6269240212a3c7300deea Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Tue, 17 Dec 2024 12:21:21 +0200 Subject: [PATCH 0607/1450] xhci: Turn NEC specific quirk for handling Stop Endpoint errors generic xHC hosts from several vendors have the same issue where endpoints start so slowly that a later queued 'Stop Endpoint' command may complete before endpoint is up and running. The 'Stop Endpoint' command fails with context state error as the endpoint still appears as stopped. See commit 42b758137601 ("usb: xhci: Limit Stop Endpoint retries") for details CC: stable@vger.kernel.org Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20241217102122.2316814-2-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-ring.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index 4cf5363875c70..09b05a62375e0 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -1199,8 +1199,6 @@ static void xhci_handle_cmd_stop_ep(struct xhci_hcd *xhci, int slot_id, * Keep retrying until the EP starts and stops again, on * chips where this is known to help. Wait for 100ms. */ - if (!(xhci->quirks & XHCI_NEC_HOST)) - break; if (time_is_before_jiffies(ep->stop_time + msecs_to_jiffies(100))) break; fallthrough; From b9252f80b807801056e67e3a672fb1be0ecb81d8 Mon Sep 17 00:00:00 2001 From: Niklas Neronin Date: Tue, 17 Dec 2024 12:21:22 +0200 Subject: [PATCH 0608/1450] usb: xhci: fix ring expansion regression in 6.13-rc1 The source and destination rings were incorrectly assigned during the ring linking process. The "source" ring, which contains the new segments, was not spliced into the "destination" ring, leading to incorrect ring expansion. Fixes: fe688e500613 ("usb: xhci: refactor xhci_link_rings() to use source and destination rings") Reported-by: Jeff Chua Closes: https://lore.kernel.org/lkml/CAAJw_ZtppNqC9XA=-WVQDr+vaAS=di7jo15CzSqONeX48H75MA@mail.gmail.com/ Signed-off-by: Niklas Neronin Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20241217102122.2316814-3-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-mem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c index 15db90c54a45a..92703efda1f7b 100644 --- a/drivers/usb/host/xhci-mem.c +++ b/drivers/usb/host/xhci-mem.c @@ -436,7 +436,7 @@ int xhci_ring_expansion(struct xhci_hcd *xhci, struct xhci_ring *ring, goto free_segments; } - xhci_link_rings(xhci, ring, &new_ring); + xhci_link_rings(xhci, &new_ring, ring); trace_xhci_ring_expansion(ring); xhci_dbg_trace(xhci, trace_xhci_dbg_ring_expansion, "ring expansion succeed, now has %d segments", From 34c899af6c1a9d65aa85c765b2eecb1b9a88e8b8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 13 Dec 2024 20:08:39 +0900 Subject: [PATCH 0609/1450] af_unix: Set error only when needed in unix_stream_connect(). We will introduce skb drop reason for AF_UNIX, then we need to set an errno and a drop reason for each path. Let's set an error only when it's needed in unix_stream_connect(). Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 6b17623004439..23f419f561b83 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1575,12 +1575,12 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, goto out; } - err = -ENOMEM; - /* Allocate skb for sending to listening sock */ skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); - if (skb == NULL) + if (!skb) { + err = -ENOMEM; goto out; + } restart: /* Find listening sock. */ @@ -1600,16 +1600,17 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, goto restart; } - err = -ECONNREFUSED; - if (other->sk_state != TCP_LISTEN) - goto out_unlock; - if (other->sk_shutdown & RCV_SHUTDOWN) + if (other->sk_state != TCP_LISTEN || + other->sk_shutdown & RCV_SHUTDOWN) { + err = -ECONNREFUSED; goto out_unlock; + } if (unix_recvq_full_lockless(other)) { - err = -EAGAIN; - if (!timeo) + if (!timeo) { + err = -EAGAIN; goto out_unlock; + } timeo = unix_wait_for_peer(other, timeo); From e26ee0a736bd949ce6fa51829fd0a2f6381391de Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 13 Dec 2024 20:08:40 +0900 Subject: [PATCH 0610/1450] af_unix: Clean up error paths in unix_stream_connect(). The label order is weird in unix_stream_connect(), and all NULL checks are unnecessary if reordered. Let's clean up the error paths to make it easy to set a drop reason for each path. While at it, a comment with the old style is updated. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 23f419f561b83..21e17e739f88a 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1563,15 +1563,14 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); /* First of all allocate resources. - If we will make it after state is locked, - we will have to recheck all again in any case. + * If we will make it after state is locked, + * we will have to recheck all again in any case. */ /* create new sock for complete connection */ newsk = unix_create1(net, NULL, 0, sock->type); if (IS_ERR(newsk)) { err = PTR_ERR(newsk); - newsk = NULL; goto out; } @@ -1579,7 +1578,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); if (!skb) { err = -ENOMEM; - goto out; + goto out_free_sk; } restart: @@ -1587,8 +1586,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, other = unix_find_other(net, sunaddr, addr_len, sk->sk_type); if (IS_ERR(other)) { err = PTR_ERR(other); - other = NULL; - goto out; + goto out_free_skb; } unix_state_lock(other); @@ -1613,11 +1611,12 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, } timeo = unix_wait_for_peer(other, timeo); + sock_put(other); err = sock_intr_errno(timeo); if (signal_pending(current)) - goto out; - sock_put(other); + goto out_free_skb; + goto restart; } @@ -1702,15 +1701,13 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, return 0; out_unlock: - if (other) - unix_state_unlock(other); - -out: + unix_state_unlock(other); + sock_put(other); +out_free_skb: kfree_skb(skb); - if (newsk) - unix_release_sock(newsk, 0); - if (other) - sock_put(other); +out_free_sk: + unix_release_sock(newsk, 0); +out: return err; } From 6c444255b193b5b9c5a18c3784d960e10e1833a2 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 13 Dec 2024 20:08:41 +0900 Subject: [PATCH 0611/1450] af_unix: Set error only when needed in unix_stream_sendmsg(). We will introduce skb drop reason for AF_UNIX, then we need to set an errno and a drop reason for each path. Let's set an error only when it's needed in unix_stream_sendmsg(). Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 21e17e739f88a..660d8b8130cab 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2254,8 +2254,8 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, wait_for_unix_gc(scm.fp); - err = -EOPNOTSUPP; if (msg->msg_flags & MSG_OOB) { + err = -EOPNOTSUPP; #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (len) len--; @@ -2268,10 +2268,11 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; goto out_err; } else { - err = -ENOTCONN; other = unix_peer(sk); - if (!other) + if (!other) { + err = -ENOTCONN; goto out_err; + } } if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) From d460b04bc452cf15810b79c15381fffd9d201915 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 13 Dec 2024 20:08:42 +0900 Subject: [PATCH 0612/1450] af_unix: Clean up error paths in unix_stream_sendmsg(). If we move send_sig() to the SEND_SHUTDOWN check before the while loop, then we can reuse the same kfree_skb() after the pipe_err_free label. Let's gather the scattered kfree_skb()s in error paths. While at it, some style issues are fixed, and the pipe_err_free label is renamed to out_pipe to match other label names. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 660d8b8130cab..d30bcd50527e9 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2275,8 +2275,13 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, } } - if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) - goto pipe_err; + if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) { + if (!(msg->msg_flags & MSG_NOSIGNAL)) + send_sig(SIGPIPE, current, 0); + + err = -EPIPE; + goto out_err; + } while (sent < len) { size = len - sent; @@ -2305,20 +2310,18 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, /* Only send the fds in the first buffer */ err = unix_scm_to_skb(&scm, skb, !fds_sent); - if (err < 0) { - kfree_skb(skb); - goto out_err; - } + if (err < 0) + goto out_free; + fds_sent = true; if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { skb->ip_summed = CHECKSUM_UNNECESSARY; err = skb_splice_from_iter(skb, &msg->msg_iter, size, sk->sk_allocation); - if (err < 0) { - kfree_skb(skb); - goto out_err; - } + if (err < 0) + goto out_free; + size = err; refcount_add(size, &sk->sk_wmem_alloc); } else { @@ -2326,17 +2329,15 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, skb->data_len = data_len; skb->len = size; err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); - if (err) { - kfree_skb(skb); - goto out_err; - } + if (err) + goto out_free; } unix_state_lock(other); if (sock_flag(other, SOCK_DEAD) || (other->sk_shutdown & RCV_SHUTDOWN)) - goto pipe_err_free; + goto out_pipe; maybe_add_creds(skb, sock, other); scm_stat_add(other, skb); @@ -2359,13 +2360,13 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, return sent; -pipe_err_free: +out_pipe: unix_state_unlock(other); - kfree_skb(skb); -pipe_err: - if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL)) + if (!sent && !(msg->msg_flags & MSG_NOSIGNAL)) send_sig(SIGPIPE, current, 0); err = -EPIPE; +out_free: + kfree_skb(skb); out_err: scm_destroy(&scm); return sent ? : err; From 001a25088c35ab69bd4b2f208e47eb8acbce6353 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 13 Dec 2024 20:08:43 +0900 Subject: [PATCH 0613/1450] af_unix: Set error only when needed in unix_dgram_sendmsg(). We will introduce skb drop reason for AF_UNIX, then we need to set an errno and a drop reason for each path. Let's set an error only when it's needed in unix_dgram_sendmsg(). Then, we need not (re)set 0 to err. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index d30bcd50527e9..07d6fba99a7c4 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1978,9 +1978,10 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, wait_for_unix_gc(scm.fp); - err = -EOPNOTSUPP; - if (msg->msg_flags&MSG_OOB) + if (msg->msg_flags & MSG_OOB) { + err = -EOPNOTSUPP; goto out; + } if (msg->msg_namelen) { err = unix_validate_addr(sunaddr, msg->msg_namelen); @@ -1995,10 +1996,11 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, goto out; } else { sunaddr = NULL; - err = -ENOTCONN; other = unix_peer_get(sk); - if (!other) + if (!other) { + err = -ENOTCONN; goto out; + } } if ((test_bit(SOCK_PASSCRED, &sock->flags) || @@ -2009,9 +2011,10 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, goto out; } - err = -EMSGSIZE; - if (len > READ_ONCE(sk->sk_sndbuf) - 32) + if (len > READ_ONCE(sk->sk_sndbuf) - 32) { + err = -EMSGSIZE; goto out; + } if (len > SKB_MAX_ALLOC) { data_len = min_t(size_t, @@ -2043,9 +2046,10 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, restart: if (!other) { - err = -ECONNRESET; - if (sunaddr == NULL) + if (!sunaddr) { + err = -ECONNRESET; goto out_free; + } other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen, sk->sk_type); @@ -2065,9 +2069,11 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, sk_locked = 0; unix_state_lock(other); restart_locked: - err = -EPERM; - if (!unix_may_send(sk, other)) + + if (!unix_may_send(sk, other)) { + err = -EPERM; goto out_unlock; + } if (unlikely(sock_flag(other, SOCK_DEAD))) { /* @@ -2080,7 +2086,6 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, if (!sk_locked) unix_state_lock(sk); - err = 0; if (sk->sk_type == SOCK_SEQPACKET) { /* We are here only when racing with unix_release_sock() * is clearing @other. Never change state to TCP_CLOSE @@ -2108,9 +2113,10 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, goto restart; } - err = -EPIPE; - if (other->sk_shutdown & RCV_SHUTDOWN) + if (other->sk_shutdown & RCV_SHUTDOWN) { + err = -EPIPE; goto out_unlock; + } if (sk->sk_type != SOCK_SEQPACKET) { err = security_unix_may_send(sk->sk_socket, other->sk_socket); From f4dd63165b08ba3b72117973d5daea456f36377d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 13 Dec 2024 20:08:44 +0900 Subject: [PATCH 0614/1450] af_unix: Move !sunaddr case in unix_dgram_sendmsg(). When other is NULL in unix_dgram_sendmsg(), we check if sunaddr is NULL before looking up a receiver socket. There are three paths going through the check, but it's always false for 2 out of the 3 paths: the first socket lookup and the second 'goto restart'. The condition can be true for the first 'goto restart' only when SOCK_DEAD is flagged for the socket found with msg->msg_name. Let's move the check to the single appropriate path. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 07d6fba99a7c4..111f953849901 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2046,11 +2046,6 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, restart: if (!other) { - if (!sunaddr) { - err = -ECONNRESET; - goto out_free; - } - other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen, sk->sk_type); if (IS_ERR(other)) { @@ -2105,6 +2100,9 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, err = -ECONNREFUSED; } else { unix_state_unlock(sk); + + if (!sunaddr) + err = -ECONNRESET; } other = NULL; From 3c05329a2abe312ed85a60a325b930063f61e817 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 13 Dec 2024 20:08:45 +0900 Subject: [PATCH 0615/1450] af_unix: Use msg->{msg_name,msg_namelen} in unix_dgram_sendmsg(). In unix_dgram_sendmsg(), we use a local variable sunaddr pointing NULL or msg->msg_name based on msg->msg_namelen. Let's remove sunaddr and simplify the usage. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 111f953849901..ae74fdcf5dcd1 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1962,7 +1962,6 @@ static void scm_stat_del(struct sock *sk, struct sk_buff *skb) static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { - DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name); struct sock *sk = sock->sk, *other = NULL; struct unix_sock *u = unix_sk(sk); struct scm_cookie scm; @@ -1984,7 +1983,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, } if (msg->msg_namelen) { - err = unix_validate_addr(sunaddr, msg->msg_namelen); + err = unix_validate_addr(msg->msg_name, msg->msg_namelen); if (err) goto out; @@ -1995,7 +1994,6 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, if (err) goto out; } else { - sunaddr = NULL; other = unix_peer_get(sk); if (!other) { err = -ENOTCONN; @@ -2046,8 +2044,8 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, restart: if (!other) { - other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen, - sk->sk_type); + other = unix_find_other(sock_net(sk), msg->msg_name, + msg->msg_namelen, sk->sk_type); if (IS_ERR(other)) { err = PTR_ERR(other); other = NULL; @@ -2101,7 +2099,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, } else { unix_state_unlock(sk); - if (!sunaddr) + if (!msg->msg_namelen) err = -ECONNRESET; } From a700b43358ccc3c5ae857eeea37ff50ce0529b1c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 13 Dec 2024 20:08:46 +0900 Subject: [PATCH 0616/1450] af_unix: Split restart label in unix_dgram_sendmsg(). There are two paths jumping to the restart label in unix_dgram_sendmsg(). One requires another lookup and sk_filter(), but the other doesn't. Let's split the label to make each flow more straightforward. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index ae74fdcf5dcd1..513d0fd12e6a9 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2042,8 +2042,8 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); -restart: if (!other) { +lookup: other = unix_find_other(sock_net(sk), msg->msg_name, msg->msg_namelen, sk->sk_type); if (IS_ERR(other)) { @@ -2059,6 +2059,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, goto out_free; } +restart: sk_locked = 0; unix_state_lock(other); restart_locked: @@ -2106,7 +2107,8 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, other = NULL; if (err) goto out_free; - goto restart; + + goto lookup; } if (other->sk_shutdown & RCV_SHUTDOWN) { From 689c398885cc27d2a5bb2ad5d70324107d4a78ec Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 13 Dec 2024 20:08:47 +0900 Subject: [PATCH 0617/1450] af_unix: Defer sock_put() to clean up path in unix_dgram_sendmsg(). When other has SOCK_DEAD in unix_dgram_sendmsg(), we call sock_put() for it first and then set NULL to other before jumping to the error path. This is to skip sock_put() in the error path. Let's not set NULL to other and defer the sock_put() to the error path to clean up the labels later. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 513d0fd12e6a9..b8adfb41d11bc 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2075,7 +2075,6 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, * datagram error */ unix_state_unlock(other); - sock_put(other); if (!sk_locked) unix_state_lock(sk); @@ -2104,7 +2103,6 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, err = -ECONNRESET; } - other = NULL; if (err) goto out_free; From 106d979b85e575b0ab10224fcde5c3eb94566e05 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 13 Dec 2024 20:08:48 +0900 Subject: [PATCH 0618/1450] af_unix: Clean up SOCK_DEAD error paths in unix_dgram_sendmsg(). When other has SOCK_DEAD in unix_dgram_sendmsg(), we hold unix_state_lock() for the sender socket first. However, we do not need it for sk->sk_type. Let's move the lock down a bit. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index b8adfb41d11bc..22c689b0044fc 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2070,23 +2070,23 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, } if (unlikely(sock_flag(other, SOCK_DEAD))) { - /* - * Check with 1003.1g - what should - * datagram error - */ - unix_state_unlock(other); + /* Check with 1003.1g - what should datagram error */ - if (!sk_locked) - unix_state_lock(sk); + unix_state_unlock(other); if (sk->sk_type == SOCK_SEQPACKET) { /* We are here only when racing with unix_release_sock() * is clearing @other. Never change state to TCP_CLOSE * unlike SOCK_DGRAM wants. */ - unix_state_unlock(sk); err = -EPIPE; - } else if (unix_peer(sk) == other) { + goto out_free; + } + + if (!sk_locked) + unix_state_lock(sk); + + if (unix_peer(sk) == other) { unix_peer(sk) = NULL; unix_dgram_peer_wake_disconnect_wakeup(sk, other); @@ -2096,15 +2096,15 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, unix_dgram_disconnected(sk, other); sock_put(other); err = -ECONNREFUSED; - } else { - unix_state_unlock(sk); - - if (!msg->msg_namelen) - err = -ECONNRESET; + goto out_free; } - if (err) + unix_state_unlock(sk); + + if (!msg->msg_namelen) { + err = -ECONNRESET; goto out_free; + } goto lookup; } From 62c6db251e667e8a240dc8209c00313240120fd6 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 13 Dec 2024 20:08:49 +0900 Subject: [PATCH 0619/1450] af_unix: Clean up error paths in unix_dgram_sendmsg(). The error path is complicated in unix_dgram_sendmsg() because there are two timings when other could be non-NULL: when it's fetched from unix_peer_get() and when it's looked up by unix_find_other(). Let's move unix_peer_get() to the else branch for unix_find_other() and clean up the error paths. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 22c689b0044fc..239ce2f77d553 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1993,12 +1993,6 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, NULL); if (err) goto out; - } else { - other = unix_peer_get(sk); - if (!other) { - err = -ENOTCONN; - goto out; - } } if ((test_bit(SOCK_PASSCRED, &sock->flags) || @@ -2026,7 +2020,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, skb = sock_alloc_send_pskb(sk, len - data_len, data_len, msg->msg_flags & MSG_DONTWAIT, &err, PAGE_ALLOC_COSTLY_ORDER); - if (skb == NULL) + if (!skb) goto out; err = unix_scm_to_skb(&scm, skb, true); @@ -2042,13 +2036,18 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); - if (!other) { + if (msg->msg_namelen) { lookup: other = unix_find_other(sock_net(sk), msg->msg_name, msg->msg_namelen, sk->sk_type); if (IS_ERR(other)) { err = PTR_ERR(other); - other = NULL; + goto out_free; + } + } else { + other = unix_peer_get(sk); + if (!other) { + err = -ENOTCONN; goto out_free; } } @@ -2056,7 +2055,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, if (sk_filter(other, skb) < 0) { /* Toss the packet but do not return any error to the sender */ err = len; - goto out_free; + goto out_sock_put; } restart: @@ -2080,7 +2079,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, * unlike SOCK_DGRAM wants. */ err = -EPIPE; - goto out_free; + goto out_sock_put; } if (!sk_locked) @@ -2096,14 +2095,14 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, unix_dgram_disconnected(sk, other); sock_put(other); err = -ECONNREFUSED; - goto out_free; + goto out_sock_put; } unix_state_unlock(sk); if (!msg->msg_namelen) { err = -ECONNRESET; - goto out_free; + goto out_sock_put; } goto lookup; @@ -2132,7 +2131,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, err = sock_intr_errno(timeo); if (signal_pending(current)) - goto out_free; + goto out_sock_put; goto restart; } @@ -2173,11 +2172,11 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, if (sk_locked) unix_state_unlock(sk); unix_state_unlock(other); +out_sock_put: + sock_put(other); out_free: kfree_skb(skb); out: - if (other) - sock_put(other); scm_destroy(&scm); return err; } From bf61ffeb9cc48ee7d1945f26578291da5d9305e4 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 13 Dec 2024 20:08:50 +0900 Subject: [PATCH 0620/1450] af_unix: Remove unix_our_peer(). unix_our_peer() is used only in unix_may_send(). Let's inline it in unix_may_send(). Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 239ce2f77d553..8f2b605ce5b37 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -286,14 +286,9 @@ static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) } #endif /* CONFIG_SECURITY_NETWORK */ -static inline int unix_our_peer(struct sock *sk, struct sock *osk) -{ - return unix_peer(osk) == sk; -} - static inline int unix_may_send(struct sock *sk, struct sock *osk) { - return unix_peer(osk) == NULL || unix_our_peer(sk, osk); + return !unix_peer(osk) || unix_peer(osk) == sk; } static inline int unix_recvq_full_lockless(const struct sock *sk) From 70465acbb0ce1bb69447acf32f136c8153cda0de Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Mon, 2 Dec 2024 09:53:17 +0800 Subject: [PATCH 0621/1450] exfat: fix exfat_find_empty_entry() not returning error on failure On failure, "dentry" is the error code. If the error code indicates that there is no space, a new cluster may need to be allocated; for other errors, it should be returned directly. Only on success, "dentry" is the index of the directory entry, and it needs to be converted into the directory entry index within the cluster where it is located. Fixes: 8a3f5711ad74 ("exfat: reduce FAT chain traversal") Reported-by: syzbot+6f6c9397e0078ef60bce@syzkaller.appspotmail.com Tested-by: syzbot+6f6c9397e0078ef60bce@syzkaller.appspotmail.com Signed-off-by: Yuezhang Mo Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 97d2774760fe3..099f806450721 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -330,8 +330,8 @@ static int exfat_find_empty_entry(struct inode *inode, while ((dentry = exfat_search_empty_slot(sb, &hint_femp, p_dir, num_entries, es)) < 0) { - if (dentry == -EIO) - break; + if (dentry != -ENOSPC) + return dentry; if (exfat_check_max_dentries(inode)) return -ENOSPC; From 7d2f320e12744e5906a4fab40381060a81d22c12 Mon Sep 17 00:00:00 2001 From: Parthiban Veerasooran Date: Fri, 13 Dec 2024 18:01:58 +0530 Subject: [PATCH 0622/1450] net: ethernet: oa_tc6: fix infinite loop error when tx credits becomes 0 SPI thread wakes up to perform SPI transfer whenever there is an TX skb from n/w stack or interrupt from MAC-PHY. Ethernet frame from TX skb is transferred based on the availability tx credits in the MAC-PHY which is reported from the previous SPI transfer. Sometimes there is a possibility that TX skb is available to transmit but there is no tx credits from MAC-PHY. In this case, there will not be any SPI transfer but the thread will be running in an endless loop until tx credits available again. So checking the availability of tx credits along with TX skb will prevent the above infinite loop. When the tx credits available again that will be notified through interrupt which will trigger the SPI transfer to get the available tx credits. Fixes: 53fbde8ab21e ("net: ethernet: oa_tc6: implement transmit path to transfer tx ethernet frames") Reviewed-by: Jacob Keller Signed-off-by: Parthiban Veerasooran Signed-off-by: Paolo Abeni --- drivers/net/ethernet/oa_tc6.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/oa_tc6.c b/drivers/net/ethernet/oa_tc6.c index f9c0dcd965c2e..4c8b0ca922b72 100644 --- a/drivers/net/ethernet/oa_tc6.c +++ b/drivers/net/ethernet/oa_tc6.c @@ -1111,8 +1111,9 @@ static int oa_tc6_spi_thread_handler(void *data) /* This kthread will be waken up if there is a tx skb or mac-phy * interrupt to perform spi transfer with tx chunks. */ - wait_event_interruptible(tc6->spi_wq, tc6->waiting_tx_skb || - tc6->int_flag || + wait_event_interruptible(tc6->spi_wq, tc6->int_flag || + (tc6->waiting_tx_skb && + tc6->tx_credits) || kthread_should_stop()); if (kthread_should_stop()) From e592b5110b3e9393881b0a019d86832bbf71a47f Mon Sep 17 00:00:00 2001 From: Parthiban Veerasooran Date: Fri, 13 Dec 2024 18:01:59 +0530 Subject: [PATCH 0623/1450] net: ethernet: oa_tc6: fix tx skb race condition between reference pointers There are two skb pointers to manage tx skb's enqueued from n/w stack. waiting_tx_skb pointer points to the tx skb which needs to be processed and ongoing_tx_skb pointer points to the tx skb which is being processed. SPI thread prepares the tx data chunks from the tx skb pointed by the ongoing_tx_skb pointer. When the tx skb pointed by the ongoing_tx_skb is processed, the tx skb pointed by the waiting_tx_skb is assigned to ongoing_tx_skb and the waiting_tx_skb pointer is assigned with NULL. Whenever there is a new tx skb from n/w stack, it will be assigned to waiting_tx_skb pointer if it is NULL. Enqueuing and processing of a tx skb handled in two different threads. Consider a scenario where the SPI thread processed an ongoing_tx_skb and it moves next tx skb from waiting_tx_skb pointer to ongoing_tx_skb pointer without doing any NULL check. At this time, if the waiting_tx_skb pointer is NULL then ongoing_tx_skb pointer is also assigned with NULL. After that, if a new tx skb is assigned to waiting_tx_skb pointer by the n/w stack and there is a chance to overwrite the tx skb pointer with NULL in the SPI thread. Finally one of the tx skb will be left as unhandled, resulting packet missing and memory leak. - Consider the below scenario where the TXC reported from the previous transfer is 10 and ongoing_tx_skb holds an tx ethernet frame which can be transported in 20 TXCs and waiting_tx_skb is still NULL. tx_credits = 10; /* 21 are filled in the previous transfer */ ongoing_tx_skb = 20; waiting_tx_skb = NULL; /* Still NULL */ - So, (tc6->ongoing_tx_skb || tc6->waiting_tx_skb) becomes true. - After oa_tc6_prepare_spi_tx_buf_for_tx_skbs() ongoing_tx_skb = 10; waiting_tx_skb = NULL; /* Still NULL */ - Perform SPI transfer. - Process SPI rx buffer to get the TXC from footers. - Now let's assume previously filled 21 TXCs are freed so we are good to transport the next remaining 10 tx chunks from ongoing_tx_skb. tx_credits = 21; ongoing_tx_skb = 10; waiting_tx_skb = NULL; - So, (tc6->ongoing_tx_skb || tc6->waiting_tx_skb) becomes true again. - In the oa_tc6_prepare_spi_tx_buf_for_tx_skbs() ongoing_tx_skb = NULL; waiting_tx_skb = NULL; - Now the below bad case might happen, Thread1 (oa_tc6_start_xmit) Thread2 (oa_tc6_spi_thread_handler) --------------------------- ----------------------------------- - if waiting_tx_skb is NULL - if ongoing_tx_skb is NULL - ongoing_tx_skb = waiting_tx_skb - waiting_tx_skb = skb - waiting_tx_skb = NULL ... - ongoing_tx_skb = NULL - if waiting_tx_skb is NULL - waiting_tx_skb = skb To overcome the above issue, protect the moving of tx skb reference from waiting_tx_skb pointer to ongoing_tx_skb pointer and assigning new tx skb to waiting_tx_skb pointer, so that the other thread can't access the waiting_tx_skb pointer until the current thread completes moving the tx skb reference safely. Fixes: 53fbde8ab21e ("net: ethernet: oa_tc6: implement transmit path to transfer tx ethernet frames") Signed-off-by: Parthiban Veerasooran Reviewed-by: Larysa Zaremba Signed-off-by: Paolo Abeni --- drivers/net/ethernet/oa_tc6.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/oa_tc6.c b/drivers/net/ethernet/oa_tc6.c index 4c8b0ca922b72..db200e4ec284d 100644 --- a/drivers/net/ethernet/oa_tc6.c +++ b/drivers/net/ethernet/oa_tc6.c @@ -113,6 +113,7 @@ struct oa_tc6 { struct mii_bus *mdiobus; struct spi_device *spi; struct mutex spi_ctrl_lock; /* Protects spi control transfer */ + spinlock_t tx_skb_lock; /* Protects tx skb handling */ void *spi_ctrl_tx_buf; void *spi_ctrl_rx_buf; void *spi_data_tx_buf; @@ -1004,8 +1005,10 @@ static u16 oa_tc6_prepare_spi_tx_buf_for_tx_skbs(struct oa_tc6 *tc6) for (used_tx_credits = 0; used_tx_credits < tc6->tx_credits; used_tx_credits++) { if (!tc6->ongoing_tx_skb) { + spin_lock_bh(&tc6->tx_skb_lock); tc6->ongoing_tx_skb = tc6->waiting_tx_skb; tc6->waiting_tx_skb = NULL; + spin_unlock_bh(&tc6->tx_skb_lock); } if (!tc6->ongoing_tx_skb) break; @@ -1210,7 +1213,9 @@ netdev_tx_t oa_tc6_start_xmit(struct oa_tc6 *tc6, struct sk_buff *skb) return NETDEV_TX_OK; } + spin_lock_bh(&tc6->tx_skb_lock); tc6->waiting_tx_skb = skb; + spin_unlock_bh(&tc6->tx_skb_lock); /* Wake spi kthread to perform spi transfer */ wake_up_interruptible(&tc6->spi_wq); @@ -1240,6 +1245,7 @@ struct oa_tc6 *oa_tc6_init(struct spi_device *spi, struct net_device *netdev) tc6->netdev = netdev; SET_NETDEV_DEV(netdev, &spi->dev); mutex_init(&tc6->spi_ctrl_lock); + spin_lock_init(&tc6->tx_skb_lock); /* Set the SPI controller to pump at realtime priority */ tc6->spi->rt = true; From 0cb2c504d79e7caa3abade3f466750c82ad26f01 Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Sat, 14 Dec 2024 10:49:12 +0900 Subject: [PATCH 0624/1450] net: ethernet: bgmac-platform: fix an OF node reference leak The OF node obtained by of_parse_phandle() is not freed. Call of_node_put() to balance the refcount. This bug was found by an experimental static analysis tool that I am developing. Fixes: 1676aba5ef7e ("net: ethernet: bgmac: device tree phy enablement") Signed-off-by: Joe Hattori Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241214014912.2810315-1-joe@pf.is.s.u-tokyo.ac.jp Signed-off-by: Paolo Abeni --- drivers/net/ethernet/broadcom/bgmac-platform.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bgmac-platform.c b/drivers/net/ethernet/broadcom/bgmac-platform.c index ecce23cecbea4..4e266ce41180b 100644 --- a/drivers/net/ethernet/broadcom/bgmac-platform.c +++ b/drivers/net/ethernet/broadcom/bgmac-platform.c @@ -171,6 +171,7 @@ static int platform_phy_connect(struct bgmac *bgmac) static int bgmac_probe(struct platform_device *pdev) { struct device_node *np = pdev->dev.of_node; + struct device_node *phy_node; struct bgmac *bgmac; struct resource *regs; int ret; @@ -236,7 +237,9 @@ static int bgmac_probe(struct platform_device *pdev) bgmac->cco_ctl_maskset = platform_bgmac_cco_ctl_maskset; bgmac->get_bus_clock = platform_bgmac_get_bus_clock; bgmac->cmn_maskset32 = platform_bgmac_cmn_maskset32; - if (of_parse_phandle(np, "phy-handle", 0)) { + phy_node = of_parse_phandle(np, "phy-handle", 0); + if (phy_node) { + of_node_put(phy_node); bgmac->phy_connect = platform_phy_connect; } else { bgmac->phy_connect = bgmac_phy_connect_direct; From ff9f17ce2e53887e74fc0e72711ece42526836ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Sat, 14 Dec 2024 17:50:59 +0100 Subject: [PATCH 0625/1450] net/sched: Add drop reasons for AQM-based qdiscs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that we have generic QDISC_CONGESTED and QDISC_OVERLIMIT drop reasons, let's have all the qdiscs that contain an AQM apply them consistently when dropping packets. Signed-off-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20241214-fq-codel-drop-reasons-v1-1-2a814e884c37@redhat.com Signed-off-by: Paolo Abeni --- net/sched/sch_codel.c | 5 +++-- net/sched/sch_fq_codel.c | 3 ++- net/sched/sch_fq_pie.c | 6 ++++-- net/sched/sch_gred.c | 4 ++-- net/sched/sch_pie.c | 5 ++++- net/sched/sch_red.c | 4 +++- net/sched/sch_sfb.c | 4 +++- 7 files changed, 21 insertions(+), 10 deletions(-) diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index 3e8d4fe4d91e3..81189d02fee76 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -52,7 +52,7 @@ static void drop_func(struct sk_buff *skb, void *ctx) { struct Qdisc *sch = ctx; - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_CONGESTED); qdisc_qstats_drop(sch); } @@ -89,7 +89,8 @@ static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, } q = qdisc_priv(sch); q->drop_overlimit++; - return qdisc_drop(skb, sch, to_free); + return qdisc_drop_reason(skb, sch, to_free, + SKB_DROP_REASON_QDISC_OVERLIMIT); } static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = { diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 4f908c11ba952..799f5397ad4c1 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -168,6 +168,7 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets, skb = dequeue_head(flow); len += qdisc_pkt_len(skb); mem += get_codel_cb(skb)->mem_usage; + tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_OVERLIMIT); __qdisc_drop(skb, to_free); } while (++i < max_packets && len < threshold); @@ -274,7 +275,7 @@ static void drop_func(struct sk_buff *skb, void *ctx) { struct Qdisc *sch = ctx; - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_CONGESTED); qdisc_qstats_drop(sch); } diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c index c38f33ff80bde..93c36afbf5762 100644 --- a/net/sched/sch_fq_pie.c +++ b/net/sched/sch_fq_pie.c @@ -130,6 +130,7 @@ static inline void flow_queue_add(struct fq_pie_flow *flow, static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { + enum skb_drop_reason reason = SKB_DROP_REASON_QDISC_OVERLIMIT; struct fq_pie_sched_data *q = qdisc_priv(sch); struct fq_pie_flow *sel_flow; int ret; @@ -161,6 +162,8 @@ static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, q->overmemory++; } + reason = SKB_DROP_REASON_QDISC_CONGESTED; + if (!pie_drop_early(sch, &q->p_params, &sel_flow->vars, sel_flow->backlog, skb->len)) { enqueue = true; @@ -198,8 +201,7 @@ static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, out: q->stats.dropped++; sel_flow->vars.accu_prob = 0; - __qdisc_drop(skb, to_free); - qdisc_qstats_drop(sch); + qdisc_drop_reason(skb, sch, to_free, reason); return NET_XMIT_CN; } diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 7d2151c62c4a1..ab6234b4fcd54 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -251,10 +251,10 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch, q->stats.pdrop++; drop: - return qdisc_drop(skb, sch, to_free); + return qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_OVERLIMIT); congestion_drop: - qdisc_drop(skb, sch, to_free); + qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_CONGESTED); return NET_XMIT_CN; } diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index b3dcb845b3275..bb1fa9aa530b2 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -85,6 +85,7 @@ EXPORT_SYMBOL_GPL(pie_drop_early); static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { + enum skb_drop_reason reason = SKB_DROP_REASON_QDISC_OVERLIMIT; struct pie_sched_data *q = qdisc_priv(sch); bool enqueue = false; @@ -93,6 +94,8 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, goto out; } + reason = SKB_DROP_REASON_QDISC_CONGESTED; + if (!pie_drop_early(sch, &q->params, &q->vars, sch->qstats.backlog, skb->len)) { enqueue = true; @@ -121,7 +124,7 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, out: q->stats.dropped++; q->vars.accu_prob = 0; - return qdisc_drop(skb, sch, to_free); + return qdisc_drop_reason(skb, sch, to_free, reason); } static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = { diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 6029bc29b51e5..ef8a2afed26bd 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -70,6 +70,7 @@ static int red_use_nodrop(struct red_sched_data *q) static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { + enum skb_drop_reason reason = SKB_DROP_REASON_QDISC_CONGESTED; struct red_sched_data *q = qdisc_priv(sch); struct Qdisc *child = q->qdisc; unsigned int len; @@ -107,6 +108,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, break; case RED_HARD_MARK: + reason = SKB_DROP_REASON_QDISC_OVERLIMIT; qdisc_qstats_overlimit(sch); if (red_use_harddrop(q) || !red_use_ecn(q)) { q->stats.forced_drop++; @@ -143,7 +145,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (!skb) return NET_XMIT_CN | ret; - qdisc_drop(skb, sch, to_free); + qdisc_drop_reason(skb, sch, to_free, reason); return NET_XMIT_CN; } diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index b717e15a3a17b..d2835f1168e1d 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -280,6 +280,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { + enum skb_drop_reason reason = SKB_DROP_REASON_QDISC_OVERLIMIT; struct sfb_sched_data *q = qdisc_priv(sch); unsigned int len = qdisc_pkt_len(skb); struct Qdisc *child = q->qdisc; @@ -380,6 +381,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, } r = get_random_u16() & SFB_MAX_PROB; + reason = SKB_DROP_REASON_QDISC_CONGESTED; if (unlikely(r < p_min)) { if (unlikely(p_min > SFB_MAX_PROB / 2)) { @@ -414,7 +416,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, return ret; drop: - qdisc_drop(skb, sch, to_free); + qdisc_drop_reason(skb, sch, to_free, reason); return NET_XMIT_CN; other_drop: if (ret & __NET_XMIT_BYPASS) From d22f955cc2cb9684dd45396f974101f288869485 Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Sat, 14 Dec 2024 19:43:06 +0000 Subject: [PATCH 0626/1450] rust: net::phy scope ThisModule usage in the module_phy_driver macro Similar to the use of $crate::Module, ThisModule should be referred to as $crate::ThisModule in the macro evaluation. The reason the macro previously did not cause any errors is because all the users of the macro would use kernel::prelude::*, bringing ThisModule into scope. Signed-off-by: Rahul Rameshbabu Reviewed-by: FUJITA Tomonori Reviewed-by: Alice Ryhl Link: https://patch.msgid.link/20241214194242.19505-1-sergeantsagara@protonmail.com Signed-off-by: Paolo Abeni --- rust/kernel/net/phy.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rust/kernel/net/phy.rs b/rust/kernel/net/phy.rs index b89c681d97c01..00c3100f5ebdb 100644 --- a/rust/kernel/net/phy.rs +++ b/rust/kernel/net/phy.rs @@ -837,7 +837,7 @@ impl DeviceMask { /// [::kernel::net::phy::create_phy_driver::()]; /// /// impl ::kernel::Module for Module { -/// fn init(module: &'static ThisModule) -> Result { +/// fn init(module: &'static ::kernel::ThisModule) -> Result { /// let drivers = unsafe { &mut DRIVERS }; /// let mut reg = ::kernel::net::phy::Registration::register( /// module, @@ -903,7 +903,7 @@ macro_rules! module_phy_driver { [$($crate::net::phy::create_phy_driver::<$driver>()),+]; impl $crate::Module for Module { - fn init(module: &'static ThisModule) -> Result { + fn init(module: &'static $crate::ThisModule) -> Result { // SAFETY: The anonymous constant guarantees that nobody else can access // the `DRIVERS` static. The array is used only in the C side. let drivers = unsafe { &mut DRIVERS }; From 2b2fc0be98a828cf33a88a28e9745e8599fb05cf Mon Sep 17 00:00:00 2001 From: Zhang Kunbo Date: Tue, 17 Dec 2024 07:18:36 +0000 Subject: [PATCH 0627/1450] fs: fix missing declaration of init_files fs/file.c should include include/linux/init_task.h for declaration of init_files. This fixes the sparse warning: fs/file.c:501:21: warning: symbol 'init_files' was not declared. Should it be static? Signed-off-by: Zhang Kunbo Link: https://lore.kernel.org/r/20241217071836.2634868-1-zhangkunbo@huawei.com Signed-off-by: Christian Brauner --- fs/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/file.c b/fs/file.c index fb1011cf6b4a6..25c6e53b03f8f 100644 --- a/fs/file.c +++ b/fs/file.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "internal.h" From e8d0ba147d901022bcb69da8d8fd817f84e9f3ca Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Tue, 17 Dec 2024 11:10:19 +0200 Subject: [PATCH 0628/1450] ASoC: SOF: Intel: hda-dai: Do not release the link DMA on STOP The linkDMA should not be released on stop trigger since a stream re-start might happen without closing of the stream. This leaves a short time for other streams to 'steal' the linkDMA since it has been released. This issue is not easy to reproduce under normal conditions as usually after stop the stream is closed, or the same stream is restarted, but if another stream got in between the stop and start, like this: aplay -Dhw:0,3 -c2 -r48000 -fS32_LE /dev/zero -d 120 CTRL+z aplay -Dhw:0,0 -c2 -r48000 -fS32_LE /dev/zero -d 120 then the link DMA channels will be mixed up, resulting firmware error or crash. Fixes: ab5593793e90 ("ASoC: SOF: Intel: hda: Always clean up link DMA during stop") Cc: stable@vger.kernel.org Closes: https://github.com/thesofproject/sof/issues/9695 Signed-off-by: Peter Ujfalusi Reviewed-by: Ranjani Sridharan Reviewed-by: Liam Girdwood Reviewed-by: Bard Liao Link: https://patch.msgid.link/20241217091019.31798-1-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/intel/hda-dai.c | 25 +++++++++++++++++++------ sound/soc/sof/intel/hda.h | 2 -- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/sound/soc/sof/intel/hda-dai.c b/sound/soc/sof/intel/hda-dai.c index c13f89b7065ec..0db2a3e554fb2 100644 --- a/sound/soc/sof/intel/hda-dai.c +++ b/sound/soc/sof/intel/hda-dai.c @@ -103,8 +103,10 @@ hda_dai_get_ops(struct snd_pcm_substream *substream, struct snd_soc_dai *cpu_dai return sdai->platform_private; } -int hda_link_dma_cleanup(struct snd_pcm_substream *substream, struct hdac_ext_stream *hext_stream, - struct snd_soc_dai *cpu_dai) +static int +hda_link_dma_cleanup(struct snd_pcm_substream *substream, + struct hdac_ext_stream *hext_stream, + struct snd_soc_dai *cpu_dai, bool release) { const struct hda_dai_widget_dma_ops *ops = hda_dai_get_ops(substream, cpu_dai); struct sof_intel_hda_stream *hda_stream; @@ -128,6 +130,17 @@ int hda_link_dma_cleanup(struct snd_pcm_substream *substream, struct hdac_ext_st snd_hdac_ext_bus_link_clear_stream_id(hlink, stream_tag); } + if (!release) { + /* + * Force stream reconfiguration without releasing the channel on + * subsequent stream restart (without free), including LinkDMA + * reset. + * The stream is released via hda_dai_hw_free() + */ + hext_stream->link_prepared = 0; + return 0; + } + if (ops->release_hext_stream) ops->release_hext_stream(sdev, cpu_dai, substream); @@ -211,7 +224,7 @@ static int __maybe_unused hda_dai_hw_free(struct snd_pcm_substream *substream, if (!hext_stream) return 0; - return hda_link_dma_cleanup(substream, hext_stream, cpu_dai); + return hda_link_dma_cleanup(substream, hext_stream, cpu_dai, true); } static int __maybe_unused hda_dai_hw_params_data(struct snd_pcm_substream *substream, @@ -304,7 +317,8 @@ static int __maybe_unused hda_dai_trigger(struct snd_pcm_substream *substream, i switch (cmd) { case SNDRV_PCM_TRIGGER_STOP: case SNDRV_PCM_TRIGGER_SUSPEND: - ret = hda_link_dma_cleanup(substream, hext_stream, dai); + ret = hda_link_dma_cleanup(substream, hext_stream, dai, + cmd == SNDRV_PCM_TRIGGER_STOP ? false : true); if (ret < 0) { dev_err(sdev->dev, "%s: failed to clean up link DMA\n", __func__); return ret; @@ -660,8 +674,7 @@ static int hda_dai_suspend(struct hdac_bus *bus) } ret = hda_link_dma_cleanup(hext_stream->link_substream, - hext_stream, - cpu_dai); + hext_stream, cpu_dai, true); if (ret < 0) return ret; } diff --git a/sound/soc/sof/intel/hda.h b/sound/soc/sof/intel/hda.h index 22bd9c3c8216c..ee4ccc1a54900 100644 --- a/sound/soc/sof/intel/hda.h +++ b/sound/soc/sof/intel/hda.h @@ -1038,8 +1038,6 @@ const struct hda_dai_widget_dma_ops * hda_select_dai_widget_ops(struct snd_sof_dev *sdev, struct snd_sof_widget *swidget); int hda_dai_config(struct snd_soc_dapm_widget *w, unsigned int flags, struct snd_sof_dai_config_data *data); -int hda_link_dma_cleanup(struct snd_pcm_substream *substream, struct hdac_ext_stream *hext_stream, - struct snd_soc_dai *cpu_dai); static inline struct snd_sof_dev *widget_to_sdev(struct snd_soc_dapm_widget *w) { From 7ed2d91588779f0a2b27fd502ce2aaf1fab9b3ca Mon Sep 17 00:00:00 2001 From: Gianfranco Trad Date: Sun, 15 Dec 2024 02:17:34 +0100 Subject: [PATCH 0629/1450] qed: fix possible uninit pointer read in qed_mcp_nvm_info_populate() Coverity reports an uninit pointer read in qed_mcp_nvm_info_populate(). If EOPNOTSUPP is returned from qed_mcp_bist_nvm_get_num_images() ensure nvm_info.num_images is set to 0 to avoid possible uninit assignment to p_hwfn->nvm_info.image_att later on in out label. Closes: https://scan5.scan.coverity.com/#/project-view/63204/10063?selectedIssue=1636666 Suggested-by: Simon Horman Signed-off-by: Gianfranco Trad Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241215011733.351325-2-gianf.trad@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/qlogic/qed/qed_mcp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c index b45efc272fdbb..c7f497c36f66c 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c @@ -3358,6 +3358,7 @@ int qed_mcp_nvm_info_populate(struct qed_hwfn *p_hwfn) p_ptt, &nvm_info.num_images); if (rc == -EOPNOTSUPP) { DP_INFO(p_hwfn, "DRV_MSG_CODE_BIST_TEST is not supported\n"); + nvm_info.num_images = 0; goto out; } else if (rc || !nvm_info.num_images) { DP_ERR(p_hwfn, "Failed getting number of images\n"); From a37eecb705f33726f1fb7cd2a67e514a15dfe693 Mon Sep 17 00:00:00 2001 From: Evgenii Shatokhin Date: Mon, 9 Dec 2024 10:46:59 +0300 Subject: [PATCH 0630/1450] pinctrl: mcp23s08: Fix sleeping in atomic context due to regmap locking If a device uses MCP23xxx IO expander to receive IRQs, the following bug can happen: BUG: sleeping function called from invalid context at kernel/locking/mutex.c:283 in_atomic(): 1, irqs_disabled(): 1, non_block: 0, ... preempt_count: 1, expected: 0 ... Call Trace: ... __might_resched+0x104/0x10e __might_sleep+0x3e/0x62 mutex_lock+0x20/0x4c regmap_lock_mutex+0x10/0x18 regmap_update_bits_base+0x2c/0x66 mcp23s08_irq_set_type+0x1ae/0x1d6 __irq_set_trigger+0x56/0x172 __setup_irq+0x1e6/0x646 request_threaded_irq+0xb6/0x160 ... We observed the problem while experimenting with a touchscreen driver which used MCP23017 IO expander (I2C). The regmap in the pinctrl-mcp23s08 driver uses a mutex for protection from concurrent accesses, which is the default for regmaps without .fast_io, .disable_locking, etc. mcp23s08_irq_set_type() calls regmap_update_bits_base(), and the latter locks the mutex. However, __setup_irq() locks desc->lock spinlock before calling these functions. As a result, the system tries to lock the mutex whole holding the spinlock. It seems, the internal regmap locks are not needed in this driver at all. mcp->lock seems to protect the regmap from concurrent accesses already, except, probably, in mcp_pinconf_get/set. mcp23s08_irq_set_type() and mcp23s08_irq_mask/unmask() are called under chip_bus_lock(), which calls mcp23s08_irq_bus_lock(). The latter takes mcp->lock and enables regmap caching, so that the potentially slow I2C accesses are deferred until chip_bus_unlock(). The accesses to the regmap from mcp23s08_probe_one() do not need additional locking. In all remaining places where the regmap is accessed, except mcp_pinconf_get/set(), the driver already takes mcp->lock. This patch adds locking in mcp_pinconf_get/set() and disables internal locking in the regmap config. Among other things, it fixes the sleeping in atomic context described above. Fixes: 8f38910ba4f6 ("pinctrl: mcp23s08: switch to regmap caching") Cc: stable@vger.kernel.org Signed-off-by: Evgenii Shatokhin Link: https://lore.kernel.org/20241209074659.1442898-1-e.shatokhin@yadro.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-mcp23s08.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/pinctrl/pinctrl-mcp23s08.c b/drivers/pinctrl/pinctrl-mcp23s08.c index d66c3a3e84292..b96e6368a9568 100644 --- a/drivers/pinctrl/pinctrl-mcp23s08.c +++ b/drivers/pinctrl/pinctrl-mcp23s08.c @@ -86,6 +86,7 @@ const struct regmap_config mcp23x08_regmap = { .num_reg_defaults = ARRAY_SIZE(mcp23x08_defaults), .cache_type = REGCACHE_FLAT, .max_register = MCP_OLAT, + .disable_locking = true, /* mcp->lock protects the regmap */ }; EXPORT_SYMBOL_GPL(mcp23x08_regmap); @@ -132,6 +133,7 @@ const struct regmap_config mcp23x17_regmap = { .num_reg_defaults = ARRAY_SIZE(mcp23x17_defaults), .cache_type = REGCACHE_FLAT, .val_format_endian = REGMAP_ENDIAN_LITTLE, + .disable_locking = true, /* mcp->lock protects the regmap */ }; EXPORT_SYMBOL_GPL(mcp23x17_regmap); @@ -228,7 +230,9 @@ static int mcp_pinconf_get(struct pinctrl_dev *pctldev, unsigned int pin, switch (param) { case PIN_CONFIG_BIAS_PULL_UP: + mutex_lock(&mcp->lock); ret = mcp_read(mcp, MCP_GPPU, &data); + mutex_unlock(&mcp->lock); if (ret < 0) return ret; status = (data & BIT(pin)) ? 1 : 0; @@ -257,7 +261,9 @@ static int mcp_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, switch (param) { case PIN_CONFIG_BIAS_PULL_UP: + mutex_lock(&mcp->lock); ret = mcp_set_bit(mcp, MCP_GPPU, pin, arg); + mutex_unlock(&mcp->lock); break; default: dev_dbg(mcp->dev, "Invalid config param %04x\n", param); From 69d803c40edeaf94089fbc8751c9b746cdc35044 Mon Sep 17 00:00:00 2001 From: Yang Erkun Date: Mon, 16 Dec 2024 22:21:52 +0800 Subject: [PATCH 0631/1450] nfsd: Revert "nfsd: release svc_expkey/svc_export with rcu_work" This reverts commit f8c989a0c89a75d30f899a7cabdc14d72522bb8d. Before this commit, svc_export_put or expkey_put will call path_put with sync mode. After this commit, path_put will be called with async mode. And this can lead the unexpected results show as follow. mkfs.xfs -f /dev/sda echo "/ *(rw,no_root_squash,fsid=0)" > /etc/exports echo "/mnt *(rw,no_root_squash,fsid=1)" >> /etc/exports exportfs -ra service nfs-server start mount -t nfs -o vers=4.0 127.0.0.1:/mnt /mnt1 mount /dev/sda /mnt/sda touch /mnt1/sda/file exportfs -r umount /mnt/sda # failed unexcepted The touch will finally call nfsd_cross_mnt, add refcount to mount, and then add cache_head. Before this commit, exportfs -r will call cache_flush to cleanup all cache_head, and path_put in svc_export_put/expkey_put will be finished with sync mode. So, the latter umount will always success. However, after this commit, path_put will be called with async mode, the latter umount may failed, and if we add some delay, umount will success too. Personally I think this bug and should be fixed. We first revert before bugfix patch, and then fix the original bug with a different way. Fixes: f8c989a0c89a ("nfsd: release svc_expkey/svc_export with rcu_work") Signed-off-by: Yang Erkun Signed-off-by: Chuck Lever --- fs/nfsd/export.c | 31 ++++++------------------------- fs/nfsd/export.h | 4 ++-- 2 files changed, 8 insertions(+), 27 deletions(-) diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index eacafe46e3b67..aa4712362b3b3 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -40,24 +40,15 @@ #define EXPKEY_HASHMAX (1 << EXPKEY_HASHBITS) #define EXPKEY_HASHMASK (EXPKEY_HASHMAX -1) -static void expkey_put_work(struct work_struct *work) +static void expkey_put(struct kref *ref) { - struct svc_expkey *key = - container_of(to_rcu_work(work), struct svc_expkey, ek_rcu_work); + struct svc_expkey *key = container_of(ref, struct svc_expkey, h.ref); if (test_bit(CACHE_VALID, &key->h.flags) && !test_bit(CACHE_NEGATIVE, &key->h.flags)) path_put(&key->ek_path); auth_domain_put(key->ek_client); - kfree(key); -} - -static void expkey_put(struct kref *ref) -{ - struct svc_expkey *key = container_of(ref, struct svc_expkey, h.ref); - - INIT_RCU_WORK(&key->ek_rcu_work, expkey_put_work); - queue_rcu_work(system_wq, &key->ek_rcu_work); + kfree_rcu(key, ek_rcu); } static int expkey_upcall(struct cache_detail *cd, struct cache_head *h) @@ -364,26 +355,16 @@ static void export_stats_destroy(struct export_stats *stats) EXP_STATS_COUNTERS_NUM); } -static void svc_export_put_work(struct work_struct *work) +static void svc_export_put(struct kref *ref) { - struct svc_export *exp = - container_of(to_rcu_work(work), struct svc_export, ex_rcu_work); - + struct svc_export *exp = container_of(ref, struct svc_export, h.ref); path_put(&exp->ex_path); auth_domain_put(exp->ex_client); nfsd4_fslocs_free(&exp->ex_fslocs); export_stats_destroy(exp->ex_stats); kfree(exp->ex_stats); kfree(exp->ex_uuid); - kfree(exp); -} - -static void svc_export_put(struct kref *ref) -{ - struct svc_export *exp = container_of(ref, struct svc_export, h.ref); - - INIT_RCU_WORK(&exp->ex_rcu_work, svc_export_put_work); - queue_rcu_work(system_wq, &exp->ex_rcu_work); + kfree_rcu(exp, ex_rcu); } static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h) diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index 6f2fbaae01fa2..4d92b99c1ffdd 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -75,7 +75,7 @@ struct svc_export { u32 ex_layout_types; struct nfsd4_deviceid_map *ex_devid_map; struct cache_detail *cd; - struct rcu_work ex_rcu_work; + struct rcu_head ex_rcu; unsigned long ex_xprtsec_modes; struct export_stats *ex_stats; }; @@ -92,7 +92,7 @@ struct svc_expkey { u32 ek_fsid[6]; struct path ek_path; - struct rcu_work ek_rcu_work; + struct rcu_head ek_rcu; }; #define EX_ISSYNC(exp) (!((exp)->ex_flags & NFSEXP_ASYNC)) From 020b40f3562495f3c703a283ece145ffec19e82d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Dec 2024 08:21:46 -0700 Subject: [PATCH 0632/1450] io_uring: make ctx->timeout_lock a raw spinlock Chase reports that their tester complaints about a locking context mismatch: ============================= [ BUG: Invalid wait context ] 6.13.0-rc1-gf137f14b7ccb-dirty #9 Not tainted ----------------------------- syz.1.25198/182604 is trying to lock: ffff88805e66a358 (&ctx->timeout_lock){-.-.}-{3:3}, at: spin_lock_irq include/linux/spinlock.h:376 [inline] ffff88805e66a358 (&ctx->timeout_lock){-.-.}-{3:3}, at: io_match_task_safe io_uring/io_uring.c:218 [inline] ffff88805e66a358 (&ctx->timeout_lock){-.-.}-{3:3}, at: io_match_task_safe+0x187/0x250 io_uring/io_uring.c:204 other info that might help us debug this: context-{5:5} 1 lock held by syz.1.25198/182604: #0: ffff88802b7d48c0 (&acct->lock){+.+.}-{2:2}, at: io_acct_cancel_pending_work+0x2d/0x6b0 io_uring/io-wq.c:1049 stack backtrace: CPU: 0 UID: 0 PID: 182604 Comm: syz.1.25198 Not tainted 6.13.0-rc1-gf137f14b7ccb-dirty #9 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x82/0xd0 lib/dump_stack.c:120 print_lock_invalid_wait_context kernel/locking/lockdep.c:4826 [inline] check_wait_context kernel/locking/lockdep.c:4898 [inline] __lock_acquire+0x883/0x3c80 kernel/locking/lockdep.c:5176 lock_acquire.part.0+0x11b/0x370 kernel/locking/lockdep.c:5849 __raw_spin_lock_irq include/linux/spinlock_api_smp.h:119 [inline] _raw_spin_lock_irq+0x36/0x50 kernel/locking/spinlock.c:170 spin_lock_irq include/linux/spinlock.h:376 [inline] io_match_task_safe io_uring/io_uring.c:218 [inline] io_match_task_safe+0x187/0x250 io_uring/io_uring.c:204 io_acct_cancel_pending_work+0xb8/0x6b0 io_uring/io-wq.c:1052 io_wq_cancel_pending_work io_uring/io-wq.c:1074 [inline] io_wq_cancel_cb+0xb0/0x390 io_uring/io-wq.c:1112 io_uring_try_cancel_requests+0x15e/0xd70 io_uring/io_uring.c:3062 io_uring_cancel_generic+0x6ec/0x8c0 io_uring/io_uring.c:3140 io_uring_files_cancel include/linux/io_uring.h:20 [inline] do_exit+0x494/0x27a0 kernel/exit.c:894 do_group_exit+0xb3/0x250 kernel/exit.c:1087 get_signal+0x1d77/0x1ef0 kernel/signal.c:3017 arch_do_signal_or_restart+0x79/0x5b0 arch/x86/kernel/signal.c:337 exit_to_user_mode_loop kernel/entry/common.c:111 [inline] exit_to_user_mode_prepare include/linux/entry-common.h:329 [inline] __syscall_exit_to_user_mode_work kernel/entry/common.c:207 [inline] syscall_exit_to_user_mode+0x150/0x2a0 kernel/entry/common.c:218 do_syscall_64+0xd8/0x250 arch/x86/entry/common.c:89 entry_SYSCALL_64_after_hwframe+0x77/0x7f which is because io_uring has ctx->timeout_lock nesting inside the io-wq acct lock, the latter of which is used from inside the scheduler and hence is a raw spinlock, while the former is a "normal" spinlock and can hence be sleeping on PREEMPT_RT. Change ctx->timeout_lock to be a raw spinlock to solve this nesting dependency on PREEMPT_RT=y. Reported-by: chase xd Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- io_uring/io_uring.c | 10 ++++----- io_uring/timeout.c | 40 +++++++++++++++++----------------- 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 011860ade268e..fd4cdb0860a28 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -345,7 +345,7 @@ struct io_ring_ctx { /* timeouts */ struct { - spinlock_t timeout_lock; + raw_spinlock_t timeout_lock; struct list_head timeout_list; struct list_head ltimeout_list; unsigned cq_last_tm_flush; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 06ff41484e29c..605625e932eb8 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -215,9 +215,9 @@ bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, struct io_ring_ctx *ctx = head->ctx; /* protect against races with linked timeouts */ - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); matched = io_match_linked(head); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); } else { matched = io_match_linked(head); } @@ -333,7 +333,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->cq_wait); init_waitqueue_head(&ctx->poll_wq); spin_lock_init(&ctx->completion_lock); - spin_lock_init(&ctx->timeout_lock); + raw_spin_lock_init(&ctx->timeout_lock); INIT_WQ_LIST(&ctx->iopoll_list); INIT_LIST_HEAD(&ctx->io_buffers_comp); INIT_LIST_HEAD(&ctx->defer_list); @@ -498,10 +498,10 @@ static void io_prep_async_link(struct io_kiocb *req) if (req->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); io_for_each_link(cur, req) io_prep_async_work(cur); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); } else { io_for_each_link(cur, req) io_prep_async_work(cur); diff --git a/io_uring/timeout.c b/io_uring/timeout.c index f3d502717aebb..bbe58638eca79 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -74,10 +74,10 @@ static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) if (!io_timeout_finish(timeout, data)) { if (io_req_post_cqe(req, -ETIME, IORING_CQE_F_MORE)) { /* re-arm timer */ - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); list_add(&timeout->list, ctx->timeout_list.prev); hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); return; } } @@ -109,7 +109,7 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx) u32 seq; struct io_timeout *timeout, *tmp; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { @@ -134,7 +134,7 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx) io_kill_timeout(req, 0); } ctx->cq_last_tm_flush = seq; - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); } static void io_req_tw_fail_links(struct io_kiocb *link, struct io_tw_state *ts) @@ -200,9 +200,9 @@ void io_disarm_next(struct io_kiocb *req) } else if (req->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); link = io_disarm_linked_timeout(req); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); if (link) io_req_queue_tw_complete(link, -ECANCELED); } @@ -238,11 +238,11 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) struct io_ring_ctx *ctx = req->ctx; unsigned long flags; - spin_lock_irqsave(&ctx->timeout_lock, flags); + raw_spin_lock_irqsave(&ctx->timeout_lock, flags); list_del_init(&timeout->list); atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); - spin_unlock_irqrestore(&ctx->timeout_lock, flags); + raw_spin_unlock_irqrestore(&ctx->timeout_lock, flags); if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) req_set_fail(req); @@ -285,9 +285,9 @@ int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) { struct io_kiocb *req; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); req = io_timeout_extract(ctx, cd); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); if (IS_ERR(req)) return PTR_ERR(req); @@ -330,7 +330,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) struct io_ring_ctx *ctx = req->ctx; unsigned long flags; - spin_lock_irqsave(&ctx->timeout_lock, flags); + raw_spin_lock_irqsave(&ctx->timeout_lock, flags); prev = timeout->head; timeout->head = NULL; @@ -345,7 +345,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) } list_del(&timeout->list); timeout->prev = prev; - spin_unlock_irqrestore(&ctx->timeout_lock, flags); + raw_spin_unlock_irqrestore(&ctx->timeout_lock, flags); req->io_task_work.func = io_req_task_link_timeout; io_req_task_work_add(req); @@ -472,12 +472,12 @@ int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) } else { enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); if (tr->ltimeout) ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode); else ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); } if (ret < 0) @@ -572,7 +572,7 @@ int io_timeout(struct io_kiocb *req, unsigned int issue_flags) struct list_head *entry; u32 tail, off = timeout->off; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); /* * sqe->off holds how many events that need to occur for this @@ -611,7 +611,7 @@ int io_timeout(struct io_kiocb *req, unsigned int issue_flags) list_add(&timeout->list, entry); data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); return IOU_ISSUE_SKIP_COMPLETE; } @@ -620,7 +620,7 @@ void io_queue_linked_timeout(struct io_kiocb *req) struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_ring_ctx *ctx = req->ctx; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); /* * If the back reference is NULL, then our linked request finished * before we got a chance to setup the timer @@ -633,7 +633,7 @@ void io_queue_linked_timeout(struct io_kiocb *req) data->mode); list_add_tail(&timeout->list, &ctx->ltimeout_list); } - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); /* drop submission reference */ io_put_req(req); } @@ -668,7 +668,7 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct io_uring_task *tctx * timeout_lockfirst to keep locking ordering. */ spin_lock(&ctx->completion_lock); - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { struct io_kiocb *req = cmd_to_io_kiocb(timeout); @@ -676,7 +676,7 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct io_uring_task *tctx io_kill_timeout(req, -ECANCELED)) canceled++; } - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); spin_unlock(&ctx->completion_lock); return canceled != 0; } From 62e2a47ceab8f3f7d2e3f0e03fdd1c5e0059fd8b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 16 Dec 2024 19:28:06 -0500 Subject: [PATCH 0633/1450] NFS/pnfs: Fix a live lock between recalled layouts and layoutget When the server is recalling a layout, we should ignore the count of outstanding layoutget calls, since the server is expected to return either NFS4ERR_RECALLCONFLICT or NFS4ERR_RETURNCONFLICT for as long as the recall is outstanding. Currently, we may end up livelocking, causing the layout to eventually be forcibly revoked. Fixes: bf0291dd2267 ("pNFS: Ensure LAYOUTGET and LAYOUTRETURN are properly serialised") Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 0d16b383a4526..5f582713bf05e 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1308,7 +1308,7 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo, enum pnfs_iomode *iomode) { /* Serialise LAYOUTGET/LAYOUTRETURN */ - if (atomic_read(&lo->plh_outstanding) != 0) + if (atomic_read(&lo->plh_outstanding) != 0 && lo->plh_return_seq == 0) return false; if (test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) return false; From bedb4e6088a886f587d2ea44e0c198c8ce2182c9 Mon Sep 17 00:00:00 2001 From: Zhang Kunbo Date: Tue, 17 Dec 2024 07:19:21 +0000 Subject: [PATCH 0634/1450] fs/nfs: fix missing declaration of nfs_idmap_cache_timeout fs/nfs/super.c should include fs/nfs/nfs4idmap.h for declaration of nfs_idmap_cache_timeout. This fixes the sparse warning: fs/nfs/super.c:1397:14: warning: symbol 'nfs_idmap_cache_timeout' was not declared. Should it be static? Signed-off-by: Zhang Kunbo Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index ae5c5e39afa03..aeb715b4a6902 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -73,6 +73,7 @@ #include "nfs.h" #include "netns.h" #include "sysfs.h" +#include "nfs4idmap.h" #define NFSDBG_FACILITY NFSDBG_VFS From 185e1b1d91e419445d3fd99c1c0376a970438acf Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Mon, 16 Dec 2024 11:25:38 +0900 Subject: [PATCH 0635/1450] platform/x86: mlx-platform: call pci_dev_put() to balance the refcount MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mlxplat_pci_fpga_device_init() calls pci_get_device() but does not release the refcount on error path. Call pci_dev_put() on the error path and in mlxplat_pci_fpga_device_exit() to fix this. This bug was found by an experimental static analysis tool that I am developing. Fixes: 02daa222fbdd ("platform: mellanox: Add initial support for PCIe based programming logic device") Signed-off-by: Joe Hattori Reviewed-by: Vadim Pasternak Link: https://lore.kernel.org/r/20241216022538.381209-1-joe@pf.is.s.u-tokyo.ac.jp Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/mlx-platform.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/platform/x86/mlx-platform.c b/drivers/platform/x86/mlx-platform.c index 671021cd1f598..9c7f30a47f1f1 100644 --- a/drivers/platform/x86/mlx-platform.c +++ b/drivers/platform/x86/mlx-platform.c @@ -6237,6 +6237,7 @@ mlxplat_pci_fpga_device_init(unsigned int device, const char *res_name, struct p fail_pci_request_regions: pci_disable_device(pci_dev); fail_pci_enable_device: + pci_dev_put(pci_dev); return err; } @@ -6247,6 +6248,7 @@ mlxplat_pci_fpga_device_exit(struct pci_dev *pci_bridge, iounmap(pci_bridge_addr); pci_release_regions(pci_bridge); pci_disable_device(pci_bridge); + pci_dev_put(pci_bridge); } static int From a6629626c584200daf495cc9a740048b455addcd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Dec 2024 21:41:19 -0500 Subject: [PATCH 0636/1450] tracing: Fix test_event_printk() to process entire print argument The test_event_printk() analyzes print formats of trace events looking for cases where it may dereference a pointer that is not in the ring buffer which can possibly be a bug when the trace event is read from the ring buffer and the content of that pointer no longer exists. The function needs to accurately go from one print format argument to the next. It handles quotes and parenthesis that may be included in an argument. When it finds the start of the next argument, it uses a simple "c = strstr(fmt + i, ',')" to find the end of that argument! In order to include "%s" dereferencing, it needs to process the entire content of the print format argument and not just the content of the first ',' it finds. As there may be content like: ({ const char *saved_ptr = trace_seq_buffer_ptr(p); static const char *access_str[] = { "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" }; union kvm_mmu_page_role role; role.word = REC->role; trace_seq_printf(p, "sp gen %u gfn %llx l%u %u-byte q%u%s %s%s" " %snxe %sad root %u %s%c", REC->mmu_valid_gen, REC->gfn, role.level, role.has_4_byte_gpte ? 4 : 8, role.quadrant, role.direct ? " direct" : "", access_str[role.access], role.invalid ? " invalid" : "", role.efer_nx ? "" : "!", role.ad_disabled ? "!" : "", REC->root_count, REC->unsync ? "unsync" : "sync", 0); saved_ptr; }) Which is an example of a full argument of an existing event. As the code already handles finding the next print format argument, process the argument at the end of it and not the start of it. This way it has both the start of the argument as well as the end of it. Add a helper function "process_pointer()" that will do the processing during the loop as well as at the end. It also makes the code cleaner and easier to read. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Al Viro Cc: Linus Torvalds Link: https://lore.kernel.org/20241217024720.362271189@goodmis.org Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 82 ++++++++++++++++++++++++------------- 1 file changed, 53 insertions(+), 29 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 77e68efbd43e2..14e160a5b9054 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -265,8 +265,7 @@ static bool test_field(const char *fmt, struct trace_event_call *call) len = p - fmt; for (; field->type; field++) { - if (strncmp(field->name, fmt, len) || - field->name[len]) + if (strncmp(field->name, fmt, len) || field->name[len]) continue; array_descriptor = strchr(field->type, '['); /* This is an array and is OK to dereference. */ @@ -275,6 +274,32 @@ static bool test_field(const char *fmt, struct trace_event_call *call) return false; } +/* Return true if the argument pointer is safe */ +static bool process_pointer(const char *fmt, int len, struct trace_event_call *call) +{ + const char *r, *e, *a; + + e = fmt + len; + + /* Find the REC-> in the argument */ + r = strstr(fmt, "REC->"); + if (r && r < e) { + /* + * Addresses of events on the buffer, or an array on the buffer is + * OK to dereference. There's ways to fool this, but + * this is to catch common mistakes, not malicious code. + */ + a = strchr(fmt, '&'); + if ((a && (a < r)) || test_field(r, call)) + return true; + } else if ((r = strstr(fmt, "__get_dynamic_array(")) && r < e) { + return true; + } else if ((r = strstr(fmt, "__get_sockaddr(")) && r < e) { + return true; + } + return false; +} + /* * Examine the print fmt of the event looking for unsafe dereference * pointers using %p* that could be recorded in the trace event and @@ -285,12 +310,12 @@ static void test_event_printk(struct trace_event_call *call) { u64 dereference_flags = 0; bool first = true; - const char *fmt, *c, *r, *a; + const char *fmt; int parens = 0; char in_quote = 0; int start_arg = 0; int arg = 0; - int i; + int i, e; fmt = call->print_fmt; @@ -403,42 +428,41 @@ static void test_event_printk(struct trace_event_call *call) case ',': if (in_quote || parens) continue; + e = i; i++; while (isspace(fmt[i])) i++; - start_arg = i; - if (!(dereference_flags & (1ULL << arg))) - goto next_arg; - /* Find the REC-> in the argument */ - c = strchr(fmt + i, ','); - r = strstr(fmt + i, "REC->"); - if (r && (!c || r < c)) { - /* - * Addresses of events on the buffer, - * or an array on the buffer is - * OK to dereference. - * There's ways to fool this, but - * this is to catch common mistakes, - * not malicious code. - */ - a = strchr(fmt + i, '&'); - if ((a && (a < r)) || test_field(r, call)) + /* + * If start_arg is zero, then this is the start of the + * first argument. The processing of the argument happens + * when the end of the argument is found, as it needs to + * handle paranthesis and such. + */ + if (!start_arg) { + start_arg = i; + /* Balance out the i++ in the for loop */ + i--; + continue; + } + + if (dereference_flags & (1ULL << arg)) { + if (process_pointer(fmt + start_arg, e - start_arg, call)) dereference_flags &= ~(1ULL << arg); - } else if ((r = strstr(fmt + i, "__get_dynamic_array(")) && - (!c || r < c)) { - dereference_flags &= ~(1ULL << arg); - } else if ((r = strstr(fmt + i, "__get_sockaddr(")) && - (!c || r < c)) { - dereference_flags &= ~(1ULL << arg); } - next_arg: - i--; + start_arg = i; arg++; + /* Balance out the i++ in the for loop */ + i--; } } + if (dereference_flags & (1ULL << arg)) { + if (process_pointer(fmt + start_arg, i - start_arg, call)) + dereference_flags &= ~(1ULL << arg); + } + /* * If you triggered the below warning, the trace event reported * uses an unsafe dereference pointer %p*. As the data stored From 917110481f6bc1c96b1e54b62bb114137fbc6d17 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Dec 2024 21:41:20 -0500 Subject: [PATCH 0637/1450] tracing: Add missing helper functions in event pointer dereference check The process_pointer() helper function looks to see if various trace event macros are used. These macros are for storing data in the event. This makes it safe to dereference as the dereference will then point into the event on the ring buffer where the content of the data stays with the event itself. A few helper functions were missing. Those were: __get_rel_dynamic_array() __get_dynamic_array_len() __get_rel_dynamic_array_len() __get_rel_sockaddr() Also add a helper function find_print_string() to not need to use a middle man variable to test if the string exists. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Al Viro Cc: Linus Torvalds Link: https://lore.kernel.org/20241217024720.521836792@goodmis.org Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 14e160a5b9054..df75c06bb23fc 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -274,6 +274,15 @@ static bool test_field(const char *fmt, struct trace_event_call *call) return false; } +/* Look for a string within an argument */ +static bool find_print_string(const char *arg, const char *str, const char *end) +{ + const char *r; + + r = strstr(arg, str); + return r && r < end; +} + /* Return true if the argument pointer is safe */ static bool process_pointer(const char *fmt, int len, struct trace_event_call *call) { @@ -292,9 +301,17 @@ static bool process_pointer(const char *fmt, int len, struct trace_event_call *c a = strchr(fmt, '&'); if ((a && (a < r)) || test_field(r, call)) return true; - } else if ((r = strstr(fmt, "__get_dynamic_array(")) && r < e) { + } else if (find_print_string(fmt, "__get_dynamic_array(", e)) { + return true; + } else if (find_print_string(fmt, "__get_rel_dynamic_array(", e)) { + return true; + } else if (find_print_string(fmt, "__get_dynamic_array_len(", e)) { + return true; + } else if (find_print_string(fmt, "__get_rel_dynamic_array_len(", e)) { + return true; + } else if (find_print_string(fmt, "__get_sockaddr(", e)) { return true; - } else if ((r = strstr(fmt, "__get_sockaddr(")) && r < e) { + } else if (find_print_string(fmt, "__get_rel_sockaddr(", e)) { return true; } return false; From 65a25d9f7ac02e0cf361356e834d1c71d36acca9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Dec 2024 21:41:21 -0500 Subject: [PATCH 0638/1450] tracing: Add "%s" check in test_event_printk() The test_event_printk() code makes sure that when a trace event is registered, any dereferenced pointers in from the event's TP_printk() are pointing to content in the ring buffer. But currently it does not handle "%s", as there's cases where the string pointer saved in the ring buffer points to a static string in the kernel that will never be freed. As that is a valid case, the pointer needs to be checked at runtime. Currently the runtime check is done via trace_check_vprintf(), but to not have to replicate everything in vsnprintf() it does some logic with the va_list that may not be reliable across architectures. In order to get rid of that logic, more work in the test_event_printk() needs to be done. Some of the strings can be validated at this time when it is obvious the string is valid because the string will be saved in the ring buffer content. Do all the validation of strings in the ring buffer at boot in test_event_printk(), and make sure that the field of the strings that point into the kernel are accessible. This will allow adding checks at runtime that will validate the fields themselves and not rely on paring the TP_printk() format at runtime. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Al Viro Cc: Linus Torvalds Link: https://lore.kernel.org/20241217024720.685917008@goodmis.org Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 104 ++++++++++++++++++++++++++++++------ 1 file changed, 89 insertions(+), 15 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index df75c06bb23fc..521ad2fd1fe70 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -244,19 +244,16 @@ int trace_event_get_offsets(struct trace_event_call *call) return tail->offset + tail->size; } -/* - * Check if the referenced field is an array and return true, - * as arrays are OK to dereference. - */ -static bool test_field(const char *fmt, struct trace_event_call *call) + +static struct trace_event_fields *find_event_field(const char *fmt, + struct trace_event_call *call) { struct trace_event_fields *field = call->class->fields_array; - const char *array_descriptor; const char *p = fmt; int len; if (!(len = str_has_prefix(fmt, "REC->"))) - return false; + return NULL; fmt += len; for (p = fmt; *p; p++) { if (!isalnum(*p) && *p != '_') @@ -267,11 +264,26 @@ static bool test_field(const char *fmt, struct trace_event_call *call) for (; field->type; field++) { if (strncmp(field->name, fmt, len) || field->name[len]) continue; - array_descriptor = strchr(field->type, '['); - /* This is an array and is OK to dereference. */ - return array_descriptor != NULL; + + return field; } - return false; + return NULL; +} + +/* + * Check if the referenced field is an array and return true, + * as arrays are OK to dereference. + */ +static bool test_field(const char *fmt, struct trace_event_call *call) +{ + struct trace_event_fields *field; + + field = find_event_field(fmt, call); + if (!field) + return false; + + /* This is an array and is OK to dereference. */ + return strchr(field->type, '[') != NULL; } /* Look for a string within an argument */ @@ -317,6 +329,53 @@ static bool process_pointer(const char *fmt, int len, struct trace_event_call *c return false; } +/* Return true if the string is safe */ +static bool process_string(const char *fmt, int len, struct trace_event_call *call) +{ + const char *r, *e, *s; + + e = fmt + len; + + /* + * There are several helper functions that return strings. + * If the argument contains a function, then assume its field is valid. + * It is considered that the argument has a function if it has: + * alphanumeric or '_' before a parenthesis. + */ + s = fmt; + do { + r = strstr(s, "("); + if (!r || r >= e) + break; + for (int i = 1; r - i >= s; i++) { + char ch = *(r - i); + if (isspace(ch)) + continue; + if (isalnum(ch) || ch == '_') + return true; + /* Anything else, this isn't a function */ + break; + } + /* A function could be wrapped in parethesis, try the next one */ + s = r + 1; + } while (s < e); + + /* + * If there's any strings in the argument consider this arg OK as it + * could be: REC->field ? "foo" : "bar" and we don't want to get into + * verifying that logic here. + */ + if (find_print_string(fmt, "\"", e)) + return true; + + /* Dereferenced strings are also valid like any other pointer */ + if (process_pointer(fmt, len, call)) + return true; + + /* Make sure the field is found, and consider it OK for now if it is */ + return find_event_field(fmt, call) != NULL; +} + /* * Examine the print fmt of the event looking for unsafe dereference * pointers using %p* that could be recorded in the trace event and @@ -326,6 +385,7 @@ static bool process_pointer(const char *fmt, int len, struct trace_event_call *c static void test_event_printk(struct trace_event_call *call) { u64 dereference_flags = 0; + u64 string_flags = 0; bool first = true; const char *fmt; int parens = 0; @@ -416,8 +476,16 @@ static void test_event_printk(struct trace_event_call *call) star = true; continue; } - if ((fmt[i + j] == 's') && star) - arg++; + if ((fmt[i + j] == 's')) { + if (star) + arg++; + if (WARN_ONCE(arg == 63, + "Too many args for event: %s", + trace_event_name(call))) + return; + dereference_flags |= 1ULL << arg; + string_flags |= 1ULL << arg; + } break; } break; @@ -464,7 +532,10 @@ static void test_event_printk(struct trace_event_call *call) } if (dereference_flags & (1ULL << arg)) { - if (process_pointer(fmt + start_arg, e - start_arg, call)) + if (string_flags & (1ULL << arg)) { + if (process_string(fmt + start_arg, e - start_arg, call)) + dereference_flags &= ~(1ULL << arg); + } else if (process_pointer(fmt + start_arg, e - start_arg, call)) dereference_flags &= ~(1ULL << arg); } @@ -476,7 +547,10 @@ static void test_event_printk(struct trace_event_call *call) } if (dereference_flags & (1ULL << arg)) { - if (process_pointer(fmt + start_arg, i - start_arg, call)) + if (string_flags & (1ULL << arg)) { + if (process_string(fmt + start_arg, i - start_arg, call)) + dereference_flags &= ~(1ULL << arg); + } else if (process_pointer(fmt + start_arg, i - start_arg, call)) dereference_flags &= ~(1ULL << arg); } From afd2627f727b89496d79a6b934a025fc916d4ded Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Dec 2024 21:41:22 -0500 Subject: [PATCH 0639/1450] tracing: Check "%s" dereference via the field and not the TP_printk format The TP_printk() portion of a trace event is executed at the time a event is read from the trace. This can happen seconds, minutes, hours, days, months, years possibly later since the event was recorded. If the print format contains a dereference to a string via "%s", and that string was allocated, there's a chance that string could be freed before it is read by the trace file. To protect against such bugs, there are two functions that verify the event. The first one is test_event_printk(), which is called when the event is created. It reads the TP_printk() format as well as its arguments to make sure nothing may be dereferencing a pointer that was not copied into the ring buffer along with the event. If it is, it will trigger a WARN_ON(). For strings that use "%s", it is not so easy. The string may not reside in the ring buffer but may still be valid. Strings that are static and part of the kernel proper which will not be freed for the life of the running system, are safe to dereference. But to know if it is a pointer to a static string or to something on the heap can not be determined until the event is triggered. This brings us to the second function that tests for the bad dereferencing of strings, trace_check_vprintf(). It would walk through the printf format looking for "%s", and when it finds it, it would validate that the pointer is safe to read. If not, it would produces a WARN_ON() as well and write into the ring buffer "[UNSAFE-MEMORY]". The problem with this is how it used va_list to have vsnprintf() handle all the cases that it didn't need to check. Instead of re-implementing vsnprintf(), it would make a copy of the format up to the %s part, and call vsnprintf() with the current va_list ap variable, where the ap would then be ready to point at the string in question. For architectures that passed va_list by reference this was possible. For architectures that passed it by copy it was not. A test_can_verify() function was used to differentiate between the two, and if it wasn't possible, it would disable it. Even for architectures where this was feasible, it was a stretch to rely on such a method that is undocumented, and could cause issues later on with new optimizations of the compiler. Instead, the first function test_event_printk() was updated to look at "%s" as well. If the "%s" argument is a pointer outside the event in the ring buffer, it would find the field type of the event that is the problem and mark the structure with a new flag called "needs_test". The event itself will be marked by TRACE_EVENT_FL_TEST_STR to let it be known that this event has a field that needs to be verified before the event can be printed using the printf format. When the event fields are created from the field type structure, the fields would copy the field type's "needs_test" value. Finally, before being printed, a new function ignore_event() is called which will check if the event has the TEST_STR flag set (if not, it returns false). If the flag is set, it then iterates through the events fields looking for the ones that have the "needs_test" flag set. Then it uses the offset field from the field structure to find the pointer in the ring buffer event. It runs the tests to make sure that pointer is safe to print and if not, it triggers the WARN_ON() and also adds to the trace output that the event in question has an unsafe memory access. The ignore_event() makes the trace_check_vprintf() obsolete so it is removed. Link: https://lore.kernel.org/all/CAHk-=wh3uOnqnZPpR0PeLZZtyWbZLboZ7cHLCKRWsocvs9Y7hQ@mail.gmail.com/ Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Al Viro Cc: Linus Torvalds Link: https://lore.kernel.org/20241217024720.848621576@goodmis.org Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers") Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 6 +- kernel/trace/trace.c | 255 ++++++++--------------------------- kernel/trace/trace.h | 6 +- kernel/trace/trace_events.c | 32 +++-- kernel/trace/trace_output.c | 6 +- 5 files changed, 88 insertions(+), 217 deletions(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 2a5df5b62cfc7..91b8ffbdfa8c4 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -273,7 +273,8 @@ struct trace_event_fields { const char *name; const int size; const int align; - const int is_signed; + const unsigned int is_signed:1; + unsigned int needs_test:1; const int filter_type; const int len; }; @@ -324,6 +325,7 @@ enum { TRACE_EVENT_FL_EPROBE_BIT, TRACE_EVENT_FL_FPROBE_BIT, TRACE_EVENT_FL_CUSTOM_BIT, + TRACE_EVENT_FL_TEST_STR_BIT, }; /* @@ -340,6 +342,7 @@ enum { * CUSTOM - Event is a custom event (to be attached to an exsiting tracepoint) * This is set when the custom event has not been attached * to a tracepoint yet, then it is cleared when it is. + * TEST_STR - The event has a "%s" that points to a string outside the event */ enum { TRACE_EVENT_FL_CAP_ANY = (1 << TRACE_EVENT_FL_CAP_ANY_BIT), @@ -352,6 +355,7 @@ enum { TRACE_EVENT_FL_EPROBE = (1 << TRACE_EVENT_FL_EPROBE_BIT), TRACE_EVENT_FL_FPROBE = (1 << TRACE_EVENT_FL_FPROBE_BIT), TRACE_EVENT_FL_CUSTOM = (1 << TRACE_EVENT_FL_CUSTOM_BIT), + TRACE_EVENT_FL_TEST_STR = (1 << TRACE_EVENT_FL_TEST_STR_BIT), }; #define TRACE_EVENT_FL_UKPROBE (TRACE_EVENT_FL_KPROBE | TRACE_EVENT_FL_UPROBE) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index be62f0ea1814d..7cc18b9bce271 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3611,17 +3611,12 @@ char *trace_iter_expand_format(struct trace_iterator *iter) } /* Returns true if the string is safe to dereference from an event */ -static bool trace_safe_str(struct trace_iterator *iter, const char *str, - bool star, int len) +static bool trace_safe_str(struct trace_iterator *iter, const char *str) { unsigned long addr = (unsigned long)str; struct trace_event *trace_event; struct trace_event_call *event; - /* Ignore strings with no length */ - if (star && !len) - return true; - /* OK if part of the event data */ if ((addr >= (unsigned long)iter->ent) && (addr < (unsigned long)iter->ent + iter->ent_size)) @@ -3661,181 +3656,69 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str, return false; } -static DEFINE_STATIC_KEY_FALSE(trace_no_verify); - -static int test_can_verify_check(const char *fmt, ...) -{ - char buf[16]; - va_list ap; - int ret; - - /* - * The verifier is dependent on vsnprintf() modifies the va_list - * passed to it, where it is sent as a reference. Some architectures - * (like x86_32) passes it by value, which means that vsnprintf() - * does not modify the va_list passed to it, and the verifier - * would then need to be able to understand all the values that - * vsnprintf can use. If it is passed by value, then the verifier - * is disabled. - */ - va_start(ap, fmt); - vsnprintf(buf, 16, "%d", ap); - ret = va_arg(ap, int); - va_end(ap); - - return ret; -} - -static void test_can_verify(void) -{ - if (!test_can_verify_check("%d %d", 0, 1)) { - pr_info("trace event string verifier disabled\n"); - static_branch_inc(&trace_no_verify); - } -} - /** - * trace_check_vprintf - Check dereferenced strings while writing to the seq buffer + * ignore_event - Check dereferenced fields while writing to the seq buffer * @iter: The iterator that holds the seq buffer and the event being printed - * @fmt: The format used to print the event - * @ap: The va_list holding the data to print from @fmt. * - * This writes the data into the @iter->seq buffer using the data from - * @fmt and @ap. If the format has a %s, then the source of the string - * is examined to make sure it is safe to print, otherwise it will - * warn and print "[UNSAFE MEMORY]" in place of the dereferenced string - * pointer. + * At boot up, test_event_printk() will flag any event that dereferences + * a string with "%s" that does exist in the ring buffer. It may still + * be valid, as the string may point to a static string in the kernel + * rodata that never gets freed. But if the string pointer is pointing + * to something that was allocated, there's a chance that it can be freed + * by the time the user reads the trace. This would cause a bad memory + * access by the kernel and possibly crash the system. + * + * This function will check if the event has any fields flagged as needing + * to be checked at runtime and perform those checks. + * + * If it is found that a field is unsafe, it will write into the @iter->seq + * a message stating what was found to be unsafe. + * + * @return: true if the event is unsafe and should be ignored, + * false otherwise. */ -void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, - va_list ap) +bool ignore_event(struct trace_iterator *iter) { - long text_delta = 0; - long data_delta = 0; - const char *p = fmt; - const char *str; - bool good; - int i, j; + struct ftrace_event_field *field; + struct trace_event *trace_event; + struct trace_event_call *event; + struct list_head *head; + struct trace_seq *seq; + const void *ptr; - if (WARN_ON_ONCE(!fmt)) - return; + trace_event = ftrace_find_event(iter->ent->type); - if (static_branch_unlikely(&trace_no_verify)) - goto print; + seq = &iter->seq; - /* - * When the kernel is booted with the tp_printk command line - * parameter, trace events go directly through to printk(). - * It also is checked by this function, but it does not - * have an associated trace_array (tr) for it. - */ - if (iter->tr) { - text_delta = iter->tr->text_delta; - data_delta = iter->tr->data_delta; + if (!trace_event) { + trace_seq_printf(seq, "EVENT ID %d NOT FOUND?\n", iter->ent->type); + return true; } - /* Don't bother checking when doing a ftrace_dump() */ - if (iter->fmt == static_fmt_buf) - goto print; - - while (*p) { - bool star = false; - int len = 0; - - j = 0; - - /* - * We only care about %s and variants - * as well as %p[sS] if delta is non-zero - */ - for (i = 0; p[i]; i++) { - if (i + 1 >= iter->fmt_size) { - /* - * If we can't expand the copy buffer, - * just print it. - */ - if (!trace_iter_expand_format(iter)) - goto print; - } - - if (p[i] == '\\' && p[i+1]) { - i++; - continue; - } - if (p[i] == '%') { - /* Need to test cases like %08.*s */ - for (j = 1; p[i+j]; j++) { - if (isdigit(p[i+j]) || - p[i+j] == '.') - continue; - if (p[i+j] == '*') { - star = true; - continue; - } - break; - } - if (p[i+j] == 's') - break; - - if (text_delta && p[i+1] == 'p' && - ((p[i+2] == 's' || p[i+2] == 'S'))) - break; - - star = false; - } - j = 0; - } - /* If no %s found then just print normally */ - if (!p[i]) - break; - - /* Copy up to the %s, and print that */ - strncpy(iter->fmt, p, i); - iter->fmt[i] = '\0'; - trace_seq_vprintf(&iter->seq, iter->fmt, ap); + event = container_of(trace_event, struct trace_event_call, event); + if (!(event->flags & TRACE_EVENT_FL_TEST_STR)) + return false; - /* Add delta to %pS pointers */ - if (p[i+1] == 'p') { - unsigned long addr; - char fmt[4]; + head = trace_get_fields(event); + if (!head) { + trace_seq_printf(seq, "FIELDS FOR EVENT '%s' NOT FOUND?\n", + trace_event_name(event)); + return true; + } - fmt[0] = '%'; - fmt[1] = 'p'; - fmt[2] = p[i+2]; /* Either %ps or %pS */ - fmt[3] = '\0'; + /* Offsets are from the iter->ent that points to the raw event */ + ptr = iter->ent; - addr = va_arg(ap, unsigned long); - addr += text_delta; - trace_seq_printf(&iter->seq, fmt, (void *)addr); + list_for_each_entry(field, head, link) { + const char *str; + bool good; - p += i + 3; + if (!field->needs_test) continue; - } - /* - * If iter->seq is full, the above call no longer guarantees - * that ap is in sync with fmt processing, and further calls - * to va_arg() can return wrong positional arguments. - * - * Ensure that ap is no longer used in this case. - */ - if (iter->seq.full) { - p = ""; - break; - } - - if (star) - len = va_arg(ap, int); - - /* The ap now points to the string data of the %s */ - str = va_arg(ap, const char *); + str = *(const char **)(ptr + field->offset); - good = trace_safe_str(iter, str, star, len); - - /* Could be from the last boot */ - if (data_delta && !good) { - str += data_delta; - good = trace_safe_str(iter, str, star, len); - } + good = trace_safe_str(iter, str); /* * If you hit this warning, it is likely that the @@ -3846,44 +3729,14 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, * instead. See samples/trace_events/trace-events-sample.h * for reference. */ - if (WARN_ONCE(!good, "fmt: '%s' current_buffer: '%s'", - fmt, seq_buf_str(&iter->seq.seq))) { - int ret; - - /* Try to safely read the string */ - if (star) { - if (len + 1 > iter->fmt_size) - len = iter->fmt_size - 1; - if (len < 0) - len = 0; - ret = copy_from_kernel_nofault(iter->fmt, str, len); - iter->fmt[len] = 0; - star = false; - } else { - ret = strncpy_from_kernel_nofault(iter->fmt, str, - iter->fmt_size); - } - if (ret < 0) - trace_seq_printf(&iter->seq, "(0x%px)", str); - else - trace_seq_printf(&iter->seq, "(0x%px:%s)", - str, iter->fmt); - str = "[UNSAFE-MEMORY]"; - strcpy(iter->fmt, "%s"); - } else { - strncpy(iter->fmt, p + i, j + 1); - iter->fmt[j+1] = '\0'; + if (WARN_ONCE(!good, "event '%s' has unsafe pointer field '%s'", + trace_event_name(event), field->name)) { + trace_seq_printf(seq, "EVENT %s: HAS UNSAFE POINTER FIELD '%s'\n", + trace_event_name(event), field->name); + return true; } - if (star) - trace_seq_printf(&iter->seq, iter->fmt, len, str); - else - trace_seq_printf(&iter->seq, iter->fmt, str); - - p += i + j + 1; } - print: - if (*p) - trace_seq_vprintf(&iter->seq, p, ap); + return false; } const char *trace_event_format(struct trace_iterator *iter, const char *fmt) @@ -10777,8 +10630,6 @@ __init static int tracer_alloc_buffers(void) register_snapshot_cmd(); - test_can_verify(); - return 0; out_free_pipe_cpumask: diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 266740b4e1212..9691b47b5f3da 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -667,9 +667,8 @@ void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, bool trace_is_tracepoint_string(const char *str); const char *trace_event_format(struct trace_iterator *iter, const char *fmt); -void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, - va_list ap) __printf(2, 0); char *trace_iter_expand_format(struct trace_iterator *iter); +bool ignore_event(struct trace_iterator *iter); int trace_empty(struct trace_iterator *iter); @@ -1413,7 +1412,8 @@ struct ftrace_event_field { int filter_type; int offset; int size; - int is_signed; + unsigned int is_signed:1; + unsigned int needs_test:1; int len; }; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 521ad2fd1fe70..1545cc8b49d02 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -82,7 +82,7 @@ static int system_refcount_dec(struct event_subsystem *system) } static struct ftrace_event_field * -__find_event_field(struct list_head *head, char *name) +__find_event_field(struct list_head *head, const char *name) { struct ftrace_event_field *field; @@ -114,7 +114,8 @@ trace_find_event_field(struct trace_event_call *call, char *name) static int __trace_define_field(struct list_head *head, const char *type, const char *name, int offset, int size, - int is_signed, int filter_type, int len) + int is_signed, int filter_type, int len, + int need_test) { struct ftrace_event_field *field; @@ -133,6 +134,7 @@ static int __trace_define_field(struct list_head *head, const char *type, field->offset = offset; field->size = size; field->is_signed = is_signed; + field->needs_test = need_test; field->len = len; list_add(&field->link, head); @@ -151,13 +153,13 @@ int trace_define_field(struct trace_event_call *call, const char *type, head = trace_get_fields(call); return __trace_define_field(head, type, name, offset, size, - is_signed, filter_type, 0); + is_signed, filter_type, 0, 0); } EXPORT_SYMBOL_GPL(trace_define_field); static int trace_define_field_ext(struct trace_event_call *call, const char *type, const char *name, int offset, int size, int is_signed, - int filter_type, int len) + int filter_type, int len, int need_test) { struct list_head *head; @@ -166,13 +168,13 @@ static int trace_define_field_ext(struct trace_event_call *call, const char *typ head = trace_get_fields(call); return __trace_define_field(head, type, name, offset, size, - is_signed, filter_type, len); + is_signed, filter_type, len, need_test); } #define __generic_field(type, item, filter_type) \ ret = __trace_define_field(&ftrace_generic_fields, #type, \ #item, 0, 0, is_signed_type(type), \ - filter_type, 0); \ + filter_type, 0, 0); \ if (ret) \ return ret; @@ -181,7 +183,8 @@ static int trace_define_field_ext(struct trace_event_call *call, const char *typ "common_" #item, \ offsetof(typeof(ent), item), \ sizeof(ent.item), \ - is_signed_type(type), FILTER_OTHER, 0); \ + is_signed_type(type), FILTER_OTHER, \ + 0, 0); \ if (ret) \ return ret; @@ -332,6 +335,7 @@ static bool process_pointer(const char *fmt, int len, struct trace_event_call *c /* Return true if the string is safe */ static bool process_string(const char *fmt, int len, struct trace_event_call *call) { + struct trace_event_fields *field; const char *r, *e, *s; e = fmt + len; @@ -372,8 +376,16 @@ static bool process_string(const char *fmt, int len, struct trace_event_call *ca if (process_pointer(fmt, len, call)) return true; - /* Make sure the field is found, and consider it OK for now if it is */ - return find_event_field(fmt, call) != NULL; + /* Make sure the field is found */ + field = find_event_field(fmt, call); + if (!field) + return false; + + /* Test this field's string before printing the event */ + call->flags |= TRACE_EVENT_FL_TEST_STR; + field->needs_test = 1; + + return true; } /* @@ -2586,7 +2598,7 @@ event_define_fields(struct trace_event_call *call) ret = trace_define_field_ext(call, field->type, field->name, offset, field->size, field->is_signed, field->filter_type, - field->len); + field->len, field->needs_test); if (WARN_ON_ONCE(ret)) { pr_err("error code is %d\n", ret); break; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index da748b7cbc4d5..03d56f711ad14 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -317,10 +317,14 @@ EXPORT_SYMBOL(trace_raw_output_prep); void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...) { + struct trace_seq *s = &iter->seq; va_list ap; + if (ignore_event(iter)) + return; + va_start(ap, fmt); - trace_check_vprintf(iter, trace_event_format(iter, fmt), ap); + trace_seq_vprintf(s, trace_event_format(iter, fmt), ap); va_end(ap); } EXPORT_SYMBOL(trace_event_printf); From b6ccddd6fe1fd49c7a82b6fbed01cccad21a29c7 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 11 Dec 2024 08:11:46 -0800 Subject: [PATCH 0640/1450] perf/x86/intel/uncore: Add Clearwater Forest support From the perspective of the uncore PMU, the Clearwater Forest is the same as the previous Sierra Forest. The only difference is the event list, which will be supported in the perf tool later. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20241211161146.235253-1-kan.liang@linux.intel.com --- arch/x86/events/intel/uncore.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index d98fac5676846..e7aba7349231d 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1910,6 +1910,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &adl_uncore_init), X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, &gnr_uncore_init), X86_MATCH_VFM(INTEL_ATOM_CRESTMONT, &gnr_uncore_init), + X86_MATCH_VFM(INTEL_ATOM_DARKMONT_X, &gnr_uncore_init), {}, }; MODULE_DEVICE_TABLE(x86cpu, intel_uncore_match); From b8c3a2502a205321fe66c356f4b70cabd8e1a5fc Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 16 Dec 2024 12:45:02 -0800 Subject: [PATCH 0641/1450] perf/x86/intel/ds: Add PEBS format 6 The only difference between 5 and 6 is the new counters snapshotting group, without the following counters snapshotting enabling patches, it's impossible to utilize the feature in a PEBS record. It's safe to share the same code path with format 5. Add format 6, so the end user can at least utilize the legacy PEBS features. Fixes: a932aa0e868f ("perf/x86: Add Lunar Lake and Arrow Lake support") Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20241216204505.748363-1-kan.liang@linux.intel.com --- arch/x86/events/intel/ds.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 1a4b326ca2ce1..6ba6549f26fac 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2517,6 +2517,7 @@ void __init intel_ds_init(void) x86_pmu.large_pebs_flags |= PERF_SAMPLE_TIME; break; + case 6: case 5: x86_pmu.pebs_ept = 1; fallthrough; From 4a077914578183ec397ad09f7156a357e00e5d72 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 12 Dec 2024 14:21:33 -0800 Subject: [PATCH 0642/1450] locking/rtmutex: Make sure we wake anything on the wake_q when we release the lock->wait_lock Bert reported seeing occasional boot hangs when running with PREEPT_RT and bisected it down to commit 894d1b3db41c ("locking/mutex: Remove wakeups from under mutex::wait_lock"). It looks like I missed a few spots where we drop the wait_lock and potentially call into schedule without waking up the tasks on the wake_q structure. Since the tasks being woken are ww_mutex tasks they need to be able to run to release the mutex and unblock the task that currently is planning to wake them. Thus we can deadlock. So make sure we wake the wake_q tasks when we unlock the wait_lock. Closes: https://lore.kernel.org/lkml/20241211182502.2915-1-spasswolf@web.de Fixes: 894d1b3db41c ("locking/mutex: Remove wakeups from under mutex::wait_lock") Reported-by: Bert Karwatzki Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20241212222138.2400498-1-jstultz@google.com --- kernel/locking/rtmutex.c | 18 ++++++++++++++++-- kernel/locking/rtmutex_api.c | 2 +- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index e858de203eb6f..697a56d3d949b 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1292,7 +1292,13 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, */ get_task_struct(owner); + preempt_disable(); raw_spin_unlock_irq(&lock->wait_lock); + /* wake up any tasks on the wake_q before calling rt_mutex_adjust_prio_chain */ + wake_up_q(wake_q); + wake_q_init(wake_q); + preempt_enable(); + res = rt_mutex_adjust_prio_chain(owner, chwalk, lock, next_lock, waiter, task); @@ -1596,6 +1602,7 @@ static void __sched remove_waiter(struct rt_mutex_base *lock, * or TASK_UNINTERRUPTIBLE) * @timeout: the pre-initialized and started timer, or NULL for none * @waiter: the pre-initialized rt_mutex_waiter + * @wake_q: wake_q of tasks to wake when we drop the lock->wait_lock * * Must be called with lock->wait_lock held and interrupts disabled */ @@ -1603,7 +1610,8 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, struct ww_acquire_ctx *ww_ctx, unsigned int state, struct hrtimer_sleeper *timeout, - struct rt_mutex_waiter *waiter) + struct rt_mutex_waiter *waiter, + struct wake_q_head *wake_q) __releases(&lock->wait_lock) __acquires(&lock->wait_lock) { struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex); @@ -1634,7 +1642,13 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, owner = rt_mutex_owner(lock); else owner = NULL; + preempt_disable(); raw_spin_unlock_irq(&lock->wait_lock); + if (wake_q) { + wake_up_q(wake_q); + wake_q_init(wake_q); + } + preempt_enable(); if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) rt_mutex_schedule(); @@ -1708,7 +1722,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk, wake_q); if (likely(!ret)) - ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter); + ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter, wake_q); if (likely(!ret)) { /* acquired the lock */ diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index 33ea31d6a7b3b..191e4720e5466 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -383,7 +383,7 @@ int __sched rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock, raw_spin_lock_irq(&lock->wait_lock); /* sleep on the mutex */ set_current_state(TASK_INTERRUPTIBLE); - ret = rt_mutex_slowlock_block(lock, NULL, TASK_INTERRUPTIBLE, to, waiter); + ret = rt_mutex_slowlock_block(lock, NULL, TASK_INTERRUPTIBLE, to, waiter, NULL); /* * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might * have to fix that up. From 8fc38062be3f692ff8816da84fde71972530bcc4 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 16 Dec 2024 08:42:47 +0100 Subject: [PATCH 0643/1450] fbdev: Fix recursive dependencies wrt BACKLIGHT_CLASS_DEVICE Do not select BACKLIGHT_CLASS_DEVICE from FB_BACKLIGHT. The latter only controls backlight support within fbdev core code and data structures. Make fbdev drivers depend on BACKLIGHT_CLASS_DEVICE and let users select it explicitly. Fixes warnings about recursive dependencies, such as error: recursive dependency detected! symbol BACKLIGHT_CLASS_DEVICE is selected by FB_BACKLIGHT symbol FB_BACKLIGHT is selected by FB_SH_MOBILE_LCDC symbol FB_SH_MOBILE_LCDC depends on FB_DEVICE symbol FB_DEVICE depends on FB_CORE symbol FB_CORE is selected by DRM_GEM_DMA_HELPER symbol DRM_GEM_DMA_HELPER is selected by DRM_PANEL_ILITEK_ILI9341 symbol DRM_PANEL_ILITEK_ILI9341 depends on BACKLIGHT_CLASS_DEVICE BACKLIGHT_CLASS_DEVICE is user-selectable, so making drivers adapt to it is the correct approach in any case. For most drivers, backlight support is also configurable separately. v3: - Select BACKLIGHT_CLASS_DEVICE in PowerMac defconfigs (Christophe) - Fix PMAC_BACKLIGHT module dependency corner cases (Christophe) v2: - s/BACKLIGHT_DEVICE_CLASS/BACKLIGHT_CLASS_DEVICE (Helge) - Fix fbdev driver-dependency corner case (Arnd) Signed-off-by: Thomas Zimmermann Reviewed-by: Arnd Bergmann Link: https://patchwork.freedesktop.org/patch/msgid/20241216074450.8590-2-tzimmermann@suse.de --- arch/powerpc/configs/pmac32_defconfig | 1 + arch/powerpc/configs/ppc6xx_defconfig | 1 + drivers/auxdisplay/Kconfig | 2 +- drivers/macintosh/Kconfig | 1 + drivers/staging/fbtft/Kconfig | 1 + drivers/video/fbdev/Kconfig | 18 +++++++++++++----- drivers/video/fbdev/core/Kconfig | 3 +-- 7 files changed, 19 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/configs/pmac32_defconfig b/arch/powerpc/configs/pmac32_defconfig index 57ded82c28409..e8b3f67bf3f56 100644 --- a/arch/powerpc/configs/pmac32_defconfig +++ b/arch/powerpc/configs/pmac32_defconfig @@ -208,6 +208,7 @@ CONFIG_FB_ATY=y CONFIG_FB_ATY_CT=y CONFIG_FB_ATY_GX=y CONFIG_FB_3DFX=y +CONFIG_BACKLIGHT_CLASS_DEVICE=y # CONFIG_VGA_CONSOLE is not set CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig index 4d77e17541e95..ca0c90e958379 100644 --- a/arch/powerpc/configs/ppc6xx_defconfig +++ b/arch/powerpc/configs/ppc6xx_defconfig @@ -716,6 +716,7 @@ CONFIG_FB_TRIDENT=m CONFIG_FB_SM501=m CONFIG_FB_IBM_GXT4500=y CONFIG_LCD_PLATFORM=m +CONFIG_BACKLIGHT_CLASS_DEVICE=y CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y CONFIG_LOGO=y diff --git a/drivers/auxdisplay/Kconfig b/drivers/auxdisplay/Kconfig index 21545ffba0658..8934e6ad5772b 100644 --- a/drivers/auxdisplay/Kconfig +++ b/drivers/auxdisplay/Kconfig @@ -489,7 +489,7 @@ config IMG_ASCII_LCD config HT16K33 tristate "Holtek Ht16K33 LED controller with keyscan" - depends on FB && I2C && INPUT + depends on FB && I2C && INPUT && BACKLIGHT_CLASS_DEVICE select FB_SYSMEM_HELPERS select INPUT_MATRIXKMAP select FB_BACKLIGHT diff --git a/drivers/macintosh/Kconfig b/drivers/macintosh/Kconfig index fb38f684444f2..d00e713c10920 100644 --- a/drivers/macintosh/Kconfig +++ b/drivers/macintosh/Kconfig @@ -120,6 +120,7 @@ config PMAC_MEDIABAY config PMAC_BACKLIGHT bool "Backlight control for LCD screens" depends on PPC_PMAC && ADB_PMU && FB = y && (BROKEN || !PPC64) + depends on BACKLIGHT_CLASS_DEVICE=y select FB_BACKLIGHT help Say Y here to enable Macintosh specific extensions of the generic diff --git a/drivers/staging/fbtft/Kconfig b/drivers/staging/fbtft/Kconfig index 77ab44362f167..dcf6a70455cc2 100644 --- a/drivers/staging/fbtft/Kconfig +++ b/drivers/staging/fbtft/Kconfig @@ -3,6 +3,7 @@ menuconfig FB_TFT tristate "Support for small TFT LCD display modules" depends on FB && SPI depends on FB_DEVICE + depends on BACKLIGHT_CLASS_DEVICE depends on GPIOLIB || COMPILE_TEST select FB_BACKLIGHT select FB_SYSMEM_HELPERS_DEFERRED diff --git a/drivers/video/fbdev/Kconfig b/drivers/video/fbdev/Kconfig index de035071fedb1..55c6686f091e7 100644 --- a/drivers/video/fbdev/Kconfig +++ b/drivers/video/fbdev/Kconfig @@ -649,6 +649,7 @@ config FB_S1D13XXX config FB_ATMEL tristate "AT91 LCD Controller support" depends on FB && OF && HAVE_CLK && HAS_IOMEM + depends on BACKLIGHT_CLASS_DEVICE depends on HAVE_FB_ATMEL || COMPILE_TEST select FB_BACKLIGHT select FB_IOMEM_HELPERS @@ -660,7 +661,6 @@ config FB_ATMEL config FB_NVIDIA tristate "nVidia Framebuffer Support" depends on FB && PCI - select FB_BACKLIGHT if FB_NVIDIA_BACKLIGHT select FB_CFB_FILLRECT select FB_CFB_COPYAREA select FB_CFB_IMAGEBLIT @@ -700,6 +700,8 @@ config FB_NVIDIA_DEBUG config FB_NVIDIA_BACKLIGHT bool "Support for backlight control" depends on FB_NVIDIA + depends on BACKLIGHT_CLASS_DEVICE=y || BACKLIGHT_CLASS_DEVICE=FB_NVIDIA + select FB_BACKLIGHT default y help Say Y here if you want to control the backlight of your display. @@ -707,7 +709,6 @@ config FB_NVIDIA_BACKLIGHT config FB_RIVA tristate "nVidia Riva support" depends on FB && PCI - select FB_BACKLIGHT if FB_RIVA_BACKLIGHT select FB_CFB_FILLRECT select FB_CFB_COPYAREA select FB_CFB_IMAGEBLIT @@ -747,6 +748,8 @@ config FB_RIVA_DEBUG config FB_RIVA_BACKLIGHT bool "Support for backlight control" depends on FB_RIVA + depends on BACKLIGHT_CLASS_DEVICE=y || BACKLIGHT_CLASS_DEVICE=FB_RIVA + select FB_BACKLIGHT default y help Say Y here if you want to control the backlight of your display. @@ -934,7 +937,6 @@ config FB_MATROX_MAVEN config FB_RADEON tristate "ATI Radeon display support" depends on FB && PCI - select FB_BACKLIGHT if FB_RADEON_BACKLIGHT select FB_CFB_FILLRECT select FB_CFB_COPYAREA select FB_CFB_IMAGEBLIT @@ -960,6 +962,8 @@ config FB_RADEON_I2C config FB_RADEON_BACKLIGHT bool "Support for backlight control" depends on FB_RADEON + depends on BACKLIGHT_CLASS_DEVICE=y || BACKLIGHT_CLASS_DEVICE=FB_RADEON + select FB_BACKLIGHT default y help Say Y here if you want to control the backlight of your display. @@ -975,7 +979,6 @@ config FB_RADEON_DEBUG config FB_ATY128 tristate "ATI Rage128 display support" depends on FB && PCI - select FB_BACKLIGHT if FB_ATY128_BACKLIGHT select FB_IOMEM_HELPERS select FB_MACMODES if PPC_PMAC help @@ -989,6 +992,8 @@ config FB_ATY128 config FB_ATY128_BACKLIGHT bool "Support for backlight control" depends on FB_ATY128 + depends on BACKLIGHT_CLASS_DEVICE=y || BACKLIGHT_CLASS_DEVICE=FB_ATY128 + select FB_BACKLIGHT default y help Say Y here if you want to control the backlight of your display. @@ -999,7 +1004,6 @@ config FB_ATY select FB_CFB_FILLRECT select FB_CFB_COPYAREA select FB_CFB_IMAGEBLIT - select FB_BACKLIGHT if FB_ATY_BACKLIGHT select FB_IOMEM_FOPS select FB_MACMODES if PPC select FB_ATY_CT if SPARC64 && PCI @@ -1040,6 +1044,8 @@ config FB_ATY_GX config FB_ATY_BACKLIGHT bool "Support for backlight control" depends on FB_ATY + depends on BACKLIGHT_CLASS_DEVICE=y || BACKLIGHT_CLASS_DEVICE=FB_ATY + select FB_BACKLIGHT default y help Say Y here if you want to control the backlight of your display. @@ -1528,6 +1534,7 @@ config FB_SH_MOBILE_LCDC depends on FB && HAVE_CLK && HAS_IOMEM depends on SUPERH || COMPILE_TEST depends on FB_DEVICE + depends on BACKLIGHT_CLASS_DEVICE select FB_BACKLIGHT select FB_DEFERRED_IO select FB_DMAMEM_HELPERS @@ -1793,6 +1800,7 @@ config FB_SSD1307 tristate "Solomon SSD1307 framebuffer support" depends on FB && I2C depends on GPIOLIB || COMPILE_TEST + depends on BACKLIGHT_CLASS_DEVICE select FB_BACKLIGHT select FB_SYSMEM_HELPERS_DEFERRED help diff --git a/drivers/video/fbdev/core/Kconfig b/drivers/video/fbdev/core/Kconfig index 0ab8848ba2f10..d554d8c543d49 100644 --- a/drivers/video/fbdev/core/Kconfig +++ b/drivers/video/fbdev/core/Kconfig @@ -183,9 +183,8 @@ config FB_SYSMEM_HELPERS_DEFERRED select FB_SYSMEM_HELPERS config FB_BACKLIGHT - tristate + bool depends on FB - select BACKLIGHT_CLASS_DEVICE config FB_MODE_HELPERS bool "Enable Video Mode Handling Helpers" From 8ce35bf0ef5a659f3a15237152770a7c1d13c996 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 16 Dec 2024 08:42:48 +0100 Subject: [PATCH 0644/1450] drm/fbdev: Select FB_CORE dependency for fbdev on DMA and TTM Select FB_CORE if GEM's DMA and TTM implementations support fbdev emulation. Fixes linker errors about missing symbols from the fbdev subsystem. Also see [1] for a related SHMEM fix. Fixes: dadd28d4142f ("drm/client: Add client-lib module") Signed-off-by: Thomas Zimmermann Link: https://patchwork.freedesktop.org/series/141411/ # 1 Reviewed-by: Arnd Bergmann Link: https://patchwork.freedesktop.org/patch/msgid/20241216074450.8590-3-tzimmermann@suse.de --- drivers/gpu/drm/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index a0690049b292e..ccee570eab7db 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -358,6 +358,7 @@ config DRM_TTM_HELPER tristate depends on DRM select DRM_TTM + select FB_CORE if DRM_FBDEV_EMULATION select FB_SYSMEM_HELPERS_DEFERRED if DRM_FBDEV_EMULATION help Helpers for ttm-based gem objects @@ -365,6 +366,7 @@ config DRM_TTM_HELPER config DRM_GEM_DMA_HELPER tristate depends on DRM + select FB_CORE if DRM_FBDEV_EMULATION select FB_DMAMEM_HELPERS_DEFERRED if DRM_FBDEV_EMULATION help Choose this if you need the GEM DMA helper functions From 2182e0f200d097805f2f6bc0042de8695c60f386 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 16 Dec 2024 08:42:49 +0100 Subject: [PATCH 0645/1450] drm: rework FB_CORE dependency The 'select FB_CORE' statement moved from CONFIG_DRM to DRM_CLIENT_LIB, but there are now configurations that have code calling into fb_core as built-in even though the client_lib itself is a loadable module: x86_64-linux-ld: drivers/gpu/drm/drm_fb_helper.o: in function `drm_fb_helper_set_suspend': drm_fb_helper.c:(.text+0x2c6): undefined reference to `fb_set_suspend' x86_64-linux-ld: drivers/gpu/drm/drm_fb_helper.o: in function `drm_fb_helper_resume_worker': drm_fb_helper.c:(.text+0x2e1): undefined reference to `fb_set_suspend' In addition to DRM_CLIENT_LIB, the 'select' needs to be at least in DRM_KMS_HELPER and DRM_GEM_SHMEM_HELPER, so add it here. This patch is the KMS_HELPER part of [1]. Fixes: dadd28d4142f ("drm/client: Add client-lib module") Signed-off-by: Arnd Bergmann Reviewed-by: Thomas Zimmermann Link: https://patchwork.freedesktop.org/series/141411/ # 1 Link: https://patchwork.freedesktop.org/patch/msgid/20241216074450.8590-4-tzimmermann@suse.de Signed-off-by: Thomas Zimmermann --- drivers/gpu/drm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index ccee570eab7db..772fc7625639d 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -99,6 +99,7 @@ config DRM_KUNIT_TEST config DRM_KMS_HELPER tristate depends on DRM + select FB_CORE if DRM_FBDEV_EMULATION help CRTC helpers for KMS drivers. From 20d00cfae627f048560c46ba5849011a34515103 Mon Sep 17 00:00:00 2001 From: Przemek Kitszel Date: Mon, 16 Dec 2024 15:15:30 +0100 Subject: [PATCH 0646/1450] checkpatch: don't complain on _Generic() use Improve CamelCase recognition logic to avoid reporting on _Generic() use. Other C keywords, such as _Bool, are intentionally omitted, as those should be rather avoided in new source code. Reviewed-by: Wojciech Drewek Reviewed-by: Simon Horman Signed-off-by: Mateusz Polchlopek Acked-by: Joe Perches Signed-off-by: Przemek Kitszel Signed-off-by: Tony Nguyen --- scripts/checkpatch.pl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 9eed3683ad76c..a2066a6c9dd8b 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -5843,6 +5843,8 @@ sub process { #CamelCase if ($var !~ /^$Constant$/ && $var =~ /[A-Z][a-z]|[a-z][A-Z]/ && +#Ignore C keywords + $var !~ /^_Generic$/ && #Ignore some autogenerated defines and enum values $var !~ /^(?:[A-Z]+_){1,5}[A-Z]{1,3}[a-z]/ && #Ignore Page variants From 346947223bacf96155f603528823a60b18b92d9a Mon Sep 17 00:00:00 2001 From: Przemek Kitszel Date: Mon, 16 Dec 2024 15:15:31 +0100 Subject: [PATCH 0647/1450] devlink: add devlink_fmsg_put() macro Add devlink_fmsg_put() that dispatches based on the type of the value to put, example: bool -> devlink_fmsg_bool_pair_put(). Reviewed-by: Wojciech Drewek Reviewed-by: Simon Horman Signed-off-by: Mateusz Polchlopek Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Przemek Kitszel Signed-off-by: Tony Nguyen --- include/net/devlink.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/include/net/devlink.h b/include/net/devlink.h index fbb9a2668e247..b5e1427ea4d72 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1261,6 +1261,17 @@ enum devlink_trap_group_generic_id { .min_burst = _min_burst, \ } +#define devlink_fmsg_put(fmsg, name, value) ( \ + _Generic((value), \ + bool : devlink_fmsg_bool_pair_put, \ + u8 : devlink_fmsg_u8_pair_put, \ + u16 : devlink_fmsg_u32_pair_put, \ + u32 : devlink_fmsg_u32_pair_put, \ + u64 : devlink_fmsg_u64_pair_put, \ + char * : devlink_fmsg_string_pair_put, \ + const char * : devlink_fmsg_string_pair_put) \ + (fmsg, name, (value))) + enum { /* device supports reload operations */ DEVLINK_F_RELOAD = 1UL << 0, From 3dbfde7f6bc7b8efff26e3e98fdd8cba20287da7 Mon Sep 17 00:00:00 2001 From: Mateusz Polchlopek Date: Mon, 16 Dec 2024 15:15:32 +0100 Subject: [PATCH 0648/1450] devlink: add devlink_fmsg_dump_skb() function Add devlink_fmsg_dump_skb() function that adds some diagnostic information about skb (like length, pkt type, MAC, etc) to devlink fmsg mechanism using bunch of devlink_fmsg_put() function calls. Signed-off-by: Mateusz Polchlopek Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Przemek Kitszel Signed-off-by: Tony Nguyen --- include/net/devlink.h | 2 ++ net/devlink/health.c | 67 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/include/net/devlink.h b/include/net/devlink.h index b5e1427ea4d72..58e33959c8521 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1268,6 +1268,7 @@ enum devlink_trap_group_generic_id { u16 : devlink_fmsg_u32_pair_put, \ u32 : devlink_fmsg_u32_pair_put, \ u64 : devlink_fmsg_u64_pair_put, \ + int : devlink_fmsg_u32_pair_put, \ char * : devlink_fmsg_string_pair_put, \ const char * : devlink_fmsg_string_pair_put) \ (fmsg, name, (value))) @@ -2005,6 +2006,7 @@ int devlink_compat_switch_id_get(struct net_device *dev, int devlink_nl_port_handle_fill(struct sk_buff *msg, struct devlink_port *devlink_port); size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port); +void devlink_fmsg_dump_skb(struct devlink_fmsg *fmsg, const struct sk_buff *skb); #else diff --git a/net/devlink/health.c b/net/devlink/health.c index b8d3084e6fe06..57db6799722a9 100644 --- a/net/devlink/health.c +++ b/net/devlink/health.c @@ -1238,3 +1238,70 @@ int devlink_nl_health_reporter_test_doit(struct sk_buff *skb, return reporter->ops->test(reporter, info->extack); } + +/** + * devlink_fmsg_dump_skb - Dump sk_buffer structure + * @fmsg: devlink formatted message pointer + * @skb: pointer to skb + * + * Dump diagnostic information about sk_buff structure, like headroom, length, + * tailroom, MAC, etc. + */ +void devlink_fmsg_dump_skb(struct devlink_fmsg *fmsg, const struct sk_buff *skb) +{ + struct skb_shared_info *sh = skb_shinfo(skb); + struct sock *sk = skb->sk; + bool has_mac, has_trans; + + has_mac = skb_mac_header_was_set(skb); + has_trans = skb_transport_header_was_set(skb); + + devlink_fmsg_pair_nest_start(fmsg, "skb"); + devlink_fmsg_obj_nest_start(fmsg); + devlink_fmsg_put(fmsg, "actual len", skb->len); + devlink_fmsg_put(fmsg, "head len", skb_headlen(skb)); + devlink_fmsg_put(fmsg, "data len", skb->data_len); + devlink_fmsg_put(fmsg, "tail len", skb_tailroom(skb)); + devlink_fmsg_put(fmsg, "MAC", has_mac ? skb->mac_header : -1); + devlink_fmsg_put(fmsg, "MAC len", + has_mac ? skb_mac_header_len(skb) : -1); + devlink_fmsg_put(fmsg, "network hdr", skb->network_header); + devlink_fmsg_put(fmsg, "network hdr len", + has_trans ? skb_network_header_len(skb) : -1); + devlink_fmsg_put(fmsg, "transport hdr", + has_trans ? skb->transport_header : -1); + devlink_fmsg_put(fmsg, "csum", (__force u32)skb->csum); + devlink_fmsg_put(fmsg, "csum_ip_summed", (u8)skb->ip_summed); + devlink_fmsg_put(fmsg, "csum_complete_sw", !!skb->csum_complete_sw); + devlink_fmsg_put(fmsg, "csum_valid", !!skb->csum_valid); + devlink_fmsg_put(fmsg, "csum_level", (u8)skb->csum_level); + devlink_fmsg_put(fmsg, "sw_hash", !!skb->sw_hash); + devlink_fmsg_put(fmsg, "l4_hash", !!skb->l4_hash); + devlink_fmsg_put(fmsg, "proto", ntohs(skb->protocol)); + devlink_fmsg_put(fmsg, "pkt_type", (u8)skb->pkt_type); + devlink_fmsg_put(fmsg, "iif", skb->skb_iif); + + if (sk) { + devlink_fmsg_pair_nest_start(fmsg, "sk"); + devlink_fmsg_obj_nest_start(fmsg); + devlink_fmsg_put(fmsg, "family", sk->sk_type); + devlink_fmsg_put(fmsg, "type", sk->sk_type); + devlink_fmsg_put(fmsg, "proto", sk->sk_protocol); + devlink_fmsg_obj_nest_end(fmsg); + devlink_fmsg_pair_nest_end(fmsg); + } + + devlink_fmsg_obj_nest_end(fmsg); + devlink_fmsg_pair_nest_end(fmsg); + + devlink_fmsg_pair_nest_start(fmsg, "shinfo"); + devlink_fmsg_obj_nest_start(fmsg); + devlink_fmsg_put(fmsg, "tx_flags", sh->tx_flags); + devlink_fmsg_put(fmsg, "nr_frags", sh->nr_frags); + devlink_fmsg_put(fmsg, "gso_size", sh->gso_size); + devlink_fmsg_put(fmsg, "gso_type", sh->gso_type); + devlink_fmsg_put(fmsg, "gso_segs", sh->gso_segs); + devlink_fmsg_obj_nest_end(fmsg); + devlink_fmsg_pair_nest_end(fmsg); +} +EXPORT_SYMBOL_GPL(devlink_fmsg_dump_skb); From 2846fe5614ac15117fddaa45b86e7e77d91dd569 Mon Sep 17 00:00:00 2001 From: Przemek Kitszel Date: Mon, 16 Dec 2024 15:15:33 +0100 Subject: [PATCH 0649/1450] ice: rename devlink_port.[ch] to port.[ch] Drop "devlink_" prefix from files that sit in devlink/. I'm going to add more files there, and repeating "devlink" does not feel good. This is also the scheme used in most other places, most notably the devlink core files are named like that. devlink.[ch] stays as is. Reviewed-by: Kalesh AP Signed-off-by: Przemek Kitszel Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/Makefile | 2 +- drivers/net/ethernet/intel/ice/devlink/devlink.c | 2 +- .../net/ethernet/intel/ice/devlink/{devlink_port.c => port.c} | 2 +- .../net/ethernet/intel/ice/devlink/{devlink_port.h => port.h} | 0 drivers/net/ethernet/intel/ice/ice_eswitch.h | 2 +- drivers/net/ethernet/intel/ice/ice_main.c | 2 +- drivers/net/ethernet/intel/ice/ice_repr.c | 2 +- drivers/net/ethernet/intel/ice/ice_sf_eth.c | 2 +- 8 files changed, 7 insertions(+), 7 deletions(-) rename drivers/net/ethernet/intel/ice/devlink/{devlink_port.c => port.c} (99%) rename drivers/net/ethernet/intel/ice/devlink/{devlink_port.h => port.h} (100%) diff --git a/drivers/net/ethernet/intel/ice/Makefile b/drivers/net/ethernet/intel/ice/Makefile index 3307d551f431a..56aa23aee472b 100644 --- a/drivers/net/ethernet/intel/ice/Makefile +++ b/drivers/net/ethernet/intel/ice/Makefile @@ -32,7 +32,7 @@ ice-y := ice_main.o \ ice_parser_rt.o \ ice_idc.o \ devlink/devlink.o \ - devlink/devlink_port.o \ + devlink/port.o \ ice_sf_eth.o \ ice_sf_vsi_vlan_ops.o \ ice_ddp.o \ diff --git a/drivers/net/ethernet/intel/ice/devlink/devlink.c b/drivers/net/ethernet/intel/ice/devlink/devlink.c index 415445cefdb2a..1b10682c00b8a 100644 --- a/drivers/net/ethernet/intel/ice/devlink/devlink.c +++ b/drivers/net/ethernet/intel/ice/devlink/devlink.c @@ -6,7 +6,7 @@ #include "ice.h" #include "ice_lib.h" #include "devlink.h" -#include "devlink_port.h" +#include "port.h" #include "ice_eswitch.h" #include "ice_fw_update.h" #include "ice_dcb_lib.h" diff --git a/drivers/net/ethernet/intel/ice/devlink/devlink_port.c b/drivers/net/ethernet/intel/ice/devlink/port.c similarity index 99% rename from drivers/net/ethernet/intel/ice/devlink/devlink_port.c rename to drivers/net/ethernet/intel/ice/devlink/port.c index c6779d9dffffd..767419a67fef2 100644 --- a/drivers/net/ethernet/intel/ice/devlink/devlink_port.c +++ b/drivers/net/ethernet/intel/ice/devlink/port.c @@ -5,7 +5,7 @@ #include "ice.h" #include "devlink.h" -#include "devlink_port.h" +#include "port.h" #include "ice_lib.h" #include "ice_fltr.h" diff --git a/drivers/net/ethernet/intel/ice/devlink/devlink_port.h b/drivers/net/ethernet/intel/ice/devlink/port.h similarity index 100% rename from drivers/net/ethernet/intel/ice/devlink/devlink_port.h rename to drivers/net/ethernet/intel/ice/devlink/port.h diff --git a/drivers/net/ethernet/intel/ice/ice_eswitch.h b/drivers/net/ethernet/intel/ice/ice_eswitch.h index ac7db100e2cd3..5c7dcf21b222b 100644 --- a/drivers/net/ethernet/intel/ice/ice_eswitch.h +++ b/drivers/net/ethernet/intel/ice/ice_eswitch.h @@ -5,7 +5,7 @@ #define _ICE_ESWITCH_H_ #include -#include "devlink/devlink_port.h" +#include "devlink/port.h" #ifdef CONFIG_ICE_SWITCHDEV void ice_eswitch_detach_vf(struct ice_pf *pf, struct ice_vf *vf); diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 0ab35607e5d56..d641dd8b81846 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -14,7 +14,7 @@ #include "ice_dcb_lib.h" #include "ice_dcb_nl.h" #include "devlink/devlink.h" -#include "devlink/devlink_port.h" +#include "devlink/port.h" #include "ice_sf_eth.h" #include "ice_hwmon.h" /* Including ice_trace.h with CREATE_TRACE_POINTS defined will generate the diff --git a/drivers/net/ethernet/intel/ice/ice_repr.c b/drivers/net/ethernet/intel/ice/ice_repr.c index 970a99a52bf18..fb7a1b9a43139 100644 --- a/drivers/net/ethernet/intel/ice/ice_repr.c +++ b/drivers/net/ethernet/intel/ice/ice_repr.c @@ -4,7 +4,7 @@ #include "ice.h" #include "ice_eswitch.h" #include "devlink/devlink.h" -#include "devlink/devlink_port.h" +#include "devlink/port.h" #include "ice_sriov.h" #include "ice_tc_lib.h" #include "ice_dcb_lib.h" diff --git a/drivers/net/ethernet/intel/ice/ice_sf_eth.c b/drivers/net/ethernet/intel/ice/ice_sf_eth.c index 75d7147e1c01c..1a2c94375ca71 100644 --- a/drivers/net/ethernet/intel/ice/ice_sf_eth.c +++ b/drivers/net/ethernet/intel/ice/ice_sf_eth.c @@ -5,8 +5,8 @@ #include "ice_txrx.h" #include "ice_fltr.h" #include "ice_sf_eth.h" -#include "devlink/devlink_port.h" #include "devlink/devlink.h" +#include "devlink/port.h" static const struct net_device_ops ice_sf_netdev_ops = { .ndo_open = ice_open, From d75d72a858f0c00ca8ae161b48cdb403807be4de Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 13 Nov 2024 11:11:55 -0500 Subject: [PATCH 0650/1450] btrfs: fix improper generation check in snapshot delete We have been using the following check if (generation <= root->root_key.offset) to make decisions about whether or not to visit a node during snapshot delete. This is because for normal subvolumes this is set to 0, and for snapshots it's set to the creation generation. The idea being that if the generation of the node is less than or equal to our creation generation then we don't need to visit that node, because it doesn't belong to us, we can simply drop our reference and move on. However reloc roots don't have their generation stored in root->root_key.offset, instead that is the objectid of their corresponding fs root. This means we can incorrectly not walk into nodes that need to be dropped when deleting a reloc root. There are a variety of consequences to making the wrong choice in two distinct areas. visit_node_for_delete() 1. False positive. We think we are newer than the block when we really aren't. We don't visit the node and drop our reference to the node and carry on. This would result in leaked space. 2. False negative. We do decide to walk down into a block that we should have just dropped our reference to. However this means that the child node will have refs > 1, so we will switch to UPDATE_BACKREF, and then the subsequent walk_down_proc() will notice that btrfs_header_owner(node) != root->root_key.objectid and it'll break out of the loop, and then walk_up_proc() will drop our reference, so this appears to be ok. do_walk_down() 1. False positive. We are in UPDATE_BACKREF and incorrectly decide that we are done and don't need to update the backref for our lower nodes. This is another case that simply won't happen with relocation, as we only have to do UPDATE_BACKREF if the node below us was shared and didn't have FULL_BACKREF set, and since we don't own that node because we're a reloc root we actually won't end up in this case. 2. False negative. Again this is tricky because as described above, we simply wouldn't be here from relocation, because we don't own any of the nodes because we never set btrfs_header_owner() to the reloc root objectid, and we always use FULL_BACKREF, we never actually need to set FULL_BACKREF on any children. Having spent a lot of time stressing relocation/snapshot delete recently I've not seen this pop in practice. But this is objectively incorrect, so fix this to get the correct starting generation based on the root we're dropping to keep me from thinking there's a problem here. CC: stable@vger.kernel.org Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 19 +++++++++++++++++++ fs/btrfs/extent-tree.c | 6 +++--- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 307dedf95c706..2c341956a01ce 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -370,6 +370,25 @@ static inline void btrfs_set_root_last_trans(struct btrfs_root *root, u64 transi WRITE_ONCE(root->last_trans, transid); } +/* + * Return the generation this root started with. + * + * Every normal root that is created with root->root_key.offset set to it's + * originating generation. If it is a snapshot it is the generation when the + * snapshot was created. + * + * However for TREE_RELOC roots root_key.offset is the objectid of the owning + * tree root. Thankfully we copy the root item of the owning tree root, which + * has it's last_snapshot set to what we would have root_key.offset set to, so + * return that if this is a TREE_RELOC root. + */ +static inline u64 btrfs_root_origin_generation(const struct btrfs_root *root) +{ + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) + return btrfs_root_last_snapshot(&root->root_item); + return root->root_key.offset; +} + /* * Structure that conveys information about an extent that is going to replace * all the extents in a file range. diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 412e318e4a22d..43a771f7bd7af 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5285,7 +5285,7 @@ static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control * * reference to it. */ generation = btrfs_node_ptr_generation(eb, slot); - if (!wc->update_ref || generation <= root->root_key.offset) + if (!wc->update_ref || generation <= btrfs_root_origin_generation(root)) return false; /* @@ -5340,7 +5340,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, goto reada; if (wc->stage == UPDATE_BACKREF && - generation <= root->root_key.offset) + generation <= btrfs_root_origin_generation(root)) continue; /* We don't lock the tree block, it's OK to be racy here */ @@ -5683,7 +5683,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, * for the subtree */ if (wc->stage == UPDATE_BACKREF && - generation <= root->root_key.offset) { + generation <= btrfs_root_origin_generation(root)) { wc->lookup_info = 1; return 1; } From 6c3864e055486fadb5b97793b57688082e14b43b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Nov 2024 07:26:32 +0100 Subject: [PATCH 0651/1450] btrfs: use bio_is_zone_append() in the completion handler Otherwise it won't catch bios turned into regular writes by the block level zone write plugging. The additional test it adds is for emulated zone append. Fixes: 9b1ce7f0c6f8 ("block: Implement zone append emulation") CC: stable@vger.kernel.org # 6.12 Reviewed-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 1f216d07eff65..011cc97be3b57 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -355,7 +355,7 @@ static void btrfs_simple_end_io(struct bio *bio) INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); } else { - if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) + if (bio_is_zone_append(bio) && !bio->bi_status) btrfs_record_physical_zoned(bbio); btrfs_bio_end_io(bbio, bbio->bio.bi_status); } @@ -398,7 +398,7 @@ static void btrfs_orig_write_end_io(struct bio *bio) else bio->bi_status = BLK_STS_OK; - if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) + if (bio_is_zone_append(bio) && !bio->bi_status) stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; btrfs_bio_end_io(bbio, bbio->bio.bi_status); @@ -412,7 +412,7 @@ static void btrfs_clone_write_end_io(struct bio *bio) if (bio->bi_status) { atomic_inc(&stripe->bioc->error); btrfs_log_dev_io_error(bio, stripe->dev); - } else if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + } else if (bio_is_zone_append(bio)) { stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; } From be691b5e593f2cc8cef67bbc59c1fb91b74a86a9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Nov 2024 07:26:33 +0100 Subject: [PATCH 0652/1450] btrfs: split bios to the fs sector size boundary Btrfs like other file systems can't really deal with I/O not aligned to it's internal block size (which strangely is called sector size in btrfs, for historical reasons), but the block layer split helper doesn't even know about that. Round down the split boundary so that all I/Os are aligned. Fixes: d5e4377d5051 ("btrfs: split zone append bios in btrfs_submit_bio") CC: stable@vger.kernel.org # 6.12 Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 011cc97be3b57..78f5606baacb5 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -649,8 +649,14 @@ static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length) map_length = min(map_length, bbio->fs_info->max_zone_append_size); sector_offset = bio_split_rw_at(&bbio->bio, &bbio->fs_info->limits, &nr_segs, map_length); - if (sector_offset) - return sector_offset << SECTOR_SHIFT; + if (sector_offset) { + /* + * bio_split_rw_at() could split at a size smaller than our + * sectorsize and thus cause unaligned I/Os. Fix that by + * always rounding down to the nearest boundary. + */ + return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, bbio->fs_info->sectorsize); + } return map_length; } From dfb92681a19e1d5172420baa242806414b3eff6f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 4 Dec 2024 13:30:46 +1030 Subject: [PATCH 0653/1450] btrfs: tree-checker: reject inline extent items with 0 ref count [BUG] There is a bug report in the mailing list where btrfs_run_delayed_refs() failed to drop the ref count for logical 25870311358464 num_bytes 2113536. The involved leaf dump looks like this: item 166 key (25870311358464 168 2113536) itemoff 10091 itemsize 50 extent refs 1 gen 84178 flags 1 ref#0: shared data backref parent 32399126528000 count 0 <<< ref#1: shared data backref parent 31808973717504 count 1 Notice the count number is 0. [CAUSE] There is no concrete evidence yet, but considering 0 -> 1 is also a single bit flipped, it's possible that hardware memory bitflip is involved, causing the on-disk extent tree to be corrupted. [FIX] To prevent us reading such corrupted extent item, or writing such damaged extent item back to disk, enhance the handling of BTRFS_EXTENT_DATA_REF_KEY and BTRFS_SHARED_DATA_REF_KEY keys for both inlined and key items, to detect such 0 ref count and reject them. CC: stable@vger.kernel.org # 5.4+ Link: https://lore.kernel.org/linux-btrfs/7c69dd49-c346-4806-86e7-e6f863a66f48@app.fastmail.com/ Reported-by: Frankie Fisher Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 148d8cefa40ee..dfeee033f31fb 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1527,6 +1527,11 @@ static int check_extent_item(struct extent_buffer *leaf, dref_offset, fs_info->sectorsize); return -EUCLEAN; } + if (unlikely(btrfs_extent_data_ref_count(leaf, dref) == 0)) { + extent_err(leaf, slot, + "invalid data ref count, should have non-zero value"); + return -EUCLEAN; + } inline_refs += btrfs_extent_data_ref_count(leaf, dref); break; /* Contains parent bytenr and ref count */ @@ -1539,6 +1544,11 @@ static int check_extent_item(struct extent_buffer *leaf, inline_offset, fs_info->sectorsize); return -EUCLEAN; } + if (unlikely(btrfs_shared_data_ref_count(leaf, sref) == 0)) { + extent_err(leaf, slot, + "invalid shared data ref count, should have non-zero value"); + return -EUCLEAN; + } inline_refs += btrfs_shared_data_ref_count(leaf, sref); break; case BTRFS_EXTENT_OWNER_REF_KEY: @@ -1611,8 +1621,18 @@ static int check_simple_keyed_refs(struct extent_buffer *leaf, { u32 expect_item_size = 0; - if (key->type == BTRFS_SHARED_DATA_REF_KEY) + if (key->type == BTRFS_SHARED_DATA_REF_KEY) { + struct btrfs_shared_data_ref *sref; + + sref = btrfs_item_ptr(leaf, slot, struct btrfs_shared_data_ref); + if (unlikely(btrfs_shared_data_ref_count(leaf, sref) == 0)) { + extent_err(leaf, slot, + "invalid shared data backref count, should have non-zero value"); + return -EUCLEAN; + } + expect_item_size = sizeof(struct btrfs_shared_data_ref); + } if (unlikely(btrfs_item_size(leaf, slot) != expect_item_size)) { generic_err(leaf, slot, @@ -1689,6 +1709,11 @@ static int check_extent_data_ref(struct extent_buffer *leaf, offset, leaf->fs_info->sectorsize); return -EUCLEAN; } + if (unlikely(btrfs_extent_data_ref_count(leaf, dref) == 0)) { + extent_err(leaf, slot, + "invalid extent data backref count, should have non-zero value"); + return -EUCLEAN; + } } return 0; } From 058387d9c6b70e225da82492e1e193635c3fac3f Mon Sep 17 00:00:00 2001 From: Willow Cunningham Date: Mon, 7 Oct 2024 17:29:54 -0400 Subject: [PATCH 0654/1450] arm64: dts: broadcom: Fix L2 linesize for Raspberry Pi 5 Set the cache-line-size parameter of the L2 cache for each core to the correct value of 64 bytes. Previously, the L2 cache line size was incorrectly set to 128 bytes for the Broadcom BCM2712. This causes validation tests for the Performance Application Programming Interface (PAPI) tool to fail as they depend on sysfs accurately reporting cache line sizes. The correct value of 64 bytes is stated in the official documentation of the ARM Cortex A-72, which is linked in the comments of arm64/boot/dts/broadcom/bcm2712.dtsi as the source for cache-line-size. Fixes: faa3381267d0 ("arm64: dts: broadcom: Add minimal support for Raspberry Pi 5") Signed-off-by: Willow Cunningham Link: https://lore.kernel.org/r/20241007212954.214724-1-willow.e.cunningham@maine.edu Signed-off-by: Florian Fainelli --- arch/arm64/boot/dts/broadcom/bcm2712.dtsi | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/boot/dts/broadcom/bcm2712.dtsi b/arch/arm64/boot/dts/broadcom/bcm2712.dtsi index 6e5a984c1d4ea..26a29e5e5078d 100644 --- a/arch/arm64/boot/dts/broadcom/bcm2712.dtsi +++ b/arch/arm64/boot/dts/broadcom/bcm2712.dtsi @@ -67,7 +67,7 @@ l2_cache_l0: l2-cache-l0 { compatible = "cache"; cache-size = <0x80000>; - cache-line-size = <128>; + cache-line-size = <64>; cache-sets = <1024>; //512KiB(size)/64(line-size)=8192ways/8-way set cache-level = <2>; cache-unified; @@ -91,7 +91,7 @@ l2_cache_l1: l2-cache-l1 { compatible = "cache"; cache-size = <0x80000>; - cache-line-size = <128>; + cache-line-size = <64>; cache-sets = <1024>; //512KiB(size)/64(line-size)=8192ways/8-way set cache-level = <2>; cache-unified; @@ -115,7 +115,7 @@ l2_cache_l2: l2-cache-l2 { compatible = "cache"; cache-size = <0x80000>; - cache-line-size = <128>; + cache-line-size = <64>; cache-sets = <1024>; //512KiB(size)/64(line-size)=8192ways/8-way set cache-level = <2>; cache-unified; @@ -139,7 +139,7 @@ l2_cache_l3: l2-cache-l3 { compatible = "cache"; cache-size = <0x80000>; - cache-line-size = <128>; + cache-line-size = <64>; cache-sets = <1024>; //512KiB(size)/64(line-size)=8192ways/8-way set cache-level = <2>; cache-unified; From 2a82874a3b7be3f424eb6e94cd4f225e928efe2a Mon Sep 17 00:00:00 2001 From: Przemek Kitszel Date: Mon, 16 Dec 2024 15:15:34 +0100 Subject: [PATCH 0655/1450] ice: add Tx hang devlink health reporter Add Tx hang devlink health reporter, see struct ice_tx_hang_event to see what exactly is reported. For now dump descriptors with little metadata and skb diagnostic information. Reviewed-by: Igor Bagnucki Reviewed-by: Wojciech Drewek Co-developed-by: Mateusz Polchlopek Signed-off-by: Mateusz Polchlopek Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Przemek Kitszel Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/Makefile | 1 + .../net/ethernet/intel/ice/devlink/health.c | 192 ++++++++++++++++++ .../net/ethernet/intel/ice/devlink/health.h | 47 +++++ drivers/net/ethernet/intel/ice/ice.h | 2 + drivers/net/ethernet/intel/ice/ice_main.c | 18 +- 5 files changed, 255 insertions(+), 5 deletions(-) create mode 100644 drivers/net/ethernet/intel/ice/devlink/health.c create mode 100644 drivers/net/ethernet/intel/ice/devlink/health.h diff --git a/drivers/net/ethernet/intel/ice/Makefile b/drivers/net/ethernet/intel/ice/Makefile index 56aa23aee472b..9e0d9f7104411 100644 --- a/drivers/net/ethernet/intel/ice/Makefile +++ b/drivers/net/ethernet/intel/ice/Makefile @@ -32,6 +32,7 @@ ice-y := ice_main.o \ ice_parser_rt.o \ ice_idc.o \ devlink/devlink.o \ + devlink/health.o \ devlink/port.o \ ice_sf_eth.o \ ice_sf_vsi_vlan_ops.o \ diff --git a/drivers/net/ethernet/intel/ice/devlink/health.c b/drivers/net/ethernet/intel/ice/devlink/health.c new file mode 100644 index 0000000000000..984d910fc41db --- /dev/null +++ b/drivers/net/ethernet/intel/ice/devlink/health.c @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024, Intel Corporation. */ + +#include "health.h" +#include "ice.h" + +#define ICE_DEVLINK_FMSG_PUT_FIELD(fmsg, obj, name) \ + devlink_fmsg_put(fmsg, #name, (obj)->name) + +/** + * ice_devlink_health_report - boilerplate to call given @reporter + * + * @reporter: devlink health reporter to call, do nothing on NULL + * @msg: message to pass up, "event name" is fine + * @priv_ctx: typically some event struct + */ +static void ice_devlink_health_report(struct devlink_health_reporter *reporter, + const char *msg, void *priv_ctx) +{ + if (!reporter) + return; + + /* We do not do auto recovering, so return value of the below function + * will always be 0, thus we do ignore it. + */ + devlink_health_report(reporter, msg, priv_ctx); +} + +/** + * ice_fmsg_put_ptr - put hex value of pointer into fmsg + * + * @fmsg: devlink fmsg under construction + * @name: name to pass + * @ptr: 64 bit value to print as hex and put into fmsg + */ +static void ice_fmsg_put_ptr(struct devlink_fmsg *fmsg, const char *name, + void *ptr) +{ + char buf[sizeof(ptr) * 3]; + + sprintf(buf, "%p", ptr); + devlink_fmsg_put(fmsg, name, buf); +} + +struct ice_tx_hang_event { + u32 head; + u32 intr; + u16 vsi_num; + u16 queue; + u16 next_to_clean; + u16 next_to_use; + struct ice_tx_ring *tx_ring; +}; + +static int ice_tx_hang_reporter_dump(struct devlink_health_reporter *reporter, + struct devlink_fmsg *fmsg, void *priv_ctx, + struct netlink_ext_ack *extack) +{ + struct ice_tx_hang_event *event = priv_ctx; + struct sk_buff *skb; + + if (!event) + return 0; + + skb = event->tx_ring->tx_buf->skb; + devlink_fmsg_obj_nest_start(fmsg); + ICE_DEVLINK_FMSG_PUT_FIELD(fmsg, event, head); + ICE_DEVLINK_FMSG_PUT_FIELD(fmsg, event, intr); + ICE_DEVLINK_FMSG_PUT_FIELD(fmsg, event, vsi_num); + ICE_DEVLINK_FMSG_PUT_FIELD(fmsg, event, queue); + ICE_DEVLINK_FMSG_PUT_FIELD(fmsg, event, next_to_clean); + ICE_DEVLINK_FMSG_PUT_FIELD(fmsg, event, next_to_use); + devlink_fmsg_put(fmsg, "irq-mapping", event->tx_ring->q_vector->name); + ice_fmsg_put_ptr(fmsg, "desc-ptr", event->tx_ring->desc); + ice_fmsg_put_ptr(fmsg, "dma-ptr", (void *)(long)event->tx_ring->dma); + ice_fmsg_put_ptr(fmsg, "skb-ptr", skb); + devlink_fmsg_binary_pair_put(fmsg, "desc", event->tx_ring->desc, + event->tx_ring->count * sizeof(struct ice_tx_desc)); + devlink_fmsg_dump_skb(fmsg, skb); + devlink_fmsg_obj_nest_end(fmsg); + + return 0; +} + +void ice_prep_tx_hang_report(struct ice_pf *pf, struct ice_tx_ring *tx_ring, + u16 vsi_num, u32 head, u32 intr) +{ + struct ice_health_tx_hang_buf *buf = &pf->health_reporters.tx_hang_buf; + + buf->tx_ring = tx_ring; + buf->vsi_num = vsi_num; + buf->head = head; + buf->intr = intr; +} + +void ice_report_tx_hang(struct ice_pf *pf) +{ + struct ice_health_tx_hang_buf *buf = &pf->health_reporters.tx_hang_buf; + struct ice_tx_ring *tx_ring = buf->tx_ring; + + struct ice_tx_hang_event ev = { + .head = buf->head, + .intr = buf->intr, + .vsi_num = buf->vsi_num, + .queue = tx_ring->q_index, + .next_to_clean = tx_ring->next_to_clean, + .next_to_use = tx_ring->next_to_use, + .tx_ring = tx_ring, + }; + + ice_devlink_health_report(pf->health_reporters.tx_hang, "Tx hang", &ev); +} + +static struct devlink_health_reporter * +ice_init_devlink_rep(struct ice_pf *pf, + const struct devlink_health_reporter_ops *ops) +{ + struct devlink *devlink = priv_to_devlink(pf); + struct devlink_health_reporter *rep; + const u64 graceful_period = 0; + + rep = devl_health_reporter_create(devlink, ops, graceful_period, pf); + if (IS_ERR(rep)) { + struct device *dev = ice_pf_to_dev(pf); + + dev_err(dev, "failed to create devlink %s health report er", + ops->name); + return NULL; + } + return rep; +} + +#define ICE_DEFINE_HEALTH_REPORTER_OPS(_name) \ + static const struct devlink_health_reporter_ops ice_ ## _name ## _reporter_ops = { \ + .name = #_name, \ + .dump = ice_ ## _name ## _reporter_dump, \ +} + +ICE_DEFINE_HEALTH_REPORTER_OPS(tx_hang); + +/** + * ice_health_init - allocate and init all ice devlink health reporters and + * accompanied data + * + * @pf: PF struct + */ +void ice_health_init(struct ice_pf *pf) +{ + struct ice_health *reps = &pf->health_reporters; + + reps->tx_hang = ice_init_devlink_rep(pf, &ice_tx_hang_reporter_ops); +} + +/** + * ice_deinit_devl_reporter - destroy given devlink health reporter + * @reporter: reporter to destroy + */ +static void ice_deinit_devl_reporter(struct devlink_health_reporter *reporter) +{ + if (reporter) + devl_health_reporter_destroy(reporter); +} + +/** + * ice_health_deinit - deallocate all ice devlink health reporters and + * accompanied data + * + * @pf: PF struct + */ +void ice_health_deinit(struct ice_pf *pf) +{ + ice_deinit_devl_reporter(pf->health_reporters.tx_hang); +} + +static +void ice_health_assign_healthy_state(struct devlink_health_reporter *reporter) +{ + if (reporter) + devlink_health_reporter_state_update(reporter, + DEVLINK_HEALTH_REPORTER_STATE_HEALTHY); +} + +/** + * ice_health_clear - clear devlink health issues after a reset + * @pf: the PF device structure + * + * Mark the PF in healthy state again after a reset has completed. + */ +void ice_health_clear(struct ice_pf *pf) +{ + ice_health_assign_healthy_state(pf->health_reporters.tx_hang); +} diff --git a/drivers/net/ethernet/intel/ice/devlink/health.h b/drivers/net/ethernet/intel/ice/devlink/health.h new file mode 100644 index 0000000000000..5ce601227acbe --- /dev/null +++ b/drivers/net/ethernet/intel/ice/devlink/health.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2024, Intel Corporation. */ + +#ifndef _HEALTH_H_ +#define _HEALTH_H_ + +#include + +/** + * DOC: health.h + * + * This header file stores everything that is needed for broadly understood + * devlink health mechanism for ice driver. + */ + +struct ice_pf; +struct ice_tx_ring; + +/** + * struct ice_health - stores ice devlink health reporters and accompanied data + * @tx_hang: devlink health reporter for tx_hang event + * @tx_hang_buf: pre-allocated place to put info for Tx hang reporter from + * non-sleeping context + * @tx_ring: ring that the hang occurred on + * @head: descriptor head + * @intr: interrupt register value + * @vsi_num: VSI owning the queue that the hang occurred on + */ +struct ice_health { + struct devlink_health_reporter *tx_hang; + struct_group_tagged(ice_health_tx_hang_buf, tx_hang_buf, + struct ice_tx_ring *tx_ring; + u32 head; + u32 intr; + u16 vsi_num; + ); +}; + +void ice_health_init(struct ice_pf *pf); +void ice_health_deinit(struct ice_pf *pf); +void ice_health_clear(struct ice_pf *pf); + +void ice_prep_tx_hang_report(struct ice_pf *pf, struct ice_tx_ring *tx_ring, + u16 vsi_num, u32 head, u32 intr); +void ice_report_tx_hang(struct ice_pf *pf); + +#endif /* _HEALTH_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index 2f5d6f974185a..71e05d30f0fda 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -78,6 +78,7 @@ #include "ice_irq.h" #include "ice_dpll.h" #include "ice_adapter.h" +#include "devlink/health.h" #define ICE_BAR0 0 #define ICE_REQ_DESC_MULTIPLE 32 @@ -665,6 +666,7 @@ struct ice_pf { struct ice_agg_node vf_agg_node[ICE_MAX_VF_AGG_NODES]; struct ice_dplls dplls; struct device *hwmon_dev; + struct ice_health health_reporters; u8 num_quanta_prof_used; }; diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index d641dd8b81846..316f5109bd3f6 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -2364,9 +2364,11 @@ static void ice_service_task(struct work_struct *work) struct ice_pf *pf = container_of(work, struct ice_pf, serv_task); unsigned long start_time = jiffies; - /* subtasks */ + if (pf->health_reporters.tx_hang_buf.tx_ring) { + ice_report_tx_hang(pf); + pf->health_reporters.tx_hang_buf.tx_ring = NULL; + } - /* process reset requests first */ ice_reset_subtask(pf); /* bail if a reset/recovery cycle is pending or rebuild failed */ @@ -5087,6 +5089,7 @@ static int ice_init_devlink(struct ice_pf *pf) return err; ice_devlink_init_regions(pf); + ice_health_init(pf); ice_devlink_register(pf); return 0; @@ -5095,6 +5098,7 @@ static int ice_init_devlink(struct ice_pf *pf) static void ice_deinit_devlink(struct ice_pf *pf) { ice_devlink_unregister(pf); + ice_health_deinit(pf); ice_devlink_destroy_regions(pf); ice_devlink_unregister_params(pf); } @@ -7793,6 +7797,8 @@ static void ice_rebuild(struct ice_pf *pf, enum ice_reset_req reset_type) /* if we get here, reset flow is successful */ clear_bit(ICE_RESET_FAILED, pf->state); + ice_health_clear(pf); + ice_plug_aux_dev(pf); if (ice_is_feature_supported(pf, ICE_F_SRIOV_LAG)) ice_lag_rebuild(pf); @@ -8283,16 +8289,18 @@ void ice_tx_timeout(struct net_device *netdev, unsigned int txqueue) if (tx_ring) { struct ice_hw *hw = &pf->hw; - u32 head, val = 0; + u32 head, intr = 0; head = FIELD_GET(QTX_COMM_HEAD_HEAD_M, rd32(hw, QTX_COMM_HEAD(vsi->txq_map[txqueue]))); /* Read interrupt register */ - val = rd32(hw, GLINT_DYN_CTL(tx_ring->q_vector->reg_idx)); + intr = rd32(hw, GLINT_DYN_CTL(tx_ring->q_vector->reg_idx)); netdev_info(netdev, "tx_timeout: VSI_num: %d, Q %u, NTC: 0x%x, HW_HEAD: 0x%x, NTU: 0x%x, INT: 0x%x\n", vsi->vsi_num, txqueue, tx_ring->next_to_clean, - head, tx_ring->next_to_use, val); + head, tx_ring->next_to_use, intr); + + ice_prep_tx_hang_report(pf, tx_ring, vsi->vsi_num, head, intr); } pf->tx_timeout_last_recovery = jiffies; From bc1027473986dbbd93f9eb41de33307f9abe1319 Mon Sep 17 00:00:00 2001 From: Ben Shelton Date: Mon, 16 Dec 2024 15:15:35 +0100 Subject: [PATCH 0656/1450] ice: Add MDD logging via devlink health Add a devlink health reporter for MDD events. The 'dump' handler will return the information captured in each call to ice_handle_mdd_event(). A device reset (CORER/PFR) will put the reporter back in healthy state. Signed-off-by: Ben Shelton Reviewed-by: Igor Bagnucki Reviewed-by: Wojciech Drewek Reviewed-by: Simon Horman Signed-off-by: Mateusz Polchlopek Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Co-developed-by: Przemek Kitszel Signed-off-by: Przemek Kitszel Signed-off-by: Tony Nguyen --- .../net/ethernet/intel/ice/devlink/health.c | 77 +++++++++++++++++++ .../net/ethernet/intel/ice/devlink/health.h | 11 +++ drivers/net/ethernet/intel/ice/ice_main.c | 6 ++ 3 files changed, 94 insertions(+) diff --git a/drivers/net/ethernet/intel/ice/devlink/health.c b/drivers/net/ethernet/intel/ice/devlink/health.c index 984d910fc41db..d23ae3aafaa7c 100644 --- a/drivers/net/ethernet/intel/ice/devlink/health.c +++ b/drivers/net/ethernet/intel/ice/devlink/health.c @@ -26,6 +26,79 @@ static void ice_devlink_health_report(struct devlink_health_reporter *reporter, devlink_health_report(reporter, msg, priv_ctx); } +struct ice_mdd_event { + enum ice_mdd_src src; + u16 vf_num; + u16 queue; + u8 pf_num; + u8 event; +}; + +static const char *ice_mdd_src_to_str(enum ice_mdd_src src) +{ + switch (src) { + case ICE_MDD_SRC_TX_PQM: + return "tx_pqm"; + case ICE_MDD_SRC_TX_TCLAN: + return "tx_tclan"; + case ICE_MDD_SRC_TX_TDPU: + return "tx_tdpu"; + case ICE_MDD_SRC_RX: + return "rx"; + default: + return "invalid"; + } +} + +static int +ice_mdd_reporter_dump(struct devlink_health_reporter *reporter, + struct devlink_fmsg *fmsg, void *priv_ctx, + struct netlink_ext_ack *extack) +{ + struct ice_mdd_event *mdd_event = priv_ctx; + const char *src; + + if (!mdd_event) + return 0; + + src = ice_mdd_src_to_str(mdd_event->src); + + devlink_fmsg_obj_nest_start(fmsg); + devlink_fmsg_put(fmsg, "src", src); + ICE_DEVLINK_FMSG_PUT_FIELD(fmsg, mdd_event, pf_num); + ICE_DEVLINK_FMSG_PUT_FIELD(fmsg, mdd_event, vf_num); + ICE_DEVLINK_FMSG_PUT_FIELD(fmsg, mdd_event, event); + ICE_DEVLINK_FMSG_PUT_FIELD(fmsg, mdd_event, queue); + devlink_fmsg_obj_nest_end(fmsg); + + return 0; +} + +/** + * ice_report_mdd_event - Report an MDD event through devlink health + * @pf: the PF device structure + * @src: the HW block that was the source of this MDD event + * @pf_num: the pf_num on which the MDD event occurred + * @vf_num: the vf_num on which the MDD event occurred + * @event: the event type of the MDD event + * @queue: the queue on which the MDD event occurred + * + * Report an MDD event that has occurred on this PF. + */ +void ice_report_mdd_event(struct ice_pf *pf, enum ice_mdd_src src, u8 pf_num, + u16 vf_num, u8 event, u16 queue) +{ + struct ice_mdd_event ev = { + .src = src, + .pf_num = pf_num, + .vf_num = vf_num, + .event = event, + .queue = queue, + }; + + ice_devlink_health_report(pf->health_reporters.mdd, "MDD event", &ev); +} + /** * ice_fmsg_put_ptr - put hex value of pointer into fmsg * @@ -136,6 +209,7 @@ ice_init_devlink_rep(struct ice_pf *pf, .dump = ice_ ## _name ## _reporter_dump, \ } +ICE_DEFINE_HEALTH_REPORTER_OPS(mdd); ICE_DEFINE_HEALTH_REPORTER_OPS(tx_hang); /** @@ -148,6 +222,7 @@ void ice_health_init(struct ice_pf *pf) { struct ice_health *reps = &pf->health_reporters; + reps->mdd = ice_init_devlink_rep(pf, &ice_mdd_reporter_ops); reps->tx_hang = ice_init_devlink_rep(pf, &ice_tx_hang_reporter_ops); } @@ -169,6 +244,7 @@ static void ice_deinit_devl_reporter(struct devlink_health_reporter *reporter) */ void ice_health_deinit(struct ice_pf *pf) { + ice_deinit_devl_reporter(pf->health_reporters.mdd); ice_deinit_devl_reporter(pf->health_reporters.tx_hang); } @@ -188,5 +264,6 @@ void ice_health_assign_healthy_state(struct devlink_health_reporter *reporter) */ void ice_health_clear(struct ice_pf *pf) { + ice_health_assign_healthy_state(pf->health_reporters.mdd); ice_health_assign_healthy_state(pf->health_reporters.tx_hang); } diff --git a/drivers/net/ethernet/intel/ice/devlink/health.h b/drivers/net/ethernet/intel/ice/devlink/health.h index 5ce601227acbe..532277fc57d78 100644 --- a/drivers/net/ethernet/intel/ice/devlink/health.h +++ b/drivers/net/ethernet/intel/ice/devlink/health.h @@ -16,9 +16,17 @@ struct ice_pf; struct ice_tx_ring; +enum ice_mdd_src { + ICE_MDD_SRC_TX_PQM, + ICE_MDD_SRC_TX_TCLAN, + ICE_MDD_SRC_TX_TDPU, + ICE_MDD_SRC_RX, +}; + /** * struct ice_health - stores ice devlink health reporters and accompanied data * @tx_hang: devlink health reporter for tx_hang event + * @mdd: devlink health reporter for MDD detection event * @tx_hang_buf: pre-allocated place to put info for Tx hang reporter from * non-sleeping context * @tx_ring: ring that the hang occurred on @@ -27,6 +35,7 @@ struct ice_tx_ring; * @vsi_num: VSI owning the queue that the hang occurred on */ struct ice_health { + struct devlink_health_reporter *mdd; struct devlink_health_reporter *tx_hang; struct_group_tagged(ice_health_tx_hang_buf, tx_hang_buf, struct ice_tx_ring *tx_ring; @@ -42,6 +51,8 @@ void ice_health_clear(struct ice_pf *pf); void ice_prep_tx_hang_report(struct ice_pf *pf, struct ice_tx_ring *tx_ring, u16 vsi_num, u32 head, u32 intr); +void ice_report_mdd_event(struct ice_pf *pf, enum ice_mdd_src src, u8 pf_num, + u16 vf_num, u8 event, u16 queue); void ice_report_tx_hang(struct ice_pf *pf); #endif /* _HEALTH_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 316f5109bd3f6..1701f7143f243 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -1816,6 +1816,8 @@ static void ice_handle_mdd_event(struct ice_pf *pf) if (netif_msg_tx_err(pf)) dev_info(dev, "Malicious Driver Detection event %d on TX queue %d PF# %d VF# %d\n", event, queue, pf_num, vf_num); + ice_report_mdd_event(pf, ICE_MDD_SRC_TX_PQM, pf_num, vf_num, + event, queue); wr32(hw, GL_MDET_TX_PQM, 0xffffffff); } @@ -1829,6 +1831,8 @@ static void ice_handle_mdd_event(struct ice_pf *pf) if (netif_msg_tx_err(pf)) dev_info(dev, "Malicious Driver Detection event %d on TX queue %d PF# %d VF# %d\n", event, queue, pf_num, vf_num); + ice_report_mdd_event(pf, ICE_MDD_SRC_TX_TCLAN, pf_num, vf_num, + event, queue); wr32(hw, GL_MDET_TX_TCLAN_BY_MAC(hw), U32_MAX); } @@ -1842,6 +1846,8 @@ static void ice_handle_mdd_event(struct ice_pf *pf) if (netif_msg_rx_err(pf)) dev_info(dev, "Malicious Driver Detection event %d on RX queue %d PF# %d VF# %d\n", event, queue, pf_num, vf_num); + ice_report_mdd_event(pf, ICE_MDD_SRC_RX, pf_num, vf_num, event, + queue); wr32(hw, GL_MDET_RX, 0xffffffff); } From f4bf0b909a6bf64a2220a42a7c8b8c2ee1b77b89 Mon Sep 17 00:00:00 2001 From: Maksim Kiselev Date: Tue, 10 Dec 2024 11:30:27 +0300 Subject: [PATCH 0657/1450] clk: thead: Fix TH1520 emmc and shdci clock rate In accordance with LicheePi 4A BSP the clock that comes to emmc/sdhci is 198Mhz which is got through frequency division of source clock VIDEO PLL by 4 [1]. But now the AP_SUBSYS driver sets the CLK EMMC SDIO to the same frequency as the VIDEO PLL, equal to 792 MHz. This causes emmc/sdhci to work 4 times slower. Let's fix this issue by adding fixed factor clock that divides VIDEO PLL by 4 for emmc/sdhci. Link: https://github.com/revyos/thead-kernel/blob/7563179071a314f41cdcdbfd8cf6e101e73707f3/drivers/clk/thead/clk-light-fm.c#L454 Fixes: ae81b69fd2b1 ("clk: thead: Add support for T-Head TH1520 AP_SUBSYS clocks") Signed-off-by: Maksim Kiselev Link: https://lore.kernel.org/r/20241210083029.92620-1-bigunclemax@gmail.com Tested-by: Xi Ruoyao Reviewed-by: Drew Fustini Signed-off-by: Stephen Boyd --- drivers/clk/thead/clk-th1520-ap.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/clk/thead/clk-th1520-ap.c b/drivers/clk/thead/clk-th1520-ap.c index 17e32ae08720c..1015fab952515 100644 --- a/drivers/clk/thead/clk-th1520-ap.c +++ b/drivers/clk/thead/clk-th1520-ap.c @@ -779,6 +779,13 @@ static struct ccu_div dpu1_clk = { }, }; +static CLK_FIXED_FACTOR_HW(emmc_sdio_ref_clk, "emmc-sdio-ref", + &video_pll_clk.common.hw, 4, 1, 0); + +static const struct clk_parent_data emmc_sdio_ref_clk_pd[] = { + { .hw = &emmc_sdio_ref_clk.hw }, +}; + static CCU_GATE(CLK_BROM, brom_clk, "brom", ahb2_cpusys_hclk_pd, 0x100, BIT(4), 0); static CCU_GATE(CLK_BMU, bmu_clk, "bmu", axi4_cpusys2_aclk_pd, 0x100, BIT(5), 0); static CCU_GATE(CLK_AON2CPU_A2X, aon2cpu_a2x_clk, "aon2cpu-a2x", axi4_cpusys2_aclk_pd, @@ -798,7 +805,7 @@ static CCU_GATE(CLK_PERISYS_APB4_HCLK, perisys_apb4_hclk, "perisys-apb4-hclk", p 0x150, BIT(12), 0); static CCU_GATE(CLK_NPU_AXI, npu_axi_clk, "npu-axi", axi_aclk_pd, 0x1c8, BIT(5), 0); static CCU_GATE(CLK_CPU2VP, cpu2vp_clk, "cpu2vp", axi_aclk_pd, 0x1e0, BIT(13), 0); -static CCU_GATE(CLK_EMMC_SDIO, emmc_sdio_clk, "emmc-sdio", video_pll_clk_pd, 0x204, BIT(30), 0); +static CCU_GATE(CLK_EMMC_SDIO, emmc_sdio_clk, "emmc-sdio", emmc_sdio_ref_clk_pd, 0x204, BIT(30), 0); static CCU_GATE(CLK_GMAC1, gmac1_clk, "gmac1", gmac_pll_clk_pd, 0x204, BIT(26), 0); static CCU_GATE(CLK_PADCTRL1, padctrl1_clk, "padctrl1", perisys_apb_pclk_pd, 0x204, BIT(24), 0); static CCU_GATE(CLK_DSMART, dsmart_clk, "dsmart", perisys_apb_pclk_pd, 0x204, BIT(23), 0); @@ -1059,6 +1066,10 @@ static int th1520_clk_probe(struct platform_device *pdev) return ret; priv->hws[CLK_PLL_GMAC_100M] = &gmac_pll_clk_100m.hw; + ret = devm_clk_hw_register(dev, &emmc_sdio_ref_clk.hw); + if (ret) + return ret; + ret = devm_of_clk_add_hw_provider(dev, of_clk_hw_onecell_get, priv); if (ret) return ret; From 93433c1d919775f8ac0f7893692f42e6731a5373 Mon Sep 17 00:00:00 2001 From: Joshua Hay Date: Mon, 25 Nov 2024 15:58:54 -0800 Subject: [PATCH 0658/1450] idpf: add support for SW triggered interrupts SW triggered interrupts are guaranteed to fire after their timer expires, unlike Tx and Rx interrupts which will only fire after the timer expires _and_ a descriptor write back is available to be processed by the driver. Add the necessary fields, defines, and initializations to enable a SW triggered interrupt in the vector's dyn_ctl register. Reviewed-by: Madhu Chittim Signed-off-by: Joshua Hay Tested-by: Krishneil Singh Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/idpf/idpf_dev.c | 3 +++ drivers/net/ethernet/intel/idpf/idpf_txrx.h | 8 +++++++- drivers/net/ethernet/intel/idpf/idpf_vf_dev.c | 3 +++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_dev.c b/drivers/net/ethernet/intel/idpf/idpf_dev.c index 6c913a703df64..41e4bd49402a8 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_dev.c +++ b/drivers/net/ethernet/intel/idpf/idpf_dev.c @@ -101,6 +101,9 @@ static int idpf_intr_reg_init(struct idpf_vport *vport) intr->dyn_ctl_itridx_s = PF_GLINT_DYN_CTL_ITR_INDX_S; intr->dyn_ctl_intrvl_s = PF_GLINT_DYN_CTL_INTERVAL_S; intr->dyn_ctl_wb_on_itr_m = PF_GLINT_DYN_CTL_WB_ON_ITR_M; + intr->dyn_ctl_swint_trig_m = PF_GLINT_DYN_CTL_SWINT_TRIG_M; + intr->dyn_ctl_sw_itridx_ena_m = + PF_GLINT_DYN_CTL_SW_ITR_INDX_ENA_M; spacing = IDPF_ITR_IDX_SPACING(reg_vals[vec_id].itrn_index_spacing, IDPF_PF_ITR_IDX_SPACING); diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index 9c1fe84108ed2..0f71a6f5557b9 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -354,6 +354,8 @@ struct idpf_vec_regs { * @dyn_ctl_itridx_m: Mask for ITR index * @dyn_ctl_intrvl_s: Register bit offset for ITR interval * @dyn_ctl_wb_on_itr_m: Mask for WB on ITR feature + * @dyn_ctl_sw_itridx_ena_m: Mask for SW ITR index + * @dyn_ctl_swint_trig_m: Mask for dyn_ctl SW triggered interrupt enable * @rx_itr: RX ITR register * @tx_itr: TX ITR register * @icr_ena: Interrupt cause register offset @@ -367,6 +369,8 @@ struct idpf_intr_reg { u32 dyn_ctl_itridx_m; u32 dyn_ctl_intrvl_s; u32 dyn_ctl_wb_on_itr_m; + u32 dyn_ctl_sw_itridx_ena_m; + u32 dyn_ctl_swint_trig_m; void __iomem *rx_itr; void __iomem *tx_itr; void __iomem *icr_ena; @@ -437,7 +441,7 @@ struct idpf_q_vector { cpumask_var_t affinity_mask; __cacheline_group_end_aligned(cold); }; -libeth_cacheline_set_assert(struct idpf_q_vector, 112, +libeth_cacheline_set_assert(struct idpf_q_vector, 120, 24 + sizeof(struct napi_struct) + 2 * sizeof(struct dim), 8 + sizeof(cpumask_var_t)); @@ -471,6 +475,8 @@ struct idpf_tx_queue_stats { #define IDPF_ITR_IS_DYNAMIC(itr_mode) (itr_mode) #define IDPF_ITR_TX_DEF IDPF_ITR_20K #define IDPF_ITR_RX_DEF IDPF_ITR_20K +/* Index used for 'SW ITR' update in DYN_CTL register */ +#define IDPF_SW_ITR_UPDATE_IDX 2 /* Index used for 'No ITR' update in DYN_CTL register */ #define IDPF_NO_ITR_UPDATE_IDX 3 #define IDPF_ITR_IDX_SPACING(spacing, dflt) (spacing ? spacing : dflt) diff --git a/drivers/net/ethernet/intel/idpf/idpf_vf_dev.c b/drivers/net/ethernet/intel/idpf/idpf_vf_dev.c index aad62e270ae40..aba828abcb171 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_vf_dev.c +++ b/drivers/net/ethernet/intel/idpf/idpf_vf_dev.c @@ -101,6 +101,9 @@ static int idpf_vf_intr_reg_init(struct idpf_vport *vport) intr->dyn_ctl_itridx_s = VF_INT_DYN_CTLN_ITR_INDX_S; intr->dyn_ctl_intrvl_s = VF_INT_DYN_CTLN_INTERVAL_S; intr->dyn_ctl_wb_on_itr_m = VF_INT_DYN_CTLN_WB_ON_ITR_M; + intr->dyn_ctl_swint_trig_m = VF_INT_DYN_CTLN_SWINT_TRIG_M; + intr->dyn_ctl_sw_itridx_ena_m = + VF_INT_DYN_CTLN_SW_ITR_INDX_ENA_M; spacing = IDPF_ITR_IDX_SPACING(reg_vals[vec_id].itrn_index_spacing, IDPF_VF_ITR_IDX_SPACING); From 9048cf05a17a7bc26f0b8e2e53750b1237303970 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 17 Dec 2024 16:18:12 -0500 Subject: [PATCH 0659/1450] NFSD: fix management of pending async copies Currently the pending_async_copies count is decremented just before a struct nfsd4_copy is destroyed. After commit aa0ebd21df9c ("NFSD: Add nfsd4_copy time-to-live") nfsd4_copy structures sticks around for 10 lease periods after the COPY itself has completed, the pending_async_copies count stays high for a long time. This causes NFSD to avoid the use of background copy even though the actual background copy workload might no longer be running. In this patch, decrement pending_async_copies once async copy thread is done processing the copy work. Fixes: aa0ebd21df9c ("NFSD: Add nfsd4_copy time-to-live") Signed-off-by: Olga Kornievskaia Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index f8a10f90bc7a4..ad44ad49274f0 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1347,7 +1347,6 @@ static void nfs4_put_copy(struct nfsd4_copy *copy) { if (!refcount_dec_and_test(©->refcount)) return; - atomic_dec(©->cp_nn->pending_async_copies); kfree(copy->cp_src); kfree(copy); } @@ -1870,6 +1869,7 @@ static int nfsd4_do_async_copy(void *data) set_bit(NFSD4_COPY_F_COMPLETED, ©->cp_flags); trace_nfsd_copy_async_done(copy); nfsd4_send_cb_offload(copy); + atomic_dec(©->cp_nn->pending_async_copies); return 0; } @@ -1927,19 +1927,19 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* Arbitrary cap on number of pending async copy operations */ if (atomic_inc_return(&nn->pending_async_copies) > (int)rqstp->rq_pool->sp_nrthreads) - goto out_err; + goto out_dec_async_copy_err; async_copy->cp_src = kmalloc(sizeof(*async_copy->cp_src), GFP_KERNEL); if (!async_copy->cp_src) - goto out_err; + goto out_dec_async_copy_err; if (!nfs4_init_copy_state(nn, copy)) - goto out_err; + goto out_dec_async_copy_err; memcpy(&result->cb_stateid, ©->cp_stateid.cs_stid, sizeof(result->cb_stateid)); dup_copy_fields(copy, async_copy); async_copy->copy_task = kthread_create(nfsd4_do_async_copy, async_copy, "%s", "copy thread"); if (IS_ERR(async_copy->copy_task)) - goto out_err; + goto out_dec_async_copy_err; spin_lock(&async_copy->cp_clp->async_lock); list_add(&async_copy->copies, &async_copy->cp_clp->async_copies); @@ -1954,6 +1954,9 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, trace_nfsd_copy_done(copy, status); release_copy_files(copy); return status; +out_dec_async_copy_err: + if (async_copy) + atomic_dec(&nn->pending_async_copies); out_err: if (nfsd4_ssc_is_inter(copy)) { /* From 0c1683c681681c14f4389e3bfa8de10baf242ba8 Mon Sep 17 00:00:00 2001 From: Joshua Hay Date: Mon, 25 Nov 2024 15:58:55 -0800 Subject: [PATCH 0660/1450] idpf: trigger SW interrupt when exiting wb_on_itr mode There is a race condition between exiting wb_on_itr and completion write backs. For example, we are in wb_on_itr mode and a Tx completion is generated by HW, ready to be written back, as we are re-enabling interrupts: HW SW | | | | idpf_tx_splitq_clean_all | | napi_complete_done | | | tx_completion_wb | idpf_vport_intr_update_itr_ena_irq That tx_completion_wb happens before the vector is fully re-enabled. Continuing with this example, it is a UDP stream and the tx_completion_wb is the last one in the flow (there are no rx packets). Because the HW generated the completion before the interrupt is fully enabled, the HW will not fire the interrupt once the timer expires and the write back will not happen. NAPI poll won't be called. We have indicated we're back in interrupt mode but nothing else will trigger the interrupt. Therefore, the completion goes unprocessed, triggering a Tx timeout. To mitigate this, fire a SW triggered interrupt upon exiting wb_on_itr. This interrupt will catch the rogue completion and avoid the timeout. Add logic to set the appropriate bits in the vector's dyn_ctl register. Fixes: 9c4a27da0ecc ("idpf: enable WB_ON_ITR") Reviewed-by: Madhu Chittim Signed-off-by: Joshua Hay Tested-by: Krishneil Singh Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 29 ++++++++++++++------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 34f4118c7bc0d..2fa9c36e33c9c 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -3604,21 +3604,31 @@ static void idpf_vport_intr_dis_irq_all(struct idpf_vport *vport) /** * idpf_vport_intr_buildreg_itr - Enable default interrupt generation settings * @q_vector: pointer to q_vector - * @type: itr index - * @itr: itr value */ -static u32 idpf_vport_intr_buildreg_itr(struct idpf_q_vector *q_vector, - const int type, u16 itr) +static u32 idpf_vport_intr_buildreg_itr(struct idpf_q_vector *q_vector) { - u32 itr_val; + u32 itr_val = q_vector->intr_reg.dyn_ctl_intena_m; + int type = IDPF_NO_ITR_UPDATE_IDX; + u16 itr = 0; + + if (q_vector->wb_on_itr) { + /* + * Trigger a software interrupt when exiting wb_on_itr, to make + * sure we catch any pending write backs that might have been + * missed due to interrupt state transition. + */ + itr_val |= q_vector->intr_reg.dyn_ctl_swint_trig_m | + q_vector->intr_reg.dyn_ctl_sw_itridx_ena_m; + type = IDPF_SW_ITR_UPDATE_IDX; + itr = IDPF_ITR_20K; + } itr &= IDPF_ITR_MASK; /* Don't clear PBA because that can cause lost interrupts that * came in while we were cleaning/polling */ - itr_val = q_vector->intr_reg.dyn_ctl_intena_m | - (type << q_vector->intr_reg.dyn_ctl_itridx_s) | - (itr << (q_vector->intr_reg.dyn_ctl_intrvl_s - 1)); + itr_val |= (type << q_vector->intr_reg.dyn_ctl_itridx_s) | + (itr << (q_vector->intr_reg.dyn_ctl_intrvl_s - 1)); return itr_val; } @@ -3716,9 +3726,8 @@ void idpf_vport_intr_update_itr_ena_irq(struct idpf_q_vector *q_vector) /* net_dim() updates ITR out-of-band using a work item */ idpf_net_dim(q_vector); + intval = idpf_vport_intr_buildreg_itr(q_vector); q_vector->wb_on_itr = false; - intval = idpf_vport_intr_buildreg_itr(q_vector, - IDPF_NO_ITR_UPDATE_IDX, 0); writel(intval, q_vector->intr_reg.dyn_ctl); } From aef25be35d23ec768eed08bfcf7ca3cf9685bc28 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 21 Nov 2024 11:22:18 -0700 Subject: [PATCH 0661/1450] hexagon: Disable constant extender optimization for LLVM prior to 19.1.0 The Hexagon-specific constant extender optimization in LLVM may crash on Linux kernel code [1], such as fs/bcache/btree_io.c after commit 32ed4a620c54 ("bcachefs: Btree path tracepoints") in 6.12: clang: llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp:745: bool (anonymous namespace)::HexagonConstExtenders::ExtRoot::operator<(const HCE::ExtRoot &) const: Assertion `ThisB->getParent() == OtherB->getParent()' failed. Stack dump: 0. Program arguments: clang --target=hexagon-linux-musl ... fs/bcachefs/btree_io.c 1. parser at end of file 2. Code generation 3. Running pass 'Function Pass Manager' on module 'fs/bcachefs/btree_io.c'. 4. Running pass 'Hexagon constant-extender optimization' on function '@__btree_node_lock_nopath' Without assertions enabled, there is just a hang during compilation. This has been resolved in LLVM main (20.0.0) [2] and backported to LLVM 19.1.0 but the kernel supports LLVM 13.0.1 and newer, so disable the constant expander optimization using the '-mllvm' option when using a toolchain that is not fixed. Cc: stable@vger.kernel.org Link: https://github.com/llvm/llvm-project/issues/99714 [1] Link: https://github.com/llvm/llvm-project/commit/68df06a0b2998765cb0a41353fcf0919bbf57ddb [2] Link: https://github.com/llvm/llvm-project/commit/2ab8d93061581edad3501561722ebd5632d73892 [3] Reviewed-by: Brian Cain Signed-off-by: Nathan Chancellor Signed-off-by: Linus Torvalds --- arch/hexagon/Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/hexagon/Makefile b/arch/hexagon/Makefile index 92d005958dfb2..ff172cbe5881a 100644 --- a/arch/hexagon/Makefile +++ b/arch/hexagon/Makefile @@ -32,3 +32,9 @@ KBUILD_LDFLAGS += $(ldflags-y) TIR_NAME := r19 KBUILD_CFLAGS += -ffixed-$(TIR_NAME) -DTHREADINFO_REG=$(TIR_NAME) -D__linux__ KBUILD_AFLAGS += -DTHREADINFO_REG=$(TIR_NAME) + +# Disable HexagonConstExtenders pass for LLVM versions prior to 19.1.0 +# https://github.com/llvm/llvm-project/issues/99714 +ifneq ($(call clang-min-version, 190100),y) +KBUILD_CFLAGS += -mllvm -hexagon-cext=false +endif From 23579010cf0a12476e96a5f1acdf78a9c5843657 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Tue, 17 Dec 2024 20:58:13 +0100 Subject: [PATCH 0662/1450] bpf: Fix bpf_get_smp_processor_id() on !CONFIG_SMP On x86-64 calling bpf_get_smp_processor_id() in a kernel with CONFIG_SMP disabled can trigger the following bug, as pcpu_hot is unavailable: [ 8.471774] BUG: unable to handle page fault for address: 00000000936a290c [ 8.471849] #PF: supervisor read access in kernel mode [ 8.471881] #PF: error_code(0x0000) - not-present page Fix by inlining a return 0 in the !CONFIG_SMP case. Fixes: 1ae6921009e5 ("bpf: inline bpf_get_smp_processor_id() helper") Signed-off-by: Andrea Righi Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241217195813.622568-1-arighi@nvidia.com --- kernel/bpf/verifier.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f7f892a52a374..77f56674aaa99 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21281,11 +21281,15 @@ static int do_misc_fixups(struct bpf_verifier_env *env) * changed in some incompatible and hard to support * way, it's fine to back out this inlining logic */ +#ifdef CONFIG_SMP insn_buf[0] = BPF_MOV32_IMM(BPF_REG_0, (u32)(unsigned long)&pcpu_hot.cpu_number); insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); insn_buf[2] = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0); cnt = 3; - +#else + insn_buf[0] = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0); + cnt = 1; +#endif new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; From d3c9510dc900e9ff3ea330189c0465c9f00fba18 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 15 Dec 2024 13:29:38 -0800 Subject: [PATCH 0663/1450] net: page_pool: rename page_pool_is_last_ref() page_pool_is_last_ref() releases a reference while the name, to me at least, suggests it just checks if the refcount is 1. The semantics of the function are the same as those of atomic_dec_and_test() and refcount_dec_and_test(), so just use the _and_test() suffix. Reviewed-by: Alexander Lobakin Reviewed-by: Ilias Apalodimas Acked-by: Jesper Dangaard Brouer Link: https://patch.msgid.link/20241215212938.99210-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/page_pool/helpers.h | 4 ++-- net/core/page_pool.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index e555921e52339..776a3008ac28c 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -306,7 +306,7 @@ static inline void page_pool_ref_page(struct page *page) page_pool_ref_netmem(page_to_netmem(page)); } -static inline bool page_pool_is_last_ref(netmem_ref netmem) +static inline bool page_pool_unref_and_test(netmem_ref netmem) { /* If page_pool_unref_page() returns 0, we were the last user */ return page_pool_unref_netmem(netmem, 1) == 0; @@ -321,7 +321,7 @@ static inline void page_pool_put_netmem(struct page_pool *pool, * allow registering MEM_TYPE_PAGE_POOL, but shield linker. */ #ifdef CONFIG_PAGE_POOL - if (!page_pool_is_last_ref(netmem)) + if (!page_pool_unref_and_test(netmem)) return; page_pool_put_unrefed_netmem(pool, netmem, dma_sync_size, allow_direct); diff --git a/net/core/page_pool.c b/net/core/page_pool.c index e07ad73159550..9733206d6406d 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -897,7 +897,7 @@ void page_pool_put_netmem_bulk(netmem_ref *data, u32 count) for (u32 i = 0; i < count; i++) { netmem_ref netmem = netmem_compound_head(data[i]); - if (page_pool_is_last_ref(netmem)) + if (page_pool_unref_and_test(netmem)) data[bulk_len++] = netmem; } From b9b8301d369b4c876de5255dbf067b19ba88ac71 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Dec 2024 08:37:03 +0000 Subject: [PATCH 0664/1450] net: netdevsim: fix nsim_pp_hold_write() nsim_pp_hold_write() has two problems: 1) It may return with rtnl held, as found by syzbot. 2) Its return value does not propagate an error if any. Fixes: 1580cbcbfe77 ("net: netdevsim: add some fake page pool use") Reported-by: syzbot Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241216083703.1859921-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/netdevsim/netdev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c index 0be47fed4efc5..e068a9761c094 100644 --- a/drivers/net/netdevsim/netdev.c +++ b/drivers/net/netdevsim/netdev.c @@ -635,10 +635,10 @@ nsim_pp_hold_write(struct file *file, const char __user *data, page_pool_put_full_page(ns->page->pp, ns->page, false); ns->page = NULL; } - rtnl_unlock(); exit: - return count; + rtnl_unlock(); + return ret; } static const struct file_operations nsim_pp_hold_fops = { From 954a2b40719a21e763a1bba2f0da92347e058fce Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 16 Dec 2024 20:04:32 +0900 Subject: [PATCH 0665/1450] rtnetlink: Try the outer netns attribute in rtnl_get_peer_net(). Xiao Liang reported that the cited commit changed netns handling in newlink() of netkit, veth, and vxcan. Before the patch, if we don't find a netns attribute in the peer device attributes, we tried to find another netns attribute in the outer netlink attributes by passing it to rtnl_link_get_net(). Let's restore the original behaviour. Fixes: 48327566769a ("rtnetlink: fix double call of rtnl_link_get_net_ifla()") Reported-by: Xiao Liang Closes: https://lore.kernel.org/netdev/CABAhCORBVVU8P6AHcEkENMj+gD2d3ce9t=A_o48E0yOQp8_wUQ@mail.gmail.com/#t Signed-off-by: Kuniyuki Iwashima Tested-by: Xiao Liang Link: https://patch.msgid.link/20241216110432.51488-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index ebcfc2debf1a3..d9f959c619d95 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3819,6 +3819,7 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, } static struct net *rtnl_get_peer_net(const struct rtnl_link_ops *ops, + struct nlattr *tbp[], struct nlattr *data[], struct netlink_ext_ack *extack) { @@ -3826,7 +3827,7 @@ static struct net *rtnl_get_peer_net(const struct rtnl_link_ops *ops, int err; if (!data || !data[ops->peer_type]) - return NULL; + return rtnl_link_get_net_ifla(tbp); err = rtnl_nla_parse_ifinfomsg(tb, data[ops->peer_type], extack); if (err < 0) @@ -3971,7 +3972,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, } if (ops->peer_type) { - peer_net = rtnl_get_peer_net(ops, data, extack); + peer_net = rtnl_get_peer_net(ops, tb, data, extack); if (IS_ERR(peer_net)) { ret = PTR_ERR(peer_net); goto put_ops; From a2558b410de3b0b6c38222ac4858188a55bc52ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 16 Dec 2024 12:30:08 +0100 Subject: [PATCH 0666/1450] net: bridge: constify 'struct bin_attribute' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sysfs core now allows instances of 'struct bin_attribute' to be moved into read-only memory. Make use of that to protect them against accidental or malicious modifications. Signed-off-by: Thomas Weißschuh Reviewed-by: Simon Horman Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20241216-sysfs-const-bin_attr-net-v1-1-ec460b91f274@weissschuh.net Signed-off-by: Jakub Kicinski --- net/bridge/br_sysfs_br.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index ea733542244c7..c1176a5e02c43 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -1002,7 +1002,7 @@ static const struct attribute_group bridge_group = { * Returns the number of bytes read. */ static ssize_t brforward_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, + const struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { struct device *dev = kobj_to_dev(kobj); @@ -1023,10 +1023,10 @@ static ssize_t brforward_read(struct file *filp, struct kobject *kobj, return n; } -static struct bin_attribute bridge_forward = { +static const struct bin_attribute bridge_forward = { .attr = { .name = SYSFS_BRIDGE_FDB, .mode = 0444, }, - .read = brforward_read, + .read_new = brforward_read, }; /* From 2d7b422fa7952e3f15fc0912b12530af1d265193 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 16 Dec 2024 12:30:09 +0100 Subject: [PATCH 0667/1450] net: phy: ks8995: constify 'struct bin_attribute' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sysfs core now allows instances of 'struct bin_attribute' to be moved into read-only memory. Make use of that to protect them against accidental or malicious modifications. Signed-off-by: Thomas Weißschuh Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241216-sysfs-const-bin_attr-net-v1-2-ec460b91f274@weissschuh.net Signed-off-by: Jakub Kicinski --- drivers/net/phy/spi_ks8995.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/phy/spi_ks8995.c b/drivers/net/phy/spi_ks8995.c index 7196e927c2cd8..076a370be849e 100644 --- a/drivers/net/phy/spi_ks8995.c +++ b/drivers/net/phy/spi_ks8995.c @@ -289,7 +289,7 @@ static int ks8995_reset(struct ks8995_switch *ks) } static ssize_t ks8995_registers_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) + const struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { struct device *dev; struct ks8995_switch *ks8995; @@ -301,7 +301,7 @@ static ssize_t ks8995_registers_read(struct file *filp, struct kobject *kobj, } static ssize_t ks8995_registers_write(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) + const struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { struct device *dev; struct ks8995_switch *ks8995; @@ -401,8 +401,8 @@ static const struct bin_attribute ks8995_registers_attr = { .mode = 0600, }, .size = KS8995_REGS_SIZE, - .read = ks8995_registers_read, - .write = ks8995_registers_write, + .read_new = ks8995_registers_read, + .write_new = ks8995_registers_write, }; /* ------------------------------------------------------------------------ */ From ae026eae08e7a0a118abc31192041e49bcda3a8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 16 Dec 2024 12:30:11 +0100 Subject: [PATCH 0668/1450] netxen_nic: constify 'struct bin_attribute' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sysfs core now allows instances of 'struct bin_attribute' to be moved into read-only memory. Make use of that to protect them against accidental or malicious modifications. Signed-off-by: Thomas Weißschuh Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241216-sysfs-const-bin_attr-net-v1-4-ec460b91f274@weissschuh.net Signed-off-by: Jakub Kicinski --- .../ethernet/qlogic/netxen/netxen_nic_main.c | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c index 9cff0a8ffb2c5..3383ee1dad14e 100644 --- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c +++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c @@ -2832,7 +2832,7 @@ netxen_sysfs_validate_crb(struct netxen_adapter *adapter, static ssize_t netxen_sysfs_read_crb(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, + const struct bin_attribute *attr, char *buf, loff_t offset, size_t size) { struct device *dev = kobj_to_dev(kobj); @@ -2860,7 +2860,7 @@ netxen_sysfs_read_crb(struct file *filp, struct kobject *kobj, static ssize_t netxen_sysfs_write_crb(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, + const struct bin_attribute *attr, char *buf, loff_t offset, size_t size) { struct device *dev = kobj_to_dev(kobj); @@ -2901,7 +2901,7 @@ netxen_sysfs_validate_mem(struct netxen_adapter *adapter, static ssize_t netxen_sysfs_read_mem(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, + const struct bin_attribute *attr, char *buf, loff_t offset, size_t size) { struct device *dev = kobj_to_dev(kobj); @@ -2922,7 +2922,7 @@ netxen_sysfs_read_mem(struct file *filp, struct kobject *kobj, } static ssize_t netxen_sysfs_write_mem(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, char *buf, + const struct bin_attribute *attr, char *buf, loff_t offset, size_t size) { struct device *dev = kobj_to_dev(kobj); @@ -2946,20 +2946,20 @@ static ssize_t netxen_sysfs_write_mem(struct file *filp, struct kobject *kobj, static const struct bin_attribute bin_attr_crb = { .attr = { .name = "crb", .mode = 0644 }, .size = 0, - .read = netxen_sysfs_read_crb, - .write = netxen_sysfs_write_crb, + .read_new = netxen_sysfs_read_crb, + .write_new = netxen_sysfs_write_crb, }; static const struct bin_attribute bin_attr_mem = { .attr = { .name = "mem", .mode = 0644 }, .size = 0, - .read = netxen_sysfs_read_mem, - .write = netxen_sysfs_write_mem, + .read_new = netxen_sysfs_read_mem, + .write_new = netxen_sysfs_write_mem, }; static ssize_t netxen_sysfs_read_dimm(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, + const struct bin_attribute *attr, char *buf, loff_t offset, size_t size) { struct device *dev = kobj_to_dev(kobj); @@ -3082,7 +3082,7 @@ netxen_sysfs_read_dimm(struct file *filp, struct kobject *kobj, static const struct bin_attribute bin_attr_dimm = { .attr = { .name = "dimm", .mode = 0644 }, .size = sizeof(struct netxen_dimm_cfg), - .read = netxen_sysfs_read_dimm, + .read_new = netxen_sysfs_read_dimm, }; From 661cd8fc8e9039819ca0c22e0add52b632240a9e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 15 Dec 2024 17:56:26 +0000 Subject: [PATCH 0669/1450] inetpeer: remove create argument of inet_getpeer_v[46]() All callers of inet_getpeer_v4() and inet_getpeer_v6() want to create an inetpeer. Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20241215175629.1248773-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/inetpeer.h | 9 ++++----- net/ipv4/icmp.c | 2 +- net/ipv4/ip_fragment.c | 2 +- net/ipv4/route.c | 4 ++-- net/ipv6/icmp.c | 2 +- net/ipv6/ip6_output.c | 2 +- net/ipv6/ndisc.c | 2 +- 7 files changed, 11 insertions(+), 12 deletions(-) diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index 74ff688568a0c..6f51f81d6cb19 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -101,25 +101,24 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base, static inline struct inet_peer *inet_getpeer_v4(struct inet_peer_base *base, __be32 v4daddr, - int vif, int create) + int vif) { struct inetpeer_addr daddr; daddr.a4.addr = v4daddr; daddr.a4.vif = vif; daddr.family = AF_INET; - return inet_getpeer(base, &daddr, create); + return inet_getpeer(base, &daddr, 1); } static inline struct inet_peer *inet_getpeer_v6(struct inet_peer_base *base, - const struct in6_addr *v6daddr, - int create) + const struct in6_addr *v6daddr) { struct inetpeer_addr daddr; daddr.a6 = *v6daddr; daddr.family = AF_INET6; - return inet_getpeer(base, &daddr, create); + return inet_getpeer(base, &daddr, 1); } static inline int inetpeer_addr_cmp(const struct inetpeer_addr *a, diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 963a89ae9c26e..5eeb9f569a706 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -322,7 +322,7 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, goto out; vif = l3mdev_master_ifindex(dst->dev); - peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1); + peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif); rc = inet_peer_xrlim_allow(peer, READ_ONCE(net->ipv4.sysctl_icmp_ratelimit)); if (peer) diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 07036a2943c19..46e1171299f22 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -89,7 +89,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a) q->key.v4 = *key; qp->ecn = 0; qp->peer = q->fqdir->max_dist ? - inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) : + inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif) : NULL; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 0fbec35096186..297a9939c6e74 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -873,7 +873,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) rcu_read_unlock(); net = dev_net(rt->dst.dev); - peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1); + peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif); if (!peer) { icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt_nexthop(rt, ip_hdr(skb)->daddr)); @@ -976,7 +976,7 @@ static int ip_error(struct sk_buff *skb) } peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, - l3mdev_master_ifindex(skb->dev), 1); + l3mdev_master_ifindex(skb->dev)); send = true; if (peer) { diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 071b0bc1179d8..4593e3992c67b 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -222,7 +222,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, if (rt->rt6i_dst.plen < 128) tmo >>= ((128 - rt->rt6i_dst.plen)>>5); - peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr, 1); + peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr); res = inet_peer_xrlim_allow(peer, tmo); if (peer) inet_putpeer(peer); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 993106876604b..cdcbb3b6c5da3 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -613,7 +613,7 @@ int ip6_forward(struct sk_buff *skb) else target = &hdr->daddr; - peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1); + peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr); /* Limit redirects both by destination (here) and by source (inside ndisc_send_redirect) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index aba94a3486737..f113554d13325 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1731,7 +1731,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) "Redirect: destination is not a neighbour\n"); goto release; } - peer = inet_getpeer_v6(net->ipv6.peers, &ipv6_hdr(skb)->saddr, 1); + peer = inet_getpeer_v6(net->ipv6.peers, &ipv6_hdr(skb)->saddr); ret = inet_peer_xrlim_allow(peer, 1*HZ); if (peer) inet_putpeer(peer); From 7a596a50c4a4eab946aec149171c72321b4934aa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 15 Dec 2024 17:56:27 +0000 Subject: [PATCH 0670/1450] inetpeer: remove create argument of inet_getpeer() All callers of inet_getpeer() want to create an inetpeer. Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20241215175629.1248773-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/inetpeer.h | 7 +++---- net/ipv4/inetpeer.c | 11 ++--------- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index 6f51f81d6cb19..f475757daafba 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -96,8 +96,7 @@ static inline struct in6_addr *inetpeer_get_addr_v6(struct inetpeer_addr *iaddr) /* can be called with or without local BH being disabled */ struct inet_peer *inet_getpeer(struct inet_peer_base *base, - const struct inetpeer_addr *daddr, - int create); + const struct inetpeer_addr *daddr); static inline struct inet_peer *inet_getpeer_v4(struct inet_peer_base *base, __be32 v4daddr, @@ -108,7 +107,7 @@ static inline struct inet_peer *inet_getpeer_v4(struct inet_peer_base *base, daddr.a4.addr = v4daddr; daddr.a4.vif = vif; daddr.family = AF_INET; - return inet_getpeer(base, &daddr, 1); + return inet_getpeer(base, &daddr); } static inline struct inet_peer *inet_getpeer_v6(struct inet_peer_base *base, @@ -118,7 +117,7 @@ static inline struct inet_peer *inet_getpeer_v6(struct inet_peer_base *base, daddr.a6 = *v6daddr; daddr.family = AF_INET6; - return inet_getpeer(base, &daddr, 1); + return inet_getpeer(base, &daddr); } static inline int inetpeer_addr_cmp(const struct inetpeer_addr *a, diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 5ab56f4cb5297..bc79cc9d13ebb 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -169,13 +169,11 @@ static void inet_peer_gc(struct inet_peer_base *base, } struct inet_peer *inet_getpeer(struct inet_peer_base *base, - const struct inetpeer_addr *daddr, - int create) + const struct inetpeer_addr *daddr) { struct inet_peer *p, *gc_stack[PEER_MAX_GC]; struct rb_node **pp, *parent; unsigned int gc_cnt, seq; - int invalidated; /* Attempt a lockless lookup first. * Because of a concurrent writer, we might not find an existing entry. @@ -183,16 +181,11 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base, rcu_read_lock(); seq = read_seqbegin(&base->lock); p = lookup(daddr, base, seq, NULL, &gc_cnt, &parent, &pp); - invalidated = read_seqretry(&base->lock, seq); rcu_read_unlock(); if (p) return p; - /* If no writer did a change during our lookup, we can return early. */ - if (!create && !invalidated) - return NULL; - /* retry an exact lookup, taking the lock before. * At least, nodes should be hot in our cache. */ @@ -201,7 +194,7 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base, gc_cnt = 0; p = lookup(daddr, base, seq, gc_stack, &gc_cnt, &parent, &pp); - if (!p && create) { + if (!p) { p = kmem_cache_alloc(peer_cachep, GFP_ATOMIC); if (p) { p->daddr = *daddr; From 50b362f21d6c10b0f7939c1482c6a1b43da82f1a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 15 Dec 2024 17:56:28 +0000 Subject: [PATCH 0671/1450] inetpeer: update inetpeer timestamp in inet_getpeer() inet_putpeer() will be removed in the following patch, because we will no longer use refcounts. Update inetpeer timestamp (p->dtime) at lookup time. Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20241215175629.1248773-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/inetpeer.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index bc79cc9d13ebb..28c3ae5bc4a0b 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -95,6 +95,7 @@ static struct inet_peer *lookup(const struct inetpeer_addr *daddr, { struct rb_node **pp, *parent, *next; struct inet_peer *p; + u32 now; pp = &base->rb_root.rb_node; parent = NULL; @@ -110,6 +111,9 @@ static struct inet_peer *lookup(const struct inetpeer_addr *daddr, if (cmp == 0) { if (!refcount_inc_not_zero(&p->refcnt)) break; + now = jiffies; + if (READ_ONCE(p->dtime) != now) + WRITE_ONCE(p->dtime, now); return p; } if (gc_stack) { @@ -150,9 +154,6 @@ static void inet_peer_gc(struct inet_peer_base *base, for (i = 0; i < gc_cnt; i++) { p = gc_stack[i]; - /* The READ_ONCE() pairs with the WRITE_ONCE() - * in inet_putpeer() - */ delta = (__u32)jiffies - READ_ONCE(p->dtime); if (delta < ttl || !refcount_dec_if_one(&p->refcnt)) @@ -224,11 +225,6 @@ EXPORT_SYMBOL_GPL(inet_getpeer); void inet_putpeer(struct inet_peer *p) { - /* The WRITE_ONCE() pairs with itself (we run lockless) - * and the READ_ONCE() in inet_peer_gc() - */ - WRITE_ONCE(p->dtime, (__u32)jiffies); - if (refcount_dec_and_test(&p->refcnt)) kfree_rcu(p, rcu); } From a853c609504e2d1d83e71285e3622fda1f1451d8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 15 Dec 2024 17:56:29 +0000 Subject: [PATCH 0672/1450] inetpeer: do not get a refcount in inet_getpeer() All inet_getpeer() callers except ip4_frag_init() don't need to acquire a permanent refcount on the inetpeer. They can switch to full RCU protection. Move the refcount_inc_not_zero() into ip4_frag_init(), so that all the other callers no longer have to perform a pair of expensive atomic operations on a possibly contended cache line. inet_putpeer() no longer needs to be exported. After this patch, my DUT can receive 8,400,000 UDP packets per second targeting closed ports, using 50% less cpu cycles than before. Also change two calls to l3mdev_master_ifindex() by l3mdev_master_ifindex_rcu() (Ido ideas) Fixes: 8c2bd38b95f7 ("icmp: change the order of rate limits") Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20241215175629.1248773-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/icmp.c | 9 ++++----- net/ipv4/inetpeer.c | 8 ++------ net/ipv4/ip_fragment.c | 15 ++++++++++----- net/ipv4/route.c | 15 ++++++++------- net/ipv6/icmp.c | 4 ++-- net/ipv6/ip6_output.c | 4 ++-- net/ipv6/ndisc.c | 6 ++++-- 7 files changed, 32 insertions(+), 29 deletions(-) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 5eeb9f569a706..094084b61bff8 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -312,7 +312,6 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, struct dst_entry *dst = &rt->dst; struct inet_peer *peer; bool rc = true; - int vif; if (!apply_ratelimit) return true; @@ -321,12 +320,12 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) goto out; - vif = l3mdev_master_ifindex(dst->dev); - peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif); + rcu_read_lock(); + peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, + l3mdev_master_ifindex_rcu(dst->dev)); rc = inet_peer_xrlim_allow(peer, READ_ONCE(net->ipv4.sysctl_icmp_ratelimit)); - if (peer) - inet_putpeer(peer); + rcu_read_unlock(); out: if (!rc) __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST); diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 28c3ae5bc4a0b..e02484f4d22b8 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -109,8 +109,6 @@ static struct inet_peer *lookup(const struct inetpeer_addr *daddr, p = rb_entry(parent, struct inet_peer, rb_node); cmp = inetpeer_addr_cmp(daddr, &p->daddr); if (cmp == 0) { - if (!refcount_inc_not_zero(&p->refcnt)) - break; now = jiffies; if (READ_ONCE(p->dtime) != now) WRITE_ONCE(p->dtime, now); @@ -169,6 +167,7 @@ static void inet_peer_gc(struct inet_peer_base *base, } } +/* Must be called under RCU : No refcount change is done here. */ struct inet_peer *inet_getpeer(struct inet_peer_base *base, const struct inetpeer_addr *daddr) { @@ -179,10 +178,8 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base, /* Attempt a lockless lookup first. * Because of a concurrent writer, we might not find an existing entry. */ - rcu_read_lock(); seq = read_seqbegin(&base->lock); p = lookup(daddr, base, seq, NULL, &gc_cnt, &parent, &pp); - rcu_read_unlock(); if (p) return p; @@ -200,7 +197,7 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base, if (p) { p->daddr = *daddr; p->dtime = (__u32)jiffies; - refcount_set(&p->refcnt, 2); + refcount_set(&p->refcnt, 1); atomic_set(&p->rid, 0); p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; p->rate_tokens = 0; @@ -228,7 +225,6 @@ void inet_putpeer(struct inet_peer *p) if (refcount_dec_and_test(&p->refcnt)) kfree_rcu(p, rcu); } -EXPORT_SYMBOL_GPL(inet_putpeer); /* * Check transmit rate limitation for given message. diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 46e1171299f22..7a435746a22de 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -82,15 +82,20 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, static void ip4_frag_init(struct inet_frag_queue *q, const void *a) { struct ipq *qp = container_of(q, struct ipq, q); - struct net *net = q->fqdir->net; - const struct frag_v4_compare_key *key = a; + struct net *net = q->fqdir->net; + struct inet_peer *p = NULL; q->key.v4 = *key; qp->ecn = 0; - qp->peer = q->fqdir->max_dist ? - inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif) : - NULL; + if (q->fqdir->max_dist) { + rcu_read_lock(); + p = inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif); + if (p && !refcount_inc_not_zero(&p->refcnt)) + p = NULL; + rcu_read_unlock(); + } + qp->peer = p; } static void ip4_frag_free(struct inet_frag_queue *q) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 297a9939c6e74..9f9d4e6ea1b92 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -870,11 +870,11 @@ void ip_rt_send_redirect(struct sk_buff *skb) } log_martians = IN_DEV_LOG_MARTIANS(in_dev); vif = l3mdev_master_ifindex_rcu(rt->dst.dev); - rcu_read_unlock(); net = dev_net(rt->dst.dev); peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif); if (!peer) { + rcu_read_unlock(); icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt_nexthop(rt, ip_hdr(skb)->daddr)); return; @@ -893,7 +893,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) */ if (peer->n_redirects >= ip_rt_redirect_number) { peer->rate_last = jiffies; - goto out_put_peer; + goto out_unlock; } /* Check for load limit; set rate_last to the latest sent @@ -914,8 +914,8 @@ void ip_rt_send_redirect(struct sk_buff *skb) &ip_hdr(skb)->saddr, inet_iif(skb), &ip_hdr(skb)->daddr, &gw); } -out_put_peer: - inet_putpeer(peer); +out_unlock: + rcu_read_unlock(); } static int ip_error(struct sk_buff *skb) @@ -975,9 +975,9 @@ static int ip_error(struct sk_buff *skb) break; } + rcu_read_lock(); peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, - l3mdev_master_ifindex(skb->dev)); - + l3mdev_master_ifindex_rcu(skb->dev)); send = true; if (peer) { now = jiffies; @@ -989,8 +989,9 @@ static int ip_error(struct sk_buff *skb) peer->rate_tokens -= ip_rt_error_cost; else send = false; - inet_putpeer(peer); } + rcu_read_unlock(); + if (send) icmp_send(skb, ICMP_DEST_UNREACH, code, 0); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 4593e3992c67b..a6984a29fdb9d 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -222,10 +222,10 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, if (rt->rt6i_dst.plen < 128) tmo >>= ((128 - rt->rt6i_dst.plen)>>5); + rcu_read_lock(); peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr); res = inet_peer_xrlim_allow(peer, tmo); - if (peer) - inet_putpeer(peer); + rcu_read_unlock(); } if (!res) __ICMP6_INC_STATS(net, ip6_dst_idev(dst), diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index cdcbb3b6c5da3..d577bf2f30538 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -613,6 +613,7 @@ int ip6_forward(struct sk_buff *skb) else target = &hdr->daddr; + rcu_read_lock(); peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr); /* Limit redirects both by destination (here) @@ -620,8 +621,7 @@ int ip6_forward(struct sk_buff *skb) */ if (inet_peer_xrlim_allow(peer, 1*HZ)) ndisc_send_redirect(skb, target); - if (peer) - inet_putpeer(peer); + rcu_read_unlock(); } else { int addrtype = ipv6_addr_type(&hdr->saddr); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index f113554d13325..d044c67019de6 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1731,10 +1731,12 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) "Redirect: destination is not a neighbour\n"); goto release; } + + rcu_read_lock(); peer = inet_getpeer_v6(net->ipv6.peers, &ipv6_hdr(skb)->saddr); ret = inet_peer_xrlim_allow(peer, 1*HZ); - if (peer) - inet_putpeer(peer); + rcu_read_unlock(); + if (!ret) goto release; From 1ba06ca96ca255c079ce5ea6a75cc0bfd5e97921 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 16 Dec 2024 14:18:44 +0100 Subject: [PATCH 0673/1450] mlxsw: Switch to napi_gro_receive() Benefit from the recent conversion of the driver to NAPI and enable GRO support through the use of napi_gro_receive(). Pass the NAPI pointer from the bus driver (mlxsw_pci) to the switch driver (mlxsw_spectrum) through the skb control block where various packet metadata is already encoded. The main motivation is to improve forwarding performance through the use of GRO fraglist [1]. In my testing, when the forwarding data path is simple (routing between two ports) there is not much difference in forwarding performance between GRO disabled and GRO enabled with fraglist. The improvement becomes more noticeable as the data path becomes more complex since it is traversed less times with GRO enabled. For example, with 10 ingress and 10 egress flower filters with different priorities on the two ports between which routing is performed, there is an improvement of about 140% in forwarded bandwidth. [1] https://lore.kernel.org/netdev/20200125102645.4782-1-steffen.klassert@secunet.com/ Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Reviewed-by: Amit Cohen Signed-off-by: Petr Machata Reviewed-by: Alexander Lobakin Link: https://patch.msgid.link/21258fe55f608ccf1ee2783a5a4534220af28903.1734354812.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlxsw/core.h | 1 + drivers/net/ethernet/mellanox/mlxsw/pci.c | 4 +++- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 2 +- drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c | 2 +- 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h index 6d11225594dd1..24c3ff6fcf71b 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.h +++ b/drivers/net/ethernet/mellanox/mlxsw/core.h @@ -73,6 +73,7 @@ struct mlxsw_tx_info { }; struct mlxsw_rx_md_info { + struct napi_struct *napi; u32 cookie_index; u32 latency; u32 tx_congestion; diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c index d6f37456fb317..0863dca2fc0be 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/pci.c +++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c @@ -737,6 +737,7 @@ static void mlxsw_pci_cqe_rdq_md_init(struct sk_buff *skb, const char *cqe) } static void mlxsw_pci_cqe_rdq_handle(struct mlxsw_pci *mlxsw_pci, + struct napi_struct *napi, struct mlxsw_pci_queue *q, u16 consumer_counter_limit, enum mlxsw_pci_cqe_v cqe_v, char *cqe) @@ -807,6 +808,7 @@ static void mlxsw_pci_cqe_rdq_handle(struct mlxsw_pci *mlxsw_pci, } mlxsw_pci_skb_cb_ts_set(mlxsw_pci, skb, cqe_v, cqe); + mlxsw_skb_cb(skb)->rx_md_info.napi = napi; mlxsw_core_skb_receive(mlxsw_pci->core, skb, &rx_info); @@ -869,7 +871,7 @@ static int mlxsw_pci_napi_poll_cq_rx(struct napi_struct *napi, int budget) continue; } - mlxsw_pci_cqe_rdq_handle(mlxsw_pci, rdq, + mlxsw_pci_cqe_rdq_handle(mlxsw_pci, napi, rdq, wqe_counter, q->u.cq.v, cqe); if (++work_done == budget) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 3f5e5d99251b7..aa71993daf282 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -2449,7 +2449,7 @@ void mlxsw_sp_rx_listener_no_mark_func(struct sk_buff *skb, u64_stats_update_end(&pcpu_stats->syncp); skb->protocol = eth_type_trans(skb, skb->dev); - netif_receive_skb(skb); + napi_gro_receive(mlxsw_skb_cb(skb)->rx_md_info.napi, skb); } static void mlxsw_sp_rx_listener_mark_func(struct sk_buff *skb, u16 local_port, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c index 899c954e0e5f8..1f9c1c86839f1 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c @@ -173,7 +173,7 @@ static void mlxsw_sp_rx_no_mark_listener(struct sk_buff *skb, u16 local_port, if (err) return; - netif_receive_skb(skb); + napi_gro_receive(mlxsw_skb_cb(skb)->rx_md_info.napi, skb); } static void mlxsw_sp_rx_mark_listener(struct sk_buff *skb, u16 local_port, From 33d06d1d28124b042178894584b727fdf83660b1 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Dec 2024 15:51:22 +0000 Subject: [PATCH 0674/1450] niu: Use page->private instead of page->index We are close to removing page->index. Use page->private instead, which is least likely to be removed. Signed-off-by: Matthew Wilcox (Oracle) Link: https://patch.msgid.link/20241216155124.3114-1-willy@infradead.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/sun/niu.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c index df6d35d41b976..d7459866d24c1 100644 --- a/drivers/net/ethernet/sun/niu.c +++ b/drivers/net/ethernet/sun/niu.c @@ -3303,7 +3303,7 @@ static struct page *niu_find_rxpage(struct rx_ring_info *rp, u64 addr, addr &= PAGE_MASK; pp = &rp->rxhash[h]; for (; (p = *pp) != NULL; pp = &niu_next_page(p)) { - if (p->index == addr) { + if (p->private == addr) { *link = pp; goto found; } @@ -3318,7 +3318,7 @@ static void niu_hash_page(struct rx_ring_info *rp, struct page *page, u64 base) { unsigned int h = niu_hash_rxaddr(rp, base); - page->index = base; + page->private = base; niu_next_page(page) = rp->rxhash[h]; rp->rxhash[h] = page; } @@ -3400,11 +3400,11 @@ static int niu_rx_pkt_ignore(struct niu *np, struct rx_ring_info *rp) rcr_size = rp->rbr_sizes[(val & RCR_ENTRY_PKTBUFSZ) >> RCR_ENTRY_PKTBUFSZ_SHIFT]; - if ((page->index + PAGE_SIZE) - rcr_size == addr) { + if ((page->private + PAGE_SIZE) - rcr_size == addr) { *link = niu_next_page(page); - np->ops->unmap_page(np->device, page->index, + np->ops->unmap_page(np->device, page->private, PAGE_SIZE, DMA_FROM_DEVICE); - page->index = 0; + page->private = 0; niu_next_page(page) = NULL; __free_page(page); rp->rbr_refill_pending++; @@ -3469,11 +3469,11 @@ static int niu_process_rx_pkt(struct napi_struct *napi, struct niu *np, append_size = append_size - skb->len; niu_rx_skb_append(skb, page, off, append_size, rcr_size); - if ((page->index + rp->rbr_block_size) - rcr_size == addr) { + if ((page->private + rp->rbr_block_size) - rcr_size == addr) { *link = niu_next_page(page); - np->ops->unmap_page(np->device, page->index, + np->ops->unmap_page(np->device, page->private, PAGE_SIZE, DMA_FROM_DEVICE); - page->index = 0; + page->private = 0; niu_next_page(page) = NULL; rp->rbr_refill_pending++; } else @@ -3538,11 +3538,11 @@ static void niu_rbr_free(struct niu *np, struct rx_ring_info *rp) page = rp->rxhash[i]; while (page) { struct page *next = niu_next_page(page); - u64 base = page->index; + u64 base = page->private; np->ops->unmap_page(np->device, base, PAGE_SIZE, DMA_FROM_DEVICE); - page->index = 0; + page->private = 0; niu_next_page(page) = NULL; __free_page(page); @@ -6460,7 +6460,7 @@ static void niu_reset_buffers(struct niu *np) page = rp->rxhash[j]; while (page) { struct page *next = niu_next_page(page); - u64 base = page->index; + u64 base = page->private; base = base >> RBR_DESCR_ADDR_SHIFT; rp->rbr[k++] = cpu_to_le32(base); page = next; From 30c63abaee9024ed7524325b3eeb7f2d26727c31 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 16 Dec 2024 13:09:36 +0100 Subject: [PATCH 0675/1450] net: usb: lan78xx: Add error handling to lan78xx_get_regs Update `lan78xx_get_regs` to handle errors during register and PHY reads. Log warnings for failed reads and exit the function early if an error occurs. Drop all previously logged registers to signal inconsistent readings to the user space. This ensures that invalid data is not returned to users. Signed-off-by: Oleksij Rempel Link: https://patch.msgid.link/20241216120941.1690908-2-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/usb/lan78xx.c | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 4661d131b1904..270345fcad651 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -2108,20 +2108,44 @@ static void lan78xx_get_regs(struct net_device *netdev, struct ethtool_regs *regs, void *buf) { - u32 *data = buf; - int i, j; struct lan78xx_net *dev = netdev_priv(netdev); + unsigned int data_count = 0; + u32 *data = buf; + int i, j, ret; /* Read Device/MAC registers */ - for (i = 0; i < ARRAY_SIZE(lan78xx_regs); i++) - lan78xx_read_reg(dev, lan78xx_regs[i], &data[i]); + for (i = 0; i < ARRAY_SIZE(lan78xx_regs); i++) { + ret = lan78xx_read_reg(dev, lan78xx_regs[i], &data[i]); + if (ret < 0) { + netdev_warn(dev->net, + "failed to read register 0x%08x\n", + lan78xx_regs[i]); + goto clean_data; + } + + data_count++; + } if (!netdev->phydev) return; /* Read PHY registers */ - for (j = 0; j < 32; i++, j++) - data[i] = phy_read(netdev->phydev, j); + for (j = 0; j < 32; i++, j++) { + ret = phy_read(netdev->phydev, j); + if (ret < 0) { + netdev_warn(dev->net, + "failed to read PHY register 0x%02x\n", j); + goto clean_data; + } + + data[i] = ret; + data_count++; + } + + return; + +clean_data: + memset(data, 0, data_count * sizeof(u32)); } static const struct ethtool_ops lan78xx_ethtool_ops = { From 18bdefe62439c75227021ddbbf6510aa2f2f4e54 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 16 Dec 2024 13:09:37 +0100 Subject: [PATCH 0676/1450] net: usb: lan78xx: Use ETIMEDOUT instead of ETIME in lan78xx_stop_hw Update lan78xx_stop_hw to return -ETIMEDOUT instead of -ETIME when a timeout occurs. While -ETIME indicates a general timer expiration, -ETIMEDOUT is more commonly used for signaling operation timeouts and provides better consistency with standard error handling in the driver. The -ETIME checks in tx_complete() and rx_complete() are unrelated to this error handling change. In these functions, the error values are derived from urb->status, which reflects USB transfer errors. The error value from lan78xx_stop_hw will be exposed in the following cases: - usb_driver::suspend - net_device_ops::ndo_stop (potentially, though currently the return value is not used). Signed-off-by: Oleksij Rempel Reviewed-by: Mateusz Polchlopek Link: https://patch.msgid.link/20241216120941.1690908-3-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/usb/lan78xx.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 270345fcad651..4674051f5c9cc 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -844,9 +844,7 @@ static int lan78xx_stop_hw(struct lan78xx_net *dev, u32 reg, u32 hw_enabled, } while (!stopped && !time_after(jiffies, timeout)); } - ret = stopped ? 0 : -ETIME; - - return ret; + return stopped ? 0 : -ETIMEDOUT; } static int lan78xx_flush_fifo(struct lan78xx_net *dev, u32 reg, u32 fifo_flush) From 7433d022b915977a0e361a036aa06a0d382a9630 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 16 Dec 2024 13:09:38 +0100 Subject: [PATCH 0677/1450] net: usb: lan78xx: Use action-specific label in lan78xx_mac_reset Rename the generic `done` label to the action-specific `exit_unlock` label in `lan78xx_mac_reset`. This improves clarity by indicating the specific cleanup action (mutex unlock) and aligns with best practices for error handling and cleanup labels. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Reviewed-by: Mateusz Polchlopek Link: https://patch.msgid.link/20241216120941.1690908-4-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/usb/lan78xx.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 4674051f5c9cc..30301af29ab2b 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -1604,16 +1604,16 @@ static int lan78xx_mac_reset(struct lan78xx_net *dev) */ ret = lan78xx_mdiobus_wait_not_busy(dev); if (ret < 0) - goto done; + goto exit_unlock; ret = lan78xx_read_reg(dev, MAC_CR, &val); if (ret < 0) - goto done; + goto exit_unlock; val |= MAC_CR_RST_; ret = lan78xx_write_reg(dev, MAC_CR, val); if (ret < 0) - goto done; + goto exit_unlock; /* Wait for the reset to complete before allowing any further * MAC register accesses otherwise the MAC may lock up. @@ -1621,16 +1621,16 @@ static int lan78xx_mac_reset(struct lan78xx_net *dev) do { ret = lan78xx_read_reg(dev, MAC_CR, &val); if (ret < 0) - goto done; + goto exit_unlock; if (!(val & MAC_CR_RST_)) { ret = 0; - goto done; + goto exit_unlock; } } while (!time_after(jiffies, start_time + HZ)); ret = -ETIMEDOUT; -done: +exit_unlock: mutex_unlock(&dev->phy_mutex); return ret; From 3a59437ed9072fa812e3e30bf0637ca94a239652 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 16 Dec 2024 13:09:39 +0100 Subject: [PATCH 0678/1450] net: usb: lan78xx: rename phy_mutex to mdiobus_mutex Rename `phy_mutex` to `mdiobus_mutex` for clarity, as the mutex protects MDIO bus access rather than PHY-specific operations. Update all references to ensure consistency. Signed-off-by: Oleksij Rempel Link: https://patch.msgid.link/20241216120941.1690908-5-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/usb/lan78xx.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 30301af29ab2b..78c75599b8f17 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -439,7 +439,7 @@ struct lan78xx_net { struct usb_anchor deferred; struct mutex dev_mutex; /* serialise open/stop wrt suspend/resume */ - struct mutex phy_mutex; /* for phy access */ + struct mutex mdiobus_mutex; /* for MDIO bus access */ unsigned int pipe_in, pipe_out, pipe_intr; unsigned int bulk_in_delay; @@ -952,7 +952,7 @@ static int lan78xx_flush_rx_fifo(struct lan78xx_net *dev) return lan78xx_flush_fifo(dev, FCT_RX_CTL, FCT_RX_CTL_RST_); } -/* Loop until the read is completed with timeout called with phy_mutex held */ +/* Loop until the read is completed with timeout called with mdiobus_mutex held */ static int lan78xx_mdiobus_wait_not_busy(struct lan78xx_net *dev) { unsigned long start_time = jiffies; @@ -1596,7 +1596,7 @@ static int lan78xx_mac_reset(struct lan78xx_net *dev) u32 val; int ret; - mutex_lock(&dev->phy_mutex); + mutex_lock(&dev->mdiobus_mutex); /* Resetting the device while there is activity on the MDIO * bus can result in the MAC interface locking up and not @@ -1631,7 +1631,7 @@ static int lan78xx_mac_reset(struct lan78xx_net *dev) ret = -ETIMEDOUT; exit_unlock: - mutex_unlock(&dev->phy_mutex); + mutex_unlock(&dev->mdiobus_mutex); return ret; } @@ -2249,7 +2249,7 @@ static int lan78xx_mdiobus_read(struct mii_bus *bus, int phy_id, int idx) if (ret < 0) return ret; - mutex_lock(&dev->phy_mutex); + mutex_lock(&dev->mdiobus_mutex); /* confirm MII not busy */ ret = lan78xx_mdiobus_wait_not_busy(dev); @@ -2273,7 +2273,7 @@ static int lan78xx_mdiobus_read(struct mii_bus *bus, int phy_id, int idx) ret = (int)(val & 0xFFFF); done: - mutex_unlock(&dev->phy_mutex); + mutex_unlock(&dev->mdiobus_mutex); usb_autopm_put_interface(dev->intf); return ret; @@ -2290,7 +2290,7 @@ static int lan78xx_mdiobus_write(struct mii_bus *bus, int phy_id, int idx, if (ret < 0) return ret; - mutex_lock(&dev->phy_mutex); + mutex_lock(&dev->mdiobus_mutex); /* confirm MII not busy */ ret = lan78xx_mdiobus_wait_not_busy(dev); @@ -2313,7 +2313,7 @@ static int lan78xx_mdiobus_write(struct mii_bus *bus, int phy_id, int idx, goto done; done: - mutex_unlock(&dev->phy_mutex); + mutex_unlock(&dev->mdiobus_mutex); usb_autopm_put_interface(dev->intf); return ret; } @@ -4476,7 +4476,7 @@ static int lan78xx_probe(struct usb_interface *intf, skb_queue_head_init(&dev->rxq_done); skb_queue_head_init(&dev->txq_pend); skb_queue_head_init(&dev->rxq_overflow); - mutex_init(&dev->phy_mutex); + mutex_init(&dev->mdiobus_mutex); mutex_init(&dev->dev_mutex); ret = lan78xx_urb_config_init(dev); From d09de7ebd4abf26d9aee072b82a514c372d278b5 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 16 Dec 2024 13:09:40 +0100 Subject: [PATCH 0679/1450] net: usb: lan78xx: remove PHY register access from ethtool get_regs Remove PHY register handling from `lan78xx_get_regs` and `lan78xx_get_regs_len`. Since the controller can have different PHYs attached, the first 32 registers are not universally relevant or the most interesting. Simplify the implementation to focus on MAC and device registers. Signed-off-by: Oleksij Rempel Link: https://patch.msgid.link/20241216120941.1690908-6-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/usb/lan78xx.c | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 78c75599b8f17..6c9dab290f3f9 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -2096,10 +2096,7 @@ static int lan78xx_set_pause(struct net_device *net, static int lan78xx_get_regs_len(struct net_device *netdev) { - if (!netdev->phydev) - return (sizeof(lan78xx_regs)); - else - return (sizeof(lan78xx_regs) + PHY_REG_SIZE); + return sizeof(lan78xx_regs); } static void @@ -2109,7 +2106,7 @@ lan78xx_get_regs(struct net_device *netdev, struct ethtool_regs *regs, struct lan78xx_net *dev = netdev_priv(netdev); unsigned int data_count = 0; u32 *data = buf; - int i, j, ret; + int i, ret; /* Read Device/MAC registers */ for (i = 0; i < ARRAY_SIZE(lan78xx_regs); i++) { @@ -2124,22 +2121,6 @@ lan78xx_get_regs(struct net_device *netdev, struct ethtool_regs *regs, data_count++; } - if (!netdev->phydev) - return; - - /* Read PHY registers */ - for (j = 0; j < 32; i++, j++) { - ret = phy_read(netdev->phydev, j); - if (ret < 0) { - netdev_warn(dev->net, - "failed to read PHY register 0x%02x\n", j); - goto clean_data; - } - - data[i] = ret; - data_count++; - } - return; clean_data: From 01e2f4d55bda0e24548e1458e77975898683a2cd Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 16 Dec 2024 13:09:41 +0100 Subject: [PATCH 0680/1450] net: usb: lan78xx: Improve error handling in WoL operations Enhance error handling in Wake-on-LAN (WoL) operations: - Log a warning in `lan78xx_get_wol` if `lan78xx_read_reg` fails. - Check and handle errors from `device_set_wakeup_enable` and `phy_ethtool_set_wol` in `lan78xx_set_wol`. - Ensure proper cleanup with a unified error handling path. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241216120941.1690908-7-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/usb/lan78xx.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 6c9dab290f3f9..a91bf9c7e31d2 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -1857,6 +1857,7 @@ static void lan78xx_get_wol(struct net_device *netdev, ret = lan78xx_read_reg(dev, USB_CFG0, &buf); if (unlikely(ret < 0)) { + netdev_warn(dev->net, "failed to get WoL %pe", ERR_PTR(ret)); wol->supported = 0; wol->wolopts = 0; } else { @@ -1888,10 +1889,13 @@ static int lan78xx_set_wol(struct net_device *netdev, pdata->wol = wol->wolopts; - device_set_wakeup_enable(&dev->udev->dev, (bool)wol->wolopts); + ret = device_set_wakeup_enable(&dev->udev->dev, (bool)wol->wolopts); + if (ret < 0) + goto exit_pm_put; - phy_ethtool_set_wol(netdev->phydev, wol); + ret = phy_ethtool_set_wol(netdev->phydev, wol); +exit_pm_put: usb_autopm_put_interface(dev->intf); return ret; From 86331b510260bdb4b4b0dcac2eeb81a82eb161c3 Mon Sep 17 00:00:00 2001 From: Jijie Shao Date: Mon, 16 Dec 2024 12:05:26 +0800 Subject: [PATCH 0681/1450] net: hibmcge: Add debugfs supported in this module This patch initializes debugfs and creates root directory for each device. The tx_ring and rx_ring debugfs files are implemented together. Signed-off-by: Jijie Shao Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241216040532.1566229-2-shaojijie@huawei.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/hisilicon/hibmcge/Makefile | 3 +- .../ethernet/hisilicon/hibmcge/hbg_debugfs.c | 94 +++++++++++++++++++ .../ethernet/hisilicon/hibmcge/hbg_debugfs.h | 12 +++ .../net/ethernet/hisilicon/hibmcge/hbg_main.c | 29 +++++- 4 files changed, 135 insertions(+), 3 deletions(-) create mode 100644 drivers/net/ethernet/hisilicon/hibmcge/hbg_debugfs.c create mode 100644 drivers/net/ethernet/hisilicon/hibmcge/hbg_debugfs.h diff --git a/drivers/net/ethernet/hisilicon/hibmcge/Makefile b/drivers/net/ethernet/hisilicon/hibmcge/Makefile index ae58ac38c206e..1a0ec2fb8c24d 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/Makefile +++ b/drivers/net/ethernet/hisilicon/hibmcge/Makefile @@ -5,4 +5,5 @@ obj-$(CONFIG_HIBMCGE) += hibmcge.o -hibmcge-objs = hbg_main.o hbg_hw.o hbg_mdio.o hbg_irq.o hbg_txrx.o hbg_ethtool.o +hibmcge-objs = hbg_main.o hbg_hw.o hbg_mdio.o hbg_irq.o hbg_txrx.o hbg_ethtool.o \ + hbg_debugfs.o diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_debugfs.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_debugfs.c new file mode 100644 index 0000000000000..773a6434b1143 --- /dev/null +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_debugfs.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0+ +// Copyright (c) 2024 Hisilicon Limited. + +#include +#include +#include +#include +#include +#include "hbg_common.h" +#include "hbg_debugfs.h" +#include "hbg_hw.h" +#include "hbg_irq.h" +#include "hbg_txrx.h" + +static struct dentry *hbg_dbgfs_root; + +struct hbg_dbg_info { + const char *name; + int (*read)(struct seq_file *seq, void *data); +}; + +static void hbg_dbg_ring(struct hbg_priv *priv, struct hbg_ring *ring, + struct seq_file *s) +{ + u32 irq_mask = ring->dir == HBG_DIR_TX ? HBG_INT_MSK_TX_B : + HBG_INT_MSK_RX_B; + + seq_printf(s, "ring used num: %u\n", + hbg_get_queue_used_num(ring)); + seq_printf(s, "ring max num: %u\n", ring->len); + seq_printf(s, "ring head: %u, tail: %u\n", ring->head, ring->tail); + seq_printf(s, "fifo used num: %u\n", + hbg_hw_get_fifo_used_num(priv, ring->dir)); + seq_printf(s, "fifo max num: %u\n", + hbg_get_spec_fifo_max_num(priv, ring->dir)); + seq_printf(s, "irq enabled: %s\n", + str_true_false(hbg_hw_irq_is_enabled(priv, irq_mask))); +} + +static int hbg_dbg_tx_ring(struct seq_file *s, void *unused) +{ + struct net_device *netdev = dev_get_drvdata(s->private); + struct hbg_priv *priv = netdev_priv(netdev); + + hbg_dbg_ring(priv, &priv->tx_ring, s); + return 0; +} + +static int hbg_dbg_rx_ring(struct seq_file *s, void *unused) +{ + struct net_device *netdev = dev_get_drvdata(s->private); + struct hbg_priv *priv = netdev_priv(netdev); + + hbg_dbg_ring(priv, &priv->rx_ring, s); + return 0; +} + +static const struct hbg_dbg_info hbg_dbg_infos[] = { + { "tx_ring", hbg_dbg_tx_ring }, + { "rx_ring", hbg_dbg_rx_ring }, +}; + +static void hbg_debugfs_uninit(void *data) +{ + debugfs_remove_recursive((struct dentry *)data); +} + +void hbg_debugfs_init(struct hbg_priv *priv) +{ + const char *name = pci_name(priv->pdev); + struct device *dev = &priv->pdev->dev; + struct dentry *root; + u32 i; + + root = debugfs_create_dir(name, hbg_dbgfs_root); + + for (i = 0; i < ARRAY_SIZE(hbg_dbg_infos); i++) + debugfs_create_devm_seqfile(dev, hbg_dbg_infos[i].name, + root, hbg_dbg_infos[i].read); + + /* Ignore the failure because debugfs is not a key feature. */ + devm_add_action_or_reset(dev, hbg_debugfs_uninit, root); +} + +void hbg_debugfs_register(void) +{ + hbg_dbgfs_root = debugfs_create_dir("hibmcge", NULL); +} + +void hbg_debugfs_unregister(void) +{ + debugfs_remove_recursive(hbg_dbgfs_root); + hbg_dbgfs_root = NULL; +} diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_debugfs.h b/drivers/net/ethernet/hisilicon/hibmcge/hbg_debugfs.h new file mode 100644 index 0000000000000..80670d66bbeb8 --- /dev/null +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_debugfs.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* Copyright (c) 2024 Hisilicon Limited. */ + +#ifndef __HBG_DEBUGFS_H +#define __HBG_DEBUGFS_H + +void hbg_debugfs_register(void); +void hbg_debugfs_unregister(void); + +void hbg_debugfs_init(struct hbg_priv *priv); + +#endif diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_main.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_main.c index 75505fb5cc4a5..7a03fdfa32a75 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_main.c +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_main.c @@ -11,6 +11,7 @@ #include "hbg_irq.h" #include "hbg_mdio.h" #include "hbg_txrx.h" +#include "hbg_debugfs.h" static void hbg_change_mtu(struct hbg_priv *priv, int new_mtu); @@ -160,7 +161,12 @@ static int hbg_init(struct hbg_priv *priv) if (ret) return ret; - return hbg_mdio_init(priv); + ret = hbg_mdio_init(priv); + if (ret) + return ret; + + hbg_debugfs_init(priv); + return 0; } static int hbg_pci_init(struct pci_dev *pdev) @@ -245,7 +251,26 @@ static struct pci_driver hbg_driver = { .id_table = hbg_pci_tbl, .probe = hbg_probe, }; -module_pci_driver(hbg_driver); + +static int __init hbg_module_init(void) +{ + int ret; + + hbg_debugfs_register(); + ret = pci_register_driver(&hbg_driver); + if (ret) + hbg_debugfs_unregister(); + + return ret; +} +module_init(hbg_module_init); + +static void __exit hbg_module_exit(void) +{ + pci_unregister_driver(&hbg_driver); + hbg_debugfs_unregister(); +} +module_exit(hbg_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Huawei Tech. Co., Ltd."); From df491c419bcb37af6e250d15a872218673141fb2 Mon Sep 17 00:00:00 2001 From: Jijie Shao Date: Mon, 16 Dec 2024 12:05:27 +0800 Subject: [PATCH 0682/1450] net: hibmcge: Add irq_info file to debugfs the driver requested three interrupts: "tx", "rx", "err". The err interrupt is a summary interrupt. We distinguish different errors based on the status register and mask. With "cat /proc/interrupts | grep hibmcge", we can't distinguish the detailed cause of the error, so we added this file to debugfs. the following effects are achieved: [root@localhost sjj]# cat /sys/kernel/debug/hibmcge/0000\:83\:00.1/irq_info RX : enabled: true , logged: false, count: 0 TX : enabled: true , logged: false, count: 0 MAC_MII_FIFO_ERR : enabled: false, logged: true , count: 0 MAC_PCS_RX_FIFO_ERR : enabled: false, logged: true , count: 0 MAC_PCS_TX_FIFO_ERR : enabled: false, logged: true , count: 0 MAC_APP_RX_FIFO_ERR : enabled: false, logged: true , count: 0 MAC_APP_TX_FIFO_ERR : enabled: false, logged: true , count: 0 SRAM_PARITY_ERR : enabled: true , logged: true , count: 0 TX_AHB_ERR : enabled: true , logged: true , count: 0 RX_BUF_AVL : enabled: true , logged: false, count: 0 REL_BUF_ERR : enabled: true , logged: true , count: 0 TXCFG_AVL : enabled: true , logged: false, count: 0 TX_DROP : enabled: true , logged: false, count: 0 RX_DROP : enabled: true , logged: false, count: 0 RX_AHB_ERR : enabled: true , logged: true , count: 0 MAC_FIFO_ERR : enabled: true , logged: false, count: 0 RBREQ_ERR : enabled: true , logged: false, count: 0 WE_ERR : enabled: true , logged: false, count: 0 The irq framework of hibmcge driver also includes tx/rx interrupts. Therefore, TX and RX are not moved separately form this file. Signed-off-by: Jijie Shao Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241216040532.1566229-3-shaojijie@huawei.com Signed-off-by: Jakub Kicinski --- .../ethernet/hisilicon/hibmcge/hbg_debugfs.c | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_debugfs.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_debugfs.c index 773a6434b1143..56d8599563c0b 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_debugfs.c @@ -55,9 +55,31 @@ static int hbg_dbg_rx_ring(struct seq_file *s, void *unused) return 0; } +static int hbg_dbg_irq_info(struct seq_file *s, void *unused) +{ + struct net_device *netdev = dev_get_drvdata(s->private); + struct hbg_priv *priv = netdev_priv(netdev); + struct hbg_irq_info *info; + u32 i; + + for (i = 0; i < priv->vectors.info_array_len; i++) { + info = &priv->vectors.info_array[i]; + seq_printf(s, + "%-20s: enabled: %-5s, logged: %-5s, count: %llu\n", + info->name, + str_true_false(hbg_hw_irq_is_enabled(priv, + info->mask)), + str_true_false(info->need_print), + info->count); + } + + return 0; +} + static const struct hbg_dbg_info hbg_dbg_infos[] = { { "tx_ring", hbg_dbg_tx_ring }, { "rx_ring", hbg_dbg_rx_ring }, + { "irq_info", hbg_dbg_irq_info }, }; static void hbg_debugfs_uninit(void *data) From 37b367d60d0f91260cd787ffbfba8e71e8f6fc7c Mon Sep 17 00:00:00 2001 From: Jijie Shao Date: Mon, 16 Dec 2024 12:05:28 +0800 Subject: [PATCH 0683/1450] net: hibmcge: Add unicast frame filter supported in this module MAC supports filtering unmatched unicast packets according to the MAC address table. This patch adds the support for unicast frame filtering. To support automatic restoration of MAC entries after reset, the driver saves a copy of MAC entries in the driver. Signed-off-by: Jijie Shao Reviewed-by: Hariprasad Kelam Link: https://patch.msgid.link/20241216040532.1566229-4-shaojijie@huawei.com Signed-off-by: Jakub Kicinski --- .../ethernet/hisilicon/hibmcge/hbg_common.h | 12 ++ .../ethernet/hisilicon/hibmcge/hbg_debugfs.c | 22 +++ .../net/ethernet/hisilicon/hibmcge/hbg_hw.c | 17 ++- .../net/ethernet/hisilicon/hibmcge/hbg_hw.h | 3 +- .../net/ethernet/hisilicon/hibmcge/hbg_main.c | 136 +++++++++++++++++- .../net/ethernet/hisilicon/hibmcge/hbg_reg.h | 3 + 6 files changed, 187 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_common.h b/drivers/net/ethernet/hisilicon/hibmcge/hbg_common.h index 96daf058d387c..9bb3abe883773 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_common.h +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_common.h @@ -84,6 +84,7 @@ struct hbg_dev_specs { u32 vlan_layers; u32 max_mtu; u32 min_mtu; + u32 uc_mac_num; u32 max_frame_len; u32 rx_buf_size; @@ -116,6 +117,16 @@ struct hbg_mac { u32 link_status; }; +struct hbg_mac_table_entry { + u8 addr[ETH_ALEN]; +}; + +struct hbg_mac_filter { + struct hbg_mac_table_entry *mac_table; + u32 table_max_len; + bool enabled; +}; + struct hbg_priv { struct net_device *netdev; struct pci_dev *pdev; @@ -126,6 +137,7 @@ struct hbg_priv { struct hbg_vector vectors; struct hbg_ring tx_ring; struct hbg_ring rx_ring; + struct hbg_mac_filter filter; }; #endif diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_debugfs.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_debugfs.c index 56d8599563c0b..616b86333eec1 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_debugfs.c @@ -76,10 +76,32 @@ static int hbg_dbg_irq_info(struct seq_file *s, void *unused) return 0; } +static int hbg_dbg_mac_table(struct seq_file *s, void *unused) +{ + struct net_device *netdev = dev_get_drvdata(s->private); + struct hbg_priv *priv = netdev_priv(netdev); + struct hbg_mac_filter *filter; + u32 i; + + filter = &priv->filter; + seq_printf(s, "mac addr max count: %u\n", filter->table_max_len); + seq_printf(s, "filter enabled: %s\n", str_true_false(filter->enabled)); + + for (i = 0; i < filter->table_max_len; i++) { + if (is_zero_ether_addr(filter->mac_table[i].addr)) + continue; + + seq_printf(s, "[%u] %pM\n", i, filter->mac_table[i].addr); + } + + return 0; +} + static const struct hbg_dbg_info hbg_dbg_infos[] = { { "tx_ring", hbg_dbg_tx_ring }, { "rx_ring", hbg_dbg_rx_ring }, { "irq_info", hbg_dbg_irq_info }, + { "mac_table", hbg_dbg_mac_table }, }; static void hbg_debugfs_uninit(void *data) diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c index 05295c2ad4392..29d66a0ea0a6c 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c @@ -67,6 +67,8 @@ static int hbg_hw_dev_specs_init(struct hbg_priv *priv) specs->vlan_layers = hbg_reg_read(priv, HBG_REG_VLAN_LAYERS_ADDR); specs->rx_fifo_num = hbg_reg_read(priv, HBG_REG_RX_FIFO_NUM_ADDR); specs->tx_fifo_num = hbg_reg_read(priv, HBG_REG_TX_FIFO_NUM_ADDR); + specs->uc_mac_num = hbg_reg_read(priv, HBG_REG_UC_MAC_NUM_ADDR); + mac_addr = hbg_reg_read64(priv, HBG_REG_MAC_ADDR_ADDR); u64_to_ether_addr(mac_addr, (u8 *)specs->mac_addr.sa_data); @@ -135,9 +137,13 @@ void hbg_hw_irq_enable(struct hbg_priv *priv, u32 mask, bool enable) hbg_reg_write(priv, HBG_REG_CF_INTRPT_MSK_ADDR, value); } -void hbg_hw_set_uc_addr(struct hbg_priv *priv, u64 mac_addr) +void hbg_hw_set_uc_addr(struct hbg_priv *priv, u64 mac_addr, u32 index) { - hbg_reg_write64(priv, HBG_REG_STATION_ADDR_LOW_2_ADDR, mac_addr); + u32 addr; + + /* mac addr is u64, so the addr offset is 0x8 */ + addr = HBG_REG_STATION_ADDR_LOW_2_ADDR + (index * 0x8); + hbg_reg_write64(priv, addr, mac_addr); } static void hbg_hw_set_pcu_max_frame_len(struct hbg_priv *priv, @@ -207,6 +213,13 @@ void hbg_hw_adjust_link(struct hbg_priv *priv, u32 speed, u32 duplex) HBG_REG_DUPLEX_B, duplex); } +/* only support uc filter */ +void hbg_hw_set_mac_filter_enable(struct hbg_priv *priv, u32 enable) +{ + hbg_reg_write_field(priv, HBG_REG_REC_FILT_CTRL_ADDR, + HBG_REG_REC_FILT_CTRL_UC_MATCH_EN_B, enable); +} + static void hbg_hw_init_transmit_ctrl(struct hbg_priv *priv) { u32 ctrl = 0; diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.h b/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.h index 14fb39241c93f..6eb4b7d2cba84 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.h +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.h @@ -51,9 +51,10 @@ bool hbg_hw_irq_is_enabled(struct hbg_priv *priv, u32 mask); void hbg_hw_irq_enable(struct hbg_priv *priv, u32 mask, bool enable); void hbg_hw_set_mtu(struct hbg_priv *priv, u16 mtu); void hbg_hw_mac_enable(struct hbg_priv *priv, u32 enable); -void hbg_hw_set_uc_addr(struct hbg_priv *priv, u64 mac_addr); +void hbg_hw_set_uc_addr(struct hbg_priv *priv, u64 mac_addr, u32 index); u32 hbg_hw_get_fifo_used_num(struct hbg_priv *priv, enum hbg_dir dir); void hbg_hw_set_tx_desc(struct hbg_priv *priv, struct hbg_tx_desc *tx_desc); void hbg_hw_fill_buffer(struct hbg_priv *priv, u32 buffer_dma_addr); +void hbg_hw_set_mac_filter_enable(struct hbg_priv *priv, u32 enable); #endif diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_main.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_main.c index 7a03fdfa32a75..578ba8ee409be 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_main.c +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_main.c @@ -57,7 +57,7 @@ static int hbg_hw_txrx_clear(struct hbg_priv *priv) /* After reset, regs need to be reconfigured */ hbg_hw_init(priv); - hbg_hw_set_uc_addr(priv, ether_addr_to_u64(priv->netdev->dev_addr)); + hbg_hw_set_uc_addr(priv, ether_addr_to_u64(priv->netdev->dev_addr), 0); hbg_change_mtu(priv, priv->netdev->mtu); return 0; @@ -75,19 +75,123 @@ static int hbg_net_stop(struct net_device *netdev) return hbg_hw_txrx_clear(priv); } +static void hbg_update_promisc_mode(struct net_device *netdev, bool overflow) +{ + struct hbg_priv *priv = netdev_priv(netdev); + + /* Only when not table_overflow, and netdev->flags not set IFF_PROMISC, + * The MAC filter will be enabled. + * Otherwise the filter will be disabled. + */ + priv->filter.enabled = !(overflow || (netdev->flags & IFF_PROMISC)); + hbg_hw_set_mac_filter_enable(priv, priv->filter.enabled); +} + +static void hbg_set_mac_to_mac_table(struct hbg_priv *priv, + u32 index, const u8 *addr) +{ + if (addr) { + ether_addr_copy(priv->filter.mac_table[index].addr, addr); + hbg_hw_set_uc_addr(priv, ether_addr_to_u64(addr), index); + } else { + eth_zero_addr(priv->filter.mac_table[index].addr); + hbg_hw_set_uc_addr(priv, 0, index); + } +} + +static int hbg_get_index_from_mac_table(struct hbg_priv *priv, + const u8 *addr, u32 *index) +{ + u32 i; + + for (i = 0; i < priv->filter.table_max_len; i++) + if (ether_addr_equal(priv->filter.mac_table[i].addr, addr)) { + *index = i; + return 0; + } + + return -EINVAL; +} + +static int hbg_add_mac_to_filter(struct hbg_priv *priv, const u8 *addr) +{ + u32 index; + + /* already exists */ + if (!hbg_get_index_from_mac_table(priv, addr, &index)) + return 0; + + for (index = 0; index < priv->filter.table_max_len; index++) + if (is_zero_ether_addr(priv->filter.mac_table[index].addr)) { + hbg_set_mac_to_mac_table(priv, index, addr); + return 0; + } + + return -ENOSPC; +} + +static void hbg_del_mac_from_filter(struct hbg_priv *priv, const u8 *addr) +{ + u32 index; + + /* not exists */ + if (hbg_get_index_from_mac_table(priv, addr, &index)) + return; + + hbg_set_mac_to_mac_table(priv, index, NULL); +} + +static int hbg_uc_sync(struct net_device *netdev, const unsigned char *addr) +{ + struct hbg_priv *priv = netdev_priv(netdev); + + return hbg_add_mac_to_filter(priv, addr); +} + +static int hbg_uc_unsync(struct net_device *netdev, const unsigned char *addr) +{ + struct hbg_priv *priv = netdev_priv(netdev); + + if (ether_addr_equal(netdev->dev_addr, (u8 *)addr)) + return 0; + + hbg_del_mac_from_filter(priv, addr); + return 0; +} + +static void hbg_net_set_rx_mode(struct net_device *netdev) +{ + int ret; + + ret = __dev_uc_sync(netdev, hbg_uc_sync, hbg_uc_unsync); + + /* If ret != 0, overflow has occurred */ + hbg_update_promisc_mode(netdev, !!ret); +} + static int hbg_net_set_mac_address(struct net_device *netdev, void *addr) { struct hbg_priv *priv = netdev_priv(netdev); u8 *mac_addr; + bool exists; + u32 index; mac_addr = ((struct sockaddr *)addr)->sa_data; if (!is_valid_ether_addr(mac_addr)) return -EADDRNOTAVAIL; - hbg_hw_set_uc_addr(priv, ether_addr_to_u64(mac_addr)); - dev_addr_set(netdev, mac_addr); + /* The index of host mac is always 0. + * If new mac address already exists, + * delete the existing mac address and + * add it to the position with index 0. + */ + exists = !hbg_get_index_from_mac_table(priv, mac_addr, &index); + hbg_set_mac_to_mac_table(priv, 0, mac_addr); + if (exists) + hbg_set_mac_to_mac_table(priv, index, NULL); + dev_addr_set(netdev, mac_addr); return 0; } @@ -143,8 +247,28 @@ static const struct net_device_ops hbg_netdev_ops = { .ndo_set_mac_address = hbg_net_set_mac_address, .ndo_change_mtu = hbg_net_change_mtu, .ndo_tx_timeout = hbg_net_tx_timeout, + .ndo_set_rx_mode = hbg_net_set_rx_mode, }; +static int hbg_mac_filter_init(struct hbg_priv *priv) +{ + struct hbg_dev_specs *dev_specs = &priv->dev_specs; + struct hbg_mac_filter *filter = &priv->filter; + struct hbg_mac_table_entry *tmp_table; + + tmp_table = devm_kcalloc(&priv->pdev->dev, dev_specs->uc_mac_num, + sizeof(*tmp_table), GFP_KERNEL); + if (!tmp_table) + return -ENOMEM; + + filter->mac_table = tmp_table; + filter->table_max_len = dev_specs->uc_mac_num; + filter->enabled = true; + + hbg_hw_set_mac_filter_enable(priv, filter->enabled); + return 0; +} + static int hbg_init(struct hbg_priv *priv) { int ret; @@ -165,6 +289,10 @@ static int hbg_init(struct hbg_priv *priv) if (ret) return ret; + ret = hbg_mac_filter_init(priv); + if (ret) + return ret; + hbg_debugfs_init(priv); return 0; } @@ -222,6 +350,8 @@ static int hbg_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (ret) return ret; + netdev->priv_flags |= IFF_UNICAST_FLT; + netdev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; netdev->max_mtu = priv->dev_specs.max_mtu; netdev->min_mtu = priv->dev_specs.min_mtu; diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_reg.h b/drivers/net/ethernet/hisilicon/hibmcge/hbg_reg.h index 57d81c6d76332..8993f57ecea45 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_reg.h +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_reg.h @@ -10,6 +10,7 @@ #define HBG_REG_MAC_ID_ADDR 0x0008 #define HBG_REG_PHY_ID_ADDR 0x000C #define HBG_REG_MAC_ADDR_ADDR 0x0010 +#define HBG_REG_UC_MAC_NUM_ADDR 0x0018 #define HBG_REG_MDIO_FREQ_ADDR 0x0024 #define HBG_REG_MAX_MTU_ADDR 0x0028 #define HBG_REG_MIN_MTU_ADDR 0x002C @@ -47,6 +48,8 @@ #define HBG_REG_TRANSMIT_CTRL_PAD_EN_B BIT(7) #define HBG_REG_TRANSMIT_CTRL_CRC_ADD_B BIT(6) #define HBG_REG_TRANSMIT_CTRL_AN_EN_B BIT(5) +#define HBG_REG_REC_FILT_CTRL_ADDR (HBG_REG_SGMII_BASE + 0x0064) +#define HBG_REG_REC_FILT_CTRL_UC_MATCH_EN_B BIT(0) #define HBG_REG_CF_CRC_STRIP_ADDR (HBG_REG_SGMII_BASE + 0x01B0) #define HBG_REG_CF_CRC_STRIP_B BIT(0) #define HBG_REG_MODE_CHANGE_EN_ADDR (HBG_REG_SGMII_BASE + 0x01B4) From 51574da8dce3c08f388893d727292364a1db8cc0 Mon Sep 17 00:00:00 2001 From: Jijie Shao Date: Mon, 16 Dec 2024 12:05:29 +0800 Subject: [PATCH 0684/1450] net: hibmcge: Add register dump supported in this module The dump register is an effective way to analyze problems. To ensure code flexibility, each register contains the type, offset, and value information. The ethtool does the pretty print based on these information. The driver can dynamically add or delete registers that need to be dumped in the future because information such as type and offset is contained. ethtool always can do pretty print. With the ethtool of a specific version, the following effects are achieved: [root@localhost sjj]# ./ethtool -d enp131s0f1 [SPEC] VALID [0x0000]: 0x00000001 [SPEC] EVENT_REQ [0x0004]: 0x00000000 [SPEC] MAC_ID [0x0008]: 0x00000002 [SPEC] PHY_ADDR [0x000c]: 0x00000002 [SPEC] MAC_ADDR_L [0x0010]: 0x00000808 [SPEC] MAC_ADDR_H [0x0014]: 0x08080802 [SPEC] UC_MAX_NUM [0x0018]: 0x00000004 [SPEC] MAX_MTU [0x0028]: 0x00000fc2 [SPEC] MIN_MTU [0x002c]: 0x00000100 [SPEC] TX_FIFO_NUM [0x0030]: 0x00000040 [SPEC] RX_FIFO_NUM [0x0034]: 0x0000007f [SPEC] VLAN_LAYERS [0x0038]: 0x00000002 [MDIO] COMMAND_REG [0x0000]: 0x0000185f [MDIO] ADDR_REG [0x0004]: 0x00000000 [MDIO] WDATA_REG [0x0008]: 0x0000a000 [MDIO] RDATA_REG [0x000c]: 0x00000000 [MDIO] STA_REG [0x0010]: 0x00000000 [GMAC] DUPLEX_TYPE [0x0008]: 0x00000001 [GMAC] FD_FC_TYPE [0x000c]: 0x00008808 [GMAC] FC_TX_TIMER [0x001c]: 0x000000ff [GMAC] FD_FC_ADDR_LOW [0x0020]: 0xc2000001 [GMAC] FD_FC_ADDR_HIGH [0x0024]: 0x00000180 [GMAC] MAX_FRM_SIZE [0x003c]: 0x000005f6 [GMAC] PORT_MODE [0x0040]: 0x00000002 [GMAC] PORT_EN [0x0044]: 0x00000006 ... Signed-off-by: Jijie Shao Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241216040532.1566229-5-shaojijie@huawei.com Signed-off-by: Jakub Kicinski --- .../ethernet/hisilicon/hibmcge/hbg_ethtool.c | 140 ++++++++++++++++++ .../net/ethernet/hisilicon/hibmcge/hbg_reg.h | 34 +++++ 2 files changed, 174 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_ethtool.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_ethtool.c index c3370114aef32..e7f169d2abb7e 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_ethtool.c @@ -3,12 +3,152 @@ #include #include +#include "hbg_common.h" #include "hbg_ethtool.h" +#include "hbg_hw.h" + +enum hbg_reg_dump_type { + HBG_DUMP_REG_TYPE_SPEC = 0, + HBG_DUMP_REG_TYPE_MDIO, + HBG_DUMP_REG_TYPE_GMAC, + HBG_DUMP_REG_TYPE_PCU, +}; + +struct hbg_reg_info { + u32 type; + u32 offset; + u32 val; +}; + +#define HBG_DUMP_SPEC_I(offset) {HBG_DUMP_REG_TYPE_SPEC, offset, 0} +#define HBG_DUMP_MDIO_I(offset) {HBG_DUMP_REG_TYPE_MDIO, offset, 0} +#define HBG_DUMP_GMAC_I(offset) {HBG_DUMP_REG_TYPE_GMAC, offset, 0} +#define HBG_DUMP_PCU_I(offset) {HBG_DUMP_REG_TYPE_PCU, offset, 0} + +static const struct hbg_reg_info hbg_dump_reg_infos[] = { + /* dev specs */ + HBG_DUMP_SPEC_I(HBG_REG_SPEC_VALID_ADDR), + HBG_DUMP_SPEC_I(HBG_REG_EVENT_REQ_ADDR), + HBG_DUMP_SPEC_I(HBG_REG_MAC_ID_ADDR), + HBG_DUMP_SPEC_I(HBG_REG_PHY_ID_ADDR), + HBG_DUMP_SPEC_I(HBG_REG_MAC_ADDR_ADDR), + HBG_DUMP_SPEC_I(HBG_REG_MAC_ADDR_HIGH_ADDR), + HBG_DUMP_SPEC_I(HBG_REG_UC_MAC_NUM_ADDR), + HBG_DUMP_SPEC_I(HBG_REG_MDIO_FREQ_ADDR), + HBG_DUMP_SPEC_I(HBG_REG_MAX_MTU_ADDR), + HBG_DUMP_SPEC_I(HBG_REG_MIN_MTU_ADDR), + HBG_DUMP_SPEC_I(HBG_REG_TX_FIFO_NUM_ADDR), + HBG_DUMP_SPEC_I(HBG_REG_RX_FIFO_NUM_ADDR), + HBG_DUMP_SPEC_I(HBG_REG_VLAN_LAYERS_ADDR), + + /* mdio */ + HBG_DUMP_MDIO_I(HBG_REG_MDIO_COMMAND_ADDR), + HBG_DUMP_MDIO_I(HBG_REG_MDIO_ADDR_ADDR), + HBG_DUMP_MDIO_I(HBG_REG_MDIO_WDATA_ADDR), + HBG_DUMP_MDIO_I(HBG_REG_MDIO_RDATA_ADDR), + HBG_DUMP_MDIO_I(HBG_REG_MDIO_STA_ADDR), + + /* gmac */ + HBG_DUMP_GMAC_I(HBG_REG_DUPLEX_TYPE_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_FD_FC_TYPE_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_FC_TX_TIMER_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_FD_FC_ADDR_LOW_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_FD_FC_ADDR_HIGH_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_MAX_FRAME_SIZE_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_PORT_MODE_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_PORT_ENABLE_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_PAUSE_ENABLE_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_AN_NEG_STATE_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_TRANSMIT_CTRL_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_REC_FILT_CTRL_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_LINE_LOOP_BACK_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_CF_CRC_STRIP_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_MODE_CHANGE_EN_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_LOOP_REG_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_RECV_CTRL_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_VLAN_CODE_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_STATION_ADDR_LOW_0_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_STATION_ADDR_HIGH_0_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_STATION_ADDR_LOW_1_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_STATION_ADDR_HIGH_1_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_STATION_ADDR_LOW_2_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_STATION_ADDR_HIGH_2_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_STATION_ADDR_LOW_3_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_STATION_ADDR_HIGH_3_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_STATION_ADDR_LOW_4_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_STATION_ADDR_HIGH_4_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_STATION_ADDR_LOW_5_ADDR), + HBG_DUMP_GMAC_I(HBG_REG_STATION_ADDR_HIGH_5_ADDR), + + /* pcu */ + HBG_DUMP_PCU_I(HBG_REG_TX_FIFO_THRSLD_ADDR), + HBG_DUMP_PCU_I(HBG_REG_RX_FIFO_THRSLD_ADDR), + HBG_DUMP_PCU_I(HBG_REG_CFG_FIFO_THRSLD_ADDR), + HBG_DUMP_PCU_I(HBG_REG_CF_INTRPT_MSK_ADDR), + HBG_DUMP_PCU_I(HBG_REG_CF_INTRPT_STAT_ADDR), + HBG_DUMP_PCU_I(HBG_REG_CF_INTRPT_CLR_ADDR), + HBG_DUMP_PCU_I(HBG_REG_TX_BUS_ERR_ADDR_ADDR), + HBG_DUMP_PCU_I(HBG_REG_RX_BUS_ERR_ADDR_ADDR), + HBG_DUMP_PCU_I(HBG_REG_MAX_FRAME_LEN_ADDR), + HBG_DUMP_PCU_I(HBG_REG_DEBUG_ST_MCH_ADDR), + HBG_DUMP_PCU_I(HBG_REG_FIFO_CURR_STATUS_ADDR), + HBG_DUMP_PCU_I(HBG_REG_FIFO_HIST_STATUS_ADDR), + HBG_DUMP_PCU_I(HBG_REG_CF_CFF_DATA_NUM_ADDR), + HBG_DUMP_PCU_I(HBG_REG_CF_TX_PAUSE_ADDR), + HBG_DUMP_PCU_I(HBG_REG_RX_CFF_ADDR_ADDR), + HBG_DUMP_PCU_I(HBG_REG_RX_BUF_SIZE_ADDR), + HBG_DUMP_PCU_I(HBG_REG_BUS_CTRL_ADDR), + HBG_DUMP_PCU_I(HBG_REG_RX_CTRL_ADDR), + HBG_DUMP_PCU_I(HBG_REG_RX_PKT_MODE_ADDR), + HBG_DUMP_PCU_I(HBG_REG_DBG_ST0_ADDR), + HBG_DUMP_PCU_I(HBG_REG_DBG_ST1_ADDR), + HBG_DUMP_PCU_I(HBG_REG_DBG_ST2_ADDR), + HBG_DUMP_PCU_I(HBG_REG_BUS_RST_EN_ADDR), + HBG_DUMP_PCU_I(HBG_REG_CF_IND_TXINT_MSK_ADDR), + HBG_DUMP_PCU_I(HBG_REG_CF_IND_TXINT_STAT_ADDR), + HBG_DUMP_PCU_I(HBG_REG_CF_IND_TXINT_CLR_ADDR), + HBG_DUMP_PCU_I(HBG_REG_CF_IND_RXINT_MSK_ADDR), + HBG_DUMP_PCU_I(HBG_REG_CF_IND_RXINT_STAT_ADDR), + HBG_DUMP_PCU_I(HBG_REG_CF_IND_RXINT_CLR_ADDR), +}; + +static const u32 hbg_dump_type_base_array[] = { + [HBG_DUMP_REG_TYPE_SPEC] = 0, + [HBG_DUMP_REG_TYPE_MDIO] = HBG_REG_MDIO_BASE, + [HBG_DUMP_REG_TYPE_GMAC] = HBG_REG_SGMII_BASE, + [HBG_DUMP_REG_TYPE_PCU] = HBG_REG_SGMII_BASE, +}; + +static int hbg_ethtool_get_regs_len(struct net_device *netdev) +{ + return ARRAY_SIZE(hbg_dump_reg_infos) * sizeof(struct hbg_reg_info); +} + +static void hbg_ethtool_get_regs(struct net_device *netdev, + struct ethtool_regs *regs, void *data) +{ + struct hbg_priv *priv = netdev_priv(netdev); + struct hbg_reg_info *info; + u32 i, offset = 0; + + regs->version = 0; + for (i = 0; i < ARRAY_SIZE(hbg_dump_reg_infos); i++) { + info = data + offset; + + *info = hbg_dump_reg_infos[i]; + info->val = hbg_reg_read(priv, info->offset); + info->offset -= hbg_dump_type_base_array[info->type]; + + offset += sizeof(*info); + } +} static const struct ethtool_ops hbg_ethtool_ops = { .get_link = ethtool_op_get_link, .get_link_ksettings = phy_ethtool_get_link_ksettings, .set_link_ksettings = phy_ethtool_set_link_ksettings, + .get_regs_len = hbg_ethtool_get_regs_len, + .get_regs = hbg_ethtool_get_regs, }; void hbg_ethtool_set_ops(struct net_device *netdev) diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_reg.h b/drivers/net/ethernet/hisilicon/hibmcge/hbg_reg.h index 8993f57ecea45..665666712c7cd 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_reg.h +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_reg.h @@ -10,6 +10,7 @@ #define HBG_REG_MAC_ID_ADDR 0x0008 #define HBG_REG_PHY_ID_ADDR 0x000C #define HBG_REG_MAC_ADDR_ADDR 0x0010 +#define HBG_REG_MAC_ADDR_HIGH_ADDR 0x0014 #define HBG_REG_UC_MAC_NUM_ADDR 0x0018 #define HBG_REG_MDIO_FREQ_ADDR 0x0024 #define HBG_REG_MAX_MTU_ADDR 0x0028 @@ -29,6 +30,7 @@ #define HBG_REG_MDIO_COMMAND_OP_M GENMASK(11, 10) #define HBG_REG_MDIO_COMMAND_PRTAD_M GENMASK(9, 5) #define HBG_REG_MDIO_COMMAND_DEVAD_M GENMASK(4, 0) +#define HBG_REG_MDIO_ADDR_ADDR (HBG_REG_MDIO_BASE + 0x0004) #define HBG_REG_MDIO_WDATA_ADDR (HBG_REG_MDIO_BASE + 0x0008) #define HBG_REG_MDIO_WDATA_M GENMASK(15, 0) #define HBG_REG_MDIO_RDATA_ADDR (HBG_REG_MDIO_BASE + 0x000C) @@ -37,6 +39,10 @@ /* GMAC */ #define HBG_REG_SGMII_BASE 0x10000 #define HBG_REG_DUPLEX_TYPE_ADDR (HBG_REG_SGMII_BASE + 0x0008) +#define HBG_REG_FD_FC_TYPE_ADDR (HBG_REG_SGMII_BASE + 0x000C) +#define HBG_REG_FC_TX_TIMER_ADDR (HBG_REG_SGMII_BASE + 0x001C) +#define HBG_REG_FD_FC_ADDR_LOW_ADDR (HBG_REG_SGMII_BASE + 0x0020) +#define HBG_REG_FD_FC_ADDR_HIGH_ADDR (HBG_REG_SGMII_BASE + 0x0024) #define HBG_REG_DUPLEX_B BIT(0) #define HBG_REG_MAX_FRAME_SIZE_ADDR (HBG_REG_SGMII_BASE + 0x003C) #define HBG_REG_PORT_MODE_ADDR (HBG_REG_SGMII_BASE + 0x0040) @@ -44,22 +50,40 @@ #define HBG_REG_PORT_ENABLE_ADDR (HBG_REG_SGMII_BASE + 0x0044) #define HBG_REG_PORT_ENABLE_RX_B BIT(1) #define HBG_REG_PORT_ENABLE_TX_B BIT(2) +#define HBG_REG_PAUSE_ENABLE_ADDR (HBG_REG_SGMII_BASE + 0x0048) +#define HBG_REG_AN_NEG_STATE_ADDR (HBG_REG_SGMII_BASE + 0x0058) #define HBG_REG_TRANSMIT_CTRL_ADDR (HBG_REG_SGMII_BASE + 0x0060) #define HBG_REG_TRANSMIT_CTRL_PAD_EN_B BIT(7) #define HBG_REG_TRANSMIT_CTRL_CRC_ADD_B BIT(6) #define HBG_REG_TRANSMIT_CTRL_AN_EN_B BIT(5) #define HBG_REG_REC_FILT_CTRL_ADDR (HBG_REG_SGMII_BASE + 0x0064) #define HBG_REG_REC_FILT_CTRL_UC_MATCH_EN_B BIT(0) +#define HBG_REG_LINE_LOOP_BACK_ADDR (HBG_REG_SGMII_BASE + 0x01A8) #define HBG_REG_CF_CRC_STRIP_ADDR (HBG_REG_SGMII_BASE + 0x01B0) #define HBG_REG_CF_CRC_STRIP_B BIT(0) #define HBG_REG_MODE_CHANGE_EN_ADDR (HBG_REG_SGMII_BASE + 0x01B4) #define HBG_REG_MODE_CHANGE_EN_B BIT(0) +#define HBG_REG_LOOP_REG_ADDR (HBG_REG_SGMII_BASE + 0x01DC) #define HBG_REG_RECV_CTRL_ADDR (HBG_REG_SGMII_BASE + 0x01E0) #define HBG_REG_RECV_CTRL_STRIP_PAD_EN_B BIT(3) +#define HBG_REG_VLAN_CODE_ADDR (HBG_REG_SGMII_BASE + 0x01E8) +#define HBG_REG_STATION_ADDR_LOW_0_ADDR (HBG_REG_SGMII_BASE + 0x0200) +#define HBG_REG_STATION_ADDR_HIGH_0_ADDR (HBG_REG_SGMII_BASE + 0x0204) +#define HBG_REG_STATION_ADDR_LOW_1_ADDR (HBG_REG_SGMII_BASE + 0x0208) +#define HBG_REG_STATION_ADDR_HIGH_1_ADDR (HBG_REG_SGMII_BASE + 0x020C) #define HBG_REG_STATION_ADDR_LOW_2_ADDR (HBG_REG_SGMII_BASE + 0x0210) #define HBG_REG_STATION_ADDR_HIGH_2_ADDR (HBG_REG_SGMII_BASE + 0x0214) +#define HBG_REG_STATION_ADDR_LOW_3_ADDR (HBG_REG_SGMII_BASE + 0x0218) +#define HBG_REG_STATION_ADDR_HIGH_3_ADDR (HBG_REG_SGMII_BASE + 0x021C) +#define HBG_REG_STATION_ADDR_LOW_4_ADDR (HBG_REG_SGMII_BASE + 0x0220) +#define HBG_REG_STATION_ADDR_HIGH_4_ADDR (HBG_REG_SGMII_BASE + 0x0224) +#define HBG_REG_STATION_ADDR_LOW_5_ADDR (HBG_REG_SGMII_BASE + 0x0228) +#define HBG_REG_STATION_ADDR_HIGH_5_ADDR (HBG_REG_SGMII_BASE + 0x022C) /* PCU */ +#define HBG_REG_TX_FIFO_THRSLD_ADDR (HBG_REG_SGMII_BASE + 0x0420) +#define HBG_REG_RX_FIFO_THRSLD_ADDR (HBG_REG_SGMII_BASE + 0x0424) +#define HBG_REG_CFG_FIFO_THRSLD_ADDR (HBG_REG_SGMII_BASE + 0x0428) #define HBG_REG_CF_INTRPT_MSK_ADDR (HBG_REG_SGMII_BASE + 0x042C) #define HBG_INT_MSK_WE_ERR_B BIT(31) #define HBG_INT_MSK_RBREQ_ERR_B BIT(30) @@ -81,11 +105,17 @@ #define HBG_INT_MSK_RX_B BIT(0) /* just used in driver */ #define HBG_REG_CF_INTRPT_STAT_ADDR (HBG_REG_SGMII_BASE + 0x0434) #define HBG_REG_CF_INTRPT_CLR_ADDR (HBG_REG_SGMII_BASE + 0x0438) +#define HBG_REG_TX_BUS_ERR_ADDR_ADDR (HBG_REG_SGMII_BASE + 0x043C) +#define HBG_REG_RX_BUS_ERR_ADDR_ADDR (HBG_REG_SGMII_BASE + 0x0440) #define HBG_REG_MAX_FRAME_LEN_ADDR (HBG_REG_SGMII_BASE + 0x0444) #define HBG_REG_MAX_FRAME_LEN_M GENMASK(15, 0) +#define HBG_REG_DEBUG_ST_MCH_ADDR (HBG_REG_SGMII_BASE + 0x0450) +#define HBG_REG_FIFO_CURR_STATUS_ADDR (HBG_REG_SGMII_BASE + 0x0454) +#define HBG_REG_FIFO_HIST_STATUS_ADDR (HBG_REG_SGMII_BASE + 0x0458) #define HBG_REG_CF_CFF_DATA_NUM_ADDR (HBG_REG_SGMII_BASE + 0x045C) #define HBG_REG_CF_CFF_DATA_NUM_ADDR_TX_M GENMASK(8, 0) #define HBG_REG_CF_CFF_DATA_NUM_ADDR_RX_M GENMASK(24, 16) +#define HBG_REG_CF_TX_PAUSE_ADDR (HBG_REG_SGMII_BASE + 0x0470) #define HBG_REG_TX_CFF_ADDR_0_ADDR (HBG_REG_SGMII_BASE + 0x0488) #define HBG_REG_TX_CFF_ADDR_1_ADDR (HBG_REG_SGMII_BASE + 0x048C) #define HBG_REG_TX_CFF_ADDR_2_ADDR (HBG_REG_SGMII_BASE + 0x0490) @@ -104,6 +134,10 @@ #define HBG_REG_RX_CTRL_RXBUF_1ST_SKIP_SIZE2_M GENMASK(3, 0) #define HBG_REG_RX_PKT_MODE_ADDR (HBG_REG_SGMII_BASE + 0x04F4) #define HBG_REG_RX_PKT_MODE_PARSE_MODE_M GENMASK(22, 21) +#define HBG_REG_DBG_ST0_ADDR (HBG_REG_SGMII_BASE + 0x05E4) +#define HBG_REG_DBG_ST1_ADDR (HBG_REG_SGMII_BASE + 0x05E8) +#define HBG_REG_DBG_ST2_ADDR (HBG_REG_SGMII_BASE + 0x05EC) +#define HBG_REG_BUS_RST_EN_ADDR (HBG_REG_SGMII_BASE + 0x0688) #define HBG_REG_CF_IND_TXINT_MSK_ADDR (HBG_REG_SGMII_BASE + 0x0694) #define HBG_REG_IND_INTR_MASK_B BIT(0) #define HBG_REG_CF_IND_TXINT_STAT_ADDR (HBG_REG_SGMII_BASE + 0x0698) From 3a03763f38769707a4dd0ca44474806fed3a7f81 Mon Sep 17 00:00:00 2001 From: Jijie Shao Date: Mon, 16 Dec 2024 12:05:30 +0800 Subject: [PATCH 0685/1450] net: hibmcge: Add pauseparam supported in this module The MAC can automatically send or respond to pause frames. This patch supports the function of enabling pause frames by using ethtool. Signed-off-by: Jijie Shao Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241216040532.1566229-6-shaojijie@huawei.com Signed-off-by: Jakub Kicinski --- .../ethernet/hisilicon/hibmcge/hbg_common.h | 1 + .../ethernet/hisilicon/hibmcge/hbg_ethtool.c | 25 +++++++++++++++++++ .../net/ethernet/hisilicon/hibmcge/hbg_hw.c | 21 ++++++++++++++++ .../net/ethernet/hisilicon/hibmcge/hbg_hw.h | 3 +++ .../net/ethernet/hisilicon/hibmcge/hbg_main.c | 1 + .../net/ethernet/hisilicon/hibmcge/hbg_mdio.c | 15 +++++++++++ .../net/ethernet/hisilicon/hibmcge/hbg_reg.h | 2 ++ 7 files changed, 68 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_common.h b/drivers/net/ethernet/hisilicon/hibmcge/hbg_common.h index 9bb3abe883773..cc143a5367130 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_common.h +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_common.h @@ -115,6 +115,7 @@ struct hbg_mac { u32 duplex; u32 autoneg; u32 link_status; + u32 pause_autoneg; }; struct hbg_mac_table_entry { diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_ethtool.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_ethtool.c index e7f169d2abb7e..a821a92db43db 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_ethtool.c @@ -143,12 +143,37 @@ static void hbg_ethtool_get_regs(struct net_device *netdev, } } +static void hbg_ethtool_get_pauseparam(struct net_device *net_dev, + struct ethtool_pauseparam *param) +{ + struct hbg_priv *priv = netdev_priv(net_dev); + + param->autoneg = priv->mac.pause_autoneg; + hbg_hw_get_pause_enable(priv, ¶m->tx_pause, ¶m->rx_pause); +} + +static int hbg_ethtool_set_pauseparam(struct net_device *net_dev, + struct ethtool_pauseparam *param) +{ + struct hbg_priv *priv = netdev_priv(net_dev); + + priv->mac.pause_autoneg = param->autoneg; + phy_set_asym_pause(priv->mac.phydev, param->rx_pause, param->tx_pause); + + if (!param->autoneg) + hbg_hw_set_pause_enable(priv, param->tx_pause, param->rx_pause); + + return 0; +} + static const struct ethtool_ops hbg_ethtool_ops = { .get_link = ethtool_op_get_link, .get_link_ksettings = phy_ethtool_get_link_ksettings, .set_link_ksettings = phy_ethtool_set_link_ksettings, .get_regs_len = hbg_ethtool_get_regs_len, .get_regs = hbg_ethtool_get_regs, + .get_pauseparam = hbg_ethtool_get_pauseparam, + .set_pauseparam = hbg_ethtool_set_pauseparam, }; void hbg_ethtool_set_ops(struct net_device *netdev) diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c index 29d66a0ea0a6c..0cbe9f7229b34 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c @@ -220,6 +220,27 @@ void hbg_hw_set_mac_filter_enable(struct hbg_priv *priv, u32 enable) HBG_REG_REC_FILT_CTRL_UC_MATCH_EN_B, enable); } +void hbg_hw_set_pause_enable(struct hbg_priv *priv, u32 tx_en, u32 rx_en) +{ + hbg_reg_write_field(priv, HBG_REG_PAUSE_ENABLE_ADDR, + HBG_REG_PAUSE_ENABLE_TX_B, tx_en); + hbg_reg_write_field(priv, HBG_REG_PAUSE_ENABLE_ADDR, + HBG_REG_PAUSE_ENABLE_RX_B, rx_en); +} + +void hbg_hw_get_pause_enable(struct hbg_priv *priv, u32 *tx_en, u32 *rx_en) +{ + *tx_en = hbg_reg_read_field(priv, HBG_REG_PAUSE_ENABLE_ADDR, + HBG_REG_PAUSE_ENABLE_TX_B); + *rx_en = hbg_reg_read_field(priv, HBG_REG_PAUSE_ENABLE_ADDR, + HBG_REG_PAUSE_ENABLE_RX_B); +} + +void hbg_hw_set_rx_pause_mac_addr(struct hbg_priv *priv, u64 mac_addr) +{ + hbg_reg_write64(priv, HBG_REG_FD_FC_ADDR_LOW_ADDR, mac_addr); +} + static void hbg_hw_init_transmit_ctrl(struct hbg_priv *priv) { u32 ctrl = 0; diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.h b/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.h index 6eb4b7d2cba84..a4a049b5121d3 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.h +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.h @@ -56,5 +56,8 @@ u32 hbg_hw_get_fifo_used_num(struct hbg_priv *priv, enum hbg_dir dir); void hbg_hw_set_tx_desc(struct hbg_priv *priv, struct hbg_tx_desc *tx_desc); void hbg_hw_fill_buffer(struct hbg_priv *priv, u32 buffer_dma_addr); void hbg_hw_set_mac_filter_enable(struct hbg_priv *priv, u32 enable); +void hbg_hw_set_pause_enable(struct hbg_priv *priv, u32 tx_en, u32 rx_en); +void hbg_hw_get_pause_enable(struct hbg_priv *priv, u32 *tx_en, u32 *rx_en); +void hbg_hw_set_rx_pause_mac_addr(struct hbg_priv *priv, u64 mac_addr); #endif diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_main.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_main.c index 578ba8ee409be..8a2d63c0c1964 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_main.c +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_main.c @@ -191,6 +191,7 @@ static int hbg_net_set_mac_address(struct net_device *netdev, void *addr) if (exists) hbg_set_mac_to_mac_table(priv, index, NULL); + hbg_hw_set_rx_pause_mac_addr(priv, ether_addr_to_u64(mac_addr)); dev_addr_set(netdev, mac_addr); return 0; } diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_mdio.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_mdio.c index a3479fba85016..db6bc4cfb971f 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_mdio.c +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_mdio.c @@ -114,6 +114,19 @@ static void hbg_mdio_init_hw(struct hbg_priv *priv) hbg_mdio_set_command(mac, cmd); } +static void hbg_flowctrl_cfg(struct hbg_priv *priv) +{ + struct phy_device *phydev = priv->mac.phydev; + bool rx_pause; + bool tx_pause; + + if (!priv->mac.pause_autoneg) + return; + + phy_get_pause(phydev, &tx_pause, &rx_pause); + hbg_hw_set_pause_enable(priv, tx_pause, rx_pause); +} + static void hbg_phy_adjust_link(struct net_device *netdev) { struct hbg_priv *priv = netdev_priv(netdev); @@ -140,6 +153,7 @@ static void hbg_phy_adjust_link(struct net_device *netdev) priv->mac.duplex = phydev->duplex; priv->mac.autoneg = phydev->autoneg; hbg_hw_adjust_link(priv, speed, phydev->duplex); + hbg_flowctrl_cfg(priv); } priv->mac.link_status = phydev->link; @@ -168,6 +182,7 @@ static int hbg_phy_connect(struct hbg_priv *priv) return ret; phy_remove_link_mode(phydev, ETHTOOL_LINK_MODE_1000baseT_Half_BIT); + phy_support_asym_pause(phydev); phy_attached_info(phydev); return 0; diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_reg.h b/drivers/net/ethernet/hisilicon/hibmcge/hbg_reg.h index 665666712c7cd..f12efc12f3c54 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_reg.h +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_reg.h @@ -51,6 +51,8 @@ #define HBG_REG_PORT_ENABLE_RX_B BIT(1) #define HBG_REG_PORT_ENABLE_TX_B BIT(2) #define HBG_REG_PAUSE_ENABLE_ADDR (HBG_REG_SGMII_BASE + 0x0048) +#define HBG_REG_PAUSE_ENABLE_RX_B BIT(0) +#define HBG_REG_PAUSE_ENABLE_TX_B BIT(1) #define HBG_REG_AN_NEG_STATE_ADDR (HBG_REG_SGMII_BASE + 0x0058) #define HBG_REG_TRANSMIT_CTRL_ADDR (HBG_REG_SGMII_BASE + 0x0060) #define HBG_REG_TRANSMIT_CTRL_PAD_EN_B BIT(7) From 3f5a61f6d504f55ed1a36cce044d5123d508721f Mon Sep 17 00:00:00 2001 From: Jijie Shao Date: Mon, 16 Dec 2024 12:05:31 +0800 Subject: [PATCH 0686/1450] net: hibmcge: Add reset supported in this module Sometimes, if the port doesn't work, we can try to fix it by resetting it. This patch supports reset triggered by ethtool or FLR of PCIe, For example: ethtool --reset eth0 dedicated echo 1 > /sys/bus/pci/devices/0000\:83\:00.1/reset We hope that the reset can be performed only when the port is down, and the port cannot be up during the reset. Therefore, the entire reset process is protected by the rtnl lock. After the reset is complete, the hardware registers are restored to their default values. Therefore, some rebuild operations are required to rewrite the user configuration to the registers. Signed-off-by: Jijie Shao Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241216040532.1566229-7-shaojijie@huawei.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/hisilicon/hibmcge/Makefile | 2 +- .../ethernet/hisilicon/hibmcge/hbg_common.h | 16 +++ .../ethernet/hisilicon/hibmcge/hbg_debugfs.c | 22 +++ .../net/ethernet/hisilicon/hibmcge/hbg_err.c | 134 ++++++++++++++++++ .../net/ethernet/hisilicon/hibmcge/hbg_err.h | 13 ++ .../ethernet/hisilicon/hibmcge/hbg_ethtool.c | 15 ++ .../net/ethernet/hisilicon/hibmcge/hbg_hw.c | 10 +- .../net/ethernet/hisilicon/hibmcge/hbg_main.c | 35 +++-- 8 files changed, 226 insertions(+), 21 deletions(-) create mode 100644 drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c create mode 100644 drivers/net/ethernet/hisilicon/hibmcge/hbg_err.h diff --git a/drivers/net/ethernet/hisilicon/hibmcge/Makefile b/drivers/net/ethernet/hisilicon/hibmcge/Makefile index 1a0ec2fb8c24d..7ea15f9ef8492 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/Makefile +++ b/drivers/net/ethernet/hisilicon/hibmcge/Makefile @@ -6,4 +6,4 @@ obj-$(CONFIG_HIBMCGE) += hibmcge.o hibmcge-objs = hbg_main.o hbg_hw.o hbg_mdio.o hbg_irq.o hbg_txrx.o hbg_ethtool.o \ - hbg_debugfs.o + hbg_debugfs.o hbg_err.o diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_common.h b/drivers/net/ethernet/hisilicon/hibmcge/hbg_common.h index cc143a5367130..b4300d8ea4ad2 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_common.h +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_common.h @@ -4,6 +4,7 @@ #ifndef __HBG_COMMON_H #define __HBG_COMMON_H +#include #include #include #include "hbg_reg.h" @@ -33,6 +34,14 @@ enum hbg_tx_state { enum hbg_nic_state { HBG_NIC_STATE_EVENT_HANDLING = 0, + HBG_NIC_STATE_RESETTING, + HBG_NIC_STATE_RESET_FAIL, +}; + +enum hbg_reset_type { + HBG_RESET_TYPE_NONE = 0, + HBG_RESET_TYPE_FLR, + HBG_RESET_TYPE_FUNCTION, }; struct hbg_buffer { @@ -128,6 +137,11 @@ struct hbg_mac_filter { bool enabled; }; +/* saved for restore after rest */ +struct hbg_user_def { + struct ethtool_pauseparam pause_param; +}; + struct hbg_priv { struct net_device *netdev; struct pci_dev *pdev; @@ -139,6 +153,8 @@ struct hbg_priv { struct hbg_ring tx_ring; struct hbg_ring rx_ring; struct hbg_mac_filter filter; + enum hbg_reset_type reset_type; + struct hbg_user_def user_def; }; #endif diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_debugfs.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_debugfs.c index 616b86333eec1..8473c43d171a9 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_debugfs.c @@ -19,6 +19,8 @@ struct hbg_dbg_info { int (*read)(struct seq_file *seq, void *data); }; +#define state_str_true_false(p, s) str_true_false(test_bit(s, &(p)->state)) + static void hbg_dbg_ring(struct hbg_priv *priv, struct hbg_ring *ring, struct seq_file *s) { @@ -97,11 +99,31 @@ static int hbg_dbg_mac_table(struct seq_file *s, void *unused) return 0; } +static const char * const reset_type_str[] = {"None", "FLR", "Function"}; + +static int hbg_dbg_nic_state(struct seq_file *s, void *unused) +{ + struct net_device *netdev = dev_get_drvdata(s->private); + struct hbg_priv *priv = netdev_priv(netdev); + + seq_printf(s, "event handling state: %s\n", + state_str_true_false(priv, HBG_NIC_STATE_EVENT_HANDLING)); + seq_printf(s, "resetting state: %s\n", + state_str_true_false(priv, HBG_NIC_STATE_RESETTING)); + seq_printf(s, "reset fail state: %s\n", + state_str_true_false(priv, HBG_NIC_STATE_RESET_FAIL)); + seq_printf(s, "last reset type: %s\n", + reset_type_str[priv->reset_type]); + + return 0; +} + static const struct hbg_dbg_info hbg_dbg_infos[] = { { "tx_ring", hbg_dbg_tx_ring }, { "rx_ring", hbg_dbg_rx_ring }, { "irq_info", hbg_dbg_irq_info }, { "mac_table", hbg_dbg_mac_table }, + { "nic_state", hbg_dbg_nic_state }, }; static void hbg_debugfs_uninit(void *data) diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c new file mode 100644 index 0000000000000..4d1f4a33391a8 --- /dev/null +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0+ +// Copyright (c) 2024 Hisilicon Limited. + +#include +#include +#include +#include +#include "hbg_common.h" +#include "hbg_err.h" +#include "hbg_hw.h" + +static void hbg_restore_mac_table(struct hbg_priv *priv) +{ + struct hbg_mac_filter *filter = &priv->filter; + u64 addr; + u32 i; + + for (i = 0; i < filter->table_max_len; i++) + if (!is_zero_ether_addr(filter->mac_table[i].addr)) { + addr = ether_addr_to_u64(filter->mac_table[i].addr); + hbg_hw_set_uc_addr(priv, addr, i); + } + + hbg_hw_set_mac_filter_enable(priv, priv->filter.enabled); +} + +static void hbg_restore_user_def_settings(struct hbg_priv *priv) +{ + struct ethtool_pauseparam *pause_param = &priv->user_def.pause_param; + + hbg_restore_mac_table(priv); + hbg_hw_set_mtu(priv, priv->netdev->mtu); + hbg_hw_set_pause_enable(priv, pause_param->tx_pause, + pause_param->rx_pause); +} + +int hbg_rebuild(struct hbg_priv *priv) +{ + int ret; + + ret = hbg_hw_init(priv); + if (ret) + return ret; + + hbg_restore_user_def_settings(priv); + return 0; +} + +static int hbg_reset_prepare(struct hbg_priv *priv, enum hbg_reset_type type) +{ + int ret; + + ASSERT_RTNL(); + + if (netif_running(priv->netdev)) { + dev_warn(&priv->pdev->dev, + "failed to reset because port is up\n"); + return -EBUSY; + } + + priv->reset_type = type; + set_bit(HBG_NIC_STATE_RESETTING, &priv->state); + clear_bit(HBG_NIC_STATE_RESET_FAIL, &priv->state); + ret = hbg_hw_event_notify(priv, HBG_HW_EVENT_RESET); + if (ret) { + set_bit(HBG_NIC_STATE_RESET_FAIL, &priv->state); + clear_bit(HBG_NIC_STATE_RESETTING, &priv->state); + } + + return ret; +} + +static int hbg_reset_done(struct hbg_priv *priv, enum hbg_reset_type type) +{ + int ret; + + if (!test_bit(HBG_NIC_STATE_RESETTING, &priv->state) || + type != priv->reset_type) + return 0; + + ASSERT_RTNL(); + + clear_bit(HBG_NIC_STATE_RESETTING, &priv->state); + ret = hbg_rebuild(priv); + if (ret) { + set_bit(HBG_NIC_STATE_RESET_FAIL, &priv->state); + dev_err(&priv->pdev->dev, "failed to rebuild after reset\n"); + return ret; + } + + dev_info(&priv->pdev->dev, "reset done\n"); + return ret; +} + +/* must be protected by rtnl lock */ +int hbg_reset(struct hbg_priv *priv) +{ + int ret; + + ASSERT_RTNL(); + ret = hbg_reset_prepare(priv, HBG_RESET_TYPE_FUNCTION); + if (ret) + return ret; + + return hbg_reset_done(priv, HBG_RESET_TYPE_FUNCTION); +} + +static void hbg_pci_err_reset_prepare(struct pci_dev *pdev) +{ + struct net_device *netdev = pci_get_drvdata(pdev); + struct hbg_priv *priv = netdev_priv(netdev); + + rtnl_lock(); + hbg_reset_prepare(priv, HBG_RESET_TYPE_FLR); +} + +static void hbg_pci_err_reset_done(struct pci_dev *pdev) +{ + struct net_device *netdev = pci_get_drvdata(pdev); + struct hbg_priv *priv = netdev_priv(netdev); + + hbg_reset_done(priv, HBG_RESET_TYPE_FLR); + rtnl_unlock(); +} + +static const struct pci_error_handlers hbg_pci_err_handler = { + .reset_prepare = hbg_pci_err_reset_prepare, + .reset_done = hbg_pci_err_reset_done, +}; + +void hbg_set_pci_err_handler(struct pci_driver *pdrv) +{ + pdrv->err_handler = &hbg_pci_err_handler; +} diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.h b/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.h new file mode 100644 index 0000000000000..d7828e446308f --- /dev/null +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* Copyright (c) 2024 Hisilicon Limited. */ + +#ifndef __HBG_ERR_H +#define __HBG_ERR_H + +#include + +void hbg_set_pci_err_handler(struct pci_driver *pdrv); +int hbg_reset(struct hbg_priv *priv); +int hbg_rebuild(struct hbg_priv *priv); + +#endif diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_ethtool.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_ethtool.c index a821a92db43db..326228b7b801b 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_ethtool.c @@ -3,7 +3,9 @@ #include #include +#include #include "hbg_common.h" +#include "hbg_err.h" #include "hbg_ethtool.h" #include "hbg_hw.h" @@ -163,9 +165,21 @@ static int hbg_ethtool_set_pauseparam(struct net_device *net_dev, if (!param->autoneg) hbg_hw_set_pause_enable(priv, param->tx_pause, param->rx_pause); + priv->user_def.pause_param = *param; return 0; } +static int hbg_ethtool_reset(struct net_device *netdev, u32 *flags) +{ + struct hbg_priv *priv = netdev_priv(netdev); + + if (*flags != ETH_RESET_DEDICATED) + return -EOPNOTSUPP; + + *flags = 0; + return hbg_reset(priv); +} + static const struct ethtool_ops hbg_ethtool_ops = { .get_link = ethtool_op_get_link, .get_link_ksettings = phy_ethtool_get_link_ksettings, @@ -174,6 +188,7 @@ static const struct ethtool_ops hbg_ethtool_ops = { .get_regs = hbg_ethtool_get_regs, .get_pauseparam = hbg_ethtool_get_pauseparam, .set_pauseparam = hbg_ethtool_set_pauseparam, + .reset = hbg_ethtool_reset, }; void hbg_ethtool_set_ops(struct net_device *netdev) diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c index 0cbe9f7229b34..e7798f2136450 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c @@ -3,6 +3,7 @@ #include #include +#include #include #include #include "hbg_common.h" @@ -167,8 +168,13 @@ static void hbg_hw_set_mac_max_frame_len(struct hbg_priv *priv, void hbg_hw_set_mtu(struct hbg_priv *priv, u16 mtu) { - hbg_hw_set_pcu_max_frame_len(priv, mtu); - hbg_hw_set_mac_max_frame_len(priv, mtu); + u32 frame_len; + + frame_len = mtu + VLAN_HLEN * priv->dev_specs.vlan_layers + + ETH_HLEN + ETH_FCS_LEN; + + hbg_hw_set_pcu_max_frame_len(priv, frame_len); + hbg_hw_set_mac_max_frame_len(priv, frame_len); } void hbg_hw_mac_enable(struct hbg_priv *priv, u32 enable) diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_main.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_main.c index 8a2d63c0c1964..bb0f25ac97600 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_main.c +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_main.c @@ -6,6 +6,7 @@ #include #include #include "hbg_common.h" +#include "hbg_err.h" #include "hbg_ethtool.h" #include "hbg_hw.h" #include "hbg_irq.h" @@ -13,8 +14,6 @@ #include "hbg_txrx.h" #include "hbg_debugfs.h" -static void hbg_change_mtu(struct hbg_priv *priv, int new_mtu); - static void hbg_all_irq_enable(struct hbg_priv *priv, bool enabled) { struct hbg_irq_info *info; @@ -56,11 +55,7 @@ static int hbg_hw_txrx_clear(struct hbg_priv *priv) return ret; /* After reset, regs need to be reconfigured */ - hbg_hw_init(priv); - hbg_hw_set_uc_addr(priv, ether_addr_to_u64(priv->netdev->dev_addr), 0); - hbg_change_mtu(priv, priv->netdev->mtu); - - return 0; + return hbg_rebuild(priv); } static int hbg_net_stop(struct net_device *netdev) @@ -196,15 +191,6 @@ static int hbg_net_set_mac_address(struct net_device *netdev, void *addr) return 0; } -static void hbg_change_mtu(struct hbg_priv *priv, int new_mtu) -{ - u32 frame_len; - - frame_len = new_mtu + VLAN_HLEN * priv->dev_specs.vlan_layers + - ETH_HLEN + ETH_FCS_LEN; - hbg_hw_set_mtu(priv, frame_len); -} - static int hbg_net_change_mtu(struct net_device *netdev, int new_mtu) { struct hbg_priv *priv = netdev_priv(netdev); @@ -212,7 +198,7 @@ static int hbg_net_change_mtu(struct net_device *netdev, int new_mtu) if (netif_running(netdev)) return -EBUSY; - hbg_change_mtu(priv, new_mtu); + hbg_hw_set_mtu(priv, new_mtu); WRITE_ONCE(netdev->mtu, new_mtu); dev_dbg(&priv->pdev->dev, @@ -270,6 +256,17 @@ static int hbg_mac_filter_init(struct hbg_priv *priv) return 0; } +static void hbg_init_user_def(struct hbg_priv *priv) +{ + struct ethtool_pauseparam *pause_param = &priv->user_def.pause_param; + + priv->mac.pause_autoneg = HBG_STATUS_ENABLE; + + pause_param->autoneg = priv->mac.pause_autoneg; + hbg_hw_get_pause_enable(priv, &pause_param->tx_pause, + &pause_param->rx_pause); +} + static int hbg_init(struct hbg_priv *priv) { int ret; @@ -295,6 +292,7 @@ static int hbg_init(struct hbg_priv *priv) return ret; hbg_debugfs_init(priv); + hbg_init_user_def(priv); return 0; } @@ -359,7 +357,7 @@ static int hbg_probe(struct pci_dev *pdev, const struct pci_device_id *ent) netdev->netdev_ops = &hbg_netdev_ops; netdev->watchdog_timeo = 5 * HZ; - hbg_change_mtu(priv, ETH_DATA_LEN); + hbg_hw_set_mtu(priv, ETH_DATA_LEN); hbg_net_set_mac_address(priv->netdev, &priv->dev_specs.mac_addr); hbg_ethtool_set_ops(netdev); @@ -388,6 +386,7 @@ static int __init hbg_module_init(void) int ret; hbg_debugfs_register(); + hbg_set_pci_err_handler(&hbg_driver); ret = pci_register_driver(&hbg_driver); if (ret) hbg_debugfs_unregister(); From adb42b1e0ef32f80f6f02374342aa5c223e9d17f Mon Sep 17 00:00:00 2001 From: Jijie Shao Date: Mon, 16 Dec 2024 12:05:32 +0800 Subject: [PATCH 0687/1450] net: hibmcge: Add nway_reset supported in this module Add nway_reset supported in this module Signed-off-by: Jijie Shao Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241216040532.1566229-8-shaojijie@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/hisilicon/hibmcge/hbg_ethtool.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_ethtool.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_ethtool.c index 326228b7b801b..00364a438ec2e 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_ethtool.c @@ -189,6 +189,7 @@ static const struct ethtool_ops hbg_ethtool_ops = { .get_pauseparam = hbg_ethtool_get_pauseparam, .set_pauseparam = hbg_ethtool_set_pauseparam, .reset = hbg_ethtool_reset, + .nway_reset = phy_ethtool_nway_reset, }; void hbg_ethtool_set_ops(struct net_device *netdev) From fca2977629f49dee437e217c3fc423b6e0cad98c Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Mon, 7 Oct 2024 10:23:58 +0200 Subject: [PATCH 0688/1450] can: m_can: set init flag earlier in probe While an m_can controller usually already has the init flag from a hardware reset, no such reset happens on the integrated m_can_pci of the Intel Elkhart Lake. If the CAN controller is found in an active state, m_can_dev_setup() would fail because m_can_niso_supported() calls m_can_cccr_update_bits(), which refuses to modify any other configuration bits when CCCR_INIT is not set. To avoid this issue, set CCCR_INIT before attempting to modify any other configuration flags. Fixes: cd5a46ce6fa6 ("can: m_can: don't enable transceiver when probing") Signed-off-by: Matthias Schiffer Reviewed-by: Markus Schneider-Pargmann Link: https://patch.msgid.link/e247f331cb72829fcbdfda74f31a59cbad1a6006.1728288535.git.matthias.schiffer@ew.tq-group.com Signed-off-by: Marc Kleine-Budde --- drivers/net/can/m_can/m_can.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c index 533bcb77c9f93..67c404fbe1667 100644 --- a/drivers/net/can/m_can/m_can.c +++ b/drivers/net/can/m_can/m_can.c @@ -1695,6 +1695,14 @@ static int m_can_dev_setup(struct m_can_classdev *cdev) return -EINVAL; } + /* Write the INIT bit, in case no hardware reset has happened before + * the probe (for example, it was observed that the Intel Elkhart Lake + * SoCs do not properly reset the CAN controllers on reboot) + */ + err = m_can_cccr_update_bits(cdev, CCCR_INIT, CCCR_INIT); + if (err) + return err; + if (!cdev->is_peripheral) netif_napi_add(dev, &cdev->napi, m_can_poll); @@ -1746,11 +1754,7 @@ static int m_can_dev_setup(struct m_can_classdev *cdev) return -EINVAL; } - /* Forcing standby mode should be redundant, as the chip should be in - * standby after a reset. Write the INIT bit anyways, should the chip - * be configured by previous stage. - */ - return m_can_cccr_update_bits(cdev, CCCR_INIT, CCCR_INIT); + return 0; } static void m_can_stop(struct net_device *dev) From 743375f8deee360b0e902074bab99b0c9368d42f Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Mon, 7 Oct 2024 10:23:59 +0200 Subject: [PATCH 0689/1450] can: m_can: fix missed interrupts with m_can_pci The interrupt line of PCI devices is interpreted as edge-triggered, however the interrupt signal of the m_can controller integrated in Intel Elkhart Lake CPUs appears to be generated level-triggered. Consider the following sequence of events: - IR register is read, interrupt X is set - A new interrupt Y is triggered in the m_can controller - IR register is written to acknowledge interrupt X. Y remains set in IR As at no point in this sequence no interrupt flag is set in IR, the m_can interrupt line will never become deasserted, and no edge will ever be observed to trigger another run of the ISR. This was observed to result in the TX queue of the EHL m_can to get stuck under high load, because frames were queued to the hardware in m_can_start_xmit(), but m_can_finish_tx() was never run to account for their successful transmission. On an Elkhart Lake based board with the two CAN interfaces connected to each other, the following script can reproduce the issue: ip link set can0 up type can bitrate 1000000 ip link set can1 up type can bitrate 1000000 cangen can0 -g 2 -I 000 -L 8 & cangen can0 -g 2 -I 001 -L 8 & cangen can0 -g 2 -I 002 -L 8 & cangen can0 -g 2 -I 003 -L 8 & cangen can0 -g 2 -I 004 -L 8 & cangen can0 -g 2 -I 005 -L 8 & cangen can0 -g 2 -I 006 -L 8 & cangen can0 -g 2 -I 007 -L 8 & cangen can1 -g 2 -I 100 -L 8 & cangen can1 -g 2 -I 101 -L 8 & cangen can1 -g 2 -I 102 -L 8 & cangen can1 -g 2 -I 103 -L 8 & cangen can1 -g 2 -I 104 -L 8 & cangen can1 -g 2 -I 105 -L 8 & cangen can1 -g 2 -I 106 -L 8 & cangen can1 -g 2 -I 107 -L 8 & stress-ng --matrix 0 & To fix the issue, repeatedly read and acknowledge interrupts at the start of the ISR until no interrupt flags are set, so the next incoming interrupt will also result in an edge on the interrupt line. While we have received a report that even with this patch, the TX queue can become stuck under certain (currently unknown) circumstances on the Elkhart Lake, this patch completely fixes the issue with the above reproducer, and it is unclear whether the remaining issue has a similar cause at all. Fixes: cab7ffc0324f ("can: m_can: add PCI glue driver for Intel Elkhart Lake") Signed-off-by: Matthias Schiffer Reviewed-by: Markus Schneider-Pargmann Link: https://patch.msgid.link/fdf0439c51bcb3a46c21e9fb21c7f1d06363be84.1728288535.git.matthias.schiffer@ew.tq-group.com Signed-off-by: Marc Kleine-Budde --- drivers/net/can/m_can/m_can.c | 22 +++++++++++++++++----- drivers/net/can/m_can/m_can.h | 1 + drivers/net/can/m_can/m_can_pci.c | 1 + 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c index 67c404fbe1667..97cd8bbf2e32a 100644 --- a/drivers/net/can/m_can/m_can.c +++ b/drivers/net/can/m_can/m_can.c @@ -1220,20 +1220,32 @@ static void m_can_coalescing_update(struct m_can_classdev *cdev, u32 ir) static int m_can_interrupt_handler(struct m_can_classdev *cdev) { struct net_device *dev = cdev->net; - u32 ir; + u32 ir = 0, ir_read; int ret; if (pm_runtime_suspended(cdev->dev)) return IRQ_NONE; - ir = m_can_read(cdev, M_CAN_IR); + /* The m_can controller signals its interrupt status as a level, but + * depending in the integration the CPU may interpret the signal as + * edge-triggered (for example with m_can_pci). For these + * edge-triggered integrations, we must observe that IR is 0 at least + * once to be sure that the next interrupt will generate an edge. + */ + while ((ir_read = m_can_read(cdev, M_CAN_IR)) != 0) { + ir |= ir_read; + + /* ACK all irqs */ + m_can_write(cdev, M_CAN_IR, ir); + + if (!cdev->irq_edge_triggered) + break; + } + m_can_coalescing_update(cdev, ir); if (!ir) return IRQ_NONE; - /* ACK all irqs */ - m_can_write(cdev, M_CAN_IR, ir); - if (cdev->ops->clear_interrupts) cdev->ops->clear_interrupts(cdev); diff --git a/drivers/net/can/m_can/m_can.h b/drivers/net/can/m_can/m_can.h index 92b2bd8628e6b..ef39e8e527ab6 100644 --- a/drivers/net/can/m_can/m_can.h +++ b/drivers/net/can/m_can/m_can.h @@ -99,6 +99,7 @@ struct m_can_classdev { int pm_clock_support; int pm_wake_source; int is_peripheral; + bool irq_edge_triggered; // Cached M_CAN_IE register content u32 active_interrupts; diff --git a/drivers/net/can/m_can/m_can_pci.c b/drivers/net/can/m_can/m_can_pci.c index d72fe771dfc7a..9ad7419f88f83 100644 --- a/drivers/net/can/m_can/m_can_pci.c +++ b/drivers/net/can/m_can/m_can_pci.c @@ -127,6 +127,7 @@ static int m_can_pci_probe(struct pci_dev *pci, const struct pci_device_id *id) mcan_class->pm_clock_support = 1; mcan_class->pm_wake_source = 0; mcan_class->can.clock.freq = id->driver_data; + mcan_class->irq_edge_triggered = true; mcan_class->ops = &m_can_pci_ops; pci_set_drvdata(pci, mcan_class); From 05aa156e156ef3168e7ab8a68721945196495c17 Mon Sep 17 00:00:00 2001 From: Haren Myneni Date: Fri, 13 Dec 2024 21:17:58 -0800 Subject: [PATCH 0690/1450] powerpc/pseries/vas: Add close() callback in vas_vm_ops struct The mapping VMA address is saved in VAS window struct when the paste address is mapped. This VMA address is used during migration to unmap the paste address if the window is active. The paste address mapping will be removed when the window is closed or with the munmap(). But the VMA address in the VAS window is not updated with munmap() which is causing invalid access during migration. The KASAN report shows: [16386.254991] BUG: KASAN: slab-use-after-free in reconfig_close_windows+0x1a0/0x4e8 [16386.255043] Read of size 8 at addr c00000014a819670 by task drmgr/696928 [16386.255096] CPU: 29 UID: 0 PID: 696928 Comm: drmgr Kdump: loaded Tainted: G B 6.11.0-rc5-nxgzip #2 [16386.255128] Tainted: [B]=BAD_PAGE [16386.255148] Hardware name: IBM,9080-HEX Power11 (architected) 0x820200 0xf000007 of:IBM,FW1110.00 (NH1110_016) hv:phyp pSeries [16386.255181] Call Trace: [16386.255202] [c00000016b297660] [c0000000018ad0ac] dump_stack_lvl+0x84/0xe8 (unreliable) [16386.255246] [c00000016b297690] [c0000000006e8a90] print_report+0x19c/0x764 [16386.255285] [c00000016b297760] [c0000000006e9490] kasan_report+0x128/0x1f8 [16386.255309] [c00000016b297880] [c0000000006eb5c8] __asan_load8+0xac/0xe0 [16386.255326] [c00000016b2978a0] [c00000000013f898] reconfig_close_windows+0x1a0/0x4e8 [16386.255343] [c00000016b297990] [c000000000140e58] vas_migration_handler+0x3a4/0x3fc [16386.255368] [c00000016b297a90] [c000000000128848] pseries_migrate_partition+0x4c/0x4c4 ... [16386.256136] Allocated by task 696554 on cpu 31 at 16377.277618s: [16386.256149] kasan_save_stack+0x34/0x68 [16386.256163] kasan_save_track+0x34/0x80 [16386.256175] kasan_save_alloc_info+0x58/0x74 [16386.256196] __kasan_slab_alloc+0xb8/0xdc [16386.256209] kmem_cache_alloc_noprof+0x200/0x3d0 [16386.256225] vm_area_alloc+0x44/0x150 [16386.256245] mmap_region+0x214/0x10c4 [16386.256265] do_mmap+0x5fc/0x750 [16386.256277] vm_mmap_pgoff+0x14c/0x24c [16386.256292] ksys_mmap_pgoff+0x20c/0x348 [16386.256303] sys_mmap+0xd0/0x160 ... [16386.256350] Freed by task 0 on cpu 31 at 16386.204848s: [16386.256363] kasan_save_stack+0x34/0x68 [16386.256374] kasan_save_track+0x34/0x80 [16386.256384] kasan_save_free_info+0x64/0x10c [16386.256396] __kasan_slab_free+0x120/0x204 [16386.256415] kmem_cache_free+0x128/0x450 [16386.256428] vm_area_free_rcu_cb+0xa8/0xd8 [16386.256441] rcu_do_batch+0x2c8/0xcf0 [16386.256458] rcu_core+0x378/0x3c4 [16386.256473] handle_softirqs+0x20c/0x60c [16386.256495] do_softirq_own_stack+0x6c/0x88 [16386.256509] do_softirq_own_stack+0x58/0x88 [16386.256521] __irq_exit_rcu+0x1a4/0x20c [16386.256533] irq_exit+0x20/0x38 [16386.256544] interrupt_async_exit_prepare.constprop.0+0x18/0x2c ... [16386.256717] Last potentially related work creation: [16386.256729] kasan_save_stack+0x34/0x68 [16386.256741] __kasan_record_aux_stack+0xcc/0x12c [16386.256753] __call_rcu_common.constprop.0+0x94/0xd04 [16386.256766] vm_area_free+0x28/0x3c [16386.256778] remove_vma+0xf4/0x114 [16386.256797] do_vmi_align_munmap.constprop.0+0x684/0x870 [16386.256811] __vm_munmap+0xe0/0x1f8 [16386.256821] sys_munmap+0x54/0x6c [16386.256830] system_call_exception+0x1a0/0x4a0 [16386.256841] system_call_vectored_common+0x15c/0x2ec [16386.256868] The buggy address belongs to the object at c00000014a819670 which belongs to the cache vm_area_struct of size 168 [16386.256887] The buggy address is located 0 bytes inside of freed 168-byte region [c00000014a819670, c00000014a819718) [16386.256915] The buggy address belongs to the physical page: [16386.256928] page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x14a81 [16386.256950] memcg:c0000000ba430001 [16386.256961] anon flags: 0x43ffff800000000(node=4|zone=0|lastcpupid=0x7ffff) [16386.256975] page_type: 0xfdffffff(slab) [16386.256990] raw: 043ffff800000000 c00000000501c080 0000000000000000 5deadbee00000001 [16386.257003] raw: 0000000000000000 00000000011a011a 00000001fdffffff c0000000ba430001 [16386.257018] page dumped because: kasan: bad access detected This patch adds close() callback in vas_vm_ops vm_operations_struct which will be executed during munmap() before freeing VMA. The VMA address in the VAS window is set to NULL after holding the window mmap_mutex. Fixes: 37e6764895ef ("powerpc/pseries/vas: Add VAS migration handler") Signed-off-by: Haren Myneni Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20241214051758.997759-1-haren@linux.ibm.com --- arch/powerpc/platforms/book3s/vas-api.c | 36 +++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/arch/powerpc/platforms/book3s/vas-api.c b/arch/powerpc/platforms/book3s/vas-api.c index f381b177ea06a..0b6365d85d117 100644 --- a/arch/powerpc/platforms/book3s/vas-api.c +++ b/arch/powerpc/platforms/book3s/vas-api.c @@ -464,7 +464,43 @@ static vm_fault_t vas_mmap_fault(struct vm_fault *vmf) return VM_FAULT_SIGBUS; } +/* + * During mmap() paste address, mapping VMA is saved in VAS window + * struct which is used to unmap during migration if the window is + * still open. But the user space can remove this mapping with + * munmap() before closing the window and the VMA address will + * be invalid. Set VAS window VMA to NULL in this function which + * is called before VMA free. + */ +static void vas_mmap_close(struct vm_area_struct *vma) +{ + struct file *fp = vma->vm_file; + struct coproc_instance *cp_inst = fp->private_data; + struct vas_window *txwin; + + /* Should not happen */ + if (!cp_inst || !cp_inst->txwin) { + pr_err("No attached VAS window for the paste address mmap\n"); + return; + } + + txwin = cp_inst->txwin; + /* + * task_ref.vma is set in coproc_mmap() during mmap paste + * address. So it has to be the same VMA that is getting freed. + */ + if (WARN_ON(txwin->task_ref.vma != vma)) { + pr_err("Invalid paste address mmaping\n"); + return; + } + + mutex_lock(&txwin->task_ref.mmap_mutex); + txwin->task_ref.vma = NULL; + mutex_unlock(&txwin->task_ref.mmap_mutex); +} + static const struct vm_operations_struct vas_vm_ops = { + .close = vas_mmap_close, .fault = vas_mmap_fault, }; From edc19bd0e571c732cd01c8da62f904e6d2a29a48 Mon Sep 17 00:00:00 2001 From: Fabrice Gasnier Date: Tue, 17 Dec 2024 16:00:21 +0100 Subject: [PATCH 0691/1450] pwm: stm32: Fix complementary output in round_waveform_tohw() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the timer supports complementary output, the CCxNE bit must be set additionally to the CCxE bit. So to not overwrite the latter use |= instead of = to set the former. Fixes: deaba9cff809 ("pwm: stm32: Implementation of the waveform callbacks") Signed-off-by: Fabrice Gasnier Link: https://lore.kernel.org/r/20241217150021.2030213-1-fabrice.gasnier@foss.st.com [ukleinek: Slightly improve commit log] Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-stm32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pwm/pwm-stm32.c b/drivers/pwm/pwm-stm32.c index b889e64522c3d..17e591f61efb6 100644 --- a/drivers/pwm/pwm-stm32.c +++ b/drivers/pwm/pwm-stm32.c @@ -84,7 +84,7 @@ static int stm32_pwm_round_waveform_tohw(struct pwm_chip *chip, wfhw->ccer = TIM_CCER_CCxE(ch + 1); if (priv->have_complementary_output) - wfhw->ccer = TIM_CCER_CCxNE(ch + 1); + wfhw->ccer |= TIM_CCER_CCxNE(ch + 1); rate = clk_get_rate(priv->clk); From 5a49edec44f638952da8dc8d754e76f462c19034 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 15 Dec 2024 17:43:55 +0000 Subject: [PATCH 0692/1450] net: dsa: qca8k: Fix inconsistent use of jiffies vs milliseconds wait_for_complete_timeout() expects a timeout in jiffies. With the driver, some call sites converted QCA8K_ETHERNET_TIMEOUT to jiffies, others did not. Make the code consistent by changes the #define to include a call to msecs_to_jiffies, and remove all other calls to msecs_to_jiffies. Signed-off-by: Andrew Lunn Tested-by: from Christian would be very welcome. Signed-off-by: David S. Miller --- drivers/net/dsa/qca/qca8k-8xxx.c | 4 ++-- drivers/net/dsa/qca/qca8k.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/dsa/qca/qca8k-8xxx.c b/drivers/net/dsa/qca/qca8k-8xxx.c index ec74e3c2b0e9b..90e24bc00b99c 100644 --- a/drivers/net/dsa/qca/qca8k-8xxx.c +++ b/drivers/net/dsa/qca/qca8k-8xxx.c @@ -342,7 +342,7 @@ static int qca8k_read_eth(struct qca8k_priv *priv, u32 reg, u32 *val, int len) dev_queue_xmit(skb); ret = wait_for_completion_timeout(&mgmt_eth_data->rw_done, - msecs_to_jiffies(QCA8K_ETHERNET_TIMEOUT)); + QCA8K_ETHERNET_TIMEOUT); *val = mgmt_eth_data->data[0]; if (len > QCA_HDR_MGMT_DATA1_LEN) @@ -394,7 +394,7 @@ static int qca8k_write_eth(struct qca8k_priv *priv, u32 reg, u32 *val, int len) dev_queue_xmit(skb); ret = wait_for_completion_timeout(&mgmt_eth_data->rw_done, - msecs_to_jiffies(QCA8K_ETHERNET_TIMEOUT)); + QCA8K_ETHERNET_TIMEOUT); ack = mgmt_eth_data->ack; diff --git a/drivers/net/dsa/qca/qca8k.h b/drivers/net/dsa/qca/qca8k.h index 3664a2e2f1f64..24962a395754c 100644 --- a/drivers/net/dsa/qca/qca8k.h +++ b/drivers/net/dsa/qca/qca8k.h @@ -16,7 +16,7 @@ #define QCA8K_ETHERNET_MDIO_PRIORITY 7 #define QCA8K_ETHERNET_PHY_PRIORITY 6 -#define QCA8K_ETHERNET_TIMEOUT 5 +#define QCA8K_ETHERNET_TIMEOUT msecs_to_jiffies(5) #define QCA8K_NUM_PORTS 7 #define QCA8K_NUM_CPU_PORTS 2 From c1bad69f8baf562b1d522740dc76e48f2a2a1918 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 16 Dec 2024 16:56:05 +0000 Subject: [PATCH 0693/1450] net: Remove bouncing hippi list linux-hippi is bouncing with: : Sorry, no mailbox here by that name. (#5.1.1) Remove it. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 6cced90772fcd..907b379af0100 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10287,7 +10287,6 @@ F: drivers/input/touchscreen/himax_hx83112b.c HIPPI M: Jes Sorensen -L: linux-hippi@sunsite.dk S: Maintained F: drivers/net/hippi/ F: include/linux/hippidevice.h From 4feaedf7d243f1a9af36dfb2711a5641fe3559dc Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 16 Dec 2024 22:26:44 +0100 Subject: [PATCH 0694/1450] thermal/thresholds: Fix boundaries and detection routine The current implementation does not work if the thermal zone is interrupt driven only. The boundaries are not correctly checked and computed as it happens only when the temperature is increasing or decreasing. The problem arises because the routine to detect when we cross a threshold is correlated with the computation of the boundaries. We assume we have to recompute the boundaries when a threshold is crossed but actually we should do that even if the it is not the case. Mixing the boundaries computation and the threshold detection for the sake of optimizing the routine is much more complex as it appears intuitively and prone to errors. This fix separates the boundaries computation and the threshold crossing detection into different routines. The result is a code much more simple to understand, thus easier to maintain. The drawback is we browse the thresholds list several time but we can consider that as neglictible because that happens when the temperature is updated. There are certainly some aeras to improve in the temperature update routine but it would be not adequate as this change aims to fix the thresholds for v6.13. Fixes: 445936f9e258 ("thermal: core: Add user thresholds support") Tested-by: Daniel Lezcano # rock5b, Lenovo x13s Signed-off-by: Daniel Lezcano Link: https://patch.msgid.link/20241216212644.1145122-1-daniel.lezcano@linaro.org Signed-off-by: Rafael J. Wysocki --- drivers/thermal/thermal_thresholds.c | 68 +++++++++++++++------------- 1 file changed, 36 insertions(+), 32 deletions(-) diff --git a/drivers/thermal/thermal_thresholds.c b/drivers/thermal/thermal_thresholds.c index d9b2a0bb44fca..38f5fd0e89303 100644 --- a/drivers/thermal/thermal_thresholds.c +++ b/drivers/thermal/thermal_thresholds.c @@ -69,58 +69,60 @@ static struct user_threshold *__thermal_thresholds_find(const struct list_head * return NULL; } -static bool __thermal_threshold_is_crossed(struct user_threshold *threshold, int temperature, - int last_temperature, int direction, - int *low, int *high) +static bool thermal_thresholds_handle_raising(struct list_head *thresholds, int temperature, + int last_temperature) { + struct user_threshold *t; - if (temperature >= threshold->temperature) { - if (threshold->temperature > *low && - THERMAL_THRESHOLD_WAY_DOWN & threshold->direction) - *low = threshold->temperature; + list_for_each_entry(t, thresholds, list_node) { - if (last_temperature < threshold->temperature && - threshold->direction & direction) - return true; - } else { - if (threshold->temperature < *high && THERMAL_THRESHOLD_WAY_UP - & threshold->direction) - *high = threshold->temperature; + if (!(t->direction & THERMAL_THRESHOLD_WAY_UP)) + continue; - if (last_temperature >= threshold->temperature && - threshold->direction & direction) + if (temperature >= t->temperature && + last_temperature < t->temperature) return true; } return false; } -static bool thermal_thresholds_handle_raising(struct list_head *thresholds, int temperature, - int last_temperature, int *low, int *high) +static bool thermal_thresholds_handle_dropping(struct list_head *thresholds, int temperature, + int last_temperature) { struct user_threshold *t; - list_for_each_entry(t, thresholds, list_node) { - if (__thermal_threshold_is_crossed(t, temperature, last_temperature, - THERMAL_THRESHOLD_WAY_UP, low, high)) + list_for_each_entry_reverse(t, thresholds, list_node) { + + if (!(t->direction & THERMAL_THRESHOLD_WAY_DOWN)) + continue; + + if (temperature <= t->temperature && + last_temperature > t->temperature) return true; } return false; } -static bool thermal_thresholds_handle_dropping(struct list_head *thresholds, int temperature, - int last_temperature, int *low, int *high) +static void thermal_threshold_find_boundaries(struct list_head *thresholds, int temperature, + int *low, int *high) { struct user_threshold *t; - list_for_each_entry_reverse(t, thresholds, list_node) { - if (__thermal_threshold_is_crossed(t, temperature, last_temperature, - THERMAL_THRESHOLD_WAY_DOWN, low, high)) - return true; + list_for_each_entry(t, thresholds, list_node) { + if (temperature < t->temperature && + (t->direction & THERMAL_THRESHOLD_WAY_UP) && + *high > t->temperature) + *high = t->temperature; } - return false; + list_for_each_entry_reverse(t, thresholds, list_node) { + if (temperature > t->temperature && + (t->direction & THERMAL_THRESHOLD_WAY_DOWN) && + *low < t->temperature) + *low = t->temperature; + } } void thermal_thresholds_handle(struct thermal_zone_device *tz, int *low, int *high) @@ -132,6 +134,8 @@ void thermal_thresholds_handle(struct thermal_zone_device *tz, int *low, int *hi lockdep_assert_held(&tz->lock); + thermal_threshold_find_boundaries(thresholds, temperature, low, high); + /* * We need a second update in order to detect a threshold being crossed */ @@ -151,12 +155,12 @@ void thermal_thresholds_handle(struct thermal_zone_device *tz, int *low, int *hi * - decreased : thresholds are crossed the way down */ if (temperature > last_temperature) { - if (thermal_thresholds_handle_raising(thresholds, temperature, - last_temperature, low, high)) + if (thermal_thresholds_handle_raising(thresholds, + temperature, last_temperature)) thermal_notify_threshold_up(tz); } else { - if (thermal_thresholds_handle_dropping(thresholds, temperature, - last_temperature, low, high)) + if (thermal_thresholds_handle_dropping(thresholds, + temperature, last_temperature)) thermal_notify_threshold_down(tz); } } From c9e3ebdc52ebe028f238c9df5162ae92483bedd5 Mon Sep 17 00:00:00 2001 From: Shuming Fan Date: Wed, 18 Dec 2024 17:13:07 +0800 Subject: [PATCH 0695/1450] ASoC: rt722: add delay time to wait for the calibration procedure The calibration procedure needs some time to finish. This patch adds the delay time to ensure the calibration procedure is completed correctly. Signed-off-by: Shuming Fan Link: https://patch.msgid.link/20241218091307.96656-1-shumingf@realtek.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt722-sdca.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sound/soc/codecs/rt722-sdca.c b/sound/soc/codecs/rt722-sdca.c index 908846e994df3..e17a142d03b99 100644 --- a/sound/soc/codecs/rt722-sdca.c +++ b/sound/soc/codecs/rt722-sdca.c @@ -1468,13 +1468,18 @@ static void rt722_sdca_jack_preset(struct rt722_sdca_priv *rt722) 0x008d); /* check HP calibration FSM status */ for (loop_check = 0; loop_check < chk_cnt; loop_check++) { + usleep_range(10000, 11000); ret = rt722_sdca_index_read(rt722, RT722_VENDOR_CALI, RT722_DAC_DC_CALI_CTL3, &calib_status); - if (ret < 0 || loop_check == chk_cnt) + if (ret < 0) dev_dbg(&rt722->slave->dev, "calibration failed!, ret=%d\n", ret); if ((calib_status & 0x0040) == 0x0) break; } + + if (loop_check == chk_cnt) + dev_dbg(&rt722->slave->dev, "%s, calibration time-out!\n", __func__); + /* Set ADC09 power entity floating control */ rt722_sdca_index_write(rt722, RT722_VENDOR_HDA_CTL, RT722_ADC0A_08_PDE_FLOAT_CTL, 0x2a12); From 26fff8a4432ffd03409346b7dae1e1a2c5318b7c Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Tue, 17 Dec 2024 18:02:11 -0800 Subject: [PATCH 0696/1450] block/bdev: use helper for max block size check We already have a helper for checking the limits on the block size both low and high, just use that. No functional changes. Reviewed-by: John Garry Signed-off-by: Luis Chamberlain Reviewed-by: Keith Busch Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241218020212.3657139-2-mcgrof@kernel.org Signed-off-by: Jens Axboe --- block/bdev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index 738e3c8457e7f..9d73a8fbf7f99 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -155,8 +155,7 @@ int set_blocksize(struct file *file, int size) struct inode *inode = file->f_mapping->host; struct block_device *bdev = I_BDEV(inode); - /* Size must be a power of two, and between 512 and PAGE_SIZE */ - if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) + if (blk_validate_block_size(size)) return -EINVAL; /* Size cannot be smaller than the size supported by the device */ From 51588b1b77b65cd0fb3440f78f37bef7178a2715 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Tue, 17 Dec 2024 18:02:12 -0800 Subject: [PATCH 0697/1450] nvme: use blk_validate_block_size() for max LBA check The block layer already has support to validates proper block sizes with blk_validate_block_size(), we can leverage that as well. No functional changes. Signed-off-by: Luis Chamberlain Reviewed-by: John Garry Reviewed-by: Keith Busch Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241218020212.3657139-3-mcgrof@kernel.org Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d169a30eb935e..a970168a3014e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2034,7 +2034,7 @@ static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id, * or smaller than a sector size yet, so catch this early and don't * allow block I/O. */ - if (head->lba_shift > PAGE_SHIFT || head->lba_shift < SECTOR_SHIFT) { + if (blk_validate_block_size(bs)) { bs = (1 << 9); valid = false; } From 224749be6c23efe7fb8a030854f4fc5d1dd813b3 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 18 Dec 2024 18:16:14 +0800 Subject: [PATCH 0698/1450] block: Revert "block: Fix potential deadlock while freezing queue and acquiring sysfs_lock" This reverts commit be26ba96421ab0a8fa2055ccf7db7832a13c44d2. Commit be26ba96421a ("block: Fix potential deadlock while freezing queue and acquiring sysfs_loc") actually reverts commit 22465bbac53c ("blk-mq: move cpuhp callback registering out of q->sysfs_lock"), and causes the original resctrl lockdep warning. So revert it and we need to fix the issue in another way. Cc: Nilay Shroff Fixes: be26ba96421a ("block: Fix potential deadlock while freezing queue and acquiring sysfs_loc") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20241218101617.3275704-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq-sysfs.c | 16 ++++++++++------ block/blk-mq.c | 29 +++++++++++------------------ block/blk-sysfs.c | 4 ++-- 3 files changed, 23 insertions(+), 26 deletions(-) diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index cd5ea6eaa76b0..156e9bb07abf1 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -275,13 +275,15 @@ void blk_mq_sysfs_unregister_hctxs(struct request_queue *q) struct blk_mq_hw_ctx *hctx; unsigned long i; - lockdep_assert_held(&q->sysfs_dir_lock); - + mutex_lock(&q->sysfs_dir_lock); if (!q->mq_sysfs_init_done) - return; + goto unlock; queue_for_each_hw_ctx(q, hctx, i) blk_mq_unregister_hctx(hctx); + +unlock: + mutex_unlock(&q->sysfs_dir_lock); } int blk_mq_sysfs_register_hctxs(struct request_queue *q) @@ -290,10 +292,9 @@ int blk_mq_sysfs_register_hctxs(struct request_queue *q) unsigned long i; int ret = 0; - lockdep_assert_held(&q->sysfs_dir_lock); - + mutex_lock(&q->sysfs_dir_lock); if (!q->mq_sysfs_init_done) - return ret; + goto unlock; queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_register_hctx(hctx); @@ -301,5 +302,8 @@ int blk_mq_sysfs_register_hctxs(struct request_queue *q) break; } +unlock: + mutex_unlock(&q->sysfs_dir_lock); + return ret; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 6b6111513986f..92e8ddf34575d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4453,8 +4453,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, unsigned long i, j; /* protect against switching io scheduler */ - lockdep_assert_held(&q->sysfs_lock); - + mutex_lock(&q->sysfs_lock); for (i = 0; i < set->nr_hw_queues; i++) { int old_node; int node = blk_mq_get_hctx_node(set, i); @@ -4487,6 +4486,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, xa_for_each_start(&q->hctx_table, j, hctx, j) blk_mq_exit_hctx(q, set, hctx, j); + mutex_unlock(&q->sysfs_lock); /* unregister cpuhp callbacks for exited hctxs */ blk_mq_remove_hw_queues_cpuhp(q); @@ -4518,14 +4518,10 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, xa_init(&q->hctx_table); - mutex_lock(&q->sysfs_lock); - blk_mq_realloc_hw_ctxs(set, q); if (!q->nr_hw_queues) goto err_hctxs; - mutex_unlock(&q->sysfs_lock); - INIT_WORK(&q->timeout_work, blk_mq_timeout_work); blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); @@ -4544,7 +4540,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, return 0; err_hctxs: - mutex_unlock(&q->sysfs_lock); blk_mq_release(q); err_exit: q->mq_ops = NULL; @@ -4925,12 +4920,12 @@ static bool blk_mq_elv_switch_none(struct list_head *head, return false; /* q->elevator needs protection from ->sysfs_lock */ - lockdep_assert_held(&q->sysfs_lock); + mutex_lock(&q->sysfs_lock); /* the check has to be done with holding sysfs_lock */ if (!q->elevator) { kfree(qe); - goto out; + goto unlock; } INIT_LIST_HEAD(&qe->node); @@ -4940,7 +4935,9 @@ static bool blk_mq_elv_switch_none(struct list_head *head, __elevator_get(qe->type); list_add(&qe->node, head); elevator_disable(q); -out: +unlock: + mutex_unlock(&q->sysfs_lock); + return true; } @@ -4969,9 +4966,11 @@ static void blk_mq_elv_switch_back(struct list_head *head, list_del(&qe->node); kfree(qe); + mutex_lock(&q->sysfs_lock); elevator_switch(q, t); /* drop the reference acquired in blk_mq_elv_switch_none */ elevator_put(t); + mutex_unlock(&q->sysfs_lock); } static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, @@ -4991,11 +4990,8 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues) return; - list_for_each_entry(q, &set->tag_list, tag_set_list) { - mutex_lock(&q->sysfs_dir_lock); - mutex_lock(&q->sysfs_lock); + list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_freeze_queue(q); - } /* * Switch IO scheduler to 'none', cleaning up the data associated * with the previous scheduler. We will switch back once we are done @@ -5051,11 +5047,8 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_elv_switch_back(&head, q); - list_for_each_entry(q, &set->tag_list, tag_set_list) { + list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_unfreeze_queue(q); - mutex_unlock(&q->sysfs_lock); - mutex_unlock(&q->sysfs_dir_lock); - } /* Free the excess tags when nr_hw_queues shrink. */ for (i = set->nr_hw_queues; i < prev_nr_hw_queues; i++) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 64f70c713d2f9..767598e719ab0 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -706,11 +706,11 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, if (entry->load_module) entry->load_module(disk, page, length); - mutex_lock(&q->sysfs_lock); blk_mq_freeze_queue(q); + mutex_lock(&q->sysfs_lock); res = entry->store(disk, page, length); - blk_mq_unfreeze_queue(q); mutex_unlock(&q->sysfs_lock); + blk_mq_unfreeze_queue(q); return res; } From 85672ca9ceeaa1dcf2777a7048af5f4aee3fd02b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 18 Dec 2024 18:16:15 +0800 Subject: [PATCH 0699/1450] block: avoid to reuse `hctx` not removed from cpuhp callback list If the 'hctx' isn't removed from cpuhp callback list, we can't reuse it, otherwise use-after-free may be triggered. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202412172217.b906db7c-lkp@intel.com Tested-by: kernel test robot Fixes: 22465bbac53c ("blk-mq: move cpuhp callback registering out of q->sysfs_lock") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20241218101617.3275704-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 92e8ddf34575d..8ac19d4ae3c0a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4412,6 +4412,15 @@ struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q, } EXPORT_SYMBOL(blk_mq_alloc_disk_for_queue); +/* + * Only hctx removed from cpuhp list can be reused + */ +static bool blk_mq_hctx_is_reusable(struct blk_mq_hw_ctx *hctx) +{ + return hlist_unhashed(&hctx->cpuhp_online) && + hlist_unhashed(&hctx->cpuhp_dead); +} + static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( struct blk_mq_tag_set *set, struct request_queue *q, int hctx_idx, int node) @@ -4421,7 +4430,7 @@ static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( /* reuse dead hctx first */ spin_lock(&q->unused_hctx_lock); list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) { - if (tmp->numa_node == node) { + if (tmp->numa_node == node && blk_mq_hctx_is_reusable(tmp)) { hctx = tmp; break; } From 7f9a1eed1ad8b274ed9163a02cef891a90427237 Mon Sep 17 00:00:00 2001 From: Jon Lin Date: Wed, 18 Dec 2024 23:47:41 +0800 Subject: [PATCH 0700/1450] spi: rockchip-sfc: Fix error in remove progress Fix error in remove progress: [ 43.026148] Call trace: [ 43.026370] klist_next+0x1c/0x1d4 [ 43.026671] device_for_each_child+0x48/0xac [ 43.027049] spi_unregister_controller+0x30/0x130 [ 43.027469] rockchip_sfc_remove+0x48/0x80 [spi_rockchip_sfc] Signed-off-by: Jon Lin Link: https://patch.msgid.link/20241218154741.901591-1-jon.lin@rock-chips.com Signed-off-by: Mark Brown --- drivers/spi/spi-rockchip-sfc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/spi/spi-rockchip-sfc.c b/drivers/spi/spi-rockchip-sfc.c index 69d0f21755684..70bbb459caa4f 100644 --- a/drivers/spi/spi-rockchip-sfc.c +++ b/drivers/spi/spi-rockchip-sfc.c @@ -182,6 +182,7 @@ struct rockchip_sfc { bool use_dma; u32 max_iosize; u16 version; + struct spi_controller *host; }; static int rockchip_sfc_reset(struct rockchip_sfc *sfc) @@ -574,6 +575,7 @@ static int rockchip_sfc_probe(struct platform_device *pdev) sfc = spi_controller_get_devdata(host); sfc->dev = dev; + sfc->host = host; sfc->regbase = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(sfc->regbase)) @@ -651,8 +653,8 @@ static int rockchip_sfc_probe(struct platform_device *pdev) static void rockchip_sfc_remove(struct platform_device *pdev) { - struct spi_controller *host = platform_get_drvdata(pdev); struct rockchip_sfc *sfc = platform_get_drvdata(pdev); + struct spi_controller *host = sfc->host; spi_unregister_controller(host); From cc0c53f4fac562efb3aca2bc493515e77642ae33 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 19 Jun 2024 14:12:45 -0700 Subject: [PATCH 0701/1450] wifi: iwlwifi: mvm: Fix __counted_by usage in cfg80211_wowlan_nd_* Both struct cfg80211_wowlan_nd_match and struct cfg80211_wowlan_nd_info pre-allocate space for channels and matches, but then may end up using fewer that the full allocation. Shrink the associated counter (n_channels and n_matches) after counting the results. This avoids compile-time (and run-time) warnings from __counted_by. (The counter member needs to be updated _before_ accessing the array index.) Seen with coming GCC 15: drivers/net/wireless/intel/iwlwifi/mvm/d3.c: In function 'iwl_mvm_query_set_freqs': drivers/net/wireless/intel/iwlwifi/mvm/d3.c:2877:66: warning: operation on 'match->n_channels' may be undefined [-Wsequence-point] 2877 | match->channels[match->n_channels++] = | ~~~~~~~~~~~~~~~~~^~ drivers/net/wireless/intel/iwlwifi/mvm/d3.c:2885:66: warning: operation on 'match->n_channels' may be undefined [-Wsequence-point] 2885 | match->channels[match->n_channels++] = | ~~~~~~~~~~~~~~~~~^~ drivers/net/wireless/intel/iwlwifi/mvm/d3.c: In function 'iwl_mvm_query_netdetect_reasons': drivers/net/wireless/intel/iwlwifi/mvm/d3.c:2982:58: warning: operation on 'net_detect->n_matches' may be undefined [-Wsequence-point] 2982 | net_detect->matches[net_detect->n_matches++] = match; | ~~~~~~~~~~~~~~~~~~~~~^~ Cc: stable@vger.kernel.org Fixes: aa4ec06c455d ("wifi: cfg80211: use __counted_by where appropriate") Signed-off-by: Kees Cook Reviewed-by: Gustavo A. R. Silva Link: https://patch.msgid.link/20240619211233.work.355-kees@kernel.org Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/d3.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c index f85c01e04ebf6..7d973546c9fb8 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c @@ -2954,6 +2954,7 @@ static void iwl_mvm_query_set_freqs(struct iwl_mvm *mvm, int idx) { int i; + int n_channels = 0; if (fw_has_api(&mvm->fw->ucode_capa, IWL_UCODE_TLV_API_SCAN_OFFLOAD_CHANS)) { @@ -2962,7 +2963,7 @@ static void iwl_mvm_query_set_freqs(struct iwl_mvm *mvm, for (i = 0; i < SCAN_OFFLOAD_MATCHING_CHANNELS_LEN * 8; i++) if (matches[idx].matching_channels[i / 8] & (BIT(i % 8))) - match->channels[match->n_channels++] = + match->channels[n_channels++] = mvm->nd_channels[i]->center_freq; } else { struct iwl_scan_offload_profile_match_v1 *matches = @@ -2970,9 +2971,11 @@ static void iwl_mvm_query_set_freqs(struct iwl_mvm *mvm, for (i = 0; i < SCAN_OFFLOAD_MATCHING_CHANNELS_LEN_V1 * 8; i++) if (matches[idx].matching_channels[i / 8] & (BIT(i % 8))) - match->channels[match->n_channels++] = + match->channels[n_channels++] = mvm->nd_channels[i]->center_freq; } + /* We may have ended up with fewer channels than we allocated. */ + match->n_channels = n_channels; } /** @@ -3053,6 +3056,8 @@ static void iwl_mvm_query_netdetect_reasons(struct iwl_mvm *mvm, GFP_KERNEL); if (!net_detect || !n_matches) goto out_report_nd; + net_detect->n_matches = n_matches; + n_matches = 0; for_each_set_bit(i, &matched_profiles, mvm->n_nd_match_sets) { struct cfg80211_wowlan_nd_match *match; @@ -3066,8 +3071,9 @@ static void iwl_mvm_query_netdetect_reasons(struct iwl_mvm *mvm, GFP_KERNEL); if (!match) goto out_report_nd; + match->n_channels = n_channels; - net_detect->matches[net_detect->n_matches++] = match; + net_detect->matches[n_matches++] = match; /* We inverted the order of the SSIDs in the scan * request, so invert the index here. @@ -3082,6 +3088,8 @@ static void iwl_mvm_query_netdetect_reasons(struct iwl_mvm *mvm, iwl_mvm_query_set_freqs(mvm, d3_data->nd_results, match, i); } + /* We may have fewer matches than we allocated. */ + net_detect->n_matches = n_matches; out_report_nd: wakeup.net_detect = net_detect; From 349f0086ba8b2a169877d21ff15a4d9da3a60054 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 18 Dec 2024 09:02:28 +0100 Subject: [PATCH 0702/1450] x86/static-call: fix 32-bit build In 32-bit x86 builds CONFIG_STATIC_CALL_INLINE isn't set, leading to static_call_initialized not being available. Define it as "0" in that case. Reported-by: Stephen Rothwell Fixes: 0ef8047b737d ("x86/static-call: provide a way to do very early static-call updates") Signed-off-by: Juergen Gross Acked-by: Peter Zijlstra (Intel) Signed-off-by: Linus Torvalds --- include/linux/static_call.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/include/linux/static_call.h b/include/linux/static_call.h index 785980af89729..78a77a4ae0ea8 100644 --- a/include/linux/static_call.h +++ b/include/linux/static_call.h @@ -138,7 +138,6 @@ #ifdef CONFIG_HAVE_STATIC_CALL #include -extern int static_call_initialized; /* * Either @site or @tramp can be NULL. */ @@ -161,6 +160,8 @@ extern void arch_static_call_transform(void *site, void *tramp, void *func, bool #ifdef CONFIG_HAVE_STATIC_CALL_INLINE +extern int static_call_initialized; + extern int __init static_call_init(void); extern void static_call_force_reinit(void); @@ -226,6 +227,8 @@ extern long __static_call_return0(void); #elif defined(CONFIG_HAVE_STATIC_CALL) +#define static_call_initialized 0 + static inline int static_call_init(void) { return 0; } #define DEFINE_STATIC_CALL(name, _func) \ @@ -282,6 +285,8 @@ extern long __static_call_return0(void); #else /* Generic implementation */ +#define static_call_initialized 0 + static inline int static_call_init(void) { return 0; } static inline long __static_call_return0(void) From 536ae08d7b6ae16872f0b3c2679e656a7fc9d5e2 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 11 Dec 2024 09:56:01 -0600 Subject: [PATCH 0703/1450] drm/amd: Require CONFIG_HOTPLUG_PCI_PCIE for BOCO If the kernel hasn't been compiled with PCIe hotplug support this can lead to problems with dGPUs that use BOCO because they effectively drop off the bus. To prevent issues, disable BOCO support when compiled without PCIe hotplug. Reported-by: Gabriel Marcano Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/1707#note_2696862 Acked-by: Alex Deucher Link: https://lore.kernel.org/r/20241211155601.3585256-1-superm1@kernel.org Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher (cherry picked from commit 1ad5bdc28bafa66db0f041cc6cdd278a80426aae) --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index d272d95dd5b2f..cd4fac1208342 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -417,6 +417,9 @@ bool amdgpu_device_supports_boco(struct drm_device *dev) { struct amdgpu_device *adev = drm_to_adev(dev); + if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE)) + return false; + if (adev->has_pr3 || ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) return true; From a93b1020eb9386d7da11608477121b10079c076a Mon Sep 17 00:00:00 2001 From: Pierre-Eric Pelloux-Prayer Date: Fri, 6 Dec 2024 13:17:45 +0100 Subject: [PATCH 0704/1450] drm/amdgpu: don't access invalid sched MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since 2320c9e6a768 ("drm/sched: memset() 'job' in drm_sched_job_init()") accessing job->base.sched can produce unexpected results as the initialisation of (*job)->base.sched done in amdgpu_job_alloc is overwritten by the memset. This commit fixes an issue when a CS would fail validation and would be rejected after job->num_ibs is incremented. In this case, amdgpu_ib_free(ring->adev, ...) will be called, which would crash the machine because the ring value is bogus. To fix this, pass a NULL pointer to amdgpu_ib_free(): we can do this because the device is actually not used in this function. The next commit will remove the ring argument completely. Fixes: 2320c9e6a768 ("drm/sched: memset() 'job' in drm_sched_job_init()") Signed-off-by: Pierre-Eric Pelloux-Prayer Reviewed-by: Alex Deucher Reviewed-by: Christian König Signed-off-by: Alex Deucher (cherry picked from commit 2ae520cb12831d264ceb97c61f72c59d33c0dbd7) --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index b9d08bc965813..a21c510c408ee 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -255,7 +255,6 @@ void amdgpu_job_set_resources(struct amdgpu_job *job, struct amdgpu_bo *gds, void amdgpu_job_free_resources(struct amdgpu_job *job) { - struct amdgpu_ring *ring = to_amdgpu_ring(job->base.sched); struct dma_fence *f; unsigned i; @@ -268,7 +267,7 @@ void amdgpu_job_free_resources(struct amdgpu_job *job) f = NULL; for (i = 0; i < job->num_ibs; ++i) - amdgpu_ib_free(ring->adev, &job->ibs[i], f); + amdgpu_ib_free(NULL, &job->ibs[i], f); } static void amdgpu_job_free_cb(struct drm_sched_job *s_job) From 146b6057e1fd28fb1a38d300bf76a38dfba7f9fb Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 17 Dec 2024 13:55:48 +0100 Subject: [PATCH 0705/1450] wifi: cw1200: Fix potential NULL dereference A recent refactoring was identified by smatch to cause another potential NULL dereference: drivers/net/wireless/st/cw1200/cw1200_spi.c:440 cw1200_spi_disconnect() error: we previously assumed 'self' could be null (see line 433) Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202411271742.Xa7CNVh1-lkp@intel.com/ Fixes: 2719a9e7156c ("wifi: cw1200: Convert to GPIO descriptors") Signed-off-by: Linus Walleij Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241217-cw1200-fix-v1-1-911e6b5823ec@linaro.org --- drivers/net/wireless/st/cw1200/cw1200_spi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/st/cw1200/cw1200_spi.c b/drivers/net/wireless/st/cw1200/cw1200_spi.c index 862964a8cc876..52386dfb5f4aa 100644 --- a/drivers/net/wireless/st/cw1200/cw1200_spi.c +++ b/drivers/net/wireless/st/cw1200/cw1200_spi.c @@ -442,8 +442,8 @@ static void cw1200_spi_disconnect(struct spi_device *func) cw1200_core_release(self->core); self->core = NULL; } + cw1200_spi_off(self, dev_get_platdata(&func->dev)); } - cw1200_spi_off(self, dev_get_platdata(&func->dev)); } static int __maybe_unused cw1200_spi_suspend(struct device *dev) From 458600da793da12e0f3724ecbea34a80703f4d5b Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 12 Dec 2024 16:47:48 -0500 Subject: [PATCH 0706/1450] drm/amdgpu/nbio7.7: fix IP version check Use the helper function rather than reading it directly. Reviewed-by: Yang Wang Signed-off-by: Alex Deucher (cherry picked from commit 22b9555bc90df22b585bdd1f161b61584b13af51) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c index 1ac730328516f..3fb6d2aa7e3b3 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c @@ -247,7 +247,7 @@ static void nbio_v7_7_init_registers(struct amdgpu_device *adev) if (def != data) WREG32_SOC15(NBIO, 0, regBIF0_PCIE_MST_CTRL_3, data); - switch (adev->ip_versions[NBIO_HWIP][0]) { + switch (amdgpu_ip_version(adev, NBIO_HWIP, 0)) { case IP_VERSION(7, 7, 0): data = RREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF5_STRAP4) & ~BIT(23); WREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF5_STRAP4, data); From 8c1ecc7197a88c6ae62de56e1c0887f220712a32 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 12 Dec 2024 17:00:07 -0500 Subject: [PATCH 0707/1450] drm/amdgpu/nbio7.11: fix IP version check Use the helper function rather than reading it directly. Reviewed-by: Yang Wang Signed-off-by: Alex Deucher (cherry picked from commit 2c8eeaaa0fe5841ccf07a0eb51b1426f34ef39f7) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/nbio_v7_11.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_11.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_11.c index 814ab59fdd4a3..41421da63a084 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_11.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_11.c @@ -275,7 +275,7 @@ static void nbio_v7_11_init_registers(struct amdgpu_device *adev) if (def != data) WREG32_SOC15(NBIO, 0, regBIF_BIF256_CI256_RC3X4_USB4_PCIE_MST_CTRL_3, data); - switch (adev->ip_versions[NBIO_HWIP][0]) { + switch (amdgpu_ip_version(adev, NBIO_HWIP, 0)) { case IP_VERSION(7, 11, 0): case IP_VERSION(7, 11, 1): case IP_VERSION(7, 11, 2): From 6ebc5b92190e01dd48313b68cbf752c9adcfefa8 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 12 Dec 2024 17:03:20 -0500 Subject: [PATCH 0708/1450] drm/amdgpu/mmhub4.1: fix IP version check Use the helper function rather than reading it directly. Reviewed-by: Yang Wang Signed-off-by: Alex Deucher (cherry picked from commit 63bfd24088b42c6f55c2096bfc41b50213d419b2) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c index 0fbc3be81f140..f2ab5001b4924 100644 --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c @@ -108,7 +108,7 @@ mmhub_v4_1_0_print_l2_protection_fault_status(struct amdgpu_device *adev, dev_err(adev->dev, "MMVM_L2_PROTECTION_FAULT_STATUS_LO32:0x%08X\n", status); - switch (adev->ip_versions[MMHUB_HWIP][0]) { + switch (amdgpu_ip_version(adev, MMHUB_HWIP, 0)) { case IP_VERSION(4, 1, 0): mmhub_cid = mmhub_client_ids_v4_1_0[cid][rw]; break; From 41be00f839e9ee7753892a73a36ce4c14c6f5cbf Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 12 Dec 2024 17:04:58 -0500 Subject: [PATCH 0709/1450] drm/amdgpu/gfx12: fix IP version check Use the helper function rather than reading it directly. Reviewed-by: Yang Wang Signed-off-by: Alex Deucher (cherry picked from commit f1fd1d0f40272948aa6ab82a3a82ecbbc76dff53) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c index fe7c48f2fb2a7..da327ab48a572 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c @@ -4123,7 +4123,7 @@ static int gfx_v12_0_set_clockgating_state(void *handle, if (amdgpu_sriov_vf(adev)) return 0; - switch (adev->ip_versions[GC_HWIP][0]) { + switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { case IP_VERSION(12, 0, 0): case IP_VERSION(12, 0, 1): gfx_v12_0_update_gfx_clock_gating(adev, From 9e752ee26c1031312a01d2afc281f5f6fdfca176 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 12 Dec 2024 17:06:26 -0500 Subject: [PATCH 0710/1450] drm/amdgpu/smu14.0.2: fix IP version check Use the helper function rather than reading it directly. Reviewed-by: Yang Wang Signed-off-by: Alex Deucher (cherry picked from commit 8f2cd1067afe68372a1723e05e19b68ed187676a) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c index 6a565ce74d5b3..5cad09c5f2ff2 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c @@ -2096,7 +2096,7 @@ static int smu_v14_0_2_enable_gfx_features(struct smu_context *smu) { struct amdgpu_device *adev = smu->adev; - if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(14, 0, 2)) + if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 2)) return smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_EnableAllSmuFeatures, FEATURE_PWR_GFX, NULL); else From 8d1a13816e59254bd3b18f5ae0895230922bd120 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Thu, 12 Dec 2024 16:29:18 +0100 Subject: [PATCH 0711/1450] drm/amdgpu: fix amdgpu_coredump MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The VM pointer might already be outdated when that function is called. Use the PASID instead to gather the information instead. Signed-off-by: Christian König Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher (cherry picked from commit 57f812d171af4ba233d3ed7c94dfa5b8e92dcc04) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c index 946c48829f197..824f9da5b6cea 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c @@ -343,11 +343,10 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check, coredump->skip_vram_check = skip_vram_check; coredump->reset_vram_lost = vram_lost; - if (job && job->vm) { - struct amdgpu_vm *vm = job->vm; + if (job && job->pasid) { struct amdgpu_task_info *ti; - ti = amdgpu_vm_get_task_info_vm(vm); + ti = amdgpu_vm_get_task_info_pasid(adev, job->pasid); if (ti) { coredump->reset_task_info = *ti; amdgpu_vm_put_task_info(ti); From 85230ee36d88e7a09fb062d43203035659dd10a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michel=20D=C3=A4nzer?= Date: Tue, 17 Dec 2024 18:22:56 +0100 Subject: [PATCH 0712/1450] drm/amdgpu: Handle NULL bo->tbo.resource (again) in amdgpu_vm_bo_update MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Third time's the charm, I hope? Fixes: d3116756a710 ("drm/ttm: rename bo->mem and make it a pointer") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3837 Reviewed-by: Christian König Signed-off-by: Michel Dänzer Signed-off-by: Alex Deucher (cherry picked from commit 695c2c745e5dff201b75da8a1d237ce403600d04) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index ddd7f05e4db9f..c9c48b782ec1b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -1266,10 +1266,9 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va, * next command submission. */ if (amdgpu_vm_is_bo_always_valid(vm, bo)) { - uint32_t mem_type = bo->tbo.resource->mem_type; - - if (!(bo->preferred_domains & - amdgpu_mem_type_to_domain(mem_type))) + if (bo->tbo.resource && + !(bo->preferred_domains & + amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type))) amdgpu_vm_bo_evicted(&bo_va->base); else amdgpu_vm_bo_idle(&bo_va->base); From aeda9245c7ce6afbf0bf1be164ecef7552384c29 Mon Sep 17 00:00:00 2001 From: Alex Shumsky Date: Fri, 13 Dec 2024 11:14:02 +0300 Subject: [PATCH 0713/1450] wifi: brcmfmac: clarify unmodifiable headroom log message Replace misleading log "insufficient headroom (0)" with more clear "unmodifiable headroom". Signed-off-by: Alex Shumsky Acked-by: Arend van Spriel Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241213081402.625003-1-alexthreed@gmail.com --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.c index da72fd2d541ff..c3a57e30c855a 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.c @@ -327,8 +327,8 @@ static netdev_tx_t brcmf_netdev_start_xmit(struct sk_buff *skb, if (skb_headroom(skb) < drvr->hdrlen || skb_header_cloned(skb)) { head_delta = max_t(int, drvr->hdrlen - skb_headroom(skb), 0); - brcmf_dbg(INFO, "%s: insufficient headroom (%d)\n", - brcmf_ifname(ifp), head_delta); + brcmf_dbg(INFO, "%s: %s headroom\n", brcmf_ifname(ifp), + head_delta ? "insufficient" : "unmodifiable"); atomic_inc(&drvr->bus_if->stats.pktcowed); ret = pskb_expand_head(skb, ALIGN(head_delta, NET_SKB_PAD), 0, GFP_ATOMIC); From 8ab3bf4764136e8ad8d1064c304be50297bcf9ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 16 Dec 2024 12:30:10 +0100 Subject: [PATCH 0714/1450] wifi: wlcore: sysfs: constify 'struct bin_attribute' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sysfs core now allows instances of 'struct bin_attribute' to be moved into read-only memory. Make use of that to protect them against accidental or malicious modifications. Signed-off-by: Thomas Weißschuh Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20241216-sysfs-const-bin_attr-net-v1-3-ec460b91f274@weissschuh.net --- drivers/net/wireless/ti/wlcore/sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/ti/wlcore/sysfs.c b/drivers/net/wireless/ti/wlcore/sysfs.c index c07acfcbbd9c8..7c57d4c8744ad 100644 --- a/drivers/net/wireless/ti/wlcore/sysfs.c +++ b/drivers/net/wireless/ti/wlcore/sysfs.c @@ -88,7 +88,7 @@ static ssize_t hw_pg_ver_show(struct device *dev, static DEVICE_ATTR_RO(hw_pg_ver); static ssize_t wl1271_sysfs_read_fwlog(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, + const struct bin_attribute *bin_attr, char *buffer, loff_t pos, size_t count) { struct device *dev = kobj_to_dev(kobj); @@ -121,7 +121,7 @@ static ssize_t wl1271_sysfs_read_fwlog(struct file *filp, struct kobject *kobj, static const struct bin_attribute fwlog_attr = { .attr = { .name = "fwlog", .mode = 0400 }, - .read = wl1271_sysfs_read_fwlog, + .read_new = wl1271_sysfs_read_fwlog, }; int wlcore_sysfs_init(struct wl1271 *wl) From 88395c071f08d9ea2314045230206cc5a3f82ef0 Mon Sep 17 00:00:00 2001 From: Soham Chakradeo Date: Tue, 17 Dec 2024 18:51:58 +0000 Subject: [PATCH 0715/1450] selftests/net: packetdrill: import tcp/ecn, tcp/close, tcp/sack, tcp/tcp_info Same as initial tests, import verbatim from github.com/google/packetdrill, aside from: - update `source ./defaults.sh` path to adjust for flat dir - add SPDX headers - remove author statements if any - drop blank lines at EOF Same test process as previous tests. Both with and without debug mode. Recording the steps once: make mrproper vng --build \ --config tools/testing/selftests/net/packetdrill/config \ --config kernel/configs/debug.config vng -v --run . --user root --cpus 4 -- \ make -C tools/testing/selftests TARGETS=net/packetdrill run_tests Signed-off-by: Willem de Bruijn Signed-off-by: Soham Chakradeo Link: https://patch.msgid.link/20241217185203.297935-2-sohamch.kernel@gmail.com Signed-off-by: Jakub Kicinski --- ...lose_close-local-close-then-remote-fin.pkt | 23 +++++++ .../tcp_close_close-on-syn-sent.pkt | 21 ++++++ .../tcp_close_close-remote-fin-then-close.pkt | 36 ++++++++++ .../net/packetdrill/tcp_ecn_ecn-uses-ect0.pkt | 21 ++++++ .../tcp_sack_sack-route-refresh-ip-tos.pkt | 37 +++++++++++ ...ack_sack-shift-sacked-2-6-8-3-9-nofack.pkt | 64 ++++++++++++++++++ ..._sack_sack-shift-sacked-7-3-4-8-9-fack.pkt | 66 +++++++++++++++++++ ..._sack_sack-shift-sacked-7-5-6-8-9-fack.pkt | 62 +++++++++++++++++ .../tcp_tcp_info_tcp-info-last_data_recv.pkt | 20 ++++++ .../tcp_tcp_info_tcp-info-rwnd-limited.pkt | 54 +++++++++++++++ .../tcp_tcp_info_tcp-info-sndbuf-limited.pkt | 38 +++++++++++ 11 files changed, 442 insertions(+) create mode 100644 tools/testing/selftests/net/packetdrill/tcp_close_close-local-close-then-remote-fin.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_close_close-on-syn-sent.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_close_close-remote-fin-then-close.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_ecn_ecn-uses-ect0.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_sack_sack-route-refresh-ip-tos.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-2-6-8-3-9-nofack.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-7-3-4-8-9-fack.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-7-5-6-8-9-fack.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-last_data_recv.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-rwnd-limited.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-sndbuf-limited.pkt diff --git a/tools/testing/selftests/net/packetdrill/tcp_close_close-local-close-then-remote-fin.pkt b/tools/testing/selftests/net/packetdrill/tcp_close_close-local-close-then-remote-fin.pkt new file mode 100644 index 0000000000000..8514d6bdbb6de --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_close_close-local-close-then-remote-fin.pkt @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test basic connection teardown where local process closes first: +// the local process calls close() first, so we send a FIN, and receive an ACK. +// Then we receive a FIN and ACK it. + +`./defaults.sh` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +.01...0.011 connect(3, ..., ...) = 0 + +0 > S 0:0(0) <...> + +0 < S. 0:0(0) ack 1 win 32768 + +0 > . 1:1(0) ack 1 + + +0 write(3, ..., 1000) = 1000 + +0 > P. 1:1001(1000) ack 1 + +0 < . 1:1(0) ack 1001 win 257 + + +0 close(3) = 0 + +0 > F. 1001:1001(0) ack 1 + +0 < . 1:1(0) ack 1002 win 257 + + +0 < F. 1:1(0) ack 1002 win 257 + +0 > . 1002:1002(0) ack 2 diff --git a/tools/testing/selftests/net/packetdrill/tcp_close_close-on-syn-sent.pkt b/tools/testing/selftests/net/packetdrill/tcp_close_close-on-syn-sent.pkt new file mode 100644 index 0000000000000..04103134bd99c --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_close_close-on-syn-sent.pkt @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test to make sure no RST is being sent when close() +// is called on a socket with SYN_SENT state. + +`./defaults.sh` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + + +0 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) + +0 > S 0:0(0) <...> + +// Application decideds to close the socket in SYN_SENT state +// Make sure no RST is sent after close(). + +0 close(3) = 0 + +// Receive syn-ack to trigger the send side packet examination: +// If a RESET were sent right after close(), it would have failed with +// a mismatched timestamp. + +.1 < S. 0:0(0) ack 1 win 32000 + +0 > R 1:1(0) diff --git a/tools/testing/selftests/net/packetdrill/tcp_close_close-remote-fin-then-close.pkt b/tools/testing/selftests/net/packetdrill/tcp_close_close-remote-fin-then-close.pkt new file mode 100644 index 0000000000000..5f3a2914213a1 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_close_close-remote-fin-then-close.pkt @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 +// Verify behavior for the sequence: remote side sends FIN, then we close(). +// Since the remote side (client) closes first, we test our LAST_ACK code path. + +`./defaults.sh` + +// Initialize a server socket. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +0 < . 1:1(0) ack 1 win 257 + + +0 accept(3, ..., ...) = 4 + +// Client closes first. + +.01 < F. 1:1(0) ack 1 win 257 + +0 > . 1:1(0) ack 2 + +// App notices that client closed. + +0 read(4, ..., 1000) = 0 + +// Then we close. + +.01 close(4) = 0 + +0 > F. 1:1(0) ack 2 + +// Client ACKs our FIN. + +.01 < . 2:2(0) ack 2 win 257 + +// Verify that we send RST in response to any incoming segments +// (because the kernel no longer has any record of this socket). + +.01 < . 2:2(0) ack 2 win 257 + +0 > R 2:2(0) diff --git a/tools/testing/selftests/net/packetdrill/tcp_ecn_ecn-uses-ect0.pkt b/tools/testing/selftests/net/packetdrill/tcp_ecn_ecn-uses-ect0.pkt new file mode 100644 index 0000000000000..643baf3267cf9 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_ecn_ecn-uses-ect0.pkt @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test ECN: verify that Linux TCP ECN sending code uses ECT0 (not ECT1). +// +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=1 # fully enabled +` + +// Initialize connection + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 + +// ECN handshake: send EW flags in SYN packet, E flag in SYN-ACK response ++.002 ... 0.004 connect(4, ..., ...) = 0 + + +0 > SEW 0:0(0) ++.002 < SE. 0:0(0) ack 1 win 32767 + +0 > . 1:1(0) ack 1 + +// Write 1 MSS. ++.002 write(4, ..., 1000) = 1000 +// Send 1 MSS with ect0. + +0 > [ect0] P. 1:1001(1000) ack 1 diff --git a/tools/testing/selftests/net/packetdrill/tcp_sack_sack-route-refresh-ip-tos.pkt b/tools/testing/selftests/net/packetdrill/tcp_sack_sack-route-refresh-ip-tos.pkt new file mode 100644 index 0000000000000..310ef31518da6 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_sack_sack-route-refresh-ip-tos.pkt @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 +// Verify that setsockopt calls that force a route refresh do not +// cause problems matching SACKs with packets in the write queue. +// This variant tests IP_TOS. + +`./defaults.sh` + +// Establish a connection. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_IP, IP_MTU_DISCOVER, [IP_PMTUDISC_DONT], 1) = 0 + +0...0.010 connect(3, ..., ...) = 0 + + +0 > S 0:0(0) + +.01 < S. 0:0(0) ack 1 win 65535 + +0 > . 1:1(0) ack 1 + + +.01 write(3, ..., 5840) = 5840 + +0 > P. 1:5841(5840) ack 1 + +.01 < . 1:1(0) ack 5841 win 65535 + + +.01 write(3, ..., 5840) = 5840 + +0 > P. 5841:11681(5840) ack 1 + +.01 < . 1:1(0) ack 11681 win 65535 + + +.01 write(3, ..., 14600) = 14600 + +0 > P. 11681:26281(14600) ack 1 + +// Try the socket option that we know can force a route refresh. + +0 setsockopt(3, SOL_IP, IP_TOS, [4], 1) = 0 +// Then revert to avoid routing/mangling/etc implications of that setting. + +0 setsockopt(3, SOL_IP, IP_TOS, [0], 1) = 0 + +// Verify that we do not retransmit the SACKed segments. + +.01 < . 1:1(0) ack 13141 win 65535 + +0 > . 13141:16061(2920) ack 1 + +0 > P. 17521:20441(2920) ack 1 + +.01 < . 1:1(0) ack 26281 win 65535 diff --git a/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-2-6-8-3-9-nofack.pkt b/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-2-6-8-3-9-nofack.pkt new file mode 100644 index 0000000000000..f185e1ac57ea6 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-2-6-8-3-9-nofack.pkt @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test shifting of newly-SACKed ranges onto the previous already-SACKed skb. +// This variant tests non-FACK SACK with SACKs coming in the order +// 2 6 8 3 9, to test what happens when we get a new SACKed range +// (for packet 3) that is on the right of an existing SACKed range +// (for packet 2). + +`./defaults.sh` + +// Establish a connection and send 10 MSS. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +.1 < . 1:1(0) ack 1 win 1024 + +0 accept(3, ..., ...) = 4 + + +0 write(4, ..., 10000) = 10000 + +0 > P. 1:10001(10000) ack 1 + + +.1 < . 1:1(0) ack 1 win 257 ++.001 < . 1:1(0) ack 1 win 257 ++.001 < . 1:1(0) ack 1 win 257 + +// 3 SACKed packets, so we enter Fast Recovery. + +0 > . 1:1001(1000) ack 1 + +0 %{ assert tcpi_ca_state == TCP_CA_Recovery, tcpi_ca_state }% + +0 %{ assert tcpi_lost == 6, tcpi_lost }% + +// SACK for 3001:4001. +// This SACK for an adjacent range causes the sender to +// shift the newly-SACKed range onto the previous skb. ++.007 < . 1:1(0) ack 1 win 257 + +0 > . 1001:2001(1000) ack 1 + +0 %{ assert tcpi_lost == 5, tcpi_lost }% + +0 %{ assert tcpi_reordering == 6, tcpi_reordering }% // 8001:9001 -> 3001:4001 is 6 + +// SACK for 9001:10001. + +.01 < . 1:1(0) ack 1 win 257 + +0 %{ assert tcpi_lost == 5, tcpi_lost }% + +// ACK for 1:1001 as packets from t=0.303 arrive. ++.083 < . 1:1(0) ack 1001 win 257 + +0 %{ assert tcpi_lost == 4,tcpi_lost }% + +// ACK for 1:4001 as packets from t=0.310 arrive. ++.017 < . 1:1(0) ack 4001 win 257 + +0 %{ assert tcpi_lost == 3,tcpi_lost }% + +// ACK for 1:7001 as packets from t=0.320 arrive. + +.01 < . 1:1(0) ack 7001 win 257 + +// ACK for all data as packets from t=0.403 arrive. + +.1 < . 1:1(0) ack 10001 win 257 + +0 %{ +assert tcpi_ca_state == TCP_CA_Open, tcpi_ca_state +assert tcpi_unacked == 0, tcpi_unacked +assert tcpi_sacked == 0, tcpi_sacked +assert tcpi_lost == 0, tcpi_lost +assert tcpi_retrans == 0, tcpi_retrans +}% diff --git a/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-7-3-4-8-9-fack.pkt b/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-7-3-4-8-9-fack.pkt new file mode 100644 index 0000000000000..0093b4973934e --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-7-3-4-8-9-fack.pkt @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test shifting of newly-SACKed ranges onto the previous already-SACKed skb. +// This variant tests the case where we mark packets 0-4 lost, then +// get a SACK for 3, and then a SACK for 4. + +`./defaults.sh` + +// Establish a connection and send 10 MSS. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +.1 < . 1:1(0) ack 1 win 1024 + +0 accept(3, ..., ...) = 4 + + +0 write(4, ..., 10000) = 10000 + +0 > P. 1:10001(10000) ack 1 + +// SACK for 7001:8001. Using RACK we delay the fast retransmit. + +.1 < . 1:1(0) ack 1 win 257 +// RACK reordering timer ++.027 > . 1:1001(1000) ack 1 + +0 %{ +assert tcpi_ca_state == TCP_CA_Recovery, tcpi_ca_state +assert tcpi_lost == 7, tcpi_lost # RACK thinks 1:7001 are lost +assert tcpi_reordering == 3, tcpi_reordering +}% + +// SACK for 3001:4001. ++.002 < . 1:1(0) ack 1 win 257 + +0 > . 1001:2001(1000) ack 1 + +0 %{ +assert tcpi_lost == 6, tcpi_lost # since 3001:4001 is no longer lost +assert tcpi_reordering == 5, tcpi_reordering # 7001:8001 -> 3001:4001 +}% + +// SACK for 4001:5001. +// This SACK for an adjacent range causes the sender to +// shift the newly-SACKed range onto the previous skb. +// It uses the RFC3517 algorithm to mark 1:3001 lost +// because >=3 higher-sequence packets are SACKed. ++.002 < . 1:1(0) ack 1 win 257 + +0 > . 2001:3001(1000) ack 1 + +0 %{ +assert tcpi_lost == 5,tcpi_lost # SACK/RFC3517 thinks 1:3001 are lost +}% + +// SACK for 8001:9001. ++.002 < . 1:1(0) ack 1 win 257 + +// SACK for 9001:10001. ++.002 < . 1:1(0) ack 1 win 257 + +0 > . 5001:6001(1000) ack 1 + +// To simplify clean-up, say we get an ACK for all data. + +.1 < . 1:1(0) ack 10001 win 257 + +0 %{ +assert tcpi_ca_state == TCP_CA_Open, tcpi_ca_state +assert tcpi_unacked == 0, tcpi_unacked +assert tcpi_sacked == 0, tcpi_sacked +assert tcpi_lost == 0, tcpi_lost +assert tcpi_retrans == 0, tcpi_retrans +}% diff --git a/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-7-5-6-8-9-fack.pkt b/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-7-5-6-8-9-fack.pkt new file mode 100644 index 0000000000000..980a832dc81ca --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_sack_sack-shift-sacked-7-5-6-8-9-fack.pkt @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test shifting of newly-SACKed ranges onto the previous already-SACKed skb. +// This variant tests the case where we mark packets 0-4 lost, then +// get a SACK for 5, and then a SACK for 6. + +`./defaults.sh` + +// Establish a connection and send 10 MSS. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +.1 < . 1:1(0) ack 1 win 1024 + +0 accept(3, ..., ...) = 4 + + +0 write(4, ..., 10000) = 10000 + +0 > P. 1:10001(10000) ack 1 + +// SACK for 7001:8001. Using RACK we delay a fast retransmit. + +.1 < . 1:1(0) ack 1 win 257 ++.027 > . 1:1001(1000) ack 1 + +0 %{ +assert tcpi_ca_state == TCP_CA_Recovery, tcpi_ca_state +assert tcpi_lost == 7,tcpi_lost # RACK thinks 1:7001 are lost +assert tcpi_reordering == 3, tcpi_reordering +}% + +// SACK for 5001:6001. + +0 < . 1:1(0) ack 1 win 257 + +0 > . 1001:2001(1000) ack 1 + +0 %{ +assert tcpi_lost == 6, tcpi_lost +assert tcpi_reordering == 3, tcpi_reordering # 7001:8001 -> 5001:6001 is 3 +}% + +// SACK for 6001:7001. +// This SACK for an adjacent range causes the sender to +// shift the newly-SACKed range onto the previous skb. + +0 < . 1:1(0) ack 1 win 257 + +0 > . 2001:3001(1000) ack 1 + +0 %{ assert tcpi_lost == 5, tcpi_lost }% + +// SACK for 8001:9001. + +0 < . 1:1(0) ack 1 win 257 + +0 > . 3001:4001(1000) ack 1 + +// SACK for 9001:10001. + +0 < . 1:1(0) ack 1 win 257 + +0 > . 4001:5001(1000) ack 1 + +// To simplify clean-up, say we get an ACK for all data. + +.1 < . 1:1(0) ack 10001 win 257 + +0 %{ +assert tcpi_ca_state == TCP_CA_Open, tcpi_ca_state +assert tcpi_unacked == 0, tcpi_unacked +assert tcpi_sacked == 0, tcpi_sacked +assert tcpi_lost == 0, tcpi_lost +assert tcpi_retrans == 0, tcpi_retrans +}% diff --git a/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-last_data_recv.pkt b/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-last_data_recv.pkt new file mode 100644 index 0000000000000..d7fdb43a8e89a --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-last_data_recv.pkt @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test tcpi_last_data_recv for active session +`./defaults.sh` + +// Create a socket and set it to non-blocking. ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 fcntl(3, F_GETFL) = 0x2 (flags O_RDWR) ++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + ++0 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) ++0 > S 0:0(0) ++.030 < S. 0:0(0) ack 1 win 10000 ++0 > . 1:1(0) ack 1 + ++1 %{ assert 990 <= tcpi_last_data_recv <= 1010, tcpi_last_data_recv }% + ++0 < . 1:1001(1000) ack 1 win 300 ++0 > . 1:1(0) ack 1001 + ++0 %{ assert tcpi_last_data_recv <= 10, tcpi_last_data_recv }% diff --git a/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-rwnd-limited.pkt b/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-rwnd-limited.pkt new file mode 100644 index 0000000000000..a9bcd46f6cb69 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-rwnd-limited.pkt @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test rwnd limited time in tcp_info for client side. + +`./defaults.sh` + +// Create a socket and set it to non-blocking. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 fcntl(3, F_GETFL) = 0x2 (flags O_RDWR) + +0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + + +0 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) + +0 > S 0:0(0) + +// Server advertises 0 receive window. + +.01 < S. 0:0(0) ack 1 win 0 + + +0 > . 1:1(0) ack 1 + +0 getsockopt(3, SOL_SOCKET, SO_ERROR, [0], [4]) = 0 + +0 fcntl(3, F_SETFL, O_RDWR) = 0 // set back to blocking + +// Make sure that initial rwnd limited time is 0. + +0 %{ assert tcpi_rwnd_limited == 0, tcpi_rwnd_limited }% + +// Receive window limited time starts here. + +0 write(3, ..., 1000) = 1000 + +// Check that rwnd limited time in tcp_info is around 0.1s. + +.1 %{ assert 98000 <= tcpi_rwnd_limited <= 110000, tcpi_rwnd_limited }% + +// Server opens the receive window. + +.1 < . 1:1(0) ack 1 win 2000 + +// Check that rwnd limited time in tcp_info is around 0.2s. + +0 %{ assert 198000 <= tcpi_rwnd_limited <= 210000, tcpi_rwnd_limited }% + + +0 > P. 1:1001(1000) ack 1 + +// Server advertises a very small receive window. + +.03 < . 1:1(0) ack 1001 win 10 + +// Receive window limited time starts again. + +0 write(3, ..., 1000) = 1000 + +// Server opens the receive window again. + +.1 < . 1:1(0) ack 1001 win 2000 +// Check that rwnd limited time in tcp_info is around 0.3s +// and busy time is 0.3 + 0.03 (server opened small window temporarily). + +0 %{ assert 298000 <= tcpi_rwnd_limited <= 310000, tcpi_rwnd_limited;\ + assert 328000 <= tcpi_busy_time <= 340000, tcpi_busy_time;\ +}% + + +0 > P. 1001:2001(1000) ack 1 + +.02 < . 1:1(0) ack 2001 win 2000 + +0 %{ assert 348000 <= tcpi_busy_time <= 360000, tcpi_busy_time }% diff --git a/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-sndbuf-limited.pkt b/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-sndbuf-limited.pkt new file mode 100644 index 0000000000000..f0de2acd0f8e3 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_tcp_info_tcp-info-sndbuf-limited.pkt @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test send-buffer-limited time in tcp_info for client side. +`./defaults.sh` + +// Create a socket and set it to non-blocking. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 fcntl(3, F_GETFL) = 0x2 (flags O_RDWR) + +0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + + +0 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) + +0 > S 0:0(0) + +.01 < S. 0:0(0) ack 1 win 10000 + +0 > . 1:1(0) ack 1 + +0 getsockopt(3, SOL_SOCKET, SO_ERROR, [0], [4]) = 0 + +0 fcntl(3, F_SETFL, O_RDWR) = 0 // set back to blocking + +0 setsockopt(3, SOL_SOCKET, SO_SNDBUF, [10000], 4) = 0 + +0 getsockopt(3, SOL_SOCKET, SO_SNDBUF, [20000], [4]) = 0 + + +.09...0.14 write(3, ..., 150000) = 150000 + + +.01 < . 1:1(0) ack 10001 win 10000 + + +.01 < . 1:1(0) ack 30001 win 10000 + +// cwnd goes from 40(60KB) to 80(120KB), and that we hit the tiny sndbuf limit 10KB + +.01 < . 1:1(0) ack 70001 win 10000 + + +.02 < . 1:1(0) ack 95001 win 10000 + +0 %{ assert 19000 <= tcpi_sndbuf_limited <= 21000, tcpi_sndbuf_limited; \ + assert 49000 <= tcpi_busy_time <= 52000, tcpi_busy_time; \ + assert 0 == tcpi_rwnd_limited, tcpi_rwnd_limited }% + +// This ack frees up enough buffer so we are no longer +// buffer limited (socket flag SOCK_NOSPACE is cleared) + +.02 < . 1:1(0) ack 150001 win 10000 + +0 %{ assert 19000 <= tcpi_sndbuf_limited <= 21000, tcpi_sndbuf_limited;\ + assert 69000 <= tcpi_busy_time <= 73000, tcpi_busy_time;\ + assert 0 == tcpi_rwnd_limited, tcpi_rwnd_limited }% From eab35989cc37e168550b7bfa690905ea2d1ae603 Mon Sep 17 00:00:00 2001 From: Soham Chakradeo Date: Tue, 17 Dec 2024 18:51:59 +0000 Subject: [PATCH 0716/1450] selftests/net: packetdrill: import tcp/fast_recovery, tcp/nagle, tcp/timestamping Use the standard import and testing method, as described in the import of tcp/ecn , tcp/close , tcp/sack , tcp/tcp_info. Signed-off-by: Willem de Bruijn Signed-off-by: Soham Chakradeo Link: https://patch.msgid.link/20241217185203.297935-3-sohamch.kernel@gmail.com Signed-off-by: Jakub Kicinski --- .../tcp_fast_recovery_prr-ss-10pkt-lost-1.pkt | 72 +++++++++ ...t_recovery_prr-ss-30pkt-lost-1_4-11_16.pkt | 50 ++++++ ...tcp_fast_recovery_prr-ss-30pkt-lost1_4.pkt | 43 ++++++ ...ecovery_prr-ss-ack-below-snd_una-cubic.pkt | 41 +++++ .../packetdrill/tcp_nagle_https_client.pkt | 40 +++++ .../tcp_nagle_sendmsg_msg_more.pkt | 66 ++++++++ .../tcp_nagle_sockopt_cork_nodelay.pkt | 43 ++++++ ...tcp_timestamping_client-only-last-byte.pkt | 92 +++++++++++ .../packetdrill/tcp_timestamping_partial.pkt | 91 +++++++++++ .../packetdrill/tcp_timestamping_server.pkt | 145 ++++++++++++++++++ 10 files changed, 683 insertions(+) create mode 100644 tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-10pkt-lost-1.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-30pkt-lost-1_4-11_16.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-30pkt-lost1_4.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-ack-below-snd_una-cubic.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_nagle_https_client.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_nagle_sendmsg_msg_more.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_nagle_sockopt_cork_nodelay.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_timestamping_client-only-last-byte.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_timestamping_partial.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_timestamping_server.pkt diff --git a/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-10pkt-lost-1.pkt b/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-10pkt-lost-1.pkt new file mode 100644 index 0000000000000..0d3c8077e830a --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-10pkt-lost-1.pkt @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test PRR-slowstart implementation. +// In this variant we test a simple case where in-flight == ssthresh +// all the way through recovery, so during fast recovery we send one segment +// for each segment SACKed/ACKed. + +// Set up config. +`./defaults.sh` + +// Establish a connection. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +.1 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 +// RTT 100ms + +.1 < . 1:1(0) ack 1 win 320 + +0 accept(3, ..., ...) = 4 + +// Send 10 data segments. + +0 write(4, ..., 10000) = 10000 + +0 > P. 1:10001(10000) ack 1 + +// Lost packet 1:1001. + +.11 < . 1:1(0) ack 1 win 320 + +.01 < . 1:1(0) ack 1 win 320 + +.01 < . 1:1(0) ack 1 win 320 +// Enter fast recovery. + +0 > . 1:1001(1000) ack 1 + +.01 %{ +assert tcpi_ca_state == TCP_CA_Recovery, tcpi_ca_state +assert tcpi_snd_cwnd == 7, tcpi_snd_cwnd +assert tcpi_snd_ssthresh == 7, tcpi_snd_ssthresh +}% + +// Write some more, which we will send 1 MSS at a time, +// as in-flight segments are SACKed or ACKed. + +.01 write(4, ..., 7000) = 7000 + + +.01 < . 1:1(0) ack 1 win 320 + +0 > . 10001:11001(1000) ack 1 + + +.01 < . 1:1(0) ack 1 win 320 + +0 > . 11001:12001(1000) ack 1 + + +.01 < . 1:1(0) ack 1 win 320 + +0 > . 12001:13001(1000) ack 1 + + +.01 < . 1:1(0) ack 1 win 320 + +0 > . 13001:14001(1000) ack 1 + + +.01 < . 1:1(0) ack 1 win 320 + +0 > . 14001:15001(1000) ack 1 + + +.01 < . 1:1(0) ack 1 win 320 + +0 > . 15001:16001(1000) ack 1 + + +.02 < . 1:1(0) ack 10001 win 320 + +0 > P. 16001:17001(1000) ack 1 +// Leave fast recovery. + +.01 %{ +assert tcpi_ca_state == TCP_CA_Open, tcpi_ca_state +assert tcpi_snd_cwnd == 7, tcpi_snd_cwnd +assert tcpi_snd_ssthresh == 7, tcpi_snd_ssthresh +}% + + +.03 < . 1:1(0) ack 12001 win 320 + +.02 < . 1:1(0) ack 14001 win 320 + +.02 < . 1:1(0) ack 16001 win 320 + +.02 < . 1:1(0) ack 17001 win 320 diff --git a/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-30pkt-lost-1_4-11_16.pkt b/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-30pkt-lost-1_4-11_16.pkt new file mode 100644 index 0000000000000..7842a10b69672 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-30pkt-lost-1_4-11_16.pkt @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test PRR-slowstart implementation. The sender sends 20 packets. Packet +// 1 to 4, and 11 to 16 are dropped. +`./defaults.sh` + +// Establish a connection. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + + +.01 < . 1:1(0) ack 1 win 320 + +0 accept(3, ..., ...) = 4 + +// Write 20 data segments. + +0 write(4, ..., 20000) = 20000 + +0 > P. 1:10001(10000) ack 1 + +// Receive first DUPACK, entering PRR part + +.01 < . 1:1(0) ack 1 win 320 + +0 > . 10001:11001(1000) ack 1 ++.002 < . 1:1(0) ack 1 win 320 + +0 > . 11001:12001(1000) ack 1 ++.002 < . 1:1(0) ack 1 win 320 + +0 > . 1:1001(1000) ack 1 ++.002 < . 1:1(0) ack 1 win 320 + +0 > . 1001:2001(1000) ack 1 ++.002 < . 1:1(0) ack 1 win 320 + +0 > . 2001:3001(1000) ack 1 ++.002 < . 1:1(0) ack 1 win 320 + +0 > . 3001:4001(1000) ack 1 +// Enter PRR CRB ++.002 < . 1:1(0) ack 1 win 320 + +0 > . 12001:13001(1000) ack 1 ++.002 < . 1:1(0) ack 1 win 320 + +0 > . 13001:14001(1000) ack 1 +// Enter PRR slow start + +.01 < . 1:1(0) ack 1001 win 320 + +0 > P. 14001:16001(2000) ack 1 ++.002 < . 1:1(0) ack 1001 win 320 + +0 > . 1001:2001(1000) ack 1 + +0 > . 16001:17001(1000) ack 1 +// inflight reaches ssthresh, goes into packet conservation mode ++.002 < . 1:1(0) ack 1001 win 320 + +0 > . 17001:18001(1000) ack 1 ++.002 < . 1:1(0) ack 1001 win 320 + +0 > . 18001:19001(1000) ack 1 diff --git a/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-30pkt-lost1_4.pkt b/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-30pkt-lost1_4.pkt new file mode 100644 index 0000000000000..b66d7644c3b67 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-30pkt-lost1_4.pkt @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test PRR-slowstart implementation. The sender sends 20 packets. Packet +// 1 to 4 are lost. The sender writes another 10 packets. +`./defaults.sh` + +// Establish a connection. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + + +.01 < . 1:1(0) ack 1 win 320 + +0 accept(3, ..., ...) = 4 + +// Send 20 data segments. + +0 write(4, ..., 10000) = 10000 + +0 > P. 1:10001(10000) ack 1 + +// Lost packet 1,2,3,4 + +.01 < . 1:1(0) ack 1 win 320 ++.002 < . 1:1(0) ack 1 win 320 + +0 < . 1:1(0) ack 1 win 320 + +0 > . 1:1001(1000) ack 1 + +0 < . 1:1(0) ack 1 win 320 + +0 > . 1001:2001(1000) ack 1 + +0 < . 1:1(0) ack 1 win 320 + +0 > . 2001:3001(1000) ack 1 + +0 < . 1:1(0) ack 1 win 320 + +0 > . 3001:4001(1000) ack 1 + +// Receiver ACKs all data. + +.01 < . 1:1(0) ack 1001 win 320 + +0 < . 1:1(0) ack 2001 win 320 + +0 < . 1:1(0) ack 3001 win 320 + +0 < . 1:1(0) ack 10001 win 320 + +// Writes another 10 packets, which the ssthresh*mss amount +// should be sent right away + +.01 write(4, ..., 10000) = 10000 + +0 > . 10001:17001(7000) ack 1 diff --git a/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-ack-below-snd_una-cubic.pkt b/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-ack-below-snd_una-cubic.pkt new file mode 100644 index 0000000000000..8e87bfecabb57 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_fast_recovery_prr-ss-ack-below-snd_una-cubic.pkt @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test PRR-slowstart implementation. +// In this variant we verify that the sender uses SACK info on an ACK +// below snd_una. + +// Set up config. +`./defaults.sh` + +// Establish a connection. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 +// RTT 10ms + +.01 < . 1:1(0) ack 1 win 320 + +0 accept(3, ..., ...) = 4 + +// Send 10 data segments. + +0 write(4, ..., 10000) = 10000 + +0 > P. 1:10001(10000) ack 1 + +// Lost packet 1:1001,4001:5001,7001:8001. + +.01 < . 1:1(0) ack 1 win 320 + +0 < . 1:1(0) ack 1 win 320 + +0 < . 1:1(0) ack 1 win 320 + +0 > . 1:1001(1000) ack 1 + ++.012 < . 1:1(0) ack 4001 win 320 + +0 > . 4001:7001(3000) ack 1 + + +0 write(4, ..., 10000) = 10000 + +// The following ACK was reordered - delayed so that it arrives with +// an ACK field below snd_una. Here we check that the newly-SACKed +// 2MSS at 5001:7001 cause us to send out 2 more MSS. ++.002 < . 1:1(0) ack 3001 win 320 + +0 > . 7001:8001(1000) ack 1 + +0 > . 10001:11001(1000) ack 1 diff --git a/tools/testing/selftests/net/packetdrill/tcp_nagle_https_client.pkt b/tools/testing/selftests/net/packetdrill/tcp_nagle_https_client.pkt new file mode 100644 index 0000000000000..7adae7a9ef4a8 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_nagle_https_client.pkt @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0 +// This is a test inspired by an Android client app using SSL. This +// test verifies using TCP_NODELAY would save application latency +// (Perhaps even better with TCP_NAGLE). +// +`./defaults.sh +ethtool -K tun0 tso off gso off +./set_sysctls.py /proc/sys/net/ipv4/tcp_timestamps=0` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 + +0 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +0 setsockopt(4, SOL_TCP, TCP_NODELAY, [1], 4) = 0 + + +0 connect(4, ..., ...) = -1 EINPROGRESS (Operation now in progress) + +0 > S 0:0(0) + +.1 < S. 0:0(0) ack 1 win 5792 + +0 > . 1:1(0) ack 1 + +// SSL handshake (resumed session) + +0 write(4, ..., 517) = 517 + +0 > P. 1:518(517) ack 1 + +.1 < . 1:1(0) ack 518 win 229 + + +0 < P. 1:144(143) ack 1 win 229 + +0 > . 518:518(0) ack 144 + +0 read(4, ..., 1000) = 143 + +// Application POST header (51B) and body (2002B) + +0 write(4, ..., 51) = 51 + +0 > P. 518:569(51) ack 144 + +.03 write(4, ..., 2002) = 2002 + +0 > . 569:1543(974) ack 144 + +0 > P. 1543:2517(974) ack 144 +// Without disabling Nagle, this packet will not happen until the remote ACK. + +0 > P. 2517:2571(54) ack 144 + + +.1 < . 1:1(0) ack 2571 win 229 + +// Reset sysctls +`/tmp/sysctl_restore_${PPID}.sh` diff --git a/tools/testing/selftests/net/packetdrill/tcp_nagle_sendmsg_msg_more.pkt b/tools/testing/selftests/net/packetdrill/tcp_nagle_sendmsg_msg_more.pkt new file mode 100644 index 0000000000000..fa9c018139967 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_nagle_sendmsg_msg_more.pkt @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test the MSG_MORE flag will correctly corks the tiny writes +`./defaults.sh` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +.01 < . 1:1(0) ack 1 win 257 + +0 accept(3, ..., ...) = 4 +// Disable Nagle by default on this socket. + +0 setsockopt(4, SOL_TCP, TCP_NODELAY, [1], 4) = 0 + +// Test the basic case: MSG_MORE overwrites TCP_NODELAY and enables Nagle. + +0 sendmsg(4, {msg_name(...)=..., + msg_iov(1)=[{..., 40}], msg_flags=0}, MSG_MORE) = 40 + +.21~+.215 > P. 1:41(40) ack 1 + +.01 < . 1:1(0) ack 41 win 257 + +// Test unsetting MSG_MORE releases the packet + +0 sendmsg(4, {msg_name(...)=..., + msg_iov(1)=[{..., 100}], msg_flags=0}, MSG_MORE) = 100 ++.005 sendmsg(4, {msg_name(...)=..., + msg_iov(1)=[{..., 160}], msg_flags=0}, MSG_MORE) = 160 + +.01 sendmsg(4, {msg_name(...)=..., + msg_iov(3)=[{..., 100}, {..., 200}, {..., 195}], + msg_flags=0}, MSG_MORE) = 495 ++.008 sendmsg(4, {msg_name(...)=..., + msg_iov(1)=[{..., 5}], msg_flags=0}, 0) = 5 + +0 > P. 41:801(760) ack 1 + +.02 < . 1:1(0) ack 801 win 257 + + +// Test >MSS write will unleash MSS packets but hold on the remaining data. + +.1 sendmsg(4, {msg_name(...)=..., + msg_iov(1)=[{..., 3100}], msg_flags=0}, MSG_MORE) = 3100 + +0 > . 801:3801(3000) ack 1 ++.003 sendmsg(4, {msg_name(...)=..., + msg_iov(1)=[{..., 50}], msg_flags=0}, MSG_MORE) = 50 + + +.01 < . 1:1(0) ack 2801 win 257 +// Err... we relase the remaining right after the ACK? note that PUSH is reset + +0 > . 3801:3951(150) ack 1 + +// Test we'll hold on the subsequent writes when inflight (3801:3951) > 0 ++.001 sendmsg(4, {msg_name(...)=..., + msg_iov(1)=[{..., 1}], msg_flags=0}, MSG_MORE) = 1 ++.002 sendmsg(4, {msg_name(...)=..., + msg_iov(1)=[{..., 2}], msg_flags=0}, MSG_MORE) = 2 ++.003 sendmsg(4, {msg_name(...)=..., + msg_iov(1)=[{..., 3}], msg_flags=0}, MSG_MORE) = 3 ++.004 sendmsg(4, {msg_name(...)=..., + msg_iov(1)=[{..., 4}], msg_flags=0}, MSG_MORE) = 4 + +.02 < . 1:1(0) ack 3951 win 257 + +0 > . 3951:3961(10) ack 1 + +.02 < . 1:1(0) ack 3961 win 257 + + +// Test the case a MSG_MORE send followed by a write flushes the data + +0 sendmsg(4, {msg_name(...)=..., + msg_iov(1)=[{..., 20}], msg_flags=0}, MSG_MORE) = 20 + +.05 write(4, ..., 20) = 20 + +0 > P. 3961:4001(40) ack 1 diff --git a/tools/testing/selftests/net/packetdrill/tcp_nagle_sockopt_cork_nodelay.pkt b/tools/testing/selftests/net/packetdrill/tcp_nagle_sockopt_cork_nodelay.pkt new file mode 100644 index 0000000000000..0ddec5f7dc1ac --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_nagle_sockopt_cork_nodelay.pkt @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test TCP_CORK and TCP_NODELAY sockopt behavior +`./defaults.sh` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +.01 < . 1:1(0) ack 1 win 257 + +0 accept(3, ..., ...) = 4 +// Set TCP_CORK sockopt to hold small packets + +0 setsockopt(4, SOL_TCP, TCP_CORK, [1], 4) = 0 + + +0 write(4, ..., 40) = 40 + +.05 write(4, ..., 40) = 40 + +// Unset TCP_CORK should push pending bytes out + +.01 setsockopt(4, SOL_TCP, TCP_CORK, [0], 4) = 0 + +0 > P. 1:81(80) ack 1 + +.01 < . 1:1(0) ack 81 win 257 + +// Set TCP_CORK sockopt to hold small packets + +0 setsockopt(4, SOL_TCP, TCP_CORK, [1], 4) = 0 + + +0 write(4, ..., 40) = 40 + +.05 write(4, ..., 40) = 40 + +// Set TCP_NODELAY sockopt should push pending bytes out + +0 setsockopt(4, SOL_TCP, TCP_NODELAY, [1], 4) = 0 + +0 > P. 81:161(80) ack 1 + +.01 < . 1:1(0) ack 161 win 257 + +// Set MSG_MORE to hold small packets + +0 send(4, ..., 40, MSG_MORE) = 40 + +.05 send(4, ..., 40, MSG_MORE) = 40 + +// Set TCP_NODELAY sockopt should push pending bytes out + +.01 setsockopt(4, SOL_TCP, TCP_NODELAY, [1], 4) = 0 + +0 > . 161:241(80) ack 1 + +.01 < . 1:1(0) ack 241 win 257 diff --git a/tools/testing/selftests/net/packetdrill/tcp_timestamping_client-only-last-byte.pkt b/tools/testing/selftests/net/packetdrill/tcp_timestamping_client-only-last-byte.pkt new file mode 100644 index 0000000000000..2087ec0c746ad --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_timestamping_client-only-last-byte.pkt @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test that tx timestamping sends timestamps only for +// the last byte of each sendmsg. +`./defaults.sh +` + +// Create a socket and set it to non-blocking. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 fcntl(3, F_GETFL) = 0x2 (flags O_RDWR) + +0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +// Establish connection and verify that there was no error. + +0 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) + +0 > S 0:0(0) + +.01 < S. 0:0(0) ack 1 win 20000 + +0 > . 1:1(0) ack 1 + +0 getsockopt(3, SOL_SOCKET, SO_ERROR, [0], [4]) = 0 + +0 fcntl(3, F_SETFL, O_RDWR) = 0 // set back to blocking + + +0 setsockopt(3, SOL_SOCKET, SO_TIMESTAMPING, + [SOF_TIMESTAMPING_TX_SCHED | SOF_TIMESTAMPING_TX_SOFTWARE | + SOF_TIMESTAMPING_TX_ACK | SOF_TIMESTAMPING_SOFTWARE | + SOF_TIMESTAMPING_OPT_ID], 4) = 0 + + +0 write(3, ..., 11000) = 11000 + +0 > P. 1:10001(10000) ack 1 + +.01 < . 1:1(0) ack 10001 win 4000 + +0 > P. 10001:11001(1000) ack 1 + +.01 < . 1:1(0) ack 11001 win 4000 + +// Make sure that internal TCP timestamps are not overwritten and we have sane +// RTT measurement. + +0 %{ +assert 5000 <= tcpi_rtt <= 20000, 'srtt=%d us' % tcpi_rtt +}% + +// SCM_TSTAMP_SCHED for the last byte should be received almost immediately +// once 10001 is acked at t=20ms. +// setsockopt(..., [SOF_TIMESTAMPING_SOFTWARE | SOF_TIMESTAMPING_OPT_ID], ...) +// is called after when SYN is acked. So, we expect the last byte of the first +// chunk to have a timestamp key of 10999 (i.e., 11000 - 1). + +0 recvmsg(3, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=20000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_SCHED, + ee_data=10999}} + ]}, MSG_ERRQUEUE) = 0 +// SCM_TSTAMP_SND for the last byte should be received almost immediately +// once 10001 is acked at t=20ms. + +0 recvmsg(3, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=20000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_SND, + ee_data=10999}} + ]}, MSG_ERRQUEUE) = 0 +// SCM_TSTAMP_ACK for the last byte should be received at t=30ms. + +0 recvmsg(3, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=30000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_ACK, + ee_data=10999}} + ]}, MSG_ERRQUEUE) = 0 diff --git a/tools/testing/selftests/net/packetdrill/tcp_timestamping_partial.pkt b/tools/testing/selftests/net/packetdrill/tcp_timestamping_partial.pkt new file mode 100644 index 0000000000000..876024a311106 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_timestamping_partial.pkt @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test tx timestamping for partial writes (IPv4). +`./defaults.sh +` + +// Create a socket and set it to non-blocking. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 fcntl(3, F_GETFL) = 0x2 (flags O_RDWR) + +0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +// Establish connection and verify that there was no error. + +0 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) + +0 > S 0:0(0) + +.01 < S. 0:0(0) ack 1 win 2000 + +0 > . 1:1(0) ack 1 + +0 getsockopt(3, SOL_SOCKET, SO_ERROR, [0], [4]) = 0 + + +0 setsockopt(3, SOL_SOCKET, SO_SNDBUF, [1000], 4) = 0 + +0 setsockopt(3, SOL_SOCKET, SO_TIMESTAMPING, + [SOF_TIMESTAMPING_TX_SCHED | SOF_TIMESTAMPING_TX_SOFTWARE | + SOF_TIMESTAMPING_TX_ACK | SOF_TIMESTAMPING_SOFTWARE | + SOF_TIMESTAMPING_OPT_ID], 4) = 0 + +// We have a partial write. + +0 write(3, ..., 10000) = 2964 + +0 > . 1:989(988) ack 1 + +0 > P. 989:1977(988) ack 1 + +.01 < . 1:1(0) ack 1977 win 92 + +0 > P. 1977:2965(988) ack 1 + +.01 < . 1:1(0) ack 2965 win 92 + +// Make sure that internal TCP timestamps are not overwritten and we have sane +// RTT measurement. + +0 %{ +assert 5000 <= tcpi_rtt <= 20000, 'srtt=%d us' % tcpi_rtt +}% + +// SCM_TSTAMP_SCHED for the first chunk should be received almost immediately +// after the first ack at t=20ms. + +0 recvmsg(3, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=20000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_SCHED, + ee_data=2963}} + ]}, MSG_ERRQUEUE) = 0 +// SCM_TSTAMP_SND for the first chunk should be received almost immediately +// after the first ack at t=20ms. + +0 recvmsg(3, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=20000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_SND, + ee_data=2963}} + ]}, MSG_ERRQUEUE) = 0 +// SCM_TSTAMP_ACK for the first chunk should be received after the last ack at +// t=30ms. + +0 recvmsg(3, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=30000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_ACK, + ee_data=2963}} + ]}, MSG_ERRQUEUE) = 0 diff --git a/tools/testing/selftests/net/packetdrill/tcp_timestamping_server.pkt b/tools/testing/selftests/net/packetdrill/tcp_timestamping_server.pkt new file mode 100644 index 0000000000000..84d94780e6be9 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_timestamping_server.pkt @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test tx timestamping for server-side (IPv4). +`./defaults.sh +` + +// Initialize connection + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +.01 < . 1:1(0) ack 1 win 514 + + +0 accept(3, ..., ...) = 4 + +0 setsockopt(4, SOL_SOCKET, SO_TIMESTAMPING, + [SOF_TIMESTAMPING_TX_SCHED | SOF_TIMESTAMPING_TX_SOFTWARE | + SOF_TIMESTAMPING_TX_ACK | SOF_TIMESTAMPING_SOFTWARE | + SOF_TIMESTAMPING_OPT_ID], 4) = 0 + +// Write two 2KB chunks. +// setsockopt(..., [SOF_TIMESTAMPING_SOFTWARE | SOF_TIMESTAMPING_OPT_ID], ...) +// is called after when SYN is acked. So, we expect the last byte of the first +// and the second chunks to have timestamp keys of 1999 (i.e., 2000 - 1) and +// 3999 (i.e., 4000 - 1) respectively. + +0 write(4, ..., 2000) = 2000 + +0 write(4, ..., 2000) = 2000 + +0 > P. 1:2001(2000) ack 1 + +0 > P. 2001:4001(2000) ack 1 + +.01 < . 1:1(0) ack 2001 win 514 + +.01 < . 1:1(0) ack 4001 win 514 + +// Make sure that internal TCP timestamps are not overwritten and we have sane +// RTT measurement. + +0 %{ +assert 5000 <= tcpi_rtt <= 20000, 'srtt=%d us' % tcpi_rtt +}% + +// SCM_TSTAMP_SCHED for the first chunk should be received almost immediately +// after write at t=10ms. + +0 recvmsg(4, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=10000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_SCHED, + ee_data=1999}} + ]}, MSG_ERRQUEUE) = 0 +// SCM_TSTAMP_SND for the first chunk should be received almost immediately +// after write at t=10ms. + +0 recvmsg(4, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=10000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_SND, + ee_data=1999}} + ]}, MSG_ERRQUEUE) = 0 +// SCM_TSTAMP_SCHED for the second chunk should be received almost immediately +// after that at t=10ms. + +0 recvmsg(4, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=10000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_SCHED, + ee_data=3999}} + ]}, MSG_ERRQUEUE) = 0 +// SCM_TSTAMP_SND for the second chunk should be received almost immediately +// after that at t=10ms. + +0 recvmsg(4, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=10000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_SND, + ee_data=3999}} + ]}, MSG_ERRQUEUE) = 0 +// SCM_TSTAMP_ACK for the first chunk should be received at t=20ms. + +0 recvmsg(4, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=20000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_ACK, + ee_data=1999}} + ]}, MSG_ERRQUEUE) = 0 +// SCM_TSTAMP_ACK for the second chunk should be received at t=30ms. + +0 recvmsg(4, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=30000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_ACK, + ee_data=3999}} + ]}, MSG_ERRQUEUE) = 0 From 6f6692053939038f48c2f9f404fe414038a44431 Mon Sep 17 00:00:00 2001 From: Soham Chakradeo Date: Tue, 17 Dec 2024 18:52:00 +0000 Subject: [PATCH 0717/1450] selftests/net: packetdrill: import tcp/eor, tcp/splice, tcp/ts_recent, tcp/blocking Use the standard import and testing method, as described in the import of tcp/ecn and tcp/close , tcp/sack , tcp/tcp_info. Signed-off-by: Willem de Bruijn Signed-off-by: Soham Chakradeo Link: https://patch.msgid.link/20241217185203.297935-4-sohamch.kernel@gmail.com Signed-off-by: Jakub Kicinski --- .../tcp_blocking_blocking-accept.pkt | 18 +++++ .../tcp_blocking_blocking-connect.pkt | 13 ++++ .../tcp_blocking_blocking-read.pkt | 29 ++++++++ .../tcp_blocking_blocking-write.pkt | 35 +++++++++ .../packetdrill/tcp_eor_no-coalesce-large.pkt | 38 ++++++++++ .../tcp_eor_no-coalesce-retrans.pkt | 72 +++++++++++++++++++ .../packetdrill/tcp_eor_no-coalesce-small.pkt | 36 ++++++++++ .../tcp_eor_no-coalesce-subsequent.pkt | 66 +++++++++++++++++ .../tcp_splice_tcp_splice_loop_test.pkt | 20 ++++++ .../packetdrill/tcp_ts_recent_fin_tsval.pkt | 23 ++++++ .../packetdrill/tcp_ts_recent_invalid_ack.pkt | 25 +++++++ .../packetdrill/tcp_ts_recent_reset_tsval.pkt | 25 +++++++ 12 files changed, 400 insertions(+) create mode 100644 tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-accept.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-connect.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-read.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-write.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-large.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-retrans.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-small.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-subsequent.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_splice_tcp_splice_loop_test.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_ts_recent_fin_tsval.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_ts_recent_invalid_ack.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_ts_recent_reset_tsval.pkt diff --git a/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-accept.pkt b/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-accept.pkt new file mode 100644 index 0000000000000..38535701656ea --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-accept.pkt @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test for blocking accept. + +`./defaults.sh` + +// Establish a connection. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + +0...0.200 accept(3, ..., ...) = 4 + + +.1 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +.1 < . 1:1(0) ack 1 win 257 + + +.1 write(4, ..., 2000) = 2000 + +0 > P. 1:2001(2000) ack 1 diff --git a/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-connect.pkt b/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-connect.pkt new file mode 100644 index 0000000000000..3692ef102381d --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-connect.pkt @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test for blocking connect. + +`./defaults.sh` + +// Establish a connection. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + + +.1...0.200 connect(3, ..., ...) = 0 + + +0 > S 0:0(0) + +.1 < S. 0:0(0) ack 1 win 5792 + +0 > . 1:1(0) ack 1 diff --git a/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-read.pkt b/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-read.pkt new file mode 100644 index 0000000000000..914eabab367ae --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-read.pkt @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test for blocking read. +--tolerance_usecs=10000 + +`./defaults.sh` + +// Establish a connection. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +.1 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +.1 < . 1:1(0) ack 1 win 257 + +0 accept(3, ..., ...) = 4 + + +0...0.100 read(4, ..., 2000) = 2000 + +.1 < P. 1:2001(2000) ack 1 win 257 + +0 > . 1:1(0) ack 2001 + + +.1...0.200 read(4, ..., 2000) = 2000 + +.1 < P. 2001:4001(2000) ack 1 win 257 + +0 > . 1:1(0) ack 4001 + + +.1 < P. 4001:6001(2000) ack 1 win 257 + +0 > . 1:1(0) ack 6001 + +0...0.000 read(4, ..., 1000) = 1000 + +0...0.000 read(4, ..., 1000) = 1000 diff --git a/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-write.pkt b/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-write.pkt new file mode 100644 index 0000000000000..cec5a0725d95d --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_blocking_blocking-write.pkt @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test for blocking write. +--tolerance_usecs=10000 + +`./defaults.sh +./set_sysctls.py /proc/sys/net/ipv4/tcp_min_tso_segs=10 +` + +// Establish a connection. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +.1 < S 0:0(0) win 50000 + +0 > S. 0:0(0) ack 1 + +.1 < . 1:1(0) ack 1 win 50000 + +0 accept(3, ..., ...) = 4 + +// Kernel doubles our value -> sk->sk_sndbuf is set to 42000 + +0 setsockopt(4, SOL_SOCKET, SO_SNDBUF, [21000], 4) = 0 + +0 getsockopt(4, SOL_SOCKET, SO_SNDBUF, [42000], [4]) = 0 + +// A write of 60000 does not block. + +0...0.300 write(4, ..., 61000) = 61000 // this write() blocks + + +.1 < . 1:1(0) ack 10001 win 50000 + + +.1 < . 1:1(0) ack 30001 win 50000 + +// This ACK should wakeup the write(). An ACK of 35001 does not. + +.1 < . 1:1(0) ack 36001 win 50000 + +// Reset to sysctls defaults. +`/tmp/sysctl_restore_${PPID}.sh` diff --git a/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-large.pkt b/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-large.pkt new file mode 100644 index 0000000000000..f95b9b3c9fa17 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-large.pkt @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test TCP does not append any data from consequent writes to the tail +// skb created for the chunk. The large chunk itself should be packetized as +// usual. +`./defaults.sh +` + +// Initialize connection + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +.01 < . 1:1(0) ack 1 win 514 + + +0 accept(3, ..., ...) = 4 + +// Write a 10400B chunk to fill the ICW, and have a 400 byte skb sitting on +// the tail. + +0 write(4, ..., 10400) = 10400 + +// Write another 10040B chunk with no coalescing options. + +0 send(4, ..., 10400, MSG_EOR) = 10400 + +// Write a 2KB chunk. This chunk should not be appended to the packets created +// the previous chunk. + +0 write(4, ..., 2000) = 2000 + + +0 > P. 1:10001(10000) ack 1 ++.001 < . 1:1(0) ack 10001 win 514 +// Now we have enough room to send out the 2 x 400B packets out. + +0 > P. 10001:20801(10800) ack 1 ++.001 < . 1:1(0) ack 20801 win 514 +// This 2KB packet should be sent alone. + +0 > P. 20801:22801(2000) ack 1 ++.001 < . 1:1(0) ack 22801 win 514 diff --git a/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-retrans.pkt b/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-retrans.pkt new file mode 100644 index 0000000000000..2ff66075288e8 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-retrans.pkt @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test TCP does not append any data from consequent writes to the tail +// skb created for the chunk. Also, when packets are retransmitted, they +// will not be coalesce into the same skb. +`./defaults.sh +` + +// Initialize connection + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +.01 < . 1:1(0) ack 1 win 514 + + +0 accept(3, ..., ...) = 4 + +// Write a 10400B chunk to fill the ICW, and have a 400 byte skb sitting on +// the tail. + +0 write(4, ..., 10400) = 10400 + +// Write 10 400B chunks with no coalescing options. + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 +// This chunk should not be appended to the skbs created for the previous chunk. + +0 write(4, ..., 10000) = 10000 + + +0 > P. 1:10001(10000) ack 1 ++.001 < . 1:1(0) ack 10001 win 514 +// Now we have enough room to send out the 2 x 400B packets out. + +0 > P. 10001:10801(800) ack 1 +// The 9 remaining 400B chunks should be sent as individual packets. + +0 > P. 10801:11201(400) ack 1 + +0 > P. 11201:11601(400) ack 1 + +0 > P. 11601:12001(400) ack 1 + +0 > P. 12001:12401(400) ack 1 + +0 > P. 12401:12801(400) ack 1 + +0 > P. 12801:13201(400) ack 1 + +0 > P. 13201:13601(400) ack 1 + +0 > P. 13601:14001(400) ack 1 + +0 > P. 14001:14401(400) ack 1 +// The last 10KB chunk should be sent separately. + +0 > P. 14401:24401(10000) ack 1 + ++.001 < . 1:1(0) ack 10401 win 514 ++.001 < . 1:1(0) ack 10801 win 514 ++.001 < . 1:1(0) ack 11201 win 514 ++.001 < . 1:1(0) ack 11601 win 514 ++.001 < . 1:1(0) ack 12001 win 514 +// TCP should fill the hole but no coalescing should happen, and all +// retransmissions should be sent out as individual packets. + +// Note : This is timeout based retransmit. +// Do not put +0 here or flakes will come back. ++.004~+.008 > P. 12001:12401(400) ack 1 + ++.001 < . 1:1(0) ack 12401 win 514 + +0 > P. 12401:12801(400) ack 1 + +0 > P. 12801:13201(400) ack 1 ++.001 < . 1:1(0) ack 12801 win 514 ++.001 < . 1:1(0) ack 14401 win 514 ++.001 < . 1:1(0) ack 24401 win 514 diff --git a/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-small.pkt b/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-small.pkt new file mode 100644 index 0000000000000..77039c5aac39f --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-small.pkt @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test TCP does not append any data from consequent writes to the tail +// skb created for the chunk. +`./defaults.sh +` + +// Initialize connection + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +.01 < . 1:1(0) ack 1 win 514 + + +0 accept(3, ..., ...) = 4 + +// Write a 10400B chunk to fill the ICW, and have a 400 byte skb sitting on +// the tail. + +0 write(4, ..., 10400) = 10400 + +// Write a 400B chunk with no coalescing options. + +0 send(4, ..., 400, MSG_EOR) = 400 + +// This chunk should not be appended to the skbs created for the previous chunk. + +0 write(4, ..., 10000) = 10000 + + +0 > P. 1:10001(10000) ack 1 ++.001 < . 1:1(0) ack 10001 win 514 +// Now we have enough room to send out the 2 x 400B packets out. + +0 > P. 10001:10801(800) ack 1 + +0 > P. 10801:20801(10000) ack 1 ++.001 < . 1:1(0) ack 10401 win 514 ++.001 < . 1:1(0) ack 10801 win 514 ++.001 < . 1:1(0) ack 20801 win 514 diff --git a/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-subsequent.pkt b/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-subsequent.pkt new file mode 100644 index 0000000000000..dd5a06250595a --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_eor_no-coalesce-subsequent.pkt @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test TCP does not append any data from consequent writes to the tail +// skb created for the chunk even though we have 10 back-to-back small +// writes. +`./defaults.sh +` + +// Initialize connection + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +.01 < . 1:1(0) ack 1 win 514 + + +0 accept(3, ..., ...) = 4 + +// Write a 10400B chunk to fill the ICW, and have a 400 byte skb sitting on +// the tail. + +0 write(4, ..., 10400) = 10400 + +// Write 10 400B chunks with no coalescing options. + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 + +0 send(4, ..., 400, MSG_EOR) = 400 +// This chunk should not be appended to the skbs created for the previous chunk. + +0 write(4, ..., 10000) = 10000 + + +0 > P. 1:10001(10000) ack 1 ++.001 < . 1:1(0) ack 10001 win 514 +// Now we have enough room to send out the 2 x 400B packets out. + +0 > P. 10001:10801(800) ack 1 +// The 9 remaining 400B chunks should be sent as individual packets. + +0 > P. 10801:11201(400) ack 1 + +0 > P. 11201:11601(400) ack 1 + +0 > P. 11601:12001(400) ack 1 + +0 > P. 12001:12401(400) ack 1 + +0 > P. 12401:12801(400) ack 1 + +0 > P. 12801:13201(400) ack 1 + +0 > P. 13201:13601(400) ack 1 + +0 > P. 13601:14001(400) ack 1 + +0 > P. 14001:14401(400) ack 1 +// The last 10KB chunk should be sent separately. + +0 > P. 14401:24401(10000) ack 1 + ++.001 < . 1:1(0) ack 10401 win 514 ++.001 < . 1:1(0) ack 10801 win 514 ++.001 < . 1:1(0) ack 11201 win 514 ++.001 < . 1:1(0) ack 11601 win 514 ++.001 < . 1:1(0) ack 12001 win 514 ++.001 < . 1:1(0) ack 12401 win 514 ++.001 < . 1:1(0) ack 12801 win 514 ++.001 < . 1:1(0) ack 13201 win 514 ++.001 < . 1:1(0) ack 13601 win 514 ++.001 < . 1:1(0) ack 14001 win 514 ++.001 < . 1:1(0) ack 14401 win 514 ++.001 < . 1:1(0) ack 24401 win 514 diff --git a/tools/testing/selftests/net/packetdrill/tcp_splice_tcp_splice_loop_test.pkt b/tools/testing/selftests/net/packetdrill/tcp_splice_tcp_splice_loop_test.pkt new file mode 100644 index 0000000000000..0cbd43253236f --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_splice_tcp_splice_loop_test.pkt @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 +`./defaults.sh` + +// Initialize a server socket + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 setsockopt(3, SOL_IP, IP_FREEBIND, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + +// Connection should get accepted + +0 < S 0:0(0) win 32972 + +0 > S. 0:0(0) ack 1 <...> + +0 < . 1:1(0) ack 1 win 257 + +0 accept(3, ..., ...) = 4 + + +0 pipe([5, 6]) = 0 + +0 < U. 1:101(100) ack 1 win 257 urg 100 + +0 splice(4, NULL, 6, NULL, 99, 0) = 99 + +0 splice(4, NULL, 6, NULL, 1, 0) = 0 diff --git a/tools/testing/selftests/net/packetdrill/tcp_ts_recent_fin_tsval.pkt b/tools/testing/selftests/net/packetdrill/tcp_ts_recent_fin_tsval.pkt new file mode 100644 index 0000000000000..e61424a7bd0a8 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_ts_recent_fin_tsval.pkt @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test that we send FIN packet with correct TSval +--tcp_ts_tick_usecs=1000 +--tolerance_usecs=7000 + +`./defaults.sh` + +// Create a socket. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + +// Establish a connection. + +0 < S 0:0(0) win 20000 + +0 > S. 0:0(0) ack 1 + +.1 < . 1:1(0) ack 1 win 20000 + +0 accept(3, ..., ...) = 4 + + +1 close(4) = 0 +// Check that FIN TSval is updated properly, one second has passed since last sent packet. + +0 > F. 1:1(0) ack 1 diff --git a/tools/testing/selftests/net/packetdrill/tcp_ts_recent_invalid_ack.pkt b/tools/testing/selftests/net/packetdrill/tcp_ts_recent_invalid_ack.pkt new file mode 100644 index 0000000000000..174ce9a1bfc07 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_ts_recent_invalid_ack.pkt @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test that we reject TS val updates on a packet with invalid ACK sequence + +`./defaults.sh +` + +// Create a socket. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + +// Establish a connection. + +.1 < S 0:0(0) win 20000 + +0 > S. 0:0(0) ack 1 + +.1 < . 1:1(0) ack 1 win 20000 + +0 accept(3, ..., ...) = 4 + +// bad packet with high tsval (its ACK sequence is above our sndnxt) + +0 < F. 1:1(0) ack 9999 win 20000 + + + +0 < . 1:1001(1000) ack 1 win 20000 + +0 > . 1:1(0) ack 1001 diff --git a/tools/testing/selftests/net/packetdrill/tcp_ts_recent_reset_tsval.pkt b/tools/testing/selftests/net/packetdrill/tcp_ts_recent_reset_tsval.pkt new file mode 100644 index 0000000000000..2e3b3bb7493a1 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_ts_recent_reset_tsval.pkt @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test that we send RST packet with correct TSval +--tcp_ts_tick_usecs=1000 + +`./defaults.sh` + +// Create a socket. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + +// Establish a connection. + +0 < S 0:0(0) win 20000 + +0 > S. 0:0(0) ack 1 + +.1 < . 1:1(0) ack 1 win 20000 + +0 accept(3, ..., ...) = 4 + + +0 < . 1:1001(1000) ack 1 win 20000 + +0 > . 1:1(0) ack 1001 + + +1 close(4) = 0 +// Check that RST TSval is updated properly, one second has passed since last sent packet. + +0 > R. 1:1(0) ack 1001 From 5d4cadef52f29eea779a0b44e09f59657c1b46d8 Mon Sep 17 00:00:00 2001 From: Soham Chakradeo Date: Tue, 17 Dec 2024 18:52:01 +0000 Subject: [PATCH 0718/1450] selftests/net: packetdrill: import tcp/user_timeout, tcp/validate, tcp/sendfile, tcp/limited-transmit, tcp/syscall_bad_arg Use the standard import and testing method, as described in the import of tcp/ecn and tcp/close , tcp/sack , tcp/tcp_info. Signed-off-by: Willem de Bruijn Signed-off-by: Soham Chakradeo Link: https://patch.msgid.link/20241217185203.297935-5-sohamch.kernel@gmail.com Signed-off-by: Jakub Kicinski --- ...ited_transmit_limited-transmit-no-sack.pkt | 53 +++++++++++++++++++ ...limited_transmit_limited-transmit-sack.pkt | 50 +++++++++++++++++ .../tcp_sendfile_sendfile-simple.pkt | 26 +++++++++ ...scall_bad_arg_fastopen-invalid-buf-ptr.pkt | 42 +++++++++++++++ .../tcp_syscall_bad_arg_sendmsg-empty-iov.pkt | 30 +++++++++++ ...yscall_bad_arg_syscall-invalid-buf-ptr.pkt | 25 +++++++++ .../tcp_user_timeout_user-timeout-probe.pkt | 37 +++++++++++++ .../tcp_user_timeout_user_timeout.pkt | 32 +++++++++++ ...validate_validate-established-no-flags.pkt | 24 +++++++++ 9 files changed, 319 insertions(+) create mode 100644 tools/testing/selftests/net/packetdrill/tcp_limited_transmit_limited-transmit-no-sack.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_limited_transmit_limited-transmit-sack.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_sendfile_sendfile-simple.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_fastopen-invalid-buf-ptr.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_sendmsg-empty-iov.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_syscall-invalid-buf-ptr.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_user_timeout_user-timeout-probe.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_user_timeout_user_timeout.pkt create mode 100644 tools/testing/selftests/net/packetdrill/tcp_validate_validate-established-no-flags.pkt diff --git a/tools/testing/selftests/net/packetdrill/tcp_limited_transmit_limited-transmit-no-sack.pkt b/tools/testing/selftests/net/packetdrill/tcp_limited_transmit_limited-transmit-no-sack.pkt new file mode 100644 index 0000000000000..96b01eb5b7a49 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_limited_transmit_limited-transmit-no-sack.pkt @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test RFC 3042 "Limited Transmit": "sending a new data segment in +// response to each of the first two duplicate acknowledgments that +// arrive at the sender". +// This variation tests a receiver that doesn't support SACK. + +`./defaults.sh` + +// Establish a connection. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +.1 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +.1 < . 1:1(0) ack 1 win 320 + +0 accept(3, ..., ...) = 4 + +// Write some data, and send the initial congestion window. + +0 write(4, ..., 15000) = 15000 + +0 > P. 1:10001(10000) ack 1 + +// Limited transmit: on first dupack, send a new data segment. + +.11 < . 1:1(0) ack 1 win 320 + +0 > . 10001:11001(1000) ack 1 + +// Limited transmit: on second dupack, send a new data segment. + +.01 < . 1:1(0) ack 1 win 320 + +0 > . 11001:12001(1000) ack 1 + +// It turned out to be reordering, not loss. +// We have one packet newly acked (1001:3001 were DUP-ACK'd) +// So we revert state back to Open. Slow start cwnd from 10 to 11 +// and send 11 - 9 = 2 packets + +.01 < . 1:1(0) ack 3001 win 320 + +0 > P. 12001:14001(2000) ack 1 + + +.02 < . 1:1(0) ack 5001 win 320 + +0 > P. 14001:15001(1000) ack 1 + +// Client gradually ACKs all data. + +.02 < . 1:1(0) ack 7001 win 320 + +.02 < . 1:1(0) ack 9001 win 320 + +.02 < . 1:1(0) ack 11001 win 320 + +.02 < . 1:1(0) ack 13001 win 320 + +.02 < . 1:1(0) ack 15001 win 320 + +// Clean up. + +.17 close(4) = 0 + +0 > F. 15001:15001(0) ack 1 + +.1 < F. 1:1(0) ack 15002 win 257 + +0 > . 15002:15002(0) ack 2 diff --git a/tools/testing/selftests/net/packetdrill/tcp_limited_transmit_limited-transmit-sack.pkt b/tools/testing/selftests/net/packetdrill/tcp_limited_transmit_limited-transmit-sack.pkt new file mode 100644 index 0000000000000..642da51ec3a4f --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_limited_transmit_limited-transmit-sack.pkt @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test RFC 3042 "Limited Transmit": "sending a new data segment in +// response to each of the first two duplicate acknowledgments that +// arrive at the sender". +// This variation tests a receiver that supports SACK. + +`./defaults.sh` + +// Establish a connection. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +.1 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +.1 < . 1:1(0) ack 1 win 320 + +0 accept(3, ..., ...) = 4 + +// Write some data, and send the initial congestion window. + +0 write(4, ..., 15000) = 15000 + +0 > P. 1:10001(10000) ack 1 + +// Limited transmit: on first dupack, send a new data segment. + +.11 < . 1:1(0) ack 1 win 320 + +0 > . 10001:11001(1000) ack 1 + +// Limited transmit: on second dupack, send a new data segment. + +.01 < . 1:1(0) ack 1 win 320 + +0 > . 11001:12001(1000) ack 1 + +// It turned out to be reordering, not loss. + +.01 < . 1:1(0) ack 3001 win 320 + +0 > P. 12001:14001(2000) ack 1 + + +.02 < . 1:1(0) ack 5001 win 320 + +0 > P. 14001:15001(1000) ack 1 + +// Client gradually ACKs all data. + +.02 < . 1:1(0) ack 7001 win 320 + +.02 < . 1:1(0) ack 9001 win 320 + +.02 < . 1:1(0) ack 11001 win 320 + +.02 < . 1:1(0) ack 13001 win 320 + +.02 < . 1:1(0) ack 15001 win 320 + +// Clean up. + +.17 close(4) = 0 + +0 > F. 15001:15001(0) ack 1 + +.1 < F. 1:1(0) ack 15002 win 257 + +0 > . 15002:15002(0) ack 2 diff --git a/tools/testing/selftests/net/packetdrill/tcp_sendfile_sendfile-simple.pkt b/tools/testing/selftests/net/packetdrill/tcp_sendfile_sendfile-simple.pkt new file mode 100644 index 0000000000000..6740859a13600 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_sendfile_sendfile-simple.pkt @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 +// Simplest possible test of open() and then sendfile(). +// We write some zeroes into a file (since packetdrill expects payloads +// to be all zeroes) and then open() the file, then use sendfile() +// and verify that the correct number of zeroes goes out. + +`./defaults.sh +/bin/rm -f /tmp/testfile +/bin/dd bs=1 count=5 if=/dev/zero of=/tmp/testfile status=none +` + +// Initialize connection + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +0 < . 1:1(0) ack 1 win 514 + + +0 accept(3, ..., ...) = 4 + + +0 open("/tmp/testfile", O_RDONLY) = 5 + +0 sendfile(4, 5, [0], 5) = 5 + +0 > P. 1:6(5) ack 1 diff --git a/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_fastopen-invalid-buf-ptr.pkt b/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_fastopen-invalid-buf-ptr.pkt new file mode 100644 index 0000000000000..8940726a3ec24 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_fastopen-invalid-buf-ptr.pkt @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test TCP fastopen behavior with NULL as buffer pointer, but a non-zero +// buffer length. +`./defaults.sh +./set_sysctls.py /proc/sys/net/ipv4/tcp_timestamps=0` + +// Cache warmup: send a Fast Open cookie request + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 ++0 setsockopt(3, SOL_TCP, TCP_FASTOPEN_CONNECT, [1], 4) = 0 ++0 connect(3, ..., ...) = -1 EINPROGRESS (Operation is now in progress) ++0 > S 0:0(0) ++0 < S. 123:123(0) ack 1 win 14600 ++0 > . 1:1(0) ack 1 ++0 close(3) = 0 ++0 > F. 1:1(0) ack 1 ++0 < F. 1:1(0) ack 2 win 92 ++0 > . 2:2(0) ack 2 + +// Test with MSG_FASTOPEN without TCP_FASTOPEN_CONNECT. ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 ++0 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0 ++0 sendto(4, NULL, 1, MSG_FASTOPEN, ..., ...) = -1 ++0 close(4) = 0 + +// Test with TCP_FASTOPEN_CONNECT without MSG_FASTOPEN. ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 5 ++0 fcntl(5, F_SETFL, O_RDWR|O_NONBLOCK) = 0 ++0 setsockopt(5, SOL_TCP, TCP_FASTOPEN_CONNECT, [1], 4) = 0 ++0 connect(5, ..., ...) = 0 ++0 sendto(5, NULL, 1, 0, ..., ...) = -1 ++0 close(5) = 0 + +// Test with both TCP_FASTOPEN_CONNECT and MSG_FASTOPEN. ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 6 ++0 fcntl(6, F_SETFL, O_RDWR|O_NONBLOCK) = 0 ++0 setsockopt(6, SOL_TCP, TCP_FASTOPEN_CONNECT, [1], 4) = 0 ++0 connect(6, ..., ...) = 0 ++0 sendto(6, NULL, 1, MSG_FASTOPEN, ..., ...) = -1 ++0 close(6) = 0 + +`/tmp/sysctl_restore_${PPID}.sh` diff --git a/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_sendmsg-empty-iov.pkt b/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_sendmsg-empty-iov.pkt new file mode 100644 index 0000000000000..b2b2cdf27e20f --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_sendmsg-empty-iov.pkt @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test that we correctly skip zero-length IOVs. +`./defaults.sh` + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_ZEROCOPY, [1], 4) = 0 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +.01 < . 1:1(0) ack 1 win 257 + +0 accept(3, ..., ...) = 4 + +0 setsockopt(4, SOL_TCP, TCP_NODELAY, [1], 4) = 0 + + +0 sendmsg(4, {msg_name(...)=..., + msg_iov(4)=[{..., 0}, {..., 40}, {..., 0}, {..., 20}], + msg_flags=0}, 0) = 60 + +0 > P. 1:61(60) ack 1 + +.01 < . 1:1(0) ack 61 win 257 + + +0 sendmsg(4, {msg_name(...)=..., + msg_iov(4)=[{..., 0}, {..., 0}, {..., 0}, {..., 0}], + msg_flags=0}, MSG_ZEROCOPY) = 0 + + +0 sendmsg(4, {msg_name(...)=..., + msg_iov(4)=[{..., 0}, {..., 10}, {..., 0}, {..., 50}], + msg_flags=0}, MSG_ZEROCOPY) = 60 + +0 > P. 61:121(60) ack 1 + +.01 < . 1:1(0) ack 121 win 257 diff --git a/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_syscall-invalid-buf-ptr.pkt b/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_syscall-invalid-buf-ptr.pkt new file mode 100644 index 0000000000000..59f5903f285c1 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_syscall_bad_arg_syscall-invalid-buf-ptr.pkt @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test kernel behavior with NULL as buffer pointer + +`./defaults.sh` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +.2 < . 1:1(0) ack 1 win 514 + + +0 accept(3, ..., ...) = 4 + + +0 write(4, NULL, 1000) = -1 EFAULT (Bad address) + +0 send(4, NULL, 1000, 0) = -1 EFAULT (Bad address) + +0 sendto(4, NULL, 1000, 0, ..., ...) = -1 EFAULT (Bad address) + + +0 < . 1:1001(1000) ack 1 win 200 + +0 read(4, NULL, 1000) = -1 EFAULT (Bad address) + +0 recv(4, NULL, 1000, 0) = -1 EFAULT (Bad address) + +0 recvfrom(4, NULL, 1000, 0, ..., ...) = -1 EFAULT (Bad address) diff --git a/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user-timeout-probe.pkt b/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user-timeout-probe.pkt new file mode 100644 index 0000000000000..183051ba0cae3 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user-timeout-probe.pkt @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 + +`./defaults.sh` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + + +0 < S 0:0(0) win 0 + +0 > S. 0:0(0) ack 1 + + +.1 < . 1:1(0) ack 1 win 65530 + +0 accept(3, ..., ...) = 4 + + +0 setsockopt(4, SOL_TCP, TCP_USER_TIMEOUT, [3000], 4) = 0 + +0 write(4, ..., 24) = 24 + +0 > P. 1:25(24) ack 1 + +.1 < . 1:1(0) ack 25 win 65530 + +0 %{ assert tcpi_probes == 0, tcpi_probes; \ + assert tcpi_backoff == 0, tcpi_backoff }% + +// install a qdisc dropping all packets + +0 `tc qdisc delete dev tun0 root 2>/dev/null ; tc qdisc add dev tun0 root pfifo limit 0` + +0 write(4, ..., 24) = 24 + // When qdisc is congested we retry every 500ms + // (TCP_RESOURCE_PROBE_INTERVAL) and therefore + // we retry 6 times before hitting 3s timeout. + // First verify that the connection is alive: ++3.250 write(4, ..., 24) = 24 + // Now verify that shortly after that the socket is dead: + +.100 write(4, ..., 24) = -1 ETIMEDOUT (Connection timed out) + + +0 %{ assert tcpi_probes == 6, tcpi_probes; \ + assert tcpi_backoff == 0, tcpi_backoff }% + +0 close(4) = 0 diff --git a/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user_timeout.pkt b/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user_timeout.pkt new file mode 100644 index 0000000000000..2efe02bfba9cf --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user_timeout.pkt @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0 +`./defaults.sh` + +// Initialize connection + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:0(0) win 32792 + +0 > S. 0:0(0) ack 1 + +.1 < . 1:1(0) ack 1 win 32792 + + + +0 accept(3, ..., ...) = 4 + +// Okay, we received nothing, and decide to close this idle socket. +// We set TCP_USER_TIMEOUT to 3 seconds because really it is not worth +// trying hard to cleanly close this flow, at the price of keeping +// a TCP structure in kernel for about 1 minute ! + +2 setsockopt(4, SOL_TCP, TCP_USER_TIMEOUT, [3000], 4) = 0 + +0 close(4) = 0 + + +0 > F. 1:1(0) ack 1 + +.3~+.400 > F. 1:1(0) ack 1 + +.3~+.400 > F. 1:1(0) ack 1 + +.6~+.800 > F. 1:1(0) ack 1 + +// We finally receive something from the peer, but it is way too late +// Our socket vanished because TCP_USER_TIMEOUT was really small + +0 < . 1:2(1) ack 1 win 32792 + +0 > R 1:1(0) diff --git a/tools/testing/selftests/net/packetdrill/tcp_validate_validate-established-no-flags.pkt b/tools/testing/selftests/net/packetdrill/tcp_validate_validate-established-no-flags.pkt new file mode 100644 index 0000000000000..8bd60226ccfce --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_validate_validate-established-no-flags.pkt @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +// Verify that established connections drop a segment without the ACK flag set. + +`./defaults.sh` + +// Create a socket. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + +// Establish a connection. + +0 < S 0:0(0) win 20000 + +0 > S. 0:0(0) ack 1 + +.01 < . 1:1(0) ack 1 win 20000 + +0 accept(3, ..., ...) = 4 + +// Receive a segment with no flags set, verify that it's not enqueued. + +.01 < - 1:1001(1000) win 20000 + +0 ioctl(4, SIOCINQ, [0]) = 0 + +// Receive a segment with ACK flag set, verify that it is enqueued. + +.01 < . 1:1001(1000) ack 1 win 20000 + +0 ioctl(4, SIOCINQ, [1000]) = 0 From c58a812c8e49ad688f94f4b050ad5c5b388fc5d2 Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Wed, 18 Dec 2024 21:36:55 +0800 Subject: [PATCH 0719/1450] ring-buffer: Fix overflow in __rb_map_vma An overflow occurred when performing the following calculation: nr_pages = ((nr_subbufs + 1) << subbuf_order) - pgoff; Add a check before the calculation to avoid this problem. syzbot reported this as a slab-out-of-bounds in __rb_map_vma: BUG: KASAN: slab-out-of-bounds in __rb_map_vma+0x9ab/0xae0 kernel/trace/ring_buffer.c:7058 Read of size 8 at addr ffff8880767dd2b8 by task syz-executor187/5836 CPU: 0 UID: 0 PID: 5836 Comm: syz-executor187 Not tainted 6.13.0-rc2-syzkaller-00159-gf932fb9b4074 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/25/2024 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x116/0x1f0 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0xc3/0x620 mm/kasan/report.c:489 kasan_report+0xd9/0x110 mm/kasan/report.c:602 __rb_map_vma+0x9ab/0xae0 kernel/trace/ring_buffer.c:7058 ring_buffer_map+0x56e/0x9b0 kernel/trace/ring_buffer.c:7138 tracing_buffers_mmap+0xa6/0x120 kernel/trace/trace.c:8482 call_mmap include/linux/fs.h:2183 [inline] mmap_file mm/internal.h:124 [inline] __mmap_new_file_vma mm/vma.c:2291 [inline] __mmap_new_vma mm/vma.c:2355 [inline] __mmap_region+0x1786/0x2670 mm/vma.c:2456 mmap_region+0x127/0x320 mm/mmap.c:1348 do_mmap+0xc00/0xfc0 mm/mmap.c:496 vm_mmap_pgoff+0x1ba/0x360 mm/util.c:580 ksys_mmap_pgoff+0x32c/0x5c0 mm/mmap.c:542 __do_sys_mmap arch/x86/kernel/sys_x86_64.c:89 [inline] __se_sys_mmap arch/x86/kernel/sys_x86_64.c:82 [inline] __x64_sys_mmap+0x125/0x190 arch/x86/kernel/sys_x86_64.c:82 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x250 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f The reproducer for this bug is: ------------------------8<------------------------- #include #include #include #include #include int main(int argc, char **argv) { int page_size = getpagesize(); int fd; void *meta; system("echo 1 > /sys/kernel/tracing/buffer_size_kb"); fd = open("/sys/kernel/tracing/per_cpu/cpu0/trace_pipe_raw", O_RDONLY); meta = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, page_size * 5); } ------------------------>8------------------------- Cc: stable@vger.kernel.org Fixes: 117c39200d9d7 ("ring-buffer: Introducing ring-buffer mapping functions") Link: https://lore.kernel.org/tencent_06924B6674ED771167C23CC336C097223609@qq.com Reported-by: syzbot+345e4443a21200874b18@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=345e4443a21200874b18 Signed-off-by: Edward Adam Davis Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7e257e855dd19..60210fb5b2110 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -7019,7 +7019,11 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, lockdep_assert_held(&cpu_buffer->mapping_lock); nr_subbufs = cpu_buffer->nr_pages + 1; /* + reader-subbuf */ - nr_pages = ((nr_subbufs + 1) << subbuf_order) - pgoff; /* + meta-page */ + nr_pages = ((nr_subbufs + 1) << subbuf_order); /* + meta-page */ + if (nr_pages <= pgoff) + return -EINVAL; + + nr_pages -= pgoff; nr_vma_pages = vma_pages(vma); if (!nr_vma_pages || nr_vma_pages > nr_pages) From 8cd63406d08110c8098e1efda8aef7ddab4db348 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 18 Dec 2024 14:15:07 -0500 Subject: [PATCH 0720/1450] trace/ring-buffer: Do not use TP_printk() formatting for boot mapped buffers The TP_printk() of a TRACE_EVENT() is a generic printf format that any developer can create for their event. It may include pointers to strings and such. A boot mapped buffer may contain data from a previous kernel where the strings addresses are different. One solution is to copy the event content and update the pointers by the recorded delta, but a simpler solution (for now) is to just use the print_fields() function to print these events. The print_fields() function just iterates the fields and prints them according to what type they are, and ignores the TP_printk() format from the event itself. To understand the difference, when printing via TP_printk() the output looks like this: 4582.696626: kmem_cache_alloc: call_site=getname_flags+0x47/0x1f0 ptr=00000000e70e10e0 bytes_req=4096 bytes_alloc=4096 gfp_flags=GFP_KERNEL node=-1 accounted=false 4582.696629: kmem_cache_alloc: call_site=alloc_empty_file+0x6b/0x110 ptr=0000000095808002 bytes_req=360 bytes_alloc=384 gfp_flags=GFP_KERNEL node=-1 accounted=false 4582.696630: kmem_cache_alloc: call_site=security_file_alloc+0x24/0x100 ptr=00000000576339c3 bytes_req=16 bytes_alloc=16 gfp_flags=GFP_KERNEL|__GFP_ZERO node=-1 accounted=false 4582.696653: kmem_cache_free: call_site=do_sys_openat2+0xa7/0xd0 ptr=00000000e70e10e0 name=names_cache But when printing via print_fields() (echo 1 > /sys/kernel/tracing/options/fields) the same event output looks like this: 4582.696626: kmem_cache_alloc: call_site=0xffffffff92d10d97 (-1831793257) ptr=0xffff9e0e8571e000 (-107689771147264) bytes_req=0x1000 (4096) bytes_alloc=0x1000 (4096) gfp_flags=0xcc0 (3264) node=0xffffffff (-1) accounted=(0) 4582.696629: kmem_cache_alloc: call_site=0xffffffff92d0250b (-1831852789) ptr=0xffff9e0e8577f800 (-107689770747904) bytes_req=0x168 (360) bytes_alloc=0x180 (384) gfp_flags=0xcc0 (3264) node=0xffffffff (-1) accounted=(0) 4582.696630: kmem_cache_alloc: call_site=0xffffffff92efca74 (-1829778828) ptr=0xffff9e0e8d35d3b0 (-107689640864848) bytes_req=0x10 (16) bytes_alloc=0x10 (16) gfp_flags=0xdc0 (3520) node=0xffffffff (-1) accounted=(0) 4582.696653: kmem_cache_free: call_site=0xffffffff92cfbea7 (-1831879001) ptr=0xffff9e0e8571e000 (-107689771147264) name=names_cache Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Linus Torvalds Link: https://lore.kernel.org/20241218141507.28389a1d@gandalf.local.home Fixes: 07714b4bb3f98 ("tracing: Handle old buffer mappings for event strings and functions") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index be62f0ea1814d..6581cb2bc67f9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4353,6 +4353,15 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) if (event) { if (tr->trace_flags & TRACE_ITER_FIELDS) return print_event_fields(iter, event); + /* + * For TRACE_EVENT() events, the print_fmt is not + * safe to use if the array has delta offsets + * Force printing via the fields. + */ + if ((tr->text_delta || tr->data_delta) && + event->type > __TRACE_LAST_TYPE) + return print_event_fields(iter, event); + return event->funcs->trace(iter, sym_flags, event); } From 0674188f2f4d38d74aa863f17373d76256f2ed09 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Tue, 17 Dec 2024 15:37:04 +0800 Subject: [PATCH 0721/1450] ACPI: EC: Enable EC support on LoongArch by default Commit a6021aa24f6417416d933 ("ACPI: EC: make EC support compile-time conditional") only enable ACPI_EC on X86 by default, but the embedded controller is also widely used on LoongArch laptops so we also enable ACPI_EC for LoongArch. The laptop driver cannot work without EC, so also update the dependency of LOONGSON_LAPTOP to let it depend on APCI_EC. Fixes: a6021aa24f6417416d933 ("ACPI: EC: make EC support compile-time conditional") Reported-by: Xiaotian Wu Tested-by: Binbin Zhou Signed-off-by: Huacai Chen Link: https://patch.msgid.link/20241217073704.3339587-1-chenhuacai@loongson.cn [ rjw: Added Fixes: ] Signed-off-by: Rafael J. Wysocki --- drivers/acpi/Kconfig | 4 ++-- drivers/platform/loongarch/Kconfig | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index d65cd08ba8e18..d81b55f5068c4 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -135,10 +135,10 @@ config ACPI_REV_OVERRIDE_POSSIBLE config ACPI_EC bool "Embedded Controller" depends on HAS_IOPORT - default X86 + default X86 || LOONGARCH help This driver handles communication with the microcontroller - on many x86 laptops and other machines. + on many x86/LoongArch laptops and other machines. config ACPI_EC_DEBUGFS tristate "EC read/write access through /sys/kernel/debug/ec" diff --git a/drivers/platform/loongarch/Kconfig b/drivers/platform/loongarch/Kconfig index 5633e4d73991a..447528797d07a 100644 --- a/drivers/platform/loongarch/Kconfig +++ b/drivers/platform/loongarch/Kconfig @@ -18,7 +18,7 @@ if LOONGARCH_PLATFORM_DEVICES config LOONGSON_LAPTOP tristate "Generic Loongson-3 Laptop Driver" - depends on ACPI + depends on ACPI_EC depends on BACKLIGHT_CLASS_DEVICE depends on INPUT depends on MACH_LOONGSON64 From a7f9d98eb1202132014ba760c26ad8608ffc9caf Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 10 Dec 2024 20:44:14 -0600 Subject: [PATCH 0722/1450] drm/amd: Update strapping for NBIO 2.5.0 This helps to avoid a spurious PME event on hotplug to Azalia. Cc: Vijendar Mukunda Reported-and-tested-by: ionut_n2001@yahoo.com Closes: https://bugzilla.kernel.org/show_bug.cgi?id=215884 Tested-by: Gabriel Marcano Acked-by: Alex Deucher Link: https://lore.kernel.org/r/20241211024414.7840-1-mario.limonciello@amd.com Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher (cherry picked from commit 3f6f237b9dd189e1fb85b8a3f7c97a8f27c1e49a) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c index b1b57dcc5a737..49e953f86ced4 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c @@ -271,8 +271,19 @@ const struct nbio_hdp_flush_reg nbio_v7_0_hdp_flush_reg = { .ref_and_mask_sdma1 = GPU_HDP_FLUSH_DONE__SDMA1_MASK, }; +#define regRCC_DEV0_EPF6_STRAP4 0xd304 +#define regRCC_DEV0_EPF6_STRAP4_BASE_IDX 5 + static void nbio_v7_0_init_registers(struct amdgpu_device *adev) { + uint32_t data; + + switch (adev->ip_versions[NBIO_HWIP][0]) { + case IP_VERSION(2, 5, 0): + data = RREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF6_STRAP4) & ~BIT(23); + WREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF6_STRAP4, data); + break; + } } #define MMIO_REG_HOLE_OFFSET (0x80000 - PAGE_SIZE) From 3abb660f9e18925468685591a3702bda05faba4f Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 12 Dec 2024 16:49:20 -0500 Subject: [PATCH 0723/1450] drm/amdgpu/nbio7.0: fix IP version check Use the helper function rather than reading it directly. Reviewed-by: Yang Wang Signed-off-by: Alex Deucher (cherry picked from commit 0ec43fbece784215d3c4469973e4556d70bce915) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c index 49e953f86ced4..d1032e9992b49 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c @@ -278,7 +278,7 @@ static void nbio_v7_0_init_registers(struct amdgpu_device *adev) { uint32_t data; - switch (adev->ip_versions[NBIO_HWIP][0]) { + switch (amdgpu_ip_version(adev, NBIO_HWIP, 0)) { case IP_VERSION(2, 5, 0): data = RREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF6_STRAP4) & ~BIT(23); WREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF6_STRAP4, data); From cf2c97423a4f89c8b798294d3f34ecfe7e7035c3 Mon Sep 17 00:00:00 2001 From: David Laight Date: Sat, 14 Dec 2024 17:30:53 +0000 Subject: [PATCH 0724/1450] ipvs: Fix clamp() of ip_vs_conn_tab on small memory systems The 'max_avail' value is calculated from the system memory size using order_base_2(). order_base_2(x) is defined as '(x) ? fn(x) : 0'. The compiler generates two copies of the code that follows and then expands clamp(max, min, PAGE_SHIFT - 12) (11 on 32bit). This triggers a compile-time assert since min is 5. In reality a system would have to have less than 512MB memory for the bounds passed to clamp to be reversed. Swap the order of the arguments to clamp() to avoid the warning. Replace the clamp_val() on the line below with clamp(). clamp_val() is just 'an accident waiting to happen' and not needed here. Detected by compile time checks added to clamp(), specifically: minmax.h: use BUILD_BUG_ON_MSG() for the lo < hi test in clamp() Reported-by: Linux Kernel Functional Testing Closes: https://lore.kernel.org/all/CA+G9fYsT34UkGFKxus63H6UVpYi5GRZkezT9MRLfAbM3f6ke0g@mail.gmail.com/ Fixes: 4f325e26277b ("ipvs: dynamically limit the connection hash table") Tested-by: Bartosz Golaszewski Reviewed-by: Bartosz Golaszewski Signed-off-by: David Laight Acked-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_conn.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 98d7dbe3d7876..c0289f83f96df 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1495,8 +1495,8 @@ int __init ip_vs_conn_init(void) max_avail -= 2; /* ~4 in hash row */ max_avail -= 1; /* IPVS up to 1/2 of mem */ max_avail -= order_base_2(sizeof(struct ip_vs_conn)); - max = clamp(max, min, max_avail); - ip_vs_conn_tab_bits = clamp_val(ip_vs_conn_tab_bits, min, max); + max = clamp(max_avail, min, max); + ip_vs_conn_tab_bits = clamp(ip_vs_conn_tab_bits, min, max); ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; From c9cfced17365b1df8c6ae6cd5db56aebd7ed9b57 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Tue, 10 Dec 2024 10:27:06 +0800 Subject: [PATCH 0725/1450] net/mlx5e: Report rx_discards_phy via rx_dropped MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We noticed a high number of rx_discards_phy events on certain servers while running `ethtool -S`. However, this critical counter is not currently included in the standard /proc/net/dev statistics file, making it difficult to monitor effectively—especially given the diversity of vendors across a large fleet of servers. Let's report it via the standard rx_dropped metric. Suggested-by: Jakub Kicinski Signed-off-by: Yafang Shao Cc: Saeed Mahameed Cc: Leon Romanovsky Cc: Gal Pressman Reviewed-by: Simon Horman Reviewed-by: Tariq Toukan Link: https://patch.msgid.link/20241210022706.6665-1-laoar.shao@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index dd16d73000c31..15e765a41d729 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3946,6 +3946,7 @@ mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) } stats->rx_missed_errors = priv->stats.qcnt.rx_out_of_buffer; + stats->rx_dropped = PPORT_2863_GET(pstats, if_in_discards); stats->rx_length_errors = PPORT_802_3_GET(pstats, a_in_range_length_errors) + From 70b6f46a4ed8bd56c85ffff22df91e20e8c85e33 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 17 Dec 2024 20:56:55 +0100 Subject: [PATCH 0726/1450] netfilter: ipset: Fix for recursive locking warning With CONFIG_PROVE_LOCKING, when creating a set of type bitmap:ip, adding it to a set of type list:set and populating it from iptables SET target triggers a kernel warning: | WARNING: possible recursive locking detected | 6.12.0-rc7-01692-g5e9a28f41134-dirty #594 Not tainted | -------------------------------------------- | ping/4018 is trying to acquire lock: | ffff8881094a6848 (&set->lock){+.-.}-{2:2}, at: ip_set_add+0x28c/0x360 [ip_set] | | but task is already holding lock: | ffff88811034c048 (&set->lock){+.-.}-{2:2}, at: ip_set_add+0x28c/0x360 [ip_set] This is a false alarm: ipset does not allow nested list:set type, so the loop in list_set_kadd() can never encounter the outer set itself. No other set type supports embedded sets, so this is the only case to consider. To avoid the false report, create a distinct lock class for list:set type ipset locks. Fixes: f830837f0eed ("netfilter: ipset: list:set set type support") Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_list_set.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index bfae7066936bb..db794fe1300e6 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -611,6 +611,8 @@ init_list_set(struct net *net, struct ip_set *set, u32 size) return true; } +static struct lock_class_key list_set_lockdep_key; + static int list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[], u32 flags) @@ -627,6 +629,7 @@ list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[], if (size < IP_SET_LIST_MIN_SIZE) size = IP_SET_LIST_MIN_SIZE; + lockdep_set_class(&set->lock, &list_set_lockdep_key); set->variant = &set_variant; set->dsize = ip_set_elem_len(set, tb, sizeof(struct set_elem), __alignof__(struct set_elem)); From b3ded6072c5600704cfa3ce3a8dc8718d34bda66 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 16 Nov 2024 21:36:47 +0100 Subject: [PATCH 0727/1450] power: supply: bq24190: Fix BQ24296 Vbus regulator support There are 2 issues with bq24296_set_otg_vbus(): 1. When writing the OTG_CONFIG bit it uses POC_CHG_CONFIG_SHIFT which should be POC_OTG_CONFIG_SHIFT. 2. When turning the regulator off it never turns charging back on. Note this must be done through bq24190_charger_set_charge_type(), to ensure that the charge_type property value of none/trickle/fast is honored. Resolve both issues to fix BQ24296 Vbus regulator support not working. Fixes: b150a703b56f ("power: supply: bq24190_charger: Add support for BQ24296") Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20241116203648.169100-2-hdegoede@redhat.com Signed-off-by: Sebastian Reichel --- drivers/power/supply/bq24190_charger.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/power/supply/bq24190_charger.c b/drivers/power/supply/bq24190_charger.c index 2b393eb5c2820..c47f32f152e60 100644 --- a/drivers/power/supply/bq24190_charger.c +++ b/drivers/power/supply/bq24190_charger.c @@ -567,6 +567,7 @@ static int bq24190_set_otg_vbus(struct bq24190_dev_info *bdi, bool enable) static int bq24296_set_otg_vbus(struct bq24190_dev_info *bdi, bool enable) { + union power_supply_propval val = { .intval = bdi->charge_type }; int ret; ret = pm_runtime_resume_and_get(bdi->dev); @@ -587,13 +588,18 @@ static int bq24296_set_otg_vbus(struct bq24190_dev_info *bdi, bool enable) ret = bq24190_write_mask(bdi, BQ24190_REG_POC, BQ24296_REG_POC_OTG_CONFIG_MASK, - BQ24296_REG_POC_CHG_CONFIG_SHIFT, + BQ24296_REG_POC_OTG_CONFIG_SHIFT, BQ24296_REG_POC_OTG_CONFIG_OTG); - } else + } else { ret = bq24190_write_mask(bdi, BQ24190_REG_POC, BQ24296_REG_POC_OTG_CONFIG_MASK, - BQ24296_REG_POC_CHG_CONFIG_SHIFT, + BQ24296_REG_POC_OTG_CONFIG_SHIFT, BQ24296_REG_POC_OTG_CONFIG_DISABLE); + if (ret < 0) + goto out; + + ret = bq24190_charger_set_charge_type(bdi, &val); + } out: pm_runtime_mark_last_busy(bdi->dev); From cff865c700711ecc3824b2dfe181637f3ed23c80 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 17 Dec 2024 09:10:34 +0100 Subject: [PATCH 0728/1450] net: phy: avoid undefined behavior in *_led_polarity_set() gcc runs into undefined behavior at the end of the three led_polarity_set() callback functions if it were called with a zero 'modes' argument and it just ends the function there without returning from it. This gets flagged by 'objtool' as a function that continues on to the next one: drivers/net/phy/aquantia/aquantia_leds.o: warning: objtool: aqr_phy_led_polarity_set+0xf: can't find jump dest instruction at .text+0x5d9 drivers/net/phy/intel-xway.o: warning: objtool: xway_gphy_led_polarity_set() falls through to next function xway_gphy_config_init() drivers/net/phy/mxl-gpy.o: warning: objtool: gpy_led_polarity_set() falls through to next function gpy_led_hw_control_get() There is no point to micro-optimize the behavior here to save a single-digit number of bytes in the kernel, so just change this to a "return -EINVAL" as we do when any unexpected bits are set. Fixes: 1758af47b98c ("net: phy: intel-xway: add support for PHY LEDs") Fixes: 9d55e68b19f2 ("net: phy: aquantia: correctly describe LED polarity override") Fixes: eb89c79c1b8f ("net: phy: mxl-gpy: correctly describe LED polarity") Signed-off-by: Arnd Bergmann Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241217081056.238792-1-arnd@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/phy/aquantia/aquantia_leds.c | 2 +- drivers/net/phy/intel-xway.c | 2 +- drivers/net/phy/mxl-gpy.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/phy/aquantia/aquantia_leds.c b/drivers/net/phy/aquantia/aquantia_leds.c index 00ad2313fed36..951f46104eff0 100644 --- a/drivers/net/phy/aquantia/aquantia_leds.c +++ b/drivers/net/phy/aquantia/aquantia_leds.c @@ -156,5 +156,5 @@ int aqr_phy_led_polarity_set(struct phy_device *phydev, int index, unsigned long if (force_active_high || force_active_low) return aqr_phy_led_active_low_set(phydev, index, force_active_low); - unreachable(); + return -EINVAL; } diff --git a/drivers/net/phy/intel-xway.c b/drivers/net/phy/intel-xway.c index b672c55a7a4e7..e6ed2413e514c 100644 --- a/drivers/net/phy/intel-xway.c +++ b/drivers/net/phy/intel-xway.c @@ -529,7 +529,7 @@ static int xway_gphy_led_polarity_set(struct phy_device *phydev, int index, if (force_active_high) return phy_clear_bits(phydev, XWAY_MDIO_LED, XWAY_GPHY_LED_INV(index)); - unreachable(); + return -EINVAL; } static struct phy_driver xway_gphy[] = { diff --git a/drivers/net/phy/mxl-gpy.c b/drivers/net/phy/mxl-gpy.c index db3c1f72b4073..a8ccf257c1094 100644 --- a/drivers/net/phy/mxl-gpy.c +++ b/drivers/net/phy/mxl-gpy.c @@ -1014,7 +1014,7 @@ static int gpy_led_polarity_set(struct phy_device *phydev, int index, if (force_active_high) return phy_clear_bits(phydev, PHY_LED, PHY_LED_POLARITY(index)); - unreachable(); + return -EINVAL; } static struct phy_driver gpy_drivers[] = { From 12d908116f7efd34f255a482b9afc729d7a5fb78 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 18 Dec 2024 17:56:25 +0100 Subject: [PATCH 0729/1450] io_uring: Fix registered ring file refcount leak Currently, io_uring_unreg_ringfd() (which cleans up registered rings) is only called on exit, but __io_uring_free (which frees the tctx in which the registered ring pointers are stored) is also called on execve (via begin_new_exec -> io_uring_task_cancel -> __io_uring_cancel -> io_uring_cancel_generic -> __io_uring_free). This means: A process going through execve while having registered rings will leak references to the rings' `struct file`. Fix it by zapping registered rings on execve(). This is implemented by moving the io_uring_unreg_ringfd() from io_uring_files_cancel() into its callee __io_uring_cancel(), which is called from io_uring_task_cancel() on execve. This could probably be exploited *on 32-bit kernels* by leaking 2^32 references to the same ring, because the file refcount is stored in a pointer-sized field and get_file() doesn't have protection against refcount overflow, just a WARN_ONCE(); but on 64-bit it should have no impact beyond a memory leak. Cc: stable@vger.kernel.org Fixes: e7a6c00dc77a ("io_uring: add support for registering ring file descriptors") Signed-off-by: Jann Horn Link: https://lore.kernel.org/r/20241218-uring-reg-ring-cleanup-v1-1-8f63e999045b@google.com Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 4 +--- io_uring/io_uring.c | 1 + 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index e123d5e17b526..85fe4e6b275c7 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -15,10 +15,8 @@ bool io_is_uring_fops(struct file *file); static inline void io_uring_files_cancel(void) { - if (current->io_uring) { - io_uring_unreg_ringfd(); + if (current->io_uring) __io_uring_cancel(false); - } } static inline void io_uring_task_cancel(void) { diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 605625e932eb8..432b95ca9c85e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3214,6 +3214,7 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) void __io_uring_cancel(bool cancel_all) { + io_uring_unreg_ringfd(); io_uring_cancel_generic(cancel_all, NULL); } From 5c964c8a97c12145104f5d2782aa1ffccf3a93dd Mon Sep 17 00:00:00 2001 From: Martin Hou Date: Mon, 16 Dec 2024 11:06:18 +0800 Subject: [PATCH 0730/1450] net: usb: qmi_wwan: add Quectel RG255C Add support for Quectel RG255C which is based on Qualcomm SDX35 chip. The composition is DM / NMEA / AT / QMI. T: Bus=01 Lev=01 Prnt=01 Port=04 Cnt=01 Dev#= 2 Spd=480 MxCh= 0 D: Ver= 2.01 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=2c7c ProdID=0316 Rev= 5.15 S: Manufacturer=Quectel S: Product=RG255C-CN S: SerialNumber=c68192c1 C:* #Ifs= 4 Cfg#= 1 Atr=a0 MxPwr=500mA I:* If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=50 Driver=qmi_wwan E: Ad=86(I) Atr=03(Int.) MxPS= 8 Ivl=32ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms Signed-off-by: Martin Hou Link: https://patch.msgid.link/tencent_17DDD787B48E8A5AB8379ED69E23A0CD9309@qq.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 9fe7f704a2f7b..e9208a8d2bfa6 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1429,6 +1429,7 @@ static const struct usb_device_id products[] = { {QMI_QUIRK_SET_DTR(0x2c7c, 0x0195, 4)}, /* Quectel EG95 */ {QMI_FIXED_INTF(0x2c7c, 0x0296, 4)}, /* Quectel BG96 */ {QMI_QUIRK_SET_DTR(0x2c7c, 0x030e, 4)}, /* Quectel EM05GV2 */ + {QMI_QUIRK_SET_DTR(0x2c7c, 0x0316, 3)}, /* Quectel RG255C */ {QMI_QUIRK_SET_DTR(0x2cb7, 0x0104, 4)}, /* Fibocom NL678 series */ {QMI_QUIRK_SET_DTR(0x2cb7, 0x0112, 0)}, /* Fibocom FG132 */ {QMI_FIXED_INTF(0x0489, 0xe0b4, 0)}, /* Foxconn T77W968 LTE */ From 65c233d8e329c152e88fe796155702fd21028883 Mon Sep 17 00:00:00 2001 From: shunlizhou Date: Mon, 16 Dec 2024 13:54:46 +0000 Subject: [PATCH 0731/1450] docs: net: bonding: fix typos The bonding documentation had several "insure" which is not properly used in the context. Suggest to change to "ensure" to improve readability. Signed-off-by: shunlizhou Link: https://patch.msgid.link/20241216135447.57681-1-shunlizhou@aliyun.com Signed-off-by: Jakub Kicinski --- Documentation/networking/bonding.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst index 7c8d22d686824..a4c1291d25611 100644 --- a/Documentation/networking/bonding.rst +++ b/Documentation/networking/bonding.rst @@ -1963,7 +1963,7 @@ obtain its hardware address from the first slave, which might not match the hardware address of the VLAN interfaces (which was ultimately copied from an earlier slave). -There are two methods to insure that the VLAN device operates +There are two methods to ensure that the VLAN device operates with the correct hardware address if all slaves are removed from a bond interface: @@ -2078,7 +2078,7 @@ as an unsolicited ARP reply (because ARP matches replies on an interface basis), and is discarded. The MII monitor is not affected by the state of the routing table. -The solution here is simply to insure that slaves do not have +The solution here is simply to ensure that slaves do not have routes of their own, and if for some reason they must, those routes do not supersede routes of their master. This should generally be the case, but unusual configurations or errant manual or automatic static @@ -2295,7 +2295,7 @@ active-backup: the switches have an ISL and play together well. If the network configuration is such that one switch is specifically a backup switch (e.g., has lower capacity, higher cost, etc), - then the primary option can be used to insure that the + then the primary option can be used to ensure that the preferred link is always used when it is available. broadcast: @@ -2322,7 +2322,7 @@ monitor can provide a higher level of reliability in detecting end to end connectivity failures (which may be caused by the failure of any individual component to pass traffic for any reason). Additionally, the ARP monitor should be configured with multiple targets (at least -one for each switch in the network). This will insure that, +one for each switch in the network). This will ensure that, regardless of which switch is active, the ARP monitor has a suitable target to query. From a126061c80d5efb4baef4bcf346094139cd81df6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 17 Dec 2024 13:51:21 +0000 Subject: [PATCH 0732/1450] ptr_ring: do not block hard interrupts in ptr_ring_resize_multiple() Jakub added a lockdep_assert_no_hardirq() check in __page_pool_put_page() to increase test coverage. syzbot found a splat caused by hard irq blocking in ptr_ring_resize_multiple() [1] As current users of ptr_ring_resize_multiple() do not require hard irqs being masked, replace it to only block BH. Rename helpers to better reflect they are safe against BH only. - ptr_ring_resize_multiple() to ptr_ring_resize_multiple_bh() - skb_array_resize_multiple() to skb_array_resize_multiple_bh() [1] WARNING: CPU: 1 PID: 9150 at net/core/page_pool.c:709 __page_pool_put_page net/core/page_pool.c:709 [inline] WARNING: CPU: 1 PID: 9150 at net/core/page_pool.c:709 page_pool_put_unrefed_netmem+0x157/0xa40 net/core/page_pool.c:780 Modules linked in: CPU: 1 UID: 0 PID: 9150 Comm: syz.1.1052 Not tainted 6.11.0-rc3-syzkaller-00202-gf8669d7b5f5d #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/06/2024 RIP: 0010:__page_pool_put_page net/core/page_pool.c:709 [inline] RIP: 0010:page_pool_put_unrefed_netmem+0x157/0xa40 net/core/page_pool.c:780 Code: 74 0e e8 7c aa fb f7 eb 43 e8 75 aa fb f7 eb 3c 65 8b 1d 38 a8 6a 76 31 ff 89 de e8 a3 ae fb f7 85 db 74 0b e8 5a aa fb f7 90 <0f> 0b 90 eb 1d 65 8b 1d 15 a8 6a 76 31 ff 89 de e8 84 ae fb f7 85 RSP: 0018:ffffc9000bda6b58 EFLAGS: 00010083 RAX: ffffffff8997e523 RBX: 0000000000000000 RCX: 0000000000040000 RDX: ffffc9000fbd0000 RSI: 0000000000001842 RDI: 0000000000001843 RBP: 0000000000000000 R08: ffffffff8997df2c R09: 1ffffd40003a000d R10: dffffc0000000000 R11: fffff940003a000e R12: ffffea0001d00040 R13: ffff88802e8a4000 R14: dffffc0000000000 R15: 00000000ffffffff FS: 00007fb7aaf716c0(0000) GS:ffff8880b9300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fa15a0d4b72 CR3: 00000000561b0000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: tun_ptr_free drivers/net/tun.c:617 [inline] __ptr_ring_swap_queue include/linux/ptr_ring.h:571 [inline] ptr_ring_resize_multiple_noprof include/linux/ptr_ring.h:643 [inline] tun_queue_resize drivers/net/tun.c:3694 [inline] tun_device_event+0xaaf/0x1080 drivers/net/tun.c:3714 notifier_call_chain+0x19f/0x3e0 kernel/notifier.c:93 call_netdevice_notifiers_extack net/core/dev.c:2032 [inline] call_netdevice_notifiers net/core/dev.c:2046 [inline] dev_change_tx_queue_len+0x158/0x2a0 net/core/dev.c:9024 do_setlink+0xff6/0x41f0 net/core/rtnetlink.c:2923 rtnl_setlink+0x40d/0x5a0 net/core/rtnetlink.c:3201 rtnetlink_rcv_msg+0x73f/0xcf0 net/core/rtnetlink.c:6647 netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2550 Fixes: ff4e538c8c3e ("page_pool: add a lockdep check for recycling in hardirq") Reported-by: syzbot+f56a5c5eac2b28439810@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/671e10df.050a0220.2b8c0f.01cf.GAE@google.com/T/ Signed-off-by: Eric Dumazet Acked-by: Michael S. Tsirkin Acked-by: Jason Wang Link: https://patch.msgid.link/20241217135121.326370-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/tap.c | 6 +++--- drivers/net/tun.c | 6 +++--- include/linux/ptr_ring.h | 21 ++++++++++----------- include/linux/skb_array.h | 17 +++++++++-------- net/sched/sch_generic.c | 4 ++-- 5 files changed, 27 insertions(+), 27 deletions(-) diff --git a/drivers/net/tap.c b/drivers/net/tap.c index 5aa41d5f7765a..5ca6ecf0ce5fb 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -1329,9 +1329,9 @@ int tap_queue_resize(struct tap_dev *tap) list_for_each_entry(q, &tap->queue_list, next) rings[i++] = &q->ring; - ret = ptr_ring_resize_multiple(rings, n, - dev->tx_queue_len, GFP_KERNEL, - __skb_array_destroy_skb); + ret = ptr_ring_resize_multiple_bh(rings, n, + dev->tx_queue_len, GFP_KERNEL, + __skb_array_destroy_skb); kfree(rings); return ret; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 8e94df88392c6..41e3eeac06fdc 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -3701,9 +3701,9 @@ static int tun_queue_resize(struct tun_struct *tun) list_for_each_entry(tfile, &tun->disabled, next) rings[i++] = &tfile->tx_ring; - ret = ptr_ring_resize_multiple(rings, n, - dev->tx_queue_len, GFP_KERNEL, - tun_ptr_free); + ret = ptr_ring_resize_multiple_bh(rings, n, + dev->tx_queue_len, GFP_KERNEL, + tun_ptr_free); kfree(rings); return ret; diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index fd037c127bb07..551329220e4f3 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -615,15 +615,14 @@ static inline int ptr_ring_resize_noprof(struct ptr_ring *r, int size, gfp_t gfp /* * Note: producer lock is nested within consumer lock, so if you * resize you must make sure all uses nest correctly. - * In particular if you consume ring in interrupt or BH context, you must - * disable interrupts/BH when doing so. + * In particular if you consume ring in BH context, you must + * disable BH when doing so. */ -static inline int ptr_ring_resize_multiple_noprof(struct ptr_ring **rings, - unsigned int nrings, - int size, - gfp_t gfp, void (*destroy)(void *)) +static inline int ptr_ring_resize_multiple_bh_noprof(struct ptr_ring **rings, + unsigned int nrings, + int size, gfp_t gfp, + void (*destroy)(void *)) { - unsigned long flags; void ***queues; int i; @@ -638,12 +637,12 @@ static inline int ptr_ring_resize_multiple_noprof(struct ptr_ring **rings, } for (i = 0; i < nrings; ++i) { - spin_lock_irqsave(&(rings[i])->consumer_lock, flags); + spin_lock_bh(&(rings[i])->consumer_lock); spin_lock(&(rings[i])->producer_lock); queues[i] = __ptr_ring_swap_queue(rings[i], queues[i], size, gfp, destroy); spin_unlock(&(rings[i])->producer_lock); - spin_unlock_irqrestore(&(rings[i])->consumer_lock, flags); + spin_unlock_bh(&(rings[i])->consumer_lock); } for (i = 0; i < nrings; ++i) @@ -662,8 +661,8 @@ static inline int ptr_ring_resize_multiple_noprof(struct ptr_ring **rings, noqueues: return -ENOMEM; } -#define ptr_ring_resize_multiple(...) \ - alloc_hooks(ptr_ring_resize_multiple_noprof(__VA_ARGS__)) +#define ptr_ring_resize_multiple_bh(...) \ + alloc_hooks(ptr_ring_resize_multiple_bh_noprof(__VA_ARGS__)) static inline void ptr_ring_cleanup(struct ptr_ring *r, void (*destroy)(void *)) { diff --git a/include/linux/skb_array.h b/include/linux/skb_array.h index 926496c9cc9c3..bf178238a3083 100644 --- a/include/linux/skb_array.h +++ b/include/linux/skb_array.h @@ -199,17 +199,18 @@ static inline int skb_array_resize(struct skb_array *a, int size, gfp_t gfp) return ptr_ring_resize(&a->ring, size, gfp, __skb_array_destroy_skb); } -static inline int skb_array_resize_multiple_noprof(struct skb_array **rings, - int nrings, unsigned int size, - gfp_t gfp) +static inline int skb_array_resize_multiple_bh_noprof(struct skb_array **rings, + int nrings, + unsigned int size, + gfp_t gfp) { BUILD_BUG_ON(offsetof(struct skb_array, ring)); - return ptr_ring_resize_multiple_noprof((struct ptr_ring **)rings, - nrings, size, gfp, - __skb_array_destroy_skb); + return ptr_ring_resize_multiple_bh_noprof((struct ptr_ring **)rings, + nrings, size, gfp, + __skb_array_destroy_skb); } -#define skb_array_resize_multiple(...) \ - alloc_hooks(skb_array_resize_multiple_noprof(__VA_ARGS__)) +#define skb_array_resize_multiple_bh(...) \ + alloc_hooks(skb_array_resize_multiple_bh_noprof(__VA_ARGS__)) static inline void skb_array_cleanup(struct skb_array *a) { diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 38ec18f73de43..8874ae6680952 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -911,8 +911,8 @@ static int pfifo_fast_change_tx_queue_len(struct Qdisc *sch, bands[prio] = q; } - return skb_array_resize_multiple(bands, PFIFO_FAST_BANDS, new_len, - GFP_KERNEL); + return skb_array_resize_multiple_bh(bands, PFIFO_FAST_BANDS, new_len, + GFP_KERNEL); } struct Qdisc_ops pfifo_fast_ops __read_mostly = { From dbf8be8218e7ff2ee2bbeebc91bf0e0c58a8c60b Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 8 Nov 2024 13:57:06 +0000 Subject: [PATCH 0733/1450] docs/mm: add VMA locks documentation Locking around VMAs is complicated and confusing. While we have a number of disparate comments scattered around the place, we seem to be reaching a level of complexity that justifies a serious effort at clearly documenting how locks are expected to be used when it comes to interacting with mm_struct and vm_area_struct objects. This is especially pertinent as regards the efforts to find sensible abstractions for these fundamental objects in kernel rust code whose compiler strictly requires some means of expressing these rules (and through this expression, self-document these requirements as well as enforce them). The document limits scope to mmap and VMA locks and those that are immediately adjacent and relevant to them - so additionally covers page table locking as this is so very closely tied to VMA operations (and relies upon us handling these correctly). The document tries to cover some of the nastier and more confusing edge cases and concerns especially around lock ordering and page table teardown. The document is split between generally useful information for users of mm interfaces, and separately a section intended for mm kernel developers providing a discussion around internal implementation details. [lorenzo.stoakes@oracle.com: v3] Link: https://lkml.kernel.org/r/20241114205402.859737-1-lorenzo.stoakes@oracle.com [lorenzo.stoakes@oracle.com: docs/mm: minor corrections] Link: https://lkml.kernel.org/r/d3de735a-25ae-4eb2-866c-a9624fe6f795@lucifer.local [jannh@google.com: docs/mm: add more warnings around page table access] Link: https://lkml.kernel.org/r/20241118-vma-docs-addition1-onv3-v2-1-c9d5395b72ee@google.com Link: https://lkml.kernel.org/r/20241108135708.48567-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Qi Zheng Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Bagas Sanjaya Reviewed-by: Jann Horn Cc: Alice Ryhl Cc: Boqun Feng Cc: Hillf Danton Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Matthew Wilcox Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/mm/process_addrs.rst | 850 +++++++++++++++++++++++++++++ 1 file changed, 850 insertions(+) diff --git a/Documentation/mm/process_addrs.rst b/Documentation/mm/process_addrs.rst index e8618fbc62c9e..1d416658d7f59 100644 --- a/Documentation/mm/process_addrs.rst +++ b/Documentation/mm/process_addrs.rst @@ -3,3 +3,853 @@ ================= Process Addresses ================= + +.. toctree:: + :maxdepth: 3 + + +Userland memory ranges are tracked by the kernel via Virtual Memory Areas or +'VMA's of type :c:struct:`!struct vm_area_struct`. + +Each VMA describes a virtually contiguous memory range with identical +attributes, each described by a :c:struct:`!struct vm_area_struct` +object. Userland access outside of VMAs is invalid except in the case where an +adjacent stack VMA could be extended to contain the accessed address. + +All VMAs are contained within one and only one virtual address space, described +by a :c:struct:`!struct mm_struct` object which is referenced by all tasks (that is, +threads) which share the virtual address space. We refer to this as the +:c:struct:`!mm`. + +Each mm object contains a maple tree data structure which describes all VMAs +within the virtual address space. + +.. note:: An exception to this is the 'gate' VMA which is provided by + architectures which use :c:struct:`!vsyscall` and is a global static + object which does not belong to any specific mm. + +------- +Locking +------- + +The kernel is designed to be highly scalable against concurrent read operations +on VMA **metadata** so a complicated set of locks are required to ensure memory +corruption does not occur. + +.. note:: Locking VMAs for their metadata does not have any impact on the memory + they describe nor the page tables that map them. + +Terminology +----------- + +* **mmap locks** - Each MM has a read/write semaphore :c:member:`!mmap_lock` + which locks at a process address space granularity which can be acquired via + :c:func:`!mmap_read_lock`, :c:func:`!mmap_write_lock` and variants. +* **VMA locks** - The VMA lock is at VMA granularity (of course) which behaves + as a read/write semaphore in practice. A VMA read lock is obtained via + :c:func:`!lock_vma_under_rcu` (and unlocked via :c:func:`!vma_end_read`) and a + write lock via :c:func:`!vma_start_write` (all VMA write locks are unlocked + automatically when the mmap write lock is released). To take a VMA write lock + you **must** have already acquired an :c:func:`!mmap_write_lock`. +* **rmap locks** - When trying to access VMAs through the reverse mapping via a + :c:struct:`!struct address_space` or :c:struct:`!struct anon_vma` object + (reachable from a folio via :c:member:`!folio->mapping`). VMAs must be stabilised via + :c:func:`!anon_vma_[try]lock_read` or :c:func:`!anon_vma_[try]lock_write` for + anonymous memory and :c:func:`!i_mmap_[try]lock_read` or + :c:func:`!i_mmap_[try]lock_write` for file-backed memory. We refer to these + locks as the reverse mapping locks, or 'rmap locks' for brevity. + +We discuss page table locks separately in the dedicated section below. + +The first thing **any** of these locks achieve is to **stabilise** the VMA +within the MM tree. That is, guaranteeing that the VMA object will not be +deleted from under you nor modified (except for some specific fields +described below). + +Stabilising a VMA also keeps the address space described by it around. + +Lock usage +---------- + +If you want to **read** VMA metadata fields or just keep the VMA stable, you +must do one of the following: + +* Obtain an mmap read lock at the MM granularity via :c:func:`!mmap_read_lock` (or a + suitable variant), unlocking it with a matching :c:func:`!mmap_read_unlock` when + you're done with the VMA, *or* +* Try to obtain a VMA read lock via :c:func:`!lock_vma_under_rcu`. This tries to + acquire the lock atomically so might fail, in which case fall-back logic is + required to instead obtain an mmap read lock if this returns :c:macro:`!NULL`, + *or* +* Acquire an rmap lock before traversing the locked interval tree (whether + anonymous or file-backed) to obtain the required VMA. + +If you want to **write** VMA metadata fields, then things vary depending on the +field (we explore each VMA field in detail below). For the majority you must: + +* Obtain an mmap write lock at the MM granularity via :c:func:`!mmap_write_lock` (or a + suitable variant), unlocking it with a matching :c:func:`!mmap_write_unlock` when + you're done with the VMA, *and* +* Obtain a VMA write lock via :c:func:`!vma_start_write` for each VMA you wish to + modify, which will be released automatically when :c:func:`!mmap_write_unlock` is + called. +* If you want to be able to write to **any** field, you must also hide the VMA + from the reverse mapping by obtaining an **rmap write lock**. + +VMA locks are special in that you must obtain an mmap **write** lock **first** +in order to obtain a VMA **write** lock. A VMA **read** lock however can be +obtained without any other lock (:c:func:`!lock_vma_under_rcu` will acquire then +release an RCU lock to lookup the VMA for you). + +This constrains the impact of writers on readers, as a writer can interact with +one VMA while a reader interacts with another simultaneously. + +.. note:: The primary users of VMA read locks are page fault handlers, which + means that without a VMA write lock, page faults will run concurrent with + whatever you are doing. + +Examining all valid lock states: + +.. table:: + + ========= ======== ========= ======= ===== =========== ========== + mmap lock VMA lock rmap lock Stable? Read? Write most? Write all? + ========= ======== ========= ======= ===== =========== ========== + \- \- \- N N N N + \- R \- Y Y N N + \- \- R/W Y Y N N + R/W \-/R \-/R/W Y Y N N + W W \-/R Y Y Y N + W W W Y Y Y Y + ========= ======== ========= ======= ===== =========== ========== + +.. warning:: While it's possible to obtain a VMA lock while holding an mmap read lock, + attempting to do the reverse is invalid as it can result in deadlock - if + another task already holds an mmap write lock and attempts to acquire a VMA + write lock that will deadlock on the VMA read lock. + +All of these locks behave as read/write semaphores in practice, so you can +obtain either a read or a write lock for each of these. + +.. note:: Generally speaking, a read/write semaphore is a class of lock which + permits concurrent readers. However a write lock can only be obtained + once all readers have left the critical region (and pending readers + made to wait). + + This renders read locks on a read/write semaphore concurrent with other + readers and write locks exclusive against all others holding the semaphore. + +VMA fields +^^^^^^^^^^ + +We can subdivide :c:struct:`!struct vm_area_struct` fields by their purpose, which makes it +easier to explore their locking characteristics: + +.. note:: We exclude VMA lock-specific fields here to avoid confusion, as these + are in effect an internal implementation detail. + +.. table:: Virtual layout fields + + ===================== ======================================== =========== + Field Description Write lock + ===================== ======================================== =========== + :c:member:`!vm_start` Inclusive start virtual address of range mmap write, + VMA describes. VMA write, + rmap write. + :c:member:`!vm_end` Exclusive end virtual address of range mmap write, + VMA describes. VMA write, + rmap write. + :c:member:`!vm_pgoff` Describes the page offset into the file, mmap write, + the original page offset within the VMA write, + virtual address space (prior to any rmap write. + :c:func:`!mremap`), or PFN if a PFN map + and the architecture does not support + :c:macro:`!CONFIG_ARCH_HAS_PTE_SPECIAL`. + ===================== ======================================== =========== + +These fields describes the size, start and end of the VMA, and as such cannot be +modified without first being hidden from the reverse mapping since these fields +are used to locate VMAs within the reverse mapping interval trees. + +.. table:: Core fields + + ============================ ======================================== ========================= + Field Description Write lock + ============================ ======================================== ========================= + :c:member:`!vm_mm` Containing mm_struct. None - written once on + initial map. + :c:member:`!vm_page_prot` Architecture-specific page table mmap write, VMA write. + protection bits determined from VMA + flags. + :c:member:`!vm_flags` Read-only access to VMA flags describing N/A + attributes of the VMA, in union with + private writable + :c:member:`!__vm_flags`. + :c:member:`!__vm_flags` Private, writable access to VMA flags mmap write, VMA write. + field, updated by + :c:func:`!vm_flags_*` functions. + :c:member:`!vm_file` If the VMA is file-backed, points to a None - written once on + struct file object describing the initial map. + underlying file, if anonymous then + :c:macro:`!NULL`. + :c:member:`!vm_ops` If the VMA is file-backed, then either None - Written once on + the driver or file-system provides a initial map by + :c:struct:`!struct vm_operations_struct` :c:func:`!f_ops->mmap()`. + object describing callbacks to be + invoked on VMA lifetime events. + :c:member:`!vm_private_data` A :c:member:`!void *` field for Handled by driver. + driver-specific metadata. + ============================ ======================================== ========================= + +These are the core fields which describe the MM the VMA belongs to and its attributes. + +.. table:: Config-specific fields + + ================================= ===================== ======================================== =============== + Field Configuration option Description Write lock + ================================= ===================== ======================================== =============== + :c:member:`!anon_name` CONFIG_ANON_VMA_NAME A field for storing a mmap write, + :c:struct:`!struct anon_vma_name` VMA write. + object providing a name for anonymous + mappings, or :c:macro:`!NULL` if none + is set or the VMA is file-backed. The + underlying object is reference counted + and can be shared across multiple VMAs + for scalability. + :c:member:`!swap_readahead_info` CONFIG_SWAP Metadata used by the swap mechanism mmap read, + to perform readahead. This field is swap-specific + accessed atomically. lock. + :c:member:`!vm_policy` CONFIG_NUMA :c:type:`!mempolicy` object which mmap write, + describes the NUMA behaviour of the VMA write. + VMA. The underlying object is reference + counted. + :c:member:`!numab_state` CONFIG_NUMA_BALANCING :c:type:`!vma_numab_state` object which mmap read, + describes the current state of numab-specific + NUMA balancing in relation to this VMA. lock. + Updated under mmap read lock by + :c:func:`!task_numa_work`. + :c:member:`!vm_userfaultfd_ctx` CONFIG_USERFAULTFD Userfaultfd context wrapper object of mmap write, + type :c:type:`!vm_userfaultfd_ctx`, VMA write. + either of zero size if userfaultfd is + disabled, or containing a pointer + to an underlying + :c:type:`!userfaultfd_ctx` object which + describes userfaultfd metadata. + ================================= ===================== ======================================== =============== + +These fields are present or not depending on whether the relevant kernel +configuration option is set. + +.. table:: Reverse mapping fields + + =================================== ========================================= ============================ + Field Description Write lock + =================================== ========================================= ============================ + :c:member:`!shared.rb` A red/black tree node used, if the mmap write, VMA write, + mapping is file-backed, to place the VMA i_mmap write. + in the + :c:member:`!struct address_space->i_mmap` + red/black interval tree. + :c:member:`!shared.rb_subtree_last` Metadata used for management of the mmap write, VMA write, + interval tree if the VMA is file-backed. i_mmap write. + :c:member:`!anon_vma_chain` List of pointers to both forked/CoW’d mmap read, anon_vma write. + :c:type:`!anon_vma` objects and + :c:member:`!vma->anon_vma` if it is + non-:c:macro:`!NULL`. + :c:member:`!anon_vma` :c:type:`!anon_vma` object used by When :c:macro:`NULL` and + anonymous folios mapped exclusively to setting non-:c:macro:`NULL`: + this VMA. Initially set by mmap read, page_table_lock. + :c:func:`!anon_vma_prepare` serialised + by the :c:macro:`!page_table_lock`. This When non-:c:macro:`NULL` and + is set as soon as any page is faulted in. setting :c:macro:`NULL`: + mmap write, VMA write, + anon_vma write. + =================================== ========================================= ============================ + +These fields are used to both place the VMA within the reverse mapping, and for +anonymous mappings, to be able to access both related :c:struct:`!struct anon_vma` objects +and the :c:struct:`!struct anon_vma` in which folios mapped exclusively to this VMA should +reside. + +.. note:: If a file-backed mapping is mapped with :c:macro:`!MAP_PRIVATE` set + then it can be in both the :c:type:`!anon_vma` and :c:type:`!i_mmap` + trees at the same time, so all of these fields might be utilised at + once. + +Page tables +----------- + +We won't speak exhaustively on the subject but broadly speaking, page tables map +virtual addresses to physical ones through a series of page tables, each of +which contain entries with physical addresses for the next page table level +(along with flags), and at the leaf level the physical addresses of the +underlying physical data pages or a special entry such as a swap entry, +migration entry or other special marker. Offsets into these pages are provided +by the virtual address itself. + +In Linux these are divided into five levels - PGD, P4D, PUD, PMD and PTE. Huge +pages might eliminate one or two of these levels, but when this is the case we +typically refer to the leaf level as the PTE level regardless. + +.. note:: In instances where the architecture supports fewer page tables than + five the kernel cleverly 'folds' page table levels, that is stubbing + out functions related to the skipped levels. This allows us to + conceptually act as if there were always five levels, even if the + compiler might, in practice, eliminate any code relating to missing + ones. + +There are four key operations typically performed on page tables: + +1. **Traversing** page tables - Simply reading page tables in order to traverse + them. This only requires that the VMA is kept stable, so a lock which + establishes this suffices for traversal (there are also lockless variants + which eliminate even this requirement, such as :c:func:`!gup_fast`). +2. **Installing** page table mappings - Whether creating a new mapping or + modifying an existing one in such a way as to change its identity. This + requires that the VMA is kept stable via an mmap or VMA lock (explicitly not + rmap locks). +3. **Zapping/unmapping** page table entries - This is what the kernel calls + clearing page table mappings at the leaf level only, whilst leaving all page + tables in place. This is a very common operation in the kernel performed on + file truncation, the :c:macro:`!MADV_DONTNEED` operation via + :c:func:`!madvise`, and others. This is performed by a number of functions + including :c:func:`!unmap_mapping_range` and :c:func:`!unmap_mapping_pages`. + The VMA need only be kept stable for this operation. +4. **Freeing** page tables - When finally the kernel removes page tables from a + userland process (typically via :c:func:`!free_pgtables`) extreme care must + be taken to ensure this is done safely, as this logic finally frees all page + tables in the specified range, ignoring existing leaf entries (it assumes the + caller has both zapped the range and prevented any further faults or + modifications within it). + +.. note:: Modifying mappings for reclaim or migration is performed under rmap + lock as it, like zapping, does not fundamentally modify the identity + of what is being mapped. + +**Traversing** and **zapping** ranges can be performed holding any one of the +locks described in the terminology section above - that is the mmap lock, the +VMA lock or either of the reverse mapping locks. + +That is - as long as you keep the relevant VMA **stable** - you are good to go +ahead and perform these operations on page tables (though internally, kernel +operations that perform writes also acquire internal page table locks to +serialise - see the page table implementation detail section for more details). + +When **installing** page table entries, the mmap or VMA lock must be held to +keep the VMA stable. We explore why this is in the page table locking details +section below. + +.. warning:: Page tables are normally only traversed in regions covered by VMAs. + If you want to traverse page tables in areas that might not be + covered by VMAs, heavier locking is required. + See :c:func:`!walk_page_range_novma` for details. + +**Freeing** page tables is an entirely internal memory management operation and +has special requirements (see the page freeing section below for more details). + +.. warning:: When **freeing** page tables, it must not be possible for VMAs + containing the ranges those page tables map to be accessible via + the reverse mapping. + + The :c:func:`!free_pgtables` function removes the relevant VMAs + from the reverse mappings, but no other VMAs can be permitted to be + accessible and span the specified range. + +Lock ordering +------------- + +As we have multiple locks across the kernel which may or may not be taken at the +same time as explicit mm or VMA locks, we have to be wary of lock inversion, and +the **order** in which locks are acquired and released becomes very important. + +.. note:: Lock inversion occurs when two threads need to acquire multiple locks, + but in doing so inadvertently cause a mutual deadlock. + + For example, consider thread 1 which holds lock A and tries to acquire lock B, + while thread 2 holds lock B and tries to acquire lock A. + + Both threads are now deadlocked on each other. However, had they attempted to + acquire locks in the same order, one would have waited for the other to + complete its work and no deadlock would have occurred. + +The opening comment in :c:macro:`!mm/rmap.c` describes in detail the required +ordering of locks within memory management code: + +.. code-block:: + + inode->i_rwsem (while writing or truncating, not reading or faulting) + mm->mmap_lock + mapping->invalidate_lock (in filemap_fault) + folio_lock + hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below) + vma_start_write + mapping->i_mmap_rwsem + anon_vma->rwsem + mm->page_table_lock or pte_lock + swap_lock (in swap_duplicate, swap_info_get) + mmlist_lock (in mmput, drain_mmlist and others) + mapping->private_lock (in block_dirty_folio) + i_pages lock (widely used) + lruvec->lru_lock (in folio_lruvec_lock_irq) + inode->i_lock (in set_page_dirty's __mark_inode_dirty) + bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) + sb_lock (within inode_lock in fs/fs-writeback.c) + i_pages lock (widely used, in set_page_dirty, + in arch-dependent flush_dcache_mmap_lock, + within bdi.wb->list_lock in __sync_single_inode) + +There is also a file-system specific lock ordering comment located at the top of +:c:macro:`!mm/filemap.c`: + +.. code-block:: + + ->i_mmap_rwsem (truncate_pagecache) + ->private_lock (__free_pte->block_dirty_folio) + ->swap_lock (exclusive_swap_page, others) + ->i_pages lock + + ->i_rwsem + ->invalidate_lock (acquired by fs in truncate path) + ->i_mmap_rwsem (truncate->unmap_mapping_range) + + ->mmap_lock + ->i_mmap_rwsem + ->page_table_lock or pte_lock (various, mainly in memory.c) + ->i_pages lock (arch-dependent flush_dcache_mmap_lock) + + ->mmap_lock + ->invalidate_lock (filemap_fault) + ->lock_page (filemap_fault, access_process_vm) + + ->i_rwsem (generic_perform_write) + ->mmap_lock (fault_in_readable->do_page_fault) + + bdi->wb.list_lock + sb_lock (fs/fs-writeback.c) + ->i_pages lock (__sync_single_inode) + + ->i_mmap_rwsem + ->anon_vma.lock (vma_merge) + + ->anon_vma.lock + ->page_table_lock or pte_lock (anon_vma_prepare and various) + + ->page_table_lock or pte_lock + ->swap_lock (try_to_unmap_one) + ->private_lock (try_to_unmap_one) + ->i_pages lock (try_to_unmap_one) + ->lruvec->lru_lock (follow_page_mask->mark_page_accessed) + ->lruvec->lru_lock (check_pte_range->folio_isolate_lru) + ->private_lock (folio_remove_rmap_pte->set_page_dirty) + ->i_pages lock (folio_remove_rmap_pte->set_page_dirty) + bdi.wb->list_lock (folio_remove_rmap_pte->set_page_dirty) + ->inode->i_lock (folio_remove_rmap_pte->set_page_dirty) + bdi.wb->list_lock (zap_pte_range->set_page_dirty) + ->inode->i_lock (zap_pte_range->set_page_dirty) + ->private_lock (zap_pte_range->block_dirty_folio) + +Please check the current state of these comments which may have changed since +the time of writing of this document. + +------------------------------ +Locking Implementation Details +------------------------------ + +.. warning:: Locking rules for PTE-level page tables are very different from + locking rules for page tables at other levels. + +Page table locking details +-------------------------- + +In addition to the locks described in the terminology section above, we have +additional locks dedicated to page tables: + +* **Higher level page table locks** - Higher level page tables, that is PGD, P4D + and PUD each make use of the process address space granularity + :c:member:`!mm->page_table_lock` lock when modified. + +* **Fine-grained page table locks** - PMDs and PTEs each have fine-grained locks + either kept within the folios describing the page tables or allocated + separated and pointed at by the folios if :c:macro:`!ALLOC_SPLIT_PTLOCKS` is + set. The PMD spin lock is obtained via :c:func:`!pmd_lock`, however PTEs are + mapped into higher memory (if a 32-bit system) and carefully locked via + :c:func:`!pte_offset_map_lock`. + +These locks represent the minimum required to interact with each page table +level, but there are further requirements. + +Importantly, note that on a **traversal** of page tables, sometimes no such +locks are taken. However, at the PTE level, at least concurrent page table +deletion must be prevented (using RCU) and the page table must be mapped into +high memory, see below. + +Whether care is taken on reading the page table entries depends on the +architecture, see the section on atomicity below. + +Locking rules +^^^^^^^^^^^^^ + +We establish basic locking rules when interacting with page tables: + +* When changing a page table entry the page table lock for that page table + **must** be held, except if you can safely assume nobody can access the page + tables concurrently (such as on invocation of :c:func:`!free_pgtables`). +* Reads from and writes to page table entries must be *appropriately* + atomic. See the section on atomicity below for details. +* Populating previously empty entries requires that the mmap or VMA locks are + held (read or write), doing so with only rmap locks would be dangerous (see + the warning below). +* As mentioned previously, zapping can be performed while simply keeping the VMA + stable, that is holding any one of the mmap, VMA or rmap locks. + +.. warning:: Populating previously empty entries is dangerous as, when unmapping + VMAs, :c:func:`!vms_clear_ptes` has a window of time between + zapping (via :c:func:`!unmap_vmas`) and freeing page tables (via + :c:func:`!free_pgtables`), where the VMA is still visible in the + rmap tree. :c:func:`!free_pgtables` assumes that the zap has + already been performed and removes PTEs unconditionally (along with + all other page tables in the freed range), so installing new PTE + entries could leak memory and also cause other unexpected and + dangerous behaviour. + +There are additional rules applicable when moving page tables, which we discuss +in the section on this topic below. + +PTE-level page tables are different from page tables at other levels, and there +are extra requirements for accessing them: + +* On 32-bit architectures, they may be in high memory (meaning they need to be + mapped into kernel memory to be accessible). +* When empty, they can be unlinked and RCU-freed while holding an mmap lock or + rmap lock for reading in combination with the PTE and PMD page table locks. + In particular, this happens in :c:func:`!retract_page_tables` when handling + :c:macro:`!MADV_COLLAPSE`. + So accessing PTE-level page tables requires at least holding an RCU read lock; + but that only suffices for readers that can tolerate racing with concurrent + page table updates such that an empty PTE is observed (in a page table that + has actually already been detached and marked for RCU freeing) while another + new page table has been installed in the same location and filled with + entries. Writers normally need to take the PTE lock and revalidate that the + PMD entry still refers to the same PTE-level page table. + +To access PTE-level page tables, a helper like :c:func:`!pte_offset_map_lock` or +:c:func:`!pte_offset_map` can be used depending on stability requirements. +These map the page table into kernel memory if required, take the RCU lock, and +depending on variant, may also look up or acquire the PTE lock. +See the comment on :c:func:`!__pte_offset_map_lock`. + +Atomicity +^^^^^^^^^ + +Regardless of page table locks, the MMU hardware concurrently updates accessed +and dirty bits (perhaps more, depending on architecture). Additionally, page +table traversal operations in parallel (though holding the VMA stable) and +functionality like GUP-fast locklessly traverses (that is reads) page tables, +without even keeping the VMA stable at all. + +When performing a page table traversal and keeping the VMA stable, whether a +read must be performed once and only once or not depends on the architecture +(for instance x86-64 does not require any special precautions). + +If a write is being performed, or if a read informs whether a write takes place +(on an installation of a page table entry say, for instance in +:c:func:`!__pud_install`), special care must always be taken. In these cases we +can never assume that page table locks give us entirely exclusive access, and +must retrieve page table entries once and only once. + +If we are reading page table entries, then we need only ensure that the compiler +does not rearrange our loads. This is achieved via :c:func:`!pXXp_get` +functions - :c:func:`!pgdp_get`, :c:func:`!p4dp_get`, :c:func:`!pudp_get`, +:c:func:`!pmdp_get`, and :c:func:`!ptep_get`. + +Each of these uses :c:func:`!READ_ONCE` to guarantee that the compiler reads +the page table entry only once. + +However, if we wish to manipulate an existing page table entry and care about +the previously stored data, we must go further and use an hardware atomic +operation as, for example, in :c:func:`!ptep_get_and_clear`. + +Equally, operations that do not rely on the VMA being held stable, such as +GUP-fast (see :c:func:`!gup_fast` and its various page table level handlers like +:c:func:`!gup_fast_pte_range`), must very carefully interact with page table +entries, using functions such as :c:func:`!ptep_get_lockless` and equivalent for +higher level page table levels. + +Writes to page table entries must also be appropriately atomic, as established +by :c:func:`!set_pXX` functions - :c:func:`!set_pgd`, :c:func:`!set_p4d`, +:c:func:`!set_pud`, :c:func:`!set_pmd`, and :c:func:`!set_pte`. + +Equally functions which clear page table entries must be appropriately atomic, +as in :c:func:`!pXX_clear` functions - :c:func:`!pgd_clear`, +:c:func:`!p4d_clear`, :c:func:`!pud_clear`, :c:func:`!pmd_clear`, and +:c:func:`!pte_clear`. + +Page table installation +^^^^^^^^^^^^^^^^^^^^^^^ + +Page table installation is performed with the VMA held stable explicitly by an +mmap or VMA lock in read or write mode (see the warning in the locking rules +section for details as to why). + +When allocating a P4D, PUD or PMD and setting the relevant entry in the above +PGD, P4D or PUD, the :c:member:`!mm->page_table_lock` must be held. This is +acquired in :c:func:`!__p4d_alloc`, :c:func:`!__pud_alloc` and +:c:func:`!__pmd_alloc` respectively. + +.. note:: :c:func:`!__pmd_alloc` actually invokes :c:func:`!pud_lock` and + :c:func:`!pud_lockptr` in turn, however at the time of writing it ultimately + references the :c:member:`!mm->page_table_lock`. + +Allocating a PTE will either use the :c:member:`!mm->page_table_lock` or, if +:c:macro:`!USE_SPLIT_PMD_PTLOCKS` is defined, a lock embedded in the PMD +physical page metadata in the form of a :c:struct:`!struct ptdesc`, acquired by +:c:func:`!pmd_ptdesc` called from :c:func:`!pmd_lock` and ultimately +:c:func:`!__pte_alloc`. + +Finally, modifying the contents of the PTE requires special treatment, as the +PTE page table lock must be acquired whenever we want stable and exclusive +access to entries contained within a PTE, especially when we wish to modify +them. + +This is performed via :c:func:`!pte_offset_map_lock` which carefully checks to +ensure that the PTE hasn't changed from under us, ultimately invoking +:c:func:`!pte_lockptr` to obtain a spin lock at PTE granularity contained within +the :c:struct:`!struct ptdesc` associated with the physical PTE page. The lock +must be released via :c:func:`!pte_unmap_unlock`. + +.. note:: There are some variants on this, such as + :c:func:`!pte_offset_map_rw_nolock` when we know we hold the PTE stable but + for brevity we do not explore this. See the comment for + :c:func:`!__pte_offset_map_lock` for more details. + +When modifying data in ranges we typically only wish to allocate higher page +tables as necessary, using these locks to avoid races or overwriting anything, +and set/clear data at the PTE level as required (for instance when page faulting +or zapping). + +A typical pattern taken when traversing page table entries to install a new +mapping is to optimistically determine whether the page table entry in the table +above is empty, if so, only then acquiring the page table lock and checking +again to see if it was allocated underneath us. + +This allows for a traversal with page table locks only being taken when +required. An example of this is :c:func:`!__pud_alloc`. + +At the leaf page table, that is the PTE, we can't entirely rely on this pattern +as we have separate PMD and PTE locks and a THP collapse for instance might have +eliminated the PMD entry as well as the PTE from under us. + +This is why :c:func:`!__pte_offset_map_lock` locklessly retrieves the PMD entry +for the PTE, carefully checking it is as expected, before acquiring the +PTE-specific lock, and then *again* checking that the PMD entry is as expected. + +If a THP collapse (or similar) were to occur then the lock on both pages would +be acquired, so we can ensure this is prevented while the PTE lock is held. + +Installing entries this way ensures mutual exclusion on write. + +Page table freeing +^^^^^^^^^^^^^^^^^^ + +Tearing down page tables themselves is something that requires significant +care. There must be no way that page tables designated for removal can be +traversed or referenced by concurrent tasks. + +It is insufficient to simply hold an mmap write lock and VMA lock (which will +prevent racing faults, and rmap operations), as a file-backed mapping can be +truncated under the :c:struct:`!struct address_space->i_mmap_rwsem` alone. + +As a result, no VMA which can be accessed via the reverse mapping (either +through the :c:struct:`!struct anon_vma->rb_root` or the :c:member:`!struct +address_space->i_mmap` interval trees) can have its page tables torn down. + +The operation is typically performed via :c:func:`!free_pgtables`, which assumes +either the mmap write lock has been taken (as specified by its +:c:member:`!mm_wr_locked` parameter), or that the VMA is already unreachable. + +It carefully removes the VMA from all reverse mappings, however it's important +that no new ones overlap these or any route remain to permit access to addresses +within the range whose page tables are being torn down. + +Additionally, it assumes that a zap has already been performed and steps have +been taken to ensure that no further page table entries can be installed between +the zap and the invocation of :c:func:`!free_pgtables`. + +Since it is assumed that all such steps have been taken, page table entries are +cleared without page table locks (in the :c:func:`!pgd_clear`, :c:func:`!p4d_clear`, +:c:func:`!pud_clear`, and :c:func:`!pmd_clear` functions. + +.. note:: It is possible for leaf page tables to be torn down independent of + the page tables above it as is done by + :c:func:`!retract_page_tables`, which is performed under the i_mmap + read lock, PMD, and PTE page table locks, without this level of care. + +Page table moving +^^^^^^^^^^^^^^^^^ + +Some functions manipulate page table levels above PMD (that is PUD, P4D and PGD +page tables). Most notable of these is :c:func:`!mremap`, which is capable of +moving higher level page tables. + +In these instances, it is required that **all** locks are taken, that is +the mmap lock, the VMA lock and the relevant rmap locks. + +You can observe this in the :c:func:`!mremap` implementation in the functions +:c:func:`!take_rmap_locks` and :c:func:`!drop_rmap_locks` which perform the rmap +side of lock acquisition, invoked ultimately by :c:func:`!move_page_tables`. + +VMA lock internals +------------------ + +Overview +^^^^^^^^ + +VMA read locking is entirely optimistic - if the lock is contended or a competing +write has started, then we do not obtain a read lock. + +A VMA **read** lock is obtained by :c:func:`!lock_vma_under_rcu`, which first +calls :c:func:`!rcu_read_lock` to ensure that the VMA is looked up in an RCU +critical section, then attempts to VMA lock it via :c:func:`!vma_start_read`, +before releasing the RCU lock via :c:func:`!rcu_read_unlock`. + +VMA read locks hold the read lock on the :c:member:`!vma->vm_lock` semaphore for +their duration and the caller of :c:func:`!lock_vma_under_rcu` must release it +via :c:func:`!vma_end_read`. + +VMA **write** locks are acquired via :c:func:`!vma_start_write` in instances where a +VMA is about to be modified, unlike :c:func:`!vma_start_read` the lock is always +acquired. An mmap write lock **must** be held for the duration of the VMA write +lock, releasing or downgrading the mmap write lock also releases the VMA write +lock so there is no :c:func:`!vma_end_write` function. + +Note that a semaphore write lock is not held across a VMA lock. Rather, a +sequence number is used for serialisation, and the write semaphore is only +acquired at the point of write lock to update this. + +This ensures the semantics we require - VMA write locks provide exclusive write +access to the VMA. + +Implementation details +^^^^^^^^^^^^^^^^^^^^^^ + +The VMA lock mechanism is designed to be a lightweight means of avoiding the use +of the heavily contended mmap lock. It is implemented using a combination of a +read/write semaphore and sequence numbers belonging to the containing +:c:struct:`!struct mm_struct` and the VMA. + +Read locks are acquired via :c:func:`!vma_start_read`, which is an optimistic +operation, i.e. it tries to acquire a read lock but returns false if it is +unable to do so. At the end of the read operation, :c:func:`!vma_end_read` is +called to release the VMA read lock. + +Invoking :c:func:`!vma_start_read` requires that :c:func:`!rcu_read_lock` has +been called first, establishing that we are in an RCU critical section upon VMA +read lock acquisition. Once acquired, the RCU lock can be released as it is only +required for lookup. This is abstracted by :c:func:`!lock_vma_under_rcu` which +is the interface a user should use. + +Writing requires the mmap to be write-locked and the VMA lock to be acquired via +:c:func:`!vma_start_write`, however the write lock is released by the termination or +downgrade of the mmap write lock so no :c:func:`!vma_end_write` is required. + +All this is achieved by the use of per-mm and per-VMA sequence counts, which are +used in order to reduce complexity, especially for operations which write-lock +multiple VMAs at once. + +If the mm sequence count, :c:member:`!mm->mm_lock_seq` is equal to the VMA +sequence count :c:member:`!vma->vm_lock_seq` then the VMA is write-locked. If +they differ, then it is not. + +Each time the mmap write lock is released in :c:func:`!mmap_write_unlock` or +:c:func:`!mmap_write_downgrade`, :c:func:`!vma_end_write_all` is invoked which +also increments :c:member:`!mm->mm_lock_seq` via +:c:func:`!mm_lock_seqcount_end`. + +This way, we ensure that, regardless of the VMA's sequence number, a write lock +is never incorrectly indicated and that when we release an mmap write lock we +efficiently release **all** VMA write locks contained within the mmap at the +same time. + +Since the mmap write lock is exclusive against others who hold it, the automatic +release of any VMA locks on its release makes sense, as you would never want to +keep VMAs locked across entirely separate write operations. It also maintains +correct lock ordering. + +Each time a VMA read lock is acquired, we acquire a read lock on the +:c:member:`!vma->vm_lock` read/write semaphore and hold it, while checking that +the sequence count of the VMA does not match that of the mm. + +If it does, the read lock fails. If it does not, we hold the lock, excluding +writers, but permitting other readers, who will also obtain this lock under RCU. + +Importantly, maple tree operations performed in :c:func:`!lock_vma_under_rcu` +are also RCU safe, so the whole read lock operation is guaranteed to function +correctly. + +On the write side, we acquire a write lock on the :c:member:`!vma->vm_lock` +read/write semaphore, before setting the VMA's sequence number under this lock, +also simultaneously holding the mmap write lock. + +This way, if any read locks are in effect, :c:func:`!vma_start_write` will sleep +until these are finished and mutual exclusion is achieved. + +After setting the VMA's sequence number, the lock is released, avoiding +complexity with a long-term held write lock. + +This clever combination of a read/write semaphore and sequence count allows for +fast RCU-based per-VMA lock acquisition (especially on page fault, though +utilised elsewhere) with minimal complexity around lock ordering. + +mmap write lock downgrading +--------------------------- + +When an mmap write lock is held one has exclusive access to resources within the +mmap (with the usual caveats about requiring VMA write locks to avoid races with +tasks holding VMA read locks). + +It is then possible to **downgrade** from a write lock to a read lock via +:c:func:`!mmap_write_downgrade` which, similar to :c:func:`!mmap_write_unlock`, +implicitly terminates all VMA write locks via :c:func:`!vma_end_write_all`, but +importantly does not relinquish the mmap lock while downgrading, therefore +keeping the locked virtual address space stable. + +An interesting consequence of this is that downgraded locks are exclusive +against any other task possessing a downgraded lock (since a racing task would +have to acquire a write lock first to downgrade it, and the downgraded lock +prevents a new write lock from being obtained until the original lock is +released). + +For clarity, we map read (R)/downgraded write (D)/write (W) locks against one +another showing which locks exclude the others: + +.. list-table:: Lock exclusivity + :widths: 5 5 5 5 + :header-rows: 1 + :stub-columns: 1 + + * - + - R + - D + - W + * - R + - N + - N + - Y + * - D + - N + - Y + - Y + * - W + - Y + - Y + - Y + +Here a Y indicates the locks in the matching row/column are mutually exclusive, +and N indicates that they are not. + +Stack expansion +--------------- + +Stack expansion throws up additional complexities in that we cannot permit there +to be racing page faults, as a result we invoke :c:func:`!vma_start_write` to +prevent this in :c:func:`!expand_downwards` or :c:func:`!expand_upwards`. From 6a75f19af16ff482cfd6085c77123aa0f464f8dd Mon Sep 17 00:00:00 2001 From: "Isaac J. Manjarres" Date: Thu, 5 Dec 2024 11:29:41 -0800 Subject: [PATCH 0734/1450] selftests/memfd: run sysctl tests when PID namespace support is enabled The sysctl tests for vm.memfd_noexec rely on the kernel to support PID namespaces (i.e. the kernel is built with CONFIG_PID_NS=y). If the kernel the test runs on does not support PID namespaces, the first sysctl test will fail when attempting to spawn a new thread in a new PID namespace, abort the test, preventing the remaining tests from being run. This is not desirable, as not all kernels need PID namespaces, but can still use the other features provided by memfd. Therefore, only run the sysctl tests if the kernel supports PID namespaces. Otherwise, skip those tests and emit an informative message to let the user know why the sysctl tests are not being run. Link: https://lkml.kernel.org/r/20241205192943.3228757-1-isaacmanjarres@google.com Fixes: 11f75a01448f ("selftests/memfd: add tests for MFD_NOEXEC_SEAL MFD_EXEC") Signed-off-by: Isaac J. Manjarres Reviewed-by: Jeff Xu Cc: Suren Baghdasaryan Cc: Kalesh Singh Cc: [6.6+] Signed-off-by: Andrew Morton --- tools/testing/selftests/memfd/memfd_test.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 95af2d78fd318..0a0b555160280 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -1557,6 +1558,11 @@ static void test_share_fork(char *banner, char *b_suffix) close(fd); } +static bool pid_ns_supported(void) +{ + return access("/proc/self/ns/pid", F_OK) == 0; +} + int main(int argc, char **argv) { pid_t pid; @@ -1591,8 +1597,12 @@ int main(int argc, char **argv) test_seal_grow(); test_seal_resize(); - test_sysctl_simple(); - test_sysctl_nested(); + if (pid_ns_supported()) { + test_sysctl_simple(); + test_sysctl_nested(); + } else { + printf("PID namespaces are not supported; skipping sysctl tests\n"); + } test_share_dup("SHARE-DUP", ""); test_share_mmap("SHARE-MMAP", ""); From da5bd7fa789ae212ac18ebc3ac52b7f2ce1781da Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 5 Dec 2024 20:42:01 +0800 Subject: [PATCH 0735/1450] mailmap: add entry for Ying Huang Map my old company email to my personal email. Link: https://lkml.kernel.org/r/20241205124201.529308-1-huang.ying.caritas@gmail.com Signed-off-by: "Huang, Ying" Signed-off-by: Andrew Morton --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index 5ff0e5d681e7c..7efe43237ca8e 100644 --- a/.mailmap +++ b/.mailmap @@ -735,6 +735,7 @@ Wolfram Sang Wolfram Sang Yakir Yang Yanteng Si +Ying Huang Yusuke Goda Zack Rusin Zhu Yanjun From 1a72d2ebeec51f10e5b0f0609c6754e92b11ee9d Mon Sep 17 00:00:00 2001 From: Heming Zhao Date: Thu, 5 Dec 2024 18:48:32 +0800 Subject: [PATCH 0736/1450] ocfs2: revert "ocfs2: fix the la space leak when unmounting an ocfs2 volume" Patch series "Revert ocfs2 commit dfe6c5692fb5 and provide a new fix". SUSE QA team detected a mistake in my commit dfe6c5692fb5 ("ocfs2: fix the la space leak when unmounting an ocfs2 volume"). I am very sorry for my error. (If my eyes are correct) From the mailling list mails, this patch shouldn't be applied to 4.19 5.4 5.10 5.15 6.1 6.6, and these branches should perform a revert operation. Reason for revert: In commit dfe6c5692fb5, I mistakenly wrote: "This bug has existed since the initial OCFS2 code.". The statement is wrong. The correct introduction commit is 30dd3478c3cd. IOW, if the branch doesn't include 30dd3478c3cd, dfe6c5692fb5 should also not be included. This reverts commit dfe6c5692fb5 ("ocfs2: fix the la space leak when unmounting an ocfs2 volume"). In commit dfe6c5692fb5, the commit log "This bug has existed since the initial OCFS2 code." is wrong. The correct introduction commit is 30dd3478c3cd ("ocfs2: correctly use ocfs2_find_next_zero_bit()"). The influence of commit dfe6c5692fb5 is that it provides a correct fix for the latest kernel. however, it shouldn't be pushed to stable branches. Let's use this commit to revert all branches that include dfe6c5692fb5 and use a new fix method to fix commit 30dd3478c3cd. Link: https://lkml.kernel.org/r/20241205104835.18223-1-heming.zhao@suse.com Link: https://lkml.kernel.org/r/20241205104835.18223-2-heming.zhao@suse.com Fixes: dfe6c5692fb5 ("ocfs2: fix the la space leak when unmounting an ocfs2 volume") Signed-off-by: Heming Zhao Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Cc: Signed-off-by: Andrew Morton --- fs/ocfs2/localalloc.c | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 8ac42ea81a17b..5df34561c551c 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -1002,25 +1002,6 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, start = bit_off + 1; } - /* clear the contiguous bits until the end boundary */ - if (count) { - blkno = la_start_blk + - ocfs2_clusters_to_blocks(osb->sb, - start - count); - - trace_ocfs2_sync_local_to_main_free( - count, start - count, - (unsigned long long)la_start_blk, - (unsigned long long)blkno); - - status = ocfs2_release_clusters(handle, - main_bm_inode, - main_bm_bh, blkno, - count); - if (status < 0) - mlog_errno(status); - } - bail: if (status) mlog_errno(status); From 7782e3b3b004e8cb94a88621a22cc3c2f33e5b90 Mon Sep 17 00:00:00 2001 From: Heming Zhao Date: Thu, 5 Dec 2024 18:48:33 +0800 Subject: [PATCH 0737/1450] ocfs2: fix the space leak in LA when releasing LA Commit 30dd3478c3cd ("ocfs2: correctly use ocfs2_find_next_zero_bit()") introduced an issue, the ocfs2_sync_local_to_main() ignores the last contiguous free bits, which causes an OCFS2 volume to lose the last free clusters of LA window during the release routine. Please note, because commit dfe6c5692fb5 ("ocfs2: fix the la space leak when unmounting an ocfs2 volume") was reverted, this commit is a replacement fix for commit dfe6c5692fb5. Link: https://lkml.kernel.org/r/20241205104835.18223-3-heming.zhao@suse.com Fixes: 30dd3478c3cd ("ocfs2: correctly use ocfs2_find_next_zero_bit()") Signed-off-by: Heming Zhao Suggested-by: Joseph Qi Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Cc: Signed-off-by: Andrew Morton --- fs/ocfs2/localalloc.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 5df34561c551c..d1aa04a5af1b1 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -971,9 +971,9 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, start = count = 0; left = le32_to_cpu(alloc->id1.bitmap1.i_total); - while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start)) < - left) { - if (bit_off == start) { + while (1) { + bit_off = ocfs2_find_next_zero_bit(bitmap, left, start); + if ((bit_off < left) && (bit_off == start)) { count++; start++; continue; @@ -998,6 +998,8 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, } } + if (bit_off >= left) + break; count = 1; start = bit_off + 1; } From dad2dc9c92e0f93f33cebcb0595b8daa3d57473f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 4 Dec 2024 22:50:06 -0800 Subject: [PATCH 0738/1450] mm: shmem: fix ShmemHugePages at swapout /proc/meminfo ShmemHugePages has been showing overlarge amounts (more than Shmem) after swapping out THPs: we forgot to update NR_SHMEM_THPS. Add shmem_update_stats(), to avoid repetition, and risk of making that mistake again: the call from shmem_delete_from_page_cache() is the bugfix; the call from shmem_replace_folio() is reassuring, but not really a bugfix (replace corrects misplaced swapin readahead, but huge swapin readahead would be a mistake). Link: https://lkml.kernel.org/r/5ba477c8-a569-70b5-923e-09ab221af45b@google.com Fixes: 809bc86517cc ("mm: shmem: support large folio swap out") Signed-off-by: Hugh Dickins Reviewed-by: Shakeel Butt Reviewed-by: Yosry Ahmed Reviewed-by: Baolin Wang Tested-by: Baolin Wang Cc: Signed-off-by: Andrew Morton --- mm/shmem.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index ccb9629a0f70d..f6fb053ac50dc 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -787,6 +787,14 @@ static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +static void shmem_update_stats(struct folio *folio, int nr_pages) +{ + if (folio_test_pmd_mappable(folio)) + __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages); + __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); + __lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages); +} + /* * Somewhat like filemap_add_folio, but error if expected item has gone. */ @@ -821,10 +829,7 @@ static int shmem_add_to_page_cache(struct folio *folio, xas_store(&xas, folio); if (xas_error(&xas)) goto unlock; - if (folio_test_pmd_mappable(folio)) - __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr); - __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr); - __lruvec_stat_mod_folio(folio, NR_SHMEM, nr); + shmem_update_stats(folio, nr); mapping->nrpages += nr; unlock: xas_unlock_irq(&xas); @@ -852,8 +857,7 @@ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap) error = shmem_replace_entry(mapping, folio->index, folio, radswap); folio->mapping = NULL; mapping->nrpages -= nr; - __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); - __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); + shmem_update_stats(folio, -nr); xa_unlock_irq(&mapping->i_pages); folio_put_refs(folio, nr); BUG_ON(error); @@ -1969,10 +1973,8 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, } if (!error) { mem_cgroup_replace_folio(old, new); - __lruvec_stat_mod_folio(new, NR_FILE_PAGES, nr_pages); - __lruvec_stat_mod_folio(new, NR_SHMEM, nr_pages); - __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -nr_pages); - __lruvec_stat_mod_folio(old, NR_SHMEM, -nr_pages); + shmem_update_stats(new, nr_pages); + shmem_update_stats(old, -nr_pages); } xa_unlock_irq(&swap_mapping->i_pages); From 8aca2bc96c833ba695ede7a45ad7784c836a262e Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 28 Oct 2024 22:56:55 +0800 Subject: [PATCH 0739/1450] mm: use aligned address in clear_gigantic_page() In current kernel, hugetlb_no_page() calls folio_zero_user() with the fault address. Where the fault address may be not aligned with the huge page size. Then, folio_zero_user() may call clear_gigantic_page() with the address, while clear_gigantic_page() requires the address to be huge page size aligned. So, this may cause memory corruption or information leak, addtional, use more obvious naming 'addr_hint' instead of 'addr' for clear_gigantic_page(). Link: https://lkml.kernel.org/r/20241028145656.932941-1-wangkefeng.wang@huawei.com Fixes: 78fefd04c123 ("mm: memory: convert clear_huge_page() to folio_zero_user()") Signed-off-by: Kefeng Wang Reviewed-by: "Huang, Ying" Reviewed-by: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 2 +- mm/memory.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 90f883d6b8fd8..fc1ae51321271 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -825,7 +825,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, error = PTR_ERR(folio); goto out; } - folio_zero_user(folio, ALIGN_DOWN(addr, hpage_size)); + folio_zero_user(folio, addr); __folio_mark_uptodate(folio); error = hugetlb_add_to_page_cache(folio, mapping, index); if (unlikely(error)) { diff --git a/mm/memory.c b/mm/memory.c index 75c2dfd04f725..84864387f9659 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6815,9 +6815,10 @@ static inline int process_huge_page( return 0; } -static void clear_gigantic_page(struct folio *folio, unsigned long addr, +static void clear_gigantic_page(struct folio *folio, unsigned long addr_hint, unsigned int nr_pages) { + unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(folio)); int i; might_sleep(); From f5d09de9f1bf9674c6418ff10d0a40cfe29268e1 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 28 Oct 2024 22:56:56 +0800 Subject: [PATCH 0740/1450] mm: use aligned address in copy_user_gigantic_page() In current kernel, hugetlb_wp() calls copy_user_large_folio() with the fault address. Where the fault address may be not aligned with the huge page size. Then, copy_user_large_folio() may call copy_user_gigantic_page() with the address, while copy_user_gigantic_page() requires the address to be huge page size aligned. So, this may cause memory corruption or information leak, addtional, use more obvious naming 'addr_hint' instead of 'addr' for copy_user_gigantic_page(). Link: https://lkml.kernel.org/r/20241028145656.932941-2-wangkefeng.wang@huawei.com Fixes: 530dd9926dc1 ("mm: memory: improve copy_user_large_folio()") Signed-off-by: Kefeng Wang Reviewed-by: David Hildenbrand Cc: Huang Ying Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Signed-off-by: Andrew Morton --- mm/hugetlb.c | 5 ++--- mm/memory.c | 5 +++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ea2ed8e301ef2..cec4b121193fc 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5340,7 +5340,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, break; } ret = copy_user_large_folio(new_folio, pte_folio, - ALIGN_DOWN(addr, sz), dst_vma); + addr, dst_vma); folio_put(pte_folio); if (ret) { folio_put(new_folio); @@ -6643,8 +6643,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, *foliop = NULL; goto out; } - ret = copy_user_large_folio(folio, *foliop, - ALIGN_DOWN(dst_addr, size), dst_vma); + ret = copy_user_large_folio(folio, *foliop, dst_addr, dst_vma); folio_put(*foliop); *foliop = NULL; if (ret) { diff --git a/mm/memory.c b/mm/memory.c index 84864387f9659..209885a4134f7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6852,13 +6852,14 @@ void folio_zero_user(struct folio *folio, unsigned long addr_hint) } static int copy_user_gigantic_page(struct folio *dst, struct folio *src, - unsigned long addr, + unsigned long addr_hint, struct vm_area_struct *vma, unsigned int nr_pages) { - int i; + unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(dst)); struct page *dst_page; struct page *src_page; + int i; for (i = 0; i < nr_pages; i++) { dst_page = folio_page(dst, i); From 42c4e4b20d9c4651903c4afc53a4ff18b7451b3e Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 6 Dec 2024 21:52:29 +0000 Subject: [PATCH 0741/1450] mm: correctly reference merged VMA On second merge attempt on mmap() we incorrectly discard the possibly merged VMA, resulting in a possible use-after-free (and most certainly a reference to the wrong VMA) in this instance in the subsequent __mmap_complete() invocation. Correct this mistake by reassigning vma correctly if a merge succeeds in this case. Link: https://lkml.kernel.org/r/20241206215229.244413-1-lorenzo.stoakes@oracle.com Fixes: 5ac87a885aec ("mm: defer second attempt at merge on mmap()") Signed-off-by: Lorenzo Stoakes Suggested-by: Jann Horn Reported-by: syzbot+91cf8da9401355f946c3@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/67536a25.050a0220.a30f1.0149.GAE@google.com/ Reviewed-by: Liam R. Howlett Reviewed-by: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/vma.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/vma.c b/mm/vma.c index 8e31b7e25aeb1..bb2119e5a0d07 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -2460,10 +2460,13 @@ unsigned long __mmap_region(struct file *file, unsigned long addr, /* If flags changed, we might be able to merge, so try again. */ if (map.retry_merge) { + struct vm_area_struct *merged; VMG_MMAP_STATE(vmg, &map, vma); vma_iter_config(map.vmi, map.addr, map.end); - vma_merge_existing_range(&vmg); + merged = vma_merge_existing_range(&vmg); + if (merged) + vma = merged; } __mmap_complete(&map, vma); From 31c5629920b82ddf66059f20f79be2bc00c4197b Mon Sep 17 00:00:00 2001 From: Petr Malat Date: Tue, 10 Dec 2024 01:06:04 +0100 Subject: [PATCH 0742/1450] mm: add RCU annotation to pte_offset_map(_lock) RCU lock is taken by ___pte_offset_map() unless it returns NULL. Add this information to its inline callers to avoid sparse warning about context imbalance in pte_unmap(). Link: https://lkml.kernel.org/r/20241210000604.700710-1-oss@malat.biz Signed-off-by: Petr Malat Signed-off-by: Andrew Morton --- include/linux/mm.h | 13 +++++++++++-- mm/pgtable-generic.c | 2 +- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index c39c4945946c6..3a6ee6a05aa00 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3010,7 +3010,15 @@ static inline void pagetable_pte_dtor(struct ptdesc *ptdesc) lruvec_stat_sub_folio(folio, NR_PAGETABLE); } -pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp); +pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp); +static inline pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, + pmd_t *pmdvalp) +{ + pte_t *pte; + + __cond_lock(RCU, pte = ___pte_offset_map(pmd, addr, pmdvalp)); + return pte; +} static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr) { return __pte_offset_map(pmd, addr, NULL); @@ -3023,7 +3031,8 @@ static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, { pte_t *pte; - __cond_lock(*ptlp, pte = __pte_offset_map_lock(mm, pmd, addr, ptlp)); + __cond_lock(RCU, __cond_lock(*ptlp, + pte = __pte_offset_map_lock(mm, pmd, addr, ptlp))); return pte; } diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 5297dcc38c37a..5a882f2b10f90 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -279,7 +279,7 @@ static unsigned long pmdp_get_lockless_start(void) { return 0; } static void pmdp_get_lockless_end(unsigned long irqflags) { } #endif -pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) +pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) { unsigned long irqflags; pmd_t pmdval; From 5c0541e11c16bd2f162e23a22d07c09d58017e5a Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 9 Dec 2024 13:23:25 -0500 Subject: [PATCH 0743/1450] mm: introduce cpu_icache_is_aliasing() across all architectures In commit eacd0e950dc2 ("ARC: [mm] Lazy D-cache flush (non aliasing VIPT)"), arc adds the need to flush dcache to make icache see the code page change. This also requires special handling for clear_user_(high)page(). Introduce cpu_icache_is_aliasing() to make MM code query special clear_user_(high)page() easier. This will be used by the following commit. Link: https://lkml.kernel.org/r/20241209182326.2955963-1-ziy@nvidia.com Fixes: 5708d96da20b ("mm: avoid zeroing user movable page twice with init_on_alloc=1") Signed-off-by: Zi Yan Suggested-by: Mathieu Desnoyers Reviewed-by: Mathieu Desnoyers Acked-by: Vlastimil Babka Cc: Alexander Potapenko Cc: David Hildenbrand Cc: Geert Uytterhoeven Cc: John Hubbard Cc: Kees Cook Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Ryan Roberts Cc: Vineet Gupta Signed-off-by: Andrew Morton --- arch/arc/Kconfig | 1 + arch/arc/include/asm/cachetype.h | 8 ++++++++ include/linux/cacheinfo.h | 6 ++++++ 3 files changed, 15 insertions(+) create mode 100644 arch/arc/include/asm/cachetype.h diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index ea5a1dcb133b4..4f2eeda907ecb 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -6,6 +6,7 @@ config ARC def_bool y select ARC_TIMERS + select ARCH_HAS_CPU_CACHE_ALIASING select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DMA_PREP_COHERENT diff --git a/arch/arc/include/asm/cachetype.h b/arch/arc/include/asm/cachetype.h new file mode 100644 index 0000000000000..acd3b6cb4bf5b --- /dev/null +++ b/arch/arc/include/asm/cachetype.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_ARC_CACHETYPE_H +#define __ASM_ARC_CACHETYPE_H + +#define cpu_dcache_is_aliasing() false +#define cpu_icache_is_aliasing() true + +#endif diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index 108060612bb87..7ad7365386495 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -155,8 +155,14 @@ static inline int get_cpu_cacheinfo_id(int cpu, int level) #ifndef CONFIG_ARCH_HAS_CPU_CACHE_ALIASING #define cpu_dcache_is_aliasing() false +#define cpu_icache_is_aliasing() cpu_dcache_is_aliasing() #else #include + +#ifndef cpu_icache_is_aliasing +#define cpu_icache_is_aliasing() cpu_dcache_is_aliasing() +#endif + #endif #endif /* _LINUX_CACHEINFO_H */ From c51a4f11e6d8246590b5e64908c1ed84b33e8ba2 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 9 Dec 2024 13:23:26 -0500 Subject: [PATCH 0744/1450] mm: use clear_user_(high)page() for arch with special user folio handling Some architectures have special handling after clearing user folios: architectures, which set cpu_dcache_is_aliasing() to true, require flushing dcache; arc, which sets cpu_icache_is_aliasing() to true, changes folio->flags to make icache coherent to dcache. So __GFP_ZERO using only clear_page() is not enough to zero user folios and clear_user_(high)page() must be used. Otherwise, user data will be corrupted. Fix it by always clearing user folios with clear_user_(high)page() when cpu_dcache_is_aliasing() is true or cpu_icache_is_aliasing() is true. Rename alloc_zeroed() to user_alloc_needs_zeroing() and invert the logic to clarify its intend. Link: https://lkml.kernel.org/r/20241209182326.2955963-2-ziy@nvidia.com Fixes: 5708d96da20b ("mm: avoid zeroing user movable page twice with init_on_alloc=1") Signed-off-by: Zi Yan Reported-by: Geert Uytterhoeven Closes: https://lore.kernel.org/linux-mm/CAMuHMdV1hRp_NtR5YnJo=HsfgKQeH91J537Gh4gKk3PFZhSkbA@mail.gmail.com/ Tested-by: Geert Uytterhoeven Acked-by: Vlastimil Babka Cc: Alexander Potapenko Cc: David Hildenbrand Cc: John Hubbard Cc: Kees Cook Cc: Kefeng Wang Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Ryan Roberts Cc: Vineet Gupta Signed-off-by: Andrew Morton --- include/linux/highmem.h | 8 +++++++- include/linux/mm.h | 18 ++++++++++++++++++ mm/huge_memory.c | 9 +++++---- mm/internal.h | 6 ------ mm/memory.c | 10 +++++----- 5 files changed, 35 insertions(+), 16 deletions(-) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 6e452bd8e7e36..5c6bea81a90ec 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -224,7 +224,13 @@ static inline struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, unsigned long vaddr) { - return vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr); + struct folio *folio; + + folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr); + if (folio && user_alloc_needs_zeroing()) + clear_user_highpage(&folio->page, vaddr); + + return folio; } #endif diff --git a/include/linux/mm.h b/include/linux/mm.h index 3a6ee6a05aa00..338a76ce9083a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -31,6 +31,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -4184,6 +4185,23 @@ static inline int do_mseal(unsigned long start, size_t len_in, unsigned long fla } #endif +/* + * user_alloc_needs_zeroing checks if a user folio from page allocator needs to + * be zeroed or not. + */ +static inline bool user_alloc_needs_zeroing(void) +{ + /* + * for user folios, arch with cache aliasing requires cache flush and + * arc changes folio->flags to make icache coherent with dcache, so + * always return false to make caller use + * clear_user_page()/clear_user_highpage(). + */ + return cpu_dcache_is_aliasing() || cpu_icache_is_aliasing() || + !static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, + &init_on_alloc); +} + int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status); int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status); int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ee335d96fc390..9bb351caa6190 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1176,11 +1176,12 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma, folio_throttle_swaprate(folio, gfp); /* - * When a folio is not zeroed during allocation (__GFP_ZERO not used), - * folio_zero_user() is used to make sure that the page corresponding - * to the faulting address will be hot in the cache after zeroing. + * When a folio is not zeroed during allocation (__GFP_ZERO not used) + * or user folios require special handling, folio_zero_user() is used to + * make sure that the page corresponding to the faulting address will be + * hot in the cache after zeroing. */ - if (!alloc_zeroed()) + if (user_alloc_needs_zeroing()) folio_zero_user(folio, addr); /* * The memory barrier inside __folio_mark_uptodate makes sure that diff --git a/mm/internal.h b/mm/internal.h index cb8d8e8e3ffa5..3bd08bafad042 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1285,12 +1285,6 @@ void touch_pud(struct vm_area_struct *vma, unsigned long addr, void touch_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, bool write); -static inline bool alloc_zeroed(void) -{ - return static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, - &init_on_alloc); -} - /* * Parses a string with mem suffixes into its order. Useful to parse kernel * parameters. diff --git a/mm/memory.c b/mm/memory.c index 209885a4134f7..398c031be9baa 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4733,12 +4733,12 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) folio_throttle_swaprate(folio, gfp); /* * When a folio is not zeroed during allocation - * (__GFP_ZERO not used), folio_zero_user() is used - * to make sure that the page corresponding to the - * faulting address will be hot in the cache after - * zeroing. + * (__GFP_ZERO not used) or user folios require special + * handling, folio_zero_user() is used to make sure + * that the page corresponding to the faulting address + * will be hot in the cache after zeroing. */ - if (!alloc_zeroed()) + if (user_alloc_needs_zeroing()) folio_zero_user(folio, vmf->address); return folio; } From be48c412f6ebf38849213c19547bc6d5b692b5e5 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 10 Dec 2024 00:57:15 +0800 Subject: [PATCH 0745/1450] zram: refuse to use zero sized block device as backing device Patch series "zram: fix backing device setup issue", v2. This series fixes two bugs of backing device setting: - ZRAM should reject using a zero sized (or the uninitialized ZRAM device itself) as the backing device. - Fix backing device leaking when removing a uninitialized ZRAM device. This patch (of 2): Setting a zero sized block device as backing device is pointless, and one can easily create a recursive loop by setting the uninitialized ZRAM device itself as its own backing device by (zram0 is uninitialized): echo /dev/zram0 > /sys/block/zram0/backing_dev It's definitely a wrong config, and the module will pin itself, kernel should refuse doing so in the first place. By refusing to use zero sized device we avoided misuse cases including this one above. Link: https://lkml.kernel.org/r/20241209165717.94215-1-ryncsn@gmail.com Link: https://lkml.kernel.org/r/20241209165717.94215-2-ryncsn@gmail.com Fixes: 013bf95a83ec ("zram: add interface to specif backing device") Signed-off-by: Kairui Song Reported-by: Desheng Wu Reviewed-by: Sergey Senozhatsky Cc: Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 3dee026988dc8..e86cc3d2f4d2d 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -614,6 +614,12 @@ static ssize_t backing_dev_store(struct device *dev, } nr_pages = i_size_read(inode) >> PAGE_SHIFT; + /* Refuse to use zero sized device (also prevents self reference) */ + if (!nr_pages) { + err = -EINVAL; + goto out; + } + bitmap_sz = BITS_TO_LONGS(nr_pages) * sizeof(long); bitmap = kvzalloc(bitmap_sz, GFP_KERNEL); if (!bitmap) { From 74363ec674cb172d8856de25776c8f3103f05e2f Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 10 Dec 2024 00:57:16 +0800 Subject: [PATCH 0746/1450] zram: fix uninitialized ZRAM not releasing backing device Setting backing device is done before ZRAM initialization. If we set the backing device, then remove the ZRAM module without initializing the device, the backing device reference will be leaked and the device will be hold forever. Fix this by always reset the ZRAM fully on rmmod or reset store. Link: https://lkml.kernel.org/r/20241209165717.94215-3-ryncsn@gmail.com Fixes: 013bf95a83ec ("zram: add interface to specif backing device") Signed-off-by: Kairui Song Reported-by: Desheng Wu Suggested-by: Sergey Senozhatsky Reviewed-by: Sergey Senozhatsky Cc: Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index e86cc3d2f4d2d..45df5eeabc5e3 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1444,12 +1444,16 @@ static void zram_meta_free(struct zram *zram, u64 disksize) size_t num_pages = disksize >> PAGE_SHIFT; size_t index; + if (!zram->table) + return; + /* Free all pages that are still in this zram device */ for (index = 0; index < num_pages; index++) zram_free_page(zram, index); zs_destroy_pool(zram->mem_pool); vfree(zram->table); + zram->table = NULL; } static bool zram_meta_alloc(struct zram *zram, u64 disksize) @@ -2326,11 +2330,6 @@ static void zram_reset_device(struct zram *zram) zram->limit_pages = 0; - if (!init_done(zram)) { - up_write(&zram->init_lock); - return; - } - set_capacity_and_notify(zram->disk, 0); part_stat_set_all(zram->disk->part0, 0); From 901ce9705fbb9f330ff1f19600e5daf9770b0175 Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Mon, 9 Dec 2024 15:56:52 +0900 Subject: [PATCH 0747/1450] nilfs2: prevent use of deleted inode syzbot reported a WARNING in nilfs_rmdir. [1] Because the inode bitmap is corrupted, an inode with an inode number that should exist as a ".nilfs" file was reassigned by nilfs_mkdir for "file0", causing an inode duplication during execution. And this causes an underflow of i_nlink in rmdir operations. The inode is used twice by the same task to unmount and remove directories ".nilfs" and "file0", it trigger warning in nilfs_rmdir. Avoid to this issue, check i_nlink in nilfs_iget(), if it is 0, it means that this inode has been deleted, and iput is executed to reclaim it. [1] WARNING: CPU: 1 PID: 5824 at fs/inode.c:407 drop_nlink+0xc4/0x110 fs/inode.c:407 ... Call Trace: nilfs_rmdir+0x1b0/0x250 fs/nilfs2/namei.c:342 vfs_rmdir+0x3a3/0x510 fs/namei.c:4394 do_rmdir+0x3b5/0x580 fs/namei.c:4453 __do_sys_rmdir fs/namei.c:4472 [inline] __se_sys_rmdir fs/namei.c:4470 [inline] __x64_sys_rmdir+0x47/0x50 fs/namei.c:4470 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Link: https://lkml.kernel.org/r/20241209065759.6781-1-konishi.ryusuke@gmail.com Fixes: d25006523d0b ("nilfs2: pathname operations") Signed-off-by: Ryusuke Konishi Reported-by: syzbot+9260555647a5132edd48@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=9260555647a5132edd48 Tested-by: syzbot+9260555647a5132edd48@syzkaller.appspotmail.com Signed-off-by: Edward Adam Davis Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/inode.c | 8 +++++++- fs/nilfs2/namei.c | 5 +++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index cf9ba481ae376..b7d4105f37bf5 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -544,8 +544,14 @@ struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root, inode = nilfs_iget_locked(sb, root, ino); if (unlikely(!inode)) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + + if (!(inode->i_state & I_NEW)) { + if (!inode->i_nlink) { + iput(inode); + return ERR_PTR(-ESTALE); + } return inode; + } err = __nilfs_read_inode(sb, root, ino, inode); if (unlikely(err)) { diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 9b108052d9f71..1d836a5540f3b 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -67,6 +67,11 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) inode = NULL; } else { inode = nilfs_iget(dir->i_sb, NILFS_I(dir)->i_root, ino); + if (inode == ERR_PTR(-ESTALE)) { + nilfs_error(dir->i_sb, + "deleted inode referenced: %lu", ino); + return ERR_PTR(-EIO); + } } return d_splice_alias(inode, dentry); From 8ac662f5da19f5873fdd94c48a5cdb45b2e1b58f Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 10 Dec 2024 17:24:12 +0000 Subject: [PATCH 0748/1450] fork: avoid inappropriate uprobe access to invalid mm If dup_mmap() encounters an issue, currently uprobe is able to access the relevant mm via the reverse mapping (in build_map_info()), and if we are very unlucky with a race window, observe invalid XA_ZERO_ENTRY state which we establish as part of the fork error path. This occurs because uprobe_write_opcode() invokes anon_vma_prepare() which in turn invokes find_mergeable_anon_vma() that uses a VMA iterator, invoking vma_iter_load() which uses the advanced maple tree API and thus is able to observe XA_ZERO_ENTRY entries added to dup_mmap() in commit d24062914837 ("fork: use __mt_dup() to duplicate maple tree in dup_mmap()"). This change was made on the assumption that only process tear-down code would actually observe (and make use of) these values. However this very unlikely but still possible edge case with uprobes exists and unfortunately does make these observable. The uprobe operation prevents races against the dup_mmap() operation via the dup_mmap_sem semaphore, which is acquired via uprobe_start_dup_mmap() and dropped via uprobe_end_dup_mmap(), and held across register_for_each_vma() prior to invoking build_map_info() which does the reverse mapping lookup. Currently these are acquired and dropped within dup_mmap(), which exposes the race window prior to error handling in the invoking dup_mm() which tears down the mm. We can avoid all this by just moving the invocation of uprobe_start_dup_mmap() and uprobe_end_dup_mmap() up a level to dup_mm() and only release this lock once the dup_mmap() operation succeeds or clean up is done. This means that the uprobe code can never observe an incompletely constructed mm and resolves the issue in this case. Link: https://lkml.kernel.org/r/20241210172412.52995-1-lorenzo.stoakes@oracle.com Fixes: d24062914837 ("fork: use __mt_dup() to duplicate maple tree in dup_mmap()") Signed-off-by: Lorenzo Stoakes Reported-by: syzbot+2d788f4f7cb660dac4b7@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/6756d273.050a0220.2477f.003d.GAE@google.com/ Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Ian Rogers Cc: Ingo Molnar Cc: Jann Horn Cc: Jiri Olsa Cc: Kan Liang Cc: Liam R. Howlett Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Oleg Nesterov Cc: Peng Zhang Cc: Peter Zijlstra Cc: Vlastimil Babka Cc: David Hildenbrand Signed-off-by: Andrew Morton --- kernel/fork.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 1450b461d196a..9b301180fd416 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -639,11 +639,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, LIST_HEAD(uf); VMA_ITERATOR(vmi, mm, 0); - uprobe_start_dup_mmap(); - if (mmap_write_lock_killable(oldmm)) { - retval = -EINTR; - goto fail_uprobe_end; - } + if (mmap_write_lock_killable(oldmm)) + return -EINTR; flush_cache_dup_mm(oldmm); uprobe_dup_mmap(oldmm, mm); /* @@ -782,8 +779,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, dup_userfaultfd_complete(&uf); else dup_userfaultfd_fail(&uf); -fail_uprobe_end: - uprobe_end_dup_mmap(); return retval; fail_nomem_anon_vma_fork: @@ -1692,9 +1687,11 @@ static struct mm_struct *dup_mm(struct task_struct *tsk, if (!mm_init(mm, tsk, mm->user_ns)) goto fail_nomem; + uprobe_start_dup_mmap(); err = dup_mmap(mm, oldmm); if (err) goto free_pt; + uprobe_end_dup_mmap(); mm->hiwater_rss = get_mm_rss(mm); mm->hiwater_vm = mm->total_vm; @@ -1709,6 +1706,8 @@ static struct mm_struct *dup_mm(struct task_struct *tsk, mm->binfmt = NULL; mm_init_owner(mm, NULL); mmput(mm); + if (err) + uprobe_end_dup_mmap(); fail_nomem: return NULL; From faeec8e23c10bd30e8aa759a2eb3018dae00f924 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 10 Dec 2024 10:34:37 +0100 Subject: [PATCH 0749/1450] mm/page_alloc: don't call pfn_to_page() on possibly non-existent PFN in split_large_buddy() In split_large_buddy(), we might call pfn_to_page() on a PFN that might not exist. In corner cases, such as when freeing the highest pageblock in the last memory section, this could result with CONFIG_SPARSEMEM && !CONFIG_SPARSEMEM_EXTREME in __pfn_to_section() returning NULL and and __section_mem_map_addr() dereferencing that NULL pointer. Let's fix it, and avoid doing a pfn_to_page() call for the first iteration, where we already have the page. So far this was found by code inspection, but let's just CC stable as the fix is easy. Link: https://lkml.kernel.org/r/20241210093437.174413-1-david@redhat.com Fixes: fd919a85cd55 ("mm: page_isolation: prepare for hygienic freelists") Signed-off-by: David Hildenbrand Reported-by: Vlastimil Babka Closes: https://lkml.kernel.org/r/e1a898ba-a717-4d20-9144-29df1a6c8813@suse.cz Reviewed-by: Vlastimil Babka Reviewed-by: Zi Yan Acked-by: Johannes Weiner Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- mm/page_alloc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1cb4b8c8886d8..cae7b93864c23 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1238,13 +1238,15 @@ static void split_large_buddy(struct zone *zone, struct page *page, if (order > pageblock_order) order = pageblock_order; - while (pfn != end) { + do { int mt = get_pfnblock_migratetype(page, pfn); __free_one_page(page, pfn, zone, order, mt, fpi); pfn += 1 << order; + if (pfn == end) + break; page = pfn_to_page(pfn); - } + } while (1); } static void free_one_page(struct zone *zone, struct page *page, From a2e740e216f5bf49ccb83b6d490c72a340558a43 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Dec 2024 20:25:37 +0000 Subject: [PATCH 0750/1450] vmalloc: fix accounting with i915 If the caller of vmap() specifies VM_MAP_PUT_PAGES (currently only the i915 driver), we will decrement nr_vmalloc_pages and MEMCG_VMALLOC in vfree(). These counters are incremented by vmalloc() but not by vmap() so this will cause an underflow. Check the VM_MAP_PUT_PAGES flag before decrementing either counter. Link: https://lkml.kernel.org/r/20241211202538.168311-1-willy@infradead.org Fixes: b944afc9d64d ("mm: add a VM_MAP_PUT_PAGES flag for vmap") Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Reviewed-by: Balbir Singh Acked-by: Michal Hocko Cc: Christoph Hellwig Cc: Muchun Song Cc: Roman Gushchin Cc: "Uladzislau Rezki (Sony)" Cc: Signed-off-by: Andrew Morton --- mm/vmalloc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f009b21705c16..5c88d0e90c209 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3374,7 +3374,8 @@ void vfree(const void *addr) struct page *page = vm->pages[i]; BUG_ON(!page); - mod_memcg_page_state(page, MEMCG_VMALLOC, -1); + if (!(vm->flags & VM_MAP_PUT_PAGES)) + mod_memcg_page_state(page, MEMCG_VMALLOC, -1); /* * High-order allocs for huge vmallocs are split, so * can be freed as an array of order-0 allocations @@ -3382,7 +3383,8 @@ void vfree(const void *addr) __free_page(page); cond_resched(); } - atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages); + if (!(vm->flags & VM_MAP_PUT_PAGES)) + atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages); kvfree(vm->pages); kfree(vm); } From 6309b8ce98e9a18390b9fd8f03fc412f3c17aee9 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 13 Dec 2024 01:43:28 +0900 Subject: [PATCH 0751/1450] nilfs2: fix buffer head leaks in calls to truncate_inode_pages() When block_invalidatepage was converted to block_invalidate_folio, the fallback to block_invalidatepage in folio_invalidate() if the address_space_operations method invalidatepage (currently invalidate_folio) was not set, was removed. Unfortunately, some pseudo-inodes in nilfs2 use empty_aops set by inode_init_always_gfp() as is, or explicitly set it to address_space_operations. Therefore, with this change, block_invalidatepage() is no longer called from folio_invalidate(), and as a result, the buffer_head structures attached to these pages/folios are no longer freed via try_to_free_buffers(). Thus, these buffer heads are now leaked by truncate_inode_pages(), which cleans up the page cache from inode evict(), etc. Three types of caches use empty_aops: gc inode caches and the DAT shadow inode used by GC, and b-tree node caches. Of these, b-tree node caches explicitly call invalidate_mapping_pages() during cleanup, which involves calling try_to_free_buffers(), so the leak was not visible during normal operation but worsened when GC was performed. Fix this issue by using address_space_operations with invalidate_folio set to block_invalidate_folio instead of empty_aops, which will ensure the same behavior as before. Link: https://lkml.kernel.org/r/20241212164556.21338-1-konishi.ryusuke@gmail.com Fixes: 7ba13abbd31e ("fs: Turn block_invalidatepage into block_invalidate_folio") Signed-off-by: Ryusuke Konishi Cc: [5.18+] Signed-off-by: Andrew Morton --- fs/nilfs2/btnode.c | 1 + fs/nilfs2/gcinode.c | 2 +- fs/nilfs2/inode.c | 5 +++++ fs/nilfs2/nilfs.h | 1 + 4 files changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 501ad7be5174c..54a3fa0cf67ed 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -35,6 +35,7 @@ void nilfs_init_btnc_inode(struct inode *btnc_inode) ii->i_flags = 0; memset(&ii->i_bmap_data, 0, sizeof(struct nilfs_bmap)); mapping_set_gfp_mask(btnc_inode->i_mapping, GFP_NOFS); + btnc_inode->i_mapping->a_ops = &nilfs_buffer_cache_aops; } void nilfs_btnode_cache_clear(struct address_space *btnc) diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index ace22253fed0f..2dbb15767df16 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -163,7 +163,7 @@ int nilfs_init_gcinode(struct inode *inode) inode->i_mode = S_IFREG; mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); - inode->i_mapping->a_ops = &empty_aops; + inode->i_mapping->a_ops = &nilfs_buffer_cache_aops; ii->i_flags = 0; nilfs_bmap_init_gc(ii->i_bmap); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index b7d4105f37bf5..23f3a75edd501 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -276,6 +276,10 @@ const struct address_space_operations nilfs_aops = { .is_partially_uptodate = block_is_partially_uptodate, }; +const struct address_space_operations nilfs_buffer_cache_aops = { + .invalidate_folio = block_invalidate_folio, +}; + static int nilfs_insert_inode_locked(struct inode *inode, struct nilfs_root *root, unsigned long ino) @@ -681,6 +685,7 @@ struct inode *nilfs_iget_for_shadow(struct inode *inode) NILFS_I(s_inode)->i_flags = 0; memset(NILFS_I(s_inode)->i_bmap, 0, sizeof(struct nilfs_bmap)); mapping_set_gfp_mask(s_inode->i_mapping, GFP_NOFS); + s_inode->i_mapping->a_ops = &nilfs_buffer_cache_aops; err = nilfs_attach_btree_node_cache(s_inode); if (unlikely(err)) { diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 45d03826eaf15..dff241c53fc58 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -401,6 +401,7 @@ extern const struct file_operations nilfs_dir_operations; extern const struct inode_operations nilfs_file_inode_operations; extern const struct file_operations nilfs_file_operations; extern const struct address_space_operations nilfs_aops; +extern const struct address_space_operations nilfs_buffer_cache_aops; extern const struct inode_operations nilfs_dir_inode_operations; extern const struct inode_operations nilfs_special_inode_operations; extern const struct inode_operations nilfs_symlink_inode_operations; From 42b2eb69835b0fda797f70eb5b4fc213dbe3a7ea Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Thu, 12 Dec 2024 18:33:51 +0000 Subject: [PATCH 0752/1450] mm: convert partially_mapped set/clear operations to be atomic Other page flags in the 2nd page, like PG_hwpoison and PG_anon_exclusive can get modified concurrently. Changes to other page flags might be lost if they are happening at the same time as non-atomic partially_mapped operations. Hence, make partially_mapped operations atomic. Link: https://lkml.kernel.org/r/20241212183351.1345389-1-usamaarif642@gmail.com Fixes: 8422acdc97ed ("mm: introduce a pageflag for partially mapped folios") Reported-by: David Hildenbrand Link: https://lore.kernel.org/all/e53b04ad-1827-43a2-a1ab-864c7efecf6e@redhat.com/ Signed-off-by: Usama Arif Acked-by: David Hildenbrand Acked-by: Johannes Weiner Acked-by: Roman Gushchin Cc: Barry Song Cc: Domenico Cerasuolo Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Mike Rapoport (Microsoft) Cc: Nico Pache Cc: Rik van Riel Cc: Ryan Roberts Cc: Shakeel Butt Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 12 ++---------- mm/huge_memory.c | 8 ++++---- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index cf46ac7208029..691506bdf2c5a 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -862,18 +862,10 @@ static inline void ClearPageCompound(struct page *page) ClearPageHead(page); } FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE) -FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE) -/* - * PG_partially_mapped is protected by deferred_split split_queue_lock, - * so its safe to use non-atomic set/clear. - */ -__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE) -__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE) +FOLIO_FLAG(partially_mapped, FOLIO_SECOND_PAGE) #else FOLIO_FLAG_FALSE(large_rmappable) -FOLIO_TEST_FLAG_FALSE(partially_mapped) -__FOLIO_SET_FLAG_NOOP(partially_mapped) -__FOLIO_CLEAR_FLAG_NOOP(partially_mapped) +FOLIO_FLAG_FALSE(partially_mapped) #endif #define PG_head_mask ((1UL << PG_head)) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9bb351caa6190..df0c4988dd88b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3577,7 +3577,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, !list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; if (folio_test_partially_mapped(folio)) { - __folio_clear_partially_mapped(folio); + folio_clear_partially_mapped(folio); mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); } @@ -3689,7 +3689,7 @@ bool __folio_unqueue_deferred_split(struct folio *folio) if (!list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; if (folio_test_partially_mapped(folio)) { - __folio_clear_partially_mapped(folio); + folio_clear_partially_mapped(folio); mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); } @@ -3733,7 +3733,7 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped) spin_lock_irqsave(&ds_queue->split_queue_lock, flags); if (partially_mapped) { if (!folio_test_partially_mapped(folio)) { - __folio_set_partially_mapped(folio); + folio_set_partially_mapped(folio); if (folio_test_pmd_mappable(folio)) count_vm_event(THP_DEFERRED_SPLIT_PAGE); count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED); @@ -3826,7 +3826,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, } else { /* We lost race with folio_put() */ if (folio_test_partially_mapped(folio)) { - __folio_clear_partially_mapped(folio); + folio_clear_partially_mapped(folio); mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); } From 30c2de0a267c04046d89e678cc0067a9cfb455df Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 12 Dec 2024 13:31:26 -0800 Subject: [PATCH 0753/1450] mm/vmstat: fix a W=1 clang compiler warning Fix the following clang compiler warning that is reported if the kernel is built with W=1: ./include/linux/vmstat.h:518:36: error: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Werror,-Wenum-enum-conversion] 518 | return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_" | ~~~~~~~~~~~ ^ ~~~ Link: https://lkml.kernel.org/r/20241212213126.1269116-1-bvanassche@acm.org Fixes: 9d7ea9a297e6 ("mm/vmstat: add helpers to get vmstat item names for each enum type") Signed-off-by: Bart Van Assche Cc: Konstantin Khlebnikov Cc: Nathan Chancellor Signed-off-by: Andrew Morton --- include/linux/vmstat.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index d2761bf8ff32c..9f3a04345b860 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -515,7 +515,7 @@ static inline const char *node_stat_name(enum node_stat_item item) static inline const char *lru_list_name(enum lru_list lru) { - return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_" + return node_stat_name(NR_LRU_BASE + (enum node_stat_item)lru) + 3; // skip "nr_" } #if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG) From 640a603943a7659340c10044c0a1c98ae4e13189 Mon Sep 17 00:00:00 2001 From: David Wang <00107082@163.com> Date: Fri, 13 Dec 2024 09:33:32 +0800 Subject: [PATCH 0754/1450] mm/codetag: clear tags before swap When CONFIG_MEM_ALLOC_PROFILING_DEBUG is set, kernel WARN would be triggered when calling __alloc_tag_ref_set() during swap: alloc_tag was not cleared (got tag for mm/filemap.c:1951) WARNING: CPU: 0 PID: 816 at ./include/linux/alloc_tag.h... Clear code tags before swap can fix the warning. And this patch also fix a potential invalid address dereference in alloc_tag_add_check() when CONFIG_MEM_ALLOC_PROFILING_DEBUG is set and ref->ct is CODETAG_EMPTY, which is defined as ((void *)1). Link: https://lkml.kernel.org/r/20241213013332.89910-1-00107082@163.com Fixes: 51f43d5d82ed ("mm/codetag: swap tags when migrate pages") Signed-off-by: David Wang <00107082@163.com> Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202412112227.df61ebb-lkp@intel.com Acked-by: Suren Baghdasaryan Cc: Kent Overstreet Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- include/linux/alloc_tag.h | 2 +- lib/alloc_tag.c | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index 7c0786bdf9af0..cba024bf2db33 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -135,7 +135,7 @@ static inline struct alloc_tag_counters alloc_tag_read(struct alloc_tag *tag) #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag) { - WARN_ONCE(ref && ref->ct, + WARN_ONCE(ref && ref->ct && !is_codetag_empty(ref), "alloc_tag was not cleared (got tag for %s:%u)\n", ref->ct->filename, ref->ct->lineno); diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 35f7560a309a4..3a0413462e9fc 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -209,6 +209,13 @@ void pgalloc_tag_swap(struct folio *new, struct folio *old) return; } + /* + * Clear tag references to avoid debug warning when using + * __alloc_tag_ref_set() with non-empty reference. + */ + set_codetag_empty(&ref_old); + set_codetag_empty(&ref_new); + /* swap tags */ __alloc_tag_ref_set(&ref_old, tag_new); update_page_tag_ref(handle_old, &ref_old); From e269b5d2916d7a696c2d2ed370cea95d95a0675a Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 29 Nov 2024 16:14:22 -0800 Subject: [PATCH 0755/1450] alloc_tag: fix module allocation tags populated area calculation vm_module_tags_populate() calculation of the populated area assumes that area starts at a page boundary and therefore when new pages are allocation, the end of the area is page-aligned as well. If the start of the area is not page-aligned then allocating a page and incrementing the end of the area by PAGE_SIZE leads to an area at the end but within the area boundary which is not populated. Accessing this are will lead to a kernel panic. Fix the calculation by down-aligning the start of the area and using that as the location allocated pages are mapped to. [gehao@kylinos.cn: fix vm_module_tags_populate's KASAN poisoning logic] Link: https://lkml.kernel.org/r/20241205170528.81000-1-hao.ge@linux.dev [gehao@kylinos.cn: fix panic when CONFIG_KASAN enabled and CONFIG_KASAN_VMALLOC not enabled] Link: https://lkml.kernel.org/r/20241212072126.134572-1-hao.ge@linux.dev Link: https://lkml.kernel.org/r/20241130001423.1114965-1-surenb@google.com Fixes: 0f9b685626da ("alloc_tag: populate memory for module tags as needed") Signed-off-by: Suren Baghdasaryan Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202411132111.6a221562-lkp@intel.com Acked-by: Yu Zhao Tested-by: Adrian Huang Cc: David Wang <00107082@163.com> Cc: Kent Overstreet Cc: Mike Rapoport (Microsoft) Cc: Pasha Tatashin Cc: Sourav Panda Cc: Signed-off-by: Andrew Morton --- lib/alloc_tag.c | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 3a0413462e9fc..7dcebf118a3e6 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -408,28 +408,52 @@ static bool find_aligned_area(struct ma_state *mas, unsigned long section_size, static int vm_module_tags_populate(void) { - unsigned long phys_size = vm_module_tags->nr_pages << PAGE_SHIFT; + unsigned long phys_end = ALIGN_DOWN(module_tags.start_addr, PAGE_SIZE) + + (vm_module_tags->nr_pages << PAGE_SHIFT); + unsigned long new_end = module_tags.start_addr + module_tags.size; - if (phys_size < module_tags.size) { + if (phys_end < new_end) { struct page **next_page = vm_module_tags->pages + vm_module_tags->nr_pages; - unsigned long addr = module_tags.start_addr + phys_size; + unsigned long old_shadow_end = ALIGN(phys_end, MODULE_ALIGN); + unsigned long new_shadow_end = ALIGN(new_end, MODULE_ALIGN); unsigned long more_pages; unsigned long nr; - more_pages = ALIGN(module_tags.size - phys_size, PAGE_SIZE) >> PAGE_SHIFT; + more_pages = ALIGN(new_end - phys_end, PAGE_SIZE) >> PAGE_SHIFT; nr = alloc_pages_bulk_array_node(GFP_KERNEL | __GFP_NOWARN, NUMA_NO_NODE, more_pages, next_page); if (nr < more_pages || - vmap_pages_range(addr, addr + (nr << PAGE_SHIFT), PAGE_KERNEL, + vmap_pages_range(phys_end, phys_end + (nr << PAGE_SHIFT), PAGE_KERNEL, next_page, PAGE_SHIFT) < 0) { /* Clean up and error out */ for (int i = 0; i < nr; i++) __free_page(next_page[i]); return -ENOMEM; } + vm_module_tags->nr_pages += nr; + + /* + * Kasan allocates 1 byte of shadow for every 8 bytes of data. + * When kasan_alloc_module_shadow allocates shadow memory, + * its unit of allocation is a page. + * Therefore, here we need to align to MODULE_ALIGN. + */ + if (old_shadow_end < new_shadow_end) + kasan_alloc_module_shadow((void *)old_shadow_end, + new_shadow_end - old_shadow_end, + GFP_KERNEL); } + /* + * Mark the pages as accessible, now that they are mapped. + * With hardware tag-based KASAN, marking is skipped for + * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). + */ + kasan_unpoison_vmalloc((void *)module_tags.start_addr, + new_end - module_tags.start_addr, + KASAN_VMALLOC_PROT_NORMAL); + return 0; } From 60da7445a142bd15e67f3cda915497781c3f781f Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 29 Nov 2024 16:14:23 -0800 Subject: [PATCH 0756/1450] alloc_tag: fix set_codetag_empty() when !CONFIG_MEM_ALLOC_PROFILING_DEBUG It was recently noticed that set_codetag_empty() might be used not only to mark NULL alloctag references as empty to avoid warnings but also to reset valid tags (in clear_page_tag_ref()). Since set_codetag_empty() is defined as NOOP for CONFIG_MEM_ALLOC_PROFILING_DEBUG=n, such use of set_codetag_empty() leads to subtle bugs. Fix set_codetag_empty() for CONFIG_MEM_ALLOC_PROFILING_DEBUG=n to reset the tag reference. Link: https://lkml.kernel.org/r/20241130001423.1114965-2-surenb@google.com Fixes: a8fc28dad6d5 ("alloc_tag: introduce clear_page_tag_ref() helper function") Signed-off-by: Suren Baghdasaryan Reported-by: David Wang <00107082@163.com> Closes: https://lore.kernel.org/lkml/20241124074318.399027-1-00107082@163.com/ Cc: David Wang <00107082@163.com> Cc: Kent Overstreet Cc: Mike Rapoport (Microsoft) Cc: Pasha Tatashin Cc: Sourav Panda Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- include/linux/alloc_tag.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index cba024bf2db33..0bbbe537c5f9f 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -63,7 +63,12 @@ static inline void set_codetag_empty(union codetag_ref *ref) #else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ static inline bool is_codetag_empty(union codetag_ref *ref) { return false; } -static inline void set_codetag_empty(union codetag_ref *ref) {} + +static inline void set_codetag_empty(union codetag_ref *ref) +{ + if (ref) + ref->ct = NULL; +} #endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ From d3ac65d274b3a93cf9cf9559fd1473ab65e00e10 Mon Sep 17 00:00:00 2001 From: Leo Stone Date: Sun, 15 Dec 2024 20:27:51 -0800 Subject: [PATCH 0757/1450] mm: huge_memory: handle strsep not finding delimiter split_huge_pages_write() does not handle the case where strsep finds no delimiter in the given string and sets the input buffer to NULL, which allows this reproducer to trigger a protection fault. Link: https://lkml.kernel.org/r/20241216042752.257090-2-leocstone@gmail.com Signed-off-by: Leo Stone Reported-by: syzbot+8a3da2f1bbf59227c289@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=8a3da2f1bbf59227c289 Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index df0c4988dd88b..e53d83b3e5cf9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -4169,7 +4169,7 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, size_t input_len = strlen(input_buf); tok = strsep(&buf, ","); - if (tok) { + if (tok && buf) { strscpy(file_path, tok); } else { ret = -EINVAL; From a17975992cc11588767175247ccaae1213a8b582 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Tue, 17 Dec 2024 22:16:51 +0100 Subject: [PATCH 0758/1450] selftests: openvswitch: fix tcpdump execution Fix the way tcpdump is executed by: - Using the right variable for the namespace. Currently the use of the empty "ns" makes the command fail. - Waiting until it starts to capture to ensure the interesting traffic is caught on slow systems. - Using line-buffered output to ensure logs are available when the test is paused with "-p". Otherwise the last chunk of data might only be written when tcpdump is killed. Fixes: 74cc26f416b9 ("selftests: openvswitch: add interface support") Signed-off-by: Adrian Moreno Acked-by: Eelco Chaudron Link: https://patch.msgid.link/20241217211652.483016-1-amorenoz@redhat.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/openvswitch/openvswitch.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/openvswitch/openvswitch.sh b/tools/testing/selftests/net/openvswitch/openvswitch.sh index cc0bfae2bafa1..960e1ab4dd04b 100755 --- a/tools/testing/selftests/net/openvswitch/openvswitch.sh +++ b/tools/testing/selftests/net/openvswitch/openvswitch.sh @@ -171,8 +171,10 @@ ovs_add_netns_and_veths () { ovs_add_if "$1" "$2" "$4" -u || return 1 fi - [ $TRACING -eq 1 ] && ovs_netns_spawn_daemon "$1" "$ns" \ - tcpdump -i any -s 65535 + if [ $TRACING -eq 1 ]; then + ovs_netns_spawn_daemon "$1" "$3" tcpdump -l -i any -s 6553 + ovs_wait grep -q "listening on any" ${ovs_dir}/stderr + fi return 0 } From 16f027cd40eeedd2325f7e720689462ca8d9d13e Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 16 Dec 2024 15:50:59 +0200 Subject: [PATCH 0759/1450] net: dsa: restore dsa_software_vlan_untag() ability to operate on VLAN-untagged traffic Robert Hodaszi reports that locally terminated traffic towards VLAN-unaware bridge ports is broken with ocelot-8021q. He is describing the same symptoms as for commit 1f9fc48fd302 ("net: dsa: sja1105: fix reception from VLAN-unaware bridges"). For context, the set merged as "VLAN fixes for Ocelot driver": https://lore.kernel.org/netdev/20240815000707.2006121-1-vladimir.oltean@nxp.com/ was developed in a slightly different form earlier this year, in January. Initially, the switch was unconditionally configured to set OCELOT_ES0_TAG when using ocelot-8021q, regardless of port operating mode. This led to the situation where VLAN-unaware bridge ports would always push their PVID - see ocelot_vlan_unaware_pvid() - a negligible value anyway - into RX packets. To strip this in software, we would have needed DSA to know what private VID the switch chose for VLAN-unaware bridge ports, and pushed into the packets. This was implemented downstream, and a remnant of it remains in the form of a comment mentioning ds->ops->get_private_vid(), as something which would maybe need to be considered in the future. However, for upstream, it was deemed inappropriate, because it would mean introducing yet another behavior for stripping VLAN tags from VLAN-unaware bridge ports, when one already existed (ds->untag_bridge_pvid). The latter has been marked as obsolete along with an explanation why it is logically broken, but still, it would have been confusing. So, for upstream, felix_update_tag_8021q_rx_rule() was developed, which essentially changed the state of affairs from "Felix with ocelot-8021q delivers all packets as VLAN-tagged towards the CPU" into "Felix with ocelot-8021q delivers all packets from VLAN-aware bridge ports towards the CPU". This was done on the premise that in VLAN-unaware mode, there's nothing useful in the VLAN tags, and we can avoid introducing ds->ops->get_private_vid() in the DSA receive path if we configure the switch to not push those VLAN tags into packets in the first place. Unfortunately, and this is when the trainwreck started, the selftests developed initially and posted with the series were not re-ran. dsa_software_vlan_untag() was initially written given the assumption that users of this feature would send _all_ traffic as VLAN-tagged. It was only partially adapted to the new scheme, by removing ds->ops->get_private_vid(), which also used to be necessary in standalone ports mode. Where the trainwreck became even worse is that I had a second opportunity to think about this, when the dsa_software_vlan_untag() logic change initially broke sja1105, in commit 1f9fc48fd302 ("net: dsa: sja1105: fix reception from VLAN-unaware bridges"). I did not connect the dots that it also breaks ocelot-8021q, for pretty much the same reason that not all received packets will be VLAN-tagged. To be compatible with the optimized Felix control path which runs felix_update_tag_8021q_rx_rule() to only push VLAN tags when useful (in VLAN-aware mode), we need to restore the old dsa_software_vlan_untag() logic. The blamed commit introduced the assumption that dsa_software_vlan_untag() will see only VLAN-tagged packets, assumption which is false. What corrupts RX traffic is the fact that we call skb_vlan_untag() on packets which are not VLAN-tagged in the first place. Fixes: 93e4649efa96 ("net: dsa: provide a software untagging function on RX for VLAN-aware bridges") Reported-by: Robert Hodaszi Closes: https://lore.kernel.org/netdev/20241215163334.615427-1-robert.hodaszi@digi.com/ Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20241216135059.1258266-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/dsa/tag.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/net/dsa/tag.h b/net/dsa/tag.h index d5707870906bc..5d80ddad4ff6b 100644 --- a/net/dsa/tag.h +++ b/net/dsa/tag.h @@ -138,9 +138,10 @@ static inline void dsa_software_untag_vlan_unaware_bridge(struct sk_buff *skb, * dsa_software_vlan_untag: Software VLAN untagging in DSA receive path * @skb: Pointer to socket buffer (packet) * - * Receive path method for switches which cannot avoid tagging all packets - * towards the CPU port. Called when ds->untag_bridge_pvid (legacy) or - * ds->untag_vlan_aware_bridge_pvid is set to true. + * Receive path method for switches which send some packets as VLAN-tagged + * towards the CPU port (generally from VLAN-aware bridge ports) even when the + * packet was not tagged on the wire. Called when ds->untag_bridge_pvid + * (legacy) or ds->untag_vlan_aware_bridge_pvid is set to true. * * As a side effect of this method, any VLAN tag from the skb head is moved * to hwaccel. @@ -149,14 +150,19 @@ static inline struct sk_buff *dsa_software_vlan_untag(struct sk_buff *skb) { struct dsa_port *dp = dsa_user_to_port(skb->dev); struct net_device *br = dsa_port_bridge_dev_get(dp); - u16 vid; + u16 vid, proto; + int err; /* software untagging for standalone ports not yet necessary */ if (!br) return skb; + err = br_vlan_get_proto(br, &proto); + if (err) + return skb; + /* Move VLAN tag from data to hwaccel */ - if (!skb_vlan_tag_present(skb)) { + if (!skb_vlan_tag_present(skb) && skb->protocol == htons(proto)) { skb = skb_vlan_untag(skb); if (!skb) return NULL; From 5eb70dbebf32c2fd1f2814c654ae17fc47d6e859 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 17 Dec 2024 18:25:08 -0800 Subject: [PATCH 0760/1450] netdev-genl: avoid empty messages in queue dump Empty netlink responses from do() are not correct (as opposed to dump() where not dumping anything is perfectly fine). We should return an error if the target object does not exist, in this case if the netdev is down it has no queues. Fixes: 6b6171db7fc8 ("netdev-genl: Add netlink framework functions for queue") Reported-by: syzbot+0a884bc2d304ce4af70f@syzkaller.appspotmail.com Reviewed-by: Eric Dumazet Reviewed-by: Joe Damato Link: https://patch.msgid.link/20241218022508.815344-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/netdev-genl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 1be8c7c21d19d..2d3ae0cd3ad25 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -430,10 +430,10 @@ static int netdev_nl_queue_fill(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { - int err = 0; + int err; if (!(netdev->flags & IFF_UP)) - return err; + return -ENOENT; err = netdev_nl_queue_validate(netdev, q_idx, q_type); if (err) From 5eecd85c77a254a43bde3212da8047b001745c9f Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Tue, 17 Dec 2024 12:37:39 +0100 Subject: [PATCH 0761/1450] psample: adjust size if rate_as_probability is set If PSAMPLE_ATTR_SAMPLE_PROBABILITY flag is to be sent, the available size for the packet data has to be adjusted accordingly. Also, check the error code returned by nla_put_flag. Fixes: 7b1b2b60c63f ("net: psample: allow using rate as probability") Signed-off-by: Adrian Moreno Reviewed-by: Aaron Conole Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20241217113739.3929300-1-amorenoz@redhat.com Signed-off-by: Jakub Kicinski --- net/psample/psample.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/psample/psample.c b/net/psample/psample.c index a0ddae8a65f91..25f92ba0840c6 100644 --- a/net/psample/psample.c +++ b/net/psample/psample.c @@ -393,7 +393,9 @@ void psample_sample_packet(struct psample_group *group, nla_total_size_64bit(sizeof(u64)) + /* timestamp */ nla_total_size(sizeof(u16)) + /* protocol */ (md->user_cookie_len ? - nla_total_size(md->user_cookie_len) : 0); /* user cookie */ + nla_total_size(md->user_cookie_len) : 0) + /* user cookie */ + (md->rate_as_probability ? + nla_total_size(0) : 0); /* rate as probability */ #ifdef CONFIG_INET tun_info = skb_tunnel_info(skb); @@ -498,8 +500,9 @@ void psample_sample_packet(struct psample_group *group, md->user_cookie)) goto error; - if (md->rate_as_probability) - nla_put_flag(nl_skb, PSAMPLE_ATTR_SAMPLE_PROBABILITY); + if (md->rate_as_probability && + nla_put_flag(nl_skb, PSAMPLE_ATTR_SAMPLE_PROBABILITY)) + goto error; genlmsg_end(nl_skb, data); genlmsg_multicast_netns(&psample_nl_family, group->net, nl_skb, 0, From 51df947678360faf1967fe0bd1a40c681f634104 Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Mon, 16 Dec 2024 21:23:24 -0800 Subject: [PATCH 0762/1450] octeontx2-pf: fix netdev memory leak in rvu_rep_create() When rvu_rep_devlink_port_register() fails, free_netdev(ndev) for this incomplete iteration before going to "exit:" label. Fixes: 9ed0343f561e ("octeontx2-pf: Add devlink port support") Reviewed-by: Przemek Kitszel Signed-off-by: Harshit Mogalapalli Link: https://patch.msgid.link/20241217052326.1086191-1-harshit.m.mogalapalli@oracle.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/nic/rep.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/rep.c b/drivers/net/ethernet/marvell/octeontx2/nic/rep.c index 232b10740c130..9e3fcbae5dee7 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/rep.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/rep.c @@ -680,8 +680,10 @@ int rvu_rep_create(struct otx2_nic *priv, struct netlink_ext_ack *extack) ndev->features |= ndev->hw_features; eth_hw_addr_random(ndev); err = rvu_rep_devlink_port_register(rep); - if (err) + if (err) { + free_netdev(ndev); goto exit; + } SET_NETDEV_DEVLINK_PORT(ndev, &rep->dl_port); err = register_netdev(ndev); From b95c8c33ae687fcd3007cefa93907a6bd270119b Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Mon, 16 Dec 2024 21:23:25 -0800 Subject: [PATCH 0763/1450] octeontx2-pf: fix error handling of devlink port in rvu_rep_create() Unregister the devlink port when register_netdev() fails. Fixes: 9ed0343f561e ("octeontx2-pf: Add devlink port support") Reviewed-by: Przemek Kitszel Signed-off-by: Harshit Mogalapalli Link: https://patch.msgid.link/20241217052326.1086191-2-harshit.m.mogalapalli@oracle.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/nic/rep.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/rep.c b/drivers/net/ethernet/marvell/octeontx2/nic/rep.c index 9e3fcbae5dee7..04e08e06f30ff 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/rep.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/rep.c @@ -690,6 +690,7 @@ int rvu_rep_create(struct otx2_nic *priv, struct netlink_ext_ack *extack) if (err) { NL_SET_ERR_MSG_MOD(extack, "PFVF representor registration failed"); + rvu_rep_devlink_port_unregister(rep); free_netdev(ndev); goto exit; } From 206112fa65790221ca3ebbc43092911bb8836a19 Mon Sep 17 00:00:00 2001 From: Nikita Yushchenko Date: Mon, 16 Dec 2024 12:19:53 +0500 Subject: [PATCH 0764/1450] net: renesas: rswitch: do not write to MPSM register at init time MPSM register is used to execute mdio bus transactions. There is no need to initialize it early. Signed-off-by: Nikita Yushchenko Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Link: https://patch.msgid.link/20241216071957.2587354-2-nikita.yoush@cogentembedded.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/renesas/rswitch.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/renesas/rswitch.c b/drivers/net/ethernet/renesas/rswitch.c index 5fc8c94d1e4bf..12efee9f75d82 100644 --- a/drivers/net/ethernet/renesas/rswitch.c +++ b/drivers/net/ethernet/renesas/rswitch.c @@ -1166,7 +1166,6 @@ static void rswitch_etha_enable_mii(struct rswitch_etha *etha) { rswitch_modify(etha->addr, MPIC, MPIC_PSMCS_MASK | MPIC_PSMHT_MASK, MPIC_PSMCS(etha->psmcs) | MPIC_PSMHT(0x06)); - rswitch_modify(etha->addr, MPSM, 0, MPSM_MFF_C45); } static int rswitch_etha_hw_init(struct rswitch_etha *etha, const u8 *mac) From da75ba93e3383fc10af71e5029b5a57378a57576 Mon Sep 17 00:00:00 2001 From: Nikita Yushchenko Date: Mon, 16 Dec 2024 12:19:54 +0500 Subject: [PATCH 0765/1450] net: renesas: rswitch: use FIELD_PREP for remaining MPIC register fields Commit fb9e6039c325 ("net: renesas: rswitch: fix initial MPIC register setting") converted setting some MPIC fields to FIELD_PREP. To keep common style, do the same with mii bus related fields of the same register. Signed-off-by: Nikita Yushchenko Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Link: https://patch.msgid.link/20241216071957.2587354-3-nikita.yoush@cogentembedded.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/renesas/rswitch.c | 5 +++-- drivers/net/ethernet/renesas/rswitch.h | 10 ++-------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/renesas/rswitch.c b/drivers/net/ethernet/renesas/rswitch.c index 12efee9f75d82..e1541a206687c 100644 --- a/drivers/net/ethernet/renesas/rswitch.c +++ b/drivers/net/ethernet/renesas/rswitch.c @@ -1164,8 +1164,9 @@ static void rswitch_rmac_setting(struct rswitch_etha *etha, const u8 *mac) static void rswitch_etha_enable_mii(struct rswitch_etha *etha) { - rswitch_modify(etha->addr, MPIC, MPIC_PSMCS_MASK | MPIC_PSMHT_MASK, - MPIC_PSMCS(etha->psmcs) | MPIC_PSMHT(0x06)); + rswitch_modify(etha->addr, MPIC, MPIC_PSMCS | MPIC_PSMHT, + FIELD_PREP(MPIC_PSMCS, etha->psmcs) | + FIELD_PREP(MPIC_PSMHT, 0x06)); } static int rswitch_etha_hw_init(struct rswitch_etha *etha, const u8 *mac) diff --git a/drivers/net/ethernet/renesas/rswitch.h b/drivers/net/ethernet/renesas/rswitch.h index 4b14891003304..78c0325cdf306 100644 --- a/drivers/net/ethernet/renesas/rswitch.h +++ b/drivers/net/ethernet/renesas/rswitch.h @@ -732,6 +732,8 @@ enum rswitch_etha_mode { #define MPIC_LSC_100M 1 #define MPIC_LSC_1G 2 #define MPIC_LSC_2_5G 3 +#define MPIC_PSMCS GENMASK(22, 16) +#define MPIC_PSMHT GENMASK(26, 24) #define MDIO_READ_C45 0x03 #define MDIO_WRITE_C45 0x01 @@ -747,14 +749,6 @@ enum rswitch_etha_mode { #define MMIS1_PRACS BIT(0) /* Read */ #define MMIS1_CLEAR_FLAGS 0xf -#define MPIC_PSMCS_SHIFT 16 -#define MPIC_PSMCS_MASK GENMASK(22, MPIC_PSMCS_SHIFT) -#define MPIC_PSMCS(val) ((val) << MPIC_PSMCS_SHIFT) - -#define MPIC_PSMHT_SHIFT 24 -#define MPIC_PSMHT_MASK GENMASK(26, MPIC_PSMHT_SHIFT) -#define MPIC_PSMHT(val) ((val) << MPIC_PSMHT_SHIFT) - #define MLVC_PLV BIT(16) /* GWCA */ From 1ced1b8cacf396d6ff979f594ba40ace42087797 Mon Sep 17 00:00:00 2001 From: Nikita Yushchenko Date: Mon, 16 Dec 2024 12:19:55 +0500 Subject: [PATCH 0766/1450] net: renesas: rswitch: align mdio C45 operations with datasheet Per rswitch datasheet, software can know that mdio operation completed either by polling MPSM.PSME bit, or via interrupt. Instead, the driver currently polls for interrupt status bit. Although this still provides correct result, it requires additional register operations to clean the interrupt status bits, and generally looks wrong. Fix it to poll MPSM.PSME bit, as the datasheet suggests. Signed-off-by: Nikita Yushchenko Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Link: https://patch.msgid.link/20241216071957.2587354-4-nikita.yoush@cogentembedded.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/renesas/rswitch.c | 12 +++--------- drivers/net/ethernet/renesas/rswitch.h | 6 ------ 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/renesas/rswitch.c b/drivers/net/ethernet/renesas/rswitch.c index e1541a206687c..6e3f162ae3b3d 100644 --- a/drivers/net/ethernet/renesas/rswitch.c +++ b/drivers/net/ethernet/renesas/rswitch.c @@ -1205,32 +1205,26 @@ static int rswitch_etha_set_access(struct rswitch_etha *etha, bool read, if (devad == 0xffffffff) return -ENODEV; - writel(MMIS1_CLEAR_FLAGS, etha->addr + MMIS1); - val = MPSM_PSME | MPSM_MFF_C45; iowrite32((regad << 16) | (devad << 8) | (phyad << 3) | val, etha->addr + MPSM); - ret = rswitch_reg_wait(etha->addr, MMIS1, MMIS1_PAACS, MMIS1_PAACS); + ret = rswitch_reg_wait(etha->addr, MPSM, MPSM_PSME, 0); if (ret) return ret; - rswitch_modify(etha->addr, MMIS1, MMIS1_PAACS, MMIS1_PAACS); - if (read) { writel((pop << 13) | (devad << 8) | (phyad << 3) | val, etha->addr + MPSM); - ret = rswitch_reg_wait(etha->addr, MMIS1, MMIS1_PRACS, MMIS1_PRACS); + ret = rswitch_reg_wait(etha->addr, MPSM, MPSM_PSME, 0); if (ret) return ret; ret = (ioread32(etha->addr + MPSM) & MPSM_PRD_MASK) >> 16; - - rswitch_modify(etha->addr, MMIS1, MMIS1_PRACS, MMIS1_PRACS); } else { iowrite32((data << 16) | (pop << 13) | (devad << 8) | (phyad << 3) | val, etha->addr + MPSM); - ret = rswitch_reg_wait(etha->addr, MMIS1, MMIS1_PWACS, MMIS1_PWACS); + ret = rswitch_reg_wait(etha->addr, MPSM, MPSM_PSME, 0); } return ret; diff --git a/drivers/net/ethernet/renesas/rswitch.h b/drivers/net/ethernet/renesas/rswitch.h index 78c0325cdf306..2cb66f3f47165 100644 --- a/drivers/net/ethernet/renesas/rswitch.h +++ b/drivers/net/ethernet/renesas/rswitch.h @@ -743,12 +743,6 @@ enum rswitch_etha_mode { #define MPSM_PRD_SHIFT 16 #define MPSM_PRD_MASK GENMASK(31, MPSM_PRD_SHIFT) -/* Completion flags */ -#define MMIS1_PAACS BIT(2) /* Address */ -#define MMIS1_PWACS BIT(1) /* Write */ -#define MMIS1_PRACS BIT(0) /* Read */ -#define MMIS1_CLEAR_FLAGS 0xf - #define MLVC_PLV BIT(16) /* GWCA */ From 2aa722b6d81c3118d33dcb8eea9aac49f56af790 Mon Sep 17 00:00:00 2001 From: Nikita Yushchenko Date: Mon, 16 Dec 2024 12:19:56 +0500 Subject: [PATCH 0767/1450] net: renesas: rswitch: use generic MPSM operation for mdio C45 Introduce rswitch_etha_mpsm_op() that accepts values for MPSM register fields and executes the transaction. This avoids some code duptication, and can be used both for C45 and C22. Convert C45 read and write operations to use that. Signed-off-by: Nikita Yushchenko Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Link: https://patch.msgid.link/20241216071957.2587354-5-nikita.yoush@cogentembedded.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/renesas/rswitch.c | 51 +++++++++++++++----------- drivers/net/ethernet/renesas/rswitch.h | 17 ++++++--- 2 files changed, 40 insertions(+), 28 deletions(-) diff --git a/drivers/net/ethernet/renesas/rswitch.c b/drivers/net/ethernet/renesas/rswitch.c index 6e3f162ae3b3d..a3ba2a91c0abe 100644 --- a/drivers/net/ethernet/renesas/rswitch.c +++ b/drivers/net/ethernet/renesas/rswitch.c @@ -1195,36 +1195,29 @@ static int rswitch_etha_hw_init(struct rswitch_etha *etha, const u8 *mac) return rswitch_etha_change_mode(etha, EAMC_OPC_OPERATION); } -static int rswitch_etha_set_access(struct rswitch_etha *etha, bool read, - int phyad, int devad, int regad, int data) +static int rswitch_etha_mpsm_op(struct rswitch_etha *etha, bool read, + unsigned int mmf, unsigned int pda, + unsigned int pra, unsigned int pop, + unsigned int prd) { - int pop = read ? MDIO_READ_C45 : MDIO_WRITE_C45; u32 val; int ret; - if (devad == 0xffffffff) - return -ENODEV; - - val = MPSM_PSME | MPSM_MFF_C45; - iowrite32((regad << 16) | (devad << 8) | (phyad << 3) | val, etha->addr + MPSM); + val = MPSM_PSME | + FIELD_PREP(MPSM_MFF, mmf) | + FIELD_PREP(MPSM_PDA, pda) | + FIELD_PREP(MPSM_PRA, pra) | + FIELD_PREP(MPSM_POP, pop) | + FIELD_PREP(MPSM_PRD, prd); + iowrite32(val, etha->addr + MPSM); ret = rswitch_reg_wait(etha->addr, MPSM, MPSM_PSME, 0); if (ret) return ret; if (read) { - writel((pop << 13) | (devad << 8) | (phyad << 3) | val, etha->addr + MPSM); - - ret = rswitch_reg_wait(etha->addr, MPSM, MPSM_PSME, 0); - if (ret) - return ret; - - ret = (ioread32(etha->addr + MPSM) & MPSM_PRD_MASK) >> 16; - } else { - iowrite32((data << 16) | (pop << 13) | (devad << 8) | (phyad << 3) | val, - etha->addr + MPSM); - - ret = rswitch_reg_wait(etha->addr, MPSM, MPSM_PSME, 0); + val = ioread32(etha->addr + MPSM); + ret = FIELD_GET(MPSM_PRD, val); } return ret; @@ -1234,16 +1227,30 @@ static int rswitch_etha_mii_read_c45(struct mii_bus *bus, int addr, int devad, int regad) { struct rswitch_etha *etha = bus->priv; + int ret; - return rswitch_etha_set_access(etha, true, addr, devad, regad, 0); + ret = rswitch_etha_mpsm_op(etha, false, MPSM_MMF_C45, addr, devad, + MPSM_POP_ADDRESS, regad); + if (ret) + return ret; + + return rswitch_etha_mpsm_op(etha, true, MPSM_MMF_C45, addr, devad, + MPSM_POP_READ_C45, 0); } static int rswitch_etha_mii_write_c45(struct mii_bus *bus, int addr, int devad, int regad, u16 val) { struct rswitch_etha *etha = bus->priv; + int ret; + + ret = rswitch_etha_mpsm_op(etha, false, MPSM_MMF_C45, addr, devad, + MPSM_POP_ADDRESS, regad); + if (ret) + return ret; - return rswitch_etha_set_access(etha, false, addr, devad, regad, val); + return rswitch_etha_mpsm_op(etha, false, MPSM_MMF_C45, addr, devad, + MPSM_POP_WRITE, val); } /* Call of_node_put(port) after done */ diff --git a/drivers/net/ethernet/renesas/rswitch.h b/drivers/net/ethernet/renesas/rswitch.h index 2cb66f3f47165..35ee73766396e 100644 --- a/drivers/net/ethernet/renesas/rswitch.h +++ b/drivers/net/ethernet/renesas/rswitch.h @@ -735,13 +735,18 @@ enum rswitch_etha_mode { #define MPIC_PSMCS GENMASK(22, 16) #define MPIC_PSMHT GENMASK(26, 24) -#define MDIO_READ_C45 0x03 -#define MDIO_WRITE_C45 0x01 - #define MPSM_PSME BIT(0) -#define MPSM_MFF_C45 BIT(2) -#define MPSM_PRD_SHIFT 16 -#define MPSM_PRD_MASK GENMASK(31, MPSM_PRD_SHIFT) +#define MPSM_MFF BIT(2) +#define MPSM_MMF_C22 0 +#define MPSM_MMF_C45 1 +#define MPSM_PDA GENMASK(7, 3) +#define MPSM_PRA GENMASK(12, 8) +#define MPSM_POP GENMASK(14, 13) +#define MPSM_POP_ADDRESS 0 +#define MPSM_POP_WRITE 1 +#define MPSM_POP_READ_C22 2 +#define MPSM_POP_READ_C45 3 +#define MPSM_PRD GENMASK(31, 16) #define MLVC_PLV BIT(16) From db48fe905d8ae90d0c35238ddd90e816d543316c Mon Sep 17 00:00:00 2001 From: Nikita Yushchenko Date: Mon, 16 Dec 2024 12:19:57 +0500 Subject: [PATCH 0768/1450] net: renesas: rswitch: add mdio C22 support The generic MPSM operation added by the previous patch can be used both for C45 and C22. Add handlers for C22 operations. Signed-off-by: Nikita Yushchenko Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Link: https://patch.msgid.link/20241216071957.2587354-6-nikita.yoush@cogentembedded.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/renesas/rswitch.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/net/ethernet/renesas/rswitch.c b/drivers/net/ethernet/renesas/rswitch.c index a3ba2a91c0abe..aae26098bc0c0 100644 --- a/drivers/net/ethernet/renesas/rswitch.c +++ b/drivers/net/ethernet/renesas/rswitch.c @@ -1253,6 +1253,23 @@ static int rswitch_etha_mii_write_c45(struct mii_bus *bus, int addr, int devad, MPSM_POP_WRITE, val); } +static int rswitch_etha_mii_read_c22(struct mii_bus *bus, int phyad, int regad) +{ + struct rswitch_etha *etha = bus->priv; + + return rswitch_etha_mpsm_op(etha, true, MPSM_MMF_C22, phyad, regad, + MPSM_POP_READ_C22, 0); +} + +static int rswitch_etha_mii_write_c22(struct mii_bus *bus, int phyad, + int regad, u16 val) +{ + struct rswitch_etha *etha = bus->priv; + + return rswitch_etha_mpsm_op(etha, false, MPSM_MMF_C22, phyad, regad, + MPSM_POP_WRITE, val); +} + /* Call of_node_put(port) after done */ static struct device_node *rswitch_get_port_node(struct rswitch_device *rdev) { @@ -1335,6 +1352,8 @@ static int rswitch_mii_register(struct rswitch_device *rdev) mii_bus->priv = rdev->etha; mii_bus->read_c45 = rswitch_etha_mii_read_c45; mii_bus->write_c45 = rswitch_etha_mii_write_c45; + mii_bus->read = rswitch_etha_mii_read_c22; + mii_bus->write = rswitch_etha_mii_write_c22; mii_bus->parent = &rdev->priv->pdev->dev; mdio_np = of_get_child_by_name(rdev->np_port, "mdio"); From 13a6691910cc23ea9ba4066e098603088673d5b0 Mon Sep 17 00:00:00 2001 From: Chiara Meiohas Date: Tue, 10 Dec 2024 09:33:10 +0200 Subject: [PATCH 0769/1450] RDMA/nldev: Set error code in rdma_nl_notify_event In case of error set the error code before the goto. Fixes: 6ff57a2ea7c2 ("RDMA/nldev: Fix NULL pointer dereferences issue in rdma_nl_notify_event") Reported-by: Dan Carpenter Closes: https://lore.kernel.org/linux-rdma/a84a2fc3-33b6-46da-a1bd-3343fa07eaf9@stanley.mountain/ Signed-off-by: Chiara Meiohas Reviewed-by: Maher Sanalla Link: https://patch.msgid.link/13eb25961923f5de9eb9ecbbc94e26113d6049ef.1733815944.git.leonro@nvidia.com Reviewed-by: Kalesh AP Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/nldev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index ff121e59b9c0e..cb987ab0177c8 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -2833,8 +2833,8 @@ int rdma_nl_notify_event(struct ib_device *device, u32 port_num, enum rdma_nl_notify_event_type type) { struct sk_buff *skb; + int ret = -EMSGSIZE; struct net *net; - int ret = 0; void *nlh; net = read_pnet(&device->coredev.rdma_net); From 16b87037b48889d21854c8e97aec8a1baf2642b3 Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Thu, 12 Dec 2024 16:18:48 +0100 Subject: [PATCH 0770/1450] RDMA/siw: Remove direct link to net_device Do not manage a per device direct link to net_device. Rely on associated ib_devices net_device management, not doubling the effort locally. A badly managed local link to net_device was causing a 'KASAN: slab-use-after-free' exception during siw_query_port() call. Fixes: bdcf26bf9b3a ("rdma/siw: network and RDMA core interface") Reported-by: syzbot+4b87489410b4efd181bf@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=4b87489410b4efd181bf Signed-off-by: Bernard Metzler Link: https://patch.msgid.link/20241212151848.564872-1-bmt@zurich.ibm.com Reviewed-by: Zhu Yanjun Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/siw/siw.h | 7 +++--- drivers/infiniband/sw/siw/siw_cm.c | 27 ++++++++++++++++----- drivers/infiniband/sw/siw/siw_main.c | 15 +----------- drivers/infiniband/sw/siw/siw_verbs.c | 35 ++++++++++++++++++--------- 4 files changed, 49 insertions(+), 35 deletions(-) diff --git a/drivers/infiniband/sw/siw/siw.h b/drivers/infiniband/sw/siw/siw.h index 86d4d6a2170e1..ea5eee50dc39d 100644 --- a/drivers/infiniband/sw/siw/siw.h +++ b/drivers/infiniband/sw/siw/siw.h @@ -46,6 +46,9 @@ */ #define SIW_IRQ_MAXBURST_SQ_ACTIVE 4 +/* There is always only a port 1 per siw device */ +#define SIW_PORT 1 + struct siw_dev_cap { int max_qp; int max_qp_wr; @@ -69,16 +72,12 @@ struct siw_pd { struct siw_device { struct ib_device base_dev; - struct net_device *netdev; struct siw_dev_cap attrs; u32 vendor_part_id; int numa_node; char raw_gid[ETH_ALEN]; - /* physical port state (only one port per device) */ - enum ib_port_state state; - spinlock_t lock; struct xarray qp_xa; diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c index 86323918a570e..708b13993fdfd 100644 --- a/drivers/infiniband/sw/siw/siw_cm.c +++ b/drivers/infiniband/sw/siw/siw_cm.c @@ -1759,6 +1759,7 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) { struct socket *s; struct siw_cep *cep = NULL; + struct net_device *ndev = NULL; struct siw_device *sdev = to_siw_dev(id->device); int addr_family = id->local_addr.ss_family; int rv = 0; @@ -1779,9 +1780,15 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) struct sockaddr_in *laddr = &to_sockaddr_in(id->local_addr); /* For wildcard addr, limit binding to current device only */ - if (ipv4_is_zeronet(laddr->sin_addr.s_addr)) - s->sk->sk_bound_dev_if = sdev->netdev->ifindex; - + if (ipv4_is_zeronet(laddr->sin_addr.s_addr)) { + ndev = ib_device_get_netdev(id->device, SIW_PORT); + if (ndev) { + s->sk->sk_bound_dev_if = ndev->ifindex; + } else { + rv = -ENODEV; + goto error; + } + } rv = s->ops->bind(s, (struct sockaddr *)laddr, sizeof(struct sockaddr_in)); } else { @@ -1797,9 +1804,15 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) } /* For wildcard addr, limit binding to current device only */ - if (ipv6_addr_any(&laddr->sin6_addr)) - s->sk->sk_bound_dev_if = sdev->netdev->ifindex; - + if (ipv6_addr_any(&laddr->sin6_addr)) { + ndev = ib_device_get_netdev(id->device, SIW_PORT); + if (ndev) { + s->sk->sk_bound_dev_if = ndev->ifindex; + } else { + rv = -ENODEV; + goto error; + } + } rv = s->ops->bind(s, (struct sockaddr *)laddr, sizeof(struct sockaddr_in6)); } @@ -1860,6 +1873,7 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) } list_add_tail(&cep->listenq, (struct list_head *)id->provider_data); cep->state = SIW_EPSTATE_LISTENING; + dev_put(ndev); siw_dbg(id->device, "Listen at laddr %pISp\n", &id->local_addr); @@ -1879,6 +1893,7 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) siw_cep_set_free_and_put(cep); } sock_release(s); + dev_put(ndev); return rv; } diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c index 17abef48abcd2..14d3103aee6f8 100644 --- a/drivers/infiniband/sw/siw/siw_main.c +++ b/drivers/infiniband/sw/siw/siw_main.c @@ -287,7 +287,6 @@ static struct siw_device *siw_device_create(struct net_device *netdev) return NULL; base_dev = &sdev->base_dev; - sdev->netdev = netdev; if (netdev->addr_len) { memcpy(sdev->raw_gid, netdev->dev_addr, @@ -381,12 +380,10 @@ static int siw_netdev_event(struct notifier_block *nb, unsigned long event, switch (event) { case NETDEV_UP: - sdev->state = IB_PORT_ACTIVE; siw_port_event(sdev, 1, IB_EVENT_PORT_ACTIVE); break; case NETDEV_DOWN: - sdev->state = IB_PORT_DOWN; siw_port_event(sdev, 1, IB_EVENT_PORT_ERR); break; @@ -407,12 +404,8 @@ static int siw_netdev_event(struct notifier_block *nb, unsigned long event, siw_port_event(sdev, 1, IB_EVENT_LID_CHANGE); break; /* - * Todo: Below netdev events are currently not handled. + * All other events are not handled */ - case NETDEV_CHANGEMTU: - case NETDEV_CHANGE: - break; - default: break; } @@ -442,12 +435,6 @@ static int siw_newlink(const char *basedev_name, struct net_device *netdev) sdev = siw_device_create(netdev); if (sdev) { dev_dbg(&netdev->dev, "siw: new device\n"); - - if (netif_running(netdev) && netif_carrier_ok(netdev)) - sdev->state = IB_PORT_ACTIVE; - else - sdev->state = IB_PORT_DOWN; - ib_mark_name_assigned_by_user(&sdev->base_dev); rv = siw_device_register(sdev, basedev_name); if (rv) diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c index 986666c19378a..7ca0297d68a4a 100644 --- a/drivers/infiniband/sw/siw/siw_verbs.c +++ b/drivers/infiniband/sw/siw/siw_verbs.c @@ -171,21 +171,29 @@ int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr, int siw_query_port(struct ib_device *base_dev, u32 port, struct ib_port_attr *attr) { - struct siw_device *sdev = to_siw_dev(base_dev); + struct net_device *ndev; int rv; memset(attr, 0, sizeof(*attr)); rv = ib_get_eth_speed(base_dev, port, &attr->active_speed, &attr->active_width); + if (rv) + return rv; + + ndev = ib_device_get_netdev(base_dev, SIW_PORT); + if (!ndev) + return -ENODEV; + attr->gid_tbl_len = 1; attr->max_msg_sz = -1; - attr->max_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu); - attr->active_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu); - attr->phys_state = sdev->state == IB_PORT_ACTIVE ? + attr->max_mtu = ib_mtu_int_to_enum(ndev->max_mtu); + attr->active_mtu = ib_mtu_int_to_enum(READ_ONCE(ndev->mtu)); + attr->phys_state = (netif_running(ndev) && netif_carrier_ok(ndev)) ? IB_PORT_PHYS_STATE_LINK_UP : IB_PORT_PHYS_STATE_DISABLED; + attr->state = attr->phys_state == IB_PORT_PHYS_STATE_LINK_UP ? + IB_PORT_ACTIVE : IB_PORT_DOWN; attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP; - attr->state = sdev->state; /* * All zero * @@ -199,6 +207,7 @@ int siw_query_port(struct ib_device *base_dev, u32 port, * attr->subnet_timeout = 0; * attr->init_type_repy = 0; */ + dev_put(ndev); return rv; } @@ -505,21 +514,24 @@ int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) { struct siw_qp *qp; - struct siw_device *sdev; + struct net_device *ndev; - if (base_qp && qp_attr && qp_init_attr) { + if (base_qp && qp_attr && qp_init_attr) qp = to_siw_qp(base_qp); - sdev = to_siw_dev(base_qp->device); - } else { + else return -EINVAL; - } + + ndev = ib_device_get_netdev(base_qp->device, SIW_PORT); + if (!ndev) + return -ENODEV; + qp_attr->qp_state = siw_qp_state_to_ib_qp_state[qp->attrs.state]; qp_attr->cap.max_inline_data = SIW_MAX_INLINE; qp_attr->cap.max_send_wr = qp->attrs.sq_size; qp_attr->cap.max_send_sge = qp->attrs.sq_max_sges; qp_attr->cap.max_recv_wr = qp->attrs.rq_size; qp_attr->cap.max_recv_sge = qp->attrs.rq_max_sges; - qp_attr->path_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu); + qp_attr->path_mtu = ib_mtu_int_to_enum(READ_ONCE(ndev->mtu)); qp_attr->max_rd_atomic = qp->attrs.irq_size; qp_attr->max_dest_rd_atomic = qp->attrs.orq_size; @@ -534,6 +546,7 @@ int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr, qp_init_attr->cap = qp_attr->cap; + dev_put(ndev); return 0; } From 572af9f284669d31d9175122bbef9bc62cea8ded Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Wed, 18 Dec 2024 12:51:06 +0900 Subject: [PATCH 0771/1450] net: mdiobus: fix an OF node reference leak fwnode_find_mii_timestamper() calls of_parse_phandle_with_fixed_args() but does not decrement the refcount of the obtained OF node. Add an of_node_put() call before returning from the function. This bug was detected by an experimental static analysis tool that I am developing. Fixes: bc1bee3b87ee ("net: mdiobus: Introduce fwnode_mdiobus_register_phy()") Signed-off-by: Joe Hattori Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241218035106.1436405-1-joe@pf.is.s.u-tokyo.ac.jp Signed-off-by: Paolo Abeni --- drivers/net/mdio/fwnode_mdio.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/net/mdio/fwnode_mdio.c b/drivers/net/mdio/fwnode_mdio.c index b156493d70841..aea0f03575689 100644 --- a/drivers/net/mdio/fwnode_mdio.c +++ b/drivers/net/mdio/fwnode_mdio.c @@ -40,6 +40,7 @@ fwnode_find_pse_control(struct fwnode_handle *fwnode) static struct mii_timestamper * fwnode_find_mii_timestamper(struct fwnode_handle *fwnode) { + struct mii_timestamper *mii_ts; struct of_phandle_args arg; int err; @@ -53,10 +54,16 @@ fwnode_find_mii_timestamper(struct fwnode_handle *fwnode) else if (err) return ERR_PTR(err); - if (arg.args_count != 1) - return ERR_PTR(-EINVAL); + if (arg.args_count != 1) { + mii_ts = ERR_PTR(-EINVAL); + goto put_node; + } + + mii_ts = register_mii_timestamper(arg.np, arg.args[0]); - return register_mii_timestamper(arg.np, arg.args[0]); +put_node: + of_node_put(arg.np); + return mii_ts; } int fwnode_mdiobus_phy_device_register(struct mii_bus *mdio, From ce1219c3f76bb131d095e90521506d3c6ccfa086 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Wed, 18 Dec 2024 11:53:01 +0800 Subject: [PATCH 0772/1450] net: mctp: handle skb cleanup on sock_queue failures Currently, we don't use the return value from sock_queue_rcv_skb, which means we may leak skbs if a message is not successfully queued to a socket. Instead, ensure that we're freeing the skb where the sock hasn't otherwise taken ownership of the skb by adding checks on the sock_queue_rcv_skb() to invoke a kfree on failure. In doing so, rather than using the 'rc' value to trigger the kfree_skb(), use the skb pointer itself, which is more explicit. Also, add a kunit test for the sock delivery failure cases. Fixes: 4a992bbd3650 ("mctp: Implement message fragmentation & reassembly") Cc: stable@vger.kernel.org Signed-off-by: Jeremy Kerr Link: https://patch.msgid.link/20241218-mctp-next-v2-1-1c1729645eaa@codeconstruct.com.au Signed-off-by: Paolo Abeni --- net/mctp/route.c | 36 +++++++++++----- net/mctp/test/route-test.c | 86 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+), 10 deletions(-) diff --git a/net/mctp/route.c b/net/mctp/route.c index 597e9cf5aa644..3f2bd65ff5e3c 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -374,8 +374,13 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) msk = NULL; rc = -EINVAL; - /* we may be receiving a locally-routed packet; drop source sk - * accounting + /* We may be receiving a locally-routed packet; drop source sk + * accounting. + * + * From here, we will either queue the skb - either to a frag_queue, or + * to a receiving socket. When that succeeds, we clear the skb pointer; + * a non-NULL skb on exit will be otherwise unowned, and hence + * kfree_skb()-ed. */ skb_orphan(skb); @@ -434,7 +439,9 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) * pending key. */ if (flags & MCTP_HDR_FLAG_EOM) { - sock_queue_rcv_skb(&msk->sk, skb); + rc = sock_queue_rcv_skb(&msk->sk, skb); + if (!rc) + skb = NULL; if (key) { /* we've hit a pending reassembly; not much we * can do but drop it @@ -443,7 +450,6 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) MCTP_TRACE_KEY_REPLIED); key = NULL; } - rc = 0; goto out_unlock; } @@ -470,8 +476,10 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) * this function. */ rc = mctp_key_add(key, msk); - if (!rc) + if (!rc) { trace_mctp_key_acquire(key); + skb = NULL; + } /* we don't need to release key->lock on exit, so * clean up here and suppress the unlock via @@ -489,6 +497,8 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) key = NULL; } else { rc = mctp_frag_queue(key, skb); + if (!rc) + skb = NULL; } } @@ -503,12 +513,19 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) else rc = mctp_frag_queue(key, skb); + if (rc) + goto out_unlock; + + /* we've queued; the queue owns the skb now */ + skb = NULL; + /* end of message? deliver to socket, and we're done with * the reassembly/response key */ - if (!rc && flags & MCTP_HDR_FLAG_EOM) { - sock_queue_rcv_skb(key->sk, key->reasm_head); - key->reasm_head = NULL; + if (flags & MCTP_HDR_FLAG_EOM) { + rc = sock_queue_rcv_skb(key->sk, key->reasm_head); + if (!rc) + key->reasm_head = NULL; __mctp_key_done_in(key, net, f, MCTP_TRACE_KEY_REPLIED); key = NULL; } @@ -527,8 +544,7 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) if (any_key) mctp_key_unref(any_key); out: - if (rc) - kfree_skb(skb); + kfree_skb(skb); return rc; } diff --git a/net/mctp/test/route-test.c b/net/mctp/test/route-test.c index 8551dab1d1e69..17165b86ce22d 100644 --- a/net/mctp/test/route-test.c +++ b/net/mctp/test/route-test.c @@ -837,6 +837,90 @@ static void mctp_test_route_input_multiple_nets_key(struct kunit *test) mctp_test_route_input_multiple_nets_key_fini(test, &t2); } +/* Input route to socket, using a single-packet message, where sock delivery + * fails. Ensure we're handling the failure appropriately. + */ +static void mctp_test_route_input_sk_fail_single(struct kunit *test) +{ + const struct mctp_hdr hdr = RX_HDR(1, 10, 8, FL_S | FL_E | FL_TO); + struct mctp_test_route *rt; + struct mctp_test_dev *dev; + struct socket *sock; + struct sk_buff *skb; + int rc; + + __mctp_route_test_init(test, &dev, &rt, &sock, MCTP_NET_ANY); + + /* No rcvbuf space, so delivery should fail. __sock_set_rcvbuf will + * clamp the minimum to SOCK_MIN_RCVBUF, so we open-code this. + */ + lock_sock(sock->sk); + WRITE_ONCE(sock->sk->sk_rcvbuf, 0); + release_sock(sock->sk); + + skb = mctp_test_create_skb(&hdr, 10); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb); + skb_get(skb); + + mctp_test_skb_set_dev(skb, dev); + + /* do route input, which should fail */ + rc = mctp_route_input(&rt->rt, skb); + KUNIT_EXPECT_NE(test, rc, 0); + + /* we should hold the only reference to skb */ + KUNIT_EXPECT_EQ(test, refcount_read(&skb->users), 1); + kfree_skb(skb); + + __mctp_route_test_fini(test, dev, rt, sock); +} + +/* Input route to socket, using a fragmented message, where sock delivery fails. + */ +static void mctp_test_route_input_sk_fail_frag(struct kunit *test) +{ + const struct mctp_hdr hdrs[2] = { RX_FRAG(FL_S, 0), RX_FRAG(FL_E, 1) }; + struct mctp_test_route *rt; + struct mctp_test_dev *dev; + struct sk_buff *skbs[2]; + struct socket *sock; + unsigned int i; + int rc; + + __mctp_route_test_init(test, &dev, &rt, &sock, MCTP_NET_ANY); + + lock_sock(sock->sk); + WRITE_ONCE(sock->sk->sk_rcvbuf, 0); + release_sock(sock->sk); + + for (i = 0; i < ARRAY_SIZE(skbs); i++) { + skbs[i] = mctp_test_create_skb(&hdrs[i], 10); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skbs[i]); + skb_get(skbs[i]); + + mctp_test_skb_set_dev(skbs[i], dev); + } + + /* first route input should succeed, we're only queueing to the + * frag list + */ + rc = mctp_route_input(&rt->rt, skbs[0]); + KUNIT_EXPECT_EQ(test, rc, 0); + + /* final route input should fail to deliver to the socket */ + rc = mctp_route_input(&rt->rt, skbs[1]); + KUNIT_EXPECT_NE(test, rc, 0); + + /* we should hold the only reference to both skbs */ + KUNIT_EXPECT_EQ(test, refcount_read(&skbs[0]->users), 1); + kfree_skb(skbs[0]); + + KUNIT_EXPECT_EQ(test, refcount_read(&skbs[1]->users), 1); + kfree_skb(skbs[1]); + + __mctp_route_test_fini(test, dev, rt, sock); +} + #if IS_ENABLED(CONFIG_MCTP_FLOWS) static void mctp_test_flow_init(struct kunit *test, @@ -1053,6 +1137,8 @@ static struct kunit_case mctp_test_cases[] = { mctp_route_input_sk_reasm_gen_params), KUNIT_CASE_PARAM(mctp_test_route_input_sk_keys, mctp_route_input_sk_keys_gen_params), + KUNIT_CASE(mctp_test_route_input_sk_fail_single), + KUNIT_CASE(mctp_test_route_input_sk_fail_frag), KUNIT_CASE(mctp_test_route_input_multiple_nets_bind), KUNIT_CASE(mctp_test_route_input_multiple_nets_key), KUNIT_CASE(mctp_test_packet_flow), From 32c9c06adb5b157ef259233775a063a43746d699 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Thu, 19 Dec 2024 18:53:02 +0800 Subject: [PATCH 0773/1450] ASoC: mediatek: disable buffer pre-allocation On Chromebooks based on Mediatek MT8195 or MT8188, the audio frontend (AFE) is limited to accessing a very small window (1 MiB) of memory, which is described as a reserved memory region in the device tree. On these two platforms, the maximum buffer size is given as 512 KiB. The MediaTek common code uses the same value for preallocations. This means that only the first two PCM substreams get preallocations, and then the whole space is exhausted, barring any other substreams from working. Since the substreams used are not always the first two, this means audio won't work correctly. This is observed on the MT8188 Geralt Chromebooks, on which the "mediatek,dai-link" property was dropped when it was upstreamed. That property causes the driver to only register the PCM substreams listed in the property, and in the order given. Instead of trying to compute an optimal value and figuring out which streams are used, simply disable preallocation. The PCM buffers are managed by the core and are allocated and released on the fly. There should be no impact to any of the other MediaTek platforms. Signed-off-by: Chen-Yu Tsai Reviewed-by: AngeloGioacchino Del Regno Link: https://patch.msgid.link/20241219105303.548437-1-wenst@chromium.org Signed-off-by: Mark Brown --- sound/soc/mediatek/common/mtk-afe-platform-driver.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/mediatek/common/mtk-afe-platform-driver.c b/sound/soc/mediatek/common/mtk-afe-platform-driver.c index 9b72b2a7ae917..6b63305839414 100644 --- a/sound/soc/mediatek/common/mtk-afe-platform-driver.c +++ b/sound/soc/mediatek/common/mtk-afe-platform-driver.c @@ -120,8 +120,8 @@ int mtk_afe_pcm_new(struct snd_soc_component *component, struct mtk_base_afe *afe = snd_soc_component_get_drvdata(component); size = afe->mtk_afe_hardware->buffer_bytes_max; - snd_pcm_set_managed_buffer_all(pcm, SNDRV_DMA_TYPE_DEV, - afe->dev, size, size); + snd_pcm_set_managed_buffer_all(pcm, SNDRV_DMA_TYPE_DEV, afe->dev, 0, size); + return 0; } EXPORT_SYMBOL_GPL(mtk_afe_pcm_new); From 13221496065fa12fac4f8a8e725444679ffddb78 Mon Sep 17 00:00:00 2001 From: Ahmad Fatoum Date: Wed, 18 Dec 2024 20:54:53 +0100 Subject: [PATCH 0774/1450] regulator: rename regulator-uv-survival-time-ms according to DT binding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The regulator bindings don't document regulator-uv-survival-time-ms, but the more descriptive regulator-uv-less-critical-window-ms instead. Looking back at v3[1] and v4[2] of the series adding the support, the property was indeed renamed between these patch series, but unfortunately the rename only made it into the DT bindings with the driver code still using the old name. Let's therefore rename the property in the driver code to follow suit. This will break backwards compatibility, but there are no upstream device trees using the property and we never documented the old name of the property anyway. ¯\_(ツ)_/¯" [1]: https://lore.kernel.org/all/20231025084614.3092295-7-o.rempel@pengutronix.de/ [2]: https://lore.kernel.org/all/20231026144824.4065145-5-o.rempel@pengutronix.de/ Signed-off-by: Ahmad Fatoum Link: https://patch.msgid.link/20241218-regulator-uv-survival-time-ms-rename-v1-1-6cac9c3c75da@pengutronix.de Signed-off-by: Mark Brown --- drivers/regulator/of_regulator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/regulator/of_regulator.c b/drivers/regulator/of_regulator.c index 3d85762beda63..e5b4b93c07e3f 100644 --- a/drivers/regulator/of_regulator.c +++ b/drivers/regulator/of_regulator.c @@ -175,7 +175,7 @@ static int of_get_regulation_constraints(struct device *dev, if (!ret) constraints->enable_time = pval; - ret = of_property_read_u32(np, "regulator-uv-survival-time-ms", &pval); + ret = of_property_read_u32(np, "regulator-uv-less-critical-window-ms", &pval); if (!ret) constraints->uv_less_critical_window_ms = pval; else From 40be32303ec829ea12f9883e499bfd3fe9e52baf Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Tue, 17 Dec 2024 15:56:45 +0530 Subject: [PATCH 0775/1450] RDMA/bnxt_re: Fix max_qp_wrs reported While creating qps, driver adds one extra entry to the sq size passed by the ULPs in order to avoid queue full condition. When ULPs creates QPs with max_qp_wr reported, driver creates QP with 1 more than the max_wqes supported by HW. Create QP fails in this case. To avoid this error, reduce 1 entry in max_qp_wqes and report it to the stack. Fixes: 1ac5a4047975 ("RDMA/bnxt_re: Add bnxt_re RoCE driver") Reviewed-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241217102649.1377704-2-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_sp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 7e20ae3d2c4fe..73c9baaebb4e8 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -129,7 +129,7 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, attr->max_qp_init_rd_atom = sb->max_qp_init_rd_atom > BNXT_QPLIB_MAX_OUT_RD_ATOM ? BNXT_QPLIB_MAX_OUT_RD_ATOM : sb->max_qp_init_rd_atom; - attr->max_qp_wqes = le16_to_cpu(sb->max_qp_wr); + attr->max_qp_wqes = le16_to_cpu(sb->max_qp_wr) - 1; /* * 128 WQEs needs to be reserved for the HW (8916). Prevent * reporting the max number From d5a38bf2f35979537c526acbc56bc435ed40685f Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Tue, 17 Dec 2024 15:56:46 +0530 Subject: [PATCH 0776/1450] RDMA/bnxt_re: Disable use of reserved wqes Disabling the reserved wqes logic for Gen P5/P7 devices because this workaround is required only for legacy devices. Fixes: ecb53febfcad ("RDMA/bnxt_en: Enable RDMA driver support for 57500 chip") Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241217102649.1377704-3-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_sp.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 73c9baaebb4e8..776f8f1f1432f 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -130,11 +130,13 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, sb->max_qp_init_rd_atom > BNXT_QPLIB_MAX_OUT_RD_ATOM ? BNXT_QPLIB_MAX_OUT_RD_ATOM : sb->max_qp_init_rd_atom; attr->max_qp_wqes = le16_to_cpu(sb->max_qp_wr) - 1; - /* - * 128 WQEs needs to be reserved for the HW (8916). Prevent - * reporting the max number - */ - attr->max_qp_wqes -= BNXT_QPLIB_RESERVED_QP_WRS + 1; + if (!bnxt_qplib_is_chip_gen_p5_p7(rcfw->res->cctx)) { + /* + * 128 WQEs needs to be reserved for the HW (8916). Prevent + * reporting the max number on legacy devices + */ + attr->max_qp_wqes -= BNXT_QPLIB_RESERVED_QP_WRS + 1; + } attr->max_qp_sges = cctx->modes.wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE ? min_t(u32, sb->max_sge_var_wqe, BNXT_VAR_MAX_SGE) : 6; From d13be54dc18baee7a3e44349b80755a8c8205d3f Mon Sep 17 00:00:00 2001 From: Damodharam Ammepalli Date: Tue, 17 Dec 2024 15:56:47 +0530 Subject: [PATCH 0777/1450] RDMA/bnxt_re: Add send queue size check for variable wqe For the fixed WQE case, HW supports 0xFFFF WQEs. For variable Size WQEs, HW treats this number as the 16 bytes slots. The maximum supported WQEs needs to be adjusted based on the number of slots. Set a maximum WQE limit for variable WQE scenario. Fixes: de1d364c3815 ("RDMA/bnxt_re: Add support for Variable WQE in Genp7 adapters") Reviewed-by: Kalesh AP Signed-off-by: Damodharam Ammepalli Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241217102649.1377704-4-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_sp.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 776f8f1f1432f..9df3e3271577d 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -138,6 +138,10 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, attr->max_qp_wqes -= BNXT_QPLIB_RESERVED_QP_WRS + 1; } + /* Adjust for max_qp_wqes for variable wqe */ + if (cctx->modes.wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE) + attr->max_qp_wqes = BNXT_VAR_MAX_WQE - 1; + attr->max_qp_sges = cctx->modes.wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE ? min_t(u32, sb->max_sge_var_wqe, BNXT_VAR_MAX_SGE) : 6; attr->max_cq = le32_to_cpu(sb->max_cq); From bb839f3ace0fee532a0487b692cc4d868fccb7cf Mon Sep 17 00:00:00 2001 From: Damodharam Ammepalli Date: Tue, 17 Dec 2024 15:56:48 +0530 Subject: [PATCH 0778/1450] RDMA/bnxt_re: Fix MSN table size for variable wqe mode For variable size wqe mode, the MSN table size should be half the size of the SQ depth. Fixing this to avoid wrap around problems in the retransmission path. Fixes: de1d364c3815 ("RDMA/bnxt_re: Add support for Variable WQE in Genp7 adapters") Reviewed-by: Kashyap Desai Reviewed-by: Kalesh AP Signed-off-by: Damodharam Ammepalli Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241217102649.1377704-5-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index d8a2a929bbe32..951ad90f5aa96 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -1033,7 +1033,12 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) : 0; /* Update msn tbl size */ if (qp->is_host_msn_tbl && psn_sz) { - hwq_attr.aux_depth = roundup_pow_of_two(bnxt_qplib_set_sq_size(sq, qp->wqe_mode)); + if (qp->wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) + hwq_attr.aux_depth = + roundup_pow_of_two(bnxt_qplib_set_sq_size(sq, qp->wqe_mode)); + else + hwq_attr.aux_depth = + roundup_pow_of_two(bnxt_qplib_set_sq_size(sq, qp->wqe_mode)) / 2; qp->msn_tbl_sz = hwq_attr.aux_depth; qp->msn = 0; } From 9272cba0ded71b5a2084da3004ec7806b8cb7fd2 Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Tue, 17 Dec 2024 15:56:49 +0530 Subject: [PATCH 0779/1450] RDMA/bnxt_re: Fix the locking while accessing the QP table QP table handling is synchronized with destroy QP and Async event from the HW. The same needs to be synchronized during create_qp also. Use the same lock in create_qp also. Fixes: 76d3ddff7153 ("RDMA/bnxt_re: synchronize the qp-handle table array") Fixes: f218d67ef004 ("RDMA/bnxt_re: Allow posting when QPs are in error") Fixes: 84cf229f4001 ("RDMA/bnxt_re: Fix the qp table indexing") Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20241217102649.1377704-6-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 951ad90f5aa96..5336f74297f81 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -1181,9 +1181,11 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) rq->dbinfo.db = qp->dpi->dbr; rq->dbinfo.max_slot = bnxt_qplib_set_rq_max_slot(rq->wqe_size); } + spin_lock_bh(&rcfw->tbl_lock); tbl_indx = map_qp_id_to_tbl_indx(qp->id, rcfw); rcfw->qp_tbl[tbl_indx].qp_id = qp->id; rcfw->qp_tbl[tbl_indx].qp_handle = (void *)qp; + spin_unlock_bh(&rcfw->tbl_lock); return 0; fail: From 29d44cce324dab2bd86c447071a596262e7109b6 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Thu, 19 Dec 2024 19:15:06 +0800 Subject: [PATCH 0780/1450] selftests/bpf: Use asm constraint "m" for LoongArch Currently, LoongArch LLVM does not support the constraint "o" and no plan to support it, it only supports the similar constraint "m", so change the constraints from "nor" in the "else" case to arch-specific "nmr" to avoid the build error such as "unexpected asm memory constraint" for LoongArch. Fixes: 630301b0d59d ("selftests/bpf: Add basic USDT selftests") Suggested-by: Weining Lu Suggested-by: Li Chen Signed-off-by: Tiezhu Yang Signed-off-by: Daniel Borkmann Reviewed-by: Huacai Chen Cc: stable@vger.kernel.org Link: https://llvm.org/docs/LangRef.html#supported-constraint-code-list Link: https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp#L172 Link: https://lore.kernel.org/bpf/20241219111506.20643-1-yangtiezhu@loongson.cn --- tools/testing/selftests/bpf/sdt.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/bpf/sdt.h b/tools/testing/selftests/bpf/sdt.h index ca0162b4dc575..1fcfa5160231d 100644 --- a/tools/testing/selftests/bpf/sdt.h +++ b/tools/testing/selftests/bpf/sdt.h @@ -102,6 +102,8 @@ # define STAP_SDT_ARG_CONSTRAINT nZr # elif defined __arm__ # define STAP_SDT_ARG_CONSTRAINT g +# elif defined __loongarch__ +# define STAP_SDT_ARG_CONSTRAINT nmr # else # define STAP_SDT_ARG_CONSTRAINT nor # endif From 4b2efb9db0c22a130bbd1275e489b42c02d08050 Mon Sep 17 00:00:00 2001 From: Jacek Lawrynowicz Date: Tue, 10 Dec 2024 14:09:37 +0100 Subject: [PATCH 0781/1450] accel/ivpu: Fix general protection fault in ivpu_bo_list() Check if ctx is not NULL before accessing its fields. Fixes: 37dee2a2f433 ("accel/ivpu: Improve buffer object debug logs") Cc: stable@vger.kernel.org # v6.8 Reviewed-by: Karol Wachowski Reviewed-by: Jeffrey Hugo Signed-off-by: Jacek Lawrynowicz Link: https://patchwork.freedesktop.org/patch/msgid/20241210130939.1575610-2-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_gem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/accel/ivpu/ivpu_gem.c b/drivers/accel/ivpu/ivpu_gem.c index d8e97a760fbc0..16178054e6296 100644 --- a/drivers/accel/ivpu/ivpu_gem.c +++ b/drivers/accel/ivpu/ivpu_gem.c @@ -409,7 +409,7 @@ static void ivpu_bo_print_info(struct ivpu_bo *bo, struct drm_printer *p) mutex_lock(&bo->lock); drm_printf(p, "%-9p %-3u 0x%-12llx %-10lu 0x%-8x %-4u", - bo, bo->ctx->id, bo->vpu_addr, bo->base.base.size, + bo, bo->ctx ? bo->ctx->id : 0, bo->vpu_addr, bo->base.base.size, bo->flags, kref_read(&bo->base.base.refcount)); if (bo->base.pages) From 6c9ba75f147b24b5c59aac7356a38a0fef664afa Mon Sep 17 00:00:00 2001 From: Jacek Lawrynowicz Date: Tue, 10 Dec 2024 14:09:38 +0100 Subject: [PATCH 0782/1450] accel/ivpu: Fix memory leak in ivpu_mmu_reserved_context_init() Add appropriate error handling to ensure all allocated resources are released upon encountering an error. Fixes: a74f4d991352 ("accel/ivpu: Defer MMU root page table allocation") Cc: Karol Wachowski Reviewed-by: Karol Wachowski Reviewed-by: Jeffrey Hugo Signed-off-by: Jacek Lawrynowicz Link: https://patchwork.freedesktop.org/patch/msgid/20241210130939.1575610-3-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_mmu_context.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/accel/ivpu/ivpu_mmu_context.c b/drivers/accel/ivpu/ivpu_mmu_context.c index 891967a95bc3c..0af614dfb6f92 100644 --- a/drivers/accel/ivpu/ivpu_mmu_context.c +++ b/drivers/accel/ivpu/ivpu_mmu_context.c @@ -612,18 +612,22 @@ int ivpu_mmu_reserved_context_init(struct ivpu_device *vdev) if (!ivpu_mmu_ensure_pgd(vdev, &vdev->rctx.pgtable)) { ivpu_err(vdev, "Failed to allocate root page table for reserved context\n"); ret = -ENOMEM; - goto unlock; + goto err_ctx_fini; } ret = ivpu_mmu_cd_set(vdev, vdev->rctx.id, &vdev->rctx.pgtable); if (ret) { ivpu_err(vdev, "Failed to set context descriptor for reserved context\n"); - goto unlock; + goto err_ctx_fini; } -unlock: mutex_unlock(&vdev->rctx.lock); return ret; + +err_ctx_fini: + mutex_unlock(&vdev->rctx.lock); + ivpu_mmu_context_fini(vdev, &vdev->rctx); + return ret; } void ivpu_mmu_reserved_context_fini(struct ivpu_device *vdev) From 0f6482caa6acdfdfc744db7430771fe7e6c4e787 Mon Sep 17 00:00:00 2001 From: Jacek Lawrynowicz Date: Tue, 10 Dec 2024 14:09:39 +0100 Subject: [PATCH 0783/1450] accel/ivpu: Fix WARN in ivpu_ipc_send_receive_internal() Move pm_runtime_set_active() to ivpu_pm_init() so when ivpu_ipc_send_receive_internal() is executed before ivpu_pm_enable() it already has correct runtime state, even if last resume was not successful. Fixes: 8ed520ff4682 ("accel/ivpu: Move set autosuspend delay to HW specific code") Cc: stable@vger.kernel.org # v6.7+ Reviewed-by: Karol Wachowski Reviewed-by: Jeffrey Hugo Signed-off-by: Jacek Lawrynowicz Link: https://patchwork.freedesktop.org/patch/msgid/20241210130939.1575610-4-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_pm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c index dbc0711e28d13..949f4233946c6 100644 --- a/drivers/accel/ivpu/ivpu_pm.c +++ b/drivers/accel/ivpu/ivpu_pm.c @@ -378,6 +378,7 @@ void ivpu_pm_init(struct ivpu_device *vdev) pm_runtime_use_autosuspend(dev); pm_runtime_set_autosuspend_delay(dev, delay); + pm_runtime_set_active(dev); ivpu_dbg(vdev, PM, "Autosuspend delay = %d\n", delay); } @@ -392,7 +393,6 @@ void ivpu_pm_enable(struct ivpu_device *vdev) { struct device *dev = vdev->drm.dev; - pm_runtime_set_active(dev); pm_runtime_allow(dev); pm_runtime_mark_last_busy(dev); pm_runtime_put_autosuspend(dev); From 716f2bca1ce93bb95364f1fc0555c1650507b588 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Wed, 18 Dec 2024 18:57:24 +0100 Subject: [PATCH 0784/1450] selftests/bpf: Fix compilation error in get_uprobe_offset() In get_uprobe_offset(), the call to procmap_query() use the constant PROCMAP_QUERY_VMA_EXECUTABLE, even if PROCMAP_QUERY is not defined. Define PROCMAP_QUERY_VMA_EXECUTABLE when PROCMAP_QUERY isn't. Fixes: 4e9e07603ecd ("selftests/bpf: make use of PROCMAP_QUERY ioctl if available") Signed-off-by: Jerome Marchand Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20241218175724.578884-1-jmarchan@redhat.com --- tools/testing/selftests/bpf/trace_helpers.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index 2d742fdac6b97..81943c6254e6b 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -293,6 +293,10 @@ static int procmap_query(int fd, const void *addr, __u32 query_flags, size_t *st return 0; } #else +# ifndef PROCMAP_QUERY_VMA_EXECUTABLE +# define PROCMAP_QUERY_VMA_EXECUTABLE 0x04 +# endif + static int procmap_query(int fd, const void *addr, __u32 query_flags, size_t *start, size_t *offset, int *flags) { return -EOPNOTSUPP; From 8d90a86ed053226a297ce062f4d9f4f521e05c4c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 12 Dec 2024 20:19:48 -0800 Subject: [PATCH 0785/1450] mmc: sdhci-msm: fix crypto key eviction Commit c7eed31e235c ("mmc: sdhci-msm: Switch to the new ICE API") introduced an incorrect check of the algorithm ID into the key eviction path, and thus qcom_ice_evict_key() is no longer ever called. Fix it. Fixes: c7eed31e235c ("mmc: sdhci-msm: Switch to the new ICE API") Cc: stable@vger.kernel.org Cc: Abel Vesa Signed-off-by: Eric Biggers Message-ID: <20241213041958.202565-6-ebiggers@kernel.org> Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-msm.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/mmc/host/sdhci-msm.c b/drivers/mmc/host/sdhci-msm.c index e00208535bd1c..319f0ebbe652d 100644 --- a/drivers/mmc/host/sdhci-msm.c +++ b/drivers/mmc/host/sdhci-msm.c @@ -1867,20 +1867,20 @@ static int sdhci_msm_program_key(struct cqhci_host *cq_host, struct sdhci_msm_host *msm_host = sdhci_pltfm_priv(pltfm_host); union cqhci_crypto_cap_entry cap; + if (!(cfg->config_enable & CQHCI_CRYPTO_CONFIGURATION_ENABLE)) + return qcom_ice_evict_key(msm_host->ice, slot); + /* Only AES-256-XTS has been tested so far. */ cap = cq_host->crypto_cap_array[cfg->crypto_cap_idx]; if (cap.algorithm_id != CQHCI_CRYPTO_ALG_AES_XTS || cap.key_size != CQHCI_CRYPTO_KEY_SIZE_256) return -EINVAL; - if (cfg->config_enable & CQHCI_CRYPTO_CONFIGURATION_ENABLE) - return qcom_ice_program_key(msm_host->ice, - QCOM_ICE_CRYPTO_ALG_AES_XTS, - QCOM_ICE_CRYPTO_KEY_SIZE_256, - cfg->crypto_key, - cfg->data_unit_size, slot); - else - return qcom_ice_evict_key(msm_host->ice, slot); + return qcom_ice_program_key(msm_host->ice, + QCOM_ICE_CRYPTO_ALG_AES_XTS, + QCOM_ICE_CRYPTO_KEY_SIZE_256, + cfg->crypto_key, + cfg->data_unit_size, slot); } #else /* CONFIG_MMC_CRYPTO */ From 974e3fe0ac61de85015bbe5a4990cf4127b304b2 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 19 Dec 2024 12:53:01 +0100 Subject: [PATCH 0786/1450] fs: relax assertions on failure to encode file handles Encoding file handles is usually performed by a filesystem >encode_fh() method that may fail for various reasons. The legacy users of exportfs_encode_fh(), namely, nfsd and name_to_handle_at(2) syscall are ready to cope with the possibility of failure to encode a file handle. There are a few other users of exportfs_encode_{fh,fid}() that currently have a WARN_ON() assertion when ->encode_fh() fails. Relax those assertions because they are wrong. The second linked bug report states commit 16aac5ad1fa9 ("ovl: support encoding non-decodable file handles") in v6.6 as the regressing commit, but this is not accurate. The aforementioned commit only increases the chances of the assertion and allows triggering the assertion with the reproducer using overlayfs, inotify and drop_caches. Triggering this assertion was always possible with other filesystems and other reasons of ->encode_fh() failures and more particularly, it was also possible with the exact same reproducer using overlayfs that is mounted with options index=on,nfs_export=on also on kernels < v6.6. Therefore, I am not listing the aforementioned commit as a Fixes commit. Backport hint: this patch will have a trivial conflict applying to v6.6.y, and other trivial conflicts applying to stable kernels < v6.6. Reported-by: syzbot+ec07f6f5ce62b858579f@syzkaller.appspotmail.com Tested-by: syzbot+ec07f6f5ce62b858579f@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-unionfs/671fd40c.050a0220.4735a.024f.GAE@google.com/ Reported-by: Dmitry Safonov Closes: https://lore.kernel.org/linux-fsdevel/CAGrbwDTLt6drB9eaUagnQVgdPBmhLfqqxAf3F+Juqy_o6oP8uw@mail.gmail.com/ Cc: stable@vger.kernel.org Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20241219115301.465396-1-amir73il@gmail.com Signed-off-by: Christian Brauner --- fs/notify/fdinfo.c | 4 +--- fs/overlayfs/copy_up.c | 5 ++--- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index dec553034027e..e933f9c65d904 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -47,10 +47,8 @@ static void show_mark_fhandle(struct seq_file *m, struct inode *inode) size = f->handle_bytes >> 2; ret = exportfs_encode_fid(inode, (struct fid *)f->f_handle, &size); - if ((ret == FILEID_INVALID) || (ret < 0)) { - WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret); + if ((ret == FILEID_INVALID) || (ret < 0)) return; - } f->handle_type = ret; f->handle_bytes = size * sizeof(u32); diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 3601ddfeddc2e..56eee9f23ea9a 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -442,9 +442,8 @@ struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real, buflen = (dwords << 2); err = -EIO; - if (WARN_ON(fh_type < 0) || - WARN_ON(buflen > MAX_HANDLE_SZ) || - WARN_ON(fh_type == FILEID_INVALID)) + if (fh_type < 0 || fh_type == FILEID_INVALID || + WARN_ON(buflen > MAX_HANDLE_SZ)) goto out_err; fh->fb.version = OVL_FH_VERSION; From 469c0682e03d67d8dc970ecaa70c2d753057c7c0 Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Sun, 15 Dec 2024 12:01:59 +0900 Subject: [PATCH 0787/1450] pmdomain: imx: gpcv2: fix an OF node reference leak in imx_gpcv2_probe() imx_gpcv2_probe() leaks an OF node reference obtained by of_get_child_by_name(). Fix it by declaring the device node with the __free(device_node) cleanup construct. This bug was found by an experimental static analysis tool that I am developing. Fixes: 03aa12629fc4 ("soc: imx: Add GPCv2 power gating driver") Signed-off-by: Joe Hattori Cc: stable@vger.kernel.org Message-ID: <20241215030159.1526624-1-joe@pf.is.s.u-tokyo.ac.jp> Signed-off-by: Ulf Hansson --- drivers/pmdomain/imx/gpcv2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pmdomain/imx/gpcv2.c b/drivers/pmdomain/imx/gpcv2.c index e67ecf99ef84f..9bdb80fd72108 100644 --- a/drivers/pmdomain/imx/gpcv2.c +++ b/drivers/pmdomain/imx/gpcv2.c @@ -1458,12 +1458,12 @@ static int imx_gpcv2_probe(struct platform_device *pdev) .max_register = SZ_4K, }; struct device *dev = &pdev->dev; - struct device_node *pgc_np; + struct device_node *pgc_np __free(device_node) = + of_get_child_by_name(dev->of_node, "pgc"); struct regmap *regmap; void __iomem *base; int ret; - pgc_np = of_get_child_by_name(dev->of_node, "pgc"); if (!pgc_np) { dev_err(dev, "No power domains specified in DT\n"); return -EINVAL; From f64f610ec6ab59dd0391b03842cea3a4cd8ee34f Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Wed, 18 Dec 2024 19:44:33 +0100 Subject: [PATCH 0788/1450] pmdomain: core: add dummy release function to genpd device The genpd device, which is really only used as a handle to lookup OPP, but not even registered to the device core otherwise and thus lifetime linked to the genpd struct it is contained in, is missing a release function. After b8f7bbd1f4ec ("pmdomain: core: Add missing put_device()") the device will be cleaned up going through the driver core device_release() function, which will warn when no release callback is present for the device. Add a dummy release function to shut up the warning. Signed-off-by: Lucas Stach Tested-by: Luca Ceresoli Fixes: b8f7bbd1f4ec ("pmdomain: core: Add missing put_device()") Cc: stable@vger.kernel.org Message-ID: <20241218184433.1930532-1-l.stach@pengutronix.de> Signed-off-by: Ulf Hansson --- drivers/pmdomain/core.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/pmdomain/core.c b/drivers/pmdomain/core.c index bb11f467dc78c..20a9efebbcb76 100644 --- a/drivers/pmdomain/core.c +++ b/drivers/pmdomain/core.c @@ -2142,6 +2142,11 @@ static int genpd_set_default_power_state(struct generic_pm_domain *genpd) return 0; } +static void genpd_provider_release(struct device *dev) +{ + /* nothing to be done here */ +} + static int genpd_alloc_data(struct generic_pm_domain *genpd) { struct genpd_governor_data *gd = NULL; @@ -2173,6 +2178,7 @@ static int genpd_alloc_data(struct generic_pm_domain *genpd) genpd->gd = gd; device_initialize(&genpd->dev); + genpd->dev.release = genpd_provider_release; if (!genpd_is_dev_name_fw(genpd)) { dev_set_name(&genpd->dev, "%s", genpd->name); From d1d761b3012e99d55d288d435384be606302cb2c Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 16 Dec 2024 19:11:53 +0200 Subject: [PATCH 0789/1450] net: fib_rules: Add flow label selector attributes Add new FIB rule attributes which will allow user space to match on the IPv6 flow label with a mask. Temporarily set the type of the attributes to 'NLA_REJECT' while support is being added in the IPv6 code. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: Guillaume Nault Signed-off-by: Paolo Abeni --- include/uapi/linux/fib_rules.h | 2 ++ net/core/fib_rules.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h index a6924dd3aff17..00e9890ca3c05 100644 --- a/include/uapi/linux/fib_rules.h +++ b/include/uapi/linux/fib_rules.h @@ -68,6 +68,8 @@ enum { FRA_SPORT_RANGE, /* sport */ FRA_DPORT_RANGE, /* dport */ FRA_DSCP, /* dscp */ + FRA_FLOWLABEL, /* flowlabel */ + FRA_FLOWLABEL_MASK, /* flowlabel mask */ __FRA_MAX }; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 34185d138c95a..153b14aade42e 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -770,6 +770,8 @@ static const struct nla_policy fib_rule_policy[FRA_MAX + 1] = { [FRA_SPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }, [FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }, [FRA_DSCP] = NLA_POLICY_MAX(NLA_U8, INET_DSCP_MASK >> 2), + [FRA_FLOWLABEL] = { .type = NLA_REJECT }, + [FRA_FLOWLABEL_MASK] = { .type = NLA_REJECT }, }; int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, From f0c898d8c279e6cfdf5e25dc04424d518dec1aa4 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 16 Dec 2024 19:11:54 +0200 Subject: [PATCH 0790/1450] ipv4: fib_rules: Reject flow label attributes IPv4 FIB rules cannot match on flow label so reject requests that try to add such rules. Do that in the IPv4 configure callback as the netlink policy resides in the core and used by both IPv4 and IPv6. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: Guillaume Nault Signed-off-by: Paolo Abeni --- net/ipv4/fib_rules.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 8325224ef0723..9517b8667e000 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -249,6 +249,12 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, int err = -EINVAL; struct fib4_rule *rule4 = (struct fib4_rule *) rule; + if (tb[FRA_FLOWLABEL] || tb[FRA_FLOWLABEL_MASK]) { + NL_SET_ERR_MSG(extack, + "Flow label cannot be specified for IPv4 FIB rules"); + goto errout; + } + if (!inet_validate_dscp(frh->tos)) { NL_SET_ERR_MSG(extack, "Invalid dsfield (tos): ECN bits must be 0"); From 9aa77531a1314dd46d3694eac5dc469a6690fca7 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 16 Dec 2024 19:11:55 +0200 Subject: [PATCH 0791/1450] ipv6: fib_rules: Add flow label support Implement support for the new flow label selector which allows IPv6 FIB rules to match on the flow label with a mask. Ensure that both flow label attributes are specified (or none) and that the mask is valid. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: Guillaume Nault Signed-off-by: Paolo Abeni --- net/ipv6/fib6_rules.c | 57 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index c85c1627cb16e..67d39114d9a63 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -26,6 +26,8 @@ struct fib6_rule { struct fib_rule common; struct rt6key src; struct rt6key dst; + __be32 flowlabel; + __be32 flowlabel_mask; dscp_t dscp; u8 dscp_full:1; /* DSCP or TOS selector */ }; @@ -34,7 +36,7 @@ static bool fib6_rule_matchall(const struct fib_rule *rule) { struct fib6_rule *r = container_of(rule, struct fib6_rule, common); - if (r->dst.plen || r->src.plen || r->dscp) + if (r->dst.plen || r->src.plen || r->dscp || r->flowlabel_mask) return false; return fib_rule_matchall(rule); } @@ -332,6 +334,9 @@ INDIRECT_CALLABLE_SCOPE int fib6_rule_match(struct fib_rule *rule, if (r->dscp && r->dscp != ip6_dscp(fl6->flowlabel)) return 0; + if ((r->flowlabel ^ flowi6_get_flowlabel(fl6)) & r->flowlabel_mask) + return 0; + if (rule->ip_proto && (rule->ip_proto != fl6->flowi6_proto)) return 0; @@ -360,6 +365,35 @@ static int fib6_nl2rule_dscp(const struct nlattr *nla, struct fib6_rule *rule6, return 0; } +static int fib6_nl2rule_flowlabel(struct nlattr **tb, struct fib6_rule *rule6, + struct netlink_ext_ack *extack) +{ + __be32 flowlabel, flowlabel_mask; + + if (NL_REQ_ATTR_CHECK(extack, NULL, tb, FRA_FLOWLABEL) || + NL_REQ_ATTR_CHECK(extack, NULL, tb, FRA_FLOWLABEL_MASK)) + return -EINVAL; + + flowlabel = nla_get_be32(tb[FRA_FLOWLABEL]); + flowlabel_mask = nla_get_be32(tb[FRA_FLOWLABEL_MASK]); + + if (flowlabel_mask & ~IPV6_FLOWLABEL_MASK) { + NL_SET_ERR_MSG_ATTR(extack, tb[FRA_FLOWLABEL_MASK], + "Invalid flow label mask"); + return -EINVAL; + } + + if (flowlabel & ~flowlabel_mask) { + NL_SET_ERR_MSG(extack, "Flow label and mask do not match"); + return -EINVAL; + } + + rule6->flowlabel = flowlabel; + rule6->flowlabel_mask = flowlabel_mask; + + return 0; +} + static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, struct nlattr **tb, @@ -379,6 +413,10 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, if (tb[FRA_DSCP] && fib6_nl2rule_dscp(tb[FRA_DSCP], rule6, extack) < 0) goto errout; + if ((tb[FRA_FLOWLABEL] || tb[FRA_FLOWLABEL_MASK]) && + fib6_nl2rule_flowlabel(tb, rule6, extack) < 0) + goto errout; + if (rule->action == FR_ACT_TO_TBL && !rule->l3mdev) { if (rule->table == RT6_TABLE_UNSPEC) { NL_SET_ERR_MSG(extack, "Invalid table"); @@ -444,6 +482,14 @@ static int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, return 0; } + if (tb[FRA_FLOWLABEL] && + nla_get_be32(tb[FRA_FLOWLABEL]) != rule6->flowlabel) + return 0; + + if (tb[FRA_FLOWLABEL_MASK] && + nla_get_be32(tb[FRA_FLOWLABEL_MASK]) != rule6->flowlabel_mask) + return 0; + if (frh->src_len && nla_memcmp(tb[FRA_SRC], &rule6->src.addr, sizeof(struct in6_addr))) return 0; @@ -472,6 +518,11 @@ static int fib6_rule_fill(struct fib_rule *rule, struct sk_buff *skb, frh->tos = inet_dscp_to_dsfield(rule6->dscp); } + if (rule6->flowlabel_mask && + (nla_put_be32(skb, FRA_FLOWLABEL, rule6->flowlabel) || + nla_put_be32(skb, FRA_FLOWLABEL_MASK, rule6->flowlabel_mask))) + goto nla_put_failure; + if ((rule6->dst.plen && nla_put_in6_addr(skb, FRA_DST, &rule6->dst.addr)) || (rule6->src.plen && @@ -487,7 +538,9 @@ static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule) { return nla_total_size(16) /* dst */ + nla_total_size(16) /* src */ - + nla_total_size(1); /* dscp */ + + nla_total_size(1) /* dscp */ + + nla_total_size(4) /* flowlabel */ + + nla_total_size(4); /* flowlabel mask */ } static void fib6_rule_flush_cache(struct fib_rules_ops *ops) From 4c25f3f0519486382644c76ee11b127026095c61 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 16 Dec 2024 19:11:56 +0200 Subject: [PATCH 0792/1450] net: fib_rules: Enable flow label selector usage Now that both IPv4 and IPv6 correctly handle the new flow label attributes, enable user space to configure FIB rules that make use of the flow label by changing the policy to stop rejecting them and accepting 32 bit values in big-endian byte order. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: Guillaume Nault Signed-off-by: Paolo Abeni --- net/core/fib_rules.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 153b14aade42e..e684ba3ebb385 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -770,8 +770,8 @@ static const struct nla_policy fib_rule_policy[FRA_MAX + 1] = { [FRA_SPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }, [FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }, [FRA_DSCP] = NLA_POLICY_MAX(NLA_U8, INET_DSCP_MASK >> 2), - [FRA_FLOWLABEL] = { .type = NLA_REJECT }, - [FRA_FLOWLABEL_MASK] = { .type = NLA_REJECT }, + [FRA_FLOWLABEL] = { .type = NLA_BE32 }, + [FRA_FLOWLABEL_MASK] = { .type = NLA_BE32 }, }; int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, From c72004aac60a9ffdf4bc29b1e7ff0798a7eab3c2 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 16 Dec 2024 19:11:57 +0200 Subject: [PATCH 0793/1450] netlink: specs: Add FIB rule flow label attributes Add the new flow label attributes to the spec. Example: # ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/rt_rule.yaml \ --do newrule \ --json '{"family": 10, "flowlabel": 1, "flowlabel-mask": 1, "action": 1, "table": 1}' None $ ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/rt_rule.yaml \ --dump getrule --json '{"family": 10}' --output-json \ | jq '.[] | select(.flowlabel == "0x1")' { "table": 1, "suppress-prefixlen": "0xffffffff", "protocol": 0, "priority": 32765, "flowlabel": "0x1", "flowlabel-mask": "0x1", "family": 10, "dst-len": 0, "src-len": 0, "tos": 0, "action": "to-tbl", "flags": 0 } Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: Guillaume Nault Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/rt_rule.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Documentation/netlink/specs/rt_rule.yaml b/Documentation/netlink/specs/rt_rule.yaml index 03a8eef7952e8..a9debac3058a0 100644 --- a/Documentation/netlink/specs/rt_rule.yaml +++ b/Documentation/netlink/specs/rt_rule.yaml @@ -172,6 +172,16 @@ attribute-sets: - name: dscp type: u8 + - + name: flowlabel + type: u32 + byte-order: big-endian + display-hint: hex + - + name: flowlabel-mask + type: u32 + byte-order: big-endian + display-hint: hex operations: enum-model: directional @@ -203,6 +213,8 @@ operations: - sport-range - dport-range - dscp + - flowlabel + - flowlabel-mask - name: newrule-ntf doc: Notify a rule creation From ba4138032ae3b5b8e2b68d2f2647cdc0817b05a6 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 16 Dec 2024 19:11:58 +0200 Subject: [PATCH 0794/1450] ipv6: Add flow label to route get requests The default IPv6 multipath hash policy takes the flow label into account when calculating a multipath hash and previous patches added a flow label selector to IPv6 FIB rules. Allow user space to specify a flow label in route get requests by adding a new netlink attribute and using its value to populate the "flowlabel" field in the IPv6 flow info structure prior to a route lookup. Deny the attribute in RTM_{NEW,DEL}ROUTE requests by checking for it in rtm_to_fib6_config() and returning an error if present. A subsequent patch will use this capability to test the new flow label selector in IPv6 FIB rules. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: Guillaume Nault Signed-off-by: Paolo Abeni --- include/uapi/linux/rtnetlink.h | 1 + net/ipv6/route.c | 20 +++++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index eccc0e7dcb7db..5ee94c511a28a 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -393,6 +393,7 @@ enum rtattr_type_t { RTA_SPORT, RTA_DPORT, RTA_NH_ID, + RTA_FLOWLABEL, __RTA_MAX }; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 67ff16c047180..78362822b9070 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5005,6 +5005,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_SPORT] = { .type = NLA_U16 }, [RTA_DPORT] = { .type = NLA_U16 }, [RTA_NH_ID] = { .type = NLA_U32 }, + [RTA_FLOWLABEL] = { .type = NLA_BE32 }, }; static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -5030,6 +5031,12 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; } + if (tb[RTA_FLOWLABEL]) { + NL_SET_ERR_MSG_ATTR(extack, tb[RTA_FLOWLABEL], + "Flow label cannot be specified for this operation"); + goto errout; + } + *cfg = (struct fib6_config){ .fc_table = rtm->rtm_table, .fc_dst_len = rtm->rtm_dst_len, @@ -6013,6 +6020,13 @@ static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, return -EINVAL; } + if (tb[RTA_FLOWLABEL] && + (nla_get_be32(tb[RTA_FLOWLABEL]) & ~IPV6_FLOWLABEL_MASK)) { + NL_SET_ERR_MSG_ATTR(extack, tb[RTA_FLOWLABEL], + "Invalid flow label"); + return -EINVAL; + } + for (i = 0; i <= RTA_MAX; i++) { if (!tb[i]) continue; @@ -6027,6 +6041,7 @@ static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, case RTA_SPORT: case RTA_DPORT: case RTA_IP_PROTO: + case RTA_FLOWLABEL: break; default: NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request"); @@ -6049,6 +6064,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct sk_buff *skb; struct rtmsg *rtm; struct flowi6 fl6 = {}; + __be32 flowlabel; bool fibmatch; err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack); @@ -6057,7 +6073,6 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, err = -EINVAL; rtm = nlmsg_data(nlh); - fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); if (tb[RTA_SRC]) { @@ -6103,6 +6118,9 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, goto errout; } + flowlabel = nla_get_be32_default(tb[RTA_FLOWLABEL], 0); + fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, flowlabel); + if (iif) { struct net_device *dev; int flags = 0; From d26b8267d9e02b02c8d1aeb38d7730b5efab3b64 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 16 Dec 2024 19:11:59 +0200 Subject: [PATCH 0795/1450] netlink: specs: Add route flow label attribute Add the new flow label attribute to the spec. Example: # ip link add name dummy1 up type dummy # ip -6 route add default table 254 dev dummy1 # ip -6 route add default table 10 dev dummy1 # ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/rt_rule.yaml \ --do newrule \ --json '{"family": 10, "priority": 1, "flowlabel": 10, "flowlabel-mask": 255, "action": 1, "table": 10}' None $ ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/rt_route.yaml \ --do getroute \ --json '{"rtm-family": 10, "rta-flowlabel": 1}' --output-json \ | jq '.["rta-table"]' 254 $ ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/rt_route.yaml \ --do getroute \ --json '{"rtm-family": 10, "rta-flowlabel": 10}' --output-json \ | jq '.["rta-table"]' 10 Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: Guillaume Nault Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/rt_route.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/netlink/specs/rt_route.yaml b/Documentation/netlink/specs/rt_route.yaml index f4368be0caed9..a674103e5bc4e 100644 --- a/Documentation/netlink/specs/rt_route.yaml +++ b/Documentation/netlink/specs/rt_route.yaml @@ -177,6 +177,11 @@ attribute-sets: - name: rta-nh-id type: u32 + - + name: rta-flowlabel + type: u32 + byte-order: big-endian + display-hint: hex - name: rta-metrics attributes: @@ -260,6 +265,7 @@ operations: - rta-dport - rta-mark - rta-uid + - rta-flowlabel reply: value: 24 attributes: &all-route-attrs @@ -299,6 +305,7 @@ operations: - rta-sport - rta-dport - rta-nh-id + - rta-flowlabel dump: request: value: 26 From 002bf68a3b3e5f90ce61ea8fd11b8b62fd0765ce Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 16 Dec 2024 19:12:00 +0200 Subject: [PATCH 0796/1450] tracing: ipv6: Add flow label to fib6_table_lookup tracepoint The different parameters affecting the IPv6 route lookup are printed to the trace buffer by the fib6_table_lookup tracepoint. Add the IPv6 flow label for better observability as it can affect the route lookup both in terms of multipath hash calculation and policy based routing (FIB rules). Example: # echo 1 > /sys/kernel/tracing/events/fib6/fib6_table_lookup/enable # ip -6 route get ::1 flowlabel 0x12345 ipproto udp sport 12345 dport 54321 &> /dev/null # cat /sys/kernel/tracing/trace_pipe ip-358 [010] ..... 44.897484: fib6_table_lookup: table 255 oif 0 iif 1 proto 17 ::/12345 -> ::1/54321 flowlabel 0x12345 tos 0 scope 0 flags 0 ==> dev lo gw :: err 0 Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: Guillaume Nault Signed-off-by: Paolo Abeni --- include/trace/events/fib6.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/trace/events/fib6.h b/include/trace/events/fib6.h index 5d7ee26107280..8d22b2e98d481 100644 --- a/include/trace/events/fib6.h +++ b/include/trace/events/fib6.h @@ -22,6 +22,7 @@ TRACE_EVENT(fib6_table_lookup, __field( int, err ) __field( int, oif ) __field( int, iif ) + __field( u32, flowlabel ) __field( __u8, tos ) __field( __u8, scope ) __field( __u8, flags ) @@ -42,6 +43,7 @@ TRACE_EVENT(fib6_table_lookup, __entry->err = ip6_rt_type_to_error(res->fib6_type); __entry->oif = flp->flowi6_oif; __entry->iif = flp->flowi6_iif; + __entry->flowlabel = ntohl(flowi6_get_flowlabel(flp)); __entry->tos = ip6_tclass(flp->flowlabel); __entry->scope = flp->flowi6_scope; __entry->flags = flp->flowi6_flags; @@ -76,11 +78,11 @@ TRACE_EVENT(fib6_table_lookup, } ), - TP_printk("table %3u oif %d iif %d proto %u %pI6c/%u -> %pI6c/%u tos %d scope %d flags %x ==> dev %s gw %pI6c err %d", + TP_printk("table %3u oif %d iif %d proto %u %pI6c/%u -> %pI6c/%u flowlabel %#x tos %d scope %d flags %x ==> dev %s gw %pI6c err %d", __entry->tb_id, __entry->oif, __entry->iif, __entry->proto, __entry->src, __entry->sport, __entry->dst, __entry->dport, - __entry->tos, __entry->scope, __entry->flags, - __entry->name, __entry->gw, __entry->err) + __entry->flowlabel, __entry->tos, __entry->scope, + __entry->flags, __entry->name, __entry->gw, __entry->err) ); #endif /* _TRACE_FIB6_H */ From 5760711e198d86bd0d0b9270a54a494ae9a501e0 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 16 Dec 2024 19:12:01 +0200 Subject: [PATCH 0797/1450] selftests: fib_rule_tests: Add flow label selector match tests Add tests for the new FIB rule flow label selector. Test both good and bad flows and with both input and output routes. # ./fib_rule_tests.sh IPv6 FIB rule tests [...] TEST: rule6 check: flowlabel redirect to table [ OK ] TEST: rule6 check: flowlabel no redirect to table [ OK ] TEST: rule6 del by pref: flowlabel redirect to table [ OK ] TEST: rule6 check: iif flowlabel redirect to table [ OK ] TEST: rule6 check: iif flowlabel no redirect to table [ OK ] TEST: rule6 del by pref: iif flowlabel redirect to table [ OK ] TEST: rule6 check: flowlabel masked redirect to table [ OK ] TEST: rule6 check: flowlabel masked no redirect to table [ OK ] TEST: rule6 del by pref: flowlabel masked redirect to table [ OK ] TEST: rule6 check: iif flowlabel masked redirect to table [ OK ] TEST: rule6 check: iif flowlabel masked no redirect to table [ OK ] TEST: rule6 del by pref: iif flowlabel masked redirect to table [ OK ] [...] Tests passed: 268 Tests failed: 0 Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: Guillaume Nault Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/fib_rule_tests.sh | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh index 1d58b3b87465e..847936363a12b 100755 --- a/tools/testing/selftests/net/fib_rule_tests.sh +++ b/tools/testing/selftests/net/fib_rule_tests.sh @@ -291,6 +291,37 @@ fib_rule6_test() "$getnomatch" "iif dscp redirect to table" \ "iif dscp no redirect to table" fi + + fib_check_iproute_support "flowlabel" "flowlabel" + if [ $? -eq 0 ]; then + match="flowlabel 0xfffff" + getmatch="flowlabel 0xfffff" + getnomatch="flowlabel 0xf" + fib_rule6_test_match_n_redirect "$match" "$getmatch" \ + "$getnomatch" "flowlabel redirect to table" \ + "flowlabel no redirect to table" + + match="flowlabel 0xfffff" + getmatch="from $SRC_IP6 iif $DEV flowlabel 0xfffff" + getnomatch="from $SRC_IP6 iif $DEV flowlabel 0xf" + fib_rule6_test_match_n_redirect "$match" "$getmatch" \ + "$getnomatch" "iif flowlabel redirect to table" \ + "iif flowlabel no redirect to table" + + match="flowlabel 0x08000/0x08000" + getmatch="flowlabel 0xfffff" + getnomatch="flowlabel 0xf7fff" + fib_rule6_test_match_n_redirect "$match" "$getmatch" \ + "$getnomatch" "flowlabel masked redirect to table" \ + "flowlabel masked no redirect to table" + + match="flowlabel 0x08000/0x08000" + getmatch="from $SRC_IP6 iif $DEV flowlabel 0xfffff" + getnomatch="from $SRC_IP6 iif $DEV flowlabel 0xf7fff" + fib_rule6_test_match_n_redirect "$match" "$getmatch" \ + "$getnomatch" "iif flowlabel masked redirect to table" \ + "iif flowlabel masked no redirect to table" + fi } fib_rule6_vrf_test() From 1b684ca15f9d78f45de3cdba7e19611387e16aa7 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Tue, 17 Dec 2024 10:49:15 +0700 Subject: [PATCH 0798/1450] drm/sched: Fix drm_sched_fini() docu generation Commit baf4afc5831438 ("drm/sched: Improve teardown documentation") added a list of drm_sched_fini()'s problems. The list triggers htmldocs warning (but renders correctly in htmldocs output): Documentation/gpu/drm-mm:571: ./drivers/gpu/drm/scheduler/sched_main.c:1359: ERROR: Unexpected indentation. Separate the list from the preceding paragraph by a blank line to fix the warning. While at it, also end the aforementioned paragraph by a colon. Fixes: baf4afc58314 ("drm/sched: Improve teardown documentation") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/r/20241108175655.6d3fcfb7@canb.auug.org.au/ Signed-off-by: Bagas Sanjaya [phasta: Adjust commit message] Signed-off-by: Philipp Stanner Link: https://patchwork.freedesktop.org/patch/msgid/20241217034915.62594-1-bagasdotme@gmail.com --- drivers/gpu/drm/scheduler/sched_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index 7ce25281c74ce..57da84908752c 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -1355,7 +1355,8 @@ EXPORT_SYMBOL(drm_sched_init); * drm_sched_backend_ops.run_job(). Consequently, drm_sched_backend_ops.free_job() * will not be called for all jobs still in drm_gpu_scheduler.pending_list. * There is no solution for this currently. Thus, it is up to the driver to make - * sure that + * sure that: + * * a) drm_sched_fini() is only called after for all submitted jobs * drm_sched_backend_ops.free_job() has been called or that * b) the jobs for which drm_sched_backend_ops.free_job() has not been called From a769bee5f9fbca47efd4fa6bc3d726d370cedebe Mon Sep 17 00:00:00 2001 From: Bharath SM Date: Tue, 17 Dec 2024 00:09:36 +0530 Subject: [PATCH 0799/1450] smb: use macros instead of constants for leasekey size and default cifsattrs value Replace default hardcoded value for cifsAttrs with ATTR_ARCHIVE macro Use SMB2_LEASE_KEY_SIZE macro for leasekey size in smb2_lease_break Signed-off-by: Bharath SM Signed-off-by: Steve French --- fs/smb/client/cifsfs.c | 2 +- fs/smb/client/smb2pdu.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 9d96b833015c8..b800c9f585d8d 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -398,7 +398,7 @@ cifs_alloc_inode(struct super_block *sb) cifs_inode = alloc_inode_sb(sb, cifs_inode_cachep, GFP_KERNEL); if (!cifs_inode) return NULL; - cifs_inode->cifsAttrs = 0x20; /* default */ + cifs_inode->cifsAttrs = ATTR_ARCHIVE; /* default */ cifs_inode->time = 0; /* * Until the file is open and we have gotten oplock info back from the diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 010eae9d6c47e..c945b94318f84 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -6204,7 +6204,7 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, req->StructureSize = cpu_to_le16(36); total_len += 12; - memcpy(req->LeaseKey, lease_key, 16); + memcpy(req->LeaseKey, lease_key, SMB2_LEASE_KEY_SIZE); req->LeaseState = lease_state; flags |= CIFS_NO_RSP_BUF; From ee1c8e6b2931811a906b8c78006bfe0a3386fa60 Mon Sep 17 00:00:00 2001 From: Dragan Simic Date: Tue, 17 Dec 2024 10:25:10 +0100 Subject: [PATCH 0800/1450] smb: client: Deduplicate "select NETFS_SUPPORT" in Kconfig Repeating automatically selected options in Kconfig files is redundant, so let's delete repeated "select NETFS_SUPPORT" that was added accidentally. Fixes: 69c3c023af25 ("cifs: Implement netfslib hooks") Signed-off-by: Dragan Simic Signed-off-by: Steve French --- fs/smb/client/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/smb/client/Kconfig b/fs/smb/client/Kconfig index 2aff6d1395ce3..9f05f94e265a6 100644 --- a/fs/smb/client/Kconfig +++ b/fs/smb/client/Kconfig @@ -2,7 +2,6 @@ config CIFS tristate "SMB3 and CIFS support (advanced network filesystem)" depends on INET - select NETFS_SUPPORT select NLS select NLS_UCS2_UTILS select CRYPTO From e9f2517a3e18a54a3943c098d2226b245d488801 Mon Sep 17 00:00:00 2001 From: Enzo Matsumiya Date: Tue, 10 Dec 2024 18:15:12 -0300 Subject: [PATCH 0801/1450] smb: client: fix TCP timers deadlock after rmmod Commit ef7134c7fc48 ("smb: client: Fix use-after-free of network namespace.") fixed a netns UAF by manually enabled socket refcounting (sk->sk_net_refcnt=1 and sock_inuse_add(net, 1)). The reason the patch worked for that bug was because we now hold references to the netns (get_net_track() gets a ref internally) and they're properly released (internally, on __sk_destruct()), but only because sk->sk_net_refcnt was set. Problem: (this happens regardless of CONFIG_NET_NS_REFCNT_TRACKER and regardless if init_net or other) Setting sk->sk_net_refcnt=1 *manually* and *after* socket creation is not only out of cifs scope, but also technically wrong -- it's set conditionally based on user (=1) vs kernel (=0) sockets. And net/ implementations seem to base their user vs kernel space operations on it. e.g. upon TCP socket close, the TCP timers are not cleared because sk->sk_net_refcnt=1: (cf. commit 151c9c724d05 ("tcp: properly terminate timers for kernel sockets")) net/ipv4/tcp.c: void tcp_close(struct sock *sk, long timeout) { lock_sock(sk); __tcp_close(sk, timeout); release_sock(sk); if (!sk->sk_net_refcnt) inet_csk_clear_xmit_timers_sync(sk); sock_put(sk); } Which will throw a lockdep warning and then, as expected, deadlock on tcp_write_timer(). A way to reproduce this is by running the reproducer from ef7134c7fc48 and then 'rmmod cifs'. A few seconds later, the deadlock/lockdep warning shows up. Fix: We shouldn't mess with socket internals ourselves, so do not set sk_net_refcnt manually. Also change __sock_create() to sock_create_kern() for explicitness. As for non-init_net network namespaces, we deal with it the best way we can -- hold an extra netns reference for server->ssocket and drop it when it's released. This ensures that the netns still exists whenever we need to create/destroy server->ssocket, but is not directly tied to it. Fixes: ef7134c7fc48 ("smb: client: Fix use-after-free of network namespace.") Cc: stable@vger.kernel.org Signed-off-by: Enzo Matsumiya Signed-off-by: Steve French --- fs/smb/client/connect.c | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 2372538a12118..ddcc9e514a0ed 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -987,9 +987,13 @@ clean_demultiplex_info(struct TCP_Server_Info *server) msleep(125); if (cifs_rdma_enabled(server)) smbd_destroy(server); + if (server->ssocket) { sock_release(server->ssocket); server->ssocket = NULL; + + /* Release netns reference for the socket. */ + put_net(cifs_net_ns(server)); } if (!list_empty(&server->pending_mid_q)) { @@ -1037,6 +1041,7 @@ clean_demultiplex_info(struct TCP_Server_Info *server) */ } + /* Release netns reference for this server. */ put_net(cifs_net_ns(server)); kfree(server->leaf_fullpath); kfree(server); @@ -1713,6 +1718,8 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx, tcp_ses->ops = ctx->ops; tcp_ses->vals = ctx->vals; + + /* Grab netns reference for this server. */ cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns)); tcp_ses->conn_id = atomic_inc_return(&tcpSesNextId); @@ -1844,6 +1851,7 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx, out_err_crypto_release: cifs_crypto_secmech_release(tcp_ses); + /* Release netns reference for this server. */ put_net(cifs_net_ns(tcp_ses)); out_err: @@ -1852,8 +1860,10 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx, cifs_put_tcp_session(tcp_ses->primary_server, false); kfree(tcp_ses->hostname); kfree(tcp_ses->leaf_fullpath); - if (tcp_ses->ssocket) + if (tcp_ses->ssocket) { sock_release(tcp_ses->ssocket); + put_net(cifs_net_ns(tcp_ses)); + } kfree(tcp_ses); } return ERR_PTR(rc); @@ -3131,20 +3141,20 @@ generic_ip_connect(struct TCP_Server_Info *server) socket = server->ssocket; } else { struct net *net = cifs_net_ns(server); - struct sock *sk; - rc = __sock_create(net, sfamily, SOCK_STREAM, - IPPROTO_TCP, &server->ssocket, 1); + rc = sock_create_kern(net, sfamily, SOCK_STREAM, IPPROTO_TCP, &server->ssocket); if (rc < 0) { cifs_server_dbg(VFS, "Error %d creating socket\n", rc); return rc; } - sk = server->ssocket->sk; - __netns_tracker_free(net, &sk->ns_tracker, false); - sk->sk_net_refcnt = 1; - get_net_track(net, &sk->ns_tracker, GFP_KERNEL); - sock_inuse_add(net, 1); + /* + * Grab netns reference for the socket. + * + * It'll be released here, on error, or in clean_demultiplex_info() upon server + * teardown. + */ + get_net(net); /* BB other socket options to set KEEPALIVE, NODELAY? */ cifs_dbg(FYI, "Socket created\n"); @@ -3158,8 +3168,10 @@ generic_ip_connect(struct TCP_Server_Info *server) } rc = bind_socket(server); - if (rc < 0) + if (rc < 0) { + put_net(cifs_net_ns(server)); return rc; + } /* * Eventually check for other socket options to change from @@ -3196,6 +3208,7 @@ generic_ip_connect(struct TCP_Server_Info *server) if (rc < 0) { cifs_dbg(FYI, "Error %d connecting to server\n", rc); trace_smb3_connect_err(server->hostname, server->conn_id, &server->dstaddr, rc); + put_net(cifs_net_ns(server)); sock_release(socket); server->ssocket = NULL; return rc; @@ -3204,6 +3217,9 @@ generic_ip_connect(struct TCP_Server_Info *server) if (sport == htons(RFC1001_PORT)) rc = ip_rfc1001_connect(server); + if (rc < 0) + put_net(cifs_net_ns(server)); + return rc; } From de35994ecd2dd6148ab5a6c5050a1670a04dec77 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Thu, 19 Dec 2024 09:30:30 +0000 Subject: [PATCH 0802/1450] workqueue: Do not warn when cancelling WQ_MEM_RECLAIM work from !WQ_MEM_RECLAIM worker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After commit 746ae46c1113 ("drm/sched: Mark scheduler work queues with WQ_MEM_RECLAIM") amdgpu started seeing the following warning: [ ] workqueue: WQ_MEM_RECLAIM sdma0:drm_sched_run_job_work [gpu_sched] is flushing !WQ_MEM_RECLAIM events:amdgpu_device_delay_enable_gfx_off [amdgpu] ... [ ] Workqueue: sdma0 drm_sched_run_job_work [gpu_sched] ... [ ] Call Trace: [ ] ... [ ] ? check_flush_dependency+0xf5/0x110 ... [ ] cancel_delayed_work_sync+0x6e/0x80 [ ] amdgpu_gfx_off_ctrl+0xab/0x140 [amdgpu] [ ] amdgpu_ring_alloc+0x40/0x50 [amdgpu] [ ] amdgpu_ib_schedule+0xf4/0x810 [amdgpu] [ ] ? drm_sched_run_job_work+0x22c/0x430 [gpu_sched] [ ] amdgpu_job_run+0xaa/0x1f0 [amdgpu] [ ] drm_sched_run_job_work+0x257/0x430 [gpu_sched] [ ] process_one_work+0x217/0x720 ... [ ] The intent of the verifcation done in check_flush_depedency is to ensure forward progress during memory reclaim, by flagging cases when either a memory reclaim process, or a memory reclaim work item is flushed from a context not marked as memory reclaim safe. This is correct when flushing, but when called from the cancel(_delayed)_work_sync() paths it is a false positive because work is either already running, or will not be running at all. Therefore cancelling it is safe and we can relax the warning criteria by letting the helper know of the calling context. Signed-off-by: Tvrtko Ursulin Fixes: fca839c00a12 ("workqueue: warn if memory reclaim tries to flush !WQ_MEM_RECLAIM workqueue") References: 746ae46c1113 ("drm/sched: Mark scheduler work queues with WQ_MEM_RECLAIM") Cc: Tejun Heo Cc: Peter Zijlstra Cc: Lai Jiangshan Cc: Alex Deucher Cc: Christian König Cc: # v4.5+ Signed-off-by: Tejun Heo --- kernel/workqueue.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8b07576814a58..8336218ec4b86 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3680,23 +3680,27 @@ void workqueue_softirq_dead(unsigned int cpu) * check_flush_dependency - check for flush dependency sanity * @target_wq: workqueue being flushed * @target_work: work item being flushed (NULL for workqueue flushes) + * @from_cancel: are we called from the work cancel path * * %current is trying to flush the whole @target_wq or @target_work on it. - * If @target_wq doesn't have %WQ_MEM_RECLAIM, verify that %current is not - * reclaiming memory or running on a workqueue which doesn't have - * %WQ_MEM_RECLAIM as that can break forward-progress guarantee leading to - * a deadlock. + * If this is not the cancel path (which implies work being flushed is either + * already running, or will not be at all), check if @target_wq doesn't have + * %WQ_MEM_RECLAIM and verify that %current is not reclaiming memory or running + * on a workqueue which doesn't have %WQ_MEM_RECLAIM as that can break forward- + * progress guarantee leading to a deadlock. */ static void check_flush_dependency(struct workqueue_struct *target_wq, - struct work_struct *target_work) + struct work_struct *target_work, + bool from_cancel) { - work_func_t target_func = target_work ? target_work->func : NULL; + work_func_t target_func; struct worker *worker; - if (target_wq->flags & WQ_MEM_RECLAIM) + if (from_cancel || target_wq->flags & WQ_MEM_RECLAIM) return; worker = current_wq_worker(); + target_func = target_work ? target_work->func : NULL; WARN_ONCE(current->flags & PF_MEMALLOC, "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps", @@ -3980,7 +3984,7 @@ void __flush_workqueue(struct workqueue_struct *wq) list_add_tail(&this_flusher.list, &wq->flusher_overflow); } - check_flush_dependency(wq, NULL); + check_flush_dependency(wq, NULL, false); mutex_unlock(&wq->mutex); @@ -4155,7 +4159,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, } wq = pwq->wq; - check_flush_dependency(wq, work); + check_flush_dependency(wq, work, from_cancel); insert_wq_barrier(pwq, barr, work, worker); raw_spin_unlock_irq(&pool->lock); From c261e4f1dd29fabab54b325bc1da8769a3998be1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 19 Dec 2024 09:32:26 -0700 Subject: [PATCH 0803/1450] io_uring/register: limit ring resizing to DEFER_TASKRUN With DEFER_TASKRUN, we know the ring can't be both waited upon and resized at the same time. This is important for CQ resizing. Allowing SQ ring resizing is more trivial, but isn't the interesting use case. Hence limit ring resizing in general to DEFER_TASKRUN only for now. This isn't a huge problem as CQ ring resizing is generally the most useful on networking type of workloads where it can be hard to size the ring appropriately upfront, and those should be using DEFER_TASKRUN for better performance. Fixes: 79cfe9e59c2a ("io_uring/register: add IORING_REGISTER_RESIZE_RINGS") Signed-off-by: Jens Axboe --- io_uring/register.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/io_uring/register.c b/io_uring/register.c index 1e99c783abdf6..fdd44914c39c7 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -414,6 +414,9 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && current != ctx->submitter_task) return -EEXIST; + /* limited to DEFER_TASKRUN for now */ + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) + return -EINVAL; if (copy_from_user(&p, arg, sizeof(p))) return -EFAULT; if (p.flags & ~RESIZE_FLAGS) From 3202ca221578850f34e0fea39dc6cfa745ed7aac Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Tue, 17 Dec 2024 10:51:01 +0100 Subject: [PATCH 0804/1450] PCI: Honor Max Link Speed when determining supported speeds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Supported Link Speeds Vector in the Link Capabilities 2 Register indicates the *supported* link speeds. The Max Link Speed field in the Link Capabilities Register indicates the *maximum* of those speeds. pcie_get_supported_speeds() neglects to honor the Max Link Speed field and will thus incorrectly deem higher speeds as supported. Fix it. One user-visible issue addressed here is an incorrect value in the sysfs attribute "max_link_speed". But the main motivation is a boot hang reported by Niklas: Intel JHL7540 "Titan Ridge 2018" Thunderbolt controllers supports 2.5-8 GT/s speeds, but indicate 2.5 GT/s as maximum. Ilpo recalls seeing this on more devices. It can be explained by the controller's Downstream Ports supporting 8 GT/s if an Endpoint is attached, but limiting to 2.5 GT/s if the port interfaces to a PCIe Adapter, in accordance with USB4 v2 sec 11.2.1: "This section defines the functionality of an Internal PCIe Port that interfaces to a PCIe Adapter. [...] The Logical sub-block shall update the PCIe configuration registers with the following characteristics: [...] Max Link Speed field in the Link Capabilities Register set to 0001b (data rate of 2.5 GT/s only). Note: These settings do not represent actual throughput. Throughput is implementation specific and based on the USB4 Fabric performance." The present commit is not sufficient on its own to fix Niklas' boot hang, but it is a prerequisite: A subsequent commit will fix the boot hang by enabling bandwidth control only if more than one speed is supported. The GENMASK() macro used herein specifies 0 as lowest bit, even though the Supported Link Speeds Vector ends at bit 1. This is done on purpose to avoid a GENMASK(0, 1) macro if Max Link Speed is zero. That macro would be invalid as the lowest bit is greater than the highest bit. Ilpo has witnessed a zero Max Link Speed on Root Complex Integrated Endpoints in particular, so it does occur in practice. (The Link Capabilities Register is optional on RCiEPs per PCIe r6.2 sec 7.5.3.) Fixes: d2bd39c0456b ("PCI: Store all PCIe Supported Link Speeds") Closes: https://lore.kernel.org/r/70829798889c6d779ca0f6cd3260a765780d1369.camel@kernel.org Link: https://lore.kernel.org/r/fe03941e3e1cc42fb9bf4395e302bff53ee2198b.1734428762.git.lukas@wunner.de Reported-by: Niklas Schnelle Tested-by: Niklas Schnelle Signed-off-by: Lukas Wunner Signed-off-by: Krzysztof Wilczyński Reviewed-by: Jonathan Cameron Reviewed-by: Ilpo Järvinen --- drivers/pci/pci.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 0b29ec6e8e5e2..661f98c6c63a3 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -6232,12 +6232,14 @@ u8 pcie_get_supported_speeds(struct pci_dev *dev) pcie_capability_read_dword(dev, PCI_EXP_LNKCAP2, &lnkcap2); speeds = lnkcap2 & PCI_EXP_LNKCAP2_SLS; + /* Ignore speeds higher than Max Link Speed */ + pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnkcap); + speeds &= GENMASK(lnkcap & PCI_EXP_LNKCAP_SLS, 0); + /* PCIe r3.0-compliant */ if (speeds) return speeds; - pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnkcap); - /* Synthesize from the Max Link Speed field */ if ((lnkcap & PCI_EXP_LNKCAP_SLS) == PCI_EXP_LNKCAP_SLS_5_0GB) speeds = PCI_EXP_LNKCAP2_SLS_5_0GB | PCI_EXP_LNKCAP2_SLS_2_5GB; From 774c71c52aa487001c7da9f93b10cedc9985c371 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Tue, 17 Dec 2024 10:51:02 +0100 Subject: [PATCH 0805/1450] PCI/bwctrl: Enable only if more than one speed is supported MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a PCIe port only supports a single speed, enabling bandwidth control is pointless: There's no need to monitor autonomous speed changes, nor can the speed be changed. Not enabling it saves a small amount of memory and compute resources, but also fixes a boot hang reported by Niklas: It occurs when enabling bandwidth control on Downstream Ports of Intel JHL7540 "Titan Ridge 2018" Thunderbolt controllers. The ports only support 2.5 GT/s in accordance with USB4 v2 sec 11.2.1, so the present commit works around the issue. PCIe r6.2 sec 8.2.1 prescribes that: "A device must support 2.5 GT/s and is not permitted to skip support for any data rates between 2.5 GT/s and the highest supported rate." Consequently, bandwidth control is currently only disabled if a port doesn't support higher speeds than 2.5 GT/s. However the Implementation Note in PCIe r6.2 sec 7.5.3.18 cautions: "It is strongly encouraged that software primarily utilize the Supported Link Speeds Vector instead of the Max Link Speed field, so that software can determine the exact set of supported speeds on current and future hardware. This can avoid software being confused if a future specification defines Links that do not require support for all slower speeds." In other words, future revisions of the PCIe Base Spec may allow gaps in the Supported Link Speeds Vector. To be future-proof, don't just check whether speeds above 2.5 GT/s are supported, but rather check whether *more than one* speed is supported. Fixes: 665745f27487 ("PCI/bwctrl: Re-add BW notification portdrv as PCIe BW controller") Closes: https://lore.kernel.org/r/db8e457fcd155436449b035e8791a8241b0df400.camel@kernel.org Link: https://lore.kernel.org/r/3564908a9c99fc0d2a292473af7a94ebfc8f5820.1734428762.git.lukas@wunner.de Reported-by: Niklas Schnelle Tested-by: Niklas Schnelle Signed-off-by: Lukas Wunner Signed-off-by: Krzysztof Wilczyński Reviewed-by: Jonathan Cameron Reviewed-by: Mario Limonciello Reviewed-by: Ilpo Järvinen --- drivers/pci/pcie/portdrv.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/pci/pcie/portdrv.c b/drivers/pci/pcie/portdrv.c index 5e10306b63081..02e73099bad05 100644 --- a/drivers/pci/pcie/portdrv.c +++ b/drivers/pci/pcie/portdrv.c @@ -265,12 +265,14 @@ static int get_port_device_capability(struct pci_dev *dev) (pcie_ports_dpc_native || (services & PCIE_PORT_SERVICE_AER))) services |= PCIE_PORT_SERVICE_DPC; + /* Enable bandwidth control if more than one speed is supported. */ if (pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM || pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT) { u32 linkcap; pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &linkcap); - if (linkcap & PCI_EXP_LNKCAP_LBNC) + if (linkcap & PCI_EXP_LNKCAP_LBNC && + hweight8(dev->supported_speeds) > 1) services |= PCIE_PORT_SERVICE_BWCTRL; } From 92941c7f2c9529fac1b2670482d0ced3b46eac70 Mon Sep 17 00:00:00 2001 From: Bharath SM Date: Thu, 19 Dec 2024 23:28:50 +0530 Subject: [PATCH 0806/1450] smb: fix bytes written value in /proc/fs/cifs/Stats With recent netfs apis changes, the bytes written value was not getting updated in /proc/fs/cifs/Stats. Fix this by updating tcon->bytes in write operations. Fixes: 3ee1a1fc3981 ("cifs: Cut over to using netfslib") Signed-off-by: Bharath SM Signed-off-by: Steve French --- fs/smb/client/smb2pdu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index c945b94318f84..9593593012508 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4840,6 +4840,8 @@ smb2_writev_callback(struct mid_q_entry *mid) if (written > wdata->subreq.len) written &= 0xFFFF; + cifs_stats_bytes_written(tcon, written); + if (written < wdata->subreq.len) wdata->result = -ENOSPC; else @@ -5156,6 +5158,7 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, cifs_dbg(VFS, "Send error in write = %d\n", rc); } else { *nbytes = le32_to_cpu(rsp->DataLength); + cifs_stats_bytes_written(io_parms->tcon, *nbytes); trace_smb3_write_done(0, 0, xid, req->PersistentFileId, io_parms->tcon->tid, From dbd2ca9367eb19bc5e269b8c58b0b1514ada9156 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 19 Dec 2024 19:52:58 +0000 Subject: [PATCH 0807/1450] io_uring: check if iowq is killed before queuing task work can be executed after the task has gone through io_uring termination, whether it's the final task_work run or the fallback path. In this case, task work will find ->io_wq being already killed and null'ed, which is a problem if it then tries to forward the request to io_queue_iowq(). Make io_queue_iowq() fail requests in this case. Note that it also checks PF_KTHREAD, because the user can first close a DEFER_TASKRUN ring and shortly after kill the task, in which case ->iowq check would race. Cc: stable@vger.kernel.org Fixes: 50c52250e2d74 ("block: implement async io_uring discard cmd") Fixes: 773af69121ecc ("io_uring: always reissue from task_work context") Reported-by: Will Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/63312b4a2c2bb67ad67b857d17a300e1d3b078e8.1734637909.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 432b95ca9c85e..d3403c8216db8 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -514,7 +514,11 @@ static void io_queue_iowq(struct io_kiocb *req) struct io_uring_task *tctx = req->tctx; BUG_ON(!tctx); - BUG_ON(!tctx->io_wq); + + if ((current->flags & PF_KTHREAD) || !tctx->io_wq) { + io_req_task_queue_fail(req, -ECANCELED); + return; + } /* init ->work of the whole link before punting */ io_prep_async_link(req); From 25c6a5ab151fb9c886552bf5aa7cbf2a5c6e96af Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Tue, 17 Dec 2024 14:35:00 +0800 Subject: [PATCH 0808/1450] net: phy: micrel: Dynamically control external clock of KSZ PHY On the i.MX6ULL-14x14-EVK board, enet1_ref and enet2_ref are used as the clock sources for two external KSZ PHYs. However, after closing the two FEC ports, the clk_enable_count of the enet1_ref and enet2_ref clocks is not 0. The root cause is that since the commit 985329462723 ("net: phy: micrel: use devm_clk_get_optional_enabled for the rmii-ref clock"), the external clock of KSZ PHY has been enabled when the PHY driver probes, and it can only be disabled when the PHY driver is removed. This causes the clock to continue working when the system is suspended or the network port is down. Although Heiko explained in the commit message that the patch was because some clock suppliers need to enable the clock to get the valid clock rate , it seems that the simple fix is to disable the clock after getting the clock rate to solve the current problem. This is indeed true, but we need to admit that Heiko's patch has been applied for more than a year, and we cannot guarantee whether there are platforms that only enable rmii-ref in the KSZ PHY driver during this period. If this is the case, disabling rmii-ref will cause RMII on these platforms to not work. Secondly, commit 99ac4cbcc2a5 ("net: phy: micrel: allow usage of generic ethernet-phy clock") just simply enables the generic clock permanently, which seems like the generic clock may only be enabled in the PHY driver. If we simply disable the generic clock, RMII may not work. If we keep it as it is, the platform using the generic clock will have the same problem as the i.MX6ULL platform. To solve this problem, the clock is enabled when phy_driver::resume() is called, and the clock is disabled when phy_driver::suspend() is called. Since phy_driver::resume() and phy_driver::suspend() are not called in pairs, an additional clk_enable flag is added. When phy_driver::suspend() is called, the clock is disabled only if clk_enable is true. Conversely, when phy_driver::resume() is called, the clock is enabled if clk_enable is false. The changes that introduced the problem were only a few lines, while the current fix is about a hundred lines, which seems out of proportion, but it is necessary because kszphy_probe() is used by multiple KSZ PHYs and we need to fix all of them. Fixes: 985329462723 ("net: phy: micrel: use devm_clk_get_optional_enabled for the rmii-ref clock") Fixes: 99ac4cbcc2a5 ("net: phy: micrel: allow usage of generic ethernet-phy clock") Signed-off-by: Wei Fang Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20241217063500.1424011-1-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/micrel.c | 114 ++++++++++++++++++++++++++++++++++----- 1 file changed, 101 insertions(+), 13 deletions(-) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 3ef5088406740..eeb33eb181ac9 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -432,10 +432,12 @@ struct kszphy_ptp_priv { struct kszphy_priv { struct kszphy_ptp_priv ptp_priv; const struct kszphy_type *type; + struct clk *clk; int led_mode; u16 vct_ctrl1000; bool rmii_ref_clk_sel; bool rmii_ref_clk_sel_val; + bool clk_enable; u64 stats[ARRAY_SIZE(kszphy_hw_stats)]; }; @@ -2050,6 +2052,46 @@ static void kszphy_get_stats(struct phy_device *phydev, data[i] = kszphy_get_stat(phydev, i); } +static void kszphy_enable_clk(struct phy_device *phydev) +{ + struct kszphy_priv *priv = phydev->priv; + + if (!priv->clk_enable && priv->clk) { + clk_prepare_enable(priv->clk); + priv->clk_enable = true; + } +} + +static void kszphy_disable_clk(struct phy_device *phydev) +{ + struct kszphy_priv *priv = phydev->priv; + + if (priv->clk_enable && priv->clk) { + clk_disable_unprepare(priv->clk); + priv->clk_enable = false; + } +} + +static int kszphy_generic_resume(struct phy_device *phydev) +{ + kszphy_enable_clk(phydev); + + return genphy_resume(phydev); +} + +static int kszphy_generic_suspend(struct phy_device *phydev) +{ + int ret; + + ret = genphy_suspend(phydev); + if (ret) + return ret; + + kszphy_disable_clk(phydev); + + return 0; +} + static int kszphy_suspend(struct phy_device *phydev) { /* Disable PHY Interrupts */ @@ -2059,7 +2101,7 @@ static int kszphy_suspend(struct phy_device *phydev) phydev->drv->config_intr(phydev); } - return genphy_suspend(phydev); + return kszphy_generic_suspend(phydev); } static void kszphy_parse_led_mode(struct phy_device *phydev) @@ -2090,7 +2132,9 @@ static int kszphy_resume(struct phy_device *phydev) { int ret; - genphy_resume(phydev); + ret = kszphy_generic_resume(phydev); + if (ret) + return ret; /* After switching from power-down to normal mode, an internal global * reset is automatically generated. Wait a minimum of 1 ms before @@ -2112,6 +2156,24 @@ static int kszphy_resume(struct phy_device *phydev) return 0; } +/* Because of errata DS80000700A, receiver error following software + * power down. Suspend and resume callbacks only disable and enable + * external rmii reference clock. + */ +static int ksz8041_resume(struct phy_device *phydev) +{ + kszphy_enable_clk(phydev); + + return 0; +} + +static int ksz8041_suspend(struct phy_device *phydev) +{ + kszphy_disable_clk(phydev); + + return 0; +} + static int ksz9477_resume(struct phy_device *phydev) { int ret; @@ -2159,7 +2221,10 @@ static int ksz8061_resume(struct phy_device *phydev) if (!(ret & BMCR_PDOWN)) return 0; - genphy_resume(phydev); + ret = kszphy_generic_resume(phydev); + if (ret) + return ret; + usleep_range(1000, 2000); /* Re-program the value after chip is reset. */ @@ -2177,6 +2242,11 @@ static int ksz8061_resume(struct phy_device *phydev) return 0; } +static int ksz8061_suspend(struct phy_device *phydev) +{ + return kszphy_suspend(phydev); +} + static int kszphy_probe(struct phy_device *phydev) { const struct kszphy_type *type = phydev->drv->driver_data; @@ -2217,10 +2287,14 @@ static int kszphy_probe(struct phy_device *phydev) } else if (!clk) { /* unnamed clock from the generic ethernet-phy binding */ clk = devm_clk_get_optional_enabled(&phydev->mdio.dev, NULL); - if (IS_ERR(clk)) - return PTR_ERR(clk); } + if (IS_ERR(clk)) + return PTR_ERR(clk); + + clk_disable_unprepare(clk); + priv->clk = clk; + if (ksz8041_fiber_mode(phydev)) phydev->port = PORT_FIBRE; @@ -5290,6 +5364,21 @@ static int lan8841_probe(struct phy_device *phydev) return 0; } +static int lan8804_resume(struct phy_device *phydev) +{ + return kszphy_resume(phydev); +} + +static int lan8804_suspend(struct phy_device *phydev) +{ + return kszphy_generic_suspend(phydev); +} + +static int lan8841_resume(struct phy_device *phydev) +{ + return kszphy_generic_resume(phydev); +} + static int lan8841_suspend(struct phy_device *phydev) { struct kszphy_priv *priv = phydev->priv; @@ -5298,7 +5387,7 @@ static int lan8841_suspend(struct phy_device *phydev) if (ptp_priv->ptp_clock) ptp_cancel_worker_sync(ptp_priv->ptp_clock); - return genphy_suspend(phydev); + return kszphy_generic_suspend(phydev); } static struct phy_driver ksphy_driver[] = { @@ -5358,9 +5447,8 @@ static struct phy_driver ksphy_driver[] = { .get_sset_count = kszphy_get_sset_count, .get_strings = kszphy_get_strings, .get_stats = kszphy_get_stats, - /* No suspend/resume callbacks because of errata DS80000700A, - * receiver error following software power down. - */ + .suspend = ksz8041_suspend, + .resume = ksz8041_resume, }, { .phy_id = PHY_ID_KSZ8041RNLI, .phy_id_mask = MICREL_PHY_ID_MASK, @@ -5436,7 +5524,7 @@ static struct phy_driver ksphy_driver[] = { .soft_reset = genphy_soft_reset, .config_intr = kszphy_config_intr, .handle_interrupt = kszphy_handle_interrupt, - .suspend = kszphy_suspend, + .suspend = ksz8061_suspend, .resume = ksz8061_resume, }, { .phy_id = PHY_ID_KSZ9021, @@ -5507,8 +5595,8 @@ static struct phy_driver ksphy_driver[] = { .get_sset_count = kszphy_get_sset_count, .get_strings = kszphy_get_strings, .get_stats = kszphy_get_stats, - .suspend = genphy_suspend, - .resume = kszphy_resume, + .suspend = lan8804_suspend, + .resume = lan8804_resume, .config_intr = lan8804_config_intr, .handle_interrupt = lan8804_handle_interrupt, }, { @@ -5526,7 +5614,7 @@ static struct phy_driver ksphy_driver[] = { .get_strings = kszphy_get_strings, .get_stats = kszphy_get_stats, .suspend = lan8841_suspend, - .resume = genphy_resume, + .resume = lan8841_resume, .cable_test_start = lan8814_cable_test_start, .cable_test_get_status = ksz886x_cable_test_get_status, }, { From b1b66ae094cd2aa49c6841683cb7846bd46f38ca Mon Sep 17 00:00:00 2001 From: Hongguang Gao Date: Tue, 17 Dec 2024 10:26:15 -0800 Subject: [PATCH 0809/1450] bnxt_en: Use FW defined resource limits for RoCE If FW supports setting resource limits for RoCE, then just use the FW limits instead of using some fixed values in the driver. These limits will be used to allocate context memory for QP, SRQ, AH, and MR resources for RoCE. Reviewed-by: Damodharam Ammepalli Reviewed-by: Kalesh AP Signed-off-by: Hongguang Gao Signed-off-by: Selvin Xavier Signed-off-by: Michael Chan Link: https://patch.msgid.link/20241217182620.2454075-2-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 41 +++++++++++++------ drivers/net/ethernet/broadcom/bnxt/bnxt.h | 3 ++ drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c | 2 + drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h | 2 + 4 files changed, 36 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index b86f980fa7ea6..469352ac1f7e7 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -9117,10 +9117,18 @@ static int bnxt_alloc_ctx_mem(struct bnxt *bp) ena = 0; if ((bp->flags & BNXT_FLAG_ROCE_CAP) && !is_kdump_kernel()) { pg_lvl = 2; - extra_qps = min_t(u32, 65536, max_qps - l2_qps - qp1_qps); - /* allocate extra qps if fw supports RoCE fast qp destroy feature */ - extra_qps += fast_qpmd_qps; - extra_srqs = min_t(u32, 8192, max_srqs - srqs); + if (BNXT_SW_RES_LMT(bp)) { + extra_qps = max_qps - l2_qps - qp1_qps; + extra_srqs = max_srqs - srqs; + } else { + extra_qps = min_t(u32, 65536, + max_qps - l2_qps - qp1_qps); + /* allocate extra qps if fw supports RoCE fast qp + * destroy feature + */ + extra_qps += fast_qpmd_qps; + extra_srqs = min_t(u32, 8192, max_srqs - srqs); + } if (fast_qpmd_qps) ena |= FUNC_BACKING_STORE_CFG_REQ_ENABLES_QP_FAST_QPMD; } @@ -9156,14 +9164,20 @@ static int bnxt_alloc_ctx_mem(struct bnxt *bp) goto skip_rdma; ctxm = &ctx->ctx_arr[BNXT_CTX_MRAV]; - /* 128K extra is needed to accommodate static AH context - * allocation by f/w. - */ - num_mr = min_t(u32, ctxm->max_entries / 2, 1024 * 256); - num_ah = min_t(u32, num_mr, 1024 * 128); - ctxm->split_entry_cnt = BNXT_CTX_MRAV_AV_SPLIT_ENTRY + 1; - if (!ctxm->mrav_av_entries || ctxm->mrav_av_entries > num_ah) - ctxm->mrav_av_entries = num_ah; + if (BNXT_SW_RES_LMT(bp) && + ctxm->split_entry_cnt == BNXT_CTX_MRAV_AV_SPLIT_ENTRY + 1) { + num_ah = ctxm->mrav_av_entries; + num_mr = ctxm->max_entries - num_ah; + } else { + /* 128K extra is needed to accommodate static AH context + * allocation by f/w. + */ + num_mr = min_t(u32, ctxm->max_entries / 2, 1024 * 256); + num_ah = min_t(u32, num_mr, 1024 * 128); + ctxm->split_entry_cnt = BNXT_CTX_MRAV_AV_SPLIT_ENTRY + 1; + if (!ctxm->mrav_av_entries || ctxm->mrav_av_entries > num_ah) + ctxm->mrav_av_entries = num_ah; + } rc = bnxt_setup_ctxm_pg_tbls(bp, ctxm, num_mr + num_ah, 2); if (rc) @@ -9470,6 +9484,9 @@ static int __bnxt_hwrm_func_qcaps(struct bnxt *bp) bp->flags |= BNXT_FLAG_UDP_GSO_CAP; if (flags_ext2 & FUNC_QCAPS_RESP_FLAGS_EXT2_TX_PKT_TS_CMPL_SUPPORTED) bp->fw_cap |= BNXT_FW_CAP_TX_TS_CMP; + if (flags_ext2 & + FUNC_QCAPS_RESP_FLAGS_EXT2_SW_MAX_RESOURCE_LIMITS_SUPPORTED) + bp->fw_cap |= BNXT_FW_CAP_SW_MAX_RESOURCE_LIMITS; if (BNXT_PF(bp) && (flags_ext2 & FUNC_QCAPS_RESP_FLAGS_EXT2_ROCE_VF_RESOURCE_MGMT_SUPPORTED)) bp->fw_cap |= BNXT_FW_CAP_ROCE_VF_RESC_MGMT_SUPPORTED; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 7df7a2233307a..3e20d200da620 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -2482,6 +2482,7 @@ struct bnxt { #define BNXT_FW_CAP_CFA_NTUPLE_RX_EXT_IP_PROTO BIT_ULL(38) #define BNXT_FW_CAP_CFA_RFS_RING_TBL_IDX_V3 BIT_ULL(39) #define BNXT_FW_CAP_VNIC_RE_FLUSH BIT_ULL(40) + #define BNXT_FW_CAP_SW_MAX_RESOURCE_LIMITS BIT_ULL(41) u32 fw_dbg_cap; @@ -2501,6 +2502,8 @@ struct bnxt { ((bp)->fw_cap & BNXT_FW_CAP_ENABLE_RDMA_SRIOV) #define BNXT_ROCE_VF_RESC_CAP(bp) \ ((bp)->fw_cap & BNXT_FW_CAP_ROCE_VF_RESC_MGMT_SUPPORTED) +#define BNXT_SW_RES_LMT(bp) \ + ((bp)->fw_cap & BNXT_FW_CAP_SW_MAX_RESOURCE_LIMITS) u32 hwrm_spec_code; u16 hwrm_cmd_seq; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c index b771c84cdd895..94c6a0928ca02 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c @@ -416,6 +416,8 @@ static void bnxt_set_edev_info(struct bnxt_en_dev *edev, struct bnxt *bp) edev->flags |= BNXT_EN_FLAG_VF; if (BNXT_ROCE_VF_RESC_CAP(bp)) edev->flags |= BNXT_EN_FLAG_ROCE_VF_RES_MGMT; + if (BNXT_SW_RES_LMT(bp)) + edev->flags |= BNXT_EN_FLAG_SW_RES_LMT; edev->chip_num = bp->chip_num; edev->hw_ring_stats_size = bp->hw_ring_stats_size; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h index 5d6aac60f2365..54ad9f8273d76 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h @@ -65,6 +65,8 @@ struct bnxt_en_dev { #define BNXT_EN_FLAG_VF 0x10 #define BNXT_EN_VF(edev) ((edev)->flags & BNXT_EN_FLAG_VF) #define BNXT_EN_FLAG_ROCE_VF_RES_MGMT 0x20 + #define BNXT_EN_FLAG_SW_RES_LMT 0x40 +#define BNXT_EN_SW_RES_LMT(edev) ((edev)->flags & BNXT_EN_FLAG_SW_RES_LMT) struct bnxt_ulp *ulp_tbl; int l2_db_size; /* Doorbell BAR size in From fac5472fc845115ea543acbe9b183d330d6277ed Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 17 Dec 2024 10:26:16 -0800 Subject: [PATCH 0810/1450] bnxt_en: Do not allow ethtool -m on an untrusted VF Block all ethtool module operations on an untrusted VF. The firmware won't allow it and will return error. Reviewed-by: Kalesh AP Reviewed-by: Pavan Chebbi Signed-off-by: Michael Chan Link: https://patch.msgid.link/20241217182620.2454075-3-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 6 +++++- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 5 +++++ drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 12 ++++++++++++ 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 469352ac1f7e7..c0728d5ff8bc1 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -8279,16 +8279,20 @@ static int bnxt_hwrm_func_qcfg(struct bnxt *bp) if (rc) goto func_qcfg_exit; + flags = le16_to_cpu(resp->flags); #ifdef CONFIG_BNXT_SRIOV if (BNXT_VF(bp)) { struct bnxt_vf_info *vf = &bp->vf; vf->vlan = le16_to_cpu(resp->vlan) & VLAN_VID_MASK; + if (flags & FUNC_QCFG_RESP_FLAGS_TRUSTED_VF) + vf->flags |= BNXT_VF_TRUST; + else + vf->flags &= ~BNXT_VF_TRUST; } else { bp->pf.registered_vfs = le16_to_cpu(resp->registered_vfs); } #endif - flags = le16_to_cpu(resp->flags); if (flags & (FUNC_QCFG_RESP_FLAGS_FW_DCBX_AGENT_ENABLED | FUNC_QCFG_RESP_FLAGS_FW_LLDP_AGENT_ENABLED)) { bp->fw_cap |= BNXT_FW_CAP_LLDP_AGENT; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 3e20d200da620..d5e81e008ab5b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -2270,6 +2270,11 @@ struct bnxt { #define BNXT_PF(bp) (!((bp)->flags & BNXT_FLAG_VF)) #define BNXT_VF(bp) ((bp)->flags & BNXT_FLAG_VF) +#ifdef CONFIG_BNXT_SRIOV +#define BNXT_VF_IS_TRUSTED(bp) ((bp)->vf.flags & BNXT_VF_TRUST) +#else +#define BNXT_VF_IS_TRUSTED(bp) 0 +#endif #define BNXT_NPAR(bp) ((bp)->port_partition_type) #define BNXT_MH(bp) ((bp)->flags & BNXT_FLAG_MULTI_HOST) #define BNXT_SINGLE_PF(bp) (BNXT_PF(bp) && !BNXT_NPAR(bp) && !BNXT_MH(bp)) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index d87681d711067..28f2c471652c4 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -4375,6 +4375,9 @@ static int bnxt_get_module_info(struct net_device *dev, struct bnxt *bp = netdev_priv(dev); int rc; + if (BNXT_VF(bp) && !BNXT_VF_IS_TRUSTED(bp)) + return -EPERM; + /* No point in going further if phy status indicates * module is not inserted or if it is powered down or * if it is of type 10GBase-T @@ -4426,6 +4429,9 @@ static int bnxt_get_module_eeprom(struct net_device *dev, u16 start = eeprom->offset, length = eeprom->len; int rc = 0; + if (BNXT_VF(bp) && !BNXT_VF_IS_TRUSTED(bp)) + return -EPERM; + memset(data, 0, eeprom->len); /* Read A0 portion of the EEPROM */ @@ -4480,6 +4486,12 @@ static int bnxt_get_module_eeprom_by_page(struct net_device *dev, struct bnxt *bp = netdev_priv(dev); int rc; + if (BNXT_VF(bp) && !BNXT_VF_IS_TRUSTED(bp)) { + NL_SET_ERR_MSG_MOD(extack, + "Module read not permitted on untrusted VF"); + return -EPERM; + } + rc = bnxt_get_module_status(bp, extack); if (rc) return rc; From 36d1e70a90e98c59ee6635552db243d9ebc3c5ea Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 17 Dec 2024 10:26:17 -0800 Subject: [PATCH 0811/1450] bnxt_en: Skip PHY loopback ethtool selftest if unsupported by FW Skip PHY loopback selftest if firmware advertises that it is unsupported in the HWRM_PORT_PHY_QCAPS call. Only show PHY loopback test result to be 0 if the test has run and passes. Do the same for external loopback to be consistent. Reviewed-by: Pavan Chebbi Reviewed-by: Kalesh AP Signed-off-by: Michael Chan Link: https://patch.msgid.link/20241217182620.2454075-4-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 28f2c471652c4..8001849af879f 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -4914,20 +4914,26 @@ static void bnxt_self_test(struct net_device *dev, struct ethtool_test *etest, buf[BNXT_MACLPBK_TEST_IDX] = 0; bnxt_hwrm_mac_loopback(bp, false); + buf[BNXT_PHYLPBK_TEST_IDX] = 1; + if (bp->phy_flags & BNXT_PHY_FL_NO_PHY_LPBK) + goto skip_phy_loopback; + bnxt_hwrm_phy_loopback(bp, true, false); msleep(1000); - if (bnxt_run_loopback(bp)) { - buf[BNXT_PHYLPBK_TEST_IDX] = 1; + if (bnxt_run_loopback(bp)) etest->flags |= ETH_TEST_FL_FAILED; - } + else + buf[BNXT_PHYLPBK_TEST_IDX] = 0; +skip_phy_loopback: + buf[BNXT_EXTLPBK_TEST_IDX] = 1; if (do_ext_lpbk) { etest->flags |= ETH_TEST_FL_EXTERNAL_LB_DONE; bnxt_hwrm_phy_loopback(bp, true, true); msleep(1000); - if (bnxt_run_loopback(bp)) { - buf[BNXT_EXTLPBK_TEST_IDX] = 1; + if (bnxt_run_loopback(bp)) etest->flags |= ETH_TEST_FL_FAILED; - } + else + buf[BNXT_EXTLPBK_TEST_IDX] = 0; } bnxt_hwrm_phy_loopback(bp, false, false); bnxt_half_close_nic(bp); From b45a850585ca0cc45f7fe0f83be33a769ecc43ab Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 17 Dec 2024 10:26:18 -0800 Subject: [PATCH 0812/1450] bnxt_en: Skip MAC loopback selftest if it is unsupported by FW Call the new HWRM_PORT_MAC_QCAPS to check if mac loopback is supported. Skip the MAC loopback ethtool self test if it is not supported. Reviewed-by: Kalesh AP Reviewed-by: Pavan Chebbi Reviewed-by: Andy Gospodarek Signed-off-by: Michael Chan Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/20241217182620.2454075-5-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 24 +++++++++++++++++++ drivers/net/ethernet/broadcom/bnxt/bnxt.h | 5 ++++ .../net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 11 +++++---- 3 files changed, 36 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index c0728d5ff8bc1..46edea75e062f 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -11551,6 +11551,26 @@ static int bnxt_hwrm_phy_qcaps(struct bnxt *bp) return rc; } +static void bnxt_hwrm_mac_qcaps(struct bnxt *bp) +{ + struct hwrm_port_mac_qcaps_output *resp; + struct hwrm_port_mac_qcaps_input *req; + int rc; + + if (bp->hwrm_spec_code < 0x10a03) + return; + + rc = hwrm_req_init(bp, req, HWRM_PORT_MAC_QCAPS); + if (rc) + return; + + resp = hwrm_req_hold(bp, req); + rc = hwrm_req_send_silent(bp, req); + if (!rc) + bp->mac_flags = resp->flags; + hwrm_req_drop(bp, req); +} + static bool bnxt_support_dropped(u16 advertising, u16 supported) { u16 diff = advertising ^ supported; @@ -15679,6 +15699,10 @@ static int bnxt_probe_phy(struct bnxt *bp, bool fw_dflt) bp->dev->priv_flags |= IFF_SUPP_NOFCS; else bp->dev->priv_flags &= ~IFF_SUPP_NOFCS; + + bp->mac_flags = 0; + bnxt_hwrm_mac_qcaps(bp); + if (!fw_dflt) return 0; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index d5e81e008ab5b..094c9e95b4639 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -2668,6 +2668,11 @@ struct bnxt { #define BNXT_PHY_FL_BANK_SEL (PORT_PHY_QCAPS_RESP_FLAGS2_BANK_ADDR_SUPPORTED << 8) #define BNXT_PHY_FL_SPEEDS2 (PORT_PHY_QCAPS_RESP_FLAGS2_SPEEDS2_SUPPORTED << 8) + /* copied from flags in hwrm_port_mac_qcaps_output */ + u8 mac_flags; +#define BNXT_MAC_FL_NO_MAC_LPBK \ + PORT_MAC_QCAPS_RESP_FLAGS_LOCAL_LPBK_NOT_SUPPORTED + u8 num_tests; struct bnxt_test_info *test_info; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 8001849af879f..c094abfa1ebc7 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -4899,21 +4899,24 @@ static void bnxt_self_test(struct net_device *dev, struct ethtool_test *etest, bnxt_close_nic(bp, true, false); bnxt_run_fw_tests(bp, test_mask, &test_results); - buf[BNXT_MACLPBK_TEST_IDX] = 1; - bnxt_hwrm_mac_loopback(bp, true); - msleep(250); rc = bnxt_half_open_nic(bp); if (rc) { - bnxt_hwrm_mac_loopback(bp, false); etest->flags |= ETH_TEST_FL_FAILED; return; } + buf[BNXT_MACLPBK_TEST_IDX] = 1; + if (bp->mac_flags & BNXT_MAC_FL_NO_MAC_LPBK) + goto skip_mac_loopback; + + bnxt_hwrm_mac_loopback(bp, true); + msleep(250); if (bnxt_run_loopback(bp)) etest->flags |= ETH_TEST_FL_FAILED; else buf[BNXT_MACLPBK_TEST_IDX] = 0; bnxt_hwrm_mac_loopback(bp, false); +skip_mac_loopback: buf[BNXT_PHYLPBK_TEST_IDX] = 1; if (bp->phy_flags & BNXT_PHY_FL_NO_PHY_LPBK) goto skip_phy_loopback; From bf2afe0f1493e992852df4a58e4aabd01ab8b384 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 17 Dec 2024 10:26:19 -0800 Subject: [PATCH 0813/1450] bnxt_en: Skip reading PXP registers during ethtool -d if unsupported Newer firmware does not allow reading the PXP registers during ethtool -d, so skip the firmware call in that case. Userspace (bnxt.c) always expects the register block to be populated so zeroes will be returned instead. Reviewed-by: Ajit Khaparde Reviewed-by: Kalesh AP Reviewed-by: Somnath Kotur Reviewed-by: Pavan Chebbi Signed-off-by: Michael Chan Link: https://patch.msgid.link/20241217182620.2454075-6-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index c094abfa1ebc7..75a59dd72bcea 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -2050,7 +2050,8 @@ static void bnxt_get_regs(struct net_device *dev, struct ethtool_regs *regs, int rc; regs->version = 0; - bnxt_dbg_hwrm_rd_reg(bp, 0, BNXT_PXP_REG_LEN / 4, _p); + if (!(bp->fw_dbg_cap & DBG_QCAPS_RESP_FLAGS_REG_ACCESS_RESTRICTED)) + bnxt_dbg_hwrm_rd_reg(bp, 0, BNXT_PXP_REG_LEN / 4, _p); if (!(bp->fw_cap & BNXT_FW_CAP_PCIE_STATS_SUPPORTED)) return; From 73df38b097a608ee5d0054211e6cb479c8edad91 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 17 Dec 2024 10:26:20 -0800 Subject: [PATCH 0814/1450] MAINTAINERS: bnxt_en: Add Pavan Chebbi as co-maintainer Reviewed-by: Pavan Chebbi Signed-off-by: Michael Chan Link: https://patch.msgid.link/20241217182620.2454075-7-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index e67e0c1883492..1579124ef426b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4611,6 +4611,7 @@ F: drivers/net/ethernet/broadcom/bnx2x/ BROADCOM BNXT_EN 50 GIGABIT ETHERNET DRIVER M: Michael Chan +M: Pavan Chebbi L: netdev@vger.kernel.org S: Supported F: drivers/firmware/broadcom/tee_bnxt_fw.c From d81cadbe164265337f149cf31c9462d7217c1eed Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Mon, 4 Nov 2024 07:58:45 +0000 Subject: [PATCH 0815/1450] KVM: SVM: Disable AVIC on SNP-enabled system without HvInUseWrAllowed feature MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On SNP-enabled system, VMRUN marks AVIC Backing Page as in-use while the guest is running for both secure and non-secure guest. Any hypervisor write to the in-use vCPU's AVIC backing page (e.g. to inject an interrupt) will generate unexpected #PF in the host. Currently, attempt to run AVIC guest would result in the following error: BUG: unable to handle page fault for address: ff3a442e549cc270 #PF: supervisor write access in kernel mode #PF: error_code(0x80000003) - RMP violation PGD b6ee01067 P4D b6ee02067 PUD 10096d063 PMD 11c540063 PTE 80000001149cc163 SEV-SNP: PFN 0x1149cc unassigned, dumping non-zero entries in 2M PFN region: [0x114800 - 0x114a00] ... Newer AMD system is enhanced to allow hypervisor to modify the backing page for non-secure guest on SNP-enabled system. This enhancement is available when the CPUID Fn8000_001F_EAX bit 30 is set (HvInUseWrAllowed). This table describes AVIC support matrix w.r.t. SNP enablement: | Non-SNP system | SNP system ----------------------------------------------------- Non-SNP guest | AVIC Activate | AVIC Activate iff | | HvInuseWrAllowed=1 ----------------------------------------------------- SNP guest | N/A | Secure AVIC Therefore, check and disable AVIC in kvm_amd driver when the feature is not available on SNP-enabled system. See the AMD64 Architecture Programmer’s Manual (APM) Volume 2 for detail. (https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/ programmer-references/40332.pdf) Fixes: 216d106c7ff7 ("x86/sev: Add SEV-SNP host initialization support") Signed-off-by: Suravee Suthikulpanit Link: https://lore.kernel.org/r/20241104075845.7583-1-suravee.suthikulpanit@amd.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kvm/svm/avic.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 17b6590748c00..645aa360628da 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -452,6 +452,7 @@ #define X86_FEATURE_SME_COHERENT (19*32+10) /* AMD hardware-enforced cache coherency */ #define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" AMD SEV-ES full debug state swap support */ #define X86_FEATURE_SVSM (19*32+28) /* "svsm" SVSM present */ +#define X86_FEATURE_HV_INUSE_WR_ALLOWED (19*32+30) /* Allow Write to in-use hypervisor-owned pages */ /* AMD-defined Extended Feature 2 EAX, CPUID level 0x80000021 (EAX), word 20 */ #define X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* No Nested Data Breakpoints */ diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 4b74ea91f4e6b..65fd245a9953c 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -1199,6 +1199,12 @@ bool avic_hardware_setup(void) return false; } + if (cc_platform_has(CC_ATTR_HOST_SEV_SNP) && + !boot_cpu_has(X86_FEATURE_HV_INUSE_WR_ALLOWED)) { + pr_warn("AVIC disabled: missing HvInUseWrAllowed on SNP-enabled system\n"); + return false; + } + if (boot_cpu_has(X86_FEATURE_AVIC)) { pr_info("AVIC enabled\n"); } else if (force_avic) { From 9b42d1e8e4fe9dc631162c04caa69b0d1860b0f0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 27 Nov 2024 16:43:39 -0800 Subject: [PATCH 0816/1450] KVM: x86: Play nice with protected guests in complete_hypercall_exit() Use is_64_bit_hypercall() instead of is_64_bit_mode() to detect a 64-bit hypercall when completing said hypercall. For guests with protected state, e.g. SEV-ES and SEV-SNP, KVM must assume the hypercall was made in 64-bit mode as the vCPU state needed to detect 64-bit mode is unavailable. Hacking the sev_smoke_test selftest to generate a KVM_HC_MAP_GPA_RANGE hypercall via VMGEXIT trips the WARN: ------------[ cut here ]------------ WARNING: CPU: 273 PID: 326626 at arch/x86/kvm/x86.h:180 complete_hypercall_exit+0x44/0xe0 [kvm] Modules linked in: kvm_amd kvm ... [last unloaded: kvm] CPU: 273 UID: 0 PID: 326626 Comm: sev_smoke_test Not tainted 6.12.0-smp--392e932fa0f3-feat #470 Hardware name: Google Astoria/astoria, BIOS 0.20240617.0-0 06/17/2024 RIP: 0010:complete_hypercall_exit+0x44/0xe0 [kvm] Call Trace: kvm_arch_vcpu_ioctl_run+0x2400/0x2720 [kvm] kvm_vcpu_ioctl+0x54f/0x630 [kvm] __se_sys_ioctl+0x6b/0xc0 do_syscall_64+0x83/0x160 entry_SYSCALL_64_after_hwframe+0x76/0x7e ---[ end trace 0000000000000000 ]--- Fixes: b5aead0064f3 ("KVM: x86: Assume a 64-bit hypercall for guests with protected state") Cc: stable@vger.kernel.org Cc: Tom Lendacky Reviewed-by: Xiaoyao Li Reviewed-by: Nikunj A Dadhania Reviewed-by: Tom Lendacky Reviewed-by: Binbin Wu Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20241128004344.4072099-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2e713480933a1..0b2fe4aa04a29 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9976,7 +9976,7 @@ static int complete_hypercall_exit(struct kvm_vcpu *vcpu) { u64 ret = vcpu->run->hypercall.ret; - if (!is_64_bit_mode(vcpu)) + if (!is_64_bit_hypercall(vcpu)) ret = (u32)ret; kvm_rax_write(vcpu, ret); ++vcpu->stat.hypercalls; From 4d5163cba43fe96902165606fa54e1aecbbb32de Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Dec 2024 09:29:52 -0800 Subject: [PATCH 0817/1450] KVM: SVM: Allow guest writes to set MSR_AMD64_DE_CFG bits Drop KVM's arbitrary behavior of making DE_CFG.LFENCE_SERIALIZE read-only for the guest, as rejecting writes can lead to guest crashes, e.g. Windows in particular doesn't gracefully handle unexpected #GPs on the WRMSR, and nothing in the AMD manuals suggests that LFENCE_SERIALIZE is read-only _if it exists_. KVM only allows LFENCE_SERIALIZE to be set, by the guest or host, if the underlying CPU has X86_FEATURE_LFENCE_RDTSC, i.e. if LFENCE is guaranteed to be serializing. So if the guest sets LFENCE_SERIALIZE, KVM will provide the desired/correct behavior without any additional action (the guest's value is never stuffed into hardware). And having LFENCE be serializing even when it's not _required_ to be is a-ok from a functional perspective. Fixes: 74a0e79df68a ("KVM: SVM: Disallow guest from changing userspace's MSR_AMD64_DE_CFG value") Fixes: d1d93fa90f1a ("KVM: SVM: Add MSR-based feature support for serializing LFENCE") Reported-by: Simon Pilkington Closes: https://lore.kernel.org/all/52914da7-a97b-45ad-86a0-affdf8266c61@mailbox.org Cc: Tom Lendacky Cc: stable@vger.kernel.org Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20241211172952.1477605-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index dd15cc6356553..21dacd3127792 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3201,15 +3201,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) if (data & ~supported_de_cfg) return 1; - /* - * Don't let the guest change the host-programmed value. The - * MSR is very model specific, i.e. contains multiple bits that - * are completely unknown to KVM, and the one bit known to KVM - * is simply a reflection of hardware capabilities. - */ - if (!msr->host_initiated && data != svm->msr_decfg) - return 1; - svm->msr_decfg = data; break; } From 386d69f9f29b0814881fa4f92ac7b8dfa9b4f44a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2024 13:36:11 -0800 Subject: [PATCH 0818/1450] KVM: x86/mmu: Treat TDP MMU faults as spurious if access is already allowed Treat slow-path TDP MMU faults as spurious if the access is allowed given the existing SPTE to fix a benign warning (other than the WARN itself) due to replacing a writable SPTE with a read-only SPTE, and to avoid the unnecessary LOCK CMPXCHG and subsequent TLB flush. If a read fault races with a write fault, fast GUP fails for any reason when trying to "promote" the read fault to a writable mapping, and KVM resolves the write fault first, then KVM will end up trying to install a read-only SPTE (for a !map_writable fault) overtop a writable SPTE. Note, it's not entirely clear why fast GUP fails, or if that's even how KVM ends up with a !map_writable fault with a writable SPTE. If something else is going awry, e.g. due to a bug in mmu_notifiers, then treating read faults as spurious in this scenario could effectively mask the underlying problem. However, retrying the faulting access instead of overwriting an existing SPTE is functionally correct and desirable irrespective of the WARN, and fast GUP _can_ legitimately fail with a writable VMA, e.g. if the Accessed bit in primary MMU's PTE is toggled and causes a PTE value mismatch. The WARN was also recently added, specifically to track down scenarios where KVM is unnecessarily overwrites SPTEs, i.e. treating the fault as spurious doesn't regress KVM's bug-finding capabilities in any way. In short, letting the WARN linger because there's a tiny chance it's due to a bug elsewhere would be excessively paranoid. Fixes: 1a175082b190 ("KVM: x86/mmu: WARN and flush if resolving a TDP MMU fault clears MMU-writable") Reported-by: Lei Yang Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219588 Tested-by: Lei Yang Link: https://lore.kernel.org/r/20241218213611.3181643-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 12 ------------ arch/x86/kvm/mmu/spte.h | 17 +++++++++++++++++ arch/x86/kvm/mmu/tdp_mmu.c | 5 +++++ 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 22e7ad2351231..2401606db2604 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3364,18 +3364,6 @@ static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, return true; } -static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte) -{ - if (fault->exec) - return is_executable_pte(spte); - - if (fault->write) - return is_writable_pte(spte); - - /* Fault was on Read access */ - return spte & PT_PRESENT_MASK; -} - /* * Returns the last level spte pointer of the shadow page walk for the given * gpa, and sets *spte to the spte value. This spte may be non-preset. If no diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index f332b33bc8178..af10bc0380a32 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -461,6 +461,23 @@ static inline bool is_mmu_writable_spte(u64 spte) return spte & shadow_mmu_writable_mask; } +/* + * Returns true if the access indicated by @fault is allowed by the existing + * SPTE protections. Note, the caller is responsible for checking that the + * SPTE is a shadow-present, leaf SPTE (either before or after). + */ +static inline bool is_access_allowed(struct kvm_page_fault *fault, u64 spte) +{ + if (fault->exec) + return is_executable_pte(spte); + + if (fault->write) + return is_writable_pte(spte); + + /* Fault was on Read access */ + return spte & PT_PRESENT_MASK; +} + /* * If the MMU-writable flag is cleared, i.e. the SPTE is write-protected for * write-tracking, remote TLBs must be flushed, even if the SPTE was read-only, diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 4508d868f1cdc..2f15e0e33903a 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -985,6 +985,11 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, if (fault->prefetch && is_shadow_present_pte(iter->old_spte)) return RET_PF_SPURIOUS; + if (is_shadow_present_pte(iter->old_spte) && + is_access_allowed(fault, iter->old_spte) && + is_last_spte(iter->old_spte, iter->level)) + return RET_PF_SPURIOUS; + if (unlikely(!fault->slot)) new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); else From 902806baf3c1e8383c1fe3ff0b6042b8cb5c2707 Mon Sep 17 00:00:00 2001 From: Stefan Ekenberg Date: Tue, 19 Nov 2024 08:40:29 +0100 Subject: [PATCH 0819/1450] drm/bridge: adv7511_audio: Update Audio InfoFrame properly AUDIO_UPDATE bit (Bit 5 of MAIN register 0x4A) needs to be set to 1 while updating Audio InfoFrame information and then set to 0 when done. Otherwise partially updated Audio InfoFrames could be sent out. Two cases where this rule were not followed are fixed: - In adv7511_hdmi_hw_params() make sure AUDIO_UPDATE bit is updated before/after setting ADV7511_REG_AUDIO_INFOFRAME. - In audio_startup() use the correct register for clearing AUDIO_UPDATE bit. The problem with corrupted audio infoframes were discovered by letting a HDMI logic analyser check the output of ADV7535. Note that this patchs replaces writing REG_GC(1) with REG_INFOFRAME_UPDATE. Bit 5 of REG_GC(1) is positioned within field GC_PP[3:0] and that field doesn't control audio infoframe and is read- only. My conclusion therefore was that the author if this code meant to clear bit 5 of REG_INFOFRAME_UPDATE from the very beginning. Tested-by: Biju Das Fixes: 53c515befe28 ("drm/bridge: adv7511: Add Audio support") Signed-off-by: Stefan Ekenberg Reviewed-by: Dmitry Baryshkov Link: https://patchwork.freedesktop.org/patch/msgid/20241119-adv7511-audio-info-frame-v4-1-4ae68e76c89c@axis.com Signed-off-by: Dmitry Baryshkov --- drivers/gpu/drm/bridge/adv7511/adv7511_audio.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/bridge/adv7511/adv7511_audio.c b/drivers/gpu/drm/bridge/adv7511/adv7511_audio.c index 61f4a38e7d2bf..8f786592143b6 100644 --- a/drivers/gpu/drm/bridge/adv7511/adv7511_audio.c +++ b/drivers/gpu/drm/bridge/adv7511/adv7511_audio.c @@ -153,7 +153,16 @@ static int adv7511_hdmi_hw_params(struct device *dev, void *data, ADV7511_AUDIO_CFG3_LEN_MASK, len); regmap_update_bits(adv7511->regmap, ADV7511_REG_I2C_FREQ_ID_CFG, ADV7511_I2C_FREQ_ID_CFG_RATE_MASK, rate << 4); - regmap_write(adv7511->regmap, 0x73, 0x1); + + /* send current Audio infoframe values while updating */ + regmap_update_bits(adv7511->regmap, ADV7511_REG_INFOFRAME_UPDATE, + BIT(5), BIT(5)); + + regmap_write(adv7511->regmap, ADV7511_REG_AUDIO_INFOFRAME(0), 0x1); + + /* use Audio infoframe updated info */ + regmap_update_bits(adv7511->regmap, ADV7511_REG_INFOFRAME_UPDATE, + BIT(5), 0); return 0; } @@ -184,8 +193,9 @@ static int audio_startup(struct device *dev, void *data) regmap_update_bits(adv7511->regmap, ADV7511_REG_GC(0), BIT(7) | BIT(6), BIT(7)); /* use Audio infoframe updated info */ - regmap_update_bits(adv7511->regmap, ADV7511_REG_GC(1), + regmap_update_bits(adv7511->regmap, ADV7511_REG_INFOFRAME_UPDATE, BIT(5), 0); + /* enable SPDIF receiver */ if (adv7511->audio_source == ADV7511_AUDIO_SOURCE_SPDIF) regmap_update_bits(adv7511->regmap, ADV7511_REG_AUDIO_CONFIG, From 81adbd3ff21c1182e06aa02c6be0bfd9ea02d8e8 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Tue, 19 Nov 2024 19:20:29 +0000 Subject: [PATCH 0820/1450] drm: adv7511: Fix use-after-free in adv7533_attach_dsi() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The host_node pointer was assigned and freed in adv7533_parse_dt(), and later, adv7533_attach_dsi() uses the same. Fix this use-after-free issue by dropping of_node_put() in adv7533_parse_dt() and calling of_node_put() in error path of probe() and also in the remove(). Fixes: 1e4d58cd7f88 ("drm/bridge: adv7533: Create a MIPI DSI device") Cc: stable@vger.kernel.org Reviewed-by: Laurent Pinchart Signed-off-by: Biju Das Link: https://patchwork.freedesktop.org/patch/msgid/20241119192040.152657-2-biju.das.jz@bp.renesas.com Signed-off-by: Dmitry Baryshkov --- drivers/gpu/drm/bridge/adv7511/adv7511_drv.c | 10 ++++++++-- drivers/gpu/drm/bridge/adv7511/adv7533.c | 2 -- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c b/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c index eb5919b382635..a13b3d8ab6ac6 100644 --- a/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c +++ b/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c @@ -1241,8 +1241,10 @@ static int adv7511_probe(struct i2c_client *i2c) return ret; ret = adv7511_init_regulators(adv7511); - if (ret) - return dev_err_probe(dev, ret, "failed to init regulators\n"); + if (ret) { + dev_err_probe(dev, ret, "failed to init regulators\n"); + goto err_of_node_put; + } /* * The power down GPIO is optional. If present, toggle it from active to @@ -1363,6 +1365,8 @@ static int adv7511_probe(struct i2c_client *i2c) i2c_unregister_device(adv7511->i2c_edid); uninit_regulators: adv7511_uninit_regulators(adv7511); +err_of_node_put: + of_node_put(adv7511->host_node); return ret; } @@ -1371,6 +1375,8 @@ static void adv7511_remove(struct i2c_client *i2c) { struct adv7511 *adv7511 = i2c_get_clientdata(i2c); + of_node_put(adv7511->host_node); + adv7511_uninit_regulators(adv7511); drm_bridge_remove(&adv7511->bridge); diff --git a/drivers/gpu/drm/bridge/adv7511/adv7533.c b/drivers/gpu/drm/bridge/adv7511/adv7533.c index 4481489aaf5eb..5f195e91b3e6d 100644 --- a/drivers/gpu/drm/bridge/adv7511/adv7533.c +++ b/drivers/gpu/drm/bridge/adv7511/adv7533.c @@ -181,8 +181,6 @@ int adv7533_parse_dt(struct device_node *np, struct adv7511 *adv) if (!adv->host_node) return -ENODEV; - of_node_put(adv->host_node); - adv->use_timing_gen = !of_property_read_bool(np, "adi,disable-timing-generator"); From ee8f9ed57a397605434caeef351bafa3ec4dfdd4 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Tue, 19 Nov 2024 19:20:30 +0000 Subject: [PATCH 0821/1450] dt-bindings: display: adi,adv7533: Drop single lane support As per [1] and [2], ADV7535/7533 supports only 2-, 3-, or 4-lane. Drop unsupported 1-lane from bindings. [1] https://www.analog.com/media/en/technical-documentation/data-sheets/ADV7535.pdf [2] https://www.analog.com/media/en/technical-documentation/data-sheets/ADV7533.pdf Fixes: 1e4d58cd7f88 ("drm/bridge: adv7533: Create a MIPI DSI device") Cc: stable@vger.kernel.org Acked-by: Krzysztof Kozlowski Reviewed-by: Geert Uytterhoeven Reviewed-by: Laurent Pinchart Signed-off-by: Biju Das Link: https://patchwork.freedesktop.org/patch/msgid/20241119192040.152657-3-biju.das.jz@bp.renesas.com Signed-off-by: Dmitry Baryshkov --- .../devicetree/bindings/display/bridge/adi,adv7533.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/display/bridge/adi,adv7533.yaml b/Documentation/devicetree/bindings/display/bridge/adi,adv7533.yaml index df20a3c9c7447..ec89115c74e4d 100644 --- a/Documentation/devicetree/bindings/display/bridge/adi,adv7533.yaml +++ b/Documentation/devicetree/bindings/display/bridge/adi,adv7533.yaml @@ -90,7 +90,7 @@ properties: adi,dsi-lanes: description: Number of DSI data lanes connected to the DSI host. $ref: /schemas/types.yaml#/definitions/uint32 - enum: [ 1, 2, 3, 4 ] + enum: [ 2, 3, 4 ] "#sound-dai-cells": const: 0 From 79d67c499c3f886202a40c5cb27e747e4fa4d738 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Tue, 19 Nov 2024 19:20:31 +0000 Subject: [PATCH 0822/1450] drm: adv7511: Drop dsi single lane support As per [1] and [2], ADV7535/7533 supports only 2-, 3-, or 4-lane. Drop unsupported 1-lane. [1] https://www.analog.com/media/en/technical-documentation/data-sheets/ADV7535.pdf [2] https://www.analog.com/media/en/technical-documentation/data-sheets/ADV7533.pdf Fixes: 1e4d58cd7f88 ("drm/bridge: adv7533: Create a MIPI DSI device") Reported-by: Hien Huynh Cc: stable@vger.kernel.org Reviewed-by: Laurent Pinchart Reviewed-by: Adam Ford Signed-off-by: Biju Das Link: https://patchwork.freedesktop.org/patch/msgid/20241119192040.152657-4-biju.das.jz@bp.renesas.com Signed-off-by: Dmitry Baryshkov --- drivers/gpu/drm/bridge/adv7511/adv7533.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/bridge/adv7511/adv7533.c b/drivers/gpu/drm/bridge/adv7511/adv7533.c index 5f195e91b3e6d..122ad91e8a329 100644 --- a/drivers/gpu/drm/bridge/adv7511/adv7533.c +++ b/drivers/gpu/drm/bridge/adv7511/adv7533.c @@ -172,7 +172,7 @@ int adv7533_parse_dt(struct device_node *np, struct adv7511 *adv) of_property_read_u32(np, "adi,dsi-lanes", &num_lanes); - if (num_lanes < 1 || num_lanes > 4) + if (num_lanes < 2 || num_lanes > 4) return -EINVAL; adv->num_dsi_lanes = num_lanes; From 262bfba8ab820641c8cfbbf03b86d6c00242c078 Mon Sep 17 00:00:00 2001 From: Tristram Ha Date: Tue, 17 Dec 2024 18:02:23 -0800 Subject: [PATCH 0823/1450] net: dsa: microchip: Fix KSZ9477 set_ageing_time function The aging count is not a simple 11-bit value but comprises a 3-bit multiplier and an 8-bit second count. The code tries to use the original multiplier which is 4 as the second count is still 300 seconds by default. Fixes: 2c119d9982b1 ("net: dsa: microchip: add the support for set_ageing_time") Signed-off-by: Tristram Ha Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241218020224.70590-2-Tristram.Ha@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/microchip/ksz9477.c | 47 +++++++++++++++++++------ drivers/net/dsa/microchip/ksz9477_reg.h | 4 +-- 2 files changed, 37 insertions(+), 14 deletions(-) diff --git a/drivers/net/dsa/microchip/ksz9477.c b/drivers/net/dsa/microchip/ksz9477.c index d16817e0476f2..29fe79ea74cdd 100644 --- a/drivers/net/dsa/microchip/ksz9477.c +++ b/drivers/net/dsa/microchip/ksz9477.c @@ -2,7 +2,7 @@ /* * Microchip KSZ9477 switch driver main logic * - * Copyright (C) 2017-2019 Microchip Technology Inc. + * Copyright (C) 2017-2024 Microchip Technology Inc. */ #include @@ -983,26 +983,51 @@ void ksz9477_get_caps(struct ksz_device *dev, int port, int ksz9477_set_ageing_time(struct ksz_device *dev, unsigned int msecs) { u32 secs = msecs / 1000; - u8 value; - u8 data; + u8 data, mult, value; + u32 max_val; int ret; - value = FIELD_GET(SW_AGE_PERIOD_7_0_M, secs); +#define MAX_TIMER_VAL ((1 << 8) - 1) - ret = ksz_write8(dev, REG_SW_LUE_CTRL_3, value); - if (ret < 0) - return ret; + /* The aging timer comprises a 3-bit multiplier and an 8-bit second + * value. Either of them cannot be zero. The maximum timer is then + * 7 * 255 = 1785 seconds. + */ + if (!secs) + secs = 1; - data = FIELD_GET(SW_AGE_PERIOD_10_8_M, secs); + /* Return error if too large. */ + else if (secs > 7 * MAX_TIMER_VAL) + return -EINVAL; ret = ksz_read8(dev, REG_SW_LUE_CTRL_0, &value); if (ret < 0) return ret; - value &= ~SW_AGE_CNT_M; - value |= FIELD_PREP(SW_AGE_CNT_M, data); + /* Check whether there is need to update the multiplier. */ + mult = FIELD_GET(SW_AGE_CNT_M, value); + max_val = MAX_TIMER_VAL; + if (mult > 0) { + /* Try to use the same multiplier already in the register as + * the hardware default uses multiplier 4 and 75 seconds for + * 300 seconds. + */ + max_val = DIV_ROUND_UP(secs, mult); + if (max_val > MAX_TIMER_VAL || max_val * mult != secs) + max_val = MAX_TIMER_VAL; + } + + data = DIV_ROUND_UP(secs, max_val); + if (mult != data) { + value &= ~SW_AGE_CNT_M; + value |= FIELD_PREP(SW_AGE_CNT_M, data); + ret = ksz_write8(dev, REG_SW_LUE_CTRL_0, value); + if (ret < 0) + return ret; + } - return ksz_write8(dev, REG_SW_LUE_CTRL_0, value); + value = DIV_ROUND_UP(secs, data); + return ksz_write8(dev, REG_SW_LUE_CTRL_3, value); } void ksz9477_port_queue_split(struct ksz_device *dev, int port) diff --git a/drivers/net/dsa/microchip/ksz9477_reg.h b/drivers/net/dsa/microchip/ksz9477_reg.h index 04235c22bf40e..ff579920078ee 100644 --- a/drivers/net/dsa/microchip/ksz9477_reg.h +++ b/drivers/net/dsa/microchip/ksz9477_reg.h @@ -2,7 +2,7 @@ /* * Microchip KSZ9477 register definitions * - * Copyright (C) 2017-2018 Microchip Technology Inc. + * Copyright (C) 2017-2024 Microchip Technology Inc. */ #ifndef __KSZ9477_REGS_H @@ -165,8 +165,6 @@ #define SW_VLAN_ENABLE BIT(7) #define SW_DROP_INVALID_VID BIT(6) #define SW_AGE_CNT_M GENMASK(5, 3) -#define SW_AGE_CNT_S 3 -#define SW_AGE_PERIOD_10_8_M GENMASK(10, 8) #define SW_RESV_MCAST_ENABLE BIT(2) #define SW_HASH_OPTION_M 0x03 #define SW_HASH_OPTION_CRC 1 From bb9869043438af5b94230f94fb4c39206525d758 Mon Sep 17 00:00:00 2001 From: Tristram Ha Date: Tue, 17 Dec 2024 18:02:24 -0800 Subject: [PATCH 0824/1450] net: dsa: microchip: Fix LAN937X set_ageing_time function The aging count is not a simple 20-bit value but comprises a 3-bit multiplier and a 20-bit second time. The code tries to use the original multiplier which is 4 as the second count is still 300 seconds by default. As the 20-bit number is now too large for practical use there is an option to interpret it as microseconds instead of seconds. Fixes: 2c119d9982b1 ("net: dsa: microchip: add the support for set_ageing_time") Signed-off-by: Tristram Ha Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241218020224.70590-3-Tristram.Ha@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/microchip/lan937x_main.c | 62 ++++++++++++++++++++++-- drivers/net/dsa/microchip/lan937x_reg.h | 9 ++-- 2 files changed, 65 insertions(+), 6 deletions(-) diff --git a/drivers/net/dsa/microchip/lan937x_main.c b/drivers/net/dsa/microchip/lan937x_main.c index b7652efd632ea..b1ae3b9de3d16 100644 --- a/drivers/net/dsa/microchip/lan937x_main.c +++ b/drivers/net/dsa/microchip/lan937x_main.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* Microchip LAN937X switch driver main logic - * Copyright (C) 2019-2022 Microchip Technology Inc. + * Copyright (C) 2019-2024 Microchip Technology Inc. */ #include #include @@ -461,10 +461,66 @@ int lan937x_change_mtu(struct ksz_device *dev, int port, int new_mtu) int lan937x_set_ageing_time(struct ksz_device *dev, unsigned int msecs) { - u32 secs = msecs / 1000; - u32 value; + u8 data, mult, value8; + bool in_msec = false; + u32 max_val, value; + u32 secs = msecs; int ret; +#define MAX_TIMER_VAL ((1 << 20) - 1) + + /* The aging timer comprises a 3-bit multiplier and a 20-bit second + * value. Either of them cannot be zero. The maximum timer is then + * 7 * 1048575 = 7340025 seconds. As this value is too large for + * practical use it can be interpreted as microseconds, making the + * maximum timer 7340 seconds with finer control. This allows for + * maximum 122 minutes compared to 29 minutes in KSZ9477 switch. + */ + if (msecs % 1000) + in_msec = true; + else + secs /= 1000; + if (!secs) + secs = 1; + + /* Return error if too large. */ + else if (secs > 7 * MAX_TIMER_VAL) + return -EINVAL; + + /* Configure how to interpret the number value. */ + ret = ksz_rmw8(dev, REG_SW_LUE_CTRL_2, SW_AGE_CNT_IN_MICROSEC, + in_msec ? SW_AGE_CNT_IN_MICROSEC : 0); + if (ret < 0) + return ret; + + ret = ksz_read8(dev, REG_SW_LUE_CTRL_0, &value8); + if (ret < 0) + return ret; + + /* Check whether there is need to update the multiplier. */ + mult = FIELD_GET(SW_AGE_CNT_M, value8); + max_val = MAX_TIMER_VAL; + if (mult > 0) { + /* Try to use the same multiplier already in the register as + * the hardware default uses multiplier 4 and 75 seconds for + * 300 seconds. + */ + max_val = DIV_ROUND_UP(secs, mult); + if (max_val > MAX_TIMER_VAL || max_val * mult != secs) + max_val = MAX_TIMER_VAL; + } + + data = DIV_ROUND_UP(secs, max_val); + if (mult != data) { + value8 &= ~SW_AGE_CNT_M; + value8 |= FIELD_PREP(SW_AGE_CNT_M, data); + ret = ksz_write8(dev, REG_SW_LUE_CTRL_0, value8); + if (ret < 0) + return ret; + } + + secs = DIV_ROUND_UP(secs, data); + value = FIELD_GET(SW_AGE_PERIOD_7_0_M, secs); ret = ksz_write8(dev, REG_SW_AGE_PERIOD__1, value); diff --git a/drivers/net/dsa/microchip/lan937x_reg.h b/drivers/net/dsa/microchip/lan937x_reg.h index 4ec93e421da45..72042fd64e5b0 100644 --- a/drivers/net/dsa/microchip/lan937x_reg.h +++ b/drivers/net/dsa/microchip/lan937x_reg.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Microchip LAN937X switch register definitions - * Copyright (C) 2019-2021 Microchip Technology Inc. + * Copyright (C) 2019-2024 Microchip Technology Inc. */ #ifndef __LAN937X_REG_H #define __LAN937X_REG_H @@ -56,8 +56,7 @@ #define SW_VLAN_ENABLE BIT(7) #define SW_DROP_INVALID_VID BIT(6) -#define SW_AGE_CNT_M 0x7 -#define SW_AGE_CNT_S 3 +#define SW_AGE_CNT_M GENMASK(5, 3) #define SW_RESV_MCAST_ENABLE BIT(2) #define REG_SW_LUE_CTRL_1 0x0311 @@ -70,6 +69,10 @@ #define SW_FAST_AGING BIT(1) #define SW_LINK_AUTO_AGING BIT(0) +#define REG_SW_LUE_CTRL_2 0x0312 + +#define SW_AGE_CNT_IN_MICROSEC BIT(7) + #define REG_SW_AGE_PERIOD__1 0x0313 #define SW_AGE_PERIOD_7_0_M GENMASK(7, 0) From 3fc87cb94f5f3224a9ea168ee935286d915d2a6a Mon Sep 17 00:00:00 2001 From: Tristram Ha Date: Tue, 17 Dec 2024 18:03:11 -0800 Subject: [PATCH 0825/1450] net: dsa: microchip: Add suspend/resume support to KSZ DSA driver The KSZ DSA driver starts a timer to read MIB counters periodically to avoid count overrun. During system suspend this will give an error for not able to write to register as the SPI system returns an error when it is in suspend state. This implementation stops the timer when the system goes into suspend and restarts it when resumed. Signed-off-by: Tristram Ha Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241218020311.70628-1-Tristram.Ha@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/microchip/ksz9477_i2c.c | 4 +++ drivers/net/dsa/microchip/ksz_common.c | 37 +++++++++++++++++++++++++ drivers/net/dsa/microchip/ksz_common.h | 2 ++ drivers/net/dsa/microchip/ksz_spi.c | 4 +++ 4 files changed, 47 insertions(+) diff --git a/drivers/net/dsa/microchip/ksz9477_i2c.c b/drivers/net/dsa/microchip/ksz9477_i2c.c index 1c6d7fc167723..a2beb27459f18 100644 --- a/drivers/net/dsa/microchip/ksz9477_i2c.c +++ b/drivers/net/dsa/microchip/ksz9477_i2c.c @@ -127,10 +127,14 @@ static const struct of_device_id ksz9477_dt_ids[] = { }; MODULE_DEVICE_TABLE(of, ksz9477_dt_ids); +static DEFINE_SIMPLE_DEV_PM_OPS(ksz_i2c_pm_ops, + ksz_switch_suspend, ksz_switch_resume); + static struct i2c_driver ksz9477_i2c_driver = { .driver = { .name = "ksz9477-switch", .of_match_table = ksz9477_dt_ids, + .pm = &ksz_i2c_pm_ops, }, .probe = ksz9477_i2c_probe, .remove = ksz9477_i2c_remove, diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index df314724e6a74..a8dac7ff6b817 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -4586,6 +4586,23 @@ static int ksz_hsr_leave(struct dsa_switch *ds, int port, return 0; } +static int ksz_suspend(struct dsa_switch *ds) +{ + struct ksz_device *dev = ds->priv; + + cancel_delayed_work_sync(&dev->mib_read); + return 0; +} + +static int ksz_resume(struct dsa_switch *ds) +{ + struct ksz_device *dev = ds->priv; + + if (dev->mib_read_interval) + schedule_delayed_work(&dev->mib_read, dev->mib_read_interval); + return 0; +} + static const struct dsa_switch_ops ksz_switch_ops = { .get_tag_protocol = ksz_get_tag_protocol, .connect_tag_protocol = ksz_connect_tag_protocol, @@ -4626,6 +4643,8 @@ static const struct dsa_switch_ops ksz_switch_ops = { .port_max_mtu = ksz_max_mtu, .get_wol = ksz_get_wol, .set_wol = ksz_set_wol, + .suspend = ksz_suspend, + .resume = ksz_resume, .get_ts_info = ksz_get_ts_info, .port_hwtstamp_get = ksz_hwtstamp_get, .port_hwtstamp_set = ksz_hwtstamp_set, @@ -5126,6 +5145,24 @@ void ksz_switch_remove(struct ksz_device *dev) } EXPORT_SYMBOL(ksz_switch_remove); +#ifdef CONFIG_PM_SLEEP +int ksz_switch_suspend(struct device *dev) +{ + struct ksz_device *priv = dev_get_drvdata(dev); + + return dsa_switch_suspend(priv->ds); +} +EXPORT_SYMBOL(ksz_switch_suspend); + +int ksz_switch_resume(struct device *dev) +{ + struct ksz_device *priv = dev_get_drvdata(dev); + + return dsa_switch_resume(priv->ds); +} +EXPORT_SYMBOL(ksz_switch_resume); +#endif + MODULE_AUTHOR("Woojung Huh "); MODULE_DESCRIPTION("Microchip KSZ Series Switch DSA Driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/net/dsa/microchip/ksz_common.h b/drivers/net/dsa/microchip/ksz_common.h index b3bb75ca0796d..2bc96127a447b 100644 --- a/drivers/net/dsa/microchip/ksz_common.h +++ b/drivers/net/dsa/microchip/ksz_common.h @@ -444,6 +444,8 @@ struct ksz_dev_ops { struct ksz_device *ksz_switch_alloc(struct device *base, void *priv); int ksz_switch_register(struct ksz_device *dev); void ksz_switch_remove(struct ksz_device *dev); +int ksz_switch_suspend(struct device *dev); +int ksz_switch_resume(struct device *dev); void ksz_init_mib_timer(struct ksz_device *dev); bool ksz_is_port_mac_global_usable(struct dsa_switch *ds, int port); diff --git a/drivers/net/dsa/microchip/ksz_spi.c b/drivers/net/dsa/microchip/ksz_spi.c index 108a958dc356f..b633d263098c6 100644 --- a/drivers/net/dsa/microchip/ksz_spi.c +++ b/drivers/net/dsa/microchip/ksz_spi.c @@ -239,10 +239,14 @@ static const struct spi_device_id ksz_spi_ids[] = { }; MODULE_DEVICE_TABLE(spi, ksz_spi_ids); +static DEFINE_SIMPLE_DEV_PM_OPS(ksz_spi_pm_ops, + ksz_switch_suspend, ksz_switch_resume); + static struct spi_driver ksz_spi_driver = { .driver = { .name = "ksz-switch", .of_match_table = ksz_dt_ids, + .pm = &ksz_spi_pm_ops, }, .id_table = ksz_spi_ids, .probe = ksz_spi_probe, From 75e2c86c7b180fd1068ad271178c2820a199e7eb Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 17 Dec 2024 18:44:00 -0800 Subject: [PATCH 0826/1450] net: netlink: catch attempts to send empty messages syzbot can figure out a way to redirect a netlink message to a tap. Sending empty skbs to devices is not valid and we end up hitting a skb_assert_len() in __dev_queue_xmit(). Make catching these mistakes easier, assert the skb size directly in netlink core. Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241218024400.824355-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/netlink/af_netlink.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index f4e7b5e4bb59f..85311226183a2 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1287,6 +1287,7 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation) { int delta; + skb_assert_len(skb); WARN_ON(skb->sk != NULL); delta = skb->end - skb->tail; if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize) From b55498ff14bd14860d48dc8d2a0b6889b218c408 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 16 Dec 2024 22:31:18 +0100 Subject: [PATCH 0827/1450] net: phy: add phy_disable_eee If a MAC driver doesn't support EEE, then the PHY shouldn't advertise it. Add phy_disable_eee() for this purpose. Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/fd51738c-dcd6-4d61-b8c5-faa6ac0f1026@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy_device.c | 16 ++++++++++++++++ include/linux/phy.h | 1 + 2 files changed, 17 insertions(+) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 1a908af4175b2..928dc3c509b66 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -3003,6 +3003,22 @@ void phy_support_eee(struct phy_device *phydev) } EXPORT_SYMBOL(phy_support_eee); +/** + * phy_disable_eee - Disable EEE for the PHY + * @phydev: Target phy_device struct + * + * This function is used by MAC drivers for MAC's which don't support EEE. + * It disables EEE on the PHY layer. + */ +void phy_disable_eee(struct phy_device *phydev) +{ + linkmode_zero(phydev->supported_eee); + linkmode_zero(phydev->advertising_eee); + phydev->eee_cfg.tx_lpi_enabled = false; + phydev->eee_cfg.eee_enabled = false; +} +EXPORT_SYMBOL_GPL(phy_disable_eee); + /** * phy_support_sym_pause - Enable support of symmetrical pause * @phydev: target phy_device struct diff --git a/include/linux/phy.h b/include/linux/phy.h index e597a32cc787a..5bc71d59910c3 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2071,6 +2071,7 @@ void phy_advertise_eee_all(struct phy_device *phydev); void phy_support_sym_pause(struct phy_device *phydev); void phy_support_asym_pause(struct phy_device *phydev); void phy_support_eee(struct phy_device *phydev); +void phy_disable_eee(struct phy_device *phydev); void phy_set_sym_pause(struct phy_device *phydev, bool rx, bool tx, bool autoneg); void phy_set_asym_pause(struct phy_device *phydev, bool rx, bool tx); From c9f5a5dabbf5ab53a6392d7c782d373d2c892e21 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 16 Dec 2024 22:32:25 +0100 Subject: [PATCH 0828/1450] net: ethernet: ti: cpsw: disable PHY EEE advertisement It seems the cpsw MAC doesn't support EEE. See e.g. the commit message of ce2899428ec0 ("ARM: dts: am335x-baltos: disable EEE for Atheros 8035 PHY"). There are cases where this causes issues if the PHY's on both sides have negotiated EEE. As a workaround EEE modes of the PHY are marked broken in DT, effectively disabling EEE advertisement. Improve this by using new function phy_disable_eee() in the MAC driver. This properly disables EEE advertisement, and allows to remove the eee-broken-xxx properties from DT. As EEE is disabled anyway, we can remove also the set_eee ethtool op. Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/d08a798e-8565-422c-b2ed-121794db077f@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/cpsw.c | 3 ++- drivers/net/ethernet/ti/cpsw_ethtool.c | 12 ------------ drivers/net/ethernet/ti/cpsw_new.c | 3 ++- drivers/net/ethernet/ti/cpsw_priv.h | 1 - 4 files changed, 4 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 4ef8cf6ea135a..1e290ee8edfd3 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -635,6 +635,8 @@ static void cpsw_slave_open(struct cpsw_slave *slave, struct cpsw_priv *priv) slave->phy = phy; + phy_disable_eee(slave->phy); + phy_attached_info(slave->phy); phy_start(slave->phy); @@ -1225,7 +1227,6 @@ static const struct ethtool_ops cpsw_ethtool_ops = { .get_link_ksettings = cpsw_get_link_ksettings, .set_link_ksettings = cpsw_set_link_ksettings, .get_eee = cpsw_get_eee, - .set_eee = cpsw_set_eee, .nway_reset = cpsw_nway_reset, .get_ringparam = cpsw_get_ringparam, .set_ringparam = cpsw_set_ringparam, diff --git a/drivers/net/ethernet/ti/cpsw_ethtool.c b/drivers/net/ethernet/ti/cpsw_ethtool.c index 21d55a180ef6e..bdc4db0d169c4 100644 --- a/drivers/net/ethernet/ti/cpsw_ethtool.c +++ b/drivers/net/ethernet/ti/cpsw_ethtool.c @@ -434,18 +434,6 @@ int cpsw_get_eee(struct net_device *ndev, struct ethtool_keee *edata) return -EOPNOTSUPP; } -int cpsw_set_eee(struct net_device *ndev, struct ethtool_keee *edata) -{ - struct cpsw_priv *priv = netdev_priv(ndev); - struct cpsw_common *cpsw = priv->cpsw; - int slave_no = cpsw_slave_index(cpsw, priv); - - if (cpsw->slaves[slave_no].phy) - return phy_ethtool_set_eee(cpsw->slaves[slave_no].phy, edata); - else - return -EOPNOTSUPP; -} - int cpsw_nway_reset(struct net_device *ndev) { struct cpsw_priv *priv = netdev_priv(ndev); diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c index a98bcc5eb5663..be4d90c1cbe7f 100644 --- a/drivers/net/ethernet/ti/cpsw_new.c +++ b/drivers/net/ethernet/ti/cpsw_new.c @@ -778,6 +778,8 @@ static void cpsw_slave_open(struct cpsw_slave *slave, struct cpsw_priv *priv) slave->phy = phy; + phy_disable_eee(slave->phy); + phy_attached_info(slave->phy); phy_start(slave->phy); @@ -1209,7 +1211,6 @@ static const struct ethtool_ops cpsw_ethtool_ops = { .get_link_ksettings = cpsw_get_link_ksettings, .set_link_ksettings = cpsw_set_link_ksettings, .get_eee = cpsw_get_eee, - .set_eee = cpsw_set_eee, .nway_reset = cpsw_nway_reset, .get_ringparam = cpsw_get_ringparam, .set_ringparam = cpsw_set_ringparam, diff --git a/drivers/net/ethernet/ti/cpsw_priv.h b/drivers/net/ethernet/ti/cpsw_priv.h index 1f448290b9f4b..f2fc55d9295dc 100644 --- a/drivers/net/ethernet/ti/cpsw_priv.h +++ b/drivers/net/ethernet/ti/cpsw_priv.h @@ -497,7 +497,6 @@ int cpsw_get_link_ksettings(struct net_device *ndev, int cpsw_set_link_ksettings(struct net_device *ndev, const struct ethtool_link_ksettings *ecmd); int cpsw_get_eee(struct net_device *ndev, struct ethtool_keee *edata); -int cpsw_set_eee(struct net_device *ndev, struct ethtool_keee *edata); int cpsw_nway_reset(struct net_device *ndev); void cpsw_get_ringparam(struct net_device *ndev, struct ethtool_ringparam *ering, From 0c7469ee718e1dd929f52bfb142a7f6fb68f0765 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Mon, 16 Dec 2024 18:47:33 +0100 Subject: [PATCH 0829/1450] net: airoha: Fix error path in airoha_probe() Do not run napi_disable() if airoha_hw_init() fails since Tx/Rx napi has not been started yet. In order to fix the issue, introduce airoha_qdma_stop_napi routine and remove napi_disable in airoha_hw_cleanup(). Fixes: 23020f049327 ("net: airoha: Introduce ethernet support for EN7581 SoC") Reviewed-by: Michal Swiatkowski Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20241216-airoha_probe-error-path-fix-v2-1-6b10e04e9a5c@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mediatek/airoha_eth.c | 33 ++++++++++++++++------ 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mediatek/airoha_eth.c b/drivers/net/ethernet/mediatek/airoha_eth.c index 6c683a12d5aa5..d8bfc21a5b194 100644 --- a/drivers/net/ethernet/mediatek/airoha_eth.c +++ b/drivers/net/ethernet/mediatek/airoha_eth.c @@ -2138,17 +2138,14 @@ static void airoha_hw_cleanup(struct airoha_qdma *qdma) if (!qdma->q_rx[i].ndesc) continue; - napi_disable(&qdma->q_rx[i].napi); netif_napi_del(&qdma->q_rx[i].napi); airoha_qdma_cleanup_rx_queue(&qdma->q_rx[i]); if (qdma->q_rx[i].page_pool) page_pool_destroy(qdma->q_rx[i].page_pool); } - for (i = 0; i < ARRAY_SIZE(qdma->q_tx_irq); i++) { - napi_disable(&qdma->q_tx_irq[i].napi); + for (i = 0; i < ARRAY_SIZE(qdma->q_tx_irq); i++) netif_napi_del(&qdma->q_tx_irq[i].napi); - } for (i = 0; i < ARRAY_SIZE(qdma->q_tx); i++) { if (!qdma->q_tx[i].ndesc) @@ -2173,6 +2170,21 @@ static void airoha_qdma_start_napi(struct airoha_qdma *qdma) } } +static void airoha_qdma_stop_napi(struct airoha_qdma *qdma) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(qdma->q_tx_irq); i++) + napi_disable(&qdma->q_tx_irq[i].napi); + + for (i = 0; i < ARRAY_SIZE(qdma->q_rx); i++) { + if (!qdma->q_rx[i].ndesc) + continue; + + napi_disable(&qdma->q_rx[i].napi); + } +} + static void airoha_update_hw_stats(struct airoha_gdm_port *port) { struct airoha_eth *eth = port->qdma->eth; @@ -2738,7 +2750,7 @@ static int airoha_probe(struct platform_device *pdev) err = airoha_hw_init(pdev, eth); if (err) - goto error; + goto error_hw_cleanup; for (i = 0; i < ARRAY_SIZE(eth->qdma); i++) airoha_qdma_start_napi(ð->qdma[i]); @@ -2753,13 +2765,16 @@ static int airoha_probe(struct platform_device *pdev) err = airoha_alloc_gdm_port(eth, np); if (err) { of_node_put(np); - goto error; + goto error_napi_stop; } } return 0; -error: +error_napi_stop: + for (i = 0; i < ARRAY_SIZE(eth->qdma); i++) + airoha_qdma_stop_napi(ð->qdma[i]); +error_hw_cleanup: for (i = 0; i < ARRAY_SIZE(eth->qdma); i++) airoha_hw_cleanup(ð->qdma[i]); @@ -2780,8 +2795,10 @@ static void airoha_remove(struct platform_device *pdev) struct airoha_eth *eth = platform_get_drvdata(pdev); int i; - for (i = 0; i < ARRAY_SIZE(eth->qdma); i++) + for (i = 0; i < ARRAY_SIZE(eth->qdma); i++) { + airoha_qdma_stop_napi(ð->qdma[i]); airoha_hw_cleanup(ð->qdma[i]); + } for (i = 0; i < ARRAY_SIZE(eth->ports); i++) { struct airoha_gdm_port *port = eth->ports[i]; From f6038d913b13b41dcaf10ff2a89f76d7ffac9edc Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Tue, 17 Dec 2024 20:12:06 +0000 Subject: [PATCH 0830/1450] net: Document netmem driver support Document expectations from drivers looking to add support for device memory tcp or other netmem based features. Signed-off-by: Mina Almasry Reviewed-by: Bagas Sanjaya Reviewed-by: Shannon Nelson Link: https://patch.msgid.link/20241217201206.2360389-1-almasrymina@google.com Signed-off-by: Jakub Kicinski --- Documentation/networking/index.rst | 1 + Documentation/networking/netmem.rst | 79 +++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 Documentation/networking/netmem.rst diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 46c178e564b34..058193ed2eeb8 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -86,6 +86,7 @@ Contents: netdevices netfilter-sysctl netif-msg + netmem nexthop-group-resilient nf_conntrack-sysctl nf_flowtable diff --git a/Documentation/networking/netmem.rst b/Documentation/networking/netmem.rst new file mode 100644 index 0000000000000..7de21ddb54129 --- /dev/null +++ b/Documentation/networking/netmem.rst @@ -0,0 +1,79 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================================== +Netmem Support for Network Drivers +================================== + +This document outlines the requirements for network drivers to support netmem, +an abstract memory type that enables features like device memory TCP. By +supporting netmem, drivers can work with various underlying memory types +with little to no modification. + +Benefits of Netmem : + +* Flexibility: Netmem can be backed by different memory types (e.g., struct + page, DMA-buf), allowing drivers to support various use cases such as device + memory TCP. +* Future-proof: Drivers with netmem support are ready for upcoming + features that rely on it. +* Simplified Development: Drivers interact with a consistent API, + regardless of the underlying memory implementation. + +Driver Requirements +=================== + +1. The driver must support page_pool. + +2. The driver must support the tcp-data-split ethtool option. + +3. The driver must use the page_pool netmem APIs for payload memory. The netmem + APIs currently 1-to-1 correspond with page APIs. Conversion to netmem should + be achievable by switching the page APIs to netmem APIs and tracking memory + via netmem_refs in the driver rather than struct page * : + + - page_pool_alloc -> page_pool_alloc_netmem + - page_pool_get_dma_addr -> page_pool_get_dma_addr_netmem + - page_pool_put_page -> page_pool_put_netmem + + Not all page APIs have netmem equivalents at the moment. If your driver + relies on a missing netmem API, feel free to add and propose to netdev@, or + reach out to the maintainers and/or almasrymina@google.com for help adding + the netmem API. + +4. The driver must use the following PP_FLAGS: + + - PP_FLAG_DMA_MAP: netmem is not dma-mappable by the driver. The driver + must delegate the dma mapping to the page_pool, which knows when + dma-mapping is (or is not) appropriate. + - PP_FLAG_DMA_SYNC_DEV: netmem dma addr is not necessarily dma-syncable + by the driver. The driver must delegate the dma syncing to the page_pool, + which knows when dma-syncing is (or is not) appropriate. + - PP_FLAG_ALLOW_UNREADABLE_NETMEM. The driver must specify this flag iff + tcp-data-split is enabled. + +5. The driver must not assume the netmem is readable and/or backed by pages. + The netmem returned by the page_pool may be unreadable, in which case + netmem_address() will return NULL. The driver must correctly handle + unreadable netmem, i.e. don't attempt to handle its contents when + netmem_address() is NULL. + + Ideally, drivers should not have to check the underlying netmem type via + helpers like netmem_is_net_iov() or convert the netmem to any of its + underlying types via netmem_to_page() or netmem_to_net_iov(). In most cases, + netmem or page_pool helpers that abstract this complexity are provided + (and more can be added). + +6. The driver must use page_pool_dma_sync_netmem_for_cpu() in lieu of + dma_sync_single_range_for_cpu(). For some memory providers, dma_syncing for + CPU will be done by the page_pool, for others (particularly dmabuf memory + provider), dma syncing for CPU is the responsibility of the userspace using + dmabuf APIs. The driver must delegate the entire dma-syncing operation to + the page_pool which will do it correctly. + +7. Avoid implementing driver-specific recycling on top of the page_pool. Drivers + cannot hold onto a struct page to do their own recycling as the netmem may + not be backed by a struct page. However, you may hold onto a page_pool + reference with page_pool_fragment_netmem() or page_pool_ref_netmem() for + that purpose, but be mindful that some netmem types might have longer + circulation times, such as when userspace holds a reference in zerocopy + scenarios. From 5c98e89d96ecbf0e4ed38a20c46727c0ed2c112b Mon Sep 17 00:00:00 2001 From: Furong Xu <0x1207@gmail.com> Date: Wed, 18 Dec 2024 16:34:07 +0800 Subject: [PATCH 0831/1450] net: stmmac: Drop useless code related to ethtool rx-copybreak After commit 2af6106ae949 ("net: stmmac: Introducing support for Page Pool"), the driver always copies frames to get a better performance, zero-copy for RX frames is no more, then these code turned to be useless and users of ethtool may get confused about the unhandled rx-copybreak parameter. This patch mostly reverts commit 22ad38381547 ("stmmac: do not perform zero-copy for rx frames") Signed-off-by: Furong Xu <0x1207@gmail.com> Link: https://patch.msgid.link/20241218083407.390509-1-0x1207@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac.h | 2 - .../ethernet/stmicro/stmmac/stmmac_ethtool.c | 39 ------------------- .../net/ethernet/stmicro/stmmac/stmmac_main.c | 5 --- 3 files changed, 46 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h index 1d86439b8a14f..b8d631e559c0e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h @@ -126,7 +126,6 @@ struct stmmac_rx_queue { unsigned int cur_rx; unsigned int dirty_rx; unsigned int buf_alloc_num; - u32 rx_zeroc_thresh; dma_addr_t dma_rx_phy; u32 rx_tail_addr; unsigned int state_saved; @@ -266,7 +265,6 @@ struct stmmac_priv { int sph_cap; u32 sarc_type; - unsigned int rx_copybreak; u32 rx_riwt[MTL_MAX_TX_QUEUES]; int hwts_rx_en; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c index 1d77389ce9533..16b4d8c21c905 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c @@ -1227,43 +1227,6 @@ static int stmmac_get_ts_info(struct net_device *dev, return ethtool_op_get_ts_info(dev, info); } -static int stmmac_get_tunable(struct net_device *dev, - const struct ethtool_tunable *tuna, void *data) -{ - struct stmmac_priv *priv = netdev_priv(dev); - int ret = 0; - - switch (tuna->id) { - case ETHTOOL_RX_COPYBREAK: - *(u32 *)data = priv->rx_copybreak; - break; - default: - ret = -EINVAL; - break; - } - - return ret; -} - -static int stmmac_set_tunable(struct net_device *dev, - const struct ethtool_tunable *tuna, - const void *data) -{ - struct stmmac_priv *priv = netdev_priv(dev); - int ret = 0; - - switch (tuna->id) { - case ETHTOOL_RX_COPYBREAK: - priv->rx_copybreak = *(u32 *)data; - break; - default: - ret = -EINVAL; - break; - } - - return ret; -} - static int stmmac_get_mm(struct net_device *ndev, struct ethtool_mm_state *state) { @@ -1390,8 +1353,6 @@ static const struct ethtool_ops stmmac_ethtool_ops = { .set_per_queue_coalesce = stmmac_set_per_queue_coalesce, .get_channels = stmmac_get_channels, .set_channels = stmmac_set_channels, - .get_tunable = stmmac_get_tunable, - .set_tunable = stmmac_set_tunable, .get_link_ksettings = stmmac_ethtool_get_link_ksettings, .set_link_ksettings = stmmac_ethtool_set_link_ksettings, .get_mm = stmmac_get_mm, diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 16b8bcfa8b111..6bc10ffe7a2bc 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -77,7 +77,6 @@ module_param(phyaddr, int, 0444); MODULE_PARM_DESC(phyaddr, "Physical device address"); #define STMMAC_TX_THRESH(x) ((x)->dma_conf.dma_tx_size / 4) -#define STMMAC_RX_THRESH(x) ((x)->dma_conf.dma_rx_size / 4) /* Limit to make sure XDP TX and slow path can coexist */ #define STMMAC_XSK_TX_BUDGET_MAX 256 @@ -107,8 +106,6 @@ static int buf_sz = DEFAULT_BUFSIZE; module_param(buf_sz, int, 0644); MODULE_PARM_DESC(buf_sz, "DMA buffer size"); -#define STMMAC_RX_COPYBREAK 256 - static const u32 default_msg_level = (NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK | NETIF_MSG_IFUP | NETIF_MSG_IFDOWN | NETIF_MSG_TIMER); @@ -3927,8 +3924,6 @@ static int __stmmac_open(struct net_device *dev, } } - priv->rx_copybreak = STMMAC_RX_COPYBREAK; - buf_sz = dma_conf->dma_buf_sz; for (int i = 0; i < MTL_MAX_TX_QUEUES; i++) if (priv->dma_conf.tx_queue[i].tbs & STMMAC_TBS_EN) From 455e135c3042540cf15fff629a56600c11bea396 Mon Sep 17 00:00:00 2001 From: Andy Moreton Date: Wed, 18 Dec 2024 13:59:30 +0000 Subject: [PATCH 0832/1450] sfc: remove efx_writed_page_locked From: Andy Moreton efx_writed_page_locked is a workaround for Siena hardware that is not needed on later adapters, and has no callers. Remove it. Signed-off-by: Andy Moreton Signed-off-by: Edward Cree Link: https://patch.msgid.link/20241218135930.2350358-1-edward.cree@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/sfc/io.h | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/drivers/net/ethernet/sfc/io.h b/drivers/net/ethernet/sfc/io.h index 4cc7b501135fb..ef374a8e05c39 100644 --- a/drivers/net/ethernet/sfc/io.h +++ b/drivers/net/ethernet/sfc/io.h @@ -217,28 +217,4 @@ _efx_writed_page(struct efx_nic *efx, const efx_dword_t *value, (reg) != 0xa1c), \ page) -/* Write TIMER_COMMAND. This is a page-mapped 32-bit CSR, but a bug - * in the BIU means that writes to TIMER_COMMAND[0] invalidate the - * collector register. - */ -static inline void _efx_writed_page_locked(struct efx_nic *efx, - const efx_dword_t *value, - unsigned int reg, - unsigned int page) -{ - unsigned long flags __attribute__ ((unused)); - - if (page == 0) { - spin_lock_irqsave(&efx->biu_lock, flags); - efx_writed(efx, value, efx_paged_reg(efx, page, reg)); - spin_unlock_irqrestore(&efx->biu_lock, flags); - } else { - efx_writed(efx, value, efx_paged_reg(efx, page, reg)); - } -} -#define efx_writed_page_locked(efx, value, reg, page) \ - _efx_writed_page_locked(efx, value, \ - reg + BUILD_BUG_ON_ZERO((reg) != 0x420), \ - page) - #endif /* EFX_IO_H */ From 6724bc65e59b57e64f65269da8956f8bdc12bb03 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Wed, 18 Dec 2024 09:00:18 -0500 Subject: [PATCH 0833/1450] selftests: net: remove redundant ncdevmem print Remove extrenous fprintf Signed-off-by: Jamal Hadi Salim Reviewed-by: Mina Almasry Link: https://patch.msgid.link/20241218140018.15607-1-jhs@mojatatu.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/hw/ncdevmem.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/tools/testing/selftests/drivers/net/hw/ncdevmem.c b/tools/testing/selftests/drivers/net/hw/ncdevmem.c index 8e502a1f8f9bc..19a6969643f46 100644 --- a/tools/testing/selftests/drivers/net/hw/ncdevmem.c +++ b/tools/testing/selftests/drivers/net/hw/ncdevmem.c @@ -619,9 +619,6 @@ int do_server(struct memory_buffer *mem) fprintf(stderr, "page_aligned_frags=%lu, non_page_aligned_frags=%lu\n", page_aligned_frags, non_page_aligned_frags); - fprintf(stderr, "page_aligned_frags=%lu, non_page_aligned_frags=%lu\n", - page_aligned_frags, non_page_aligned_frags); - cleanup: free(tmp_mem); From 5155cbcdbf03f207095f9a3794942a25aa7e5f58 Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Wed, 18 Dec 2024 15:33:34 +0100 Subject: [PATCH 0834/1450] af_unix: Add a prompt to CONFIG_AF_UNIX_OOB This makes it possible to disable the MSG_OOB support in .config. Signed-off-by: Florent Revest Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241218143334.1507465-1-revest@chromium.org Signed-off-by: Jakub Kicinski --- net/unix/Kconfig | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/unix/Kconfig b/net/unix/Kconfig index 8b5d04210d7cf..6f1783c1659b8 100644 --- a/net/unix/Kconfig +++ b/net/unix/Kconfig @@ -17,9 +17,11 @@ config UNIX Say Y unless you know what you are doing. config AF_UNIX_OOB - bool + bool "UNIX: out-of-bound messages" depends on UNIX default y + help + Support for MSG_OOB in UNIX domain sockets. If unsure, say Y. config UNIX_DIAG tristate "UNIX: socket monitoring interface" From 29b540795b42a3e610c0d5e9d908a8d6c1333676 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 18 Dec 2024 14:17:16 +0100 Subject: [PATCH 0835/1450] gre: Drop ip_route_output_gre(). We already have enough variants of ip_route_output*() functions. We don't need a GRE specific one in the generic route.h header file. Furthermore, ip_route_output_gre() is only used once, in ipgre_open(), where it can be easily replaced by a simple call to ip_route_output_key(). While there, and for clarity, explicitly set .flowi4_scope to RT_SCOPE_UNIVERSE instead of relying on the implicit zero initialisation. Signed-off-by: Guillaume Nault Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/ab7cba47b8558cd4bfe2dc843c38b622a95ee48e.1734527729.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- include/net/route.h | 14 -------------- net/ipv4/ip_gre.c | 17 ++++++++++------- 2 files changed, 10 insertions(+), 21 deletions(-) diff --git a/include/net/route.h b/include/net/route.h index 84cb1e04f5cd9..6947a155d5011 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -185,20 +185,6 @@ static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi return ip_route_output_flow(net, fl4, sk); } -static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4 *fl4, - __be32 daddr, __be32 saddr, - __be32 gre_key, __u8 tos, int oif) -{ - memset(fl4, 0, sizeof(*fl4)); - fl4->flowi4_oif = oif; - fl4->daddr = daddr; - fl4->saddr = saddr; - fl4->flowi4_tos = tos; - fl4->flowi4_proto = IPPROTO_GRE; - fl4->fl4_gre_key = gre_key; - return ip_route_output_key(net, fl4); -} - enum skb_drop_reason ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index f1f31ebfc7934..a020342f618d6 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -924,15 +924,18 @@ static int ipgre_open(struct net_device *dev) struct ip_tunnel *t = netdev_priv(dev); if (ipv4_is_multicast(t->parms.iph.daddr)) { - struct flowi4 fl4; + struct flowi4 fl4 = { + .flowi4_oif = t->parms.link, + .flowi4_tos = t->parms.iph.tos & INET_DSCP_MASK, + .flowi4_scope = RT_SCOPE_UNIVERSE, + .flowi4_proto = IPPROTO_GRE, + .saddr = t->parms.iph.saddr, + .daddr = t->parms.iph.daddr, + .fl4_gre_key = t->parms.o_key, + }; struct rtable *rt; - rt = ip_route_output_gre(t->net, &fl4, - t->parms.iph.daddr, - t->parms.iph.saddr, - t->parms.o_key, - t->parms.iph.tos & INET_DSCP_MASK, - t->parms.link); + rt = ip_route_output_key(t->net, &fl4); if (IS_ERR(rt)) return -EADDRNOTAVAIL; dev = rt->dst.dev; From a19d0236f466f1ce8f44a04a96c302d3023eebf4 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 18 Dec 2024 18:44:29 +0100 Subject: [PATCH 0836/1450] page_pool: add page_pool_dev_alloc_netmem() Similarly to other _dev shorthands, add one for page_pool_alloc_netmem() to allocate a netmem using the default Rx GFP flags (ATOMIC | NOWARN) to make the page -> netmem transition of drivers easier. Signed-off-by: Alexander Lobakin Link: https://patch.msgid.link/20241218174435.1445282-2-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/net/page_pool/helpers.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index 776a3008ac28c..543f54fa3020c 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -144,6 +144,15 @@ static inline netmem_ref page_pool_alloc_netmem(struct page_pool *pool, return netmem; } +static inline netmem_ref page_pool_dev_alloc_netmem(struct page_pool *pool, + unsigned int *offset, + unsigned int *size) +{ + gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; + + return page_pool_alloc_netmem(pool, offset, size, gfp); +} + static inline struct page *page_pool_alloc(struct page_pool *pool, unsigned int *offset, unsigned int *size, gfp_t gfp) From 68ddc8ae17685a8c4ac78260bde8fe4a79511aef Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 18 Dec 2024 18:44:30 +0100 Subject: [PATCH 0837/1450] xdp: add generic xdp_buff_add_frag() The code piece which would attach a frag to &xdp_buff is almost identical across the drivers supporting XDP multi-buffer on Rx. Make it a generic elegant "oneliner". Also, I see lots of drivers calculating frags_truesize as `xdp->frame_sz * nr_frags`. I can't say this is fully correct, since frags might be backed by chunks of different sizes, especially with stuff like the header split. Even page_pool_alloc() can give you two different truesizes on two subsequent requests to allocate the same buffer size. Add a field to &skb_shared_info (unionized as there's no free slot currently on x86_64) to track the "true" truesize. It can be used later when updating the skb. Reviewed-by: Maciej Fijalkowski Signed-off-by: Alexander Lobakin Link: https://patch.msgid.link/20241218174435.1445282-3-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 16 +++++-- include/net/xdp.h | 96 +++++++++++++++++++++++++++++++++++++++++- net/core/xdp.c | 11 +++++ 3 files changed, 118 insertions(+), 5 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b2509cd0b9306..bb2b751d274ac 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -608,11 +608,19 @@ struct skb_shared_info { * Warning : all fields before dataref are cleared in __alloc_skb() */ atomic_t dataref; - unsigned int xdp_frags_size; - /* Intermediate layers must ensure that destructor_arg - * remains valid until skb destructor */ - void * destructor_arg; + union { + struct { + u32 xdp_frags_size; + u32 xdp_frags_truesize; + }; + + /* + * Intermediate layers must ensure that destructor_arg + * remains valid until skb destructor. + */ + void *destructor_arg; + }; /* must be last field, see pskb_expand_head() */ skb_frag_t frags[MAX_SKB_FRAGS]; diff --git a/include/net/xdp.h b/include/net/xdp.h index d2089cfecefd5..11139c210b498 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -167,6 +167,93 @@ xdp_get_buff_len(const struct xdp_buff *xdp) return len; } +void xdp_return_frag(netmem_ref netmem, const struct xdp_buff *xdp); + +/** + * __xdp_buff_add_frag - attach frag to &xdp_buff + * @xdp: XDP buffer to attach the frag to + * @netmem: network memory containing the frag + * @offset: offset at which the frag starts + * @size: size of the frag + * @truesize: total memory size occupied by the frag + * @try_coalesce: whether to try coalescing the frags (not valid for XSk) + * + * Attach frag to the XDP buffer. If it currently has no frags attached, + * initialize the related fields, otherwise check that the frag number + * didn't reach the limit of ``MAX_SKB_FRAGS``. If possible, try coalescing + * the frag with the previous one. + * The function doesn't check/update the pfmemalloc bit. Please use the + * non-underscored wrapper in drivers. + * + * Return: true on success, false if there's no space for the frag in + * the shared info struct. + */ +static inline bool __xdp_buff_add_frag(struct xdp_buff *xdp, netmem_ref netmem, + u32 offset, u32 size, u32 truesize, + bool try_coalesce) +{ + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); + skb_frag_t *prev; + u32 nr_frags; + + if (!xdp_buff_has_frags(xdp)) { + xdp_buff_set_frags_flag(xdp); + + nr_frags = 0; + sinfo->xdp_frags_size = 0; + sinfo->xdp_frags_truesize = 0; + + goto fill; + } + + nr_frags = sinfo->nr_frags; + prev = &sinfo->frags[nr_frags - 1]; + + if (try_coalesce && netmem == skb_frag_netmem(prev) && + offset == skb_frag_off(prev) + skb_frag_size(prev)) { + skb_frag_size_add(prev, size); + /* Guaranteed to only decrement the refcount */ + xdp_return_frag(netmem, xdp); + } else if (unlikely(nr_frags == MAX_SKB_FRAGS)) { + return false; + } else { +fill: + __skb_fill_netmem_desc_noacc(sinfo, nr_frags++, netmem, + offset, size); + } + + sinfo->nr_frags = nr_frags; + sinfo->xdp_frags_size += size; + sinfo->xdp_frags_truesize += truesize; + + return true; +} + +/** + * xdp_buff_add_frag - attach frag to &xdp_buff + * @xdp: XDP buffer to attach the frag to + * @netmem: network memory containing the frag + * @offset: offset at which the frag starts + * @size: size of the frag + * @truesize: total memory size occupied by the frag + * + * Version of __xdp_buff_add_frag() which takes care of the pfmemalloc bit. + * + * Return: true on success, false if there's no space for the frag in + * the shared info struct. + */ +static inline bool xdp_buff_add_frag(struct xdp_buff *xdp, netmem_ref netmem, + u32 offset, u32 size, u32 truesize) +{ + if (!__xdp_buff_add_frag(xdp, netmem, offset, size, truesize, true)) + return false; + + if (unlikely(netmem_is_pfmemalloc(netmem))) + xdp_buff_set_frag_pfmemalloc(xdp); + + return true; +} + struct xdp_frame { void *data; u32 len; @@ -230,7 +317,14 @@ xdp_update_skb_shared_info(struct sk_buff *skb, u8 nr_frags, unsigned int size, unsigned int truesize, bool pfmemalloc) { - skb_shinfo(skb)->nr_frags = nr_frags; + struct skb_shared_info *sinfo = skb_shinfo(skb); + + sinfo->nr_frags = nr_frags; + /* + * ``destructor_arg`` is unionized with ``xdp_frags_{,true}size``, + * reset it after that these fields aren't used anymore. + */ + sinfo->destructor_arg = NULL; skb->len += size; skb->data_len += size; diff --git a/net/core/xdp.c b/net/core/xdp.c index f1165a35411bc..a66a4e036f53f 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -535,6 +535,17 @@ void xdp_return_frame_bulk(struct xdp_frame *xdpf, } EXPORT_SYMBOL_GPL(xdp_return_frame_bulk); +/** + * xdp_return_frag -- free one XDP frag or decrement its refcount + * @netmem: network memory reference to release + * @xdp: &xdp_buff to release the frag for + */ +void xdp_return_frag(netmem_ref netmem, const struct xdp_buff *xdp) +{ + __xdp_return(netmem, xdp->rxq->mem.type, true, NULL); +} +EXPORT_SYMBOL_GPL(xdp_return_frag); + void xdp_return_buff(struct xdp_buff *xdp) { struct skb_shared_info *sinfo; From 539c1fba1ac77184215d892eda0857f5687b7366 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 18 Dec 2024 18:44:31 +0100 Subject: [PATCH 0838/1450] xdp: add generic xdp_build_skb_from_buff() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The code which builds an skb from an &xdp_buff keeps multiplying itself around the drivers with almost no changes. Let's try to stop that by adding a generic function. Unlike __xdp_build_skb_from_frame(), always allocate an skbuff head using napi_build_skb() and make use of the available xdp_rxq pointer to assign the Rx queue index. In case of PP-backed buffer, mark the skb to be recycled, as every PP user's been switched to recycle skbs. Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Alexander Lobakin Link: https://patch.msgid.link/20241218174435.1445282-4-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/net/xdp.h | 1 + net/core/xdp.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/include/net/xdp.h b/include/net/xdp.h index 11139c210b498..aa24fa78cbe6e 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -336,6 +336,7 @@ xdp_update_skb_shared_info(struct sk_buff *skb, u8 nr_frags, void xdp_warn(const char *msg, const char *func, const int line); #define XDP_WARN(msg) xdp_warn(msg, __func__, __LINE__) +struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp); struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp); struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct sk_buff *skb, diff --git a/net/core/xdp.c b/net/core/xdp.c index a66a4e036f53f..704203a15a18e 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -629,6 +629,61 @@ int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp) } EXPORT_SYMBOL_GPL(xdp_alloc_skb_bulk); +/** + * xdp_build_skb_from_buff - create an skb from &xdp_buff + * @xdp: &xdp_buff to convert to an skb + * + * Perform common operations to create a new skb to pass up the stack from + * &xdp_buff: allocate an skb head from the NAPI percpu cache, initialize + * skb data pointers and offsets, set the recycle bit if the buff is + * PP-backed, Rx queue index, protocol and update frags info. + * + * Return: new &sk_buff on success, %NULL on error. + */ +struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp) +{ + const struct xdp_rxq_info *rxq = xdp->rxq; + const struct skb_shared_info *sinfo; + struct sk_buff *skb; + u32 nr_frags = 0; + int metalen; + + if (unlikely(xdp_buff_has_frags(xdp))) { + sinfo = xdp_get_shared_info_from_buff(xdp); + nr_frags = sinfo->nr_frags; + } + + skb = napi_build_skb(xdp->data_hard_start, xdp->frame_sz); + if (unlikely(!skb)) + return NULL; + + skb_reserve(skb, xdp->data - xdp->data_hard_start); + __skb_put(skb, xdp->data_end - xdp->data); + + metalen = xdp->data - xdp->data_meta; + if (metalen > 0) + skb_metadata_set(skb, metalen); + + if (rxq->mem.type == MEM_TYPE_PAGE_POOL) + skb_mark_for_recycle(skb); + + skb_record_rx_queue(skb, rxq->queue_index); + + if (unlikely(nr_frags)) { + u32 tsize; + + tsize = sinfo->xdp_frags_truesize ? : nr_frags * xdp->frame_sz; + xdp_update_skb_shared_info(skb, nr_frags, + sinfo->xdp_frags_size, tsize, + xdp_buff_is_frag_pfmemalloc(xdp)); + } + + skb->protocol = eth_type_trans(skb, rxq->dev); + + return skb; +} +EXPORT_SYMBOL_GPL(xdp_build_skb_from_buff); + struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct sk_buff *skb, struct net_device *dev) From 51205f841a495c78aa59d0e41683463dac23eb27 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 18 Dec 2024 18:44:32 +0100 Subject: [PATCH 0839/1450] xsk: make xsk_buff_add_frag() really add the frag via __xdp_buff_add_frag() Currently, xsk_buff_add_frag() only adds the frag to pool's linked list, not doing anything with the &xdp_buff. The drivers do that manually and the logic is the same. Make it really add an skb frag, just like xdp_buff_add_frag() does that, and freeing frags on error if needed. This allows to remove repeating code from i40e and ice and not add the same code again and again. Acked-by: Maciej Fijalkowski Signed-off-by: Alexander Lobakin Link: https://patch.msgid.link/20241218174435.1445282-5-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e_xsk.c | 30 ++------------------ drivers/net/ethernet/intel/ice/ice_xsk.c | 32 ++-------------------- include/net/xdp_sock_drv.h | 18 ++++++++++-- 3 files changed, 20 insertions(+), 60 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index 4e885df789ef4..e28f1905a4a0f 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -395,32 +395,6 @@ static void i40e_handle_xdp_result_zc(struct i40e_ring *rx_ring, WARN_ON_ONCE(1); } -static int -i40e_add_xsk_frag(struct i40e_ring *rx_ring, struct xdp_buff *first, - struct xdp_buff *xdp, const unsigned int size) -{ - struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(first); - - if (!xdp_buff_has_frags(first)) { - sinfo->nr_frags = 0; - sinfo->xdp_frags_size = 0; - xdp_buff_set_frags_flag(first); - } - - if (unlikely(sinfo->nr_frags == MAX_SKB_FRAGS)) { - xsk_buff_free(first); - return -ENOMEM; - } - - __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, - virt_to_page(xdp->data_hard_start), - XDP_PACKET_HEADROOM, size); - sinfo->xdp_frags_size += size; - xsk_buff_add_frag(xdp); - - return 0; -} - /** * i40e_clean_rx_irq_zc - Consumes Rx packets from the hardware ring * @rx_ring: Rx ring @@ -486,8 +460,10 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget) if (!first) first = bi; - else if (i40e_add_xsk_frag(rx_ring, first, bi, size)) + else if (!xsk_buff_add_frag(first, bi)) { + xsk_buff_free(first); break; + } if (++next_to_process == count) next_to_process = 0; diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 334ae945d6404..8975d2971bc37 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -801,35 +801,6 @@ ice_run_xdp_zc(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, return result; } -static int -ice_add_xsk_frag(struct ice_rx_ring *rx_ring, struct xdp_buff *first, - struct xdp_buff *xdp, const unsigned int size) -{ - struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(first); - - if (!size) - return 0; - - if (!xdp_buff_has_frags(first)) { - sinfo->nr_frags = 0; - sinfo->xdp_frags_size = 0; - xdp_buff_set_frags_flag(first); - } - - if (unlikely(sinfo->nr_frags == MAX_SKB_FRAGS)) { - xsk_buff_free(first); - return -ENOMEM; - } - - __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, - virt_to_page(xdp->data_hard_start), - XDP_PACKET_HEADROOM, size); - sinfo->xdp_frags_size += size; - xsk_buff_add_frag(xdp); - - return 0; -} - /** * ice_clean_rx_irq_zc - consumes packets from the hardware ring * @rx_ring: AF_XDP Rx ring @@ -895,7 +866,8 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, if (!first) { first = xdp; - } else if (ice_add_xsk_frag(rx_ring, first, xdp, size)) { + } else if (likely(size) && !xsk_buff_add_frag(first, xdp)) { + xsk_buff_free(first); break; } diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index f3175a5d28f79..86620c8189659 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -136,11 +136,21 @@ static inline void xsk_buff_free(struct xdp_buff *xdp) xp_free(xskb); } -static inline void xsk_buff_add_frag(struct xdp_buff *xdp) +static inline bool xsk_buff_add_frag(struct xdp_buff *head, + struct xdp_buff *xdp) { - struct xdp_buff_xsk *frag = container_of(xdp, struct xdp_buff_xsk, xdp); + const void *data = xdp->data; + struct xdp_buff_xsk *frag; + + if (!__xdp_buff_add_frag(head, virt_to_netmem(data), + offset_in_page(data), xdp->data_end - data, + xdp->frame_sz, false)) + return false; + frag = container_of(xdp, struct xdp_buff_xsk, xdp); list_add_tail(&frag->list_node, &frag->pool->xskb_list); + + return true; } static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first) @@ -357,8 +367,10 @@ static inline void xsk_buff_free(struct xdp_buff *xdp) { } -static inline void xsk_buff_add_frag(struct xdp_buff *xdp) +static inline bool xsk_buff_add_frag(struct xdp_buff *head, + struct xdp_buff *xdp) { + return false; } static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first) From 560d958c6c68fa62ddb4bd6f890c363598d184b0 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 18 Dec 2024 18:44:33 +0100 Subject: [PATCH 0840/1450] xsk: add generic XSk &xdp_buff -> skb conversion Same as with converting &xdp_buff to skb on Rx, the code which allocates a new skb and copies the XSk frame there is identical across the drivers, so make it generic. This includes copying all the frags if they are present in the original buff. System percpu page_pools greatly improve XDP_PASS performance on XSk: instead of page_alloc() + page_free(), the net core recycles the same pages, so the only overhead left is memcpy()s. When the Page Pool is not compiled in, the whole function is a return-NULL (but it always gets selected when eBPF is enabled). Note that the passed buff gets freed if the conversion is done w/o any error, assuming you don't need this buffer after you convert it to an skb. Reviewed-by: Maciej Fijalkowski Signed-off-by: Alexander Lobakin Link: https://patch.msgid.link/20241218174435.1445282-6-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/net/xdp.h | 1 + net/core/xdp.c | 112 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+) diff --git a/include/net/xdp.h b/include/net/xdp.h index aa24fa78cbe6e..6da0e746cf755 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -337,6 +337,7 @@ void xdp_warn(const char *msg, const char *func, const int line); #define XDP_WARN(msg) xdp_warn(msg, __func__, __LINE__) struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp); +struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp); struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp); struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct sk_buff *skb, diff --git a/net/core/xdp.c b/net/core/xdp.c index 704203a15a18e..67b53fc7191e3 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -684,6 +684,118 @@ struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp) } EXPORT_SYMBOL_GPL(xdp_build_skb_from_buff); +/** + * xdp_copy_frags_from_zc - copy frags from XSk buff to skb + * @skb: skb to copy frags to + * @xdp: XSk &xdp_buff from which the frags will be copied + * @pp: &page_pool backing page allocation, if available + * + * Copy all frags from XSk &xdp_buff to the skb to pass it up the stack. + * Allocate a new buffer for each frag, copy it and attach to the skb. + * + * Return: true on success, false on netmem allocation fail. + */ +static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb, + const struct xdp_buff *xdp, + struct page_pool *pp) +{ + struct skb_shared_info *sinfo = skb_shinfo(skb); + const struct skb_shared_info *xinfo; + u32 nr_frags, tsize = 0; + bool pfmemalloc = false; + + xinfo = xdp_get_shared_info_from_buff(xdp); + nr_frags = xinfo->nr_frags; + + for (u32 i = 0; i < nr_frags; i++) { + u32 len = skb_frag_size(&xinfo->frags[i]); + u32 offset, truesize = len; + netmem_ref netmem; + + netmem = page_pool_dev_alloc_netmem(pp, &offset, &truesize); + if (unlikely(!netmem)) { + sinfo->nr_frags = i; + return false; + } + + memcpy(__netmem_address(netmem), + __netmem_address(xinfo->frags[i].netmem), + LARGEST_ALIGN(len)); + __skb_fill_netmem_desc_noacc(sinfo, i, netmem, offset, len); + + tsize += truesize; + pfmemalloc |= netmem_is_pfmemalloc(netmem); + } + + xdp_update_skb_shared_info(skb, nr_frags, xinfo->xdp_frags_size, + tsize, pfmemalloc); + + return true; +} + +/** + * xdp_build_skb_from_zc - create an skb from XSk &xdp_buff + * @xdp: source XSk buff + * + * Similar to xdp_build_skb_from_buff(), but for XSk frames. Allocate an skb + * head, new buffer for the head, copy the data and initialize the skb fields. + * If there are frags, allocate new buffers for them and copy. + * Buffers are allocated from the system percpu pools to try recycling them. + * If new skb was built successfully, @xdp is returned to XSk pool's freelist. + * On error, it remains untouched and the caller must take care of this. + * + * Return: new &sk_buff on success, %NULL on error. + */ +struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp) +{ + struct page_pool *pp = this_cpu_read(system_page_pool); + const struct xdp_rxq_info *rxq = xdp->rxq; + u32 len = xdp->data_end - xdp->data_meta; + u32 truesize = xdp->frame_sz; + struct sk_buff *skb; + int metalen; + void *data; + + if (!IS_ENABLED(CONFIG_PAGE_POOL)) + return NULL; + + data = page_pool_dev_alloc_va(pp, &truesize); + if (unlikely(!data)) + return NULL; + + skb = napi_build_skb(data, truesize); + if (unlikely(!skb)) { + page_pool_free_va(pp, data, true); + return NULL; + } + + skb_mark_for_recycle(skb); + skb_reserve(skb, xdp->data_meta - xdp->data_hard_start); + + memcpy(__skb_put(skb, len), xdp->data_meta, LARGEST_ALIGN(len)); + + metalen = xdp->data - xdp->data_meta; + if (metalen > 0) { + skb_metadata_set(skb, metalen); + __skb_pull(skb, metalen); + } + + skb_record_rx_queue(skb, rxq->queue_index); + + if (unlikely(xdp_buff_has_frags(xdp)) && + unlikely(!xdp_copy_frags_from_zc(skb, xdp, pp))) { + napi_consume_skb(skb, true); + return NULL; + } + + xsk_buff_free(xdp); + + skb->protocol = eth_type_trans(skb, rxq->dev); + + return skb; +} +EXPORT_SYMBOL_GPL(xdp_build_skb_from_zc); + struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct sk_buff *skb, struct net_device *dev) From 1ae40d5231732275c620a1c58c83884a979b6eb1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 16 Dec 2024 10:33:40 +0100 Subject: [PATCH 0841/1450] ALSA: compress_offload: import DMA_BUF namespace The compression offload code cannot be in a loadable module unless it imports that namespace: ERROR: modpost: module snd-compress uses symbol dma_buf_get from namespace DMA_BUF, but does not import it. ERROR: modpost: module snd-compress uses symbol dma_buf_put from namespace DMA_BUF, but does not import it. ERROR: modpost: module snd-compress uses symbol dma_buf_fd from namespace DMA_BUF, but does not import it. Fixes: 04177158cf98 ("ALSA: compress_offload: introduce accel operation mode") Signed-off-by: Arnd Bergmann Acked-by: Shengjiu Wang Acked-by: Vinod Koul Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20241216093410.377112-1-arnd@kernel.org --- sound/core/compress_offload.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/core/compress_offload.c b/sound/core/compress_offload.c index 86ed2fbee0c86..ec2485c00e497 100644 --- a/sound/core/compress_offload.c +++ b/sound/core/compress_offload.c @@ -1247,6 +1247,7 @@ void snd_compr_task_finished(struct snd_compr_stream *stream, } EXPORT_SYMBOL_GPL(snd_compr_task_finished); +MODULE_IMPORT_NS("DMA_BUF"); #endif /* CONFIG_SND_COMPRESS_ACCEL */ static long snd_compr_ioctl(struct file *f, unsigned int cmd, unsigned long arg) From 6018f2fe1089b46c6c9eb136338eca7b16a92331 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 16 Dec 2024 10:33:41 +0100 Subject: [PATCH 0842/1450] ALSA: compress_offload: avoid 64-bit get_user() On some architectures, get_user() cannot read a 64-bit user variable: arm-linux-gnueabi-ld: sound/core/compress_offload.o: in function `snd_compr_ioctl': compress_offload.c:(.text.snd_compr_ioctl+0x538): undefined reference to `__get_user_bad' Use an equivalent copy_from_user() instead. Fixes: 04177158cf98 ("ALSA: compress_offload: introduce accel operation mode") Signed-off-by: Arnd Bergmann Acked-by: Shengjiu Wang Acked-by: Vinod Koul Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20241216093410.377112-2-arnd@kernel.org --- sound/core/compress_offload.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/core/compress_offload.c b/sound/core/compress_offload.c index ec2485c00e497..1d6769a66810e 100644 --- a/sound/core/compress_offload.c +++ b/sound/core/compress_offload.c @@ -1180,9 +1180,9 @@ static int snd_compr_task_seq(struct snd_compr_stream *stream, unsigned long arg if (stream->runtime->state != SNDRV_PCM_STATE_SETUP) return -EPERM; - retval = get_user(seqno, (__u64 __user *)arg); - if (retval < 0) - return retval; + retval = copy_from_user(&seqno, (__u64 __user *)arg, sizeof(seqno)); + if (retval) + return -EFAULT; retval = 0; if (seqno == 0) { list_for_each_entry_reverse(task, &stream->runtime->tasks, list) From f25a51b47c61540585a9e8a4e16f91677ebcbbc4 Mon Sep 17 00:00:00 2001 From: Jaroslav Kysela Date: Tue, 17 Dec 2024 11:07:07 +0100 Subject: [PATCH 0843/1450] ALSA: compress_offload: use safe list iteration in snd_compr_task_seq() The sequence function can call snd_compr_task_free_one(). Use list_for_each_entry_safe_reverse() to make sure that the used pointers are safe. Link: https://lore.kernel.org/linux-sound/f2769cff-6c7a-4092-a2d1-c33a5411a182@stanley.mountain/ Fixes: 04177158cf98 ("ALSA: compress_offload: introduce accel operation mode") Reported-by: Dan Carpenter Cc: Vinod Koul Signed-off-by: Jaroslav Kysela Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20241217100707.732766-1-perex@perex.cz --- sound/core/compress_offload.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/core/compress_offload.c b/sound/core/compress_offload.c index 1d6769a66810e..bdb6e307e4538 100644 --- a/sound/core/compress_offload.c +++ b/sound/core/compress_offload.c @@ -1174,7 +1174,7 @@ typedef void (*snd_compr_seq_func_t)(struct snd_compr_stream *stream, static int snd_compr_task_seq(struct snd_compr_stream *stream, unsigned long arg, snd_compr_seq_func_t fcn) { - struct snd_compr_task_runtime *task; + struct snd_compr_task_runtime *task, *temp; __u64 seqno; int retval; @@ -1185,7 +1185,7 @@ static int snd_compr_task_seq(struct snd_compr_stream *stream, unsigned long arg return -EFAULT; retval = 0; if (seqno == 0) { - list_for_each_entry_reverse(task, &stream->runtime->tasks, list) + list_for_each_entry_safe_reverse(task, temp, &stream->runtime->tasks, list) fcn(stream, task); } else { task = snd_compr_find_task(stream, seqno); From 3d3f43fab4cfb9cf245e3dbffa1736ce925bb54a Mon Sep 17 00:00:00 2001 From: Jaroslav Kysela Date: Tue, 17 Dec 2024 11:07:26 +0100 Subject: [PATCH 0844/1450] ALSA: compress_offload: improve file descriptors installation for dma-buf Avoid to use single dma_buf_fd() call for both directions. This code ensures that both file descriptors are allocated before fd_install(). Link: https://lore.kernel.org/linux-sound/6a923647-4495-4cff-a253-b73f48cfd0ea@stanley.mountain/ Fixes: 04177158cf98 ("ALSA: compress_offload: introduce accel operation mode") Reported-by: Dan Carpenter Cc: Vinod Koul Signed-off-by: Jaroslav Kysela Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20241217100726.732863-1-perex@perex.cz --- sound/core/compress_offload.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/sound/core/compress_offload.c b/sound/core/compress_offload.c index bdb6e307e4538..edf5aadf38e5a 100644 --- a/sound/core/compress_offload.c +++ b/sound/core/compress_offload.c @@ -1025,7 +1025,7 @@ static u64 snd_compr_seqno_next(struct snd_compr_stream *stream) static int snd_compr_task_new(struct snd_compr_stream *stream, struct snd_compr_task *utask) { struct snd_compr_task_runtime *task; - int retval; + int retval, fd_i, fd_o; if (stream->runtime->total_tasks >= stream->runtime->fragments) return -EBUSY; @@ -1039,16 +1039,24 @@ static int snd_compr_task_new(struct snd_compr_stream *stream, struct snd_compr_ retval = stream->ops->task_create(stream, task); if (retval < 0) goto cleanup; - utask->input_fd = dma_buf_fd(task->input, O_WRONLY|O_CLOEXEC); - if (utask->input_fd < 0) { - retval = utask->input_fd; + /* similar functionality as in dma_buf_fd(), but ensure that both + file descriptors are allocated before fd_install() */ + if (!task->input || !task->input->file || !task->output || !task->output->file) { + retval = -EINVAL; goto cleanup; } - utask->output_fd = dma_buf_fd(task->output, O_RDONLY|O_CLOEXEC); - if (utask->output_fd < 0) { - retval = utask->output_fd; + fd_i = get_unused_fd_flags(O_WRONLY|O_CLOEXEC); + if (fd_i < 0) + goto cleanup; + fd_o = get_unused_fd_flags(O_RDONLY|O_CLOEXEC); + if (fd_o < 0) { + put_unused_fd(fd_i); goto cleanup; } + fd_install(fd_i, task->input->file); + fd_install(fd_o, task->output->file); + utask->input_fd = fd_i; + utask->output_fd = fd_o; /* keep dmabuf reference until freed with task free ioctl */ dma_buf_get(utask->input_fd); dma_buf_get(utask->output_fd); From fa0308134d26dbbeb209a1581eea46df663866b6 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Thu, 19 Dec 2024 23:33:45 +0300 Subject: [PATCH 0845/1450] ALSA: memalloc: prefer dma_mapping_error() over explicit address checking With CONFIG_DMA_API_DEBUG enabled, the following warning is observed: DMA-API: snd_hda_intel 0000:03:00.1: device driver failed to check map error[device address=0x00000000ffff0000] [size=20480 bytes] [mapped as single] WARNING: CPU: 28 PID: 2255 at kernel/dma/debug.c:1036 check_unmap+0x1408/0x2430 CPU: 28 UID: 42 PID: 2255 Comm: wireplumber Tainted: G W L 6.12.0-10-133577cad6bf48e5a7848c4338124081393bfe8a+ #759 debug_dma_unmap_page+0xe9/0xf0 snd_dma_wc_free+0x85/0x130 [snd_pcm] snd_pcm_lib_free_pages+0x1e3/0x440 [snd_pcm] snd_pcm_common_ioctl+0x1c9a/0x2960 [snd_pcm] snd_pcm_ioctl+0x6a/0xc0 [snd_pcm] ... Check for returned DMA addresses using specialized dma_mapping_error() helper which is generally recommended for this purpose by Documentation/core-api/dma-api.rst. Fixes: c880a5146642 ("ALSA: memalloc: Use proper DMA mapping API for x86 WC buffer allocations") Reported-by: Mikhail Gavrilov Closes: https://lore.kernel.org/r/CABXGCsNB3RsMGvCucOy3byTEOxoc-Ys+zB_HQ=Opb_GhX1ioDA@mail.gmail.com/ Tested-by: Mikhail Gavrilov Signed-off-by: Fedor Pchelkin Link: https://patch.msgid.link/20241219203345.195898-1-pchelkin@ispras.ru Signed-off-by: Takashi Iwai --- sound/core/memalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/core/memalloc.c b/sound/core/memalloc.c index 13b71069ae187..b3853583d2ae1 100644 --- a/sound/core/memalloc.c +++ b/sound/core/memalloc.c @@ -505,7 +505,7 @@ static void *snd_dma_wc_alloc(struct snd_dma_buffer *dmab, size_t size) if (!p) return NULL; dmab->addr = dma_map_single(dmab->dev.dev, p, size, DMA_BIDIRECTIONAL); - if (dmab->addr == DMA_MAPPING_ERROR) { + if (dma_mapping_error(dmab->dev.dev, dmab->addr)) { do_free_pages(dmab->area, size, true); return NULL; } From 55853cb829dc707427c3519f6b8686682a204368 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Wed, 18 Dec 2024 10:59:31 +0800 Subject: [PATCH 0846/1450] selftests/alsa: Fix circular dependency involving global-timer The pattern rule `$(OUTPUT)/%: %.c` inadvertently included a circular dependency on the global-timer target due to its inclusion in $(TEST_GEN_PROGS_EXTENDED). This resulted in a circular dependency warning during the build process. To resolve this, the dependency on $(TEST_GEN_PROGS_EXTENDED) has been replaced with an explicit dependency on $(OUTPUT)/libatest.so. This change ensures that libatest.so is built before any other targets that require it, without creating a circular dependency. This fix addresses the following warning: make[4]: Entering directory 'tools/testing/selftests/alsa' make[4]: Circular default_modconfig/kselftest/alsa/global-timer <- default_modconfig/kselftest/alsa/global-timer dependency dropped. make[4]: Nothing to be done for 'all'. make[4]: Leaving directory 'tools/testing/selftests/alsa' Cc: Mark Brown Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: Shuah Khan Signed-off-by: Li Zhijian Link: https://patch.msgid.link/20241218025931.914164-1-lizhijian@fujitsu.com Signed-off-by: Takashi Iwai --- tools/testing/selftests/alsa/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/alsa/Makefile b/tools/testing/selftests/alsa/Makefile index 944279160fed2..8dab90ad22bb2 100644 --- a/tools/testing/selftests/alsa/Makefile +++ b/tools/testing/selftests/alsa/Makefile @@ -27,5 +27,5 @@ include ../lib.mk $(OUTPUT)/libatest.so: conf.c alsa-local.h $(CC) $(CFLAGS) -shared -fPIC $< $(LDLIBS) -o $@ -$(OUTPUT)/%: %.c $(TEST_GEN_PROGS_EXTENDED) alsa-local.h +$(OUTPUT)/%: %.c $(OUTPUT)/libatest.so alsa-local.h $(CC) $(CFLAGS) $< $(LDLIBS) -latest -o $@ From 66a0a2b0473c39ae85c44628d14e4366fdc0aa0d Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 20 Dec 2024 12:44:16 +0100 Subject: [PATCH 0847/1450] ALSA: sh: Fix wrong argument order for copy_from_iter() Fix a brown paper bag bug I introduced at converting to the standard iter helper; the arguments were wrongly passed and have to be swapped. Fixes: 9b5f8ee43e48 ("ALSA: sh: Use standard helper for buffer accesses") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202412140019.jat5Dofr-lkp@intel.com/ Link: https://patch.msgid.link/20241220114417.5898-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/sh/sh_dac_audio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/sh/sh_dac_audio.c b/sound/sh/sh_dac_audio.c index a4d07438ad64b..3f5422145c5e2 100644 --- a/sound/sh/sh_dac_audio.c +++ b/sound/sh/sh_dac_audio.c @@ -163,7 +163,7 @@ static int snd_sh_dac_pcm_copy(struct snd_pcm_substream *substream, /* channel is not used (interleaved data) */ struct snd_sh_dac *chip = snd_pcm_substream_chip(substream); - if (copy_from_iter(chip->data_buffer + pos, src, count) != count) + if (copy_from_iter(chip->data_buffer + pos, count, src) != count) return -EFAULT; chip->buffer_end = chip->data_buffer + pos + count; From 6321f5fb70d502d95de8a212a7b484c297ec9644 Mon Sep 17 00:00:00 2001 From: Joshua Washington Date: Wed, 18 Dec 2024 05:34:11 -0800 Subject: [PATCH 0848/1450] gve: clean XDP queues in gve_tx_stop_ring_gqi When stopping XDP TX rings, the XDP clean function needs to be called to clean out the entire queue, similar to what happens in the normal TX queue case. Otherwise, the FIFO won't be cleared correctly, and xsk_tx_completed won't be reported. Fixes: 75eaae158b1b ("gve: Add XDP DROP and TX support for GQI-QPL format") Cc: stable@vger.kernel.org Signed-off-by: Joshua Washington Signed-off-by: Praveen Kaligineedi Reviewed-by: Praveen Kaligineedi Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- drivers/net/ethernet/google/gve/gve_tx.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/google/gve/gve_tx.c b/drivers/net/ethernet/google/gve/gve_tx.c index e7fb7d6d283df..83ad278ec91fe 100644 --- a/drivers/net/ethernet/google/gve/gve_tx.c +++ b/drivers/net/ethernet/google/gve/gve_tx.c @@ -206,7 +206,10 @@ void gve_tx_stop_ring_gqi(struct gve_priv *priv, int idx) return; gve_remove_napi(priv, ntfy_idx); - gve_clean_tx_done(priv, tx, priv->tx_desc_cnt, false); + if (tx->q_num < priv->tx_cfg.num_queues) + gve_clean_tx_done(priv, tx, priv->tx_desc_cnt, false); + else + gve_clean_xdp_done(priv, tx, priv->tx_desc_cnt); netdev_tx_reset_queue(tx->netdev_txq); gve_tx_remove_from_block(priv, idx); } From ff7c2dea9dd1a436fc79d6273adffdcc4a7ffea3 Mon Sep 17 00:00:00 2001 From: Joshua Washington Date: Wed, 18 Dec 2024 05:34:12 -0800 Subject: [PATCH 0849/1450] gve: guard XDP xmit NDO on existence of xdp queues In GVE, dedicated XDP queues only exist when an XDP program is installed and the interface is up. As such, the NDO XDP XMIT callback should return early if either of these conditions are false. In the case of no loaded XDP program, priv->num_xdp_queues=0 which can cause a divide-by-zero error, and in the case of interface down, num_xdp_queues remains untouched to persist XDP queue count for the next interface up, but the TX pointer itself would be NULL. The XDP xmit callback also needs to synchronize with a device transitioning from open to close. This synchronization will happen via the GVE_PRIV_FLAGS_NAPI_ENABLED bit along with a synchronize_net() call, which waits for any RCU critical sections at call-time to complete. Fixes: 39a7f4aa3e4a ("gve: Add XDP REDIRECT support for GQI-QPL format") Cc: stable@vger.kernel.org Signed-off-by: Joshua Washington Signed-off-by: Praveen Kaligineedi Reviewed-by: Praveen Kaligineedi Reviewed-by: Shailend Chand Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- drivers/net/ethernet/google/gve/gve_main.c | 3 +++ drivers/net/ethernet/google/gve/gve_tx.c | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index e171ca248f9af..5d7b0cc59959c 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -1899,6 +1899,9 @@ static void gve_turndown(struct gve_priv *priv) gve_clear_napi_enabled(priv); gve_clear_report_stats(priv); + + /* Make sure that all traffic is finished processing. */ + synchronize_net(); } static void gve_turnup(struct gve_priv *priv) diff --git a/drivers/net/ethernet/google/gve/gve_tx.c b/drivers/net/ethernet/google/gve/gve_tx.c index 83ad278ec91fe..852f8c7e39d29 100644 --- a/drivers/net/ethernet/google/gve/gve_tx.c +++ b/drivers/net/ethernet/google/gve/gve_tx.c @@ -837,9 +837,12 @@ int gve_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, struct gve_tx_ring *tx; int i, err = 0, qid; - if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) + if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK) || !priv->xdp_prog) return -EINVAL; + if (!gve_get_napi_enabled(priv)) + return -ENETDOWN; + qid = gve_xdp_tx_queue_id(priv, smp_processor_id() % priv->num_xdp_queues); From 40338d7987d810fcaa95c500b1068a52b08eec9b Mon Sep 17 00:00:00 2001 From: Joshua Washington Date: Wed, 18 Dec 2024 05:34:13 -0800 Subject: [PATCH 0850/1450] gve: guard XSK operations on the existence of queues This patch predicates the enabling and disabling of XSK pools on the existence of queues. As it stands, if the interface is down, disabling or enabling XSK pools would result in a crash, as the RX queue pointer would be NULL. XSK pool registration will occur as part of the next interface up. Similarly, xsk_wakeup needs be guarded against queues disappearing while the function is executing, so a check against the GVE_PRIV_FLAGS_NAPI_ENABLED flag is added to synchronize with the disabling of the bit and the synchronize_net() in gve_turndown. Fixes: fd8e40321a12 ("gve: Add AF_XDP zero-copy support for GQI-QPL format") Cc: stable@vger.kernel.org Signed-off-by: Joshua Washington Signed-off-by: Praveen Kaligineedi Reviewed-by: Praveen Kaligineedi Reviewed-by: Shailend Chand Reviewed-by: Willem de Bruijn Reviewed-by: Larysa Zaremba Signed-off-by: David S. Miller --- drivers/net/ethernet/google/gve/gve_main.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 5d7b0cc59959c..e4e8ff4f9f806 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -1623,8 +1623,8 @@ static int gve_xsk_pool_enable(struct net_device *dev, if (err) return err; - /* If XDP prog is not installed, return */ - if (!priv->xdp_prog) + /* If XDP prog is not installed or interface is down, return. */ + if (!priv->xdp_prog || !netif_running(dev)) return 0; rx = &priv->rx[qid]; @@ -1669,21 +1669,16 @@ static int gve_xsk_pool_disable(struct net_device *dev, if (qid >= priv->rx_cfg.num_queues) return -EINVAL; - /* If XDP prog is not installed, unmap DMA and return */ - if (!priv->xdp_prog) + /* If XDP prog is not installed or interface is down, unmap DMA and + * return. + */ + if (!priv->xdp_prog || !netif_running(dev)) goto done; - tx_qid = gve_xdp_tx_queue_id(priv, qid); - if (!netif_running(dev)) { - priv->rx[qid].xsk_pool = NULL; - xdp_rxq_info_unreg(&priv->rx[qid].xsk_rxq); - priv->tx[tx_qid].xsk_pool = NULL; - goto done; - } - napi_rx = &priv->ntfy_blocks[priv->rx[qid].ntfy_id].napi; napi_disable(napi_rx); /* make sure current rx poll is done */ + tx_qid = gve_xdp_tx_queue_id(priv, qid); napi_tx = &priv->ntfy_blocks[priv->tx[tx_qid].ntfy_id].napi; napi_disable(napi_tx); /* make sure current tx poll is done */ @@ -1711,6 +1706,9 @@ static int gve_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags) struct gve_priv *priv = netdev_priv(dev); int tx_queue_id = gve_xdp_tx_queue_id(priv, queue_id); + if (!gve_get_napi_enabled(priv)) + return -ENETDOWN; + if (queue_id >= priv->rx_cfg.num_queues || !priv->xdp_prog) return -EINVAL; From ba0925c34e0fa6fe02d3d642bc02ab099ab312c7 Mon Sep 17 00:00:00 2001 From: Joshua Washington Date: Wed, 18 Dec 2024 05:34:14 -0800 Subject: [PATCH 0851/1450] gve: process XSK TX descriptors as part of RX NAPI When busy polling is enabled, xsk_sendmsg for AF_XDP zero copy marks the NAPI ID corresponding to the memory pool allocated for the socket. In GVE, this NAPI ID will never correspond to a NAPI ID of one of the dedicated XDP TX queues registered with the umem because XDP TX is not set up to share a NAPI with a corresponding RX queue. This patch moves XSK TX descriptor processing from the TX NAPI to the RX NAPI, and the gve_xsk_wakeup callback is updated to use the RX NAPI instead of the TX NAPI, accordingly. The branch on if the wakeup is for TX is removed, as the NAPI poll should be invoked whether the wakeup is for TX or for RX. Fixes: fd8e40321a12 ("gve: Add AF_XDP zero-copy support for GQI-QPL format") Cc: stable@vger.kernel.org Signed-off-by: Praveen Kaligineedi Signed-off-by: Joshua Washington Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- drivers/net/ethernet/google/gve/gve.h | 1 + drivers/net/ethernet/google/gve/gve_main.c | 8 +++++ drivers/net/ethernet/google/gve/gve_tx.c | 36 +++++++++++++--------- 3 files changed, 31 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h index dd92949bb2149..8167cc5fb0df1 100644 --- a/drivers/net/ethernet/google/gve/gve.h +++ b/drivers/net/ethernet/google/gve/gve.h @@ -1140,6 +1140,7 @@ int gve_xdp_xmit_one(struct gve_priv *priv, struct gve_tx_ring *tx, void gve_xdp_tx_flush(struct gve_priv *priv, u32 xdp_qid); bool gve_tx_poll(struct gve_notify_block *block, int budget); bool gve_xdp_poll(struct gve_notify_block *block, int budget); +int gve_xsk_tx_poll(struct gve_notify_block *block, int budget); int gve_tx_alloc_rings_gqi(struct gve_priv *priv, struct gve_tx_alloc_rings_cfg *cfg); void gve_tx_free_rings_gqi(struct gve_priv *priv, diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index e4e8ff4f9f806..5cab7b88610fd 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -333,6 +333,14 @@ int gve_napi_poll(struct napi_struct *napi, int budget) if (block->rx) { work_done = gve_rx_poll(block, budget); + + /* Poll XSK TX as part of RX NAPI. Setup re-poll based on max of + * TX and RX work done. + */ + if (priv->xdp_prog) + work_done = max_t(int, work_done, + gve_xsk_tx_poll(block, budget)); + reschedule |= work_done == budget; } diff --git a/drivers/net/ethernet/google/gve/gve_tx.c b/drivers/net/ethernet/google/gve/gve_tx.c index 852f8c7e39d29..4350ebd9c2bd9 100644 --- a/drivers/net/ethernet/google/gve/gve_tx.c +++ b/drivers/net/ethernet/google/gve/gve_tx.c @@ -981,33 +981,41 @@ static int gve_xsk_tx(struct gve_priv *priv, struct gve_tx_ring *tx, return sent; } +int gve_xsk_tx_poll(struct gve_notify_block *rx_block, int budget) +{ + struct gve_rx_ring *rx = rx_block->rx; + struct gve_priv *priv = rx->gve; + struct gve_tx_ring *tx; + int sent = 0; + + tx = &priv->tx[gve_xdp_tx_queue_id(priv, rx->q_num)]; + if (tx->xsk_pool) { + sent = gve_xsk_tx(priv, tx, budget); + + u64_stats_update_begin(&tx->statss); + tx->xdp_xsk_sent += sent; + u64_stats_update_end(&tx->statss); + if (xsk_uses_need_wakeup(tx->xsk_pool)) + xsk_set_tx_need_wakeup(tx->xsk_pool); + } + + return sent; +} + bool gve_xdp_poll(struct gve_notify_block *block, int budget) { struct gve_priv *priv = block->priv; struct gve_tx_ring *tx = block->tx; u32 nic_done; - bool repoll; u32 to_do; /* Find out how much work there is to be done */ nic_done = gve_tx_load_event_counter(priv, tx); to_do = min_t(u32, (nic_done - tx->done), budget); gve_clean_xdp_done(priv, tx, to_do); - repoll = nic_done != tx->done; - - if (tx->xsk_pool) { - int sent = gve_xsk_tx(priv, tx, budget); - - u64_stats_update_begin(&tx->statss); - tx->xdp_xsk_sent += sent; - u64_stats_update_end(&tx->statss); - repoll |= (sent == budget); - if (xsk_uses_need_wakeup(tx->xsk_pool)) - xsk_set_tx_need_wakeup(tx->xsk_pool); - } /* If we still have work we want to repoll */ - return repoll; + return nic_done != tx->done; } bool gve_tx_poll(struct gve_notify_block *block, int budget) From de63ac44a527b2c5067551dbd70d939fe151325a Mon Sep 17 00:00:00 2001 From: Joshua Washington Date: Wed, 18 Dec 2024 05:34:15 -0800 Subject: [PATCH 0852/1450] gve: fix XDP allocation path in edge cases This patch fixes a number of consistency issues in the queue allocation path related to XDP. As it stands, the number of allocated XDP queues changes in three different scenarios. 1) Adding an XDP program while the interface is up via gve_add_xdp_queues 2) Removing an XDP program while the interface is up via gve_remove_xdp_queues 3) After queues have been allocated and the old queue memory has been removed in gve_queues_start. However, the requirement for the interface to be up for gve_(add|remove)_xdp_queues to be called, in conjunction with the fact that the number of queues stored in priv isn't updated until _after_ XDP queues have been allocated in the normal queue allocation path means that if an XDP program is added while the interface is down, XDP queues won't be added until the _second_ if_up, not the first. Given the expectation that the number of XDP queues is equal to the number of RX queues, scenario (3) has another problematic implication. When changing the number of queues while an XDP program is loaded, the number of XDP queues must be updated as well, as there is logic in the driver (gve_xdp_tx_queue_id()) which relies on every RX queue having a corresponding XDP TX queue. However, the number of XDP queues stored in priv would not be updated until _after_ a close/open leading to a mismatch in the number of XDP queues reported vs the number of XDP queues which actually exist after the queue count update completes. This patch remedies these issues by doing the following: 1) The allocation config getter function is set up to retrieve the _expected_ number of XDP queues to allocate instead of relying on the value stored in `priv` which is only updated once the queues have been allocated. 2) When adjusting queues, XDP queues are adjusted to match the number of RX queues when XDP is enabled. This only works in the case when queues are live, so part (1) of the fix must still be available in the case that queues are adjusted when there is an XDP program and the interface is down. Fixes: 5f08cd3d6423 ("gve: Alloc before freeing when adjusting queues") Cc: stable@vger.kernel.org Signed-off-by: Joshua Washington Signed-off-by: Praveen Kaligineedi Reviewed-by: Praveen Kaligineedi Reviewed-by: Shailend Chand Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- drivers/net/ethernet/google/gve/gve_main.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 5cab7b88610fd..09fb7f16f73e2 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -930,11 +930,13 @@ static void gve_init_sync_stats(struct gve_priv *priv) static void gve_tx_get_curr_alloc_cfg(struct gve_priv *priv, struct gve_tx_alloc_rings_cfg *cfg) { + int num_xdp_queues = priv->xdp_prog ? priv->rx_cfg.num_queues : 0; + cfg->qcfg = &priv->tx_cfg; cfg->raw_addressing = !gve_is_qpl(priv); cfg->ring_size = priv->tx_desc_cnt; cfg->start_idx = 0; - cfg->num_rings = gve_num_tx_queues(priv); + cfg->num_rings = priv->tx_cfg.num_queues + num_xdp_queues; cfg->tx = priv->tx; } @@ -1843,6 +1845,7 @@ int gve_adjust_queues(struct gve_priv *priv, { struct gve_tx_alloc_rings_cfg tx_alloc_cfg = {0}; struct gve_rx_alloc_rings_cfg rx_alloc_cfg = {0}; + int num_xdp_queues; int err; gve_get_curr_alloc_cfgs(priv, &tx_alloc_cfg, &rx_alloc_cfg); @@ -1853,6 +1856,10 @@ int gve_adjust_queues(struct gve_priv *priv, rx_alloc_cfg.qcfg = &new_rx_config; tx_alloc_cfg.num_rings = new_tx_config.num_queues; + /* Add dedicated XDP TX queues if enabled. */ + num_xdp_queues = priv->xdp_prog ? new_rx_config.num_queues : 0; + tx_alloc_cfg.num_rings += num_xdp_queues; + if (netif_running(priv->dev)) { err = gve_adjust_config(priv, &tx_alloc_cfg, &rx_alloc_cfg); return err; From 926e862058978a8f81872845715d67ad21c30f65 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Sat, 14 Dec 2024 02:12:06 +0000 Subject: [PATCH 0853/1450] arm64/signal: Silence sparse warning storing GCSPR_EL0 We are seeing a sparse warning in gcs_restore_signal(): arch/arm64/kernel/signal.c:1054:9: sparse: sparse: cast removes address space '__user' of expression when storing the final GCSPR_EL0 value back into the register, caused by the fact that write_sysreg_s() casts the value it writes to a u64 which sparse sees as discarding the __userness of the pointer. Avoid this by treating the address as an integer, casting to a pointer only when using it to write to userspace. While we're at it also inline gcs_signal_cap_valid() into it's one user and make equivalent updates to gcs_signal_entry(). Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202412082005.OBJ0BbWs-lkp@intel.com/ Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241214-arm64-gcs-signal-sparse-v3-1-5e8d18fffc0c@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/kernel/signal.c | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 37e24f1bd2279..99ea26d400ffe 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -36,15 +36,8 @@ #include #include -#ifdef CONFIG_ARM64_GCS #define GCS_SIGNAL_CAP(addr) (((unsigned long)addr) & GCS_CAP_ADDR_MASK) -static bool gcs_signal_cap_valid(u64 addr, u64 val) -{ - return val == GCS_SIGNAL_CAP(addr); -} -#endif - /* * Do a signal return; undo the signal stack. These are aligned to 128-bit. */ @@ -1062,8 +1055,7 @@ static int restore_sigframe(struct pt_regs *regs, #ifdef CONFIG_ARM64_GCS static int gcs_restore_signal(void) { - unsigned long __user *gcspr_el0; - u64 cap; + u64 gcspr_el0, cap; int ret; if (!system_supports_gcs()) @@ -1072,7 +1064,7 @@ static int gcs_restore_signal(void) if (!(current->thread.gcs_el0_mode & PR_SHADOW_STACK_ENABLE)) return 0; - gcspr_el0 = (unsigned long __user *)read_sysreg_s(SYS_GCSPR_EL0); + gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0); /* * Ensure that any changes to the GCS done via GCS operations @@ -1087,22 +1079,23 @@ static int gcs_restore_signal(void) * then faults will be generated on GCS operations - the main * concern is to protect GCS pages. */ - ret = copy_from_user(&cap, gcspr_el0, sizeof(cap)); + ret = copy_from_user(&cap, (unsigned long __user *)gcspr_el0, + sizeof(cap)); if (ret) return -EFAULT; /* * Check that the cap is the actual GCS before replacing it. */ - if (!gcs_signal_cap_valid((u64)gcspr_el0, cap)) + if (cap != GCS_SIGNAL_CAP(gcspr_el0)) return -EINVAL; /* Invalidate the token to prevent reuse */ - put_user_gcs(0, (__user void*)gcspr_el0, &ret); + put_user_gcs(0, (unsigned long __user *)gcspr_el0, &ret); if (ret != 0) return -EFAULT; - write_sysreg_s(gcspr_el0 + 1, SYS_GCSPR_EL0); + write_sysreg_s(gcspr_el0 + 8, SYS_GCSPR_EL0); return 0; } @@ -1421,7 +1414,7 @@ static int get_sigframe(struct rt_sigframe_user_layout *user, static int gcs_signal_entry(__sigrestore_t sigtramp, struct ksignal *ksig) { - unsigned long __user *gcspr_el0; + u64 gcspr_el0; int ret = 0; if (!system_supports_gcs()) @@ -1434,18 +1427,20 @@ static int gcs_signal_entry(__sigrestore_t sigtramp, struct ksignal *ksig) * We are entering a signal handler, current register state is * active. */ - gcspr_el0 = (unsigned long __user *)read_sysreg_s(SYS_GCSPR_EL0); + gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0); /* * Push a cap and the GCS entry for the trampoline onto the GCS. */ - put_user_gcs((unsigned long)sigtramp, gcspr_el0 - 2, &ret); - put_user_gcs(GCS_SIGNAL_CAP(gcspr_el0 - 1), gcspr_el0 - 1, &ret); + put_user_gcs((unsigned long)sigtramp, + (unsigned long __user *)(gcspr_el0 - 16), &ret); + put_user_gcs(GCS_SIGNAL_CAP(gcspr_el0 - 8), + (unsigned long __user *)(gcspr_el0 - 8), &ret); if (ret != 0) return ret; - gcspr_el0 -= 2; - write_sysreg_s((unsigned long)gcspr_el0, SYS_GCSPR_EL0); + gcspr_el0 -= 16; + write_sysreg_s(gcspr_el0, SYS_GCSPR_EL0); return 0; } From 7917f01a286ce01e9c085e24468421f596ee1a0c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 20 Dec 2024 15:28:18 +1100 Subject: [PATCH 0854/1450] nfsd: restore callback functionality for NFSv4.0 A recent patch inadvertently broke callbacks for NFSv4.0. In the 4.0 case we do not expect a session to be found but still need to call setup_callback_client() which will not try to dereference it. This patch moves the check for failure to find a session into the 4.1+ branch of setup_callback_client() Fixes: 1e02c641c3a4 ("NFSD: Prevent NULL dereference in nfsd4_process_cb_update()") Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4callback.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 3877b53e429fd..c083e539e898b 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1100,7 +1100,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c args.authflavor = clp->cl_cred.cr_flavor; clp->cl_cb_ident = conn->cb_ident; } else { - if (!conn->cb_xprt) + if (!conn->cb_xprt || !ses) return -EINVAL; clp->cl_cb_session = ses; args.bc_xprt = conn->cb_xprt; @@ -1522,8 +1522,6 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) ses = c->cn_session; } spin_unlock(&clp->cl_lock); - if (!c) - return; err = setup_callback_client(clp, &conn, ses); if (err) { From aa5d2ca7c179c40669edb5e96d931bf9828dea3d Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 16 Dec 2024 08:02:52 -0800 Subject: [PATCH 0855/1450] perf/x86/intel: Fix bitmask of OCR and FRONTEND events for LNC The released OCR and FRONTEND events utilized more bits on Lunar Lake p-core. The corresponding mask in the extra_regs has to be extended to unblock the extra bits. Add a dedicated intel_lnc_extra_regs. Fixes: a932aa0e868f ("perf/x86: Add Lunar Lake and Arrow Lake support") Reported-by: Andi Kleen Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20241216160252.430858-1-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 2e1e268460500..99c590da0ae24 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -429,6 +429,16 @@ static struct event_constraint intel_lnc_event_constraints[] = { EVENT_CONSTRAINT_END }; +static struct extra_reg intel_lnc_extra_regs[] __read_mostly = { + INTEL_UEVENT_EXTRA_REG(0x012a, MSR_OFFCORE_RSP_0, 0xfffffffffffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x012b, MSR_OFFCORE_RSP_1, 0xfffffffffffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + INTEL_UEVENT_EXTRA_REG(0x02c6, MSR_PEBS_FRONTEND, 0x9, FE), + INTEL_UEVENT_EXTRA_REG(0x03c6, MSR_PEBS_FRONTEND, 0x7fff1f, FE), + INTEL_UEVENT_EXTRA_REG(0x40ad, MSR_PEBS_FRONTEND, 0xf, FE), + INTEL_UEVENT_EXTRA_REG(0x04c2, MSR_PEBS_FRONTEND, 0x8, FE), + EVENT_EXTRA_END +}; EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); @@ -6422,7 +6432,7 @@ static __always_inline void intel_pmu_init_lnc(struct pmu *pmu) intel_pmu_init_glc(pmu); hybrid(pmu, event_constraints) = intel_lnc_event_constraints; hybrid(pmu, pebs_constraints) = intel_lnc_pebs_event_constraints; - hybrid(pmu, extra_regs) = intel_rwc_extra_regs; + hybrid(pmu, extra_regs) = intel_lnc_extra_regs; } static __always_inline void intel_pmu_init_skt(struct pmu *pmu) From 54f89b3178d5448dd4457afbb98fc1ab99090a65 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 10 Dec 2024 01:20:38 +0000 Subject: [PATCH 0856/1450] tcp_bpf: Charge receive socket buffer in bpf_tcp_ingress() When bpf_tcp_ingress() is called, the skmsg is being redirected to the ingress of the destination socket. Therefore, we should charge its receive socket buffer, instead of sending socket buffer. Because sk_rmem_schedule() tests pfmemalloc of skb, we need to introduce a wrapper and call it for skmsg. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20241210012039.1669389-2-zijianzhang@bytedance.com --- include/net/sock.h | 10 ++++++++-- net/ipv4/tcp_bpf.c | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 7464e9f9f47c5..c383126f691de 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1527,7 +1527,7 @@ static inline bool sk_wmem_schedule(struct sock *sk, int size) } static inline bool -sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size) +__sk_rmem_schedule(struct sock *sk, int size, bool pfmemalloc) { int delta; @@ -1535,7 +1535,13 @@ sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size) return true; delta = size - sk->sk_forward_alloc; return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_RECV) || - skb_pfmemalloc(skb); + pfmemalloc; +} + +static inline bool +sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size) +{ + return __sk_rmem_schedule(sk, size, skb_pfmemalloc(skb)); } static inline int sk_unused_reserved_mem(const struct sock *sk) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 99cef92e6290c..b21ea634909c4 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -49,7 +49,7 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, sge = sk_msg_elem(msg, i); size = (apply && apply_bytes < sge->length) ? apply_bytes : sge->length; - if (!sk_wmem_schedule(sk, size)) { + if (!__sk_rmem_schedule(sk, size, false)) { if (!copied) ret = -ENOMEM; break; From d888b7af7c149c115dd6ac772cc11c375da3e17c Mon Sep 17 00:00:00 2001 From: Zijian Zhang Date: Tue, 10 Dec 2024 01:20:39 +0000 Subject: [PATCH 0857/1450] tcp_bpf: Add sk_rmem_alloc related logic for tcp_bpf ingress redirection When we do sk_psock_verdict_apply->sk_psock_skb_ingress, an sk_msg will be created out of the skb, and the rmem accounting of the sk_msg will be handled by the skb. For skmsgs in __SK_REDIRECT case of tcp_bpf_send_verdict, when redirecting to the ingress of a socket, although we sk_rmem_schedule and add sk_msg to the ingress_msg of sk_redir, we do not update sk_rmem_alloc. As a result, except for the global memory limit, the rmem of sk_redir is nearly unlimited. Thus, add sk_rmem_alloc related logic to limit the recv buffer. Since the function sk_msg_recvmsg and __sk_psock_purge_ingress_msg are used in these two paths. We use "msg->skb" to test whether the sk_msg is skb backed up. If it's not, we shall do the memory accounting explicitly. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Zijian Zhang Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20241210012039.1669389-3-zijianzhang@bytedance.com --- include/linux/skmsg.h | 11 ++++++++--- net/core/skmsg.c | 6 +++++- net/ipv4/tcp_bpf.c | 4 +++- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index d9b03e0746e7a..2cbe0c22a32f3 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -317,17 +317,22 @@ static inline void sock_drop(struct sock *sk, struct sk_buff *skb) kfree_skb(skb); } -static inline void sk_psock_queue_msg(struct sk_psock *psock, +static inline bool sk_psock_queue_msg(struct sk_psock *psock, struct sk_msg *msg) { + bool ret; + spin_lock_bh(&psock->ingress_lock); - if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { list_add_tail(&msg->list, &psock->ingress_msg); - else { + ret = true; + } else { sk_msg_free(psock->sk, msg); kfree(msg); + ret = false; } spin_unlock_bh(&psock->ingress_lock); + return ret; } static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index e90fbab703b2d..8ad7e6755fd64 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -445,8 +445,10 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, if (likely(!peek)) { sge->offset += copy; sge->length -= copy; - if (!msg_rx->skb) + if (!msg_rx->skb) { sk_mem_uncharge(sk, copy); + atomic_sub(copy, &sk->sk_rmem_alloc); + } msg_rx->sg.size -= copy; if (!sge->length) { @@ -772,6 +774,8 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock) list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) { list_del(&msg->list); + if (!msg->skb) + atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc); sk_msg_free(psock->sk, msg); kfree(msg); } diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index b21ea634909c4..392678ae80f4e 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -56,6 +56,7 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, } sk_mem_charge(sk, size); + atomic_add(size, &sk->sk_rmem_alloc); sk_msg_xfer(tmp, msg, i, size); copied += size; if (sge->length) @@ -74,7 +75,8 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, if (!ret) { msg->sg.start = i; - sk_psock_queue_msg(psock, tmp); + if (!sk_psock_queue_msg(psock, tmp)) + atomic_sub(copied, &sk->sk_rmem_alloc); sk_psock_data_ready(sk, psock); } else { sk_msg_free(sk, tmp); From 724c6ce38bbaeb4b3f109b0e066d6c0ecd15446c Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 19 Dec 2024 14:57:34 +0100 Subject: [PATCH 0858/1450] stddef: make __struct_group() UAPI C++-friendly For the most part of the C++ history, it couldn't have type declarations inside anonymous unions for different reasons. At the same time, __struct_group() relies on the latters, so when the @TAG argument is not empty, C++ code doesn't want to build (even under `extern "C"`): ../linux/include/uapi/linux/pkt_cls.h:25:24: error: 'struct tc_u32_sel::::tc_u32_sel_hdr,' invalid; an anonymous union may only have public non-static data members [-fpermissive] The safest way to fix this without trying to switch standards (which is impossible in UAPI anyway) etc., is to disable tag declaration for that language. This won't break anything since for now it's not buildable at all. Use a separate definition for __struct_group() when __cplusplus is defined to mitigate the error, including the version from tools/. Fixes: 50d7bd38c3aa ("stddef: Introduce struct_group() helper macro") Reported-by: Christopher Ferris Closes: https://lore.kernel.org/linux-hardening/Z1HZpe3WE5As8UAz@google.com Suggested-by: Kees Cook # __struct_group_tag() Signed-off-by: Alexander Lobakin Reviewed-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20241219135734.2130002-1-aleksander.lobakin@intel.com Signed-off-by: Kees Cook --- include/uapi/linux/stddef.h | 13 ++++++++++--- tools/include/uapi/linux/stddef.h | 15 +++++++++++---- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/include/uapi/linux/stddef.h b/include/uapi/linux/stddef.h index 58154117d9b09..a6fce46aeb37c 100644 --- a/include/uapi/linux/stddef.h +++ b/include/uapi/linux/stddef.h @@ -8,6 +8,13 @@ #define __always_inline inline #endif +/* Not all C++ standards support type declarations inside an anonymous union */ +#ifndef __cplusplus +#define __struct_group_tag(TAG) TAG +#else +#define __struct_group_tag(TAG) +#endif + /** * __struct_group() - Create a mirrored named and anonyomous struct * @@ -20,13 +27,13 @@ * and size: one anonymous and one named. The former's members can be used * normally without sub-struct naming, and the latter can be used to * reason about the start, end, and size of the group of struct members. - * The named struct can also be explicitly tagged for layer reuse, as well - * as both having struct attributes appended. + * The named struct can also be explicitly tagged for layer reuse (C only), + * as well as both having struct attributes appended. */ #define __struct_group(TAG, NAME, ATTRS, MEMBERS...) \ union { \ struct { MEMBERS } ATTRS; \ - struct TAG { MEMBERS } ATTRS NAME; \ + struct __struct_group_tag(TAG) { MEMBERS } ATTRS NAME; \ } ATTRS #ifdef __cplusplus diff --git a/tools/include/uapi/linux/stddef.h b/tools/include/uapi/linux/stddef.h index bb6ea517efb51..c53cde425406b 100644 --- a/tools/include/uapi/linux/stddef.h +++ b/tools/include/uapi/linux/stddef.h @@ -8,6 +8,13 @@ #define __always_inline __inline__ #endif +/* Not all C++ standards support type declarations inside an anonymous union */ +#ifndef __cplusplus +#define __struct_group_tag(TAG) TAG +#else +#define __struct_group_tag(TAG) +#endif + /** * __struct_group() - Create a mirrored named and anonyomous struct * @@ -20,14 +27,14 @@ * and size: one anonymous and one named. The former's members can be used * normally without sub-struct naming, and the latter can be used to * reason about the start, end, and size of the group of struct members. - * The named struct can also be explicitly tagged for layer reuse, as well - * as both having struct attributes appended. + * The named struct can also be explicitly tagged for layer reuse (C only), + * as well as both having struct attributes appended. */ #define __struct_group(TAG, NAME, ATTRS, MEMBERS...) \ union { \ struct { MEMBERS } ATTRS; \ - struct TAG { MEMBERS } ATTRS NAME; \ - } + struct __struct_group_tag(TAG) { MEMBERS } ATTRS NAME; \ + } ATTRS /** * __DECLARE_FLEX_ARRAY() - Declare a flexible array usable in a union From 46761fd52a8868a1420f75b675caf209793b8dd1 Mon Sep 17 00:00:00 2001 From: Piotr Kwapulinski Date: Thu, 5 Dec 2024 09:44:43 +0100 Subject: [PATCH 0859/1450] ixgbe: Add support for E610 FW Admin Command Interface Add low level support for Admin Command Interface (ACI). ACI is the Firmware interface used by a driver to communicate with E610 adapter. Add the following ACI features: - data structures, macros, register definitions - commands handling - events handling Co-developed-by: Stefan Wegrzyn Signed-off-by: Stefan Wegrzyn Co-developed-by: Jedrzej Jagielski Signed-off-by: Jedrzej Jagielski Reviewed-by: Michal Swiatkowski Reviewed-by: Simon Horman Tested-by: Bharath R Signed-off-by: Piotr Kwapulinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/Makefile | 4 +- .../net/ethernet/intel/ixgbe/ixgbe_common.c | 6 +- drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c | 497 ++++++++ drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h | 19 + drivers/net/ethernet/intel/ixgbe/ixgbe_type.h | 72 +- .../ethernet/intel/ixgbe/ixgbe_type_e610.h | 1066 +++++++++++++++++ 6 files changed, 1657 insertions(+), 7 deletions(-) create mode 100644 drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c create mode 100644 drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h create mode 100644 drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h diff --git a/drivers/net/ethernet/intel/ixgbe/Makefile b/drivers/net/ethernet/intel/ixgbe/Makefile index 965e5ce1b3267..b456d102655ac 100644 --- a/drivers/net/ethernet/intel/ixgbe/Makefile +++ b/drivers/net/ethernet/intel/ixgbe/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -# Copyright(c) 1999 - 2018 Intel Corporation. +# Copyright(c) 1999 - 2024 Intel Corporation. # # Makefile for the Intel(R) 10GbE PCI Express ethernet driver # @@ -9,7 +9,7 @@ obj-$(CONFIG_IXGBE) += ixgbe.o ixgbe-y := ixgbe_main.o ixgbe_common.o ixgbe_ethtool.o \ ixgbe_82599.o ixgbe_82598.o ixgbe_phy.o ixgbe_sriov.o \ ixgbe_mbx.o ixgbe_x540.o ixgbe_x550.o ixgbe_lib.o ixgbe_ptp.o \ - ixgbe_xsk.o + ixgbe_xsk.o ixgbe_e610.o ixgbe-$(CONFIG_IXGBE_DCB) += ixgbe_dcb.o ixgbe_dcb_82598.o \ ixgbe_dcb_82599.o ixgbe_dcb_nl.o diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c index 3be1bfb16498b..bfab2c0ee0aab 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c @@ -660,7 +660,11 @@ int ixgbe_get_bus_info_generic(struct ixgbe_hw *hw) hw->bus.type = ixgbe_bus_type_pci_express; /* Get the negotiated link width and speed from PCI config space */ - link_status = ixgbe_read_pci_cfg_word(hw, IXGBE_PCI_LINK_STATUS); + if (hw->mac.type == ixgbe_mac_e610) + link_status = ixgbe_read_pci_cfg_word(hw, IXGBE_PCI_LINK_STATUS_E610); + else + link_status = ixgbe_read_pci_cfg_word(hw, + IXGBE_PCI_LINK_STATUS); hw->bus.width = ixgbe_convert_bus_width(link_status); hw->bus.speed = ixgbe_convert_bus_speed(link_status); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c new file mode 100644 index 0000000000000..6a26f6b4d3d5b --- /dev/null +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c @@ -0,0 +1,497 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2024 Intel Corporation. */ + +#include "ixgbe_common.h" +#include "ixgbe_e610.h" +#include "ixgbe_type.h" +#include "ixgbe_x540.h" +#include "ixgbe_phy.h" + +/** + * ixgbe_should_retry_aci_send_cmd_execute - decide if ACI command should + * be resent + * @opcode: ACI opcode + * + * Check if ACI command should be sent again depending on the provided opcode. + * It may happen when CSR is busy during link state changes. + * + * Return: true if the sending command routine should be repeated, + * otherwise false. + */ +static bool ixgbe_should_retry_aci_send_cmd_execute(u16 opcode) +{ + switch (opcode) { + case ixgbe_aci_opc_disable_rxen: + case ixgbe_aci_opc_get_phy_caps: + case ixgbe_aci_opc_get_link_status: + case ixgbe_aci_opc_get_link_topo: + return true; + } + + return false; +} + +/** + * ixgbe_aci_send_cmd_execute - execute sending FW Admin Command to FW Admin + * Command Interface + * @hw: pointer to the HW struct + * @desc: descriptor describing the command + * @buf: buffer to use for indirect commands (NULL for direct commands) + * @buf_size: size of buffer for indirect commands (0 for direct commands) + * + * Admin Command is sent using CSR by setting descriptor and buffer in specific + * registers. + * + * Return: the exit code of the operation. + * * - 0 - success. + * * - -EIO - CSR mechanism is not enabled. + * * - -EBUSY - CSR mechanism is busy. + * * - -EINVAL - buf_size is too big or + * invalid argument buf or buf_size. + * * - -ETIME - Admin Command X command timeout. + * * - -EIO - Admin Command X invalid state of HICR register or + * Admin Command failed because of bad opcode was returned or + * Admin Command failed with error Y. + */ +static int ixgbe_aci_send_cmd_execute(struct ixgbe_hw *hw, + struct ixgbe_aci_desc *desc, + void *buf, u16 buf_size) +{ + u16 opcode, buf_tail_size = buf_size % 4; + u32 *raw_desc = (u32 *)desc; + u32 hicr, i, buf_tail = 0; + bool valid_buf = false; + + hw->aci.last_status = IXGBE_ACI_RC_OK; + + /* It's necessary to check if mechanism is enabled */ + hicr = IXGBE_READ_REG(hw, IXGBE_PF_HICR); + + if (!(hicr & IXGBE_PF_HICR_EN)) + return -EIO; + + if (hicr & IXGBE_PF_HICR_C) { + hw->aci.last_status = IXGBE_ACI_RC_EBUSY; + return -EBUSY; + } + + opcode = le16_to_cpu(desc->opcode); + + if (buf_size > IXGBE_ACI_MAX_BUFFER_SIZE) + return -EINVAL; + + if (buf) + desc->flags |= cpu_to_le16(IXGBE_ACI_FLAG_BUF); + + if (desc->flags & cpu_to_le16(IXGBE_ACI_FLAG_BUF)) { + if ((buf && !buf_size) || + (!buf && buf_size)) + return -EINVAL; + if (buf && buf_size) + valid_buf = true; + } + + if (valid_buf) { + if (buf_tail_size) + memcpy(&buf_tail, buf + buf_size - buf_tail_size, + buf_tail_size); + + if (((buf_size + 3) & ~0x3) > IXGBE_ACI_LG_BUF) + desc->flags |= cpu_to_le16(IXGBE_ACI_FLAG_LB); + + desc->datalen = cpu_to_le16(buf_size); + + if (desc->flags & cpu_to_le16(IXGBE_ACI_FLAG_RD)) { + for (i = 0; i < buf_size / 4; i++) + IXGBE_WRITE_REG(hw, IXGBE_PF_HIBA(i), ((u32 *)buf)[i]); + if (buf_tail_size) + IXGBE_WRITE_REG(hw, IXGBE_PF_HIBA(i), buf_tail); + } + } + + /* Descriptor is written to specific registers */ + for (i = 0; i < IXGBE_ACI_DESC_SIZE_IN_DWORDS; i++) + IXGBE_WRITE_REG(hw, IXGBE_PF_HIDA(i), raw_desc[i]); + + /* SW has to set PF_HICR.C bit and clear PF_HICR.SV and + * PF_HICR_EV + */ + hicr = (IXGBE_READ_REG(hw, IXGBE_PF_HICR) | IXGBE_PF_HICR_C) & + ~(IXGBE_PF_HICR_SV | IXGBE_PF_HICR_EV); + IXGBE_WRITE_REG(hw, IXGBE_PF_HICR, hicr); + +#define MAX_SLEEP_RESP_US 1000 +#define MAX_TMOUT_RESP_SYNC_US 100000000 + + /* Wait for sync Admin Command response */ + read_poll_timeout(IXGBE_READ_REG, hicr, + (hicr & IXGBE_PF_HICR_SV) || + !(hicr & IXGBE_PF_HICR_C), + MAX_SLEEP_RESP_US, MAX_TMOUT_RESP_SYNC_US, true, hw, + IXGBE_PF_HICR); + +#define MAX_TMOUT_RESP_ASYNC_US 150000000 + + /* Wait for async Admin Command response */ + read_poll_timeout(IXGBE_READ_REG, hicr, + (hicr & IXGBE_PF_HICR_EV) || + !(hicr & IXGBE_PF_HICR_C), + MAX_SLEEP_RESP_US, MAX_TMOUT_RESP_ASYNC_US, true, hw, + IXGBE_PF_HICR); + + /* Read sync Admin Command response */ + if ((hicr & IXGBE_PF_HICR_SV)) { + for (i = 0; i < IXGBE_ACI_DESC_SIZE_IN_DWORDS; i++) { + raw_desc[i] = IXGBE_READ_REG(hw, IXGBE_PF_HIDA(i)); + raw_desc[i] = raw_desc[i]; + } + } + + /* Read async Admin Command response */ + if ((hicr & IXGBE_PF_HICR_EV) && !(hicr & IXGBE_PF_HICR_C)) { + for (i = 0; i < IXGBE_ACI_DESC_SIZE_IN_DWORDS; i++) { + raw_desc[i] = IXGBE_READ_REG(hw, IXGBE_PF_HIDA_2(i)); + raw_desc[i] = raw_desc[i]; + } + } + + /* Handle timeout and invalid state of HICR register */ + if (hicr & IXGBE_PF_HICR_C) + return -ETIME; + + if (!(hicr & IXGBE_PF_HICR_SV) && !(hicr & IXGBE_PF_HICR_EV)) + return -EIO; + + /* For every command other than 0x0014 treat opcode mismatch + * as an error. Response to 0x0014 command read from HIDA_2 + * is a descriptor of an event which is expected to contain + * different opcode than the command. + */ + if (desc->opcode != cpu_to_le16(opcode) && + opcode != ixgbe_aci_opc_get_fw_event) + return -EIO; + + if (desc->retval) { + hw->aci.last_status = (enum ixgbe_aci_err) + le16_to_cpu(desc->retval); + return -EIO; + } + + /* Write a response values to a buf */ + if (valid_buf) { + for (i = 0; i < buf_size / 4; i++) + ((u32 *)buf)[i] = IXGBE_READ_REG(hw, IXGBE_PF_HIBA(i)); + if (buf_tail_size) { + buf_tail = IXGBE_READ_REG(hw, IXGBE_PF_HIBA(i)); + memcpy(buf + buf_size - buf_tail_size, &buf_tail, + buf_tail_size); + } + } + + return 0; +} + +/** + * ixgbe_aci_send_cmd - send FW Admin Command to FW Admin Command Interface + * @hw: pointer to the HW struct + * @desc: descriptor describing the command + * @buf: buffer to use for indirect commands (NULL for direct commands) + * @buf_size: size of buffer for indirect commands (0 for direct commands) + * + * Helper function to send FW Admin Commands to the FW Admin Command Interface. + * + * Retry sending the FW Admin Command multiple times to the FW ACI + * if the EBUSY Admin Command error is returned. + * + * Return: the exit code of the operation. + */ +int ixgbe_aci_send_cmd(struct ixgbe_hw *hw, struct ixgbe_aci_desc *desc, + void *buf, u16 buf_size) +{ + u16 opcode = le16_to_cpu(desc->opcode); + struct ixgbe_aci_desc desc_cpy; + enum ixgbe_aci_err last_status; + u8 idx = 0, *buf_cpy = NULL; + bool is_cmd_for_retry; + unsigned long timeout; + int err; + + is_cmd_for_retry = ixgbe_should_retry_aci_send_cmd_execute(opcode); + if (is_cmd_for_retry) { + if (buf) { + buf_cpy = kmalloc(buf_size, GFP_KERNEL); + if (!buf_cpy) + return -ENOMEM; + *buf_cpy = *(u8 *)buf; + } + desc_cpy = *desc; + } + + timeout = jiffies + msecs_to_jiffies(IXGBE_ACI_SEND_TIMEOUT_MS); + do { + mutex_lock(&hw->aci.lock); + err = ixgbe_aci_send_cmd_execute(hw, desc, buf, buf_size); + last_status = hw->aci.last_status; + mutex_unlock(&hw->aci.lock); + + if (!is_cmd_for_retry || !err || + last_status != IXGBE_ACI_RC_EBUSY) + break; + + if (buf) + memcpy(buf, buf_cpy, buf_size); + *desc = desc_cpy; + + msleep(IXGBE_ACI_SEND_DELAY_TIME_MS); + } while (++idx < IXGBE_ACI_SEND_MAX_EXECUTE && + time_before(jiffies, timeout)); + + kfree(buf_cpy); + + return err; +} + +/** + * ixgbe_aci_check_event_pending - check if there are any pending events + * @hw: pointer to the HW struct + * + * Determine if there are any pending events. + * + * Return: true if there are any currently pending events + * otherwise false. + */ +bool ixgbe_aci_check_event_pending(struct ixgbe_hw *hw) +{ + u32 ep_bit_mask = hw->bus.func ? GL_FWSTS_EP_PF1 : GL_FWSTS_EP_PF0; + u32 fwsts = IXGBE_READ_REG(hw, GL_FWSTS); + + return (fwsts & ep_bit_mask) ? true : false; +} + +/** + * ixgbe_aci_get_event - get an event from ACI + * @hw: pointer to the HW struct + * @e: event information structure + * @pending: optional flag signaling that there are more pending events + * + * Obtain an event from ACI and return its content + * through 'e' using ACI command (0x0014). + * Provide information if there are more events + * to retrieve through 'pending'. + * + * Return: the exit code of the operation. + */ +int ixgbe_aci_get_event(struct ixgbe_hw *hw, struct ixgbe_aci_event *e, + bool *pending) +{ + struct ixgbe_aci_desc desc; + int err; + + if (!e || (!e->msg_buf && e->buf_len)) + return -EINVAL; + + mutex_lock(&hw->aci.lock); + + /* Check if there are any events pending */ + if (!ixgbe_aci_check_event_pending(hw)) { + err = -ENOENT; + goto aci_get_event_exit; + } + + /* Obtain pending event */ + ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_get_fw_event); + err = ixgbe_aci_send_cmd_execute(hw, &desc, e->msg_buf, e->buf_len); + if (err) + goto aci_get_event_exit; + + /* Returned 0x0014 opcode indicates that no event was obtained */ + if (desc.opcode == cpu_to_le16(ixgbe_aci_opc_get_fw_event)) { + err = -ENOENT; + goto aci_get_event_exit; + } + + /* Determine size of event data */ + e->msg_len = min_t(u16, le16_to_cpu(desc.datalen), e->buf_len); + /* Write event descriptor to event info structure */ + memcpy(&e->desc, &desc, sizeof(e->desc)); + + /* Check if there are any further events pending */ + if (pending) + *pending = ixgbe_aci_check_event_pending(hw); + +aci_get_event_exit: + mutex_unlock(&hw->aci.lock); + + return err; +} + +/** + * ixgbe_fill_dflt_direct_cmd_desc - fill ACI descriptor with default values. + * @desc: pointer to the temp descriptor (non DMA mem) + * @opcode: the opcode can be used to decide which flags to turn off or on + * + * Helper function to fill the descriptor desc with default values + * and the provided opcode. + */ +void ixgbe_fill_dflt_direct_cmd_desc(struct ixgbe_aci_desc *desc, u16 opcode) +{ + /* Zero out the desc. */ + memset(desc, 0, sizeof(*desc)); + desc->opcode = cpu_to_le16(opcode); + desc->flags = cpu_to_le16(IXGBE_ACI_FLAG_SI); +} + +/** + * ixgbe_aci_req_res - request a common resource + * @hw: pointer to the HW struct + * @res: resource ID + * @access: access type + * @sdp_number: resource number + * @timeout: the maximum time in ms that the driver may hold the resource + * + * Requests a common resource using the ACI command (0x0008). + * Specifies the maximum time the driver may hold the resource. + * If the requested resource is currently occupied by some other driver, + * a busy return value is returned and the timeout field value indicates the + * maximum time the current owner has to free it. + * + * Return: the exit code of the operation. + */ +static int ixgbe_aci_req_res(struct ixgbe_hw *hw, enum ixgbe_aci_res_ids res, + enum ixgbe_aci_res_access_type access, + u8 sdp_number, u32 *timeout) +{ + struct ixgbe_aci_cmd_req_res *cmd_resp; + struct ixgbe_aci_desc desc; + int err; + + cmd_resp = &desc.params.res_owner; + + ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_req_res); + + cmd_resp->res_id = cpu_to_le16(res); + cmd_resp->access_type = cpu_to_le16(access); + cmd_resp->res_number = cpu_to_le32(sdp_number); + cmd_resp->timeout = cpu_to_le32(*timeout); + *timeout = 0; + + err = ixgbe_aci_send_cmd(hw, &desc, NULL, 0); + + /* If the resource is held by some other driver, the command completes + * with a busy return value and the timeout field indicates the maximum + * time the current owner of the resource has to free it. + */ + if (!err || hw->aci.last_status == IXGBE_ACI_RC_EBUSY) + *timeout = le32_to_cpu(cmd_resp->timeout); + + return err; +} + +/** + * ixgbe_aci_release_res - release a common resource using ACI + * @hw: pointer to the HW struct + * @res: resource ID + * @sdp_number: resource number + * + * Release a common resource using ACI command (0x0009). + * + * Return: the exit code of the operation. + */ +static int ixgbe_aci_release_res(struct ixgbe_hw *hw, + enum ixgbe_aci_res_ids res, u8 sdp_number) +{ + struct ixgbe_aci_cmd_req_res *cmd; + struct ixgbe_aci_desc desc; + + cmd = &desc.params.res_owner; + + ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_release_res); + + cmd->res_id = cpu_to_le16(res); + cmd->res_number = cpu_to_le32(sdp_number); + + return ixgbe_aci_send_cmd(hw, &desc, NULL, 0); +} + +/** + * ixgbe_acquire_res - acquire the ownership of a resource + * @hw: pointer to the HW structure + * @res: resource ID + * @access: access type (read or write) + * @timeout: timeout in milliseconds + * + * Make an attempt to acquire the ownership of a resource using + * the ixgbe_aci_req_res to utilize ACI. + * In case if some other driver has previously acquired the resource and + * performed any necessary updates, the -EALREADY is returned, + * and the caller does not obtain the resource and has no further work to do. + * If needed, the function will poll until the current lock owner timeouts. + * + * Return: the exit code of the operation. + */ +int ixgbe_acquire_res(struct ixgbe_hw *hw, enum ixgbe_aci_res_ids res, + enum ixgbe_aci_res_access_type access, u32 timeout) +{ +#define IXGBE_RES_POLLING_DELAY_MS 10 + u32 delay = IXGBE_RES_POLLING_DELAY_MS; + u32 res_timeout = timeout; + u32 retry_timeout; + int err; + + err = ixgbe_aci_req_res(hw, res, access, 0, &res_timeout); + + /* A return code of -EALREADY means that another driver has + * previously acquired the resource and performed any necessary updates; + * in this case the caller does not obtain the resource and has no + * further work to do. + */ + if (err == -EALREADY) + return err; + + /* If necessary, poll until the current lock owner timeouts. + * Set retry_timeout to the timeout value reported by the FW in the + * response to the "Request Resource Ownership" (0x0008) Admin Command + * as it indicates the maximum time the current owner of the resource + * is allowed to hold it. + */ + retry_timeout = res_timeout; + while (err && retry_timeout && res_timeout) { + msleep(delay); + retry_timeout = (retry_timeout > delay) ? + retry_timeout - delay : 0; + err = ixgbe_aci_req_res(hw, res, access, 0, &res_timeout); + + /* Success - lock acquired. + * -EALREADY - lock free, no work to do. + */ + if (!err || err == -EALREADY) + break; + } + + return err; +} + +/** + * ixgbe_release_res - release a common resource + * @hw: pointer to the HW structure + * @res: resource ID + * + * Release a common resource using ixgbe_aci_release_res. + */ +void ixgbe_release_res(struct ixgbe_hw *hw, enum ixgbe_aci_res_ids res) +{ + u32 total_delay = 0; + int err; + + err = ixgbe_aci_release_res(hw, res, 0); + + /* There are some rare cases when trying to release the resource + * results in an admin command timeout, so handle them correctly. + */ + while (err == -ETIME && + total_delay < IXGBE_ACI_RELEASE_RES_TIMEOUT) { + usleep_range(1000, 1500); + err = ixgbe_aci_release_res(hw, res, 0); + total_delay++; + } +} diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h new file mode 100644 index 0000000000000..18b831b6797d8 --- /dev/null +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2024 Intel Corporation. */ + +#ifndef _IXGBE_E610_H_ +#define _IXGBE_E610_H_ + +#include "ixgbe_type.h" + +int ixgbe_aci_send_cmd(struct ixgbe_hw *hw, struct ixgbe_aci_desc *desc, + void *buf, u16 buf_size); +bool ixgbe_aci_check_event_pending(struct ixgbe_hw *hw); +int ixgbe_aci_get_event(struct ixgbe_hw *hw, struct ixgbe_aci_event *e, + bool *pending); +void ixgbe_fill_dflt_direct_cmd_desc(struct ixgbe_aci_desc *desc, u16 opcode); +int ixgbe_acquire_res(struct ixgbe_hw *hw, enum ixgbe_aci_res_ids res, + enum ixgbe_aci_res_access_type access, u32 timeout); +void ixgbe_release_res(struct ixgbe_hw *hw, enum ixgbe_aci_res_ids res); + +#endif /* _IXGBE_E610_H_ */ diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h index 9baccacd02a1e..5fdf32d79d822 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright(c) 1999 - 2018 Intel Corporation. */ +/* Copyright(c) 1999 - 2024 Intel Corporation. */ #ifndef _IXGBE_TYPE_H_ #define _IXGBE_TYPE_H_ @@ -7,6 +7,7 @@ #include #include #include +#include "ixgbe_type_e610.h" /* Device IDs */ #define IXGBE_DEV_ID_82598 0x10B6 @@ -71,12 +72,19 @@ #define IXGBE_DEV_ID_X550EM_A_1G_T 0x15E4 #define IXGBE_DEV_ID_X550EM_A_1G_T_L 0x15E5 +#define IXGBE_DEV_ID_E610_BACKPLANE 0x57AE +#define IXGBE_DEV_ID_E610_SFP 0x57AF +#define IXGBE_DEV_ID_E610_10G_T 0x57B0 +#define IXGBE_DEV_ID_E610_2_5G_T 0x57B1 +#define IXGBE_DEV_ID_E610_SGMII 0x57B2 + /* VF Device IDs */ #define IXGBE_DEV_ID_82599_VF 0x10ED #define IXGBE_DEV_ID_X540_VF 0x1515 #define IXGBE_DEV_ID_X550_VF 0x1565 #define IXGBE_DEV_ID_X550EM_X_VF 0x15A8 #define IXGBE_DEV_ID_X550EM_A_VF 0x15C5 +#define IXGBE_DEV_ID_E610_VF 0x57AD #define IXGBE_CAT(r, m) IXGBE_##r##_##m @@ -1600,7 +1608,7 @@ enum { #define IXGBE_EICR_PCI 0x00040000 /* PCI Exception */ #define IXGBE_EICR_MAILBOX 0x00080000 /* VF to PF Mailbox Interrupt */ #define IXGBE_EICR_LSC 0x00100000 /* Link Status Change */ -#define IXGBE_EICR_LINKSEC 0x00200000 /* PN Threshold */ +#define IXGBE_EICR_FW_EVENT 0x00200000 /* Async FW event */ #define IXGBE_EICR_MNG 0x00400000 /* Manageability Event Interrupt */ #define IXGBE_EICR_TS 0x00800000 /* Thermal Sensor Event */ #define IXGBE_EICR_TIMESYNC 0x01000000 /* Timesync Event */ @@ -1636,6 +1644,7 @@ enum { #define IXGBE_EICS_PCI IXGBE_EICR_PCI /* PCI Exception */ #define IXGBE_EICS_MAILBOX IXGBE_EICR_MAILBOX /* VF to PF Mailbox Int */ #define IXGBE_EICS_LSC IXGBE_EICR_LSC /* Link Status Change */ +#define IXGBE_EICS_FW_EVENT IXGBE_EICR_FW_EVENT /* Async FW event */ #define IXGBE_EICS_MNG IXGBE_EICR_MNG /* MNG Event Interrupt */ #define IXGBE_EICS_TIMESYNC IXGBE_EICR_TIMESYNC /* Timesync Event */ #define IXGBE_EICS_GPI_SDP0(_hw) IXGBE_EICR_GPI_SDP0(_hw) @@ -1654,6 +1663,7 @@ enum { #define IXGBE_EIMS_PCI IXGBE_EICR_PCI /* PCI Exception */ #define IXGBE_EIMS_MAILBOX IXGBE_EICR_MAILBOX /* VF to PF Mailbox Int */ #define IXGBE_EIMS_LSC IXGBE_EICR_LSC /* Link Status Change */ +#define IXGBE_EIMS_FW_EVENT IXGBE_EICR_FW_EVENT /* Async FW event */ #define IXGBE_EIMS_MNG IXGBE_EICR_MNG /* MNG Event Interrupt */ #define IXGBE_EIMS_TS IXGBE_EICR_TS /* Thermel Sensor Event */ #define IXGBE_EIMS_TIMESYNC IXGBE_EICR_TIMESYNC /* Timesync Event */ @@ -1673,6 +1683,7 @@ enum { #define IXGBE_EIMC_PCI IXGBE_EICR_PCI /* PCI Exception */ #define IXGBE_EIMC_MAILBOX IXGBE_EICR_MAILBOX /* VF to PF Mailbox Int */ #define IXGBE_EIMC_LSC IXGBE_EICR_LSC /* Link Status Change */ +#define IXGBE_EIMC_FW_EVENT IXGBE_EICR_FW_EVENT /* Async FW event */ #define IXGBE_EIMC_MNG IXGBE_EICR_MNG /* MNG Event Interrupt */ #define IXGBE_EIMC_TIMESYNC IXGBE_EICR_TIMESYNC /* Timesync Event */ #define IXGBE_EIMC_GPI_SDP0(_hw) IXGBE_EICR_GPI_SDP0(_hw) @@ -2068,6 +2079,7 @@ enum { #define IXGBE_SAN_MAC_ADDR_PTR 0x28 #define IXGBE_DEVICE_CAPS 0x2C #define IXGBE_SERIAL_NUMBER_MAC_ADDR 0x11 +#define IXGBE_PCIE_MSIX_E610_CAPS 0xB2 #define IXGBE_PCIE_MSIX_82599_CAPS 0x72 #define IXGBE_MAX_MSIX_VECTORS_82599 0x40 #define IXGBE_PCIE_MSIX_82598_CAPS 0x62 @@ -2168,6 +2180,7 @@ enum { #define IXGBE_PCI_DEVICE_STATUS 0xAA #define IXGBE_PCI_DEVICE_STATUS_TRANSACTION_PENDING 0x0020 #define IXGBE_PCI_LINK_STATUS 0xB2 +#define IXGBE_PCI_LINK_STATUS_E610 0x82 #define IXGBE_PCI_DEVICE_CONTROL2 0xC8 #define IXGBE_PCI_LINK_WIDTH 0x3F0 #define IXGBE_PCI_LINK_WIDTH_1 0x10 @@ -2288,6 +2301,7 @@ enum { #define IXGBE_RXMTRL_V2_MGMT_MSG 0x0D00 #define IXGBE_FCTRL_SBP 0x00000002 /* Store Bad Packet */ +#define IXGBE_FCTRL_TPE 0x00000080 /* Tag Promiscuous Ena*/ #define IXGBE_FCTRL_MPE 0x00000100 /* Multicast Promiscuous Ena*/ #define IXGBE_FCTRL_UPE 0x00000200 /* Unicast Promiscuous Ena */ #define IXGBE_FCTRL_BAM 0x00000400 /* Broadcast Accept Mode */ @@ -2351,6 +2365,7 @@ enum { /* Multiple Transmit Queue Command Register */ #define IXGBE_MTQC_RT_ENA 0x1 /* DCB Enable */ #define IXGBE_MTQC_VT_ENA 0x2 /* VMDQ2 Enable */ +#define IXGBE_MTQC_NUM_TC_OR_Q 0xC /* Number of TCs or TxQs per pool */ #define IXGBE_MTQC_64Q_1PB 0x0 /* 64 queues 1 pack buffer */ #define IXGBE_MTQC_32VF 0x8 /* 4 TX Queues per pool w/32VF's */ #define IXGBE_MTQC_64VF 0x4 /* 2 TX Queues per pool w/64VF's */ @@ -2970,6 +2985,29 @@ typedef u32 ixgbe_link_speed; IXGBE_LINK_SPEED_1GB_FULL | \ IXGBE_LINK_SPEED_10GB_FULL) +/* Physical layer type */ +typedef u64 ixgbe_physical_layer; +#define IXGBE_PHYSICAL_LAYER_UNKNOWN 0 +#define IXGBE_PHYSICAL_LAYER_10GBASE_T 0x00001 +#define IXGBE_PHYSICAL_LAYER_1000BASE_T 0x00002 +#define IXGBE_PHYSICAL_LAYER_100BASE_TX 0x00004 +#define IXGBE_PHYSICAL_LAYER_SFP_PLUS_CU 0x00008 +#define IXGBE_PHYSICAL_LAYER_10GBASE_LR 0x00010 +#define IXGBE_PHYSICAL_LAYER_10GBASE_LRM 0x00020 +#define IXGBE_PHYSICAL_LAYER_10GBASE_SR 0x00040 +#define IXGBE_PHYSICAL_LAYER_10GBASE_KX4 0x00080 +#define IXGBE_PHYSICAL_LAYER_10GBASE_CX4 0x00100 +#define IXGBE_PHYSICAL_LAYER_1000BASE_KX 0x00200 +#define IXGBE_PHYSICAL_LAYER_1000BASE_BX 0x00400 +#define IXGBE_PHYSICAL_LAYER_10GBASE_KR 0x00800 +#define IXGBE_PHYSICAL_LAYER_10GBASE_XAUI 0x01000 +#define IXGBE_PHYSICAL_LAYER_SFP_ACTIVE_DA 0x02000 +#define IXGBE_PHYSICAL_LAYER_1000BASE_SX 0x04000 +#define IXGBE_PHYSICAL_LAYER_10BASE_T 0x08000 +#define IXGBE_PHYSICAL_LAYER_2500BASE_KX 0x10000 +#define IXGBE_PHYSICAL_LAYER_2500BASE_T 0x20000 +#define IXGBE_PHYSICAL_LAYER_5000BASE_T 0x40000 + /* Flow Control Data Sheet defined values * Calculation and defines taken from 802.1bb Annex O */ @@ -3145,6 +3183,8 @@ enum ixgbe_mac_type { ixgbe_mac_X550, ixgbe_mac_X550EM_x, ixgbe_mac_x550em_a, + ixgbe_mac_e610, + ixgbe_mac_e610_vf, ixgbe_num_macs }; @@ -3224,7 +3264,9 @@ enum ixgbe_media_type { ixgbe_media_type_copper, ixgbe_media_type_backplane, ixgbe_media_type_cx4, - ixgbe_media_type_virtual + ixgbe_media_type_virtual, + ixgbe_media_type_da, + ixgbe_media_type_aui, }; /* Flow Control Settings */ @@ -3233,7 +3275,8 @@ enum ixgbe_fc_mode { ixgbe_fc_rx_pause, ixgbe_fc_tx_pause, ixgbe_fc_full, - ixgbe_fc_default + ixgbe_fc_default, + ixgbe_fc_pfc, }; /* Smart Speed Settings */ @@ -3533,6 +3576,9 @@ struct ixgbe_link_operations { struct ixgbe_link_info { struct ixgbe_link_operations ops; u8 addr; + struct ixgbe_link_status link_info; + struct ixgbe_link_status link_info_old; + u8 get_link_info; }; struct ixgbe_eeprom_info { @@ -3575,6 +3621,7 @@ struct ixgbe_mac_info { u8 san_mac_rar_index; struct ixgbe_thermal_sensor_data thermal_sensor_data; bool set_lben; + u32 max_link_up_time; u8 led_link_act; }; @@ -3599,6 +3646,10 @@ struct ixgbe_phy_info { bool reset_if_overtemp; bool qsfp_shared_i2c_bus; u32 nw_mng_if_sel; + u64 phy_type_low; + u64 phy_type_high; + u16 curr_user_speed_req; + struct ixgbe_aci_cmd_set_phy_cfg_data curr_user_phy_cfg; }; struct ixgbe_mbx_stats { @@ -3643,6 +3694,19 @@ struct ixgbe_hw { bool allow_unsupported_sfp; bool wol_enabled; bool need_crosstalk_fix; + u8 api_branch; + u8 api_maj_ver; + u8 api_min_ver; + u8 api_patch; + u8 fw_branch; + u8 fw_maj_ver; + u8 fw_min_ver; + u8 fw_patch; + u32 fw_build; + struct ixgbe_aci_info aci; + struct ixgbe_flash_info flash; + struct ixgbe_hw_dev_caps dev_caps; + struct ixgbe_hw_func_caps func_caps; }; struct ixgbe_info { diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h new file mode 100644 index 0000000000000..5978cb06f732a --- /dev/null +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h @@ -0,0 +1,1066 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2024 Intel Corporation. */ + +#ifndef _IXGBE_TYPE_E610_H_ +#define _IXGBE_TYPE_E610_H_ + +#define BYTES_PER_DWORD 4 + +/* General E610 defines */ +#define IXGBE_MAX_VSI 768 + +/* Checksum and Shadow RAM pointers */ +#define E610_SR_SW_CHECKSUM_WORD 0x3F + +/* Firmware Status Register (GL_FWSTS) */ +#define GL_FWSTS 0x00083048 /* Reset Source: POR */ +#define GL_FWSTS_EP_PF0 BIT(24) +#define GL_FWSTS_EP_PF1 BIT(25) + +/* Flash Access Register */ +#define IXGBE_GLNVM_FLA 0x000B6108 /* Reset Source: POR */ +#define IXGBE_GLNVM_FLA_LOCKED_S 6 +#define IXGBE_GLNVM_FLA_LOCKED_M BIT(6) + +/* Admin Command Interface (ACI) registers */ +#define IXGBE_PF_HIDA(_i) (0x00085000 + ((_i) * 4)) +#define IXGBE_PF_HIDA_2(_i) (0x00085020 + ((_i) * 4)) +#define IXGBE_PF_HIBA(_i) (0x00084000 + ((_i) * 4)) +#define IXGBE_PF_HICR 0x00082048 + +#define IXGBE_PF_HICR_EN BIT(0) +#define IXGBE_PF_HICR_C BIT(1) +#define IXGBE_PF_HICR_SV BIT(2) +#define IXGBE_PF_HICR_EV BIT(3) + +#define IXGBE_ACI_DESC_SIZE 32 +#define IXGBE_ACI_DESC_SIZE_IN_DWORDS (IXGBE_ACI_DESC_SIZE / BYTES_PER_DWORD) + +#define IXGBE_ACI_MAX_BUFFER_SIZE 4096 /* Size in bytes */ +#define IXGBE_ACI_SEND_DELAY_TIME_MS 10 +#define IXGBE_ACI_SEND_MAX_EXECUTE 3 +#define IXGBE_ACI_SEND_TIMEOUT_MS \ + (IXGBE_ACI_SEND_MAX_EXECUTE * IXGBE_ACI_SEND_DELAY_TIME_MS) +/* [ms] timeout of waiting for sync response */ +#define IXGBE_ACI_SYNC_RESPONSE_TIMEOUT 100000 +/* [ms] timeout of waiting for async response */ +#define IXGBE_ACI_ASYNC_RESPONSE_TIMEOUT 150000 +/* [ms] timeout of waiting for resource release */ +#define IXGBE_ACI_RELEASE_RES_TIMEOUT 10000 + +/* FW defined boundary for a large buffer, 4k >= Large buffer > 512 bytes */ +#define IXGBE_ACI_LG_BUF 512 + +/* Flags sub-structure + * |0 |1 |2 |3 |4 |5 |6 |7 |8 |9 |10 |11 |12 |13 |14 |15 | + * |DD |CMP|ERR|VFE| * * RESERVED * * |LB |RD |VFC|BUF|SI |EI |FE | + */ + +#define IXGBE_ACI_FLAG_DD BIT(0) /* 0x1 */ +#define IXGBE_ACI_FLAG_CMP BIT(1) /* 0x2 */ +#define IXGBE_ACI_FLAG_ERR BIT(2) /* 0x4 */ +#define IXGBE_ACI_FLAG_VFE BIT(3) /* 0x8 */ +#define IXGBE_ACI_FLAG_LB BIT(9) /* 0x200 */ +#define IXGBE_ACI_FLAG_RD BIT(10) /* 0x400 */ +#define IXGBE_ACI_FLAG_VFC BIT(11) /* 0x800 */ +#define IXGBE_ACI_FLAG_BUF BIT(12) /* 0x1000 */ +#define IXGBE_ACI_FLAG_SI BIT(13) /* 0x2000 */ +#define IXGBE_ACI_FLAG_EI BIT(14) /* 0x4000 */ +#define IXGBE_ACI_FLAG_FE BIT(15) /* 0x8000 */ + +/* Admin Command Interface (ACI) error codes */ +enum ixgbe_aci_err { + IXGBE_ACI_RC_OK = 0, /* Success */ + IXGBE_ACI_RC_EPERM = 1, /* Operation not permitted */ + IXGBE_ACI_RC_ENOENT = 2, /* No such element */ + IXGBE_ACI_RC_ESRCH = 3, /* Bad opcode */ + IXGBE_ACI_RC_EINTR = 4, /* Operation interrupted */ + IXGBE_ACI_RC_EIO = 5, /* I/O error */ + IXGBE_ACI_RC_ENXIO = 6, /* No such resource */ + IXGBE_ACI_RC_E2BIG = 7, /* Arg too long */ + IXGBE_ACI_RC_EAGAIN = 8, /* Try again */ + IXGBE_ACI_RC_ENOMEM = 9, /* Out of memory */ + IXGBE_ACI_RC_EACCES = 10, /* Permission denied */ + IXGBE_ACI_RC_EFAULT = 11, /* Bad address */ + IXGBE_ACI_RC_EBUSY = 12, /* Device or resource busy */ + IXGBE_ACI_RC_EEXIST = 13, /* Object already exists */ + IXGBE_ACI_RC_EINVAL = 14, /* Invalid argument */ + IXGBE_ACI_RC_ENOTTY = 15, /* Not a typewriter */ + IXGBE_ACI_RC_ENOSPC = 16, /* No space left or alloc failure */ + IXGBE_ACI_RC_ENOSYS = 17, /* Function not implemented */ + IXGBE_ACI_RC_ERANGE = 18, /* Parameter out of range */ + IXGBE_ACI_RC_EFLUSHED = 19, /* Cmd flushed due to prev cmd error */ + IXGBE_ACI_RC_BAD_ADDR = 20, /* Descriptor contains a bad pointer */ + IXGBE_ACI_RC_EMODE = 21, /* Op not allowed in current dev mode */ + IXGBE_ACI_RC_EFBIG = 22, /* File too big */ + IXGBE_ACI_RC_ESBCOMP = 23, /* SB-IOSF completion unsuccessful */ + IXGBE_ACI_RC_ENOSEC = 24, /* Missing security manifest */ + IXGBE_ACI_RC_EBADSIG = 25, /* Bad RSA signature */ + IXGBE_ACI_RC_ESVN = 26, /* SVN number prohibits this package */ + IXGBE_ACI_RC_EBADMAN = 27, /* Manifest hash mismatch */ + IXGBE_ACI_RC_EBADBUF = 28, /* Buffer hash mismatches manifest */ + IXGBE_ACI_RC_EACCES_BMCU = 29, /* BMC Update in progress */ +}; + +/* Admin Command Interface (ACI) opcodes */ +enum ixgbe_aci_opc { + ixgbe_aci_opc_get_ver = 0x0001, + ixgbe_aci_opc_driver_ver = 0x0002, + ixgbe_aci_opc_get_exp_err = 0x0005, + + /* resource ownership */ + ixgbe_aci_opc_req_res = 0x0008, + ixgbe_aci_opc_release_res = 0x0009, + + /* device/function capabilities */ + ixgbe_aci_opc_list_func_caps = 0x000A, + ixgbe_aci_opc_list_dev_caps = 0x000B, + + /* safe disable of RXEN */ + ixgbe_aci_opc_disable_rxen = 0x000C, + + /* FW events */ + ixgbe_aci_opc_get_fw_event = 0x0014, + + /* PHY commands */ + ixgbe_aci_opc_get_phy_caps = 0x0600, + ixgbe_aci_opc_set_phy_cfg = 0x0601, + ixgbe_aci_opc_restart_an = 0x0605, + ixgbe_aci_opc_get_link_status = 0x0607, + ixgbe_aci_opc_set_event_mask = 0x0613, + ixgbe_aci_opc_get_link_topo = 0x06E0, + ixgbe_aci_opc_get_link_topo_pin = 0x06E1, + ixgbe_aci_opc_read_i2c = 0x06E2, + ixgbe_aci_opc_write_i2c = 0x06E3, + ixgbe_aci_opc_read_mdio = 0x06E4, + ixgbe_aci_opc_write_mdio = 0x06E5, + ixgbe_aci_opc_set_gpio_by_func = 0x06E6, + ixgbe_aci_opc_get_gpio_by_func = 0x06E7, + ixgbe_aci_opc_set_gpio = 0x06EC, + ixgbe_aci_opc_get_gpio = 0x06ED, + ixgbe_aci_opc_sff_eeprom = 0x06EE, + ixgbe_aci_opc_prog_topo_dev_nvm = 0x06F2, + ixgbe_aci_opc_read_topo_dev_nvm = 0x06F3, + + /* NVM commands */ + ixgbe_aci_opc_nvm_read = 0x0701, + ixgbe_aci_opc_nvm_erase = 0x0702, + ixgbe_aci_opc_nvm_write = 0x0703, + ixgbe_aci_opc_nvm_cfg_read = 0x0704, + ixgbe_aci_opc_nvm_cfg_write = 0x0705, + ixgbe_aci_opc_nvm_checksum = 0x0706, + ixgbe_aci_opc_nvm_write_activate = 0x0707, + ixgbe_aci_opc_nvm_sr_dump = 0x0707, + ixgbe_aci_opc_nvm_save_factory_settings = 0x0708, + ixgbe_aci_opc_nvm_update_empr = 0x0709, + ixgbe_aci_opc_nvm_pkg_data = 0x070A, + ixgbe_aci_opc_nvm_pass_component_tbl = 0x070B, + + /* Alternate Structure Commands */ + ixgbe_aci_opc_write_alt_direct = 0x0900, + ixgbe_aci_opc_write_alt_indirect = 0x0901, + ixgbe_aci_opc_read_alt_direct = 0x0902, + ixgbe_aci_opc_read_alt_indirect = 0x0903, + ixgbe_aci_opc_done_alt_write = 0x0904, + ixgbe_aci_opc_clear_port_alt_write = 0x0906, + + /* debug commands */ + ixgbe_aci_opc_debug_dump_internals = 0xFF08, + + /* SystemDiagnostic commands */ + ixgbe_aci_opc_set_health_status_config = 0xFF20, + ixgbe_aci_opc_get_supported_health_status_codes = 0xFF21, + ixgbe_aci_opc_get_health_status = 0xFF22, + ixgbe_aci_opc_clear_health_status = 0xFF23, +}; + +/* Get version (direct 0x0001) */ +struct ixgbe_aci_cmd_get_ver { + __le32 rom_ver; + __le32 fw_build; + u8 fw_branch; + u8 fw_major; + u8 fw_minor; + u8 fw_patch; + u8 api_branch; + u8 api_major; + u8 api_minor; + u8 api_patch; +}; + +#define IXGBE_DRV_VER_STR_LEN_E610 32 + +/* Send driver version (indirect 0x0002) */ +struct ixgbe_aci_cmd_driver_ver { + u8 major_ver; + u8 minor_ver; + u8 build_ver; + u8 subbuild_ver; + u8 reserved[4]; + __le32 addr_high; + __le32 addr_low; +}; + +/* Get Expanded Error Code (0x0005, direct) */ +struct ixgbe_aci_cmd_get_exp_err { + __le32 reason; +#define IXGBE_ACI_EXPANDED_ERROR_NOT_PROVIDED 0xFFFFFFFF + __le32 identifier; + u8 rsvd[8]; +}; + +/* FW update timeout definitions are in milliseconds */ +#define IXGBE_NVM_TIMEOUT 180000 + +enum ixgbe_aci_res_access_type { + IXGBE_RES_READ = 1, + IXGBE_RES_WRITE +}; + +enum ixgbe_aci_res_ids { + IXGBE_NVM_RES_ID = 1, + IXGBE_SPD_RES_ID, + IXGBE_CHANGE_LOCK_RES_ID, + IXGBE_GLOBAL_CFG_LOCK_RES_ID +}; + +/* Request resource ownership (direct 0x0008) + * Release resource ownership (direct 0x0009) + */ +struct ixgbe_aci_cmd_req_res { + __le16 res_id; + __le16 access_type; + + /* Upon successful completion, FW writes this value and driver is + * expected to release resource before timeout. This value is provided + * in milliseconds. + */ + __le32 timeout; +#define IXGBE_ACI_RES_NVM_READ_DFLT_TIMEOUT_MS 3000 +#define IXGBE_ACI_RES_NVM_WRITE_DFLT_TIMEOUT_MS 180000 +#define IXGBE_ACI_RES_CHNG_LOCK_DFLT_TIMEOUT_MS 1000 +#define IXGBE_ACI_RES_GLBL_LOCK_DFLT_TIMEOUT_MS 3000 + /* For SDP: pin ID of the SDP */ + __le32 res_number; + __le16 status; +#define IXGBE_ACI_RES_GLBL_SUCCESS 0 +#define IXGBE_ACI_RES_GLBL_IN_PROG 1 +#define IXGBE_ACI_RES_GLBL_DONE 2 + u8 reserved[2]; +}; + +/* Get function capabilities (indirect 0x000A) + * Get device capabilities (indirect 0x000B) + */ +struct ixgbe_aci_cmd_list_caps { + u8 cmd_flags; + u8 pf_index; + u8 reserved[2]; + __le32 count; + __le32 addr_high; + __le32 addr_low; +}; + +/* Device/Function buffer entry, repeated per reported capability */ +struct ixgbe_aci_cmd_list_caps_elem { + __le16 cap; +#define IXGBE_ACI_CAPS_VALID_FUNCTIONS 0x0005 +#define IXGBE_ACI_MAX_VALID_FUNCTIONS 0x8 +#define IXGBE_ACI_CAPS_SRIOV 0x0012 +#define IXGBE_ACI_CAPS_VF 0x0013 +#define IXGBE_ACI_CAPS_VMDQ 0x0014 +#define IXGBE_ACI_CAPS_VSI 0x0017 +#define IXGBE_ACI_CAPS_DCB 0x0018 +#define IXGBE_ACI_CAPS_RSS 0x0040 +#define IXGBE_ACI_CAPS_RXQS 0x0041 +#define IXGBE_ACI_CAPS_TXQS 0x0042 +#define IXGBE_ACI_CAPS_MSIX 0x0043 +#define IXGBE_ACI_CAPS_FD 0x0045 +#define IXGBE_ACI_CAPS_1588 0x0046 +#define IXGBE_ACI_CAPS_MAX_MTU 0x0047 +#define IXGBE_ACI_CAPS_NVM_VER 0x0048 +#define IXGBE_ACI_CAPS_PENDING_NVM_VER 0x0049 +#define IXGBE_ACI_CAPS_OROM_VER 0x004A +#define IXGBE_ACI_CAPS_PENDING_OROM_VER 0x004B +#define IXGBE_ACI_CAPS_PENDING_NET_VER 0x004D +#define IXGBE_ACI_CAPS_INLINE_IPSEC 0x0070 +#define IXGBE_ACI_CAPS_NUM_ENABLED_PORTS 0x0072 +#define IXGBE_ACI_CAPS_PCIE_RESET_AVOIDANCE 0x0076 +#define IXGBE_ACI_CAPS_POST_UPDATE_RESET_RESTRICT 0x0077 +#define IXGBE_ACI_CAPS_NVM_MGMT 0x0080 +#define IXGBE_ACI_CAPS_EXT_TOPO_DEV_IMG0 0x0081 +#define IXGBE_ACI_CAPS_EXT_TOPO_DEV_IMG1 0x0082 +#define IXGBE_ACI_CAPS_EXT_TOPO_DEV_IMG2 0x0083 +#define IXGBE_ACI_CAPS_EXT_TOPO_DEV_IMG3 0x0084 + u8 major_ver; + u8 minor_ver; + /* Number of resources described by this capability */ + __le32 number; + /* Only meaningful for some types of resources */ + __le32 logical_id; + /* Only meaningful for some types of resources */ + __le32 phys_id; + __le64 rsvd1; + __le64 rsvd2; +}; + +/* Disable RXEN (direct 0x000C) */ +struct ixgbe_aci_cmd_disable_rxen { + u8 lport_num; + u8 reserved[15]; +}; + +/* Get PHY capabilities (indirect 0x0600) */ +struct ixgbe_aci_cmd_get_phy_caps { + u8 lport_num; + u8 reserved; + __le16 param0; + /* 18.0 - Report qualified modules */ +#define IXGBE_ACI_GET_PHY_RQM BIT(0) + /* 18.1 - 18.3 : Report mode + * 000b - Report topology capabilities, without media + * 001b - Report topology capabilities, with media + * 010b - Report Active configuration + * 011b - Report PHY Type and FEC mode capabilities + * 100b - Report Default capabilities + */ +#define IXGBE_ACI_REPORT_MODE_M GENMASK(3, 1) +#define IXGBE_ACI_REPORT_TOPO_CAP_NO_MEDIA 0 +#define IXGBE_ACI_REPORT_TOPO_CAP_MEDIA BIT(1) +#define IXGBE_ACI_REPORT_ACTIVE_CFG BIT(2) +#define IXGBE_ACI_REPORT_DFLT_CFG BIT(3) + __le32 reserved1; + __le32 addr_high; + __le32 addr_low; +}; + +/* This is #define of PHY type (Extended): + * The first set of defines is for phy_type_low. + */ +#define IXGBE_PHY_TYPE_LOW_100BASE_TX BIT_ULL(0) +#define IXGBE_PHY_TYPE_LOW_100M_SGMII BIT_ULL(1) +#define IXGBE_PHY_TYPE_LOW_1000BASE_T BIT_ULL(2) +#define IXGBE_PHY_TYPE_LOW_1000BASE_SX BIT_ULL(3) +#define IXGBE_PHY_TYPE_LOW_1000BASE_LX BIT_ULL(4) +#define IXGBE_PHY_TYPE_LOW_1000BASE_KX BIT_ULL(5) +#define IXGBE_PHY_TYPE_LOW_1G_SGMII BIT_ULL(6) +#define IXGBE_PHY_TYPE_LOW_2500BASE_T BIT_ULL(7) +#define IXGBE_PHY_TYPE_LOW_2500BASE_X BIT_ULL(8) +#define IXGBE_PHY_TYPE_LOW_2500BASE_KX BIT_ULL(9) +#define IXGBE_PHY_TYPE_LOW_5GBASE_T BIT_ULL(10) +#define IXGBE_PHY_TYPE_LOW_5GBASE_KR BIT_ULL(11) +#define IXGBE_PHY_TYPE_LOW_10GBASE_T BIT_ULL(12) +#define IXGBE_PHY_TYPE_LOW_10G_SFI_DA BIT_ULL(13) +#define IXGBE_PHY_TYPE_LOW_10GBASE_SR BIT_ULL(14) +#define IXGBE_PHY_TYPE_LOW_10GBASE_LR BIT_ULL(15) +#define IXGBE_PHY_TYPE_LOW_10GBASE_KR_CR1 BIT_ULL(16) +#define IXGBE_PHY_TYPE_LOW_10G_SFI_AOC_ACC BIT_ULL(17) +#define IXGBE_PHY_TYPE_LOW_10G_SFI_C2C BIT_ULL(18) +#define IXGBE_PHY_TYPE_LOW_25GBASE_T BIT_ULL(19) +#define IXGBE_PHY_TYPE_LOW_25GBASE_CR BIT_ULL(20) +#define IXGBE_PHY_TYPE_LOW_25GBASE_CR_S BIT_ULL(21) +#define IXGBE_PHY_TYPE_LOW_25GBASE_CR1 BIT_ULL(22) +#define IXGBE_PHY_TYPE_LOW_25GBASE_SR BIT_ULL(23) +#define IXGBE_PHY_TYPE_LOW_25GBASE_LR BIT_ULL(24) +#define IXGBE_PHY_TYPE_LOW_25GBASE_KR BIT_ULL(25) +#define IXGBE_PHY_TYPE_LOW_25GBASE_KR_S BIT_ULL(26) +#define IXGBE_PHY_TYPE_LOW_25GBASE_KR1 BIT_ULL(27) +#define IXGBE_PHY_TYPE_LOW_25G_AUI_AOC_ACC BIT_ULL(28) +#define IXGBE_PHY_TYPE_LOW_25G_AUI_C2C BIT_ULL(29) +#define IXGBE_PHY_TYPE_LOW_MAX_INDEX 29 +/* The second set of defines is for phy_type_high. */ +#define IXGBE_PHY_TYPE_HIGH_10BASE_T BIT_ULL(1) +#define IXGBE_PHY_TYPE_HIGH_10M_SGMII BIT_ULL(2) +#define IXGBE_PHY_TYPE_HIGH_2500M_SGMII BIT_ULL(56) +#define IXGBE_PHY_TYPE_HIGH_100M_USXGMII BIT_ULL(57) +#define IXGBE_PHY_TYPE_HIGH_1G_USXGMII BIT_ULL(58) +#define IXGBE_PHY_TYPE_HIGH_2500M_USXGMII BIT_ULL(59) +#define IXGBE_PHY_TYPE_HIGH_5G_USXGMII BIT_ULL(60) +#define IXGBE_PHY_TYPE_HIGH_10G_USXGMII BIT_ULL(61) +#define IXGBE_PHY_TYPE_HIGH_MAX_INDEX 61 + +struct ixgbe_aci_cmd_get_phy_caps_data { + __le64 phy_type_low; /* Use values from IXGBE_PHY_TYPE_LOW_* */ + __le64 phy_type_high; /* Use values from IXGBE_PHY_TYPE_HIGH_* */ + u8 caps; +#define IXGBE_ACI_PHY_EN_TX_LINK_PAUSE BIT(0) +#define IXGBE_ACI_PHY_EN_RX_LINK_PAUSE BIT(1) +#define IXGBE_ACI_PHY_LOW_POWER_MODE BIT(2) +#define IXGBE_ACI_PHY_EN_LINK BIT(3) +#define IXGBE_ACI_PHY_AN_MODE BIT(4) +#define IXGBE_ACI_PHY_EN_MOD_QUAL BIT(5) +#define IXGBE_ACI_PHY_EN_LESM BIT(6) +#define IXGBE_ACI_PHY_EN_AUTO_FEC BIT(7) +#define IXGBE_ACI_PHY_CAPS_MASK GENMASK(7, 0) + u8 low_power_ctrl_an; +#define IXGBE_ACI_PHY_EN_D3COLD_LOW_POWER_AUTONEG BIT(0) +#define IXGBE_ACI_PHY_AN_EN_CLAUSE28 BIT(1) +#define IXGBE_ACI_PHY_AN_EN_CLAUSE73 BIT(2) +#define IXGBE_ACI_PHY_AN_EN_CLAUSE37 BIT(3) + __le16 eee_cap; +#define IXGBE_ACI_PHY_EEE_EN_100BASE_TX BIT(0) +#define IXGBE_ACI_PHY_EEE_EN_1000BASE_T BIT(1) +#define IXGBE_ACI_PHY_EEE_EN_10GBASE_T BIT(2) +#define IXGBE_ACI_PHY_EEE_EN_1000BASE_KX BIT(3) +#define IXGBE_ACI_PHY_EEE_EN_10GBASE_KR BIT(4) +#define IXGBE_ACI_PHY_EEE_EN_25GBASE_KR BIT(5) +#define IXGBE_ACI_PHY_EEE_EN_10BASE_T BIT(11) + __le16 eeer_value; + u8 phy_id_oui[4]; /* PHY/Module ID connected on the port */ + u8 phy_fw_ver[8]; + u8 link_fec_options; +#define IXGBE_ACI_PHY_FEC_10G_KR_40G_KR4_EN BIT(0) +#define IXGBE_ACI_PHY_FEC_10G_KR_40G_KR4_REQ BIT(1) +#define IXGBE_ACI_PHY_FEC_25G_RS_528_REQ BIT(2) +#define IXGBE_ACI_PHY_FEC_25G_KR_REQ BIT(3) +#define IXGBE_ACI_PHY_FEC_25G_RS_544_REQ BIT(4) +#define IXGBE_ACI_PHY_FEC_25G_RS_CLAUSE91_EN BIT(6) +#define IXGBE_ACI_PHY_FEC_25G_KR_CLAUSE74_EN BIT(7) +#define IXGBE_ACI_PHY_FEC_MASK 0xdf + u8 module_compliance_enforcement; +#define IXGBE_ACI_MOD_ENFORCE_STRICT_MODE BIT(0) + u8 extended_compliance_code; +#define IXGBE_ACI_MODULE_TYPE_TOTAL_BYTE 3 + u8 module_type[IXGBE_ACI_MODULE_TYPE_TOTAL_BYTE]; +#define IXGBE_ACI_MOD_TYPE_BYTE0_SFP_PLUS 0xA0 +#define IXGBE_ACI_MOD_TYPE_BYTE0_QSFP_PLUS 0x80 +#define IXGBE_ACI_MOD_TYPE_IDENT 1 +#define IXGBE_ACI_MOD_TYPE_BYTE1_SFP_PLUS_CU_PASSIVE BIT(0) +#define IXGBE_ACI_MOD_TYPE_BYTE1_SFP_PLUS_CU_ACTIVE BIT(1) +#define IXGBE_ACI_MOD_TYPE_BYTE1_10G_BASE_SR BIT(4) +#define IXGBE_ACI_MOD_TYPE_BYTE1_10G_BASE_LR BIT(5) +#define IXGBE_ACI_MOD_TYPE_BYTE1_10G_BASE_LRM BIT(6) +#define IXGBE_ACI_MOD_TYPE_BYTE1_10G_BASE_ER BIT(7) +#define IXGBE_ACI_MOD_TYPE_BYTE2_SFP_PLUS 0xA0 +#define IXGBE_ACI_MOD_TYPE_BYTE2_QSFP_PLUS 0x86 + u8 qualified_module_count; + u8 rsvd2[7]; /* Bytes 47:41 reserved */ +#define IXGBE_ACI_QUAL_MOD_COUNT_MAX 16 + struct { + u8 v_oui[3]; + u8 rsvd3; + u8 v_part[16]; + __le32 v_rev; + __le64 rsvd4; + } qual_modules[IXGBE_ACI_QUAL_MOD_COUNT_MAX]; +}; + +/* Set PHY capabilities (direct 0x0601) + * NOTE: This command must be followed by setup link and restart auto-neg + */ +struct ixgbe_aci_cmd_set_phy_cfg { + u8 lport_num; + u8 reserved[7]; + __le32 addr_high; + __le32 addr_low; +}; + +/* Set PHY config command data structure */ +struct ixgbe_aci_cmd_set_phy_cfg_data { + __le64 phy_type_low; /* Use values from IXGBE_PHY_TYPE_LOW_* */ + __le64 phy_type_high; /* Use values from IXGBE_PHY_TYPE_HIGH_* */ + u8 caps; +#define IXGBE_ACI_PHY_ENA_VALID_MASK 0xef +#define IXGBE_ACI_PHY_ENA_TX_PAUSE_ABILITY BIT(0) +#define IXGBE_ACI_PHY_ENA_RX_PAUSE_ABILITY BIT(1) +#define IXGBE_ACI_PHY_ENA_LOW_POWER BIT(2) +#define IXGBE_ACI_PHY_ENA_LINK BIT(3) +#define IXGBE_ACI_PHY_ENA_AUTO_LINK_UPDT BIT(5) +#define IXGBE_ACI_PHY_ENA_LESM BIT(6) +#define IXGBE_ACI_PHY_ENA_AUTO_FEC BIT(7) + u8 low_power_ctrl_an; + __le16 eee_cap; /* Value from ixgbe_aci_get_phy_caps */ + __le16 eeer_value; /* Use defines from ixgbe_aci_get_phy_caps */ + u8 link_fec_opt; /* Use defines from ixgbe_aci_get_phy_caps */ + u8 module_compliance_enforcement; +}; + +/* Restart AN command data structure (direct 0x0605) + * Also used for response, with only the lport_num field present. + */ +struct ixgbe_aci_cmd_restart_an { + u8 lport_num; + u8 reserved; + u8 cmd_flags; +#define IXGBE_ACI_RESTART_AN_LINK_RESTART BIT(1) +#define IXGBE_ACI_RESTART_AN_LINK_ENABLE BIT(2) + u8 reserved2[13]; +}; + +/* Get link status (indirect 0x0607), also used for Link Status Event */ +struct ixgbe_aci_cmd_get_link_status { + u8 lport_num; + u8 reserved; + __le16 cmd_flags; +#define IXGBE_ACI_LSE_M GENMASK(1, 0) +#define IXGBE_ACI_LSE_NOP 0x0 +#define IXGBE_ACI_LSE_DIS 0x2 +#define IXGBE_ACI_LSE_ENA 0x3 + /* only response uses this flag */ +#define IXGBE_ACI_LSE_IS_ENABLED 0x1 + __le32 reserved2; + __le32 addr_high; + __le32 addr_low; +}; + +/* Get link status response data structure, also used for Link Status Event */ +struct ixgbe_aci_cmd_get_link_status_data { + u8 topo_media_conflict; +#define IXGBE_ACI_LINK_TOPO_CONFLICT BIT(0) +#define IXGBE_ACI_LINK_MEDIA_CONFLICT BIT(1) +#define IXGBE_ACI_LINK_TOPO_CORRUPT BIT(2) +#define IXGBE_ACI_LINK_TOPO_UNREACH_PRT BIT(4) +#define IXGBE_ACI_LINK_TOPO_UNDRUTIL_PRT BIT(5) +#define IXGBE_ACI_LINK_TOPO_UNDRUTIL_MEDIA BIT(6) +#define IXGBE_ACI_LINK_TOPO_UNSUPP_MEDIA BIT(7) + u8 link_cfg_err; +#define IXGBE_ACI_LINK_CFG_ERR BIT(0) +#define IXGBE_ACI_LINK_CFG_COMPLETED BIT(1) +#define IXGBE_ACI_LINK_ACT_PORT_OPT_INVAL BIT(2) +#define IXGBE_ACI_LINK_FEAT_ID_OR_CONFIG_ID_INVAL BIT(3) +#define IXGBE_ACI_LINK_TOPO_CRITICAL_SDP_ERR BIT(4) +#define IXGBE_ACI_LINK_MODULE_POWER_UNSUPPORTED BIT(5) +#define IXGBE_ACI_LINK_EXTERNAL_PHY_LOAD_FAILURE BIT(6) +#define IXGBE_ACI_LINK_INVAL_MAX_POWER_LIMIT BIT(7) + u8 link_info; +#define IXGBE_ACI_LINK_UP BIT(0) /* Link Status */ +#define IXGBE_ACI_LINK_FAULT BIT(1) +#define IXGBE_ACI_LINK_FAULT_TX BIT(2) +#define IXGBE_ACI_LINK_FAULT_RX BIT(3) +#define IXGBE_ACI_LINK_FAULT_REMOTE BIT(4) +#define IXGBE_ACI_LINK_UP_PORT BIT(5) /* External Port Link Status */ +#define IXGBE_ACI_MEDIA_AVAILABLE BIT(6) +#define IXGBE_ACI_SIGNAL_DETECT BIT(7) + u8 an_info; +#define IXGBE_ACI_AN_COMPLETED BIT(0) +#define IXGBE_ACI_LP_AN_ABILITY BIT(1) +#define IXGBE_ACI_PD_FAULT BIT(2) /* Parallel Detection Fault */ +#define IXGBE_ACI_FEC_EN BIT(3) +#define IXGBE_ACI_PHY_LOW_POWER BIT(4) /* Low Power State */ +#define IXGBE_ACI_LINK_PAUSE_TX BIT(5) +#define IXGBE_ACI_LINK_PAUSE_RX BIT(6) +#define IXGBE_ACI_QUALIFIED_MODULE BIT(7) + u8 ext_info; +#define IXGBE_ACI_LINK_PHY_TEMP_ALARM BIT(0) +#define IXGBE_ACI_LINK_EXCESSIVE_ERRORS BIT(1) /* Excessive Link Errors */ + /* Port Tx Suspended */ +#define IXGBE_ACI_LINK_TX_ACTIVE 0 +#define IXGBE_ACI_LINK_TX_DRAINED 1 +#define IXGBE_ACI_LINK_TX_FLUSHED 3 + u8 lb_status; +#define IXGBE_ACI_LINK_LB_PHY_LCL BIT(0) +#define IXGBE_ACI_LINK_LB_PHY_RMT BIT(1) +#define IXGBE_ACI_LINK_LB_MAC_LCL BIT(2) + __le16 max_frame_size; + u8 cfg; +#define IXGBE_ACI_LINK_25G_KR_FEC_EN BIT(0) +#define IXGBE_ACI_LINK_25G_RS_528_FEC_EN BIT(1) +#define IXGBE_ACI_LINK_25G_RS_544_FEC_EN BIT(2) +#define IXGBE_ACI_FEC_MASK GENMASK(2, 0) + /* Pacing Config */ +#define IXGBE_ACI_CFG_PACING_M GENMASK(6, 3) +#define IXGBE_ACI_CFG_PACING_TYPE_M BIT(7) +#define IXGBE_ACI_CFG_PACING_TYPE_AVG 0 +#define IXGBE_ACI_CFG_PACING_TYPE_FIXED IXGBE_ACI_CFG_PACING_TYPE_M + /* External Device Power Ability */ + u8 power_desc; +#define IXGBE_ACI_PWR_CLASS_M GENMASK(5, 0) +#define IXGBE_ACI_LINK_PWR_BASET_LOW_HIGH 0 +#define IXGBE_ACI_LINK_PWR_BASET_HIGH 1 +#define IXGBE_ACI_LINK_PWR_QSFP_CLASS_1 0 +#define IXGBE_ACI_LINK_PWR_QSFP_CLASS_2 1 +#define IXGBE_ACI_LINK_PWR_QSFP_CLASS_3 2 +#define IXGBE_ACI_LINK_PWR_QSFP_CLASS_4 3 + __le16 link_speed; +#define IXGBE_ACI_LINK_SPEED_M GENMASK(10, 0) +#define IXGBE_ACI_LINK_SPEED_10MB BIT(0) +#define IXGBE_ACI_LINK_SPEED_100MB BIT(1) +#define IXGBE_ACI_LINK_SPEED_1000MB BIT(2) +#define IXGBE_ACI_LINK_SPEED_2500MB BIT(3) +#define IXGBE_ACI_LINK_SPEED_5GB BIT(4) +#define IXGBE_ACI_LINK_SPEED_10GB BIT(5) +#define IXGBE_ACI_LINK_SPEED_20GB BIT(6) +#define IXGBE_ACI_LINK_SPEED_25GB BIT(7) +#define IXGBE_ACI_LINK_SPEED_40GB BIT(8) +#define IXGBE_ACI_LINK_SPEED_50GB BIT(9) +#define IXGBE_ACI_LINK_SPEED_100GB BIT(10) +#define IXGBE_ACI_LINK_SPEED_200GB BIT(11) +#define IXGBE_ACI_LINK_SPEED_UNKNOWN BIT(15) + __le16 reserved3; + u8 ext_fec_status; +#define IXGBE_ACI_LINK_RS_272_FEC_EN BIT(0) /* RS 272 FEC enabled */ + u8 reserved4; + __le64 phy_type_low; /* Use values from ICE_PHY_TYPE_LOW_* */ + __le64 phy_type_high; /* Use values from ICE_PHY_TYPE_HIGH_* */ + /* Get link status version 2 link partner data */ + __le64 lp_phy_type_low; /* Use values from ICE_PHY_TYPE_LOW_* */ + __le64 lp_phy_type_high; /* Use values from ICE_PHY_TYPE_HIGH_* */ + u8 lp_fec_adv; +#define IXGBE_ACI_LINK_LP_10G_KR_FEC_CAP BIT(0) +#define IXGBE_ACI_LINK_LP_25G_KR_FEC_CAP BIT(1) +#define IXGBE_ACI_LINK_LP_RS_528_FEC_CAP BIT(2) +#define IXGBE_ACI_LINK_LP_50G_KR_272_FEC_CAP BIT(3) +#define IXGBE_ACI_LINK_LP_100G_KR_272_FEC_CAP BIT(4) +#define IXGBE_ACI_LINK_LP_200G_KR_272_FEC_CAP BIT(5) + u8 lp_fec_req; +#define IXGBE_ACI_LINK_LP_10G_KR_FEC_REQ BIT(0) +#define IXGBE_ACI_LINK_LP_25G_KR_FEC_REQ BIT(1) +#define IXGBE_ACI_LINK_LP_RS_528_FEC_REQ BIT(2) +#define IXGBE_ACI_LINK_LP_KR_272_FEC_REQ BIT(3) + u8 lp_flowcontrol; +#define IXGBE_ACI_LINK_LP_PAUSE_ADV BIT(0) +#define IXGBE_ACI_LINK_LP_ASM_DIR_ADV BIT(1) + u8 reserved5[5]; +} __packed; + +/* Set event mask command (direct 0x0613) */ +struct ixgbe_aci_cmd_set_event_mask { + u8 lport_num; + u8 reserved[7]; + __le16 event_mask; +#define IXGBE_ACI_LINK_EVENT_UPDOWN BIT(1) +#define IXGBE_ACI_LINK_EVENT_MEDIA_NA BIT(2) +#define IXGBE_ACI_LINK_EVENT_LINK_FAULT BIT(3) +#define IXGBE_ACI_LINK_EVENT_PHY_TEMP_ALARM BIT(4) +#define IXGBE_ACI_LINK_EVENT_EXCESSIVE_ERRORS BIT(5) +#define IXGBE_ACI_LINK_EVENT_SIGNAL_DETECT BIT(6) +#define IXGBE_ACI_LINK_EVENT_AN_COMPLETED BIT(7) +#define IXGBE_ACI_LINK_EVENT_MODULE_QUAL_FAIL BIT(8) +#define IXGBE_ACI_LINK_EVENT_PORT_TX_SUSPENDED BIT(9) +#define IXGBE_ACI_LINK_EVENT_TOPO_CONFLICT BIT(10) +#define IXGBE_ACI_LINK_EVENT_MEDIA_CONFLICT BIT(11) +#define IXGBE_ACI_LINK_EVENT_PHY_FW_LOAD_FAIL BIT(12) + u8 reserved1[6]; +}; + +struct ixgbe_aci_cmd_link_topo_params { + u8 lport_num; + u8 lport_num_valid; +#define IXGBE_ACI_LINK_TOPO_PORT_NUM_VALID BIT(0) + u8 node_type_ctx; +#define IXGBE_ACI_LINK_TOPO_NODE_TYPE_M GENMASK(3, 0) +#define IXGBE_ACI_LINK_TOPO_NODE_TYPE_PHY 0 +#define IXGBE_ACI_LINK_TOPO_NODE_TYPE_GPIO_CTRL 1 +#define IXGBE_ACI_LINK_TOPO_NODE_TYPE_MUX_CTRL 2 +#define IXGBE_ACI_LINK_TOPO_NODE_TYPE_LED_CTRL 3 +#define IXGBE_ACI_LINK_TOPO_NODE_TYPE_LED 4 +#define IXGBE_ACI_LINK_TOPO_NODE_TYPE_THERMAL 5 +#define IXGBE_ACI_LINK_TOPO_NODE_TYPE_CAGE 6 +#define IXGBE_ACI_LINK_TOPO_NODE_TYPE_MEZZ 7 +#define IXGBE_ACI_LINK_TOPO_NODE_TYPE_ID_EEPROM 8 +#define IXGBE_ACI_LINK_TOPO_NODE_TYPE_CLK_CTRL 9 +#define IXGBE_ACI_LINK_TOPO_NODE_TYPE_CLK_MUX 10 +#define IXGBE_ACI_LINK_TOPO_NODE_TYPE_GPS 11 +#define IXGBE_ACI_LINK_TOPO_NODE_CTX_S 4 +#define IXGBE_ACI_LINK_TOPO_NODE_CTX_GLOBAL 0 +#define IXGBE_ACI_LINK_TOPO_NODE_CTX_BOARD 1 +#define IXGBE_ACI_LINK_TOPO_NODE_CTX_PORT 2 +#define IXGBE_ACI_LINK_TOPO_NODE_CTX_NODE 3 +#define IXGBE_ACI_LINK_TOPO_NODE_CTX_NODE_HANDLE 4 +#define IXGBE_ACI_LINK_TOPO_NODE_CTX_DIRECT_BUS_ACCESS 5 +#define IXGBE_ACI_LINK_TOPO_NODE_CTX_NODE_HANDLE_BUS_ADDRESS 6 + u8 index; +}; + +struct ixgbe_aci_cmd_link_topo_addr { + struct ixgbe_aci_cmd_link_topo_params topo_params; + __le16 handle; +/* Used to decode the handle field */ +#define IXGBE_ACI_LINK_TOPO_HANDLE_BRD_TYPE_M BIT(9) +#define IXGBE_ACI_LINK_TOPO_HANDLE_BRD_TYPE_LOM BIT(9) +#define IXGBE_ACI_LINK_TOPO_HANDLE_BRD_TYPE_MEZZ 0 +}; + +/* Get Link Topology Handle (direct, 0x06E0) */ +struct ixgbe_aci_cmd_get_link_topo { + struct ixgbe_aci_cmd_link_topo_addr addr; + u8 node_part_num; +#define IXGBE_ACI_GET_LINK_TOPO_NODE_NR_PCA9575 0x21 +#define IXGBE_ACI_GET_LINK_TOPO_NODE_NR_ZL30632_80032 0x24 +#define IXGBE_ACI_GET_LINK_TOPO_NODE_NR_SI5384 0x25 +#define IXGBE_ACI_GET_LINK_TOPO_NODE_NR_C827 0x31 +#define IXGBE_ACI_GET_LINK_TOPO_NODE_NR_GEN_CLK_MUX 0x47 +#define IXGBE_ACI_GET_LINK_TOPO_NODE_NR_GEN_GPS 0x48 +#define IXGBE_ACI_GET_LINK_TOPO_NODE_NR_E610_PTC 0x49 + u8 rsvd[9]; +}; + +/* Get Link Topology Pin (direct, 0x06E1) */ +struct ixgbe_aci_cmd_get_link_topo_pin { + struct ixgbe_aci_cmd_link_topo_addr addr; + u8 input_io_params; +#define IXGBE_ACI_LINK_TOPO_IO_FUNC_GPIO 0 +#define IXGBE_ACI_LINK_TOPO_IO_FUNC_RESET_N 1 +#define IXGBE_ACI_LINK_TOPO_IO_FUNC_INT_N 2 +#define IXGBE_ACI_LINK_TOPO_IO_FUNC_PRESENT_N 3 +#define IXGBE_ACI_LINK_TOPO_IO_FUNC_TX_DIS 4 +#define IXGBE_ACI_LINK_TOPO_IO_FUNC_MODSEL_N 5 +#define IXGBE_ACI_LINK_TOPO_IO_FUNC_LPMODE 6 +#define IXGBE_ACI_LINK_TOPO_IO_FUNC_TX_FAULT 7 +#define IXGBE_ACI_LINK_TOPO_IO_FUNC_RX_LOSS 8 +#define IXGBE_ACI_LINK_TOPO_IO_FUNC_RS0 9 +#define IXGBE_ACI_LINK_TOPO_IO_FUNC_RS1 10 +#define IXGBE_ACI_LINK_TOPO_IO_FUNC_EEPROM_WP 11 +/* 12 repeats intentionally due to two different uses depending on context */ +#define IXGBE_ACI_LINK_TOPO_IO_FUNC_LED 12 +#define IXGBE_ACI_LINK_TOPO_IO_FUNC_RED_LED 12 +#define IXGBE_ACI_LINK_TOPO_IO_FUNC_GREEN_LED 13 +#define IXGBE_ACI_LINK_TOPO_IO_FUNC_BLUE_LED 14 +#define IXGBE_ACI_LINK_TOPO_INPUT_IO_TYPE_GPIO 3 +/* Use IXGBE_ACI_LINK_TOPO_NODE_TYPE_* for the type values */ + u8 output_io_params; +/* Use IXGBE_ACI_LINK_TOPO_NODE_TYPE_* for the type values */ + u8 output_io_flags; +#define IXGBE_ACI_LINK_TOPO_OUTPUT_POLARITY BIT(5) +#define IXGBE_ACI_LINK_TOPO_OUTPUT_VALUE BIT(6) +#define IXGBE_ACI_LINK_TOPO_OUTPUT_DRIVEN BIT(7) + u8 rsvd[7]; +}; + +/* Read/Write SFF EEPROM command (indirect 0x06EE) */ +struct ixgbe_aci_cmd_sff_eeprom { + u8 lport_num; + u8 lport_num_valid; +#define IXGBE_ACI_SFF_PORT_NUM_VALID BIT(0) + __le16 i2c_bus_addr; +#define IXGBE_ACI_SFF_I2CBUS_7BIT_M GENMASK(6, 0) +#define IXGBE_ACI_SFF_I2CBUS_10BIT_M GENMASK(9, 0) +#define IXGBE_ACI_SFF_I2CBUS_TYPE_M BIT(10) +#define IXGBE_ACI_SFF_I2CBUS_TYPE_7BIT 0 +#define IXGBE_ACI_SFF_I2CBUS_TYPE_10BIT IXGBE_ACI_SFF_I2CBUS_TYPE_M +#define IXGBE_ACI_SFF_NO_PAGE_BANK_UPDATE 0 +#define IXGBE_ACI_SFF_UPDATE_PAGE 1 +#define IXGBE_ACI_SFF_UPDATE_BANK 2 +#define IXGBE_ACI_SFF_UPDATE_PAGE_BANK 3 +#define IXGBE_ACI_SFF_IS_WRITE BIT(15) + __le16 i2c_offset; + u8 module_bank; + u8 module_page; + __le32 addr_high; + __le32 addr_low; +}; + +/* NVM Read command (indirect 0x0701) + * NVM Erase commands (direct 0x0702) + * NVM Write commands (indirect 0x0703) + * NVM Write Activate commands (direct 0x0707) + * NVM Shadow RAM Dump commands (direct 0x0707) + */ +struct ixgbe_aci_cmd_nvm { +#define IXGBE_ACI_NVM_MAX_OFFSET 0xFFFFFF + __le16 offset_low; + u8 offset_high; /* For Write Activate offset_high is used as flags2 */ + u8 cmd_flags; +#define IXGBE_ACI_NVM_LAST_CMD BIT(0) +#define IXGBE_ACI_NVM_PCIR_REQ BIT(0) /* Used by NVM Write reply */ +#define IXGBE_ACI_NVM_PRESERVE_ALL BIT(1) +#define IXGBE_ACI_NVM_ACTIV_SEL_NVM BIT(3) /* Write Activate/SR Dump only */ +#define IXGBE_ACI_NVM_ACTIV_SEL_OROM BIT(4) +#define IXGBE_ACI_NVM_ACTIV_SEL_NETLIST BIT(5) +#define IXGBE_ACI_NVM_SPECIAL_UPDATE BIT(6) +#define IXGBE_ACI_NVM_REVERT_LAST_ACTIV BIT(6) /* Write Activate only */ +#define IXGBE_ACI_NVM_FLASH_ONLY BIT(7) +#define IXGBE_ACI_NVM_RESET_LVL_M GENMASK(1, 0) /* Write reply only */ +#define IXGBE_ACI_NVM_POR_FLAG 0 +#define IXGBE_ACI_NVM_PERST_FLAG 1 +#define IXGBE_ACI_NVM_EMPR_FLAG 2 +#define IXGBE_ACI_NVM_EMPR_ENA BIT(0) /* Write Activate reply only */ + /* For Write Activate, several flags are sent as part of a separate + * flags2 field using a separate byte. For simplicity of the software + * interface, we pass the flags as a 16 bit value so these flags are + * all offset by 8 bits + */ +#define IXGBE_ACI_NVM_ACTIV_REQ_EMPR BIT(8) /* NVM Write Activate only */ + __le16 module_typeid; + __le16 length; +#define IXGBE_ACI_NVM_ERASE_LEN 0xFFFF + __le32 addr_high; + __le32 addr_low; +}; + +/* NVM Module_Type ID, needed offset and read_len for + * struct ixgbe_aci_cmd_nvm. + */ +#define IXGBE_ACI_NVM_START_POINT 0 + +/* NVM Checksum Command (direct, 0x0706) */ +struct ixgbe_aci_cmd_nvm_checksum { + u8 flags; +#define IXGBE_ACI_NVM_CHECKSUM_VERIFY BIT(0) +#define IXGBE_ACI_NVM_CHECKSUM_RECALC BIT(1) + u8 rsvd; + __le16 checksum; /* Used only by response */ +#define IXGBE_ACI_NVM_CHECKSUM_CORRECT 0xBABA + u8 rsvd2[12]; +}; + +/** + * struct ixgbe_aci_desc - Admin Command (AC) descriptor + * @flags: IXGBE_ACI_FLAG_* flags + * @opcode: Admin command opcode + * @datalen: length in bytes of indirect/external data buffer + * @retval: return value from firmware + * @cookie_high: opaque data high-half + * @cookie_low: opaque data low-half + * @params: command-specific parameters + * + * Descriptor format for commands the driver posts via the + * Admin Command Interface (ACI). + * The firmware writes back onto the command descriptor and returns + * the result of the command. Asynchronous events that are not an immediate + * result of the command are written to the Admin Command Interface (ACI) using + * the same descriptor format. Descriptors are in little-endian notation with + * 32-bit words. + */ +struct ixgbe_aci_desc { + __le16 flags; + __le16 opcode; + __le16 datalen; + __le16 retval; + __le32 cookie_high; + __le32 cookie_low; + union { + u8 raw[16]; + struct ixgbe_aci_cmd_get_ver get_ver; + struct ixgbe_aci_cmd_driver_ver driver_ver; + struct ixgbe_aci_cmd_get_exp_err exp_err; + struct ixgbe_aci_cmd_req_res res_owner; + struct ixgbe_aci_cmd_list_caps get_cap; + struct ixgbe_aci_cmd_disable_rxen disable_rxen; + struct ixgbe_aci_cmd_get_phy_caps get_phy; + struct ixgbe_aci_cmd_set_phy_cfg set_phy; + struct ixgbe_aci_cmd_restart_an restart_an; + struct ixgbe_aci_cmd_get_link_status get_link_status; + struct ixgbe_aci_cmd_set_event_mask set_event_mask; + struct ixgbe_aci_cmd_get_link_topo get_link_topo; + struct ixgbe_aci_cmd_get_link_topo_pin get_link_topo_pin; + struct ixgbe_aci_cmd_sff_eeprom read_write_sff_param; + struct ixgbe_aci_cmd_nvm nvm; + struct ixgbe_aci_cmd_nvm_checksum nvm_checksum; + } params; +}; + +/* E610-specific adapter context structures */ + +struct ixgbe_link_status { + /* Refer to ixgbe_aci_phy_type for bits definition */ + u64 phy_type_low; + u64 phy_type_high; + u16 max_frame_size; + u16 link_speed; + u16 req_speeds; + u8 topo_media_conflict; + u8 link_cfg_err; + u8 lse_ena; /* Link Status Event notification */ + u8 link_info; + u8 an_info; + u8 ext_info; + u8 fec_info; + u8 pacing; + /* Refer to #define from module_type[IXGBE_ACI_MODULE_TYPE_TOTAL_BYTE] + * of ixgbe_aci_get_phy_caps structure + */ + u8 module_type[IXGBE_ACI_MODULE_TYPE_TOTAL_BYTE]; +}; + +/* Common HW capabilities for SW use */ +struct ixgbe_hw_caps { + /* Write CSR protection */ + u64 wr_csr_prot; + u32 switching_mode; + /* switching mode supported - EVB switching (including cloud) */ +#define IXGBE_NVM_IMAGE_TYPE_EVB 0x0 + + /* Manageability mode & supported protocols over MCTP */ + u32 mgmt_mode; +#define IXGBE_MGMT_MODE_PASS_THRU_MODE_M GENMASK(3, 0) +#define IXGBE_MGMT_MODE_CTL_INTERFACE_M GENMASK(7, 4) +#define IXGBE_MGMT_MODE_REDIR_SB_INTERFACE_M GENMASK(11, 8) + + u32 mgmt_protocols_mctp; +#define IXGBE_MGMT_MODE_PROTO_RSVD BIT(0) +#define IXGBE_MGMT_MODE_PROTO_PLDM BIT(1) +#define IXGBE_MGMT_MODE_PROTO_OEM BIT(2) +#define IXGBE_MGMT_MODE_PROTO_NC_SI BIT(3) + + u32 os2bmc; + u32 valid_functions; + /* DCB capabilities */ + u32 active_tc_bitmap; + u32 maxtc; + + /* RSS related capabilities */ + u32 rss_table_size; /* 512 for PFs and 64 for VFs */ + u32 rss_table_entry_width; /* RSS Entry width in bits */ + + /* Tx/Rx queues */ + u32 num_rxq; /* Number/Total Rx queues */ + u32 rxq_first_id; /* First queue ID for Rx queues */ + u32 num_txq; /* Number/Total Tx queues */ + u32 txq_first_id; /* First queue ID for Tx queues */ + + /* MSI-X vectors */ + u32 num_msix_vectors; + u32 msix_vector_first_id; + + /* Max MTU for function or device */ + u32 max_mtu; + + /* WOL related */ + u32 num_wol_proxy_fltr; + u32 wol_proxy_vsi_seid; + + /* LED/SDP pin count */ + u32 led_pin_num; + u32 sdp_pin_num; + + /* LED/SDP - Supports up to 12 LED pins and 8 SDP signals */ +#define IXGBE_MAX_SUPPORTED_GPIO_LED 12 +#define IXGBE_MAX_SUPPORTED_GPIO_SDP 8 + u8 led[IXGBE_MAX_SUPPORTED_GPIO_LED]; + u8 sdp[IXGBE_MAX_SUPPORTED_GPIO_SDP]; + /* SR-IOV virtualization */ + u8 sr_iov_1_1; /* SR-IOV enabled */ + /* VMDQ */ + u8 vmdq; /* VMDQ supported */ + + /* EVB capabilities */ + u8 evb_802_1_qbg; /* Edge Virtual Bridging */ + u8 evb_802_1_qbh; /* Bridge Port Extension */ + + u8 dcb; + u8 iscsi; + u8 ieee_1588; + u8 mgmt_cem; + + /* WoL and APM support */ +#define IXGBE_WOL_SUPPORT_M BIT(0) +#define IXGBE_ACPI_PROG_MTHD_M BIT(1) +#define IXGBE_PROXY_SUPPORT_M BIT(2) + u8 apm_wol_support; + u8 acpi_prog_mthd; + u8 proxy_support; + bool nvm_update_pending_nvm; + bool nvm_update_pending_orom; + bool nvm_update_pending_netlist; +#define IXGBE_NVM_PENDING_NVM_IMAGE BIT(0) +#define IXGBE_NVM_PENDING_OROM BIT(1) +#define IXGBE_NVM_PENDING_NETLIST BIT(2) + bool sec_rev_disabled; + bool update_disabled; + bool nvm_unified_update; + bool netlist_auth; +#define IXGBE_NVM_MGMT_SEC_REV_DISABLED BIT(0) +#define IXGBE_NVM_MGMT_UPDATE_DISABLED BIT(1) +#define IXGBE_NVM_MGMT_UNIFIED_UPD_SUPPORT BIT(3) +#define IXGBE_NVM_MGMT_NETLIST_AUTH_SUPPORT BIT(5) + bool no_drop_policy_support; + /* PCIe reset avoidance */ + bool pcie_reset_avoidance; /* false: not supported, true: supported */ + /* Post update reset restriction */ + bool reset_restrict_support; /* false: not supported, true: supported */ + + /* External topology device images within the NVM */ +#define IXGBE_EXT_TOPO_DEV_IMG_COUNT 4 + u32 ext_topo_dev_img_ver_high[IXGBE_EXT_TOPO_DEV_IMG_COUNT]; + u32 ext_topo_dev_img_ver_low[IXGBE_EXT_TOPO_DEV_IMG_COUNT]; + u8 ext_topo_dev_img_part_num[IXGBE_EXT_TOPO_DEV_IMG_COUNT]; +#define IXGBE_EXT_TOPO_DEV_IMG_PART_NUM_S 8 +#define IXGBE_EXT_TOPO_DEV_IMG_PART_NUM_M GENMASK(15, 8) + bool ext_topo_dev_img_load_en[IXGBE_EXT_TOPO_DEV_IMG_COUNT]; +#define IXGBE_EXT_TOPO_DEV_IMG_LOAD_EN BIT(0) + bool ext_topo_dev_img_prog_en[IXGBE_EXT_TOPO_DEV_IMG_COUNT]; +#define IXGBE_EXT_TOPO_DEV_IMG_PROG_EN BIT(1) +} __packed; + +/* Function specific capabilities */ +struct ixgbe_hw_func_caps { + u32 num_allocd_vfs; /* Number of allocated VFs */ + u32 vf_base_id; /* Logical ID of the first VF */ + u32 guar_num_vsi; + struct ixgbe_hw_caps common_cap; + bool no_drop_policy_ena; +}; + +/* Device wide capabilities */ +struct ixgbe_hw_dev_caps { + struct ixgbe_hw_caps common_cap; + u32 num_vfs_exposed; /* Total number of VFs exposed */ + u32 num_vsi_allocd_to_host; /* Excluding EMP VSI */ + u32 num_flow_director_fltr; /* Number of FD filters available */ + u32 num_funcs; +}; + +/* ACI event information */ +struct ixgbe_aci_event { + struct ixgbe_aci_desc desc; + u8 *msg_buf; + u16 msg_len; + u16 buf_len; +}; + +struct ixgbe_aci_info { + struct mutex lock; /* admin command interface lock */ + enum ixgbe_aci_err last_status; /* last status of sent admin command */ +}; + +/* Option ROM version information */ +struct ixgbe_orom_info { + u8 major; /* Major version of OROM */ + u8 patch; /* Patch version of OROM */ + u16 build; /* Build version of OROM */ + u32 srev; /* Security revision */ +}; + +/* NVM version information */ +struct ixgbe_nvm_info { + u32 eetrack; + u32 srev; + u8 major; + u8 minor; +} __packed; + +/* netlist version information */ +struct ixgbe_netlist_info { + u32 major; /* major high/low */ + u32 minor; /* minor high/low */ + u32 type; /* type high/low */ + u32 rev; /* revision high/low */ + u32 hash; /* SHA-1 hash word */ + u16 cust_ver; /* customer version */ +} __packed; + +/* Enumeration of possible flash banks for the NVM, OROM, and Netlist modules + * of the flash image. + */ +enum ixgbe_flash_bank { + IXGBE_INVALID_FLASH_BANK, + IXGBE_1ST_FLASH_BANK, + IXGBE_2ND_FLASH_BANK, +}; + +/* information for accessing NVM, OROM, and Netlist flash banks */ +struct ixgbe_bank_info { + u32 nvm_ptr; /* Pointer to 1st NVM bank */ + u32 nvm_size; /* Size of NVM bank */ + u32 orom_ptr; /* Pointer to 1st OROM bank */ + u32 orom_size; /* Size of OROM bank */ + u32 netlist_ptr; /* Ptr to 1st Netlist bank */ + u32 netlist_size; /* Size of Netlist bank */ + enum ixgbe_flash_bank nvm_bank; /* Active NVM bank */ + enum ixgbe_flash_bank orom_bank; /* Active OROM bank */ + enum ixgbe_flash_bank netlist_bank; /* Active Netlist bank */ +}; + +/* Flash Chip Information */ +struct ixgbe_flash_info { + struct ixgbe_orom_info orom; /* Option ROM version info */ + u32 flash_size; /* Available flash size in bytes */ + struct ixgbe_nvm_info nvm; /* NVM version information */ + struct ixgbe_netlist_info netlist; /* Netlist version info */ + struct ixgbe_bank_info banks; /* Flash Bank information */ + u16 sr_words; /* Shadow RAM size in words */ + u8 blank_nvm_mode; /* is NVM empty (no FW present) */ +}; + +#endif /* _IXGBE_TYPE_E610_H_ */ From 7c3aa0fccb1944921f13a6e0084d3aa070b4ff7b Mon Sep 17 00:00:00 2001 From: Piotr Kwapulinski Date: Thu, 5 Dec 2024 09:44:44 +0100 Subject: [PATCH 0860/1450] ixgbe: Add support for E610 device capabilities detection Add low level support for E610 device capabilities detection. The capabilities are discovered via the Admin Command Interface. Discover the following capabilities: - function caps: vmdq, dcb, rss, rx/tx qs, msix, nvm, orom, reset - device caps: vsi, fdir, 1588 - phy caps Co-developed-by: Stefan Wegrzyn Signed-off-by: Stefan Wegrzyn Co-developed-by: Jedrzej Jagielski Signed-off-by: Jedrzej Jagielski Reviewed-by: Jan Sokolowski Reviewed-by: Simon Horman Tested-by: Bharath R Signed-off-by: Piotr Kwapulinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c | 529 ++++++++++++++++++ drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h | 12 + drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 7 + 3 files changed, 548 insertions(+) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c index 6a26f6b4d3d5b..9e15b9c111205 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c @@ -495,3 +495,532 @@ void ixgbe_release_res(struct ixgbe_hw *hw, enum ixgbe_aci_res_ids res) total_delay++; } } + +/** + * ixgbe_parse_e610_caps - Parse common device/function capabilities + * @hw: pointer to the HW struct + * @caps: pointer to common capabilities structure + * @elem: the capability element to parse + * @prefix: message prefix for tracing capabilities + * + * Given a capability element, extract relevant details into the common + * capability structure. + * + * Return: true if the capability matches one of the common capability ids, + * false otherwise. + */ +static bool ixgbe_parse_e610_caps(struct ixgbe_hw *hw, + struct ixgbe_hw_caps *caps, + struct ixgbe_aci_cmd_list_caps_elem *elem, + const char *prefix) +{ + u32 logical_id = le32_to_cpu(elem->logical_id); + u32 phys_id = le32_to_cpu(elem->phys_id); + u32 number = le32_to_cpu(elem->number); + u16 cap = le16_to_cpu(elem->cap); + + switch (cap) { + case IXGBE_ACI_CAPS_VALID_FUNCTIONS: + caps->valid_functions = number; + break; + case IXGBE_ACI_CAPS_SRIOV: + caps->sr_iov_1_1 = (number == 1); + break; + case IXGBE_ACI_CAPS_VMDQ: + caps->vmdq = (number == 1); + break; + case IXGBE_ACI_CAPS_DCB: + caps->dcb = (number == 1); + caps->active_tc_bitmap = logical_id; + caps->maxtc = phys_id; + break; + case IXGBE_ACI_CAPS_RSS: + caps->rss_table_size = number; + caps->rss_table_entry_width = logical_id; + break; + case IXGBE_ACI_CAPS_RXQS: + caps->num_rxq = number; + caps->rxq_first_id = phys_id; + break; + case IXGBE_ACI_CAPS_TXQS: + caps->num_txq = number; + caps->txq_first_id = phys_id; + break; + case IXGBE_ACI_CAPS_MSIX: + caps->num_msix_vectors = number; + caps->msix_vector_first_id = phys_id; + break; + case IXGBE_ACI_CAPS_NVM_VER: + break; + case IXGBE_ACI_CAPS_MAX_MTU: + caps->max_mtu = number; + break; + case IXGBE_ACI_CAPS_PCIE_RESET_AVOIDANCE: + caps->pcie_reset_avoidance = (number > 0); + break; + case IXGBE_ACI_CAPS_POST_UPDATE_RESET_RESTRICT: + caps->reset_restrict_support = (number == 1); + break; + case IXGBE_ACI_CAPS_EXT_TOPO_DEV_IMG0: + case IXGBE_ACI_CAPS_EXT_TOPO_DEV_IMG1: + case IXGBE_ACI_CAPS_EXT_TOPO_DEV_IMG2: + case IXGBE_ACI_CAPS_EXT_TOPO_DEV_IMG3: + { + u8 index = cap - IXGBE_ACI_CAPS_EXT_TOPO_DEV_IMG0; + + caps->ext_topo_dev_img_ver_high[index] = number; + caps->ext_topo_dev_img_ver_low[index] = logical_id; + caps->ext_topo_dev_img_part_num[index] = + FIELD_GET(IXGBE_EXT_TOPO_DEV_IMG_PART_NUM_M, phys_id); + caps->ext_topo_dev_img_load_en[index] = + (phys_id & IXGBE_EXT_TOPO_DEV_IMG_LOAD_EN) != 0; + caps->ext_topo_dev_img_prog_en[index] = + (phys_id & IXGBE_EXT_TOPO_DEV_IMG_PROG_EN) != 0; + break; + } + default: + /* Not one of the recognized common capabilities */ + return false; + } + + return true; +} + +/** + * ixgbe_parse_valid_functions_cap - Parse IXGBE_ACI_CAPS_VALID_FUNCTIONS caps + * @hw: pointer to the HW struct + * @dev_p: pointer to device capabilities structure + * @cap: capability element to parse + * + * Parse IXGBE_ACI_CAPS_VALID_FUNCTIONS for device capabilities. + */ +static void +ixgbe_parse_valid_functions_cap(struct ixgbe_hw *hw, + struct ixgbe_hw_dev_caps *dev_p, + struct ixgbe_aci_cmd_list_caps_elem *cap) +{ + dev_p->num_funcs = hweight32(le32_to_cpu(cap->number)); +} + +/** + * ixgbe_parse_vf_dev_caps - Parse IXGBE_ACI_CAPS_VF device caps + * @hw: pointer to the HW struct + * @dev_p: pointer to device capabilities structure + * @cap: capability element to parse + * + * Parse IXGBE_ACI_CAPS_VF for device capabilities. + */ +static void ixgbe_parse_vf_dev_caps(struct ixgbe_hw *hw, + struct ixgbe_hw_dev_caps *dev_p, + struct ixgbe_aci_cmd_list_caps_elem *cap) +{ + dev_p->num_vfs_exposed = le32_to_cpu(cap->number); +} + +/** + * ixgbe_parse_vsi_dev_caps - Parse IXGBE_ACI_CAPS_VSI device caps + * @hw: pointer to the HW struct + * @dev_p: pointer to device capabilities structure + * @cap: capability element to parse + * + * Parse IXGBE_ACI_CAPS_VSI for device capabilities. + */ +static void ixgbe_parse_vsi_dev_caps(struct ixgbe_hw *hw, + struct ixgbe_hw_dev_caps *dev_p, + struct ixgbe_aci_cmd_list_caps_elem *cap) +{ + dev_p->num_vsi_allocd_to_host = le32_to_cpu(cap->number); +} + +/** + * ixgbe_parse_fdir_dev_caps - Parse IXGBE_ACI_CAPS_FD device caps + * @hw: pointer to the HW struct + * @dev_p: pointer to device capabilities structure + * @cap: capability element to parse + * + * Parse IXGBE_ACI_CAPS_FD for device capabilities. + */ +static void ixgbe_parse_fdir_dev_caps(struct ixgbe_hw *hw, + struct ixgbe_hw_dev_caps *dev_p, + struct ixgbe_aci_cmd_list_caps_elem *cap) +{ + dev_p->num_flow_director_fltr = le32_to_cpu(cap->number); +} + +/** + * ixgbe_parse_dev_caps - Parse device capabilities + * @hw: pointer to the HW struct + * @dev_p: pointer to device capabilities structure + * @buf: buffer containing the device capability records + * @cap_count: the number of capabilities + * + * Helper device to parse device (0x000B) capabilities list. For + * capabilities shared between device and function, this relies on + * ixgbe_parse_e610_caps. + * + * Loop through the list of provided capabilities and extract the relevant + * data into the device capabilities structured. + */ +static void ixgbe_parse_dev_caps(struct ixgbe_hw *hw, + struct ixgbe_hw_dev_caps *dev_p, + void *buf, u32 cap_count) +{ + struct ixgbe_aci_cmd_list_caps_elem *cap_resp; + u32 i; + + cap_resp = (struct ixgbe_aci_cmd_list_caps_elem *)buf; + + memset(dev_p, 0, sizeof(*dev_p)); + + for (i = 0; i < cap_count; i++) { + u16 cap = le16_to_cpu(cap_resp[i].cap); + + ixgbe_parse_e610_caps(hw, &dev_p->common_cap, &cap_resp[i], + "dev caps"); + + switch (cap) { + case IXGBE_ACI_CAPS_VALID_FUNCTIONS: + ixgbe_parse_valid_functions_cap(hw, dev_p, + &cap_resp[i]); + break; + case IXGBE_ACI_CAPS_VF: + ixgbe_parse_vf_dev_caps(hw, dev_p, &cap_resp[i]); + break; + case IXGBE_ACI_CAPS_VSI: + ixgbe_parse_vsi_dev_caps(hw, dev_p, &cap_resp[i]); + break; + case IXGBE_ACI_CAPS_FD: + ixgbe_parse_fdir_dev_caps(hw, dev_p, &cap_resp[i]); + break; + default: + /* Don't list common capabilities as unknown */ + break; + } + } +} + +/** + * ixgbe_parse_vf_func_caps - Parse IXGBE_ACI_CAPS_VF function caps + * @hw: pointer to the HW struct + * @func_p: pointer to function capabilities structure + * @cap: pointer to the capability element to parse + * + * Extract function capabilities for IXGBE_ACI_CAPS_VF. + */ +static void ixgbe_parse_vf_func_caps(struct ixgbe_hw *hw, + struct ixgbe_hw_func_caps *func_p, + struct ixgbe_aci_cmd_list_caps_elem *cap) +{ + func_p->num_allocd_vfs = le32_to_cpu(cap->number); + func_p->vf_base_id = le32_to_cpu(cap->logical_id); +} + +/** + * ixgbe_get_num_per_func - determine number of resources per PF + * @hw: pointer to the HW structure + * @max: value to be evenly split between each PF + * + * Determine the number of valid functions by going through the bitmap returned + * from parsing capabilities and use this to calculate the number of resources + * per PF based on the max value passed in. + * + * Return: the number of resources per PF or 0, if no PH are available. + */ +static u32 ixgbe_get_num_per_func(struct ixgbe_hw *hw, u32 max) +{ +#define IXGBE_CAPS_VALID_FUNCS_M GENMASK(7, 0) + u8 funcs = hweight8(hw->dev_caps.common_cap.valid_functions & + IXGBE_CAPS_VALID_FUNCS_M); + + return funcs ? (max / funcs) : 0; +} + +/** + * ixgbe_parse_vsi_func_caps - Parse IXGBE_ACI_CAPS_VSI function caps + * @hw: pointer to the HW struct + * @func_p: pointer to function capabilities structure + * @cap: pointer to the capability element to parse + * + * Extract function capabilities for IXGBE_ACI_CAPS_VSI. + */ +static void ixgbe_parse_vsi_func_caps(struct ixgbe_hw *hw, + struct ixgbe_hw_func_caps *func_p, + struct ixgbe_aci_cmd_list_caps_elem *cap) +{ + func_p->guar_num_vsi = ixgbe_get_num_per_func(hw, IXGBE_MAX_VSI); +} + +/** + * ixgbe_parse_func_caps - Parse function capabilities + * @hw: pointer to the HW struct + * @func_p: pointer to function capabilities structure + * @buf: buffer containing the function capability records + * @cap_count: the number of capabilities + * + * Helper function to parse function (0x000A) capabilities list. For + * capabilities shared between device and function, this relies on + * ixgbe_parse_e610_caps. + * + * Loop through the list of provided capabilities and extract the relevant + * data into the function capabilities structured. + */ +static void ixgbe_parse_func_caps(struct ixgbe_hw *hw, + struct ixgbe_hw_func_caps *func_p, + void *buf, u32 cap_count) +{ + struct ixgbe_aci_cmd_list_caps_elem *cap_resp; + u32 i; + + cap_resp = (struct ixgbe_aci_cmd_list_caps_elem *)buf; + + memset(func_p, 0, sizeof(*func_p)); + + for (i = 0; i < cap_count; i++) { + u16 cap = le16_to_cpu(cap_resp[i].cap); + + ixgbe_parse_e610_caps(hw, &func_p->common_cap, + &cap_resp[i], "func caps"); + + switch (cap) { + case IXGBE_ACI_CAPS_VF: + ixgbe_parse_vf_func_caps(hw, func_p, &cap_resp[i]); + break; + case IXGBE_ACI_CAPS_VSI: + ixgbe_parse_vsi_func_caps(hw, func_p, &cap_resp[i]); + break; + default: + /* Don't list common capabilities as unknown */ + break; + } + } +} + +/** + * ixgbe_aci_list_caps - query function/device capabilities + * @hw: pointer to the HW struct + * @buf: a buffer to hold the capabilities + * @buf_size: size of the buffer + * @cap_count: if not NULL, set to the number of capabilities reported + * @opc: capabilities type to discover, device or function + * + * Get the function (0x000A) or device (0x000B) capabilities description from + * firmware and store it in the buffer. + * + * If the cap_count pointer is not NULL, then it is set to the number of + * capabilities firmware will report. Note that if the buffer size is too + * small, it is possible the command will return -ENOMEM. The + * cap_count will still be updated in this case. It is recommended that the + * buffer size be set to IXGBE_ACI_MAX_BUFFER_SIZE (the largest possible + * buffer that firmware could return) to avoid this. + * + * Return: the exit code of the operation. + * Exit code of -ENOMEM means the buffer size is too small. + */ +int ixgbe_aci_list_caps(struct ixgbe_hw *hw, void *buf, u16 buf_size, + u32 *cap_count, enum ixgbe_aci_opc opc) +{ + struct ixgbe_aci_cmd_list_caps *cmd; + struct ixgbe_aci_desc desc; + int err; + + cmd = &desc.params.get_cap; + + if (opc != ixgbe_aci_opc_list_func_caps && + opc != ixgbe_aci_opc_list_dev_caps) + return -EINVAL; + + ixgbe_fill_dflt_direct_cmd_desc(&desc, opc); + err = ixgbe_aci_send_cmd(hw, &desc, buf, buf_size); + + if (cap_count) + *cap_count = le32_to_cpu(cmd->count); + + return err; +} + +/** + * ixgbe_discover_dev_caps - Read and extract device capabilities + * @hw: pointer to the hardware structure + * @dev_caps: pointer to device capabilities structure + * + * Read the device capabilities and extract them into the dev_caps structure + * for later use. + * + * Return: the exit code of the operation. + */ +int ixgbe_discover_dev_caps(struct ixgbe_hw *hw, + struct ixgbe_hw_dev_caps *dev_caps) +{ + u32 cap_count; + u8 *cbuf; + int err; + + cbuf = kzalloc(IXGBE_ACI_MAX_BUFFER_SIZE, GFP_KERNEL); + if (!cbuf) + return -ENOMEM; + + /* Although the driver doesn't know the number of capabilities the + * device will return, we can simply send a 4KB buffer, the maximum + * possible size that firmware can return. + */ + cap_count = IXGBE_ACI_MAX_BUFFER_SIZE / + sizeof(struct ixgbe_aci_cmd_list_caps_elem); + + err = ixgbe_aci_list_caps(hw, cbuf, IXGBE_ACI_MAX_BUFFER_SIZE, + &cap_count, + ixgbe_aci_opc_list_dev_caps); + if (!err) + ixgbe_parse_dev_caps(hw, dev_caps, cbuf, cap_count); + + kfree(cbuf); + + return 0; +} + +/** + * ixgbe_discover_func_caps - Read and extract function capabilities + * @hw: pointer to the hardware structure + * @func_caps: pointer to function capabilities structure + * + * Read the function capabilities and extract them into the func_caps structure + * for later use. + * + * Return: the exit code of the operation. + */ +int ixgbe_discover_func_caps(struct ixgbe_hw *hw, + struct ixgbe_hw_func_caps *func_caps) +{ + u32 cap_count; + u8 *cbuf; + int err; + + cbuf = kzalloc(IXGBE_ACI_MAX_BUFFER_SIZE, GFP_KERNEL); + if (!cbuf) + return -ENOMEM; + + /* Although the driver doesn't know the number of capabilities the + * device will return, we can simply send a 4KB buffer, the maximum + * possible size that firmware can return. + */ + cap_count = IXGBE_ACI_MAX_BUFFER_SIZE / + sizeof(struct ixgbe_aci_cmd_list_caps_elem); + + err = ixgbe_aci_list_caps(hw, cbuf, IXGBE_ACI_MAX_BUFFER_SIZE, + &cap_count, + ixgbe_aci_opc_list_func_caps); + if (!err) + ixgbe_parse_func_caps(hw, func_caps, cbuf, cap_count); + + kfree(cbuf); + + return 0; +} + +/** + * ixgbe_get_caps - get info about the HW + * @hw: pointer to the hardware structure + * + * Retrieve both device and function capabilities. + * + * Return: the exit code of the operation. + */ +int ixgbe_get_caps(struct ixgbe_hw *hw) +{ + int err; + + err = ixgbe_discover_dev_caps(hw, &hw->dev_caps); + if (err) + return err; + + return ixgbe_discover_func_caps(hw, &hw->func_caps); +} + +/** + * ixgbe_aci_disable_rxen - disable RX + * @hw: pointer to the HW struct + * + * Request a safe disable of Receive Enable using ACI command (0x000C). + * + * Return: the exit code of the operation. + */ +int ixgbe_aci_disable_rxen(struct ixgbe_hw *hw) +{ + struct ixgbe_aci_cmd_disable_rxen *cmd; + struct ixgbe_aci_desc desc; + + cmd = &desc.params.disable_rxen; + + ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_disable_rxen); + + cmd->lport_num = hw->bus.func; + + return ixgbe_aci_send_cmd(hw, &desc, NULL, 0); +} + +/** + * ixgbe_aci_get_phy_caps - returns PHY capabilities + * @hw: pointer to the HW struct + * @qual_mods: report qualified modules + * @report_mode: report mode capabilities + * @pcaps: structure for PHY capabilities to be filled + * + * Returns the various PHY capabilities supported on the Port + * using ACI command (0x0600). + * + * Return: the exit code of the operation. + */ +int ixgbe_aci_get_phy_caps(struct ixgbe_hw *hw, bool qual_mods, u8 report_mode, + struct ixgbe_aci_cmd_get_phy_caps_data *pcaps) +{ + struct ixgbe_aci_cmd_get_phy_caps *cmd; + u16 pcaps_size = sizeof(*pcaps); + struct ixgbe_aci_desc desc; + int err; + + cmd = &desc.params.get_phy; + + if (!pcaps || (report_mode & ~IXGBE_ACI_REPORT_MODE_M)) + return -EINVAL; + + ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_get_phy_caps); + + if (qual_mods) + cmd->param0 |= cpu_to_le16(IXGBE_ACI_GET_PHY_RQM); + + cmd->param0 |= cpu_to_le16(report_mode); + err = ixgbe_aci_send_cmd(hw, &desc, pcaps, pcaps_size); + if (!err && report_mode == IXGBE_ACI_REPORT_TOPO_CAP_MEDIA) { + hw->phy.phy_type_low = le64_to_cpu(pcaps->phy_type_low); + hw->phy.phy_type_high = le64_to_cpu(pcaps->phy_type_high); + memcpy(hw->link.link_info.module_type, &pcaps->module_type, + sizeof(hw->link.link_info.module_type)); + } + + return err; +} + +/** + * ixgbe_copy_phy_caps_to_cfg - Copy PHY ability data to configuration data + * @caps: PHY ability structure to copy data from + * @cfg: PHY configuration structure to copy data to + * + * Helper function to copy data from PHY capabilities data structure + * to PHY configuration data structure + */ +void ixgbe_copy_phy_caps_to_cfg(struct ixgbe_aci_cmd_get_phy_caps_data *caps, + struct ixgbe_aci_cmd_set_phy_cfg_data *cfg) +{ + if (!caps || !cfg) + return; + + memset(cfg, 0, sizeof(*cfg)); + cfg->phy_type_low = caps->phy_type_low; + cfg->phy_type_high = caps->phy_type_high; + cfg->caps = caps->caps; + cfg->low_power_ctrl_an = caps->low_power_ctrl_an; + cfg->eee_cap = caps->eee_cap; + cfg->eeer_value = caps->eeer_value; + cfg->link_fec_opt = caps->link_fec_options; + cfg->module_compliance_enforcement = + caps->module_compliance_enforcement; +} diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h index 18b831b6797d8..5c5a6769b566d 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h @@ -15,5 +15,17 @@ void ixgbe_fill_dflt_direct_cmd_desc(struct ixgbe_aci_desc *desc, u16 opcode); int ixgbe_acquire_res(struct ixgbe_hw *hw, enum ixgbe_aci_res_ids res, enum ixgbe_aci_res_access_type access, u32 timeout); void ixgbe_release_res(struct ixgbe_hw *hw, enum ixgbe_aci_res_ids res); +int ixgbe_aci_list_caps(struct ixgbe_hw *hw, void *buf, u16 buf_size, + u32 *cap_count, enum ixgbe_aci_opc opc); +int ixgbe_discover_dev_caps(struct ixgbe_hw *hw, + struct ixgbe_hw_dev_caps *dev_caps); +int ixgbe_discover_func_caps(struct ixgbe_hw *hw, + struct ixgbe_hw_func_caps *func_caps); +int ixgbe_get_caps(struct ixgbe_hw *hw); +int ixgbe_aci_disable_rxen(struct ixgbe_hw *hw); +int ixgbe_aci_get_phy_caps(struct ixgbe_hw *hw, bool qual_mods, u8 report_mode, + struct ixgbe_aci_cmd_get_phy_caps_data *pcaps); +void ixgbe_copy_phy_caps_to_cfg(struct ixgbe_aci_cmd_get_phy_caps_data *caps, + struct ixgbe_aci_cmd_set_phy_cfg_data *cfg); #endif /* _IXGBE_E610_H_ */ diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 2e38e8f6fac18..13b777d702a22 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -42,6 +42,7 @@ #include "ixgbe.h" #include "ixgbe_common.h" +#include "ixgbe_e610.h" #include "ixgbe_dcb_82599.h" #include "ixgbe_mbx.h" #include "ixgbe_phy.h" @@ -10933,6 +10934,12 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (err) goto err_sw_init; + if (adapter->hw.mac.type == ixgbe_mac_e610) { + err = ixgbe_get_caps(&adapter->hw); + if (err) + dev_err(&pdev->dev, "ixgbe_get_caps failed %d\n", err); + } + if (adapter->hw.mac.type == ixgbe_mac_82599EB) adapter->flags2 |= IXGBE_FLAG2_AUTO_DISABLE_VF; From 23c0e5a16bccd120caf1e7a9bfdf002fea107fa8 Mon Sep 17 00:00:00 2001 From: Piotr Kwapulinski Date: Thu, 5 Dec 2024 09:44:45 +0100 Subject: [PATCH 0861/1450] ixgbe: Add link management support for E610 device Add low level link management support for E610 device. Link management operations are handled via the Admin Command Interface. Add the following link management operations: - get link capabilities - set up link - get media type - get link status, link status events - link power management Co-developed-by: Stefan Wegrzyn Signed-off-by: Stefan Wegrzyn Co-developed-by: Jedrzej Jagielski Signed-off-by: Jedrzej Jagielski Reviewed-by: Jan Glaza Tested-by: Bharath R Signed-off-by: Piotr Kwapulinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c | 1081 +++++++++++++++++ drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h | 32 + .../ethernet/intel/ixgbe/ixgbe_type_e610.h | 1 + 3 files changed, 1114 insertions(+) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c index 9e15b9c111205..0b578c4006b1a 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c @@ -1024,3 +1024,1084 @@ void ixgbe_copy_phy_caps_to_cfg(struct ixgbe_aci_cmd_get_phy_caps_data *caps, cfg->module_compliance_enforcement = caps->module_compliance_enforcement; } + +/** + * ixgbe_aci_set_phy_cfg - set PHY configuration + * @hw: pointer to the HW struct + * @cfg: structure with PHY configuration data to be set + * + * Set the various PHY configuration parameters supported on the Port + * using ACI command (0x0601). + * One or more of the Set PHY config parameters may be ignored in an MFP + * mode as the PF may not have the privilege to set some of the PHY Config + * parameters. + * + * Return: the exit code of the operation. + */ +int ixgbe_aci_set_phy_cfg(struct ixgbe_hw *hw, + struct ixgbe_aci_cmd_set_phy_cfg_data *cfg) +{ + struct ixgbe_aci_desc desc; + int err; + + if (!cfg) + return -EINVAL; + + /* Ensure that only valid bits of cfg->caps can be turned on. */ + cfg->caps &= IXGBE_ACI_PHY_ENA_VALID_MASK; + + ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_set_phy_cfg); + desc.params.set_phy.lport_num = hw->bus.func; + desc.flags |= cpu_to_le16(IXGBE_ACI_FLAG_RD); + + err = ixgbe_aci_send_cmd(hw, &desc, cfg, sizeof(*cfg)); + if (!err) + hw->phy.curr_user_phy_cfg = *cfg; + + return err; +} + +/** + * ixgbe_aci_set_link_restart_an - set up link and restart AN + * @hw: pointer to the HW struct + * @ena_link: if true: enable link, if false: disable link + * + * Function sets up the link and restarts the Auto-Negotiation over the link. + * + * Return: the exit code of the operation. + */ +int ixgbe_aci_set_link_restart_an(struct ixgbe_hw *hw, bool ena_link) +{ + struct ixgbe_aci_cmd_restart_an *cmd; + struct ixgbe_aci_desc desc; + + cmd = &desc.params.restart_an; + + ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_restart_an); + + cmd->cmd_flags = IXGBE_ACI_RESTART_AN_LINK_RESTART; + cmd->lport_num = hw->bus.func; + if (ena_link) + cmd->cmd_flags |= IXGBE_ACI_RESTART_AN_LINK_ENABLE; + else + cmd->cmd_flags &= ~IXGBE_ACI_RESTART_AN_LINK_ENABLE; + + return ixgbe_aci_send_cmd(hw, &desc, NULL, 0); +} + +/** + * ixgbe_is_media_cage_present - check if media cage is present + * @hw: pointer to the HW struct + * + * Identify presence of media cage using the ACI command (0x06E0). + * + * Return: true if media cage is present, else false. If no cage, then + * media type is backplane or BASE-T. + */ +static bool ixgbe_is_media_cage_present(struct ixgbe_hw *hw) +{ + struct ixgbe_aci_cmd_get_link_topo *cmd; + struct ixgbe_aci_desc desc; + + cmd = &desc.params.get_link_topo; + + ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_get_link_topo); + + cmd->addr.topo_params.node_type_ctx = + FIELD_PREP(IXGBE_ACI_LINK_TOPO_NODE_CTX_M, + IXGBE_ACI_LINK_TOPO_NODE_CTX_PORT); + + /* Set node type. */ + cmd->addr.topo_params.node_type_ctx |= + FIELD_PREP(IXGBE_ACI_LINK_TOPO_NODE_TYPE_M, + IXGBE_ACI_LINK_TOPO_NODE_TYPE_CAGE); + + /* Node type cage can be used to determine if cage is present. If AQC + * returns error (ENOENT), then no cage present. If no cage present then + * connection type is backplane or BASE-T. + */ + return ixgbe_aci_get_netlist_node(hw, cmd, NULL, NULL); +} + +/** + * ixgbe_get_media_type_from_phy_type - Gets media type based on phy type + * @hw: pointer to the HW struct + * + * Try to identify the media type based on the phy type. + * If more than one media type, the ixgbe_media_type_unknown is returned. + * First, phy_type_low is checked, then phy_type_high. + * If none are identified, the ixgbe_media_type_unknown is returned + * + * Return: type of a media based on phy type in form of enum. + */ +static enum ixgbe_media_type +ixgbe_get_media_type_from_phy_type(struct ixgbe_hw *hw) +{ + struct ixgbe_link_status *hw_link_info; + + if (!hw) + return ixgbe_media_type_unknown; + + hw_link_info = &hw->link.link_info; + if (hw_link_info->phy_type_low && hw_link_info->phy_type_high) + /* If more than one media type is selected, report unknown */ + return ixgbe_media_type_unknown; + + if (hw_link_info->phy_type_low) { + /* 1G SGMII is a special case where some DA cable PHYs + * may show this as an option when it really shouldn't + * be since SGMII is meant to be between a MAC and a PHY + * in a backplane. Try to detect this case and handle it + */ + if (hw_link_info->phy_type_low == IXGBE_PHY_TYPE_LOW_1G_SGMII && + (hw_link_info->module_type[IXGBE_ACI_MOD_TYPE_IDENT] == + IXGBE_ACI_MOD_TYPE_BYTE1_SFP_PLUS_CU_ACTIVE || + hw_link_info->module_type[IXGBE_ACI_MOD_TYPE_IDENT] == + IXGBE_ACI_MOD_TYPE_BYTE1_SFP_PLUS_CU_PASSIVE)) + return ixgbe_media_type_da; + + switch (hw_link_info->phy_type_low) { + case IXGBE_PHY_TYPE_LOW_1000BASE_SX: + case IXGBE_PHY_TYPE_LOW_1000BASE_LX: + case IXGBE_PHY_TYPE_LOW_10GBASE_SR: + case IXGBE_PHY_TYPE_LOW_10GBASE_LR: + case IXGBE_PHY_TYPE_LOW_25GBASE_SR: + case IXGBE_PHY_TYPE_LOW_25GBASE_LR: + return ixgbe_media_type_fiber; + case IXGBE_PHY_TYPE_LOW_10G_SFI_AOC_ACC: + case IXGBE_PHY_TYPE_LOW_25G_AUI_AOC_ACC: + return ixgbe_media_type_fiber; + case IXGBE_PHY_TYPE_LOW_100BASE_TX: + case IXGBE_PHY_TYPE_LOW_1000BASE_T: + case IXGBE_PHY_TYPE_LOW_2500BASE_T: + case IXGBE_PHY_TYPE_LOW_5GBASE_T: + case IXGBE_PHY_TYPE_LOW_10GBASE_T: + case IXGBE_PHY_TYPE_LOW_25GBASE_T: + return ixgbe_media_type_copper; + case IXGBE_PHY_TYPE_LOW_10G_SFI_DA: + case IXGBE_PHY_TYPE_LOW_25GBASE_CR: + case IXGBE_PHY_TYPE_LOW_25GBASE_CR_S: + case IXGBE_PHY_TYPE_LOW_25GBASE_CR1: + return ixgbe_media_type_da; + case IXGBE_PHY_TYPE_LOW_25G_AUI_C2C: + if (ixgbe_is_media_cage_present(hw)) + return ixgbe_media_type_aui; + fallthrough; + case IXGBE_PHY_TYPE_LOW_1000BASE_KX: + case IXGBE_PHY_TYPE_LOW_2500BASE_KX: + case IXGBE_PHY_TYPE_LOW_2500BASE_X: + case IXGBE_PHY_TYPE_LOW_5GBASE_KR: + case IXGBE_PHY_TYPE_LOW_10GBASE_KR_CR1: + case IXGBE_PHY_TYPE_LOW_10G_SFI_C2C: + case IXGBE_PHY_TYPE_LOW_25GBASE_KR: + case IXGBE_PHY_TYPE_LOW_25GBASE_KR1: + case IXGBE_PHY_TYPE_LOW_25GBASE_KR_S: + return ixgbe_media_type_backplane; + } + } else { + switch (hw_link_info->phy_type_high) { + case IXGBE_PHY_TYPE_HIGH_10BASE_T: + return ixgbe_media_type_copper; + } + } + return ixgbe_media_type_unknown; +} + +/** + * ixgbe_update_link_info - update status of the HW network link + * @hw: pointer to the HW struct + * + * Update the status of the HW network link. + * + * Return: the exit code of the operation. + */ +int ixgbe_update_link_info(struct ixgbe_hw *hw) +{ + struct ixgbe_aci_cmd_get_phy_caps_data *pcaps; + struct ixgbe_link_status *li; + int err; + + if (!hw) + return -EINVAL; + + li = &hw->link.link_info; + + err = ixgbe_aci_get_link_info(hw, true, NULL); + if (err) + return err; + + if (!(li->link_info & IXGBE_ACI_MEDIA_AVAILABLE)) + return 0; + + pcaps = kzalloc(sizeof(*pcaps), GFP_KERNEL); + if (!pcaps) + return -ENOMEM; + + err = ixgbe_aci_get_phy_caps(hw, false, IXGBE_ACI_REPORT_TOPO_CAP_MEDIA, + pcaps); + + if (!err) + memcpy(li->module_type, &pcaps->module_type, + sizeof(li->module_type)); + + kfree(pcaps); + + return err; +} + +/** + * ixgbe_get_link_status - get status of the HW network link + * @hw: pointer to the HW struct + * @link_up: pointer to bool (true/false = linkup/linkdown) + * + * Variable link_up is true if link is up, false if link is down. + * The variable link_up is invalid if status is non zero. As a + * result of this call, link status reporting becomes enabled + * + * Return: the exit code of the operation. + */ +int ixgbe_get_link_status(struct ixgbe_hw *hw, bool *link_up) +{ + if (!hw || !link_up) + return -EINVAL; + + if (hw->link.get_link_info) { + int err = ixgbe_update_link_info(hw); + + if (err) + return err; + } + + *link_up = hw->link.link_info.link_info & IXGBE_ACI_LINK_UP; + + return 0; +} + +/** + * ixgbe_aci_get_link_info - get the link status + * @hw: pointer to the HW struct + * @ena_lse: enable/disable LinkStatusEvent reporting + * @link: pointer to link status structure - optional + * + * Get the current Link Status using ACI command (0x607). + * The current link can be optionally provided to update + * the status. + * + * Return: the link status of the adapter. + */ +int ixgbe_aci_get_link_info(struct ixgbe_hw *hw, bool ena_lse, + struct ixgbe_link_status *link) +{ + struct ixgbe_aci_cmd_get_link_status_data link_data = {}; + struct ixgbe_aci_cmd_get_link_status *resp; + struct ixgbe_link_status *li_old, *li; + struct ixgbe_fc_info *hw_fc_info; + struct ixgbe_aci_desc desc; + bool tx_pause, rx_pause; + u8 cmd_flags; + int err; + + if (!hw) + return -EINVAL; + + li_old = &hw->link.link_info_old; + li = &hw->link.link_info; + hw_fc_info = &hw->fc; + + ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_get_link_status); + cmd_flags = (ena_lse) ? IXGBE_ACI_LSE_ENA : IXGBE_ACI_LSE_DIS; + resp = &desc.params.get_link_status; + resp->cmd_flags = cpu_to_le16(cmd_flags); + resp->lport_num = hw->bus.func; + + err = ixgbe_aci_send_cmd(hw, &desc, &link_data, sizeof(link_data)); + if (err) + return err; + + /* Save off old link status information. */ + *li_old = *li; + + /* Update current link status information. */ + li->link_speed = le16_to_cpu(link_data.link_speed); + li->phy_type_low = le64_to_cpu(link_data.phy_type_low); + li->phy_type_high = le64_to_cpu(link_data.phy_type_high); + li->link_info = link_data.link_info; + li->link_cfg_err = link_data.link_cfg_err; + li->an_info = link_data.an_info; + li->ext_info = link_data.ext_info; + li->max_frame_size = le16_to_cpu(link_data.max_frame_size); + li->fec_info = link_data.cfg & IXGBE_ACI_FEC_MASK; + li->topo_media_conflict = link_data.topo_media_conflict; + li->pacing = link_data.cfg & (IXGBE_ACI_CFG_PACING_M | + IXGBE_ACI_CFG_PACING_TYPE_M); + + /* Update fc info. */ + tx_pause = !!(link_data.an_info & IXGBE_ACI_LINK_PAUSE_TX); + rx_pause = !!(link_data.an_info & IXGBE_ACI_LINK_PAUSE_RX); + if (tx_pause && rx_pause) + hw_fc_info->current_mode = ixgbe_fc_full; + else if (tx_pause) + hw_fc_info->current_mode = ixgbe_fc_tx_pause; + else if (rx_pause) + hw_fc_info->current_mode = ixgbe_fc_rx_pause; + else + hw_fc_info->current_mode = ixgbe_fc_none; + + li->lse_ena = !!(le16_to_cpu(resp->cmd_flags) & + IXGBE_ACI_LSE_IS_ENABLED); + + /* Save link status information. */ + if (link) + *link = *li; + + /* Flag cleared so calling functions don't call AQ again. */ + hw->link.get_link_info = false; + + return 0; +} + +/** + * ixgbe_aci_set_event_mask - set event mask + * @hw: pointer to the HW struct + * @port_num: port number of the physical function + * @mask: event mask to be set + * + * Set the event mask using ACI command (0x0613). + * + * Return: the exit code of the operation. + */ +int ixgbe_aci_set_event_mask(struct ixgbe_hw *hw, u8 port_num, u16 mask) +{ + struct ixgbe_aci_cmd_set_event_mask *cmd; + struct ixgbe_aci_desc desc; + + cmd = &desc.params.set_event_mask; + + ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_set_event_mask); + + cmd->lport_num = port_num; + + cmd->event_mask = cpu_to_le16(mask); + return ixgbe_aci_send_cmd(hw, &desc, NULL, 0); +} + +/** + * ixgbe_configure_lse - enable/disable link status events + * @hw: pointer to the HW struct + * @activate: true for enable lse, false otherwise + * @mask: event mask to be set; a set bit means deactivation of the + * corresponding event + * + * Set the event mask and then enable or disable link status events + * + * Return: the exit code of the operation. + */ +int ixgbe_configure_lse(struct ixgbe_hw *hw, bool activate, u16 mask) +{ + int err; + + err = ixgbe_aci_set_event_mask(hw, (u8)hw->bus.func, mask); + if (err) + return err; + + /* Enabling link status events generation by fw. */ + return ixgbe_aci_get_link_info(hw, activate, NULL); +} + +/** + * ixgbe_get_media_type_e610 - Gets media type + * @hw: pointer to the HW struct + * + * In order to get the media type, the function gets PHY + * capabilities and later on use them to identify the PHY type + * checking phy_type_high and phy_type_low. + * + * Return: the type of media in form of ixgbe_media_type enum + * or ixgbe_media_type_unknown in case of an error. + */ +enum ixgbe_media_type ixgbe_get_media_type_e610(struct ixgbe_hw *hw) +{ + struct ixgbe_aci_cmd_get_phy_caps_data pcaps; + int rc; + + rc = ixgbe_update_link_info(hw); + if (rc) + return ixgbe_media_type_unknown; + + /* If there is no link but PHY (dongle) is available SW should use + * Get PHY Caps admin command instead of Get Link Status, find most + * significant bit that is set in PHY types reported by the command + * and use it to discover media type. + */ + if (!(hw->link.link_info.link_info & IXGBE_ACI_LINK_UP) && + (hw->link.link_info.link_info & IXGBE_ACI_MEDIA_AVAILABLE)) { + int highest_bit; + + /* Get PHY Capabilities */ + rc = ixgbe_aci_get_phy_caps(hw, false, + IXGBE_ACI_REPORT_TOPO_CAP_MEDIA, + &pcaps); + if (rc) + return ixgbe_media_type_unknown; + + highest_bit = fls64(le64_to_cpu(pcaps.phy_type_high)); + if (highest_bit) { + hw->link.link_info.phy_type_high = + BIT_ULL(highest_bit - 1); + hw->link.link_info.phy_type_low = 0; + } else { + highest_bit = fls64(le64_to_cpu(pcaps.phy_type_low)); + if (highest_bit) + hw->link.link_info.phy_type_low = + BIT_ULL(highest_bit - 1); + } + } + + /* Based on link status or search above try to discover media type. */ + hw->phy.media_type = ixgbe_get_media_type_from_phy_type(hw); + + return hw->phy.media_type; +} + +/** + * ixgbe_setup_link_e610 - Set up link + * @hw: pointer to hardware structure + * @speed: new link speed + * @autoneg_wait: true when waiting for completion is needed + * + * Set up the link with the specified speed. + * + * Return: the exit code of the operation. + */ +int ixgbe_setup_link_e610(struct ixgbe_hw *hw, ixgbe_link_speed speed, + bool autoneg_wait) +{ + /* Simply request FW to perform proper PHY setup */ + return hw->phy.ops.setup_link_speed(hw, speed, autoneg_wait); +} + +/** + * ixgbe_check_link_e610 - Determine link and speed status + * @hw: pointer to hardware structure + * @speed: pointer to link speed + * @link_up: true when link is up + * @link_up_wait_to_complete: bool used to wait for link up or not + * + * Determine if the link is up and the current link speed + * using ACI command (0x0607). + * + * Return: the exit code of the operation. + */ +int ixgbe_check_link_e610(struct ixgbe_hw *hw, ixgbe_link_speed *speed, + bool *link_up, bool link_up_wait_to_complete) +{ + int err; + u32 i; + + if (!speed || !link_up) + return -EINVAL; + + /* Set get_link_info flag to ensure that fresh + * link information will be obtained from FW + * by sending Get Link Status admin command. + */ + hw->link.get_link_info = true; + + /* Update link information in adapter context. */ + err = ixgbe_get_link_status(hw, link_up); + if (err) + return err; + + /* Wait for link up if it was requested. */ + if (link_up_wait_to_complete && !(*link_up)) { + for (i = 0; i < hw->mac.max_link_up_time; i++) { + msleep(100); + hw->link.get_link_info = true; + err = ixgbe_get_link_status(hw, link_up); + if (err) + return err; + if (*link_up) + break; + } + } + + /* Use link information in adapter context updated by the call + * to ixgbe_get_link_status() to determine current link speed. + * Link speed information is valid only when link up was + * reported by FW. + */ + if (*link_up) { + switch (hw->link.link_info.link_speed) { + case IXGBE_ACI_LINK_SPEED_10MB: + *speed = IXGBE_LINK_SPEED_10_FULL; + break; + case IXGBE_ACI_LINK_SPEED_100MB: + *speed = IXGBE_LINK_SPEED_100_FULL; + break; + case IXGBE_ACI_LINK_SPEED_1000MB: + *speed = IXGBE_LINK_SPEED_1GB_FULL; + break; + case IXGBE_ACI_LINK_SPEED_2500MB: + *speed = IXGBE_LINK_SPEED_2_5GB_FULL; + break; + case IXGBE_ACI_LINK_SPEED_5GB: + *speed = IXGBE_LINK_SPEED_5GB_FULL; + break; + case IXGBE_ACI_LINK_SPEED_10GB: + *speed = IXGBE_LINK_SPEED_10GB_FULL; + break; + default: + *speed = IXGBE_LINK_SPEED_UNKNOWN; + break; + } + } else { + *speed = IXGBE_LINK_SPEED_UNKNOWN; + } + + return 0; +} + +/** + * ixgbe_get_link_capabilities_e610 - Determine link capabilities + * @hw: pointer to hardware structure + * @speed: pointer to link speed + * @autoneg: true when autoneg or autotry is enabled + * + * Determine speed and AN parameters of a link. + * + * Return: the exit code of the operation. + */ +int ixgbe_get_link_capabilities_e610(struct ixgbe_hw *hw, + ixgbe_link_speed *speed, + bool *autoneg) +{ + if (!speed || !autoneg) + return -EINVAL; + + *autoneg = true; + *speed = hw->phy.speeds_supported; + + return 0; +} + +/** + * ixgbe_cfg_phy_fc - Configure PHY Flow Control (FC) data based on FC mode + * @hw: pointer to hardware structure + * @cfg: PHY configuration data to set FC mode + * @req_mode: FC mode to configure + * + * Configures PHY Flow Control according to the provided configuration. + * + * Return: the exit code of the operation. + */ +int ixgbe_cfg_phy_fc(struct ixgbe_hw *hw, + struct ixgbe_aci_cmd_set_phy_cfg_data *cfg, + enum ixgbe_fc_mode req_mode) +{ + u8 pause_mask = 0x0; + + if (!cfg) + return -EINVAL; + + switch (req_mode) { + case ixgbe_fc_full: + pause_mask |= IXGBE_ACI_PHY_EN_TX_LINK_PAUSE; + pause_mask |= IXGBE_ACI_PHY_EN_RX_LINK_PAUSE; + break; + case ixgbe_fc_rx_pause: + pause_mask |= IXGBE_ACI_PHY_EN_RX_LINK_PAUSE; + break; + case ixgbe_fc_tx_pause: + pause_mask |= IXGBE_ACI_PHY_EN_TX_LINK_PAUSE; + break; + default: + break; + } + + /* Clear the old pause settings. */ + cfg->caps &= ~(IXGBE_ACI_PHY_EN_TX_LINK_PAUSE | + IXGBE_ACI_PHY_EN_RX_LINK_PAUSE); + + /* Set the new capabilities. */ + cfg->caps |= pause_mask; + + return 0; +} + +/** + * ixgbe_setup_fc_e610 - Set up flow control + * @hw: pointer to hardware structure + * + * Set up flow control. This has to be done during init time. + * + * Return: the exit code of the operation. + */ +int ixgbe_setup_fc_e610(struct ixgbe_hw *hw) +{ + struct ixgbe_aci_cmd_get_phy_caps_data pcaps = {}; + struct ixgbe_aci_cmd_set_phy_cfg_data cfg = {}; + int err; + + /* Get the current PHY config */ + err = ixgbe_aci_get_phy_caps(hw, false, + IXGBE_ACI_REPORT_ACTIVE_CFG, &pcaps); + if (err) + return err; + + ixgbe_copy_phy_caps_to_cfg(&pcaps, &cfg); + + /* Configure the set PHY data */ + err = ixgbe_cfg_phy_fc(hw, &cfg, hw->fc.requested_mode); + if (err) + return err; + + /* If the capabilities have changed, then set the new config */ + if (cfg.caps != pcaps.caps) { + cfg.caps |= IXGBE_ACI_PHY_ENA_AUTO_LINK_UPDT; + + err = ixgbe_aci_set_phy_cfg(hw, &cfg); + if (err) + return err; + } + + return err; +} + +/** + * ixgbe_fc_autoneg_e610 - Configure flow control + * @hw: pointer to hardware structure + * + * Configure Flow Control. + */ +void ixgbe_fc_autoneg_e610(struct ixgbe_hw *hw) +{ + int err; + + /* Get current link err. + * Current FC mode will be stored in the hw context. + */ + err = ixgbe_aci_get_link_info(hw, false, NULL); + if (err) + goto no_autoneg; + + /* Check if the link is up */ + if (!(hw->link.link_info.link_info & IXGBE_ACI_LINK_UP)) + goto no_autoneg; + + /* Check if auto-negotiation has completed */ + if (!(hw->link.link_info.an_info & IXGBE_ACI_AN_COMPLETED)) + goto no_autoneg; + + hw->fc.fc_was_autonegged = true; + return; + +no_autoneg: + hw->fc.fc_was_autonegged = false; + hw->fc.current_mode = hw->fc.requested_mode; +} + +/** + * ixgbe_disable_rx_e610 - Disable RX unit + * @hw: pointer to hardware structure + * + * Disable RX DMA unit on E610 with use of ACI command (0x000C). + * + * Return: the exit code of the operation. + */ +void ixgbe_disable_rx_e610(struct ixgbe_hw *hw) +{ + u32 rxctrl = IXGBE_READ_REG(hw, IXGBE_RXCTRL); + u32 pfdtxgswc; + int err; + + if (!(rxctrl & IXGBE_RXCTRL_RXEN)) + return; + + pfdtxgswc = IXGBE_READ_REG(hw, IXGBE_PFDTXGSWC); + if (pfdtxgswc & IXGBE_PFDTXGSWC_VT_LBEN) { + pfdtxgswc &= ~IXGBE_PFDTXGSWC_VT_LBEN; + IXGBE_WRITE_REG(hw, IXGBE_PFDTXGSWC, pfdtxgswc); + hw->mac.set_lben = true; + } else { + hw->mac.set_lben = false; + } + + err = ixgbe_aci_disable_rxen(hw); + + /* If we fail - disable RX using register write */ + if (err) { + rxctrl = IXGBE_READ_REG(hw, IXGBE_RXCTRL); + if (rxctrl & IXGBE_RXCTRL_RXEN) { + rxctrl &= ~IXGBE_RXCTRL_RXEN; + IXGBE_WRITE_REG(hw, IXGBE_RXCTRL, rxctrl); + } + } +} + +/** + * ixgbe_init_phy_ops_e610 - PHY specific init + * @hw: pointer to hardware structure + * + * Initialize any function pointers that were not able to be + * set during init_shared_code because the PHY type was not known. + * + * Return: the exit code of the operation. + */ +int ixgbe_init_phy_ops_e610(struct ixgbe_hw *hw) +{ + struct ixgbe_mac_info *mac = &hw->mac; + struct ixgbe_phy_info *phy = &hw->phy; + + if (mac->ops.get_media_type(hw) == ixgbe_media_type_copper) + phy->ops.set_phy_power = ixgbe_set_phy_power_e610; + else + phy->ops.set_phy_power = NULL; + + /* Identify the PHY */ + return phy->ops.identify(hw); +} + +/** + * ixgbe_identify_phy_e610 - Identify PHY + * @hw: pointer to hardware structure + * + * Determine PHY type, supported speeds and PHY ID. + * + * Return: the exit code of the operation. + */ +int ixgbe_identify_phy_e610(struct ixgbe_hw *hw) +{ + struct ixgbe_aci_cmd_get_phy_caps_data pcaps; + u64 phy_type_low, phy_type_high; + int err; + + /* Set PHY type */ + hw->phy.type = ixgbe_phy_fw; + + err = ixgbe_aci_get_phy_caps(hw, false, + IXGBE_ACI_REPORT_TOPO_CAP_MEDIA, &pcaps); + if (err) + return err; + + if (!(pcaps.module_compliance_enforcement & + IXGBE_ACI_MOD_ENFORCE_STRICT_MODE)) { + /* Handle lenient mode */ + err = ixgbe_aci_get_phy_caps(hw, false, + IXGBE_ACI_REPORT_TOPO_CAP_NO_MEDIA, + &pcaps); + if (err) + return err; + } + + /* Determine supported speeds */ + hw->phy.speeds_supported = IXGBE_LINK_SPEED_UNKNOWN; + phy_type_high = le64_to_cpu(pcaps.phy_type_high); + phy_type_low = le64_to_cpu(pcaps.phy_type_low); + + if (phy_type_high & IXGBE_PHY_TYPE_HIGH_10BASE_T || + phy_type_high & IXGBE_PHY_TYPE_HIGH_10M_SGMII) + hw->phy.speeds_supported |= IXGBE_LINK_SPEED_10_FULL; + if (phy_type_low & IXGBE_PHY_TYPE_LOW_100BASE_TX || + phy_type_low & IXGBE_PHY_TYPE_LOW_100M_SGMII || + phy_type_high & IXGBE_PHY_TYPE_HIGH_100M_USXGMII) + hw->phy.speeds_supported |= IXGBE_LINK_SPEED_100_FULL; + if (phy_type_low & IXGBE_PHY_TYPE_LOW_1000BASE_T || + phy_type_low & IXGBE_PHY_TYPE_LOW_1000BASE_SX || + phy_type_low & IXGBE_PHY_TYPE_LOW_1000BASE_LX || + phy_type_low & IXGBE_PHY_TYPE_LOW_1000BASE_KX || + phy_type_low & IXGBE_PHY_TYPE_LOW_1G_SGMII || + phy_type_high & IXGBE_PHY_TYPE_HIGH_1G_USXGMII) + hw->phy.speeds_supported |= IXGBE_LINK_SPEED_1GB_FULL; + if (phy_type_low & IXGBE_PHY_TYPE_LOW_10GBASE_T || + phy_type_low & IXGBE_PHY_TYPE_LOW_10G_SFI_DA || + phy_type_low & IXGBE_PHY_TYPE_LOW_10GBASE_SR || + phy_type_low & IXGBE_PHY_TYPE_LOW_10GBASE_LR || + phy_type_low & IXGBE_PHY_TYPE_LOW_10GBASE_KR_CR1 || + phy_type_low & IXGBE_PHY_TYPE_LOW_10G_SFI_AOC_ACC || + phy_type_low & IXGBE_PHY_TYPE_LOW_10G_SFI_C2C || + phy_type_high & IXGBE_PHY_TYPE_HIGH_10G_USXGMII) + hw->phy.speeds_supported |= IXGBE_LINK_SPEED_10GB_FULL; + + /* 2.5 and 5 Gbps link speeds must be excluded from the + * auto-negotiation set used during driver initialization due to + * compatibility issues with certain switches. Those issues do not + * exist in case of E610 2.5G SKU device (0x57b1). + */ + if (!hw->phy.autoneg_advertised && + hw->device_id != IXGBE_DEV_ID_E610_2_5G_T) + hw->phy.autoneg_advertised = hw->phy.speeds_supported; + + if (phy_type_low & IXGBE_PHY_TYPE_LOW_2500BASE_T || + phy_type_low & IXGBE_PHY_TYPE_LOW_2500BASE_X || + phy_type_low & IXGBE_PHY_TYPE_LOW_2500BASE_KX || + phy_type_high & IXGBE_PHY_TYPE_HIGH_2500M_SGMII || + phy_type_high & IXGBE_PHY_TYPE_HIGH_2500M_USXGMII) + hw->phy.speeds_supported |= IXGBE_LINK_SPEED_2_5GB_FULL; + + if (!hw->phy.autoneg_advertised && + hw->device_id == IXGBE_DEV_ID_E610_2_5G_T) + hw->phy.autoneg_advertised = hw->phy.speeds_supported; + + if (phy_type_low & IXGBE_PHY_TYPE_LOW_5GBASE_T || + phy_type_low & IXGBE_PHY_TYPE_LOW_5GBASE_KR || + phy_type_high & IXGBE_PHY_TYPE_HIGH_5G_USXGMII) + hw->phy.speeds_supported |= IXGBE_LINK_SPEED_5GB_FULL; + + /* Set PHY ID */ + memcpy(&hw->phy.id, pcaps.phy_id_oui, sizeof(u32)); + + hw->phy.eee_speeds_supported = IXGBE_LINK_SPEED_10_FULL | + IXGBE_LINK_SPEED_100_FULL | + IXGBE_LINK_SPEED_1GB_FULL; + hw->phy.eee_speeds_advertised = hw->phy.eee_speeds_supported; + + return 0; +} + +/** + * ixgbe_identify_module_e610 - Identify SFP module type + * @hw: pointer to hardware structure + * + * Identify the SFP module type. + * + * Return: the exit code of the operation. + */ +int ixgbe_identify_module_e610(struct ixgbe_hw *hw) +{ + bool media_available; + u8 module_type; + int err; + + err = ixgbe_update_link_info(hw); + if (err) + return err; + + media_available = + (hw->link.link_info.link_info & IXGBE_ACI_MEDIA_AVAILABLE); + + if (media_available) { + hw->phy.sfp_type = ixgbe_sfp_type_unknown; + + /* Get module type from hw context updated by + * ixgbe_update_link_info() + */ + module_type = hw->link.link_info.module_type[IXGBE_ACI_MOD_TYPE_IDENT]; + + if ((module_type & IXGBE_ACI_MOD_TYPE_BYTE1_SFP_PLUS_CU_PASSIVE) || + (module_type & IXGBE_ACI_MOD_TYPE_BYTE1_SFP_PLUS_CU_ACTIVE)) { + hw->phy.sfp_type = ixgbe_sfp_type_da_cu; + } else if (module_type & IXGBE_ACI_MOD_TYPE_BYTE1_10G_BASE_SR) { + hw->phy.sfp_type = ixgbe_sfp_type_sr; + } else if ((module_type & IXGBE_ACI_MOD_TYPE_BYTE1_10G_BASE_LR) || + (module_type & IXGBE_ACI_MOD_TYPE_BYTE1_10G_BASE_LRM)) { + hw->phy.sfp_type = ixgbe_sfp_type_lr; + } + } else { + hw->phy.sfp_type = ixgbe_sfp_type_not_present; + return -ENOENT; + } + + return 0; +} + +/** + * ixgbe_setup_phy_link_e610 - Sets up firmware-controlled PHYs + * @hw: pointer to hardware structure + * + * Set the parameters for the firmware-controlled PHYs. + * + * Return: the exit code of the operation. + */ +int ixgbe_setup_phy_link_e610(struct ixgbe_hw *hw) +{ + struct ixgbe_aci_cmd_get_phy_caps_data pcaps; + struct ixgbe_aci_cmd_set_phy_cfg_data pcfg; + u8 rmode = IXGBE_ACI_REPORT_TOPO_CAP_MEDIA; + u64 sup_phy_type_low, sup_phy_type_high; + u64 phy_type_low = 0, phy_type_high = 0; + int err; + + err = ixgbe_aci_get_link_info(hw, false, NULL); + if (err) + return err; + + /* If media is not available get default config. */ + if (!(hw->link.link_info.link_info & IXGBE_ACI_MEDIA_AVAILABLE)) + rmode = IXGBE_ACI_REPORT_DFLT_CFG; + + err = ixgbe_aci_get_phy_caps(hw, false, rmode, &pcaps); + if (err) + return err; + + sup_phy_type_low = le64_to_cpu(pcaps.phy_type_low); + sup_phy_type_high = le64_to_cpu(pcaps.phy_type_high); + + /* Get Active configuration to avoid unintended changes. */ + err = ixgbe_aci_get_phy_caps(hw, false, IXGBE_ACI_REPORT_ACTIVE_CFG, + &pcaps); + if (err) + return err; + + ixgbe_copy_phy_caps_to_cfg(&pcaps, &pcfg); + + if (hw->phy.autoneg_advertised & IXGBE_LINK_SPEED_10_FULL) { + phy_type_high |= IXGBE_PHY_TYPE_HIGH_10BASE_T; + phy_type_high |= IXGBE_PHY_TYPE_HIGH_10M_SGMII; + } + if (hw->phy.autoneg_advertised & IXGBE_LINK_SPEED_100_FULL) { + phy_type_low |= IXGBE_PHY_TYPE_LOW_100BASE_TX; + phy_type_low |= IXGBE_PHY_TYPE_LOW_100M_SGMII; + phy_type_high |= IXGBE_PHY_TYPE_HIGH_100M_USXGMII; + } + if (hw->phy.autoneg_advertised & IXGBE_LINK_SPEED_1GB_FULL) { + phy_type_low |= IXGBE_PHY_TYPE_LOW_1000BASE_T; + phy_type_low |= IXGBE_PHY_TYPE_LOW_1000BASE_SX; + phy_type_low |= IXGBE_PHY_TYPE_LOW_1000BASE_LX; + phy_type_low |= IXGBE_PHY_TYPE_LOW_1000BASE_KX; + phy_type_low |= IXGBE_PHY_TYPE_LOW_1G_SGMII; + phy_type_high |= IXGBE_PHY_TYPE_HIGH_1G_USXGMII; + } + if (hw->phy.autoneg_advertised & IXGBE_LINK_SPEED_2_5GB_FULL) { + phy_type_low |= IXGBE_PHY_TYPE_LOW_2500BASE_T; + phy_type_low |= IXGBE_PHY_TYPE_LOW_2500BASE_X; + phy_type_low |= IXGBE_PHY_TYPE_LOW_2500BASE_KX; + phy_type_high |= IXGBE_PHY_TYPE_HIGH_2500M_SGMII; + phy_type_high |= IXGBE_PHY_TYPE_HIGH_2500M_USXGMII; + } + if (hw->phy.autoneg_advertised & IXGBE_LINK_SPEED_5GB_FULL) { + phy_type_low |= IXGBE_PHY_TYPE_LOW_5GBASE_T; + phy_type_low |= IXGBE_PHY_TYPE_LOW_5GBASE_KR; + phy_type_high |= IXGBE_PHY_TYPE_HIGH_5G_USXGMII; + } + if (hw->phy.autoneg_advertised & IXGBE_LINK_SPEED_10GB_FULL) { + phy_type_low |= IXGBE_PHY_TYPE_LOW_10GBASE_T; + phy_type_low |= IXGBE_PHY_TYPE_LOW_10G_SFI_DA; + phy_type_low |= IXGBE_PHY_TYPE_LOW_10GBASE_SR; + phy_type_low |= IXGBE_PHY_TYPE_LOW_10GBASE_LR; + phy_type_low |= IXGBE_PHY_TYPE_LOW_10GBASE_KR_CR1; + phy_type_low |= IXGBE_PHY_TYPE_LOW_10G_SFI_AOC_ACC; + phy_type_low |= IXGBE_PHY_TYPE_LOW_10G_SFI_C2C; + phy_type_high |= IXGBE_PHY_TYPE_HIGH_10G_USXGMII; + } + + /* Mask the set values to avoid requesting unsupported link types. */ + phy_type_low &= sup_phy_type_low; + pcfg.phy_type_low = cpu_to_le64(phy_type_low); + phy_type_high &= sup_phy_type_high; + pcfg.phy_type_high = cpu_to_le64(phy_type_high); + + if (pcfg.phy_type_high != pcaps.phy_type_high || + pcfg.phy_type_low != pcaps.phy_type_low || + pcfg.caps != pcaps.caps) { + pcfg.caps |= IXGBE_ACI_PHY_ENA_LINK; + pcfg.caps |= IXGBE_ACI_PHY_ENA_AUTO_LINK_UPDT; + + err = ixgbe_aci_set_phy_cfg(hw, &pcfg); + if (err) + return err; + } + + return 0; +} + +/** + * ixgbe_set_phy_power_e610 - Control power for copper PHY + * @hw: pointer to hardware structure + * @on: true for on, false for off + * + * Set the power on/off of the PHY + * by getting its capabilities and setting the appropriate + * configuration parameters. + * + * Return: the exit code of the operation. + */ +int ixgbe_set_phy_power_e610(struct ixgbe_hw *hw, bool on) +{ + struct ixgbe_aci_cmd_get_phy_caps_data phy_caps = {}; + struct ixgbe_aci_cmd_set_phy_cfg_data phy_cfg = {}; + int err; + + err = ixgbe_aci_get_phy_caps(hw, false, + IXGBE_ACI_REPORT_ACTIVE_CFG, + &phy_caps); + if (err) + return err; + + ixgbe_copy_phy_caps_to_cfg(&phy_caps, &phy_cfg); + + if (on) + phy_cfg.caps &= ~IXGBE_ACI_PHY_ENA_LOW_POWER; + else + phy_cfg.caps |= IXGBE_ACI_PHY_ENA_LOW_POWER; + + /* PHY is already in requested power mode. */ + if (phy_caps.caps == phy_cfg.caps) + return 0; + + phy_cfg.caps |= IXGBE_ACI_PHY_ENA_LINK; + phy_cfg.caps |= IXGBE_ACI_PHY_ENA_AUTO_LINK_UPDT; + + return ixgbe_aci_set_phy_cfg(hw, &phy_cfg); +} + +/** + * ixgbe_enter_lplu_e610 - Transition to low power states + * @hw: pointer to hardware structure + * + * Configures Low Power Link Up on transition to low power states + * (from D0 to non-D0). Link is required to enter LPLU so avoid resetting the + * X557 PHY immediately prior to entering LPLU. + * + * Return: the exit code of the operation. + */ +int ixgbe_enter_lplu_e610(struct ixgbe_hw *hw) +{ + struct ixgbe_aci_cmd_get_phy_caps_data phy_caps = {}; + struct ixgbe_aci_cmd_set_phy_cfg_data phy_cfg = {}; + int err; + + err = ixgbe_aci_get_phy_caps(hw, false, + IXGBE_ACI_REPORT_ACTIVE_CFG, + &phy_caps); + if (err) + return err; + + ixgbe_copy_phy_caps_to_cfg(&phy_caps, &phy_cfg); + + phy_cfg.low_power_ctrl_an |= IXGBE_ACI_PHY_EN_D3COLD_LOW_POWER_AUTONEG; + + return ixgbe_aci_set_phy_cfg(hw, &phy_cfg); +} + +/** + * ixgbe_aci_get_netlist_node - get a node handle + * @hw: pointer to the hw struct + * @cmd: get_link_topo AQ structure + * @node_part_number: output node part number if node found + * @node_handle: output node handle parameter if node found + * + * Get the netlist node and assigns it to + * the provided handle using ACI command (0x06E0). + * + * Return: the exit code of the operation. + */ +int ixgbe_aci_get_netlist_node(struct ixgbe_hw *hw, + struct ixgbe_aci_cmd_get_link_topo *cmd, + u8 *node_part_number, u16 *node_handle) +{ + struct ixgbe_aci_desc desc; + + ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_get_link_topo); + desc.params.get_link_topo = *cmd; + + if (ixgbe_aci_send_cmd(hw, &desc, NULL, 0)) + return -EOPNOTSUPP; + + if (node_handle) + *node_handle = + le16_to_cpu(desc.params.get_link_topo.addr.handle); + if (node_part_number) + *node_part_number = desc.params.get_link_topo.node_part_num; + + return 0; +} diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h index 5c5a6769b566d..4a4f969b2100b 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h @@ -27,5 +27,37 @@ int ixgbe_aci_get_phy_caps(struct ixgbe_hw *hw, bool qual_mods, u8 report_mode, struct ixgbe_aci_cmd_get_phy_caps_data *pcaps); void ixgbe_copy_phy_caps_to_cfg(struct ixgbe_aci_cmd_get_phy_caps_data *caps, struct ixgbe_aci_cmd_set_phy_cfg_data *cfg); +int ixgbe_aci_set_phy_cfg(struct ixgbe_hw *hw, + struct ixgbe_aci_cmd_set_phy_cfg_data *cfg); +int ixgbe_aci_set_link_restart_an(struct ixgbe_hw *hw, bool ena_link); +int ixgbe_update_link_info(struct ixgbe_hw *hw); +int ixgbe_get_link_status(struct ixgbe_hw *hw, bool *link_up); +int ixgbe_aci_get_link_info(struct ixgbe_hw *hw, bool ena_lse, + struct ixgbe_link_status *link); +int ixgbe_aci_set_event_mask(struct ixgbe_hw *hw, u8 port_num, u16 mask); +int ixgbe_configure_lse(struct ixgbe_hw *hw, bool activate, u16 mask); +enum ixgbe_media_type ixgbe_get_media_type_e610(struct ixgbe_hw *hw); +int ixgbe_setup_link_e610(struct ixgbe_hw *hw, ixgbe_link_speed speed, + bool autoneg_wait); +int ixgbe_check_link_e610(struct ixgbe_hw *hw, ixgbe_link_speed *speed, + bool *link_up, bool link_up_wait_to_complete); +int ixgbe_get_link_capabilities_e610(struct ixgbe_hw *hw, + ixgbe_link_speed *speed, + bool *autoneg); +int ixgbe_cfg_phy_fc(struct ixgbe_hw *hw, + struct ixgbe_aci_cmd_set_phy_cfg_data *cfg, + enum ixgbe_fc_mode req_mode); +int ixgbe_setup_fc_e610(struct ixgbe_hw *hw); +void ixgbe_fc_autoneg_e610(struct ixgbe_hw *hw); +void ixgbe_disable_rx_e610(struct ixgbe_hw *hw); +int ixgbe_init_phy_ops_e610(struct ixgbe_hw *hw); +int ixgbe_identify_phy_e610(struct ixgbe_hw *hw); +int ixgbe_identify_module_e610(struct ixgbe_hw *hw); +int ixgbe_setup_phy_link_e610(struct ixgbe_hw *hw); +int ixgbe_set_phy_power_e610(struct ixgbe_hw *hw, bool on); +int ixgbe_enter_lplu_e610(struct ixgbe_hw *hw); +int ixgbe_aci_get_netlist_node(struct ixgbe_hw *hw, + struct ixgbe_aci_cmd_get_link_topo *cmd, + u8 *node_part_number, u16 *node_handle); #endif /* _IXGBE_E610_H_ */ diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h index 5978cb06f732a..ecc3fc8c8d526 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h @@ -652,6 +652,7 @@ struct ixgbe_aci_cmd_link_topo_params { #define IXGBE_ACI_LINK_TOPO_NODE_TYPE_CLK_MUX 10 #define IXGBE_ACI_LINK_TOPO_NODE_TYPE_GPS 11 #define IXGBE_ACI_LINK_TOPO_NODE_CTX_S 4 +#define IXGBE_ACI_LINK_TOPO_NODE_CTX_M GENMASK(7, 4) #define IXGBE_ACI_LINK_TOPO_NODE_CTX_GLOBAL 0 #define IXGBE_ACI_LINK_TOPO_NODE_CTX_BOARD 1 #define IXGBE_ACI_LINK_TOPO_NODE_CTX_PORT 2 From d2483ebc9deb9de23fd85a2a45f9073ec9101f36 Mon Sep 17 00:00:00 2001 From: Piotr Kwapulinski Date: Thu, 5 Dec 2024 09:44:46 +0100 Subject: [PATCH 0862/1450] ixgbe: Add support for NVM handling in E610 device Add low level support for accessing NVM in E610 device. NVM operations are handled via the Admin Command Interface. Add the following NVM specific operations: - acquire, release, read - validate checksum - read shadow ram Co-developed-by: Stefan Wegrzyn Signed-off-by: Stefan Wegrzyn Co-developed-by: Jedrzej Jagielski Signed-off-by: Jedrzej Jagielski Reviewed-by: Michal Swiatkowski Reviewed-by: Simon Horman Tested-by: Bharath R Signed-off-by: Piotr Kwapulinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c | 291 ++++++++++++++++++ drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h | 12 + 2 files changed, 303 insertions(+) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c index 0b578c4006b1a..a35e28d992697 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c @@ -2105,3 +2105,294 @@ int ixgbe_aci_get_netlist_node(struct ixgbe_hw *hw, return 0; } + +/** + * ixgbe_acquire_nvm - Generic request for acquiring the NVM ownership + * @hw: pointer to the HW structure + * @access: NVM access type (read or write) + * + * Request NVM ownership. + * + * Return: the exit code of the operation. + */ +int ixgbe_acquire_nvm(struct ixgbe_hw *hw, + enum ixgbe_aci_res_access_type access) +{ + u32 fla; + + /* Skip if we are in blank NVM programming mode */ + fla = IXGBE_READ_REG(hw, IXGBE_GLNVM_FLA); + if ((fla & IXGBE_GLNVM_FLA_LOCKED_M) == 0) + return 0; + + return ixgbe_acquire_res(hw, IXGBE_NVM_RES_ID, access, + IXGBE_NVM_TIMEOUT); +} + +/** + * ixgbe_release_nvm - Generic request for releasing the NVM ownership + * @hw: pointer to the HW structure + * + * Release NVM ownership. + */ +void ixgbe_release_nvm(struct ixgbe_hw *hw) +{ + u32 fla; + + /* Skip if we are in blank NVM programming mode */ + fla = IXGBE_READ_REG(hw, IXGBE_GLNVM_FLA); + if ((fla & IXGBE_GLNVM_FLA_LOCKED_M) == 0) + return; + + ixgbe_release_res(hw, IXGBE_NVM_RES_ID); +} + +/** + * ixgbe_aci_read_nvm - read NVM + * @hw: pointer to the HW struct + * @module_typeid: module pointer location in words from the NVM beginning + * @offset: byte offset from the module beginning + * @length: length of the section to be read (in bytes from the offset) + * @data: command buffer (size [bytes] = length) + * @last_command: tells if this is the last command in a series + * @read_shadow_ram: tell if this is a shadow RAM read + * + * Read the NVM using ACI command (0x0701). + * + * Return: the exit code of the operation. + */ +int ixgbe_aci_read_nvm(struct ixgbe_hw *hw, u16 module_typeid, u32 offset, + u16 length, void *data, bool last_command, + bool read_shadow_ram) +{ + struct ixgbe_aci_cmd_nvm *cmd; + struct ixgbe_aci_desc desc; + + if (offset > IXGBE_ACI_NVM_MAX_OFFSET) + return -EINVAL; + + cmd = &desc.params.nvm; + + ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_nvm_read); + + if (!read_shadow_ram && module_typeid == IXGBE_ACI_NVM_START_POINT) + cmd->cmd_flags |= IXGBE_ACI_NVM_FLASH_ONLY; + + /* If this is the last command in a series, set the proper flag. */ + if (last_command) + cmd->cmd_flags |= IXGBE_ACI_NVM_LAST_CMD; + cmd->module_typeid = cpu_to_le16(module_typeid); + cmd->offset_low = cpu_to_le16(offset & 0xFFFF); + cmd->offset_high = (offset >> 16) & 0xFF; + cmd->length = cpu_to_le16(length); + + return ixgbe_aci_send_cmd(hw, &desc, data, length); +} + +/** + * ixgbe_nvm_validate_checksum - validate checksum + * @hw: pointer to the HW struct + * + * Verify NVM PFA checksum validity using ACI command (0x0706). + * If the checksum verification failed, IXGBE_ERR_NVM_CHECKSUM is returned. + * The function acquires and then releases the NVM ownership. + * + * Return: the exit code of the operation. + */ +int ixgbe_nvm_validate_checksum(struct ixgbe_hw *hw) +{ + struct ixgbe_aci_cmd_nvm_checksum *cmd; + struct ixgbe_aci_desc desc; + int err; + + err = ixgbe_acquire_nvm(hw, IXGBE_RES_READ); + if (err) + return err; + + cmd = &desc.params.nvm_checksum; + + ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_nvm_checksum); + cmd->flags = IXGBE_ACI_NVM_CHECKSUM_VERIFY; + + err = ixgbe_aci_send_cmd(hw, &desc, NULL, 0); + + ixgbe_release_nvm(hw); + + if (!err && cmd->checksum != + cpu_to_le16(IXGBE_ACI_NVM_CHECKSUM_CORRECT)) { + struct ixgbe_adapter *adapter = container_of(hw, struct ixgbe_adapter, + hw); + + err = -EIO; + netdev_err(adapter->netdev, "Invalid Shadow Ram checksum"); + } + + return err; +} + +/** + * ixgbe_read_sr_word_aci - Reads Shadow RAM via ACI + * @hw: pointer to the HW structure + * @offset: offset of the Shadow RAM word to read (0x000000 - 0x001FFF) + * @data: word read from the Shadow RAM + * + * Reads one 16 bit word from the Shadow RAM using ixgbe_read_flat_nvm. + * + * Return: the exit code of the operation. + */ +int ixgbe_read_sr_word_aci(struct ixgbe_hw *hw, u16 offset, u16 *data) +{ + u32 bytes = sizeof(u16); + u16 data_local; + int err; + + err = ixgbe_read_flat_nvm(hw, offset * sizeof(u16), &bytes, + (u8 *)&data_local, true); + if (err) + return err; + + *data = data_local; + return 0; +} + +/** + * ixgbe_read_flat_nvm - Read portion of NVM by flat offset + * @hw: pointer to the HW struct + * @offset: offset from beginning of NVM + * @length: (in) number of bytes to read; (out) number of bytes actually read + * @data: buffer to return data in (sized to fit the specified length) + * @read_shadow_ram: if true, read from shadow RAM instead of NVM + * + * Reads a portion of the NVM, as a flat memory space. This function correctly + * breaks read requests across Shadow RAM sectors, prevents Shadow RAM size + * from being exceeded in case of Shadow RAM read requests and ensures that no + * single read request exceeds the maximum 4KB read for a single admin command. + * + * Returns an error code on failure. Note that the data pointer may be + * partially updated if some reads succeed before a failure. + * + * Return: the exit code of the operation. + */ +int ixgbe_read_flat_nvm(struct ixgbe_hw *hw, u32 offset, u32 *length, + u8 *data, bool read_shadow_ram) +{ + u32 inlen = *length; + u32 bytes_read = 0; + bool last_cmd; + int err; + + /* Verify the length of the read if this is for the Shadow RAM */ + if (read_shadow_ram && ((offset + inlen) > + (hw->eeprom.word_size * 2u))) + return -EINVAL; + + do { + u32 read_size, sector_offset; + + /* ixgbe_aci_read_nvm cannot read more than 4KB at a time. + * Additionally, a read from the Shadow RAM may not cross over + * a sector boundary. Conveniently, the sector size is also 4KB. + */ + sector_offset = offset % IXGBE_ACI_MAX_BUFFER_SIZE; + read_size = min_t(u32, + IXGBE_ACI_MAX_BUFFER_SIZE - sector_offset, + inlen - bytes_read); + + last_cmd = !(bytes_read + read_size < inlen); + + /* ixgbe_aci_read_nvm takes the length as a u16. Our read_size + * is calculated using a u32, but the IXGBE_ACI_MAX_BUFFER_SIZE + * maximum size guarantees that it will fit within the 2 bytes. + */ + err = ixgbe_aci_read_nvm(hw, IXGBE_ACI_NVM_START_POINT, + offset, (u16)read_size, + data + bytes_read, last_cmd, + read_shadow_ram); + if (err) + break; + + bytes_read += read_size; + offset += read_size; + } while (!last_cmd); + + *length = bytes_read; + return err; +} + +/** + * ixgbe_read_ee_aci_e610 - Read EEPROM word using the admin command. + * @hw: pointer to hardware structure + * @offset: offset of word in the EEPROM to read + * @data: word read from the EEPROM + * + * Reads a 16 bit word from the EEPROM using the ACI. + * If the EEPROM params are not initialized, the function + * initialize them before proceeding with reading. + * The function acquires and then releases the NVM ownership. + * + * Return: the exit code of the operation. + */ +int ixgbe_read_ee_aci_e610(struct ixgbe_hw *hw, u16 offset, u16 *data) +{ + int err; + + if (hw->eeprom.type == ixgbe_eeprom_uninitialized) { + err = hw->eeprom.ops.init_params(hw); + if (err) + return err; + } + + err = ixgbe_acquire_nvm(hw, IXGBE_RES_READ); + if (err) + return err; + + err = ixgbe_read_sr_word_aci(hw, offset, data); + ixgbe_release_nvm(hw); + + return err; +} + +/** + * ixgbe_validate_eeprom_checksum_e610 - Validate EEPROM checksum + * @hw: pointer to hardware structure + * @checksum_val: calculated checksum + * + * Performs checksum calculation and validates the EEPROM checksum. If the + * caller does not need checksum_val, the value can be NULL. + * If the EEPROM params are not initialized, the function + * initialize them before proceeding. + * The function acquires and then releases the NVM ownership. + * + * Return: the exit code of the operation. + */ +int ixgbe_validate_eeprom_checksum_e610(struct ixgbe_hw *hw, u16 *checksum_val) +{ + int err; + + if (hw->eeprom.type == ixgbe_eeprom_uninitialized) { + err = hw->eeprom.ops.init_params(hw); + if (err) + return err; + } + + err = ixgbe_nvm_validate_checksum(hw); + if (err) + return err; + + if (checksum_val) { + u16 tmp_checksum; + + err = ixgbe_acquire_nvm(hw, IXGBE_RES_READ); + if (err) + return err; + + err = ixgbe_read_sr_word_aci(hw, E610_SR_SW_CHECKSUM_WORD, + &tmp_checksum); + ixgbe_release_nvm(hw); + + if (!err) + *checksum_val = tmp_checksum; + } + + return err; +} diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h index 4a4f969b2100b..412ddd123cd16 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h @@ -59,5 +59,17 @@ int ixgbe_enter_lplu_e610(struct ixgbe_hw *hw); int ixgbe_aci_get_netlist_node(struct ixgbe_hw *hw, struct ixgbe_aci_cmd_get_link_topo *cmd, u8 *node_part_number, u16 *node_handle); +int ixgbe_acquire_nvm(struct ixgbe_hw *hw, + enum ixgbe_aci_res_access_type access); +void ixgbe_release_nvm(struct ixgbe_hw *hw); +int ixgbe_aci_read_nvm(struct ixgbe_hw *hw, u16 module_typeid, u32 offset, + u16 length, void *data, bool last_command, + bool read_shadow_ram); +int ixgbe_nvm_validate_checksum(struct ixgbe_hw *hw); +int ixgbe_read_sr_word_aci(struct ixgbe_hw *hw, u16 offset, u16 *data); +int ixgbe_read_flat_nvm(struct ixgbe_hw *hw, u32 offset, u32 *length, + u8 *data, bool read_shadow_ram); +int ixgbe_read_ee_aci_e610(struct ixgbe_hw *hw, u16 offset, u16 *data); +int ixgbe_validate_eeprom_checksum_e610(struct ixgbe_hw *hw, u16 *checksum_val); #endif /* _IXGBE_E610_H_ */ From e5b132b4f4d97a4737d152df0f97906e542be7ee Mon Sep 17 00:00:00 2001 From: Piotr Kwapulinski Date: Thu, 5 Dec 2024 09:44:47 +0100 Subject: [PATCH 0863/1450] ixgbe: Add support for EEPROM dump in E610 device Add low level support for EEPROM dump for the specified network device. Co-developed-by: Stefan Wegrzyn Signed-off-by: Stefan Wegrzyn Reviewed-by: Przemek Kitszel Reviewed-by: Simon Horman Tested-by: Bharath R Signed-off-by: Piotr Kwapulinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c | 95 +++++++++++++++++++ drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h | 5 + .../ethernet/intel/ixgbe/ixgbe_type_e610.h | 7 ++ 3 files changed, 107 insertions(+) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c index a35e28d992697..6bf3562b3ce29 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c @@ -2073,6 +2073,38 @@ int ixgbe_enter_lplu_e610(struct ixgbe_hw *hw) return ixgbe_aci_set_phy_cfg(hw, &phy_cfg); } +/** + * ixgbe_init_eeprom_params_e610 - Initialize EEPROM params + * @hw: pointer to hardware structure + * + * Initialize the EEPROM parameters ixgbe_eeprom_info within the ixgbe_hw + * struct in order to set up EEPROM access. + * + * Return: the operation exit code. + */ +int ixgbe_init_eeprom_params_e610(struct ixgbe_hw *hw) +{ + struct ixgbe_eeprom_info *eeprom = &hw->eeprom; + u32 gens_stat; + u8 sr_size; + + if (eeprom->type != ixgbe_eeprom_uninitialized) + return 0; + + eeprom->type = ixgbe_flash; + + gens_stat = IXGBE_READ_REG(hw, GLNVM_GENS); + sr_size = FIELD_GET(GLNVM_GENS_SR_SIZE_M, gens_stat); + + /* Switching to words (sr_size contains power of 2). */ + eeprom->word_size = BIT(sr_size) * IXGBE_SR_WORDS_IN_1KB; + + hw_dbg(hw, "Eeprom params: type = %d, size = %d\n", eeprom->type, + eeprom->word_size); + + return 0; +} + /** * ixgbe_aci_get_netlist_node - get a node handle * @hw: pointer to the hw struct @@ -2319,6 +2351,36 @@ int ixgbe_read_flat_nvm(struct ixgbe_hw *hw, u32 offset, u32 *length, return err; } +/** + * ixgbe_read_sr_buf_aci - Read Shadow RAM buffer via ACI + * @hw: pointer to the HW structure + * @offset: offset of the Shadow RAM words to read (0x000000 - 0x001FFF) + * @words: (in) number of words to read; (out) number of words actually read + * @data: words read from the Shadow RAM + * + * Read 16 bit words (data buf) from the Shadow RAM. Acquire/release the NVM + * ownership. + * + * Return: the operation exit code. + */ +int ixgbe_read_sr_buf_aci(struct ixgbe_hw *hw, u16 offset, u16 *words, + u16 *data) +{ + u32 bytes = *words * 2; + int err; + + err = ixgbe_read_flat_nvm(hw, offset * 2, &bytes, (u8 *)data, true); + if (err) + return err; + + *words = bytes / 2; + + for (int i = 0; i < *words; i++) + data[i] = le16_to_cpu(((__le16 *)data)[i]); + + return 0; +} + /** * ixgbe_read_ee_aci_e610 - Read EEPROM word using the admin command. * @hw: pointer to hardware structure @@ -2352,6 +2414,39 @@ int ixgbe_read_ee_aci_e610(struct ixgbe_hw *hw, u16 offset, u16 *data) return err; } +/** + * ixgbe_read_ee_aci_buffer_e610 - Read EEPROM words via ACI + * @hw: pointer to hardware structure + * @offset: offset of words in the EEPROM to read + * @words: number of words to read + * @data: words to read from the EEPROM + * + * Read 16 bit words from the EEPROM via the ACI. Initialize the EEPROM params + * prior to the read. Acquire/release the NVM ownership. + * + * Return: the operation exit code. + */ +int ixgbe_read_ee_aci_buffer_e610(struct ixgbe_hw *hw, u16 offset, + u16 words, u16 *data) +{ + int err; + + if (hw->eeprom.type == ixgbe_eeprom_uninitialized) { + err = hw->eeprom.ops.init_params(hw); + if (err) + return err; + } + + err = ixgbe_acquire_nvm(hw, IXGBE_RES_READ); + if (err) + return err; + + err = ixgbe_read_sr_buf_aci(hw, offset, &words, data); + ixgbe_release_nvm(hw); + + return err; +} + /** * ixgbe_validate_eeprom_checksum_e610 - Validate EEPROM checksum * @hw: pointer to hardware structure diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h index 412ddd123cd16..9cfcfeec6e0b7 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h @@ -56,6 +56,7 @@ int ixgbe_identify_module_e610(struct ixgbe_hw *hw); int ixgbe_setup_phy_link_e610(struct ixgbe_hw *hw); int ixgbe_set_phy_power_e610(struct ixgbe_hw *hw, bool on); int ixgbe_enter_lplu_e610(struct ixgbe_hw *hw); +int ixgbe_init_eeprom_params_e610(struct ixgbe_hw *hw); int ixgbe_aci_get_netlist_node(struct ixgbe_hw *hw, struct ixgbe_aci_cmd_get_link_topo *cmd, u8 *node_part_number, u16 *node_handle); @@ -69,7 +70,11 @@ int ixgbe_nvm_validate_checksum(struct ixgbe_hw *hw); int ixgbe_read_sr_word_aci(struct ixgbe_hw *hw, u16 offset, u16 *data); int ixgbe_read_flat_nvm(struct ixgbe_hw *hw, u32 offset, u32 *length, u8 *data, bool read_shadow_ram); +int ixgbe_read_sr_buf_aci(struct ixgbe_hw *hw, u16 offset, u16 *words, + u16 *data); int ixgbe_read_ee_aci_e610(struct ixgbe_hw *hw, u16 offset, u16 *data); +int ixgbe_read_ee_aci_buffer_e610(struct ixgbe_hw *hw, u16 offset, + u16 words, u16 *data); int ixgbe_validate_eeprom_checksum_e610(struct ixgbe_hw *hw, u16 *checksum_val); #endif /* _IXGBE_E610_H_ */ diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h index ecc3fc8c8d526..8d06ade3c7cd9 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h @@ -12,11 +12,18 @@ /* Checksum and Shadow RAM pointers */ #define E610_SR_SW_CHECKSUM_WORD 0x3F +/* Shadow RAM related */ +#define IXGBE_SR_WORDS_IN_1KB 512 + /* Firmware Status Register (GL_FWSTS) */ #define GL_FWSTS 0x00083048 /* Reset Source: POR */ #define GL_FWSTS_EP_PF0 BIT(24) #define GL_FWSTS_EP_PF1 BIT(25) +/* Global NVM General Status Register */ +#define GLNVM_GENS 0x000B6100 /* Reset Source: POR */ +#define GLNVM_GENS_SR_SIZE_M GENMASK(7, 5) + /* Flash Access Register */ #define IXGBE_GLNVM_FLA 0x000B6108 /* Reset Source: POR */ #define IXGBE_GLNVM_FLA_LOCKED_S 6 From a0834bd521eaf1f2014041a8ad5a0cb233ac4fda Mon Sep 17 00:00:00 2001 From: Piotr Kwapulinski Date: Thu, 5 Dec 2024 09:44:48 +0100 Subject: [PATCH 0864/1450] ixgbe: Add ixgbe_x540 multiple header inclusion protection Required to adopt x540 specific functions by E610 device. Reviewed-by: Simon Horman Tested-by: Bharath R Signed-off-by: Piotr Kwapulinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/ixgbe_x540.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.h index b69a680d3ab56..6ed360c5b605b 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.h @@ -1,5 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright(c) 1999 - 2018 Intel Corporation. */ +/* Copyright(c) 1999 - 2024 Intel Corporation. */ + +#ifndef _IXGBE_X540_H_ +#define _IXGBE_X540_H_ #include "ixgbe_type.h" @@ -17,3 +20,5 @@ int ixgbe_acquire_swfw_sync_X540(struct ixgbe_hw *hw, u32 mask); void ixgbe_release_swfw_sync_X540(struct ixgbe_hw *hw, u32 mask); void ixgbe_init_swfw_sync_X540(struct ixgbe_hw *hw); int ixgbe_init_eeprom_params_X540(struct ixgbe_hw *hw); + +#endif /* _IXGBE_X540_H_ */ From 34b41577077198953e156f5e4bf8cdf734485e1f Mon Sep 17 00:00:00 2001 From: Piotr Kwapulinski Date: Thu, 5 Dec 2024 09:44:49 +0100 Subject: [PATCH 0865/1450] ixgbe: Clean up the E610 link management related code Required for enabling the link management in E610 device. Reviewed-by: Simon Horman Tested-by: Bharath R Signed-off-by: Piotr Kwapulinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 17 +++++++++++------ drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c | 12 ++++++------ 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 13b777d702a22..2656f2617a8f9 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -237,6 +237,9 @@ static int ixgbe_get_parent_bus_info(struct ixgbe_adapter *adapter) * bandwidth details should be gathered from the parent bus instead of from the * device. Used to ensure that various locations all have the correct device ID * checks. + * + * Return: true if information should be collected from the parent bus, false + * otherwise */ static inline bool ixgbe_pcie_from_parent(struct ixgbe_hw *hw) { @@ -5533,7 +5536,9 @@ static void ixgbe_sfp_link_config(struct ixgbe_adapter *adapter) * ixgbe_non_sfp_link_config - set up non-SFP+ link * @hw: pointer to private hardware struct * - * Returns 0 on success, negative on failure + * Configure non-SFP link. + * + * Return: 0 on success, negative on failure **/ static int ixgbe_non_sfp_link_config(struct ixgbe_hw *hw) { @@ -7222,11 +7227,11 @@ void ixgbe_update_stats(struct ixgbe_adapter *adapter) for (i = 0; i < 16; i++) { hwstats->qptc[i] += IXGBE_READ_REG(hw, IXGBE_QPTC(i)); hwstats->qprc[i] += IXGBE_READ_REG(hw, IXGBE_QPRC(i)); - if ((hw->mac.type == ixgbe_mac_82599EB) || - (hw->mac.type == ixgbe_mac_X540) || - (hw->mac.type == ixgbe_mac_X550) || - (hw->mac.type == ixgbe_mac_X550EM_x) || - (hw->mac.type == ixgbe_mac_x550em_a)) { + if (hw->mac.type == ixgbe_mac_82599EB || + hw->mac.type == ixgbe_mac_X540 || + hw->mac.type == ixgbe_mac_X550 || + hw->mac.type == ixgbe_mac_X550EM_x || + hw->mac.type == ixgbe_mac_x550em_a) { hwstats->qbtc[i] += IXGBE_READ_REG(hw, IXGBE_QBTC_L(i)); IXGBE_READ_REG(hw, IXGBE_QBTC_H(i)); /* to clear */ hwstats->qbrc[i] += IXGBE_READ_REG(hw, IXGBE_QBRC_L(i)); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c index d9a8cf018d3bb..1de05443d3a28 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c @@ -3505,13 +3505,13 @@ static int ixgbe_reset_hw_X550em(struct ixgbe_hw *hw) return status; } -/** ixgbe_set_ethertype_anti_spoofing_X550 - Enable/Disable Ethertype +/** ixgbe_set_ethertype_anti_spoofing_x550 - Enable/Disable Ethertype * anti-spoofing * @hw: pointer to hardware structure * @enable: enable or disable switch for Ethertype anti-spoofing * @vf: Virtual Function pool - VF Pool to set for Ethertype anti-spoofing **/ -static void ixgbe_set_ethertype_anti_spoofing_X550(struct ixgbe_hw *hw, +static void ixgbe_set_ethertype_anti_spoofing_x550(struct ixgbe_hw *hw, bool enable, int vf) { int vf_target_reg = vf >> 3; @@ -3527,12 +3527,12 @@ static void ixgbe_set_ethertype_anti_spoofing_X550(struct ixgbe_hw *hw, IXGBE_WRITE_REG(hw, IXGBE_PFVFSPOOF(vf_target_reg), pfvfspoof); } -/** ixgbe_set_source_address_pruning_X550 - Enable/Disbale src address pruning +/** ixgbe_set_source_address_pruning_x550 - Enable/Disable src address pruning * @hw: pointer to hardware structure * @enable: enable or disable source address pruning * @pool: Rx pool to set source address pruning for **/ -static void ixgbe_set_source_address_pruning_X550(struct ixgbe_hw *hw, +static void ixgbe_set_source_address_pruning_x550(struct ixgbe_hw *hw, bool enable, unsigned int pool) { @@ -3831,9 +3831,9 @@ static int ixgbe_write_phy_reg_x550a(struct ixgbe_hw *hw, u32 reg_addr, .set_mac_anti_spoofing = &ixgbe_set_mac_anti_spoofing, \ .set_vlan_anti_spoofing = &ixgbe_set_vlan_anti_spoofing, \ .set_source_address_pruning = \ - &ixgbe_set_source_address_pruning_X550, \ + &ixgbe_set_source_address_pruning_x550, \ .set_ethertype_anti_spoofing = \ - &ixgbe_set_ethertype_anti_spoofing_X550, \ + &ixgbe_set_ethertype_anti_spoofing_x550, \ .disable_rx_buff = &ixgbe_disable_rx_buff_generic, \ .enable_rx_buff = &ixgbe_enable_rx_buff_generic, \ .get_thermal_sensor_data = NULL, \ From 4600cdf9f5aca3d2559d858c414e09cf64370da1 Mon Sep 17 00:00:00 2001 From: Piotr Kwapulinski Date: Thu, 5 Dec 2024 09:44:50 +0100 Subject: [PATCH 0866/1450] ixgbe: Enable link management in E610 device Add high level link management support for E610 device. Enable the following features: - driver load - bring up network interface - IP address assignment - pass traffic - show statistics (e.g. via ethtool) - disable network interface - driver unload Co-developed-by: Carolyn Wyborny Signed-off-by: Carolyn Wyborny Co-developed-by: Jedrzej Jagielski Signed-off-by: Jedrzej Jagielski Reviewed-by: Jan Glaza Reviewed-by: Simon Horman Tested-by: Bharath R Signed-off-by: Piotr Kwapulinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/ixgbe.h | 13 +- .../net/ethernet/intel/ixgbe/ixgbe_82599.c | 3 +- .../net/ethernet/intel/ixgbe/ixgbe_common.c | 19 +- .../net/ethernet/intel/ixgbe/ixgbe_dcb_nl.c | 3 +- drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c | 165 +++++++ drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h | 1 + .../net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 6 +- drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c | 3 +- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 414 +++++++++++++++++- drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.c | 4 +- drivers/net/ethernet/intel/ixgbe/ixgbe_phy.c | 5 +- drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c | 12 +- drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c | 21 +- drivers/net/ethernet/intel/ixgbe/ixgbe_x550.h | 20 + 14 files changed, 659 insertions(+), 30 deletions(-) create mode 100644 drivers/net/ethernet/intel/ixgbe/ixgbe_x550.h diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h index 559b443c409f7..e6a380d4929bd 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright(c) 1999 - 2018 Intel Corporation. */ +/* Copyright(c) 1999 - 2024 Intel Corporation. */ #ifndef _IXGBE_H_ #define _IXGBE_H_ @@ -20,6 +20,7 @@ #include "ixgbe_type.h" #include "ixgbe_common.h" #include "ixgbe_dcb.h" +#include "ixgbe_e610.h" #if IS_ENABLED(CONFIG_FCOE) #define IXGBE_FCOE #include "ixgbe_fcoe.h" @@ -173,6 +174,7 @@ enum ixgbe_tx_flags { #define VMDQ_P(p) ((p) + adapter->ring_feature[RING_F_VMDQ].offset) #define IXGBE_82599_VF_DEVICE_ID 0x10ED #define IXGBE_X540_VF_DEVICE_ID 0x1515 +#define IXGBE_E610_VF_DEVICE_ID 0x57AD #define UPDATE_VF_COUNTER_32bit(reg, last_counter, counter) \ { \ @@ -654,6 +656,7 @@ struct ixgbe_adapter { #define IXGBE_FLAG2_RSS_FIELD_IPV6_UDP BIT(9) #define IXGBE_FLAG2_PTP_PPS_ENABLED BIT(10) #define IXGBE_FLAG2_PHY_INTERRUPT BIT(11) +#define IXGBE_FLAG2_FW_ASYNC_EVENT BIT(12) #define IXGBE_FLAG2_VLAN_PROMISC BIT(13) #define IXGBE_FLAG2_EEE_CAPABLE BIT(14) #define IXGBE_FLAG2_EEE_ENABLED BIT(15) @@ -661,6 +664,9 @@ struct ixgbe_adapter { #define IXGBE_FLAG2_IPSEC_ENABLED BIT(17) #define IXGBE_FLAG2_VF_IPSEC_ENABLED BIT(18) #define IXGBE_FLAG2_AUTO_DISABLE_VF BIT(19) +#define IXGBE_FLAG2_PHY_FW_LOAD_FAILED BIT(20) +#define IXGBE_FLAG2_NO_MEDIA BIT(21) +#define IXGBE_FLAG2_MOD_POWER_UNSUPPORTED BIT(22) /* Tx fast path data */ int num_tx_queues; @@ -793,6 +799,7 @@ struct ixgbe_adapter { u32 vferr_refcount; struct ixgbe_mac_addr *mac_table; struct kobject *info_kobj; + u16 lse_mask; #ifdef CONFIG_IXGBE_HWMON struct hwmon_buff *ixgbe_hwmon_buff; #endif /* CONFIG_IXGBE_HWMON */ @@ -849,6 +856,7 @@ static inline u8 ixgbe_max_rss_indices(struct ixgbe_adapter *adapter) case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: case ixgbe_mac_x550em_a: + case ixgbe_mac_e610: return IXGBE_MAX_RSS_INDICES_X550; default: return 0; @@ -874,6 +882,7 @@ enum ixgbe_state_t { __IXGBE_PTP_RUNNING, __IXGBE_PTP_TX_IN_PROGRESS, __IXGBE_RESET_REQUESTED, + __IXGBE_PHY_INIT_COMPLETE, }; struct ixgbe_cb { @@ -896,6 +905,7 @@ enum ixgbe_boards { board_x550em_x_fw, board_x550em_a, board_x550em_a_fw, + board_e610, }; extern const struct ixgbe_info ixgbe_82598_info; @@ -906,6 +916,7 @@ extern const struct ixgbe_info ixgbe_X550EM_x_info; extern const struct ixgbe_info ixgbe_x550em_x_fw_info; extern const struct ixgbe_info ixgbe_x550em_a_info; extern const struct ixgbe_info ixgbe_x550em_a_fw_info; +extern const struct ixgbe_info ixgbe_e610_info; #ifdef CONFIG_IXGBE_DCB extern const struct dcbnl_rtnl_ops ixgbe_dcbnl_ops; #endif diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c index cdaf087b4e855..964988b4d58b8 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright(c) 1999 - 2018 Intel Corporation. */ +/* Copyright(c) 1999 - 2024 Intel Corporation. */ #include #include @@ -1615,6 +1615,7 @@ int ixgbe_fdir_set_input_mask_82599(struct ixgbe_hw *hw, case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: case ixgbe_mac_x550em_a: + case ixgbe_mac_e610: IXGBE_WRITE_REG(hw, IXGBE_FDIRSCTPM, ~fdirtcpm); break; default: diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c index bfab2c0ee0aab..7beaf6ea57f9e 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright(c) 1999 - 2018 Intel Corporation. */ +/* Copyright(c) 1999 - 2024 Intel Corporation. */ #include #include @@ -58,6 +58,7 @@ bool ixgbe_device_supports_autoneg_fc(struct ixgbe_hw *hw) switch (hw->device_id) { case IXGBE_DEV_ID_X550EM_A_SFP: case IXGBE_DEV_ID_X550EM_A_SFP_N: + case IXGBE_DEV_ID_E610_SFP: supported = false; break; default: @@ -88,6 +89,8 @@ bool ixgbe_device_supports_autoneg_fc(struct ixgbe_hw *hw) case IXGBE_DEV_ID_X550EM_A_10G_T: case IXGBE_DEV_ID_X550EM_A_1G_T: case IXGBE_DEV_ID_X550EM_A_1G_T_L: + case IXGBE_DEV_ID_E610_10G_T: + case IXGBE_DEV_ID_E610_2_5G_T: supported = true; break; default: @@ -469,9 +472,14 @@ int ixgbe_clear_hw_cntrs_generic(struct ixgbe_hw *hw) } } - if (hw->mac.type == ixgbe_mac_X550 || hw->mac.type == ixgbe_mac_X540) { + if (hw->mac.type == ixgbe_mac_X550 || + hw->mac.type == ixgbe_mac_X540 || + hw->mac.type == ixgbe_mac_e610) { if (hw->phy.id == 0) hw->phy.ops.identify(hw); + } + + if (hw->mac.type == ixgbe_mac_X550 || hw->mac.type == ixgbe_mac_X540) { hw->phy.ops.read_reg(hw, IXGBE_PCRC8ECL, MDIO_MMD_PCS, &i); hw->phy.ops.read_reg(hw, IXGBE_PCRC8ECH, MDIO_MMD_PCS, &i); hw->phy.ops.read_reg(hw, IXGBE_LDPCECL, MDIO_MMD_PCS, &i); @@ -2922,6 +2930,10 @@ u16 ixgbe_get_pcie_msix_count_generic(struct ixgbe_hw *hw) pcie_offset = IXGBE_PCIE_MSIX_82599_CAPS; max_msix_count = IXGBE_MAX_MSIX_VECTORS_82599; break; + case ixgbe_mac_e610: + pcie_offset = IXGBE_PCIE_MSIX_E610_CAPS; + max_msix_count = IXGBE_MAX_MSIX_VECTORS_82599; + break; default: return 1; } @@ -3370,7 +3382,8 @@ int ixgbe_check_mac_link_generic(struct ixgbe_hw *hw, ixgbe_link_speed *speed, *speed = IXGBE_LINK_SPEED_1GB_FULL; break; case IXGBE_LINKS_SPEED_100_82599: - if ((hw->mac.type >= ixgbe_mac_X550) && + if ((hw->mac.type >= ixgbe_mac_X550 || + hw->mac.type == ixgbe_mac_e610) && (links_reg & IXGBE_LINKS_SPEED_NON_STD)) *speed = IXGBE_LINK_SPEED_5GB_FULL; else diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_nl.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_nl.c index f2709b10c2e5a..19d6b6fa8fb38 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_nl.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_nl.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright(c) 1999 - 2018 Intel Corporation. */ +/* Copyright(c) 1999 - 2024 Intel Corporation. */ #include "ixgbe.h" #include @@ -154,6 +154,7 @@ static void ixgbe_dcbnl_get_perm_hw_addr(struct net_device *netdev, case ixgbe_mac_82599EB: case ixgbe_mac_X540: case ixgbe_mac_X550: + case ixgbe_mac_e610: for (j = 0; j < netdev->addr_len; j++, i++) perm_addr[i] = adapter->hw.mac.san_addr[j]; break; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c index 6bf3562b3ce29..683c668672d65 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c @@ -3,8 +3,10 @@ #include "ixgbe_common.h" #include "ixgbe_e610.h" +#include "ixgbe_x550.h" #include "ixgbe_type.h" #include "ixgbe_x540.h" +#include "ixgbe_mbx.h" #include "ixgbe_phy.h" /** @@ -2491,3 +2493,166 @@ int ixgbe_validate_eeprom_checksum_e610(struct ixgbe_hw *hw, u16 *checksum_val) return err; } + +/** + * ixgbe_reset_hw_e610 - Perform hardware reset + * @hw: pointer to hardware structure + * + * Resets the hardware by resetting the transmit and receive units, masks + * and clears all interrupts, and performs a reset. + * + * Return: the exit code of the operation. + */ +int ixgbe_reset_hw_e610(struct ixgbe_hw *hw) +{ + u32 swfw_mask = hw->phy.phy_semaphore_mask; + u32 ctrl, i; + int err; + + /* Call adapter stop to disable tx/rx and clear interrupts */ + err = hw->mac.ops.stop_adapter(hw); + if (err) + goto reset_hw_out; + + /* Flush pending Tx transactions. */ + ixgbe_clear_tx_pending(hw); + + hw->phy.ops.init(hw); +mac_reset_top: + err = hw->mac.ops.acquire_swfw_sync(hw, swfw_mask); + if (err) + return -EBUSY; + ctrl = IXGBE_CTRL_RST; + ctrl |= IXGBE_READ_REG(hw, IXGBE_CTRL); + IXGBE_WRITE_REG(hw, IXGBE_CTRL, ctrl); + IXGBE_WRITE_FLUSH(hw); + hw->mac.ops.release_swfw_sync(hw, swfw_mask); + + /* Poll for reset bit to self-clear indicating reset is complete */ + for (i = 0; i < 10; i++) { + udelay(1); + ctrl = IXGBE_READ_REG(hw, IXGBE_CTRL); + if (!(ctrl & IXGBE_CTRL_RST_MASK)) + break; + } + + if (ctrl & IXGBE_CTRL_RST_MASK) { + struct ixgbe_adapter *adapter = container_of(hw, struct ixgbe_adapter, + hw); + + err = -EIO; + netdev_err(adapter->netdev, "Reset polling failed to complete."); + } + + /* Double resets are required for recovery from certain error + * conditions. Between resets, it is necessary to stall to allow time + * for any pending HW events to complete. + */ + msleep(100); + if (hw->mac.flags & IXGBE_FLAGS_DOUBLE_RESET_REQUIRED) { + hw->mac.flags &= ~IXGBE_FLAGS_DOUBLE_RESET_REQUIRED; + goto mac_reset_top; + } + + /* Set the Rx packet buffer size. */ + IXGBE_WRITE_REG(hw, IXGBE_RXPBSIZE(0), GENMASK(18, 17)); + + /* Store the permanent mac address */ + hw->mac.ops.get_mac_addr(hw, hw->mac.perm_addr); + + /* Maximum number of Receive Address Registers. */ +#define IXGBE_MAX_NUM_RAR 128 + + /* Store MAC address from RAR0, clear receive address registers, and + * clear the multicast table. Also reset num_rar_entries to the + * maximum number of Receive Address Registers, since we modify this + * value when programming the SAN MAC address. + */ + hw->mac.num_rar_entries = IXGBE_MAX_NUM_RAR; + hw->mac.ops.init_rx_addrs(hw); + + /* Initialize bus function number */ + hw->mac.ops.set_lan_id(hw); + +reset_hw_out: + return err; +} + +static const struct ixgbe_mac_operations mac_ops_e610 = { + .init_hw = ixgbe_init_hw_generic, + .start_hw = ixgbe_start_hw_X540, + .clear_hw_cntrs = ixgbe_clear_hw_cntrs_generic, + .enable_rx_dma = ixgbe_enable_rx_dma_generic, + .get_mac_addr = ixgbe_get_mac_addr_generic, + .get_device_caps = ixgbe_get_device_caps_generic, + .stop_adapter = ixgbe_stop_adapter_generic, + .set_lan_id = ixgbe_set_lan_id_multi_port_pcie, + .set_rxpba = ixgbe_set_rxpba_generic, + .check_link = ixgbe_check_link_e610, + .blink_led_start = ixgbe_blink_led_start_X540, + .blink_led_stop = ixgbe_blink_led_stop_X540, + .set_rar = ixgbe_set_rar_generic, + .clear_rar = ixgbe_clear_rar_generic, + .set_vmdq = ixgbe_set_vmdq_generic, + .set_vmdq_san_mac = ixgbe_set_vmdq_san_mac_generic, + .clear_vmdq = ixgbe_clear_vmdq_generic, + .init_rx_addrs = ixgbe_init_rx_addrs_generic, + .update_mc_addr_list = ixgbe_update_mc_addr_list_generic, + .enable_mc = ixgbe_enable_mc_generic, + .disable_mc = ixgbe_disable_mc_generic, + .clear_vfta = ixgbe_clear_vfta_generic, + .set_vfta = ixgbe_set_vfta_generic, + .fc_enable = ixgbe_fc_enable_generic, + .set_fw_drv_ver = ixgbe_set_fw_drv_ver_x550, + .init_uta_tables = ixgbe_init_uta_tables_generic, + .set_mac_anti_spoofing = ixgbe_set_mac_anti_spoofing, + .set_vlan_anti_spoofing = ixgbe_set_vlan_anti_spoofing, + .set_source_address_pruning = + ixgbe_set_source_address_pruning_x550, + .set_ethertype_anti_spoofing = + ixgbe_set_ethertype_anti_spoofing_x550, + .disable_rx_buff = ixgbe_disable_rx_buff_generic, + .enable_rx_buff = ixgbe_enable_rx_buff_generic, + .enable_rx = ixgbe_enable_rx_generic, + .disable_rx = ixgbe_disable_rx_e610, + .led_on = ixgbe_led_on_generic, + .led_off = ixgbe_led_off_generic, + .init_led_link_act = ixgbe_init_led_link_act_generic, + .reset_hw = ixgbe_reset_hw_e610, + .get_media_type = ixgbe_get_media_type_e610, + .setup_link = ixgbe_setup_link_e610, + .get_link_capabilities = ixgbe_get_link_capabilities_e610, + .get_bus_info = ixgbe_get_bus_info_generic, + .acquire_swfw_sync = ixgbe_acquire_swfw_sync_X540, + .release_swfw_sync = ixgbe_release_swfw_sync_X540, + .init_swfw_sync = ixgbe_init_swfw_sync_X540, + .prot_autoc_read = prot_autoc_read_generic, + .prot_autoc_write = prot_autoc_write_generic, + .setup_fc = ixgbe_setup_fc_e610, + .fc_autoneg = ixgbe_fc_autoneg_e610, +}; + +static const struct ixgbe_phy_operations phy_ops_e610 = { + .init = ixgbe_init_phy_ops_e610, + .identify = ixgbe_identify_phy_e610, + .identify_sfp = ixgbe_identify_module_e610, + .setup_link_speed = ixgbe_setup_phy_link_speed_generic, + .setup_link = ixgbe_setup_phy_link_e610, + .enter_lplu = ixgbe_enter_lplu_e610, +}; + +static const struct ixgbe_eeprom_operations eeprom_ops_e610 = { + .read = ixgbe_read_ee_aci_e610, + .read_buffer = ixgbe_read_ee_aci_buffer_e610, + .validate_checksum = ixgbe_validate_eeprom_checksum_e610, +}; + +const struct ixgbe_info ixgbe_e610_info = { + .mac = ixgbe_mac_e610, + .get_invariants = ixgbe_get_invariants_X540, + .mac_ops = &mac_ops_e610, + .eeprom_ops = &eeprom_ops_e610, + .phy_ops = &phy_ops_e610, + .mbx_ops = &mbx_ops_generic, + .mvals = ixgbe_mvals_x550em_a, +}; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h index 9cfcfeec6e0b7..ba8c06b738105 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h @@ -76,5 +76,6 @@ int ixgbe_read_ee_aci_e610(struct ixgbe_hw *hw, u16 offset, u16 *data); int ixgbe_read_ee_aci_buffer_e610(struct ixgbe_hw *hw, u16 offset, u16 words, u16 *data); int ixgbe_validate_eeprom_checksum_e610(struct ixgbe_hw *hw, u16 *checksum_val); +int ixgbe_reset_hw_e610(struct ixgbe_hw *hw); #endif /* _IXGBE_E610_H_ */ diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c index 9482e0cca8b7d..da91c582d439b 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright(c) 1999 - 2018 Intel Corporation. */ +/* Copyright(c) 1999 - 2024 Intel Corporation. */ /* ethtool support for ixgbe */ @@ -690,6 +690,7 @@ static void ixgbe_get_regs(struct net_device *netdev, case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: case ixgbe_mac_x550em_a: + case ixgbe_mac_e610: regs_buff[35 + i] = IXGBE_READ_REG(hw, IXGBE_FCRTL_82599(i)); regs_buff[43 + i] = IXGBE_READ_REG(hw, IXGBE_FCRTH_82599(i)); break; @@ -1613,6 +1614,7 @@ static int ixgbe_reg_test(struct ixgbe_adapter *adapter, u64 *data) case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: case ixgbe_mac_x550em_a: + case ixgbe_mac_e610: toggle = 0x7FFFF30F; test = reg_test_82599; break; @@ -1874,6 +1876,7 @@ static int ixgbe_setup_desc_rings(struct ixgbe_adapter *adapter) case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: case ixgbe_mac_x550em_a: + case ixgbe_mac_e610: reg_data = IXGBE_READ_REG(&adapter->hw, IXGBE_DMATXCTL); reg_data |= IXGBE_DMATXCTL_TE; IXGBE_WRITE_REG(&adapter->hw, IXGBE_DMATXCTL, reg_data); @@ -1935,6 +1938,7 @@ static int ixgbe_setup_loopback_test(struct ixgbe_adapter *adapter) case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: case ixgbe_mac_x550em_a: + case ixgbe_mac_e610: reg_data = IXGBE_READ_REG(hw, IXGBE_MACC); reg_data |= IXGBE_MACC_FLU; IXGBE_WRITE_REG(hw, IXGBE_MACC, reg_data); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c index 16fa621ce0ffb..336d47ffb95a8 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright(c) 1999 - 2018 Intel Corporation. */ +/* Copyright(c) 1999 - 2024 Intel Corporation. */ #include "ixgbe.h" #include "ixgbe_sriov.h" @@ -107,6 +107,7 @@ static void ixgbe_get_first_reg_idx(struct ixgbe_adapter *adapter, u8 tc, case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: case ixgbe_mac_x550em_a: + case ixgbe_mac_e610: if (num_tcs > 4) { /* * TCs : TC0/1 TC2/3 TC4-7 diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 2656f2617a8f9..336e08d35f971 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright(c) 1999 - 2018 Intel Corporation. */ +/* Copyright(c) 1999 - 2024 Intel Corporation. */ #include #include @@ -74,6 +74,7 @@ static const struct ixgbe_info *ixgbe_info_tbl[] = { [board_x550em_x_fw] = &ixgbe_x550em_x_fw_info, [board_x550em_a] = &ixgbe_x550em_a_info, [board_x550em_a_fw] = &ixgbe_x550em_a_fw_info, + [board_e610] = &ixgbe_e610_info, }; /* ixgbe_pci_tbl - PCI Device ID Table @@ -132,6 +133,11 @@ static const struct pci_device_id ixgbe_pci_tbl[] = { {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X550EM_A_SFP), board_x550em_a }, {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X550EM_A_1G_T), board_x550em_a_fw }, {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X550EM_A_1G_T_L), board_x550em_a_fw }, + {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_E610_BACKPLANE), board_e610}, + {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_E610_SFP), board_e610}, + {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_E610_10G_T), board_e610}, + {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_E610_2_5G_T), board_e610}, + {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_E610_SGMII), board_e610}, /* required last entry */ {0, } }; @@ -174,6 +180,8 @@ static struct workqueue_struct *ixgbe_wq; static bool ixgbe_check_cfg_remove(struct ixgbe_hw *hw, struct pci_dev *pdev); static void ixgbe_watchdog_link_is_down(struct ixgbe_adapter *); +static void ixgbe_watchdog_link_is_up(struct ixgbe_adapter *); +static void ixgbe_watchdog_update_link(struct ixgbe_adapter *); static const struct net_device_ops ixgbe_netdev_ops; @@ -241,7 +249,7 @@ static int ixgbe_get_parent_bus_info(struct ixgbe_adapter *adapter) * Return: true if information should be collected from the parent bus, false * otherwise */ -static inline bool ixgbe_pcie_from_parent(struct ixgbe_hw *hw) +static bool ixgbe_pcie_from_parent(struct ixgbe_hw *hw) { switch (hw->device_id) { case IXGBE_DEV_ID_82599_SFP_SF_QP: @@ -880,6 +888,7 @@ static void ixgbe_set_ivar(struct ixgbe_adapter *adapter, s8 direction, case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: case ixgbe_mac_x550em_a: + case ixgbe_mac_e610: if (direction == -1) { /* other causes */ msix_vector |= IXGBE_IVAR_ALLOC_VAL; @@ -919,6 +928,7 @@ void ixgbe_irq_rearm_queues(struct ixgbe_adapter *adapter, case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: case ixgbe_mac_x550em_a: + case ixgbe_mac_e610: mask = (qmask & 0xFFFFFFFF); IXGBE_WRITE_REG(&adapter->hw, IXGBE_EICS_EX(0), mask); mask = (qmask >> 32); @@ -1029,7 +1039,7 @@ static u64 ixgbe_get_tx_pending(struct ixgbe_ring *ring) return ((head <= tail) ? tail : tail + ring->count) - head; } -static inline bool ixgbe_check_tx_hang(struct ixgbe_ring *tx_ring) +static bool ixgbe_check_tx_hang(struct ixgbe_ring *tx_ring) { u32 tx_done = ixgbe_get_tx_completed(tx_ring); u32 tx_done_old = tx_ring->tx_stats.tx_done_old; @@ -2519,6 +2529,7 @@ static void ixgbe_configure_msix(struct ixgbe_adapter *adapter) case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: case ixgbe_mac_x550em_a: + case ixgbe_mac_e610: ixgbe_set_ivar(adapter, -1, 1, v_idx); break; default: @@ -2532,6 +2543,9 @@ static void ixgbe_configure_msix(struct ixgbe_adapter *adapter) IXGBE_EIMS_MAILBOX | IXGBE_EIMS_LSC); + if (adapter->hw.mac.type == ixgbe_mac_e610) + mask &= ~IXGBE_EIMS_FW_EVENT; + IXGBE_WRITE_REG(&adapter->hw, IXGBE_EIAC, mask); } @@ -2748,6 +2762,7 @@ void ixgbe_write_eitr(struct ixgbe_q_vector *q_vector) case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: case ixgbe_mac_x550em_a: + case ixgbe_mac_e610: /* * set the WDIS bit to not clear the timer bits and cause an * immediate assertion of the interrupt @@ -2970,6 +2985,218 @@ static void ixgbe_check_lsc(struct ixgbe_adapter *adapter) } } +/** + * ixgbe_check_phy_fw_load - check if PHY FW load failed + * @adapter: pointer to adapter structure + * @link_cfg_err: bitmap from the link info structure + * + * Check if external PHY FW load failed and print an error message if it did. + */ +static void ixgbe_check_phy_fw_load(struct ixgbe_adapter *adapter, + u8 link_cfg_err) +{ + if (!(link_cfg_err & IXGBE_ACI_LINK_EXTERNAL_PHY_LOAD_FAILURE)) { + adapter->flags2 &= ~IXGBE_FLAG2_PHY_FW_LOAD_FAILED; + return; + } + + if (adapter->flags2 & IXGBE_FLAG2_PHY_FW_LOAD_FAILED) + return; + + if (link_cfg_err & IXGBE_ACI_LINK_EXTERNAL_PHY_LOAD_FAILURE) { + netdev_err(adapter->netdev, "Device failed to load the FW for the external PHY. Please download and install the latest NVM for your device and try again\n"); + adapter->flags2 |= IXGBE_FLAG2_PHY_FW_LOAD_FAILED; + } +} + +/** + * ixgbe_check_module_power - check module power level + * @adapter: pointer to adapter structure + * @link_cfg_err: bitmap from the link info structure + * + * Check module power level returned by a previous call to aci_get_link_info + * and print error messages if module power level is not supported. + */ +static void ixgbe_check_module_power(struct ixgbe_adapter *adapter, + u8 link_cfg_err) +{ + /* If module power level is supported, clear the flag. */ + if (!(link_cfg_err & (IXGBE_ACI_LINK_INVAL_MAX_POWER_LIMIT | + IXGBE_ACI_LINK_MODULE_POWER_UNSUPPORTED))) { + adapter->flags2 &= ~IXGBE_FLAG2_MOD_POWER_UNSUPPORTED; + return; + } + + /* If IXGBE_FLAG2_MOD_POWER_UNSUPPORTED was previously set and the + * above block didn't clear this bit, there's nothing to do. + */ + if (adapter->flags2 & IXGBE_FLAG2_MOD_POWER_UNSUPPORTED) + return; + + if (link_cfg_err & IXGBE_ACI_LINK_INVAL_MAX_POWER_LIMIT) { + netdev_err(adapter->netdev, "The installed module is incompatible with the device's NVM image. Cannot start link.\n"); + adapter->flags2 |= IXGBE_FLAG2_MOD_POWER_UNSUPPORTED; + } else if (link_cfg_err & IXGBE_ACI_LINK_MODULE_POWER_UNSUPPORTED) { + netdev_err(adapter->netdev, "The module's power requirements exceed the device's power supply. Cannot start link.\n"); + adapter->flags2 |= IXGBE_FLAG2_MOD_POWER_UNSUPPORTED; + } +} + +/** + * ixgbe_check_link_cfg_err - check if link configuration failed + * @adapter: pointer to adapter structure + * @link_cfg_err: bitmap from the link info structure + * + * Print if any link configuration failure happens due to the value in the + * link_cfg_err parameter in the link info structure. + */ +static void ixgbe_check_link_cfg_err(struct ixgbe_adapter *adapter, + u8 link_cfg_err) +{ + ixgbe_check_module_power(adapter, link_cfg_err); + ixgbe_check_phy_fw_load(adapter, link_cfg_err); +} + +/** + * ixgbe_process_link_status_event - process the link event + * @adapter: pointer to adapter structure + * @link_up: true if the physical link is up and false if it is down + * @link_speed: current link speed received from the link event + * + * Return: 0 on success or negative value on failure. + */ +static int +ixgbe_process_link_status_event(struct ixgbe_adapter *adapter, bool link_up, + u16 link_speed) +{ + struct ixgbe_hw *hw = &adapter->hw; + int status; + + /* Update the link info structures and re-enable link events, + * don't bail on failure due to other book keeping needed. + */ + status = ixgbe_update_link_info(hw); + if (status) + e_dev_err("Failed to update link status, err %d aq_err %d\n", + status, hw->aci.last_status); + + ixgbe_check_link_cfg_err(adapter, hw->link.link_info.link_cfg_err); + + /* Check if the link state is up after updating link info, and treat + * this event as an UP event since the link is actually UP now. + */ + if (hw->link.link_info.link_info & IXGBE_ACI_LINK_UP) + link_up = true; + + /* Turn off PHY if media was removed. */ + if (!(adapter->flags2 & IXGBE_FLAG2_NO_MEDIA) && + !(hw->link.link_info.link_info & IXGBE_ACI_MEDIA_AVAILABLE)) + adapter->flags2 |= IXGBE_FLAG2_NO_MEDIA; + + if (link_up == adapter->link_up && + link_up == netif_carrier_ok(adapter->netdev) && + link_speed == adapter->link_speed) + return 0; + + adapter->flags |= IXGBE_FLAG_NEED_LINK_UPDATE; + adapter->link_check_timeout = jiffies; + ixgbe_watchdog_update_link(adapter); + + if (link_up) + ixgbe_watchdog_link_is_up(adapter); + else + ixgbe_watchdog_link_is_down(adapter); + + return 0; +} + +/** + * ixgbe_handle_link_status_event - handle link status event via ACI + * @adapter: pointer to adapter structure + * @e: event structure containing link status info + */ +static void +ixgbe_handle_link_status_event(struct ixgbe_adapter *adapter, + struct ixgbe_aci_event *e) +{ + struct ixgbe_aci_cmd_get_link_status_data *link_data; + u16 link_speed; + bool link_up; + + link_data = (struct ixgbe_aci_cmd_get_link_status_data *)e->msg_buf; + + link_up = !!(link_data->link_info & IXGBE_ACI_LINK_UP); + link_speed = le16_to_cpu(link_data->link_speed); + + if (ixgbe_process_link_status_event(adapter, link_up, link_speed)) + e_dev_warn("Could not process link status event"); +} + +/** + * ixgbe_schedule_fw_event - schedule Firmware event + * @adapter: pointer to the adapter structure + * + * If the adapter is not in down, removing or resetting state, + * an event is scheduled. + */ +static void ixgbe_schedule_fw_event(struct ixgbe_adapter *adapter) +{ + if (!test_bit(__IXGBE_DOWN, &adapter->state) && + !test_bit(__IXGBE_REMOVING, &adapter->state) && + !test_bit(__IXGBE_RESETTING, &adapter->state)) { + adapter->flags2 |= IXGBE_FLAG2_FW_ASYNC_EVENT; + ixgbe_service_event_schedule(adapter); + } +} + +/** + * ixgbe_aci_event_cleanup - release msg_buf memory + * @event: pointer to the event holding msg_buf to be released + * + * Clean memory allocated for event's msg_buf. Implements auto memory cleanup. + */ +static void ixgbe_aci_event_cleanup(struct ixgbe_aci_event *event) +{ + kfree(event->msg_buf); +} + +/** + * ixgbe_handle_fw_event - handle Firmware event + * @adapter: pointer to the adapter structure + * + * Obtain an event from the ACI and then and then process it according to the + * type of the event and the opcode. + */ +static void ixgbe_handle_fw_event(struct ixgbe_adapter *adapter) +{ + struct ixgbe_aci_event event __cleanup(ixgbe_aci_event_cleanup); + struct ixgbe_hw *hw = &adapter->hw; + bool pending = false; + int err; + + if (adapter->flags2 & IXGBE_FLAG2_FW_ASYNC_EVENT) + adapter->flags2 &= ~IXGBE_FLAG2_FW_ASYNC_EVENT; + event.buf_len = IXGBE_ACI_MAX_BUFFER_SIZE; + event.msg_buf = kzalloc(event.buf_len, GFP_KERNEL); + if (!event.msg_buf) + return; + + do { + err = ixgbe_aci_get_event(hw, &event, &pending); + if (err) + break; + + switch (le16_to_cpu(event.desc.opcode)) { + case ixgbe_aci_opc_get_link_status: + ixgbe_handle_link_status_event(adapter, &event); + break; + default: + e_warn(hw, "unknown FW async event captured\n"); + break; + } + } while (pending); +} + static inline void ixgbe_irq_enable_queues(struct ixgbe_adapter *adapter, u64 qmask) { @@ -2986,6 +3213,7 @@ static inline void ixgbe_irq_enable_queues(struct ixgbe_adapter *adapter, case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: case ixgbe_mac_x550em_a: + case ixgbe_mac_e610: mask = (qmask & 0xFFFFFFFF); if (mask) IXGBE_WRITE_REG(hw, IXGBE_EIMS_EX(0), mask); @@ -3039,6 +3267,9 @@ static inline void ixgbe_irq_enable(struct ixgbe_adapter *adapter, bool queues, case ixgbe_mac_X540: case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: + case ixgbe_mac_e610: + mask |= IXGBE_EIMS_FW_EVENT; + fallthrough; case ixgbe_mac_x550em_a: if (adapter->hw.device_id == IXGBE_DEV_ID_X550EM_X_SFP || adapter->hw.device_id == IXGBE_DEV_ID_X550EM_A_SFP || @@ -3095,12 +3326,16 @@ static irqreturn_t ixgbe_msix_other(int irq, void *data) if (eicr & IXGBE_EICR_MAILBOX) ixgbe_msg_task(adapter); + if (eicr & IXGBE_EICR_FW_EVENT) + ixgbe_schedule_fw_event(adapter); + switch (hw->mac.type) { case ixgbe_mac_82599EB: case ixgbe_mac_X540: case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: case ixgbe_mac_x550em_a: + case ixgbe_mac_e610: if (hw->phy.type == ixgbe_phy_x550em_ext_t && (eicr & IXGBE_EICR_GPI_SDP0_X540)) { adapter->flags2 |= IXGBE_FLAG2_PHY_INTERRUPT; @@ -3338,6 +3573,9 @@ static irqreturn_t ixgbe_intr(int irq, void *data) if (eicr & IXGBE_EICR_LSC) ixgbe_check_lsc(adapter); + if (eicr & IXGBE_EICR_FW_EVENT) + ixgbe_schedule_fw_event(adapter); + switch (hw->mac.type) { case ixgbe_mac_82599EB: ixgbe_check_sfp_event(adapter, eicr); @@ -3346,6 +3584,7 @@ static irqreturn_t ixgbe_intr(int irq, void *data) case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: case ixgbe_mac_x550em_a: + case ixgbe_mac_e610: if (eicr & IXGBE_EICR_ECC) { e_info(link, "Received ECC Err, initiating reset\n"); set_bit(__IXGBE_RESET_REQUESTED, &adapter->state); @@ -3446,6 +3685,7 @@ static inline void ixgbe_irq_disable(struct ixgbe_adapter *adapter) case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: case ixgbe_mac_x550em_a: + case ixgbe_mac_e610: IXGBE_WRITE_REG(&adapter->hw, IXGBE_EIMC, 0xFFFF0000); IXGBE_WRITE_REG(&adapter->hw, IXGBE_EIMC_EX(0), ~0); IXGBE_WRITE_REG(&adapter->hw, IXGBE_EIMC_EX(1), ~0); @@ -4363,6 +4603,7 @@ static void ixgbe_setup_rdrxctl(struct ixgbe_adapter *adapter) case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: case ixgbe_mac_x550em_a: + case ixgbe_mac_e610: if (adapter->num_vfs) rdrxctl |= IXGBE_RDRXCTL_PSP; fallthrough; @@ -4530,6 +4771,7 @@ static void ixgbe_vlan_strip_disable(struct ixgbe_adapter *adapter) case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: case ixgbe_mac_x550em_a: + case ixgbe_mac_e610: for (i = 0; i < adapter->num_rx_queues; i++) { struct ixgbe_ring *ring = adapter->rx_ring[i]; @@ -4568,6 +4810,7 @@ static void ixgbe_vlan_strip_enable(struct ixgbe_adapter *adapter) case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: case ixgbe_mac_x550em_a: + case ixgbe_mac_e610: for (i = 0; i < adapter->num_rx_queues; i++) { struct ixgbe_ring *ring = adapter->rx_ring[i]; @@ -5152,6 +5395,7 @@ static int ixgbe_hpbthresh(struct ixgbe_adapter *adapter, int pb) case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: case ixgbe_mac_x550em_a: + case ixgbe_mac_e610: dv_id = IXGBE_DV_X540(link, tc); break; default: @@ -5212,6 +5456,7 @@ static int ixgbe_lpbthresh(struct ixgbe_adapter *adapter, int pb) case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: case ixgbe_mac_x550em_a: + case ixgbe_mac_e610: dv_id = IXGBE_LOW_DV_X540(tc); break; default: @@ -5513,6 +5758,48 @@ static void ixgbe_configure(struct ixgbe_adapter *adapter) ixgbe_configure_dfwd(adapter); } +/** + * ixgbe_enable_link_status_events - enable link status events + * @adapter: pointer to the adapter structure + * @mask: event mask to be set + * + * Enables link status events by invoking ixgbe_configure_lse() + * + * Return: the exit code of the operation. + */ +static int ixgbe_enable_link_status_events(struct ixgbe_adapter *adapter, + u16 mask) +{ + int err; + + err = ixgbe_configure_lse(&adapter->hw, true, mask); + if (err) + return err; + + adapter->lse_mask = mask; + return 0; +} + +/** + * ixgbe_disable_link_status_events - disable link status events + * @adapter: pointer to the adapter structure + * + * Disables link status events by invoking ixgbe_configure_lse() + * + * Return: the exit code of the operation. + */ +static int ixgbe_disable_link_status_events(struct ixgbe_adapter *adapter) +{ + int err; + + err = ixgbe_configure_lse(&adapter->hw, false, adapter->lse_mask); + if (err) + return err; + + adapter->lse_mask = 0; + return 0; +} + /** * ixgbe_sfp_link_config - set up SFP+ link * @adapter: pointer to private adapter struct @@ -5542,9 +5829,15 @@ static void ixgbe_sfp_link_config(struct ixgbe_adapter *adapter) **/ static int ixgbe_non_sfp_link_config(struct ixgbe_hw *hw) { - u32 speed; + struct ixgbe_adapter *adapter = container_of(hw, struct ixgbe_adapter, + hw); + u16 mask = ~((u16)(IXGBE_ACI_LINK_EVENT_UPDOWN | + IXGBE_ACI_LINK_EVENT_MEDIA_NA | + IXGBE_ACI_LINK_EVENT_MODULE_QUAL_FAIL | + IXGBE_ACI_LINK_EVENT_PHY_FW_LOAD_FAIL)); bool autoneg, link_up = false; int ret = -EIO; + u32 speed; if (hw->mac.ops.check_link) ret = hw->mac.ops.check_link(hw, &speed, &link_up, false); @@ -5567,12 +5860,52 @@ static int ixgbe_non_sfp_link_config(struct ixgbe_hw *hw) if (ret) return ret; - if (hw->mac.ops.setup_link) + if (hw->mac.ops.setup_link) { + if (adapter->hw.mac.type == ixgbe_mac_e610) { + ret = ixgbe_enable_link_status_events(adapter, mask); + if (ret) + return ret; + } ret = hw->mac.ops.setup_link(hw, speed, link_up); + } return ret; } +/** + * ixgbe_check_media_subtask - check for media + * @adapter: pointer to adapter structure + * + * If media is available then initialize PHY user configuration. Configure the + * PHY if the interface is up. + */ +static void ixgbe_check_media_subtask(struct ixgbe_adapter *adapter) +{ + struct ixgbe_hw *hw = &adapter->hw; + + /* No need to check for media if it's already present */ + if (!(adapter->flags2 & IXGBE_FLAG2_NO_MEDIA)) + return; + + /* Refresh link info and check if media is present */ + if (ixgbe_update_link_info(hw)) + return; + + ixgbe_check_link_cfg_err(adapter, hw->link.link_info.link_cfg_err); + + if (hw->link.link_info.link_info & IXGBE_ACI_MEDIA_AVAILABLE) { + /* PHY settings are reset on media insertion, reconfigure + * PHY to preserve settings. + */ + if (!(ixgbe_non_sfp_link_config(&adapter->hw))) + adapter->flags2 &= ~IXGBE_FLAG2_NO_MEDIA; + + /* A Link Status Event will be generated; the event handler + * will complete bringing the interface up + */ + } +} + /** * ixgbe_clear_vf_stats_counters - Clear out VF stats after reset * @adapter: board private structure @@ -5636,6 +5969,7 @@ static void ixgbe_setup_gpie(struct ixgbe_adapter *adapter) case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: case ixgbe_mac_x550em_a: + case ixgbe_mac_e610: default: IXGBE_WRITE_REG(hw, IXGBE_EIAM_EX(0), 0xFFFFFFFF); IXGBE_WRITE_REG(hw, IXGBE_EIAM_EX(1), 0xFFFFFFFF); @@ -5986,6 +6320,7 @@ void ixgbe_disable_tx(struct ixgbe_adapter *adapter) case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: case ixgbe_mac_x550em_a: + case ixgbe_mac_e610: IXGBE_WRITE_REG(hw, IXGBE_DMATXCTL, (IXGBE_READ_REG(hw, IXGBE_DMATXCTL) & ~IXGBE_DMATXCTL_TE)); @@ -6230,6 +6565,8 @@ void ixgbe_down(struct ixgbe_adapter *adapter) ixgbe_clean_all_tx_rings(adapter); ixgbe_clean_all_rx_rings(adapter); + if (adapter->hw.mac.type == ixgbe_mac_e610) + ixgbe_disable_link_status_events(adapter); } /** @@ -6285,6 +6622,7 @@ static void ixgbe_init_dcb(struct ixgbe_adapter *adapter) break; case ixgbe_mac_X540: case ixgbe_mac_X550: + case ixgbe_mac_e610: adapter->dcb_cfg.num_tcs.pg_tcs = X540_TRAFFIC_CLASS; adapter->dcb_cfg.num_tcs.pfc_tcs = X540_TRAFFIC_CLASS; break; @@ -6348,6 +6686,8 @@ static int ixgbe_sw_init(struct ixgbe_adapter *adapter, hw->subsystem_vendor_id = pdev->subsystem_vendor; hw->subsystem_device_id = pdev->subsystem_device; + hw->mac.max_link_up_time = IXGBE_LINK_UP_TIME; + /* get_invariants needs the device IDs */ ii->get_invariants(hw); @@ -6915,6 +7255,19 @@ int ixgbe_open(struct net_device *netdev) ixgbe_up_complete(adapter); udp_tunnel_nic_reset_ntf(netdev); + if (adapter->hw.mac.type == ixgbe_mac_e610) { + int err = ixgbe_update_link_info(&adapter->hw); + + if (err) + e_dev_err("Failed to update link info, err %d.\n", err); + + ixgbe_check_link_cfg_err(adapter, + adapter->hw.link.link_info.link_cfg_err); + + err = ixgbe_non_sfp_link_config(&adapter->hw); + if (ixgbe_non_sfp_link_config(&adapter->hw)) + e_dev_err("Link setup failed, err %d.\n", err); + } return 0; @@ -7068,6 +7421,7 @@ static int __ixgbe_shutdown(struct pci_dev *pdev, bool *enable_wake) case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: case ixgbe_mac_x550em_a: + case ixgbe_mac_e610: pci_wake_from_d3(pdev, !!wufc); break; default: @@ -7215,6 +7569,7 @@ void ixgbe_update_stats(struct ixgbe_adapter *adapter) case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: case ixgbe_mac_x550em_a: + case ixgbe_mac_e610: hwstats->pxonrxc[i] += IXGBE_READ_REG(hw, IXGBE_PXONRXCNT(i)); break; @@ -7231,7 +7586,8 @@ void ixgbe_update_stats(struct ixgbe_adapter *adapter) hw->mac.type == ixgbe_mac_X540 || hw->mac.type == ixgbe_mac_X550 || hw->mac.type == ixgbe_mac_X550EM_x || - hw->mac.type == ixgbe_mac_x550em_a) { + hw->mac.type == ixgbe_mac_x550em_a || + hw->mac.type == ixgbe_mac_e610) { hwstats->qbtc[i] += IXGBE_READ_REG(hw, IXGBE_QBTC_L(i)); IXGBE_READ_REG(hw, IXGBE_QBTC_H(i)); /* to clear */ hwstats->qbrc[i] += IXGBE_READ_REG(hw, IXGBE_QBRC_L(i)); @@ -7257,6 +7613,7 @@ void ixgbe_update_stats(struct ixgbe_adapter *adapter) case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: case ixgbe_mac_x550em_a: + case ixgbe_mac_e610: /* OS2BMC stats are X540 and later */ hwstats->o2bgptc += IXGBE_READ_REG(hw, IXGBE_O2BGPTC); hwstats->o2bspc += IXGBE_READ_REG(hw, IXGBE_O2BSPC); @@ -7557,6 +7914,7 @@ static void ixgbe_watchdog_link_is_up(struct ixgbe_adapter *adapter) case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: case ixgbe_mac_x550em_a: + case ixgbe_mac_e610: case ixgbe_mac_82599EB: { u32 mflcn = IXGBE_READ_REG(hw, IXGBE_MFLCN); u32 fccfg = IXGBE_READ_REG(hw, IXGBE_FCCFG); @@ -8058,6 +8416,11 @@ static void ixgbe_service_task(struct work_struct *work) ixgbe_service_event_complete(adapter); return; } + if (adapter->hw.mac.type == ixgbe_mac_e610) { + if (adapter->flags2 & IXGBE_FLAG2_FW_ASYNC_EVENT) + ixgbe_handle_fw_event(adapter); + ixgbe_check_media_subtask(adapter); + } ixgbe_reset_subtask(adapter); ixgbe_phy_interrupt_subtask(adapter); ixgbe_sfp_detection_subtask(adapter); @@ -10776,6 +11139,24 @@ bool ixgbe_wol_supported(struct ixgbe_adapter *adapter, u16 device_id, return false; } +/** + * ixgbe_set_fw_version_e610 - Set FW version specifically on E610 adapters + * @adapter: the adapter private structure + * + * This function is used by probe and ethtool to determine the FW version to + * format to display. The FW version is taken from the EEPROM/NVM. + * + */ +static void ixgbe_set_fw_version_e610(struct ixgbe_adapter *adapter) +{ + struct ixgbe_orom_info *orom = &adapter->hw.flash.orom; + struct ixgbe_nvm_info *nvm = &adapter->hw.flash.nvm; + + snprintf(adapter->eeprom_id, sizeof(adapter->eeprom_id), + "%x.%02x 0x%x %d.%d.%d", nvm->major, nvm->minor, + nvm->eetrack, orom->major, orom->build, orom->patch); +} + /** * ixgbe_set_fw_version - Set FW version * @adapter: the adapter private structure @@ -10788,6 +11169,11 @@ static void ixgbe_set_fw_version(struct ixgbe_adapter *adapter) struct ixgbe_hw *hw = &adapter->hw; struct ixgbe_nvm_version nvm_ver; + if (adapter->hw.mac.type == ixgbe_mac_e610) { + ixgbe_set_fw_version_e610(adapter); + return; + } + ixgbe_get_oem_prod_version(hw, &nvm_ver); if (nvm_ver.oem_valid) { snprintf(adapter->eeprom_id, sizeof(adapter->eeprom_id), @@ -10874,6 +11260,8 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) #else indices = IXGBE_MAX_RSS_INDICES; #endif + } else if (ii->mac == ixgbe_mac_e610) { + indices = IXGBE_MAX_RSS_INDICES_X550; } netdev = alloc_etherdev_mq(sizeof(struct ixgbe_adapter), indices); @@ -10951,6 +11339,7 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) switch (adapter->hw.mac.type) { case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: + case ixgbe_mac_e610: netdev->udp_tunnel_nic_info = &ixgbe_udp_tunnels_x550; break; case ixgbe_mac_x550em_a: @@ -10971,6 +11360,7 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: case ixgbe_mac_x550em_a: + case ixgbe_mac_e610: IXGBE_WRITE_REG(&adapter->hw, IXGBE_WUS, ~0); break; default: @@ -11142,6 +11532,8 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) ether_addr_copy(hw->mac.addr, hw->mac.perm_addr); ixgbe_mac_set_default_filter(adapter); + if (hw->mac.type == ixgbe_mac_e610) + mutex_init(&hw->aci.lock); timer_setup(&adapter->service_timer, ixgbe_service_timer, 0); if (ixgbe_removed(hw->hw_addr)) { @@ -11287,6 +11679,8 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) err_register: ixgbe_release_hw_control(adapter); ixgbe_clear_interrupt_scheme(adapter); + if (hw->mac.type == ixgbe_mac_e610) + mutex_destroy(&adapter->hw.aci.lock); err_sw_init: ixgbe_disable_sriov(adapter); adapter->flags2 &= ~IXGBE_FLAG2_SEARCH_FOR_SFP; @@ -11333,6 +11727,11 @@ static void ixgbe_remove(struct pci_dev *pdev) set_bit(__IXGBE_REMOVING, &adapter->state); cancel_work_sync(&adapter->service_task); + if (adapter->hw.mac.type == ixgbe_mac_e610) { + ixgbe_disable_link_status_events(adapter); + mutex_destroy(&adapter->hw.aci.lock); + } + if (adapter->mii_bus) mdiobus_unregister(adapter->mii_bus); @@ -11464,6 +11863,9 @@ static pci_ers_result_t ixgbe_io_error_detected(struct pci_dev *pdev, case ixgbe_mac_x550em_a: device_id = IXGBE_DEV_ID_X550EM_A_VF; break; + case ixgbe_mac_e610: + device_id = IXGBE_DEV_ID_E610_VF; + break; default: device_id = 0; break; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.c index d67d77e5daccd..788b5af07c70e 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright(c) 1999 - 2018 Intel Corporation. */ +/* Copyright(c) 1999 - 2024 Intel Corporation. */ #include #include @@ -283,6 +283,7 @@ static int ixgbe_check_for_rst_pf(struct ixgbe_hw *hw, u16 vf_number) case ixgbe_mac_X550: case ixgbe_mac_X550EM_x: case ixgbe_mac_x550em_a: + case ixgbe_mac_e610: vflre = IXGBE_READ_REG(hw, IXGBE_VFLREC(reg_offset)); break; default: @@ -407,6 +408,7 @@ void ixgbe_init_mbx_params_pf(struct ixgbe_hw *hw) hw->mac.type != ixgbe_mac_X550 && hw->mac.type != ixgbe_mac_X550EM_x && hw->mac.type != ixgbe_mac_x550em_a && + hw->mac.type != ixgbe_mac_e610 && hw->mac.type != ixgbe_mac_X540) return; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_phy.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_phy.c index 07eaa3c3f4d36..0a03a8bb5f886 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_phy.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_phy.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright(c) 1999 - 2018 Intel Corporation. */ +/* Copyright(c) 1999 - 2024 Intel Corporation. */ #include #include @@ -1117,7 +1117,7 @@ int ixgbe_setup_phy_link_generic(struct ixgbe_hw *hw) hw->phy.ops.read_reg(hw, IXGBE_MII_AUTONEG_VENDOR_PROVISION_1_REG, MDIO_MMD_AN, &autoneg_reg); - if (hw->mac.type == ixgbe_mac_X550) { + if (hw->mac.type == ixgbe_mac_X550 || hw->mac.type == ixgbe_mac_e610) { /* Set or unset auto-negotiation 5G advertisement */ autoneg_reg &= ~IXGBE_MII_5GBASE_T_ADVERTISE; if ((hw->phy.autoneg_advertised & IXGBE_LINK_SPEED_5GB_FULL) && @@ -1233,6 +1233,7 @@ static int ixgbe_get_copper_speeds_supported(struct ixgbe_hw *hw) switch (hw->mac.type) { case ixgbe_mac_X550: + case ixgbe_mac_e610: hw->phy.speeds_supported |= IXGBE_LINK_SPEED_2_5GB_FULL; hw->phy.speeds_supported |= IXGBE_LINK_SPEED_5GB_FULL; break; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c index 81e1df83f1369..1fc821fb351a5 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright(c) 1999 - 2018 Intel Corporation. */ +/* Copyright(c) 1999 - 2024 Intel Corporation. */ #include #include @@ -66,7 +66,9 @@ int ixgbe_setup_mac_link_X540(struct ixgbe_hw *hw, ixgbe_link_speed speed, * Resets the hardware by resetting the transmit and receive units, masks * and clears all interrupts, perform a PHY reset, and perform a link (MAC) * reset. - **/ + * + * Return: 0 on success or negative value on failure + */ int ixgbe_reset_hw_X540(struct ixgbe_hw *hw) { u32 swfw_mask = hw->phy.phy_semaphore_mask; @@ -133,10 +135,14 @@ int ixgbe_reset_hw_X540(struct ixgbe_hw *hw) hw->mac.num_rar_entries = IXGBE_X540_MAX_TX_QUEUES; hw->mac.ops.init_rx_addrs(hw); + /* The following is not supported by E610. */ + if (hw->mac.type == ixgbe_mac_e610) + return status; + /* Store the permanent SAN mac address */ hw->mac.ops.get_san_mac_addr(hw, hw->mac.san_addr); - /* Add the SAN MAC address to the RAR only if it's a valid address */ + /* Add the SAN MAC address to RAR if it's a valid address */ if (is_valid_ether_addr(hw->mac.san_addr)) { /* Save the SAN MAC RAR index */ hw->mac.san_mac_rar_index = hw->mac.num_rar_entries - 1; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c index 1de05443d3a28..277ceaf8a7939 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c @@ -1,7 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright(c) 1999 - 2018 Intel Corporation. */ +/* Copyright(c) 1999 - 2024 Intel Corporation. */ #include "ixgbe_x540.h" +#include "ixgbe_x550.h" #include "ixgbe_type.h" #include "ixgbe_common.h" #include "ixgbe_mbx.h" @@ -2770,9 +2771,9 @@ static int ixgbe_led_off_t_x550em(struct ixgbe_hw *hw, u32 led_idx) * semaphore, -EIO when command fails or -ENIVAL when incorrect * params passed. **/ -static int ixgbe_set_fw_drv_ver_x550(struct ixgbe_hw *hw, u8 maj, u8 min, - u8 build, u8 sub, u16 len, - const char *driver_ver) +int ixgbe_set_fw_drv_ver_x550(struct ixgbe_hw *hw, u8 maj, u8 min, + u8 build, u8 sub, u16 len, + const char *driver_ver) { struct ixgbe_hic_drv_info2 fw_cmd; int ret_val; @@ -3511,8 +3512,8 @@ static int ixgbe_reset_hw_X550em(struct ixgbe_hw *hw) * @enable: enable or disable switch for Ethertype anti-spoofing * @vf: Virtual Function pool - VF Pool to set for Ethertype anti-spoofing **/ -static void ixgbe_set_ethertype_anti_spoofing_x550(struct ixgbe_hw *hw, - bool enable, int vf) +void ixgbe_set_ethertype_anti_spoofing_x550(struct ixgbe_hw *hw, + bool enable, int vf) { int vf_target_reg = vf >> 3; int vf_target_shift = vf % 8 + IXGBE_SPOOF_ETHERTYPEAS_SHIFT; @@ -3532,9 +3533,9 @@ static void ixgbe_set_ethertype_anti_spoofing_x550(struct ixgbe_hw *hw, * @enable: enable or disable source address pruning * @pool: Rx pool to set source address pruning for **/ -static void ixgbe_set_source_address_pruning_x550(struct ixgbe_hw *hw, - bool enable, - unsigned int pool) +void ixgbe_set_source_address_pruning_x550(struct ixgbe_hw *hw, + bool enable, + unsigned int pool) { u64 pfflp; @@ -4047,7 +4048,7 @@ static const u32 ixgbe_mvals_X550EM_x[IXGBE_MVALS_IDX_LIMIT] = { IXGBE_MVALS_INIT(X550EM_x) }; -static const u32 ixgbe_mvals_x550em_a[IXGBE_MVALS_IDX_LIMIT] = { +const u32 ixgbe_mvals_x550em_a[IXGBE_MVALS_IDX_LIMIT] = { IXGBE_MVALS_INIT(X550EM_a) }; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.h new file mode 100644 index 0000000000000..3e4092f8da3eb --- /dev/null +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2024 Intel Corporation. */ + +#ifndef _IXGBE_X550_H_ +#define _IXGBE_X550_H_ + +#include "ixgbe_type.h" + +extern const u32 ixgbe_mvals_x550em_a[IXGBE_MVALS_IDX_LIMIT]; + +int ixgbe_set_fw_drv_ver_x550(struct ixgbe_hw *hw, u8 maj, u8 min, + u8 build, u8 sub, u16 len, + const char *driver_ver); +void ixgbe_set_source_address_pruning_x550(struct ixgbe_hw *hw, + bool enable, + unsigned int pool); +void ixgbe_set_ethertype_anti_spoofing_x550(struct ixgbe_hw *hw, + bool enable, int vf); + +#endif /* _IXGBE_X550_H_ */ From 208fff3f567e2a3c3e7e4788845e90245c3891b4 Mon Sep 17 00:00:00 2001 From: Piotr Kwapulinski Date: Wed, 18 Dec 2024 14:12:37 +0100 Subject: [PATCH 0867/1450] PCI: Add PCI_VDEVICE_SUB helper macro PCI_VDEVICE_SUB generates the pci_device_id struct layout for the specific PCI device/subdevice. Private data may follow the output. Reviewed-by: Przemek Kitszel Signed-off-by: Piotr Kwapulinski Acked-by: Bjorn Helgaas Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- include/linux/pci.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/linux/pci.h b/include/linux/pci.h index db9b47ce3eefd..414ee5fff66bc 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1046,6 +1046,20 @@ struct pci_driver { .vendor = PCI_VENDOR_ID_##vend, .device = (dev), \ .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID, 0, 0 +/** + * PCI_VDEVICE_SUB - describe a specific PCI device/subdevice in a short form + * @vend: the vendor name + * @dev: the 16 bit PCI Device ID + * @subvend: the 16 bit PCI Subvendor ID + * @subdev: the 16 bit PCI Subdevice ID + * + * Generate the pci_device_id struct layout for the specific PCI + * device/subdevice. Private data may follow the output. + */ +#define PCI_VDEVICE_SUB(vend, dev, subvend, subdev) \ + .vendor = PCI_VENDOR_ID_##vend, .device = (dev), \ + .subvendor = (subvend), .subdevice = (subdev), 0, 0 + /** * PCI_DEVICE_DATA - macro used to describe a specific PCI device in very short form * @vend: the vendor name (without PCI_VENDOR_ID_ prefix) From 4c44b450c69b676955c2790dcf467c1f969d80f1 Mon Sep 17 00:00:00 2001 From: Piotr Kwapulinski Date: Wed, 18 Dec 2024 14:12:38 +0100 Subject: [PATCH 0868/1450] ixgbevf: Add support for Intel(R) E610 device Add support for Intel(R) E610 Series of network devices. The E610 is based on X550 but adds firmware managed link, enhanced security capabilities and support for updated server manageability Reviewed-by: Przemek Kitszel Signed-off-by: Piotr Kwapulinski Reviewed-by: Simon Horman Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbevf/defines.h | 5 ++++- drivers/net/ethernet/intel/ixgbevf/ixgbevf.h | 6 +++++- drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 12 ++++++++++-- drivers/net/ethernet/intel/ixgbevf/vf.c | 12 +++++++++++- drivers/net/ethernet/intel/ixgbevf/vf.h | 4 +++- 5 files changed, 33 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/defines.h b/drivers/net/ethernet/intel/ixgbevf/defines.h index 5f08779c0e4e3..a9bc96f6399dc 100644 --- a/drivers/net/ethernet/intel/ixgbevf/defines.h +++ b/drivers/net/ethernet/intel/ixgbevf/defines.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright(c) 1999 - 2018 Intel Corporation. */ +/* Copyright(c) 1999 - 2024 Intel Corporation. */ #ifndef _IXGBEVF_DEFINES_H_ #define _IXGBEVF_DEFINES_H_ @@ -16,6 +16,9 @@ #define IXGBE_DEV_ID_X550_VF_HV 0x1564 #define IXGBE_DEV_ID_X550EM_X_VF_HV 0x15A9 +#define IXGBE_DEV_ID_E610_VF 0x57AD +#define IXGBE_SUBDEV_ID_E610_VF_HV 0x00FF + #define IXGBE_VF_IRQ_CLEAR_MASK 7 #define IXGBE_VF_MAX_TX_QUEUES 8 #define IXGBE_VF_MAX_RX_QUEUES 8 diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h index 130cb868774c4..9b37f354d78ce 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright(c) 1999 - 2018 Intel Corporation. */ +/* Copyright(c) 1999 - 2024 Intel Corporation. */ #ifndef _IXGBEVF_H_ #define _IXGBEVF_H_ @@ -418,6 +418,8 @@ enum ixgbevf_boards { board_X550EM_x_vf, board_X550EM_x_vf_hv, board_x550em_a_vf, + board_e610_vf, + board_e610_vf_hv, }; enum ixgbevf_xcast_modes { @@ -434,11 +436,13 @@ extern const struct ixgbevf_info ixgbevf_X550EM_x_vf_info; extern const struct ixgbe_mbx_operations ixgbevf_mbx_ops; extern const struct ixgbe_mbx_operations ixgbevf_mbx_ops_legacy; extern const struct ixgbevf_info ixgbevf_x550em_a_vf_info; +extern const struct ixgbevf_info ixgbevf_e610_vf_info; extern const struct ixgbevf_info ixgbevf_82599_vf_hv_info; extern const struct ixgbevf_info ixgbevf_X540_vf_hv_info; extern const struct ixgbevf_info ixgbevf_X550_vf_hv_info; extern const struct ixgbevf_info ixgbevf_X550EM_x_vf_hv_info; +extern const struct ixgbevf_info ixgbevf_e610_vf_hv_info; extern const struct ixgbe_mbx_operations ixgbevf_hv_mbx_ops; /* needed by ethtool.c */ diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 149911e3002a2..2829bac9af949 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright(c) 1999 - 2018 Intel Corporation. */ +/* Copyright(c) 1999 - 2024 Intel Corporation. */ /****************************************************************************** Copyright (c)2006 - 2007 Myricom, Inc. for some LRO specific code @@ -39,7 +39,7 @@ static const char ixgbevf_driver_string[] = "Intel(R) 10 Gigabit PCI Express Virtual Function Network Driver"; static char ixgbevf_copyright[] = - "Copyright (c) 2009 - 2018 Intel Corporation."; + "Copyright (c) 2009 - 2024 Intel Corporation."; static const struct ixgbevf_info *ixgbevf_info_tbl[] = { [board_82599_vf] = &ixgbevf_82599_vf_info, @@ -51,6 +51,8 @@ static const struct ixgbevf_info *ixgbevf_info_tbl[] = { [board_X550EM_x_vf] = &ixgbevf_X550EM_x_vf_info, [board_X550EM_x_vf_hv] = &ixgbevf_X550EM_x_vf_hv_info, [board_x550em_a_vf] = &ixgbevf_x550em_a_vf_info, + [board_e610_vf] = &ixgbevf_e610_vf_info, + [board_e610_vf_hv] = &ixgbevf_e610_vf_hv_info, }; /* ixgbevf_pci_tbl - PCI Device ID Table @@ -71,6 +73,9 @@ static const struct pci_device_id ixgbevf_pci_tbl[] = { {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X550EM_X_VF), board_X550EM_x_vf }, {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X550EM_X_VF_HV), board_X550EM_x_vf_hv}, {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_X550EM_A_VF), board_x550em_a_vf }, + {PCI_VDEVICE_SUB(INTEL, IXGBE_DEV_ID_E610_VF, PCI_ANY_ID, + IXGBE_SUBDEV_ID_E610_VF_HV), board_e610_vf_hv}, + {PCI_VDEVICE(INTEL, IXGBE_DEV_ID_E610_VF), board_e610_vf}, /* required last entry */ {0, } }; @@ -4693,6 +4698,9 @@ static int ixgbevf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) case ixgbe_mac_X540_vf: dev_info(&pdev->dev, "Intel(R) X540 Virtual Function\n"); break; + case ixgbe_mac_e610_vf: + dev_info(&pdev->dev, "Intel(R) E610 Virtual Function\n"); + break; case ixgbe_mac_82599_vf: default: dev_info(&pdev->dev, "Intel(R) 82599 Virtual Function\n"); diff --git a/drivers/net/ethernet/intel/ixgbevf/vf.c b/drivers/net/ethernet/intel/ixgbevf/vf.c index 1641d00d8ed35..da7a72ecce7a2 100644 --- a/drivers/net/ethernet/intel/ixgbevf/vf.c +++ b/drivers/net/ethernet/intel/ixgbevf/vf.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright(c) 1999 - 2018 Intel Corporation. */ +/* Copyright(c) 1999 - 2024 Intel Corporation. */ #include "vf.h" #include "ixgbevf.h" @@ -1076,3 +1076,13 @@ const struct ixgbevf_info ixgbevf_x550em_a_vf_info = { .mac = ixgbe_mac_x550em_a_vf, .mac_ops = &ixgbevf_mac_ops, }; + +const struct ixgbevf_info ixgbevf_e610_vf_info = { + .mac = ixgbe_mac_e610_vf, + .mac_ops = &ixgbevf_mac_ops, +}; + +const struct ixgbevf_info ixgbevf_e610_vf_hv_info = { + .mac = ixgbe_mac_e610_vf, + .mac_ops = &ixgbevf_hv_mac_ops, +}; diff --git a/drivers/net/ethernet/intel/ixgbevf/vf.h b/drivers/net/ethernet/intel/ixgbevf/vf.h index b4eef5b6c172b..2d791bc26ae4e 100644 --- a/drivers/net/ethernet/intel/ixgbevf/vf.h +++ b/drivers/net/ethernet/intel/ixgbevf/vf.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright(c) 1999 - 2018 Intel Corporation. */ +/* Copyright(c) 1999 - 2024 Intel Corporation. */ #ifndef __IXGBE_VF_H__ #define __IXGBE_VF_H__ @@ -54,6 +54,8 @@ enum ixgbe_mac_type { ixgbe_mac_X550_vf, ixgbe_mac_X550EM_x_vf, ixgbe_mac_x550em_a_vf, + ixgbe_mac_e610, + ixgbe_mac_e610_vf, ixgbe_num_macs }; From 246068b86b1c36e4590388ab8f278e21f1997dc1 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 19 Dec 2024 17:54:10 +0200 Subject: [PATCH 0869/1450] selftests: net: local_termination: require mausezahn Since the blamed commit, we require mausezahn because send_raw() uses it. Remove the "REQUIRE_MZ=no" line, which overwrites the default of requiring it. Fixes: 237979504264 ("selftests: net: local_termination: add PTP frames to the mix") Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20241219155410.1856868-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/local_termination.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh index c35548767756d..ecd34f364125c 100755 --- a/tools/testing/selftests/net/forwarding/local_termination.sh +++ b/tools/testing/selftests/net/forwarding/local_termination.sh @@ -7,7 +7,6 @@ ALL_TESTS="standalone vlan_unaware_bridge vlan_aware_bridge test_vlan \ NUM_NETIFS=2 PING_COUNT=1 REQUIRE_MTOOLS=yes -REQUIRE_MZ=no source lib.sh From 4a25201aa46ce88e8e31f9ccdec0e4e3dd6bb736 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 18 Dec 2024 19:28:32 -0800 Subject: [PATCH 0870/1450] netdev-genl: avoid empty messages in napi get Empty netlink responses from do() are not correct (as opposed to dump() where not dumping anything is perfectly fine). We should return an error if the target object does not exist, in this case if the netdev is down we "hide" the NAPI instances. Fixes: 27f91aaf49b3 ("netdev-genl: Add netlink framework functions for napi") Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241219032833.1165433-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/netdev-genl.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 2d3ae0cd3ad25..b0772d135efbe 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -246,8 +246,12 @@ int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info) rcu_read_unlock(); rtnl_unlock(); - if (err) + if (err) { + goto err_free_msg; + } else if (!rsp->len) { + err = -ENOENT; goto err_free_msg; + } return genlmsg_reply(rsp, info); From 30b981796b94b083da8fdded7cb74cb493608760 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 18 Dec 2024 19:28:33 -0800 Subject: [PATCH 0871/1450] selftests: drv-net: test empty queue and NAPI responses in netlink Make sure kernel doesn't respond to GETs for queues and NAPIs when link is down. Not with valid data, or with empty message, we want a ENOENT. Link: https://patch.msgid.link/20241219032833.1165433-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/queues.py | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/drivers/net/queues.py b/tools/testing/selftests/drivers/net/queues.py index 9c5473abbd784..38303da957ee5 100755 --- a/tools/testing/selftests/drivers/net/queues.py +++ b/tools/testing/selftests/drivers/net/queues.py @@ -1,10 +1,12 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0 -from lib.py import ksft_run, ksft_exit, ksft_eq, KsftSkipEx -from lib.py import EthtoolFamily, NetdevFamily +from lib.py import ksft_disruptive, ksft_exit, ksft_run +from lib.py import ksft_eq, ksft_raises, KsftSkipEx +from lib.py import EthtoolFamily, NetdevFamily, NlError from lib.py import NetDrvEnv -from lib.py import cmd +from lib.py import cmd, defer, ip +import errno import glob @@ -59,9 +61,27 @@ def addremove_queues(cfg, nl) -> None: ksft_eq(queues, expected) +@ksft_disruptive +def check_down(cfg, nl) -> None: + # Check the NAPI IDs before interface goes down and hides them + napis = nl.napi_get({'ifindex': cfg.ifindex}, dump=True) + + ip(f"link set dev {cfg.dev['ifname']} down") + defer(ip, f"link set dev {cfg.dev['ifname']} up") + + with ksft_raises(NlError) as cm: + nl.queue_get({'ifindex': cfg.ifindex, 'id': 0, 'type': 'rx'}) + ksft_eq(cm.exception.nl_msg.error, -errno.ENOENT) + + if napis: + with ksft_raises(NlError) as cm: + nl.napi_get({'id': napis[0]['id']}) + ksft_eq(cm.exception.nl_msg.error, -errno.ENOENT) + + def main() -> None: with NetDrvEnv(__file__, queue_count=100) as cfg: - ksft_run([get_queues, addremove_queues], args=(cfg, NetdevFamily())) + ksft_run([get_queues, addremove_queues, check_down], args=(cfg, NetdevFamily())) ksft_exit() From a574fe14ed1e496deb8ee6821029ed96591021e8 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Wed, 18 Dec 2024 16:33:38 +0000 Subject: [PATCH 0872/1450] net: hisilicon: hns: Remove unused hns_dsaf_roce_reset hns_dsaf_roce_reset() has been unused since 2021's commit 38d220882426 ("RDMA/hns: Remove support for HIP06") Remove it. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Jijie Shao Link: https://patch.msgid.link/20241218163341.40297-2-linux@treblig.org Signed-off-by: Jakub Kicinski --- .../ethernet/hisilicon/hns/hns_dsaf_main.c | 109 ------------------ .../ethernet/hisilicon/hns/hns_dsaf_main.h | 2 - 2 files changed, 111 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c index 851490346261e..6b6ced37e490e 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c @@ -3019,115 +3019,6 @@ static struct platform_driver g_dsaf_driver = { module_platform_driver(g_dsaf_driver); -/** - * hns_dsaf_roce_reset - reset dsaf and roce - * @dsaf_fwnode: Pointer to framework node for the dasf - * @dereset: false - request reset , true - drop reset - * return 0 - success , negative -fail - */ -int hns_dsaf_roce_reset(struct fwnode_handle *dsaf_fwnode, bool dereset) -{ - struct dsaf_device *dsaf_dev; - struct platform_device *pdev; - u32 mp; - u32 sl; - u32 credit; - int i; - static const u32 port_map[DSAF_ROCE_CREDIT_CHN][DSAF_ROCE_CHAN_MODE_NUM] = { - {DSAF_ROCE_PORT_0, DSAF_ROCE_PORT_0, DSAF_ROCE_PORT_0}, - {DSAF_ROCE_PORT_1, DSAF_ROCE_PORT_0, DSAF_ROCE_PORT_0}, - {DSAF_ROCE_PORT_2, DSAF_ROCE_PORT_1, DSAF_ROCE_PORT_0}, - {DSAF_ROCE_PORT_3, DSAF_ROCE_PORT_1, DSAF_ROCE_PORT_0}, - {DSAF_ROCE_PORT_4, DSAF_ROCE_PORT_2, DSAF_ROCE_PORT_1}, - {DSAF_ROCE_PORT_4, DSAF_ROCE_PORT_2, DSAF_ROCE_PORT_1}, - {DSAF_ROCE_PORT_5, DSAF_ROCE_PORT_3, DSAF_ROCE_PORT_1}, - {DSAF_ROCE_PORT_5, DSAF_ROCE_PORT_3, DSAF_ROCE_PORT_1}, - }; - static const u32 sl_map[DSAF_ROCE_CREDIT_CHN][DSAF_ROCE_CHAN_MODE_NUM] = { - {DSAF_ROCE_SL_0, DSAF_ROCE_SL_0, DSAF_ROCE_SL_0}, - {DSAF_ROCE_SL_0, DSAF_ROCE_SL_1, DSAF_ROCE_SL_1}, - {DSAF_ROCE_SL_0, DSAF_ROCE_SL_0, DSAF_ROCE_SL_2}, - {DSAF_ROCE_SL_0, DSAF_ROCE_SL_1, DSAF_ROCE_SL_3}, - {DSAF_ROCE_SL_0, DSAF_ROCE_SL_0, DSAF_ROCE_SL_0}, - {DSAF_ROCE_SL_1, DSAF_ROCE_SL_1, DSAF_ROCE_SL_1}, - {DSAF_ROCE_SL_0, DSAF_ROCE_SL_0, DSAF_ROCE_SL_2}, - {DSAF_ROCE_SL_1, DSAF_ROCE_SL_1, DSAF_ROCE_SL_3}, - }; - - /* find the platform device corresponding to fwnode */ - if (is_of_node(dsaf_fwnode)) { - pdev = of_find_device_by_node(to_of_node(dsaf_fwnode)); - } else if (is_acpi_device_node(dsaf_fwnode)) { - pdev = hns_dsaf_find_platform_device(dsaf_fwnode); - } else { - pr_err("fwnode is neither OF or ACPI type\n"); - return -EINVAL; - } - - /* check if we were a success in fetching pdev */ - if (!pdev) { - pr_err("couldn't find platform device for node\n"); - return -ENODEV; - } - - /* retrieve the dsaf_device from the driver data */ - dsaf_dev = dev_get_drvdata(&pdev->dev); - if (!dsaf_dev) { - dev_err(&pdev->dev, "dsaf_dev is NULL\n"); - put_device(&pdev->dev); - return -ENODEV; - } - - /* now, make sure we are running on compatible SoC */ - if (AE_IS_VER1(dsaf_dev->dsaf_ver)) { - dev_err(dsaf_dev->dev, "%s v1 chip doesn't support RoCE!\n", - dsaf_dev->ae_dev.name); - put_device(&pdev->dev); - return -ENODEV; - } - - /* do reset or de-reset according to the flag */ - if (!dereset) { - /* reset rocee-channels in dsaf and rocee */ - dsaf_dev->misc_op->hns_dsaf_srst_chns(dsaf_dev, DSAF_CHNS_MASK, - false); - dsaf_dev->misc_op->hns_dsaf_roce_srst(dsaf_dev, false); - } else { - /* configure dsaf tx roce correspond to port map and sl map */ - mp = dsaf_read_dev(dsaf_dev, DSAF_ROCE_PORT_MAP_REG); - for (i = 0; i < DSAF_ROCE_CREDIT_CHN; i++) - dsaf_set_field(mp, 7 << i * 3, i * 3, - port_map[i][DSAF_ROCE_6PORT_MODE]); - dsaf_set_field(mp, 3 << i * 3, i * 3, 0); - dsaf_write_dev(dsaf_dev, DSAF_ROCE_PORT_MAP_REG, mp); - - sl = dsaf_read_dev(dsaf_dev, DSAF_ROCE_SL_MAP_REG); - for (i = 0; i < DSAF_ROCE_CREDIT_CHN; i++) - dsaf_set_field(sl, 3 << i * 2, i * 2, - sl_map[i][DSAF_ROCE_6PORT_MODE]); - dsaf_write_dev(dsaf_dev, DSAF_ROCE_SL_MAP_REG, sl); - - /* de-reset rocee-channels in dsaf and rocee */ - dsaf_dev->misc_op->hns_dsaf_srst_chns(dsaf_dev, DSAF_CHNS_MASK, - true); - msleep(SRST_TIME_INTERVAL); - dsaf_dev->misc_op->hns_dsaf_roce_srst(dsaf_dev, true); - - /* enable dsaf channel rocee credit */ - credit = dsaf_read_dev(dsaf_dev, DSAF_SBM_ROCEE_CFG_REG_REG); - dsaf_set_bit(credit, DSAF_SBM_ROCEE_CFG_CRD_EN_B, 0); - dsaf_write_dev(dsaf_dev, DSAF_SBM_ROCEE_CFG_REG_REG, credit); - - dsaf_set_bit(credit, DSAF_SBM_ROCEE_CFG_CRD_EN_B, 1); - dsaf_write_dev(dsaf_dev, DSAF_SBM_ROCEE_CFG_REG_REG, credit); - } - - put_device(&pdev->dev); - - return 0; -} -EXPORT_SYMBOL(hns_dsaf_roce_reset); - MODULE_LICENSE("GPL"); MODULE_AUTHOR("Huawei Tech. Co., Ltd."); MODULE_DESCRIPTION("HNS DSAF driver"); diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.h b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.h index 0eb03dff1a8bf..c90f41c75500d 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.h +++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.h @@ -463,6 +463,4 @@ int hns_dsaf_clr_mac_mc_port(struct dsaf_device *dsaf_dev, u8 mac_id, u8 port_num); int hns_dsaf_wait_pkt_clean(struct dsaf_device *dsaf_dev, int port); -int hns_dsaf_roce_reset(struct fwnode_handle *dsaf_fwnode, bool dereset); - #endif /* __HNS_DSAF_MAIN_H__ */ From 0265e9edf2100735304907e9979a9264c4dc7b5e Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Wed, 18 Dec 2024 16:33:39 +0000 Subject: [PATCH 0873/1450] net: hisilicon: hns: Remove unused hns_rcb_start hns_rcb_start() has been unused since 2016's commit 454784d85de3 ("net: hns: delete redundancy ring enable operations") Remove it. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Jijie Shao Link: https://patch.msgid.link/20241218163341.40297-3-linux@treblig.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.c | 5 ----- drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.h | 1 - 2 files changed, 6 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.c b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.c index 46af467aa5963..635b3a95dd82a 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.c @@ -195,11 +195,6 @@ void hns_rcb_ring_enable_hw(struct hnae_queue *q, u32 val) dsaf_write_dev(q, RCB_RING_PREFETCH_EN_REG, !!val); } -void hns_rcb_start(struct hnae_queue *q, u32 val) -{ - hns_rcb_ring_enable_hw(q, val); -} - /** *hns_rcb_common_init_commit_hw - make rcb common init completed *@rcb_common: rcb common device diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.h b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.h index 0f4cc184ef390..68f81547dfb45 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.h +++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.h @@ -116,7 +116,6 @@ int hns_rcb_buf_size2type(u32 buf_size); int hns_rcb_common_get_cfg(struct dsaf_device *dsaf_dev, int comm_index); void hns_rcb_common_free_cfg(struct dsaf_device *dsaf_dev, u32 comm_index); int hns_rcb_common_init_hw(struct rcb_common_cb *rcb_common); -void hns_rcb_start(struct hnae_queue *q, u32 val); int hns_rcb_get_cfg(struct rcb_common_cb *rcb_common); void hns_rcb_get_queue_mode(enum dsaf_mode dsaf_mode, u16 *max_vfn, u16 *max_q_per_vf); From 0198b459f54e5813da4f6c36c867822401abc0c8 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Wed, 18 Dec 2024 16:33:40 +0000 Subject: [PATCH 0874/1450] net: hisilicon: hns: Remove reset helpers With hns_dsaf_roce_reset() removed in a previous patch, the two helper member pointers, 'hns_dsaf_roce_srst', and 'hns_dsaf_srst_chns' are now unread. Remove them, and the helper functions that they were initialised to, that is hns_dsaf_srst_chns(), hns_dsaf_srst_chns_acpi(), hns_dsaf_roce_srst() and hns_dsaf_roce_srst_acpi(). Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Jijie Shao Link: https://patch.msgid.link/20241218163341.40297-4-linux@treblig.org Signed-off-by: Jakub Kicinski --- .../ethernet/hisilicon/hns/hns_dsaf_main.h | 3 - .../ethernet/hisilicon/hns/hns_dsaf_misc.c | 67 ------------------- 2 files changed, 70 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.h b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.h index c90f41c75500d..bb8267aafc434 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.h +++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.h @@ -307,9 +307,6 @@ struct dsaf_misc_op { void (*ge_srst)(struct dsaf_device *dsaf_dev, u32 port, bool dereset); void (*ppe_srst)(struct dsaf_device *dsaf_dev, u32 port, bool dereset); void (*ppe_comm_srst)(struct dsaf_device *dsaf_dev, bool dereset); - void (*hns_dsaf_srst_chns)(struct dsaf_device *dsaf_dev, u32 msk, - bool dereset); - void (*hns_dsaf_roce_srst)(struct dsaf_device *dsaf_dev, bool dereset); phy_interface_t (*get_phy_if)(struct hns_mac_cb *mac_cb); int (*get_sfp_prsnt)(struct hns_mac_cb *mac_cb, int *sfp_prsnt); diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c index 5df19c604d09a..91391a49fcea2 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c @@ -326,69 +326,6 @@ static void hns_dsaf_xge_srst_by_port_acpi(struct dsaf_device *dsaf_dev, HNS_XGE_RESET_FUNC, port, dereset); } -/** - * hns_dsaf_srst_chns - reset dsaf channels - * @dsaf_dev: dsaf device struct pointer - * @msk: xbar channels mask value: - * @dereset: false - request reset , true - drop reset - * - * bit0-5 for xge0-5 - * bit6-11 for ppe0-5 - * bit12-17 for roce0-5 - * bit18-19 for com/dfx - */ -static void -hns_dsaf_srst_chns(struct dsaf_device *dsaf_dev, u32 msk, bool dereset) -{ - u32 reg_addr; - - if (!dereset) - reg_addr = DSAF_SUB_SC_DSAF_RESET_REQ_REG; - else - reg_addr = DSAF_SUB_SC_DSAF_RESET_DREQ_REG; - - dsaf_write_sub(dsaf_dev, reg_addr, msk); -} - -/** - * hns_dsaf_srst_chns_acpi - reset dsaf channels - * @dsaf_dev: dsaf device struct pointer - * @msk: xbar channels mask value: - * @dereset: false - request reset , true - drop reset - * - * bit0-5 for xge0-5 - * bit6-11 for ppe0-5 - * bit12-17 for roce0-5 - * bit18-19 for com/dfx - */ -static void -hns_dsaf_srst_chns_acpi(struct dsaf_device *dsaf_dev, u32 msk, bool dereset) -{ - hns_dsaf_acpi_srst_by_port(dsaf_dev, HNS_OP_RESET_FUNC, - HNS_DSAF_CHN_RESET_FUNC, - msk, dereset); -} - -static void hns_dsaf_roce_srst(struct dsaf_device *dsaf_dev, bool dereset) -{ - if (!dereset) { - dsaf_write_sub(dsaf_dev, DSAF_SUB_SC_ROCEE_RESET_REQ_REG, 1); - } else { - dsaf_write_sub(dsaf_dev, - DSAF_SUB_SC_ROCEE_CLK_DIS_REG, 1); - dsaf_write_sub(dsaf_dev, - DSAF_SUB_SC_ROCEE_RESET_DREQ_REG, 1); - msleep(20); - dsaf_write_sub(dsaf_dev, DSAF_SUB_SC_ROCEE_CLK_EN_REG, 1); - } -} - -static void hns_dsaf_roce_srst_acpi(struct dsaf_device *dsaf_dev, bool dereset) -{ - hns_dsaf_acpi_srst_by_port(dsaf_dev, HNS_OP_RESET_FUNC, - HNS_ROCE_RESET_FUNC, 0, dereset); -} - static void hns_dsaf_ge_srst_by_port(struct dsaf_device *dsaf_dev, u32 port, bool dereset) { @@ -729,8 +666,6 @@ struct dsaf_misc_op *hns_misc_op_get(struct dsaf_device *dsaf_dev) misc_op->ge_srst = hns_dsaf_ge_srst_by_port; misc_op->ppe_srst = hns_ppe_srst_by_port; misc_op->ppe_comm_srst = hns_ppe_com_srst; - misc_op->hns_dsaf_srst_chns = hns_dsaf_srst_chns; - misc_op->hns_dsaf_roce_srst = hns_dsaf_roce_srst; misc_op->get_phy_if = hns_mac_get_phy_if; misc_op->get_sfp_prsnt = hns_mac_get_sfp_prsnt; @@ -746,8 +681,6 @@ struct dsaf_misc_op *hns_misc_op_get(struct dsaf_device *dsaf_dev) misc_op->ge_srst = hns_dsaf_ge_srst_by_port_acpi; misc_op->ppe_srst = hns_ppe_srst_by_port_acpi; misc_op->ppe_comm_srst = hns_ppe_com_srst; - misc_op->hns_dsaf_srst_chns = hns_dsaf_srst_chns_acpi; - misc_op->hns_dsaf_roce_srst = hns_dsaf_roce_srst_acpi; misc_op->get_phy_if = hns_mac_get_phy_if_acpi; misc_op->get_sfp_prsnt = hns_mac_get_sfp_prsnt_acpi; From 8973ce189376e41ddd398fab0ba8dc8b14e50cd0 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Wed, 18 Dec 2024 16:33:41 +0000 Subject: [PATCH 0875/1450] net: hisilicon: hns: Remove unused enums The enums dsaf_roce_port_mode, dsaf_roce_port_num and dsaf_roce_qos_sl are unused after the removal of the reset code. Remove them. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Jijie Shao Link: https://patch.msgid.link/20241218163341.40297-5-linux@treblig.org Signed-off-by: Jakub Kicinski --- .../ethernet/hisilicon/hns/hns_dsaf_main.h | 23 ------------------- 1 file changed, 23 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.h b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.h index bb8267aafc434..653dfbb25d1b5 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.h +++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.h @@ -42,29 +42,6 @@ struct hns_mac_cb; #define HNS_MAX_WAIT_CNT 10000 -enum dsaf_roce_port_mode { - DSAF_ROCE_6PORT_MODE, - DSAF_ROCE_4PORT_MODE, - DSAF_ROCE_2PORT_MODE, - DSAF_ROCE_CHAN_MODE_NUM, -}; - -enum dsaf_roce_port_num { - DSAF_ROCE_PORT_0, - DSAF_ROCE_PORT_1, - DSAF_ROCE_PORT_2, - DSAF_ROCE_PORT_3, - DSAF_ROCE_PORT_4, - DSAF_ROCE_PORT_5, -}; - -enum dsaf_roce_qos_sl { - DSAF_ROCE_SL_0, - DSAF_ROCE_SL_1, - DSAF_ROCE_SL_2, - DSAF_ROCE_SL_3, -}; - #define DSAF_STATS_READ(p, offset) (*((u64 *)((u8 *)(p) + (offset)))) #define HNS_DSAF_IS_DEBUG(dev) ((dev)->dsaf_mode == DSAF_MODE_DISABLE_SP) From 05dd04b218f42c57a14e330fd8583995f141ed6b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 19 Dec 2024 15:03:30 +0000 Subject: [PATCH 0876/1450] inetpeer: avoid false sharing in inet_peer_xrlim_allow() Under DOS, inet_peer_xrlim_allow() might be called millions of times per second from different cpus. Make sure to write over peer->rate_tokens and peer->rate_last only when really needed. Note the inherent races of this function are still there, we do not care of precise ICMP rate limiting. Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20241219150330.3159027-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/inetpeer.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index e02484f4d22b8..b8b23a77ceb4f 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -246,23 +246,27 @@ void inet_putpeer(struct inet_peer *p) #define XRLIM_BURST_FACTOR 6 bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout) { - unsigned long now, token; + unsigned long now, token, otoken, delta; bool rc = false; if (!peer) return true; - token = peer->rate_tokens; + token = otoken = READ_ONCE(peer->rate_tokens); now = jiffies; - token += now - peer->rate_last; - peer->rate_last = now; - if (token > XRLIM_BURST_FACTOR * timeout) - token = XRLIM_BURST_FACTOR * timeout; + delta = now - READ_ONCE(peer->rate_last); + if (delta) { + WRITE_ONCE(peer->rate_last, now); + token += delta; + if (token > XRLIM_BURST_FACTOR * timeout) + token = XRLIM_BURST_FACTOR * timeout; + } if (token >= timeout) { token -= timeout; rc = true; } - peer->rate_tokens = token; + if (token != otoken) + WRITE_ONCE(peer->rate_tokens, token); return rc; } EXPORT_SYMBOL(inet_peer_xrlim_allow); From 973b710b8821c3401ad7a25360c89e94b26884ac Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 13 Dec 2024 13:50:01 +0000 Subject: [PATCH 0877/1450] kheaders: Ignore silly-rename files Tell tar to ignore silly-rename files (".__afs*" and ".nfs*") when building the header archive. These occur when a file that is open is unlinked locally, but hasn't yet been closed. Such files are visible to the user via the getdents() syscall and so programs may want to do things with them. During the kernel build, such files may be made during the processing of header files and the cleanup may get deferred by fput() which may result in tar seeing these files when it reads the directory, but they may have disappeared by the time it tries to open them, causing tar to fail with an error. Further, we don't want to include them in the tarball if they still exist. With CONFIG_HEADERS_INSTALL=y, something like the following may be seen: find: './kernel/.tmp_cpio_dir/include/dt-bindings/reset/.__afs2080': No such file or directory tar: ./include/linux/greybus/.__afs3C95: File removed before we read it The find warning doesn't seem to cause a problem. Fix this by telling tar when called from in gen_kheaders.sh to exclude such files. This only affects afs and nfs; cifs uses the Windows Hidden attribute to prevent the file from being seen. Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241213135013.2964079-2-dhowells@redhat.com cc: Masahiro Yamada cc: Marc Dionne cc: linux-afs@lists.infradead.org cc: linux-nfs@vger.kernel.org cc: linux-kernel@vger.kernel.org Signed-off-by: Christian Brauner --- kernel/gen_kheaders.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 383fd43ac6122..7e1340da5acae 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -89,6 +89,7 @@ find $cpio_dir -type f -print0 | # Create archive and try to normalize metadata for reproducibility. tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \ + --exclude=".__afs*" --exclude=".nfs*" \ --owner=0 --group=0 --sort=name --numeric-owner --mode=u=rw,go=r,a+X \ -I $XZ -cf $tarfile -C $cpio_dir/ . > /dev/null From c8b90d40d5bba8e6fba457b8a7c10d3c0d467e37 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 13 Dec 2024 13:50:02 +0000 Subject: [PATCH 0878/1450] netfs: Fix non-contiguous donation between completed reads When a read subrequest finishes, if it doesn't have sufficient coverage to complete the folio(s) covering either side of it, it will donate the excess coverage to the adjacent subrequests on either side, offloading responsibility for unlocking the folio(s) covered to them. Now, preference is given to donating down to a lower file offset over donating up because that check is done first - but there's no check that the lower subreq is actually contiguous, and so we can end up donating incorrectly. The scenario seen[1] is that an 8MiB readahead request spanning four 2MiB folios is split into eight 1MiB subreqs (numbered 1 through 8). These terminate in the order 1,6,2,5,3,7,4,8. What happens is: - 1 donates to 2 - 6 donates to 5 - 2 completes, unlocking the first folio (with 1). - 5 completes, unlocking the third folio (with 6). - 3 donates to 4 - 7 donates to 4 incorrectly - 4 completes, unlocking the second folio (with 3), but can't use the excess from 7. - 8 donates to 4, also incorrectly. Fix this by preventing downward donation if the subreqs are not contiguous (in the example above, 7 donates to 4 across the gap left by 5 and 6). Reported-by: Shyam Prasad N Closes: https://lore.kernel.org/r/CANT5p=qBwjBm-D8soFVVtswGEfmMtQXVW83=TNfUtvyHeFQZBA@mail.gmail.com/ Signed-off-by: David Howells Link: https://lore.kernel.org/r/526707.1733224486@warthog.procyon.org.uk/ [1] Link: https://lore.kernel.org/r/20241213135013.2964079-3-dhowells@redhat.com cc: Steve French cc: Paulo Alcantara cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/read_collect.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index 3cbb289535a85..b415e39723361 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -247,16 +247,17 @@ static bool netfs_consume_read_data(struct netfs_io_subrequest *subreq, bool was /* Deal with the trickiest case: that this subreq is in the middle of a * folio, not touching either edge, but finishes first. In such a - * case, we donate to the previous subreq, if there is one, so that the - * donation is only handled when that completes - and remove this - * subreq from the list. + * case, we donate to the previous subreq, if there is one and if it is + * contiguous, so that the donation is only handled when that completes + * - and remove this subreq from the list. * * If the previous subreq finished first, we will have acquired their * donation and should be able to unlock folios and/or donate nextwards. */ if (!subreq->consumed && !prev_donated && - !list_is_first(&subreq->rreq_link, &rreq->subrequests)) { + !list_is_first(&subreq->rreq_link, &rreq->subrequests) && + subreq->start == prev->start + prev->len) { prev = list_prev_entry(subreq, rreq_link); WRITE_ONCE(prev->next_donated, prev->next_donated + subreq->len); subreq->start += subreq->len; From 105549d09a539a876b7c3330ab52d8aceedad358 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 13 Dec 2024 13:50:03 +0000 Subject: [PATCH 0879/1450] netfs: Fix enomem handling in buffered reads If netfs_read_to_pagecache() gets an error from either ->prepare_read() or from netfs_prepare_read_iterator(), it needs to decrement ->nr_outstanding, cancel the subrequest and break out of the issuing loop. Currently, it only does this for two of the cases, but there are two more that aren't handled. Fix this by moving the handling to a common place and jumping to it from all four places. This is in preference to inserting a wrapper around netfs_prepare_read_iterator() as proposed by Dmitry Antipov[1]. Link: https://lore.kernel.org/r/20241202093943.227786-1-dmantipov@yandex.ru/ [1] Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") Reported-by: syzbot+404b4b745080b6210c6c@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=404b4b745080b6210c6c Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241213135013.2964079-4-dhowells@redhat.com Tested-by: syzbot+404b4b745080b6210c6c@syzkaller.appspotmail.com cc: Dmitry Antipov cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/buffered_read.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 7ac34550c403c..4dc9b8286355e 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -275,22 +275,14 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) netfs_stat(&netfs_n_rh_download); if (rreq->netfs_ops->prepare_read) { ret = rreq->netfs_ops->prepare_read(subreq); - if (ret < 0) { - atomic_dec(&rreq->nr_outstanding); - netfs_put_subrequest(subreq, false, - netfs_sreq_trace_put_cancel); - break; - } + if (ret < 0) + goto prep_failed; trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); } slice = netfs_prepare_read_iterator(subreq); - if (slice < 0) { - atomic_dec(&rreq->nr_outstanding); - netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); - ret = slice; - break; - } + if (slice < 0) + goto prep_iter_failed; rreq->netfs_ops->issue_read(subreq); goto done; @@ -302,6 +294,8 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) trace_netfs_sreq(subreq, netfs_sreq_trace_submit); netfs_stat(&netfs_n_rh_zero); slice = netfs_prepare_read_iterator(subreq); + if (slice < 0) + goto prep_iter_failed; __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); netfs_read_subreq_terminated(subreq, 0, false); goto done; @@ -310,6 +304,8 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) if (source == NETFS_READ_FROM_CACHE) { trace_netfs_sreq(subreq, netfs_sreq_trace_submit); slice = netfs_prepare_read_iterator(subreq); + if (slice < 0) + goto prep_iter_failed; netfs_read_cache_to_pagecache(rreq, subreq); goto done; } @@ -318,6 +314,14 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) WARN_ON_ONCE(1); break; + prep_iter_failed: + ret = slice; + prep_failed: + subreq->error = ret; + atomic_dec(&rreq->nr_outstanding); + netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); + break; + done: size -= slice; start += slice; From 86ad1a58f6a9453f49e06ef957a40a8dac00a13f Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 13 Dec 2024 13:50:04 +0000 Subject: [PATCH 0880/1450] nfs: Fix oops in nfs_netfs_init_request() when copying to cache When netfslib wants to copy some data that has just been read on behalf of nfs, it creates a new write request and calls nfs_netfs_init_request() to initialise it, but with a NULL file pointer. This causes nfs_file_open_context() to oops - however, we don't actually need the nfs context as we're only going to write to the cache. Fix this by just returning if we aren't given a file pointer and emit a warning if the request was for something other than copy-to-cache. Further, fix nfs_netfs_free_request() so that it doesn't try to free the context if the pointer is NULL. Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") Reported-by: Max Kellermann Closes: https://lore.kernel.org/r/CAKPOu+9DyMbKLhyJb7aMLDTb=Fh0T8Teb9sjuf_pze+XWT1VaQ@mail.gmail.com/ Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241213135013.2964079-5-dhowells@redhat.com cc: Trond Myklebust cc: Anna Schumaker cc: Dave Wysochanski cc: Jeff Layton cc: linux-nfs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/nfs/fscache.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 810269ee0a50e..d49e4ce279994 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -263,6 +263,12 @@ int nfs_netfs_readahead(struct readahead_control *ractl) static atomic_t nfs_netfs_debug_id; static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *file) { + if (!file) { + if (WARN_ON_ONCE(rreq->origin != NETFS_PGPRIV2_COPY_TO_CACHE)) + return -EIO; + return 0; + } + rreq->netfs_priv = get_nfs_open_context(nfs_file_open_context(file)); rreq->debug_id = atomic_inc_return(&nfs_netfs_debug_id); /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */ @@ -274,7 +280,8 @@ static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *fi static void nfs_netfs_free_request(struct netfs_io_request *rreq) { - put_nfs_open_context(rreq->netfs_priv); + if (rreq->netfs_priv) + put_nfs_open_context(rreq->netfs_priv); } static struct nfs_netfs_io_data *nfs_netfs_alloc(struct netfs_io_subrequest *sreq) From e5a8b6446c0d370716f193771ccacf3260a57534 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Fri, 13 Dec 2024 13:50:05 +0000 Subject: [PATCH 0881/1450] cachefiles: Parse the "secctx" immediately Instead of storing an opaque string, call security_secctx_to_secid() right in the "secctx" command handler and store only the numeric "secid". This eliminates an unnecessary string allocation and allows the daemon to receive errors when writing the "secctx" command instead of postponing the error to the "bind" command handler. For example, if the kernel was built without `CONFIG_SECURITY`, "bind" will return `EOPNOTSUPP`, but the daemon doesn't know why. With this patch, the "secctx" will instead return `EOPNOTSUPP` which is the right context for this error. This patch adds a boolean flag `have_secid` because I'm not sure if we can safely assume that zero is the special secid value for "not set". This appears to be true for SELinux, Smack and AppArmor, but since this attribute is not documented, I'm unable to derive a stable guarantee for that. Signed-off-by: Max Kellermann Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241209141554.638708-1-max.kellermann@ionos.com/ Link: https://lore.kernel.org/r/20241213135013.2964079-6-dhowells@redhat.com Signed-off-by: Christian Brauner --- fs/cachefiles/daemon.c | 14 +++++++------- fs/cachefiles/internal.h | 3 ++- fs/cachefiles/security.c | 6 +++--- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index 89b11336a8369..1806bff8e59bc 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -576,7 +577,7 @@ static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args) */ static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args) { - char *secctx; + int err; _enter(",%s", args); @@ -585,16 +586,16 @@ static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args) return -EINVAL; } - if (cache->secctx) { + if (cache->have_secid) { pr_err("Second security context specified\n"); return -EINVAL; } - secctx = kstrdup(args, GFP_KERNEL); - if (!secctx) - return -ENOMEM; + err = security_secctx_to_secid(args, strlen(args), &cache->secid); + if (err) + return err; - cache->secctx = secctx; + cache->have_secid = true; return 0; } @@ -820,7 +821,6 @@ static void cachefiles_daemon_unbind(struct cachefiles_cache *cache) put_cred(cache->cache_cred); kfree(cache->rootdirname); - kfree(cache->secctx); kfree(cache->tag); _leave(""); diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 7b99bd98de75b..38c236e38cef8 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -122,7 +122,6 @@ struct cachefiles_cache { #define CACHEFILES_STATE_CHANGED 3 /* T if state changed (poll trigger) */ #define CACHEFILES_ONDEMAND_MODE 4 /* T if in on-demand read mode */ char *rootdirname; /* name of cache root directory */ - char *secctx; /* LSM security context */ char *tag; /* cache binding tag */ refcount_t unbind_pincount;/* refcount to do daemon unbind */ struct xarray reqs; /* xarray of pending on-demand requests */ @@ -130,6 +129,8 @@ struct cachefiles_cache { struct xarray ondemand_ids; /* xarray for ondemand_id allocation */ u32 ondemand_id_next; u32 msg_id_next; + u32 secid; /* LSM security id */ + bool have_secid; /* whether "secid" was set */ }; static inline bool cachefiles_in_ondemand_mode(struct cachefiles_cache *cache) diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c index fe777164f1d89..fc6611886b3b5 100644 --- a/fs/cachefiles/security.c +++ b/fs/cachefiles/security.c @@ -18,7 +18,7 @@ int cachefiles_get_security_ID(struct cachefiles_cache *cache) struct cred *new; int ret; - _enter("{%s}", cache->secctx); + _enter("{%u}", cache->have_secid ? cache->secid : 0); new = prepare_kernel_cred(current); if (!new) { @@ -26,8 +26,8 @@ int cachefiles_get_security_ID(struct cachefiles_cache *cache) goto error; } - if (cache->secctx) { - ret = set_security_override_from_ctx(new, cache->secctx); + if (cache->have_secid) { + ret = set_security_override(new, cache->secid); if (ret < 0) { put_cred(new); pr_err("Security denies permission to nominate security context: error %d\n", From f4d3cde410cc62b5483f59f0f3454a5c5203a2cb Mon Sep 17 00:00:00 2001 From: Zilin Guan Date: Fri, 13 Dec 2024 13:50:06 +0000 Subject: [PATCH 0882/1450] netfs: Remove redundant use of smp_rmb() The function netfs_unbuffered_write_iter_locked() in fs/netfs/direct_write.c contains an unnecessary smp_rmb() call after wait_on_bit(). Since wait_on_bit() already incorporates a memory barrier that ensures the flag update is visible before the function returns, the smp_rmb() provides no additional benefit and incurs unnecessary overhead. This patch removes the redundant barrier to simplify and optimize the code. Signed-off-by: Zilin Guan Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241207021952.2978530-1-zilin@seu.edu.cn/ Link: https://lore.kernel.org/r/20241213135013.2964079-7-dhowells@redhat.com Reviewed-by: Akira Yokosawa cc: Akira Yokosawa cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/direct_write.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index 88f2adfab75e9..173e8b5e6a930 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -104,7 +104,6 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * trace_netfs_rreq(wreq, netfs_rreq_trace_wait_ip); wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, TASK_UNINTERRUPTIBLE); - smp_rmb(); /* Read error/transferred after RIP flag */ ret = wreq->error; if (ret == 0) { ret = wreq->transferred; From aa3956418985bda1f68313eadde3267921847978 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 13 Dec 2024 13:50:07 +0000 Subject: [PATCH 0883/1450] netfs: Fix missing barriers by using clear_and_wake_up_bit() Use clear_and_wake_up_bit() rather than something like: clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags); wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS); as there needs to be a barrier inserted between which is present in clear_and_wake_up_bit(). Fixes: 288ace2f57c9 ("netfs: New writeback implementation") Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241213135013.2964079-8-dhowells@redhat.com Reviewed-by: Akira Yokosawa cc: Zilin Guan cc: Akira Yokosawa cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/read_collect.c | 3 +-- fs/netfs/write_collect.c | 9 +++------ 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index b415e39723361..46ce3b7adf072 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -379,8 +379,7 @@ static void netfs_rreq_assess(struct netfs_io_request *rreq) task_io_account_read(rreq->transferred); trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip); - clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags); - wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS); + clear_and_wake_up_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); trace_netfs_rreq(rreq, netfs_rreq_trace_done); netfs_clear_subrequests(rreq, false); diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index 1d438be2e1b4b..82290c92ba7a2 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -501,8 +501,7 @@ static void netfs_collect_write_results(struct netfs_io_request *wreq) goto need_retry; if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) { trace_netfs_rreq(wreq, netfs_rreq_trace_unpause); - clear_bit_unlock(NETFS_RREQ_PAUSE, &wreq->flags); - wake_up_bit(&wreq->flags, NETFS_RREQ_PAUSE); + clear_and_wake_up_bit(NETFS_RREQ_PAUSE, &wreq->flags); } if (notes & NEED_REASSESS) { @@ -605,8 +604,7 @@ void netfs_write_collection_worker(struct work_struct *work) _debug("finished"); trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip); - clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags); - wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS); + clear_and_wake_up_bit(NETFS_RREQ_IN_PROGRESS, &wreq->flags); if (wreq->iocb) { size_t written = min(wreq->transferred, wreq->len); @@ -714,8 +712,7 @@ void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error, trace_netfs_sreq(subreq, netfs_sreq_trace_terminated); - clear_bit_unlock(NETFS_SREQ_IN_PROGRESS, &subreq->flags); - wake_up_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS); + clear_and_wake_up_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); /* If we are at the head of the queue, wake up the collector, * transferring a ref to it if we were the ones to do so. From 4acb665cf4f3e5436844f17ece0a8a55ce688c7b Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 13 Dec 2024 13:50:08 +0000 Subject: [PATCH 0884/1450] netfs: Work around recursion by abandoning retry if nothing read syzkaller reported recursion with a loop of three calls (netfs_rreq_assess, netfs_retry_reads and netfs_rreq_terminated) hitting the limit of the stack during an unbuffered or direct I/O read. There are a number of issues: (1) There is no limit on the number of retries. (2) A subrequest is supposed to be abandoned if it does not transfer anything (NETFS_SREQ_NO_PROGRESS), but that isn't checked under all circumstances. (3) The actual root cause, which is this: if (atomic_dec_and_test(&rreq->nr_outstanding)) netfs_rreq_terminated(rreq, ...); When we do a retry, we bump the rreq->nr_outstanding counter to prevent the final cleanup phase running before we've finished dispatching the retries. The problem is if we hit 0, we have to do the cleanup phase - but we're in the cleanup phase and end up repeating the retry cycle, hence the recursion. Work around the problem by limiting the number of retries. This is based on Lizhi Xu's patch[1], and makes the following changes: (1) Replace NETFS_SREQ_NO_PROGRESS with NETFS_SREQ_MADE_PROGRESS and make the filesystem set it if it managed to read or write at least one byte of data. Clear this bit before issuing a subrequest. (2) Add a ->retry_count member to the subrequest and increment it any time we do a retry. (3) Remove the NETFS_SREQ_RETRYING flag as it is superfluous with ->retry_count. If the latter is non-zero, we're doing a retry. (4) Abandon a subrequest if retry_count is non-zero and we made no progress. (5) Use ->retry_count in both the write-side and the read-size. [?] Question: Should I set a hard limit on retry_count in both read and write? Say it hits 50, we always abandon it. The problem is that these changes only mitigate the issue. As long as it made at least one byte of progress, the recursion is still an issue. This patch mitigates the problem, but does not fix the underlying cause. I have patches that will do that, but it's an intrusive fix that's currently pending for the next merge window. The oops generated by KASAN looks something like: BUG: TASK stack guard page was hit at ffffc9000482ff48 (stack is ffffc90004830000..ffffc90004838000) Oops: stack guard page: 0000 [#1] PREEMPT SMP KASAN NOPTI ... RIP: 0010:mark_lock+0x25/0xc60 kernel/locking/lockdep.c:4686 ... mark_usage kernel/locking/lockdep.c:4646 [inline] __lock_acquire+0x906/0x3ce0 kernel/locking/lockdep.c:5156 lock_acquire.part.0+0x11b/0x380 kernel/locking/lockdep.c:5825 local_lock_acquire include/linux/local_lock_internal.h:29 [inline] ___slab_alloc+0x123/0x1880 mm/slub.c:3695 __slab_alloc.constprop.0+0x56/0xb0 mm/slub.c:3908 __slab_alloc_node mm/slub.c:3961 [inline] slab_alloc_node mm/slub.c:4122 [inline] kmem_cache_alloc_noprof+0x2a7/0x2f0 mm/slub.c:4141 radix_tree_node_alloc.constprop.0+0x1e8/0x350 lib/radix-tree.c:253 idr_get_free+0x528/0xa40 lib/radix-tree.c:1506 idr_alloc_u32+0x191/0x2f0 lib/idr.c:46 idr_alloc+0xc1/0x130 lib/idr.c:87 p9_tag_alloc+0x394/0x870 net/9p/client.c:321 p9_client_prepare_req+0x19f/0x4d0 net/9p/client.c:644 p9_client_zc_rpc.constprop.0+0x105/0x880 net/9p/client.c:793 p9_client_read_once+0x443/0x820 net/9p/client.c:1570 p9_client_read+0x13f/0x1b0 net/9p/client.c:1534 v9fs_issue_read+0x115/0x310 fs/9p/vfs_addr.c:74 netfs_retry_read_subrequests fs/netfs/read_retry.c:60 [inline] netfs_retry_reads+0x153a/0x1d00 fs/netfs/read_retry.c:232 netfs_rreq_assess+0x5d3/0x870 fs/netfs/read_collect.c:371 netfs_rreq_terminated+0xe5/0x110 fs/netfs/read_collect.c:407 netfs_retry_reads+0x155e/0x1d00 fs/netfs/read_retry.c:235 netfs_rreq_assess+0x5d3/0x870 fs/netfs/read_collect.c:371 netfs_rreq_terminated+0xe5/0x110 fs/netfs/read_collect.c:407 netfs_retry_reads+0x155e/0x1d00 fs/netfs/read_retry.c:235 netfs_rreq_assess+0x5d3/0x870 fs/netfs/read_collect.c:371 ... netfs_rreq_terminated+0xe5/0x110 fs/netfs/read_collect.c:407 netfs_retry_reads+0x155e/0x1d00 fs/netfs/read_retry.c:235 netfs_rreq_assess+0x5d3/0x870 fs/netfs/read_collect.c:371 netfs_rreq_terminated+0xe5/0x110 fs/netfs/read_collect.c:407 netfs_retry_reads+0x155e/0x1d00 fs/netfs/read_retry.c:235 netfs_rreq_assess+0x5d3/0x870 fs/netfs/read_collect.c:371 netfs_rreq_terminated+0xe5/0x110 fs/netfs/read_collect.c:407 netfs_dispatch_unbuffered_reads fs/netfs/direct_read.c:103 [inline] netfs_unbuffered_read fs/netfs/direct_read.c:127 [inline] netfs_unbuffered_read_iter_locked+0x12f6/0x19b0 fs/netfs/direct_read.c:221 netfs_unbuffered_read_iter+0xc5/0x100 fs/netfs/direct_read.c:256 v9fs_file_read_iter+0xbf/0x100 fs/9p/vfs_file.c:361 do_iter_readv_writev+0x614/0x7f0 fs/read_write.c:832 vfs_readv+0x4cf/0x890 fs/read_write.c:1025 do_preadv fs/read_write.c:1142 [inline] __do_sys_preadv fs/read_write.c:1192 [inline] __se_sys_preadv fs/read_write.c:1187 [inline] __x64_sys_preadv+0x22d/0x310 fs/read_write.c:1187 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x250 arch/x86/entry/common.c:83 Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") Closes: https://syzkaller.appspot.com/bug?extid=1fc6f64c40a9d143cfb6 Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241108034020.3695718-1-lizhi.xu@windriver.com/ [1] Link: https://lore.kernel.org/r/20241213135013.2964079-9-dhowells@redhat.com Tested-by: syzbot+885c03ad650731743489@syzkaller.appspotmail.com Suggested-by: Lizhi Xu cc: Dominique Martinet cc: Jeff Layton cc: v9fs@lists.linux.dev cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Reported-by: syzbot+885c03ad650731743489@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- fs/9p/vfs_addr.c | 6 +++++- fs/afs/write.c | 5 ++++- fs/netfs/read_collect.c | 15 +++++++++------ fs/netfs/read_retry.c | 6 ++++-- fs/netfs/write_collect.c | 5 ++--- fs/netfs/write_issue.c | 2 ++ fs/smb/client/cifssmb.c | 13 +++++++++---- fs/smb/client/smb2pdu.c | 9 ++++++--- include/linux/netfs.h | 6 +++--- 9 files changed, 44 insertions(+), 23 deletions(-) diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 819c752332355..3bc9ce6c575e5 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -57,6 +57,8 @@ static void v9fs_issue_write(struct netfs_io_subrequest *subreq) int err, len; len = p9_client_write(fid, subreq->start, &subreq->io_iter, &err); + if (len > 0) + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); netfs_write_subrequest_terminated(subreq, len ?: err, false); } @@ -80,8 +82,10 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) if (pos + total >= i_size_read(rreq->inode)) __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); - if (!err) + if (!err) { subreq->transferred += total; + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); + } netfs_read_subreq_terminated(subreq, err, false); } diff --git a/fs/afs/write.c b/fs/afs/write.c index 34107b55f834c..ccb6aa8027c5b 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -122,7 +122,7 @@ static void afs_issue_write_worker(struct work_struct *work) if (subreq->debug_index == 3) return netfs_write_subrequest_terminated(subreq, -ENOANO, false); - if (!test_bit(NETFS_SREQ_RETRYING, &subreq->flags)) { + if (!subreq->retry_count) { set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); return netfs_write_subrequest_terminated(subreq, -EAGAIN, false); } @@ -149,6 +149,9 @@ static void afs_issue_write_worker(struct work_struct *work) afs_wait_for_operation(op); ret = afs_put_operation(op); switch (ret) { + case 0: + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); + break; case -EACCES: case -EPERM: case -ENOKEY: diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index 46ce3b7adf072..47ed3a5044e21 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -438,7 +438,7 @@ void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq, rreq->origin == NETFS_READPAGE || rreq->origin == NETFS_READ_FOR_WRITE)) { netfs_consume_read_data(subreq, was_async); - __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); } } EXPORT_SYMBOL(netfs_read_subreq_progress); @@ -497,7 +497,7 @@ void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, rreq->origin == NETFS_READPAGE || rreq->origin == NETFS_READ_FOR_WRITE)) { netfs_consume_read_data(subreq, was_async); - __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); } rreq->transferred += subreq->transferred; } @@ -511,10 +511,13 @@ void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, } else { trace_netfs_sreq(subreq, netfs_sreq_trace_short); if (subreq->transferred > subreq->consumed) { - __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); - __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); - set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags); - } else if (!__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) { + /* If we didn't read new data, abandon retry. */ + if (subreq->retry_count && + test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags)) { + __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); + set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags); + } + } else if (test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags)) { __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags); } else { diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c index 0350592ea8047..0e72e9226fc88 100644 --- a/fs/netfs/read_retry.c +++ b/fs/netfs/read_retry.c @@ -56,6 +56,8 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) break; if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) { + __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); + subreq->retry_count++; netfs_reset_iter(subreq); netfs_reissue_read(rreq, subreq); } @@ -137,7 +139,8 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) stream0->sreq_max_len = subreq->len; __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); - __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); + __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); + subreq->retry_count++; spin_lock_bh(&rreq->lock); list_add_tail(&subreq->rreq_link, &rreq->subrequests); @@ -213,7 +216,6 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) subreq->error = -ENOMEM; __clear_bit(NETFS_SREQ_FAILED, &subreq->flags); __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); - __clear_bit(NETFS_SREQ_RETRYING, &subreq->flags); } spin_lock_bh(&rreq->lock); list_splice_tail_init(&queue, &rreq->subrequests); diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index 82290c92ba7a2..ca3a11ed9b541 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -179,7 +179,6 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, struct iov_iter source = subreq->io_iter; iov_iter_revert(&source, subreq->len - source.count); - __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); netfs_reissue_write(stream, subreq, &source); } @@ -234,7 +233,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, /* Renegotiate max_len (wsize) */ trace_netfs_sreq(subreq, netfs_sreq_trace_retry); __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); - __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); + subreq->retry_count++; stream->prepare_write(subreq); part = min(len, stream->sreq_max_len); @@ -279,7 +278,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, subreq->start = start; subreq->debug_index = atomic_inc_return(&wreq->subreq_counter); subreq->stream_nr = to->stream_nr; - __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); + subreq->retry_count = 1; trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index, refcount_read(&subreq->ref), diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index bf6d507578e53..ff0e82505a0bd 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -244,6 +244,8 @@ void netfs_reissue_write(struct netfs_io_stream *stream, iov_iter_advance(source, size); iov_iter_truncate(&subreq->io_iter, size); + subreq->retry_count++; + __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); netfs_do_issue_write(stream, subreq); } diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index bd42a419458e3..6cb1e81993f89 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -1319,14 +1319,16 @@ cifs_readv_callback(struct mid_q_entry *mid) } if (rdata->result == -ENODATA) { - __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); rdata->result = 0; + __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); } else { size_t trans = rdata->subreq.transferred + rdata->got_bytes; if (trans < rdata->subreq.len && rdata->subreq.start + trans == ictx->remote_i_size) { - __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); rdata->result = 0; + __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); + } else if (rdata->got_bytes > 0) { + __set_bit(NETFS_SREQ_MADE_PROGRESS, &rdata->subreq.flags); } } @@ -1670,10 +1672,13 @@ cifs_writev_callback(struct mid_q_entry *mid) if (written > wdata->subreq.len) written &= 0xFFFF; - if (written < wdata->subreq.len) + if (written < wdata->subreq.len) { result = -ENOSPC; - else + } else { result = written; + if (written > 0) + __set_bit(NETFS_SREQ_MADE_PROGRESS, &wdata->subreq.flags); + } break; case MID_REQUEST_SUBMITTED: case MID_RETRY_NEEDED: diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 010eae9d6c47e..458b53d1f9cb8 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4615,6 +4615,7 @@ smb2_readv_callback(struct mid_q_entry *mid) __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); rdata->result = 0; } + __set_bit(NETFS_SREQ_MADE_PROGRESS, &rdata->subreq.flags); } trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, rdata->credits.value, server->credits, server->in_flight, @@ -4840,10 +4841,12 @@ smb2_writev_callback(struct mid_q_entry *mid) if (written > wdata->subreq.len) written &= 0xFFFF; - if (written < wdata->subreq.len) + if (written < wdata->subreq.len) { wdata->result = -ENOSPC; - else + } else if (written > 0) { wdata->subreq.len = written; + __set_bit(NETFS_SREQ_MADE_PROGRESS, &wdata->subreq.flags); + } break; case MID_REQUEST_SUBMITTED: case MID_RETRY_NEEDED: @@ -5012,7 +5015,7 @@ smb2_async_writev(struct cifs_io_subrequest *wdata) } #endif - if (test_bit(NETFS_SREQ_RETRYING, &wdata->subreq.flags)) + if (wdata->subreq.retry_count > 0) smb2_set_replay(server, &rqst); cifs_dbg(FYI, "async write at %llu %u bytes iter=%zx\n", diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 5eaceef41e6ca..4083d77e3f39d 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -185,6 +185,7 @@ struct netfs_io_subrequest { short error; /* 0 or error that occurred */ unsigned short debug_index; /* Index in list (for debugging output) */ unsigned int nr_segs; /* Number of segs in io_iter */ + u8 retry_count; /* The number of retries (0 on initial pass) */ enum netfs_io_source source; /* Where to read from/write to */ unsigned char stream_nr; /* I/O stream this belongs to */ unsigned char curr_folioq_slot; /* Folio currently being read */ @@ -194,14 +195,13 @@ struct netfs_io_subrequest { #define NETFS_SREQ_COPY_TO_CACHE 0 /* Set if should copy the data to the cache */ #define NETFS_SREQ_CLEAR_TAIL 1 /* Set if the rest of the read should be cleared */ #define NETFS_SREQ_SEEK_DATA_READ 3 /* Set if ->read() should SEEK_DATA first */ -#define NETFS_SREQ_NO_PROGRESS 4 /* Set if we didn't manage to read any data */ +#define NETFS_SREQ_MADE_PROGRESS 4 /* Set if we transferred at least some data */ #define NETFS_SREQ_ONDEMAND 5 /* Set if it's from on-demand read mode */ #define NETFS_SREQ_BOUNDARY 6 /* Set if ends on hard boundary (eg. ceph object) */ #define NETFS_SREQ_HIT_EOF 7 /* Set if short due to EOF */ #define NETFS_SREQ_IN_PROGRESS 8 /* Unlocked when the subrequest completes */ #define NETFS_SREQ_NEED_RETRY 9 /* Set if the filesystem requests a retry */ -#define NETFS_SREQ_RETRYING 10 /* Set if we're retrying */ -#define NETFS_SREQ_FAILED 11 /* Set if the subreq failed unretryably */ +#define NETFS_SREQ_FAILED 10 /* Set if the subreq failed unretryably */ }; enum netfs_io_origin { From 38cf8e945721ffe708fa675507465da7f4f2a9f7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 13 Dec 2024 13:50:09 +0000 Subject: [PATCH 0885/1450] netfs: Fix ceph copy to cache on write-begin At the end of netfs_unlock_read_folio() in which folios are marked appropriately for copying to the cache (either with by being marked dirty and having their private data set or by having PG_private_2 set) and then unlocked, the folio_queue struct has the entry pointing to the folio cleared. This presents a problem for netfs_pgpriv2_write_to_the_cache(), which is used to write folios marked with PG_private_2 to the cache as it expects to be able to trawl the folio_queue list thereafter to find the relevant folios, leading to a hang. Fix this by not clearing the folio_queue entry if we're going to do the deprecated copy-to-cache. The clearance will be done instead as the folios are written to the cache. This can be reproduced by starting cachefiles, mounting a ceph filesystem with "-o fsc" and writing to it. Fixes: 796a4049640b ("netfs: In readahead, put the folio refs as soon extracted") Reported-by: Max Kellermann Closes: https://lore.kernel.org/r/CAKPOu+_4m80thNy5_fvROoxBm689YtA0dZ-=gcmkzwYSY4syqw@mail.gmail.com/ Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241213135013.2964079-10-dhowells@redhat.com Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") cc: Jeff Layton cc: Ilya Dryomov cc: Xiubo Li cc: netfs@lists.linux.dev cc: ceph-devel@vger.kernel.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/read_collect.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index 47ed3a5044e21..e8624f5c7fcc6 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -62,10 +62,14 @@ static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq, } else { trace_netfs_folio(folio, netfs_folio_trace_read_done); } + + folioq_clear(folioq, slot); } else { // TODO: Use of PG_private_2 is deprecated. if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) netfs_pgpriv2_mark_copy_to_cache(subreq, rreq, folioq, slot); + else + folioq_clear(folioq, slot); } if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { @@ -77,8 +81,6 @@ static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq, folio_unlock(folio); } } - - folioq_clear(folioq, slot); } /* From d0327c824338cdccad058723a31d038ecd553409 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 13 Dec 2024 13:50:10 +0000 Subject: [PATCH 0886/1450] netfs: Fix the (non-)cancellation of copy when cache is temporarily disabled When the caching for a cookie is temporarily disabled (e.g. due to a DIO write on that file), future copying to the cache for that file is disabled until all fds open on that file are closed. However, if netfslib is using the deprecated PG_private_2 method (such as is currently used by ceph), and decides it wants to copy to the cache, netfs_advance_write() will just bail at the first check seeing that the cache stream is unavailable, and indicate that it dealt with all the content. This means that we have no subrequests to provide notifications to drive the state machine or even to pin the request and the request just gets discarded, leaving the folios with PG_private_2 set. Fix this by jumping directly to cancel the request if the cache is not available. That way, we don't remove mark3 from the folio_queue list and netfs_pgpriv2_cancel() will clean up the folios. This was found by running the generic/013 xfstest against ceph with an active cache and the "-o fsc" option passed to ceph. That would usually hang Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") Reported-by: Max Kellermann Closes: https://lore.kernel.org/r/CAKPOu+_4m80thNy5_fvROoxBm689YtA0dZ-=gcmkzwYSY4syqw@mail.gmail.com/ Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241213135013.2964079-11-dhowells@redhat.com cc: Jeff Layton cc: Ilya Dryomov cc: Xiubo Li cc: netfs@lists.linux.dev cc: ceph-devel@vger.kernel.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/read_pgpriv2.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/netfs/read_pgpriv2.c b/fs/netfs/read_pgpriv2.c index ba5af89d37fae..54d5004fec182 100644 --- a/fs/netfs/read_pgpriv2.c +++ b/fs/netfs/read_pgpriv2.c @@ -170,6 +170,10 @@ void netfs_pgpriv2_write_to_the_cache(struct netfs_io_request *rreq) trace_netfs_write(wreq, netfs_write_trace_copy_to_cache); netfs_stat(&netfs_n_wh_copy_to_cache); + if (!wreq->io_streams[1].avail) { + netfs_put_request(wreq, false, netfs_rreq_trace_put_return); + goto couldnt_start; + } for (;;) { error = netfs_pgpriv2_copy_folio(wreq, folio); From d4e338de17cb6532bf805fae00db8b41e914009b Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:34:45 +0000 Subject: [PATCH 0887/1450] netfs: Fix is-caching check in read-retry netfs: Fix is-caching check in read-retry The read-retry code checks the NETFS_RREQ_COPY_TO_CACHE flag to determine if there might be failed reads from the cache that need turning into reads from the server, with the intention of skipping the complicated part if it can. The code that set the flag, however, got lost during the read-side rewrite. Fix the check to see if the cache_resources are valid instead. The flag can then be removed. Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") Signed-off-by: David Howells Link: https://lore.kernel.org/r/3752048.1734381285@warthog.procyon.org.uk cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/read_retry.c | 2 +- include/linux/netfs.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c index 0e72e9226fc88..21b4a54e545e1 100644 --- a/fs/netfs/read_retry.c +++ b/fs/netfs/read_retry.c @@ -49,7 +49,7 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) * up to the first permanently failed one. */ if (!rreq->netfs_ops->prepare_read && - !test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags)) { + !rreq->cache_resources.ops) { struct netfs_io_subrequest *subreq; list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 4083d77e3f39d..ecdd5ced16a84 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -269,7 +269,6 @@ struct netfs_io_request { size_t prev_donated; /* Fallback for subreq->prev_donated */ refcount_t ref; unsigned long flags; -#define NETFS_RREQ_COPY_TO_CACHE 1 /* Need to write to the cache */ #define NETFS_RREQ_NO_UNLOCK_FOLIO 2 /* Don't unlock no_unlock_folio on completion */ #define NETFS_RREQ_DONT_UNLOCK_FOLIOS 3 /* Don't unlock the folios on completion */ #define NETFS_RREQ_FAILED 4 /* The request failed */ From f284424dc17b57d779a03dfc9a66489a67150b30 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 18 Dec 2024 18:15:56 +0100 Subject: [PATCH 0888/1450] net: bridge: Extract a helper to handle bridge_binding toggles Currently, the BROPT_VLAN_BRIDGE_BINDING bridge option is only toggled when VLAN devices are added on top of a bridge or removed from it. Extract the toggling of the option to a function so that it could be invoked by a subsequent patch when the state of an upper VLAN device changes. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/a7455f6fe1dfa7b13126ed8a7fb33d3b611eecb8.1734540770.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- net/bridge/br_vlan.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 89f51ea4cabec..b728b71e693f3 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -1664,6 +1664,18 @@ static void br_vlan_set_all_vlan_dev_state(struct net_bridge_port *p) } } +static void br_vlan_toggle_bridge_binding(struct net_device *br_dev, + bool enable) +{ + struct net_bridge *br = netdev_priv(br_dev); + + if (enable) + br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING, true); + else + br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING, + br_vlan_has_upper_bind_vlan_dev(br_dev)); +} + static void br_vlan_upper_change(struct net_device *dev, struct net_device *upper_dev, bool linking) @@ -1673,13 +1685,9 @@ static void br_vlan_upper_change(struct net_device *dev, if (!br_vlan_is_bind_vlan_dev(upper_dev)) return; - if (linking) { + br_vlan_toggle_bridge_binding(dev, linking); + if (linking) br_vlan_set_vlan_dev_state(br, upper_dev); - br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING, true); - } else { - br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING, - br_vlan_has_upper_bind_vlan_dev(dev)); - } } struct br_vlan_link_state_walk_data { From 3abd45122c72d6a66a52d41a65586fdf7ab40ef7 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 18 Dec 2024 18:15:57 +0100 Subject: [PATCH 0889/1450] net: bridge: Handle changes in VLAN_FLAG_BRIDGE_BINDING When bridge binding is enabled on a VLAN netdevice, its link state should track bridge ports that are members of the corresponding VLAN. This works for newly-added netdevices. However toggling the option does not have the effect of enabling or disabling the behavior as appropriate. In this patch, react to bridge_binding toggles on VLAN uppers. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/90a8ca8aea4d81378b29d75d9e562433e0d5c7ff.1734540770.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- net/bridge/br.c | 7 +++++++ net/bridge/br_private.h | 9 +++++++++ net/bridge/br_vlan.c | 24 ++++++++++++++++++++++++ 3 files changed, 40 insertions(+) diff --git a/net/bridge/br.c b/net/bridge/br.c index 2cab878e0a39c..183fcb362f9e6 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -51,6 +51,13 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v } } + if (is_vlan_dev(dev)) { + struct net_device *real_dev = vlan_dev_real_dev(dev); + + if (netif_is_bridge_master(real_dev)) + br_vlan_vlan_upper_event(real_dev, dev, event); + } + /* not a port of a bridge */ p = br_port_get_rtnl(dev); if (!p) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 9853cfbb9d144..29d6ec45cf41d 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1571,6 +1571,9 @@ void br_vlan_get_stats(const struct net_bridge_vlan *v, void br_vlan_port_event(struct net_bridge_port *p, unsigned long event); int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr); +void br_vlan_vlan_upper_event(struct net_device *br_dev, + struct net_device *vlan_dev, + unsigned long event); int br_vlan_rtnl_init(void); void br_vlan_rtnl_uninit(void); void br_vlan_notify(const struct net_bridge *br, @@ -1802,6 +1805,12 @@ static inline int br_vlan_bridge_event(struct net_device *dev, return 0; } +static inline void br_vlan_vlan_upper_event(struct net_device *br_dev, + struct net_device *vlan_dev, + unsigned long event) +{ +} + static inline int br_vlan_rtnl_init(void) { return 0; diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index b728b71e693f3..d9a69ec9affe5 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -1772,6 +1772,30 @@ int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr) return ret; } +void br_vlan_vlan_upper_event(struct net_device *br_dev, + struct net_device *vlan_dev, + unsigned long event) +{ + struct vlan_dev_priv *vlan = vlan_dev_priv(vlan_dev); + struct net_bridge *br = netdev_priv(br_dev); + bool bridge_binding; + + switch (event) { + case NETDEV_CHANGE: + case NETDEV_UP: + break; + default: + return; + } + + bridge_binding = vlan->flags & VLAN_FLAG_BRIDGE_BINDING; + br_vlan_toggle_bridge_binding(br_dev, bridge_binding); + if (bridge_binding) + br_vlan_set_vlan_dev_state(br, vlan_dev); + else if (!bridge_binding && netif_carrier_ok(br_dev)) + netif_carrier_on(vlan_dev); +} + /* Must be protected by RTNL. */ void br_vlan_port_event(struct net_bridge_port *p, unsigned long event) { From 976d248bd33356eecb958cdc1b0c37622fd5d595 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 18 Dec 2024 18:15:58 +0100 Subject: [PATCH 0890/1450] selftests: net: lib: Add a couple autodefer helpers Alongside the helper ip_link_set_up(), one to set the link down will be useful as well. Add a helper to determine the link state as well, ip_link_is_up(), and use it to short-circuit any changes if the state is already the desired one. Furthermore, add a helper bridge_vlan_add(). Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/856d9e01725fdba21b7f6716358f645b19131af2.1734540770.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib.sh | 31 ++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index 2cd5c743b2d99..0bd9a038a1f0e 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -477,12 +477,33 @@ ip_link_set_addr() defer ip link set dev "$name" address "$old_addr" } +ip_link_is_up() +{ + local name=$1; shift + + local state=$(ip -j link show "$name" | + jq -r '(.[].flags[] | select(. == "UP")) // "DOWN"') + [[ $state == "UP" ]] +} + ip_link_set_up() { local name=$1; shift - ip link set dev "$name" up - defer ip link set dev "$name" down + if ! ip_link_is_up "$name"; then + ip link set dev "$name" up + defer ip link set dev "$name" down + fi +} + +ip_link_set_down() +{ + local name=$1; shift + + if ip_link_is_up "$name"; then + ip link set dev "$name" down + defer ip link set dev "$name" up + fi } ip_addr_add() @@ -498,3 +519,9 @@ ip_route_add() ip route add "$@" defer ip route del "$@" } + +bridge_vlan_add() +{ + bridge vlan add "$@" + defer bridge vlan del "$@" +} From dca12e9ab7603d94e47ded65080f750d6527c852 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 18 Dec 2024 18:15:59 +0100 Subject: [PATCH 0891/1450] selftests: net: Add a VLAN bridge binding selftest Add a test that exercises bridge binding. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/baf7244fd1fe223a6d93e027584fa9f99dee982c.1734540770.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/Makefile | 1 + .../selftests/net/vlan_bridge_binding.sh | 256 ++++++++++++++++++ 2 files changed, 257 insertions(+) create mode 100755 tools/testing/selftests/net/vlan_bridge_binding.sh diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index f09bd96cc978b..73ee88d6b0430 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -96,6 +96,7 @@ TEST_PROGS += test_bridge_backup_port.sh TEST_PROGS += fdb_flush.sh fdb_notify.sh TEST_PROGS += fq_band_pktlimit.sh TEST_PROGS += vlan_hw_filter.sh +TEST_PROGS += vlan_bridge_binding.sh TEST_PROGS += bpf_offload.py TEST_PROGS += ipv6_route_update_soft_lockup.sh TEST_PROGS += busy_poll_test.sh diff --git a/tools/testing/selftests/net/vlan_bridge_binding.sh b/tools/testing/selftests/net/vlan_bridge_binding.sh new file mode 100755 index 0000000000000..e7cb8c678bdee --- /dev/null +++ b/tools/testing/selftests/net/vlan_bridge_binding.sh @@ -0,0 +1,256 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source lib.sh + +ALL_TESTS=" + test_binding_on + test_binding_off + test_binding_toggle_on + test_binding_toggle_off + test_binding_toggle_on_when_upper_down + test_binding_toggle_off_when_upper_down + test_binding_toggle_on_when_lower_down + test_binding_toggle_off_when_lower_down +" + +setup_prepare() +{ + local port + + ip_link_add br up type bridge vlan_filtering 1 + + for port in d1 d2 d3; do + ip_link_add $port type veth peer name r$port + ip_link_set_up $port + ip_link_set_up r$port + ip_link_set_master $port br + done + + bridge_vlan_add vid 11 dev br self + bridge_vlan_add vid 11 dev d1 master + + bridge_vlan_add vid 12 dev br self + bridge_vlan_add vid 12 dev d2 master + + bridge_vlan_add vid 13 dev br self + bridge_vlan_add vid 13 dev d1 master + bridge_vlan_add vid 13 dev d2 master + + bridge_vlan_add vid 14 dev br self + bridge_vlan_add vid 14 dev d1 master + bridge_vlan_add vid 14 dev d2 master + bridge_vlan_add vid 14 dev d3 master +} + +operstate_is() +{ + local dev=$1; shift + local expect=$1; shift + + local operstate=$(ip -j link show $dev | jq -r .[].operstate) + if [[ $operstate == UP ]]; then + operstate=1 + elif [[ $operstate == DOWN || $operstate == LOWERLAYERDOWN ]]; then + operstate=0 + fi + echo -n $operstate + [[ $operstate == $expect ]] +} + +check_operstate() +{ + local dev=$1; shift + local expect=$1; shift + local operstate + + operstate=$(busywait 1000 \ + operstate_is "$dev" "$expect") + check_err $? "Got operstate of $operstate, expected $expect" +} + +add_one_vlan() +{ + local link=$1; shift + local id=$1; shift + + ip_link_add $link.$id link $link type vlan id $id "$@" +} + +add_vlans() +{ + add_one_vlan br 11 "$@" + add_one_vlan br 12 "$@" + add_one_vlan br 13 "$@" + add_one_vlan br 14 "$@" +} + +set_vlans() +{ + ip link set dev br.11 "$@" + ip link set dev br.12 "$@" + ip link set dev br.13 "$@" + ip link set dev br.14 "$@" +} + +down_netdevs() +{ + local dev + + for dev in "$@"; do + ip_link_set_down $dev + done +} + +check_operstates() +{ + local opst_11=$1; shift + local opst_12=$1; shift + local opst_13=$1; shift + local opst_14=$1; shift + + check_operstate br.11 $opst_11 + check_operstate br.12 $opst_12 + check_operstate br.13 $opst_13 + check_operstate br.14 $opst_14 +} + +do_test_binding() +{ + local inject=$1; shift + local what=$1; shift + local opsts_d1=$1; shift + local opsts_d2=$1; shift + local opsts_d12=$1; shift + local opsts_d123=$1; shift + + RET=0 + + defer_scope_push + down_netdevs d1 + $inject + check_operstates $opsts_d1 + defer_scope_pop + + defer_scope_push + down_netdevs d2 + $inject + check_operstates $opsts_d2 + defer_scope_pop + + defer_scope_push + down_netdevs d1 d2 + $inject + check_operstates $opsts_d12 + defer_scope_pop + + defer_scope_push + down_netdevs d1 d2 d3 + $inject + check_operstates $opsts_d123 + defer_scope_pop + + log_test "Test bridge_binding $what" +} + +do_test_binding_on() +{ + local inject=$1; shift + local what=$1; shift + + do_test_binding "$inject" "$what" \ + "0 1 1 1" \ + "1 0 1 1" \ + "0 0 0 1" \ + "0 0 0 0" +} + +do_test_binding_off() +{ + local inject=$1; shift + local what=$1; shift + + do_test_binding "$inject" "$what" \ + "1 1 1 1" \ + "1 1 1 1" \ + "1 1 1 1" \ + "0 0 0 0" +} + +test_binding_on() +{ + add_vlans bridge_binding on + set_vlans up + do_test_binding_on : "on" +} + +test_binding_off() +{ + add_vlans bridge_binding off + set_vlans up + do_test_binding_off : "off" +} + +test_binding_toggle_on() +{ + add_vlans bridge_binding off + set_vlans up + set_vlans type vlan bridge_binding on + do_test_binding_on : "off->on" +} + +test_binding_toggle_off() +{ + add_vlans bridge_binding on + set_vlans up + set_vlans type vlan bridge_binding off + do_test_binding_off : "on->off" +} + +dfr_set_binding_on() +{ + set_vlans type vlan bridge_binding on + defer set_vlans type vlan bridge_binding off +} + +dfr_set_binding_off() +{ + set_vlans type vlan bridge_binding off + defer set_vlans type vlan bridge_binding on +} + +test_binding_toggle_on_when_lower_down() +{ + add_vlans bridge_binding off + set_vlans up + do_test_binding_on dfr_set_binding_on "off->on when lower down" +} + +test_binding_toggle_off_when_lower_down() +{ + add_vlans bridge_binding on + set_vlans up + do_test_binding_off dfr_set_binding_off "on->off when lower down" +} + +test_binding_toggle_on_when_upper_down() +{ + add_vlans bridge_binding off + set_vlans type vlan bridge_binding on + set_vlans up + do_test_binding_on : "off->on when upper down" +} + +test_binding_toggle_off_when_upper_down() +{ + add_vlans bridge_binding on + set_vlans type vlan bridge_binding off + set_vlans up + do_test_binding_off : "on->off when upper down" +} + +trap defer_scopes_cleanup EXIT +setup_prepare +tests_run + +exit $EXIT_STATUS From 3272040790eb4b6cafe6c30ec05049e9599ec456 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 19 Dec 2024 11:00:19 +0100 Subject: [PATCH 0892/1450] qlcnic: use const 'struct bin_attribute' callbacks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sysfs core now provides callback variants that explicitly take a const pointer. Use them so the non-const variants can be removed. Signed-off-by: Thomas Weißschuh Link: https://patch.msgid.link/20241219-sysfs-const-bin_attr-net-v2-1-93bdaece3c90@weissschuh.net Signed-off-by: Jakub Kicinski --- .../net/ethernet/qlogic/qlcnic/qlcnic_sysfs.c | 69 +++++++++---------- 1 file changed, 34 insertions(+), 35 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sysfs.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sysfs.c index 74125188beb82..c0f20464fd1e0 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sysfs.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sysfs.c @@ -264,7 +264,7 @@ static int qlcnic_sysfs_validate_crb(struct qlcnic_adapter *adapter, } static ssize_t qlcnic_sysfs_read_crb(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, char *buf, + const struct bin_attribute *attr, char *buf, loff_t offset, size_t size) { struct device *dev = kobj_to_dev(kobj); @@ -281,7 +281,7 @@ static ssize_t qlcnic_sysfs_read_crb(struct file *filp, struct kobject *kobj, } static ssize_t qlcnic_sysfs_write_crb(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, char *buf, + const struct bin_attribute *attr, char *buf, loff_t offset, size_t size) { struct device *dev = kobj_to_dev(kobj); @@ -310,7 +310,7 @@ static int qlcnic_sysfs_validate_mem(struct qlcnic_adapter *adapter, } static ssize_t qlcnic_sysfs_read_mem(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, char *buf, + const struct bin_attribute *attr, char *buf, loff_t offset, size_t size) { struct device *dev = kobj_to_dev(kobj); @@ -332,7 +332,7 @@ static ssize_t qlcnic_sysfs_read_mem(struct file *filp, struct kobject *kobj, } static ssize_t qlcnic_sysfs_write_mem(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, char *buf, + const struct bin_attribute *attr, char *buf, loff_t offset, size_t size) { struct device *dev = kobj_to_dev(kobj); @@ -396,7 +396,7 @@ static int validate_pm_config(struct qlcnic_adapter *adapter, static ssize_t qlcnic_sysfs_write_pm_config(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, + const struct bin_attribute *attr, char *buf, loff_t offset, size_t size) { @@ -446,7 +446,7 @@ static ssize_t qlcnic_sysfs_write_pm_config(struct file *filp, static ssize_t qlcnic_sysfs_read_pm_config(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, + const struct bin_attribute *attr, char *buf, loff_t offset, size_t size) { @@ -539,7 +539,7 @@ static int validate_esw_config(struct qlcnic_adapter *adapter, static ssize_t qlcnic_sysfs_write_esw_config(struct file *file, struct kobject *kobj, - struct bin_attribute *attr, + const struct bin_attribute *attr, char *buf, loff_t offset, size_t size) { @@ -623,7 +623,7 @@ static ssize_t qlcnic_sysfs_write_esw_config(struct file *file, static ssize_t qlcnic_sysfs_read_esw_config(struct file *file, struct kobject *kobj, - struct bin_attribute *attr, + const struct bin_attribute *attr, char *buf, loff_t offset, size_t size) { @@ -675,7 +675,7 @@ static int validate_npar_config(struct qlcnic_adapter *adapter, static ssize_t qlcnic_sysfs_write_npar_config(struct file *file, struct kobject *kobj, - struct bin_attribute *attr, + const struct bin_attribute *attr, char *buf, loff_t offset, size_t size) { @@ -722,7 +722,7 @@ static ssize_t qlcnic_sysfs_write_npar_config(struct file *file, static ssize_t qlcnic_sysfs_read_npar_config(struct file *file, struct kobject *kobj, - struct bin_attribute *attr, + const struct bin_attribute *attr, char *buf, loff_t offset, size_t size) { @@ -769,7 +769,7 @@ static ssize_t qlcnic_sysfs_read_npar_config(struct file *file, static ssize_t qlcnic_sysfs_get_port_stats(struct file *file, struct kobject *kobj, - struct bin_attribute *attr, + const struct bin_attribute *attr, char *buf, loff_t offset, size_t size) { @@ -804,7 +804,7 @@ static ssize_t qlcnic_sysfs_get_port_stats(struct file *file, static ssize_t qlcnic_sysfs_get_esw_stats(struct file *file, struct kobject *kobj, - struct bin_attribute *attr, + const struct bin_attribute *attr, char *buf, loff_t offset, size_t size) { @@ -839,7 +839,7 @@ static ssize_t qlcnic_sysfs_get_esw_stats(struct file *file, static ssize_t qlcnic_sysfs_clear_esw_stats(struct file *file, struct kobject *kobj, - struct bin_attribute *attr, + const struct bin_attribute *attr, char *buf, loff_t offset, size_t size) { @@ -868,7 +868,7 @@ static ssize_t qlcnic_sysfs_clear_esw_stats(struct file *file, static ssize_t qlcnic_sysfs_clear_port_stats(struct file *file, struct kobject *kobj, - struct bin_attribute *attr, + const struct bin_attribute *attr, char *buf, loff_t offset, size_t size) { @@ -898,7 +898,7 @@ static ssize_t qlcnic_sysfs_clear_port_stats(struct file *file, static ssize_t qlcnic_sysfs_read_pci_config(struct file *file, struct kobject *kobj, - struct bin_attribute *attr, + const struct bin_attribute *attr, char *buf, loff_t offset, size_t size) { @@ -938,7 +938,7 @@ static ssize_t qlcnic_sysfs_read_pci_config(struct file *file, static ssize_t qlcnic_83xx_sysfs_flash_read_handler(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, + const struct bin_attribute *attr, char *buf, loff_t offset, size_t size) { @@ -1115,7 +1115,7 @@ static int qlcnic_83xx_sysfs_flash_write(struct qlcnic_adapter *adapter, static ssize_t qlcnic_83xx_sysfs_flash_write_handler(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, + const struct bin_attribute *attr, char *buf, loff_t offset, size_t size) { @@ -1195,64 +1195,63 @@ static const struct device_attribute dev_attr_beacon = { static const struct bin_attribute bin_attr_crb = { .attr = { .name = "crb", .mode = 0644 }, .size = 0, - .read = qlcnic_sysfs_read_crb, - .write = qlcnic_sysfs_write_crb, + .read_new = qlcnic_sysfs_read_crb, + .write_new = qlcnic_sysfs_write_crb, }; static const struct bin_attribute bin_attr_mem = { .attr = { .name = "mem", .mode = 0644 }, .size = 0, - .read = qlcnic_sysfs_read_mem, - .write = qlcnic_sysfs_write_mem, + .read_new = qlcnic_sysfs_read_mem, + .write_new = qlcnic_sysfs_write_mem, }; static const struct bin_attribute bin_attr_npar_config = { .attr = { .name = "npar_config", .mode = 0644 }, .size = 0, - .read = qlcnic_sysfs_read_npar_config, - .write = qlcnic_sysfs_write_npar_config, + .read_new = qlcnic_sysfs_read_npar_config, + .write_new = qlcnic_sysfs_write_npar_config, }; static const struct bin_attribute bin_attr_pci_config = { .attr = { .name = "pci_config", .mode = 0644 }, .size = 0, - .read = qlcnic_sysfs_read_pci_config, - .write = NULL, + .read_new = qlcnic_sysfs_read_pci_config, }; static const struct bin_attribute bin_attr_port_stats = { .attr = { .name = "port_stats", .mode = 0644 }, .size = 0, - .read = qlcnic_sysfs_get_port_stats, - .write = qlcnic_sysfs_clear_port_stats, + .read_new = qlcnic_sysfs_get_port_stats, + .write_new = qlcnic_sysfs_clear_port_stats, }; static const struct bin_attribute bin_attr_esw_stats = { .attr = { .name = "esw_stats", .mode = 0644 }, .size = 0, - .read = qlcnic_sysfs_get_esw_stats, - .write = qlcnic_sysfs_clear_esw_stats, + .read_new = qlcnic_sysfs_get_esw_stats, + .write_new = qlcnic_sysfs_clear_esw_stats, }; static const struct bin_attribute bin_attr_esw_config = { .attr = { .name = "esw_config", .mode = 0644 }, .size = 0, - .read = qlcnic_sysfs_read_esw_config, - .write = qlcnic_sysfs_write_esw_config, + .read_new = qlcnic_sysfs_read_esw_config, + .write_new = qlcnic_sysfs_write_esw_config, }; static const struct bin_attribute bin_attr_pm_config = { .attr = { .name = "pm_config", .mode = 0644 }, .size = 0, - .read = qlcnic_sysfs_read_pm_config, - .write = qlcnic_sysfs_write_pm_config, + .read_new = qlcnic_sysfs_read_pm_config, + .write_new = qlcnic_sysfs_write_pm_config, }; static const struct bin_attribute bin_attr_flash = { .attr = { .name = "flash", .mode = 0644 }, .size = 0, - .read = qlcnic_83xx_sysfs_flash_read_handler, - .write = qlcnic_83xx_sysfs_flash_write_handler, + .read_new = qlcnic_83xx_sysfs_flash_read_handler, + .write_new = qlcnic_83xx_sysfs_flash_write_handler, }; #ifdef CONFIG_QLCNIC_HWMON From 6ed3472173c575cd8aaed6c62eb74f7728404ee6 Mon Sep 17 00:00:00 2001 From: Tristram Ha Date: Tue, 17 Dec 2024 18:02:40 -0800 Subject: [PATCH 0893/1450] net: dsa: microchip: Do not execute PTP driver code for unsupported switches The PTP driver code only works for certain KSZ switches like KSZ9477, KSZ9567, LAN937X and their varieties. This code is enabled by kernel configuration CONFIG_NET_DSA_MICROCHIP_KSZ_PTP. As the DSA driver is common to work with all KSZ switches this PTP code is not appropriate for other unsupported switches. The ptp_capable indication is added to the chip data structure to signal whether to execute those code. Signed-off-by: Tristram Ha Link: https://patch.msgid.link/20241218020240.70601-1-Tristram.Ha@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/microchip/ksz_common.c | 40 +++++++++++++++++++------- drivers/net/dsa/microchip/ksz_common.h | 1 + 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index a8dac7ff6b817..e3512e324572e 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -1339,6 +1339,7 @@ const struct ksz_chip_data ksz_switch_chips[] = { .supports_rgmii = {false, false, true}, .internal_phy = {true, true, false}, .gbit_capable = {false, false, true}, + .ptp_capable = true, .wr_table = &ksz8563_register_set, .rd_table = &ksz8563_register_set, }, @@ -1550,6 +1551,7 @@ const struct ksz_chip_data ksz_switch_chips[] = { .internal_phy = {true, true, true, true, true, false, false}, .gbit_capable = {true, true, true, true, true, true, true}, + .ptp_capable = true, .wr_table = &ksz9477_register_set, .rd_table = &ksz9477_register_set, }, @@ -1677,6 +1679,7 @@ const struct ksz_chip_data ksz_switch_chips[] = { .supports_rgmii = {false, false, true}, .internal_phy = {true, true, false}, .gbit_capable = {true, true, true}, + .ptp_capable = true, }, [KSZ8567] = { @@ -1712,6 +1715,7 @@ const struct ksz_chip_data ksz_switch_chips[] = { true, false, false}, .gbit_capable = {false, false, false, false, false, true, true}, + .ptp_capable = true, }, [KSZ9567] = { @@ -1744,6 +1748,7 @@ const struct ksz_chip_data ksz_switch_chips[] = { .internal_phy = {true, true, true, true, true, false, false}, .gbit_capable = {true, true, true, true, true, true, true}, + .ptp_capable = true, }, [LAN9370] = { @@ -1773,6 +1778,7 @@ const struct ksz_chip_data ksz_switch_chips[] = { .supports_rmii = {false, false, false, false, true}, .supports_rgmii = {false, false, false, false, true}, .internal_phy = {true, true, true, true, false}, + .ptp_capable = true, }, [LAN9371] = { @@ -1802,6 +1808,7 @@ const struct ksz_chip_data ksz_switch_chips[] = { .supports_rmii = {false, false, false, false, true, true}, .supports_rgmii = {false, false, false, false, true, true}, .internal_phy = {true, true, true, true, false, false}, + .ptp_capable = true, }, [LAN9372] = { @@ -1835,6 +1842,7 @@ const struct ksz_chip_data ksz_switch_chips[] = { true, true, false, false}, .internal_phy = {true, true, true, true, false, false, true, true}, + .ptp_capable = true, }, [LAN9373] = { @@ -1868,6 +1876,7 @@ const struct ksz_chip_data ksz_switch_chips[] = { true, true, false, false}, .internal_phy = {true, true, true, false, false, false, true, true}, + .ptp_capable = true, }, [LAN9374] = { @@ -1901,6 +1910,7 @@ const struct ksz_chip_data ksz_switch_chips[] = { true, true, false, false}, .internal_phy = {true, true, true, true, false, false, true, true}, + .ptp_capable = true, }, [LAN9646] = { @@ -2809,16 +2819,21 @@ static int ksz_setup(struct dsa_switch *ds) if (ret) goto out_girq; - ret = ksz_ptp_irq_setup(ds, dp->index); - if (ret) - goto out_pirq; + if (dev->info->ptp_capable) { + ret = ksz_ptp_irq_setup(ds, dp->index); + if (ret) + goto out_pirq; + } } } - ret = ksz_ptp_clock_register(ds); - if (ret) { - dev_err(dev->dev, "Failed to register PTP clock: %d\n", ret); - goto out_ptpirq; + if (dev->info->ptp_capable) { + ret = ksz_ptp_clock_register(ds); + if (ret) { + dev_err(dev->dev, "Failed to register PTP clock: %d\n", + ret); + goto out_ptpirq; + } } ret = ksz_mdio_register(dev); @@ -2838,9 +2853,10 @@ static int ksz_setup(struct dsa_switch *ds) return 0; out_ptp_clock_unregister: - ksz_ptp_clock_unregister(ds); + if (dev->info->ptp_capable) + ksz_ptp_clock_unregister(ds); out_ptpirq: - if (dev->irq > 0) + if (dev->irq > 0 && dev->info->ptp_capable) dsa_switch_for_each_user_port(dp, dev->ds) ksz_ptp_irq_free(ds, dp->index); out_pirq: @@ -2859,11 +2875,13 @@ static void ksz_teardown(struct dsa_switch *ds) struct ksz_device *dev = ds->priv; struct dsa_port *dp; - ksz_ptp_clock_unregister(ds); + if (dev->info->ptp_capable) + ksz_ptp_clock_unregister(ds); if (dev->irq > 0) { dsa_switch_for_each_user_port(dp, dev->ds) { - ksz_ptp_irq_free(ds, dp->index); + if (dev->info->ptp_capable) + ksz_ptp_irq_free(ds, dp->index); ksz_irq_free(&dev->ports[dp->index].pirq); } diff --git a/drivers/net/dsa/microchip/ksz_common.h b/drivers/net/dsa/microchip/ksz_common.h index 2bc96127a447b..af17a9c030d41 100644 --- a/drivers/net/dsa/microchip/ksz_common.h +++ b/drivers/net/dsa/microchip/ksz_common.h @@ -92,6 +92,7 @@ struct ksz_chip_data { bool supports_rgmii[KSZ_MAX_NUM_PORTS]; bool internal_phy[KSZ_MAX_NUM_PORTS]; bool gbit_capable[KSZ_MAX_NUM_PORTS]; + bool ptp_capable; const struct regmap_access_table *wr_table; const struct regmap_access_table *rd_table; }; From 8600058ba28a7b07660ddcd150372d72fb3bc895 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Fri, 20 Dec 2024 15:06:47 -0600 Subject: [PATCH 0894/1450] of: Add coreboot firmware to excluded default cells list Google Juniper and other Chromebook platforms have a very old bootloader which populates /firmware node without proper address/size-cells leading to warnings: Missing '#address-cells' in /firmware WARNING: CPU: 0 PID: 1 at drivers/of/base.c:106 of_bus_n_addr_cells+0x90/0xf0 Modules linked in: CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.12.0 #1 933ab9971ff4d5dc58cb378a96f64c7f72e3454d Hardware name: Google juniper sku16 board (DT) ... Missing '#size-cells' in /firmware WARNING: CPU: 0 PID: 1 at drivers/of/base.c:133 of_bus_n_size_cells+0x90/0xf0 Modules linked in: CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Tainted: G W 6.12.0 #1 933ab9971ff4d5dc58cb378a96f64c7f72e3454d Tainted: [W]=WARN Hardware name: Google juniper sku16 board (DT) These platform won't receive updated bootloader/firmware, so add an exclusion for platforms with a "coreboot" compatible node. While this is wider than necessary, that's the easiest fix and it doesn't doesn't matter if we miss checking other platforms using coreboot. We may revisit this later and address with a fixup to the DT itself. Reported-by: Sasha Levin Closes: https://lore.kernel.org/all/Z0NUdoG17EwuCigT@sashalap/ Cc: AngeloGioacchino Del Regno Cc: Matthias Brugger Cc: Chen-Yu Tsai Cc: Krzysztof Kozlowski Signed-off-by: Rob Herring (Arm) --- drivers/of/base.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/of/base.c b/drivers/of/base.c index 44b1c8bf9cc0d..e6ef31c4940fb 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -88,7 +88,8 @@ static bool __of_node_is_type(const struct device_node *np, const char *type) } #define EXCLUDED_DEFAULT_CELLS_PLATFORMS ( \ - IS_ENABLED(CONFIG_SPARC) \ + IS_ENABLED(CONFIG_SPARC) || \ + of_find_compatible_node(NULL, NULL, "coreboot") \ ) int of_bus_n_addr_cells(struct device_node *np) From 1dbdce30f040a87f5aa6a9dbe43be398737f090f Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 16 Dec 2024 18:21:44 +0100 Subject: [PATCH 0895/1450] ipv4: Define inet_sk_init_flowi4() and use it in inet_sk_rebuild_header(). IPv4 code commonly has to initialise a flowi4 structure from an IPv4 socket. This requires looking at potential IPv4 options to set the proper destination address, call flowi4_init_output() with the correct set of parameters and run the sk_classify_flow security hook. Instead of reimplementing these operations in different parts of the stack, let's define inet_sk_init_flowi4() which does all these operations. The first user is inet_sk_rebuild_header(), where inet_sk_init_flowi4() replaces ip_route_output_ports(). Unlike ip_route_output_ports(), which sets the flowi4 structure and performs the route lookup in one go, inet_sk_init_flowi4() only initialises the flow. The route lookup is then done by ip_route_output_flow(). Decoupling flow initialisation from route lookup makes this new interface applicable more broadly as it will allow some users to overwrite specific struct flowi4 members before the route lookup. Signed-off-by: Guillaume Nault Link: https://patch.msgid.link/fd416275262b1f518d5abfcef740ce4f4a1a6522.1734357769.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- include/net/route.h | 28 ++++++++++++++++++++++++++++ net/ipv4/af_inet.c | 14 ++------------ 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/include/net/route.h b/include/net/route.h index 6947a155d5011..f86775be3e293 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -129,6 +130,33 @@ struct in_device; int ip_rt_init(void); void rt_cache_flush(struct net *net); void rt_flush_dev(struct net_device *dev); + +static inline void inet_sk_init_flowi4(const struct inet_sock *inet, + struct flowi4 *fl4) +{ + const struct ip_options_rcu *ip4_opt; + const struct sock *sk; + __be32 daddr; + + rcu_read_lock(); + ip4_opt = rcu_dereference(inet->inet_opt); + + /* Source routing option overrides the socket destination address */ + if (ip4_opt && ip4_opt->opt.srr) + daddr = ip4_opt->opt.faddr; + else + daddr = inet->inet_daddr; + rcu_read_unlock(); + + sk = &inet->sk; + flowi4_init_output(fl4, sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark), + ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), + sk->sk_protocol, inet_sk_flowi_flags(sk), daddr, + inet->inet_saddr, inet->inet_dport, + inet->inet_sport, sk->sk_uid); + security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); +} + struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *flp, const struct sk_buff *skb); struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *flp, diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 8095e82de8083..21f46ee7b6e95 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1309,8 +1309,6 @@ int inet_sk_rebuild_header(struct sock *sk) { struct rtable *rt = dst_rtable(__sk_dst_check(sk, 0)); struct inet_sock *inet = inet_sk(sk); - __be32 daddr; - struct ip_options_rcu *inet_opt; struct flowi4 *fl4; int err; @@ -1319,17 +1317,9 @@ int inet_sk_rebuild_header(struct sock *sk) return 0; /* Reroute. */ - rcu_read_lock(); - inet_opt = rcu_dereference(inet->inet_opt); - daddr = inet->inet_daddr; - if (inet_opt && inet_opt->opt.srr) - daddr = inet_opt->opt.faddr; - rcu_read_unlock(); fl4 = &inet->cork.fl.u.ip4; - rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr, - inet->inet_dport, inet->inet_sport, - sk->sk_protocol, ip_sock_rt_tos(sk), - sk->sk_bound_dev_if); + inet_sk_init_flowi4(inet, fl4); + rt = ip_route_output_flow(sock_net(sk), fl4, sk); if (!IS_ERR(rt)) { err = 0; sk_setup_caps(sk, &rt->dst); From 5be1323b5041d806716c80be4f8b11cfb64fa24c Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 16 Dec 2024 18:21:46 +0100 Subject: [PATCH 0896/1450] ipv4: Use inet_sk_init_flowi4() in ip4_datagram_release_cb(). Use inet_sk_init_flowi4() to automatically initialise the flowi4 structure in ip4_datagram_release_cb() instead of passing parameters manually to ip_route_output_ports(). Signed-off-by: Guillaume Nault Link: https://patch.msgid.link/9c326b8d9e919478f7952b21473d31da07eba2dd.1734357769.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- net/ipv4/datagram.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 4aca1f05edd31..4b5bc6eb52e75 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -102,8 +102,6 @@ EXPORT_SYMBOL(ip4_datagram_connect); void ip4_datagram_release_cb(struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); - const struct ip_options_rcu *inet_opt; - __be32 daddr = inet->inet_daddr; struct dst_entry *dst; struct flowi4 fl4; struct rtable *rt; @@ -115,14 +113,9 @@ void ip4_datagram_release_cb(struct sock *sk) rcu_read_unlock(); return; } - inet_opt = rcu_dereference(inet->inet_opt); - if (inet_opt && inet_opt->opt.srr) - daddr = inet_opt->opt.faddr; - rt = ip_route_output_ports(sock_net(sk), &fl4, sk, daddr, - inet->inet_saddr, inet->inet_dport, - inet->inet_sport, sk->sk_protocol, - ip_sock_rt_tos(sk), sk->sk_bound_dev_if); + inet_sk_init_flowi4(inet, &fl4); + rt = ip_route_output_flow(sock_net(sk), &fl4, sk); dst = !IS_ERR(rt) ? &rt->dst : NULL; sk_dst_set(sk, dst); From 42e5ffc385f3b0790c6cd5b54d3396a6c772d3b6 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 16 Dec 2024 18:21:48 +0100 Subject: [PATCH 0897/1450] ipv4: Use inet_sk_init_flowi4() in inet_csk_rebuild_route(). Use inet_sk_init_flowi4() to automatically initialise the flowi4 structure in inet_csk_rebuild_route() instead of passing parameters manually to ip_route_output_ports(). Signed-off-by: Guillaume Nault Link: https://patch.msgid.link/b270931636effa1095508e0f0a3e8c3a0e6d357f.1734357769.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- net/ipv4/inet_connection_sock.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 6872b5aff73e5..e4decfb270fa1 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1561,20 +1561,13 @@ EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr); static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl) { const struct inet_sock *inet = inet_sk(sk); - const struct ip_options_rcu *inet_opt; - __be32 daddr = inet->inet_daddr; struct flowi4 *fl4; struct rtable *rt; rcu_read_lock(); - inet_opt = rcu_dereference(inet->inet_opt); - if (inet_opt && inet_opt->opt.srr) - daddr = inet_opt->opt.faddr; fl4 = &fl->u.ip4; - rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, - inet->inet_saddr, inet->inet_dport, - inet->inet_sport, sk->sk_protocol, - ip_sock_rt_tos(sk), sk->sk_bound_dev_if); + inet_sk_init_flowi4(inet, fl4); + rt = ip_route_output_flow(sock_net(sk), fl4, sk); if (IS_ERR(rt)) rt = NULL; if (rt) From 148721f8e04a10a3b9c51f030c9be0d15b0a4d17 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 16 Dec 2024 18:21:51 +0100 Subject: [PATCH 0898/1450] ipv4: Use inet_sk_init_flowi4() in __ip_queue_xmit(). Use inet_sk_init_flowi4() to automatically initialise the flowi4 structure in __ip_queue_xmit() instead of passing parameters manually to ip_route_output_ports(). Override ->flowi4_tos with the value passed as parameter since that's required by SCTP. Signed-off-by: Guillaume Nault Link: https://patch.msgid.link/37e64ffbd9adac187b14aa9097b095f5c86e85be.1734357769.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- net/ipv4/ip_output.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index f45a083f2c13b..ea7a260bec8aa 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -478,24 +478,16 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, /* Make sure we can route this packet. */ rt = dst_rtable(__sk_dst_check(sk, 0)); if (!rt) { - __be32 daddr; + inet_sk_init_flowi4(inet, fl4); - /* Use correct destination address if we have options. */ - daddr = inet->inet_daddr; - if (inet_opt && inet_opt->opt.srr) - daddr = inet_opt->opt.faddr; + /* sctp_v4_xmit() uses its own DSCP value */ + fl4->flowi4_tos = tos & INET_DSCP_MASK; /* If this fails, retransmit mechanism of transport layer will * keep trying until route appears or the connection times * itself out. */ - rt = ip_route_output_ports(net, fl4, sk, - daddr, inet->inet_saddr, - inet->inet_dport, - inet->inet_sport, - sk->sk_protocol, - tos & INET_DSCP_MASK, - sk->sk_bound_dev_if); + rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; sk_setup_caps(sk, &rt->dst); From c63e9f3b89d3f96220a1c99466fed4563c14a259 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 16 Dec 2024 18:21:54 +0100 Subject: [PATCH 0899/1450] l2tp: Use inet_sk_init_flowi4() in l2tp_ip_sendmsg(). Use inet_sk_init_flowi4() to automatically initialise the flowi4 structure in l2tp_ip_sendmsg() instead of passing parameters manually to ip_route_output_ports(). Override ->daddr with the value passed in the msghdr structure if provided. Signed-off-by: Guillaume Nault Reviewed-by: James Chapman Link: https://patch.msgid.link/2ff22a3560c5050228928456662b80b9c84a8fe4.1734357769.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- net/l2tp/l2tp_ip.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 4bc24fddfd524..29795d2839e8b 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -425,7 +425,6 @@ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int rc; struct inet_sock *inet = inet_sk(sk); struct rtable *rt = NULL; - struct flowi4 *fl4; int connected = 0; __be32 daddr; @@ -455,7 +454,6 @@ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (sk->sk_state != TCP_ESTABLISHED) goto out; - daddr = inet->inet_daddr; connected = 1; } @@ -482,29 +480,24 @@ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) goto error; } - fl4 = &inet->cork.fl.u.ip4; if (connected) rt = dst_rtable(__sk_dst_check(sk, 0)); rcu_read_lock(); if (!rt) { - const struct ip_options_rcu *inet_opt; + struct flowi4 *fl4 = &inet->cork.fl.u.ip4; - inet_opt = rcu_dereference(inet->inet_opt); + inet_sk_init_flowi4(inet, fl4); - /* Use correct destination address if we have options. */ - if (inet_opt && inet_opt->opt.srr) - daddr = inet_opt->opt.faddr; + /* Overwrite ->daddr if msg->msg_name was provided */ + if (!connected) + fl4->daddr = daddr; /* If this fails, retransmit mechanism of transport layer will * keep trying until route appears or the connection times * itself out. */ - rt = ip_route_output_ports(sock_net(sk), fl4, sk, - daddr, inet->inet_saddr, - inet->inet_dport, inet->inet_sport, - sk->sk_protocol, ip_sock_rt_tos(sk), - sk->sk_bound_dev_if); + rt = ip_route_output_flow(sock_net(sk), fl4, sk); if (IS_ERR(rt)) goto no_route; if (connected) { From fdf478d236dcf0f1f68534df5d456ced625195bd Mon Sep 17 00:00:00 2001 From: Levi Zim Date: Sat, 30 Nov 2024 21:38:22 +0800 Subject: [PATCH 0900/1450] skmsg: Return copied bytes in sk_msg_memcopy_from_iter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously sk_msg_memcopy_from_iter returns the copied bytes from the last copy_from_iter{,_nocache} call upon success. This commit changes it to return the total number of copied bytes on success. Signed-off-by: Levi Zim Signed-off-by: Daniel Borkmann Tested-by: Björn Töpel Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20241130-tcp-bpf-sendmsg-v1-1-bae583d014f3@outlook.com --- net/core/skmsg.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 8ad7e6755fd64..61f3f3d4e5285 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -369,8 +369,8 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes) { int ret = -ENOSPC, i = msg->sg.curr; + u32 copy, buf_size, copied = 0; struct scatterlist *sge; - u32 copy, buf_size; void *to; do { @@ -397,6 +397,7 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, goto out; } bytes -= copy; + copied += copy; if (!bytes) break; msg->sg.copybreak = 0; @@ -404,7 +405,7 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, } while (i != msg->sg.end); out: msg->sg.curr = i; - return ret; + return (ret < 0) ? ret : copied; } EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); From 5153a75ef34b3f7478ca918044d0f05eed8fb3f9 Mon Sep 17 00:00:00 2001 From: Levi Zim Date: Sat, 30 Nov 2024 21:38:23 +0800 Subject: [PATCH 0901/1450] tcp_bpf: Fix copied value in tcp_bpf_sendmsg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bpf kselftest sockhash::test_txmsg_cork_hangs in test_sockmap.c triggers a kernel NULL pointer dereference: BUG: kernel NULL pointer dereference, address: 0000000000000008 ? __die_body+0x6e/0xb0 ? __die+0x8b/0xa0 ? page_fault_oops+0x358/0x3c0 ? local_clock+0x19/0x30 ? lock_release+0x11b/0x440 ? kernelmode_fixup_or_oops+0x54/0x60 ? __bad_area_nosemaphore+0x4f/0x210 ? mmap_read_unlock+0x13/0x30 ? bad_area_nosemaphore+0x16/0x20 ? do_user_addr_fault+0x6fd/0x740 ? prb_read_valid+0x1d/0x30 ? exc_page_fault+0x55/0xd0 ? asm_exc_page_fault+0x2b/0x30 ? splice_to_socket+0x52e/0x630 ? shmem_file_splice_read+0x2b1/0x310 direct_splice_actor+0x47/0x70 splice_direct_to_actor+0x133/0x300 ? do_splice_direct+0x90/0x90 do_splice_direct+0x64/0x90 ? __ia32_sys_tee+0x30/0x30 do_sendfile+0x214/0x300 __se_sys_sendfile64+0x8e/0xb0 __x64_sys_sendfile64+0x25/0x30 x64_sys_call+0xb82/0x2840 do_syscall_64+0x75/0x110 entry_SYSCALL_64_after_hwframe+0x4b/0x53 This is caused by tcp_bpf_sendmsg() returning a larger value(12289) than size (8192), which causes the while loop in splice_to_socket() to release an uninitialized pipe buf. The underlying cause is that this code assumes sk_msg_memcopy_from_iter() will copy all bytes upon success but it actually might only copy part of it. This commit changes it to use the real copied bytes. Signed-off-by: Levi Zim Signed-off-by: Daniel Borkmann Tested-by: Björn Töpel Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20241130-tcp-bpf-sendmsg-v1-2-bae583d014f3@outlook.com --- net/ipv4/tcp_bpf.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 392678ae80f4e..47f65b1b70ca2 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -495,7 +495,7 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct sk_msg tmp, *msg_tx = NULL; - int copied = 0, err = 0; + int copied = 0, err = 0, ret = 0; struct sk_psock *psock; long timeo; int flags; @@ -538,14 +538,14 @@ static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) copy = msg_tx->sg.size - osize; } - err = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx, + ret = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx, copy); - if (err < 0) { + if (ret < 0) { sk_msg_trim(sk, msg_tx, osize); goto out_err; } - copied += copy; + copied += ret; if (psock->cork_bytes) { if (size > psock->cork_bytes) psock->cork_bytes = 0; From 9ecc4d858b92c1bb0673ad9c327298e600c55659 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 12 Dec 2024 19:40:54 -0800 Subject: [PATCH 0902/1450] bpf: Check negative offsets in __bpf_skb_min_len() skb_network_offset() and skb_transport_offset() can be negative when they are called after we pull the transport header, for example, when we use eBPF sockmap at the point of ->sk_data_ready(). __bpf_skb_min_len() uses an unsigned int to get these offsets, this leads to a very large number which then causes bpf_skb_change_tail() failed unexpectedly. Fix this by using a signed int to get these offsets and ensure the minimum is at least zero. Fixes: 5293efe62df8 ("bpf: add bpf_skb_change_tail helper") Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20241213034057.246437-2-xiyou.wangcong@gmail.com --- net/core/filter.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 21131ec25f24a..834614071727a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3734,13 +3734,22 @@ static const struct bpf_func_proto bpf_skb_adjust_room_proto = { static u32 __bpf_skb_min_len(const struct sk_buff *skb) { - u32 min_len = skb_network_offset(skb); + int offset = skb_network_offset(skb); + u32 min_len = 0; - if (skb_transport_header_was_set(skb)) - min_len = skb_transport_offset(skb); - if (skb->ip_summed == CHECKSUM_PARTIAL) - min_len = skb_checksum_start_offset(skb) + - skb->csum_offset + sizeof(__sum16); + if (offset > 0) + min_len = offset; + if (skb_transport_header_was_set(skb)) { + offset = skb_transport_offset(skb); + if (offset > 0) + min_len = offset; + } + if (skb->ip_summed == CHECKSUM_PARTIAL) { + offset = skb_checksum_start_offset(skb) + + skb->csum_offset + sizeof(__sum16); + if (offset > 0) + min_len = offset; + } return min_len; } From 9ee0c7b8654346d60c823babe4b3747357a30477 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 12 Dec 2024 19:40:55 -0800 Subject: [PATCH 0903/1450] selftests/bpf: Add a BPF selftest for bpf_skb_change_tail() As requested by Daniel, we need to add a selftest to cover bpf_skb_change_tail() cases in skb_verdict. Here we test trimming, growing and error cases, and validate its expected return values and the expected sizes of the payload. Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20241213034057.246437-3-xiyou.wangcong@gmail.com --- .../selftests/bpf/prog_tests/sockmap_basic.c | 51 +++++++++++++++++++ .../bpf/progs/test_sockmap_change_tail.c | 40 +++++++++++++++ 2 files changed, 91 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/test_sockmap_change_tail.c diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 248754296d972..884ad87783d59 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -12,6 +12,7 @@ #include "test_sockmap_progs_query.skel.h" #include "test_sockmap_pass_prog.skel.h" #include "test_sockmap_drop_prog.skel.h" +#include "test_sockmap_change_tail.skel.h" #include "bpf_iter_sockmap.skel.h" #include "sockmap_helpers.h" @@ -643,6 +644,54 @@ static void test_sockmap_skb_verdict_fionread(bool pass_prog) test_sockmap_drop_prog__destroy(drop); } +static void test_sockmap_skb_verdict_change_tail(void) +{ + struct test_sockmap_change_tail *skel; + int err, map, verdict; + int c1, p1, sent, recvd; + int zero = 0; + char buf[2]; + + skel = test_sockmap_change_tail__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + verdict = bpf_program__fd(skel->progs.prog_skb_verdict); + map = bpf_map__fd(skel->maps.sock_map_rx); + + err = bpf_prog_attach(verdict, map, BPF_SK_SKB_STREAM_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach")) + goto out; + err = create_pair(AF_INET, SOCK_STREAM, &c1, &p1); + if (!ASSERT_OK(err, "create_pair()")) + goto out; + err = bpf_map_update_elem(map, &zero, &c1, BPF_NOEXIST); + if (!ASSERT_OK(err, "bpf_map_update_elem(c1)")) + goto out_close; + sent = xsend(p1, "Tr", 2, 0); + ASSERT_EQ(sent, 2, "xsend(p1)"); + recvd = recv(c1, buf, 2, 0); + ASSERT_EQ(recvd, 1, "recv(c1)"); + ASSERT_EQ(skel->data->change_tail_ret, 0, "change_tail_ret"); + + sent = xsend(p1, "G", 1, 0); + ASSERT_EQ(sent, 1, "xsend(p1)"); + recvd = recv(c1, buf, 2, 0); + ASSERT_EQ(recvd, 2, "recv(c1)"); + ASSERT_EQ(skel->data->change_tail_ret, 0, "change_tail_ret"); + + sent = xsend(p1, "E", 1, 0); + ASSERT_EQ(sent, 1, "xsend(p1)"); + recvd = recv(c1, buf, 1, 0); + ASSERT_EQ(recvd, 1, "recv(c1)"); + ASSERT_EQ(skel->data->change_tail_ret, -EINVAL, "change_tail_ret"); + +out_close: + close(c1); + close(p1); +out: + test_sockmap_change_tail__destroy(skel); +} + static void test_sockmap_skb_verdict_peek_helper(int map) { int err, c1, p1, zero = 0, sent, recvd, avail; @@ -1058,6 +1107,8 @@ void test_sockmap_basic(void) test_sockmap_skb_verdict_fionread(true); if (test__start_subtest("sockmap skb_verdict fionread on drop")) test_sockmap_skb_verdict_fionread(false); + if (test__start_subtest("sockmap skb_verdict change tail")) + test_sockmap_skb_verdict_change_tail(); if (test__start_subtest("sockmap skb_verdict msg_f_peek")) test_sockmap_skb_verdict_peek(); if (test__start_subtest("sockmap skb_verdict msg_f_peek with link")) diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_change_tail.c b/tools/testing/selftests/bpf/progs/test_sockmap_change_tail.c new file mode 100644 index 0000000000000..2796dd8545eb9 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_sockmap_change_tail.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 ByteDance */ +#include +#include + +struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(max_entries, 1); + __type(key, int); + __type(value, int); +} sock_map_rx SEC(".maps"); + +long change_tail_ret = 1; + +SEC("sk_skb") +int prog_skb_verdict(struct __sk_buff *skb) +{ + char *data, *data_end; + + bpf_skb_pull_data(skb, 1); + data = (char *)(unsigned long)skb->data; + data_end = (char *)(unsigned long)skb->data_end; + + if (data + 1 > data_end) + return SK_PASS; + + if (data[0] == 'T') { /* Trim the packet */ + change_tail_ret = bpf_skb_change_tail(skb, skb->len - 1, 0); + return SK_PASS; + } else if (data[0] == 'G') { /* Grow the packet */ + change_tail_ret = bpf_skb_change_tail(skb, skb->len + 1, 0); + return SK_PASS; + } else if (data[0] == 'E') { /* Error */ + change_tail_ret = bpf_skb_change_tail(skb, 65535, 0); + return SK_PASS; + } + return SK_PASS; +} + +char _license[] SEC("license") = "GPL"; From 472759c9f5377912c7483cca5da847888a27cecc Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 12 Dec 2024 19:40:56 -0800 Subject: [PATCH 0904/1450] selftests/bpf: Introduce socket_helpers.h for TC tests Pull socket helpers out of sockmap_helpers.h so that they can be reused for TC tests as well. This prepares for the next patch. Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20241213034057.246437-4-xiyou.wangcong@gmail.com --- .../selftests/bpf/prog_tests/socket_helpers.h | 394 ++++++++++++++++++ .../bpf/prog_tests/sockmap_helpers.h | 385 +---------------- 2 files changed, 395 insertions(+), 384 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/socket_helpers.h diff --git a/tools/testing/selftests/bpf/prog_tests/socket_helpers.h b/tools/testing/selftests/bpf/prog_tests/socket_helpers.h new file mode 100644 index 0000000000000..1bdfb79ef009b --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/socket_helpers.h @@ -0,0 +1,394 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __SOCKET_HELPERS__ +#define __SOCKET_HELPERS__ + +#include + +/* include/linux/net.h */ +#define SOCK_TYPE_MASK 0xf + +#define IO_TIMEOUT_SEC 30 +#define MAX_STRERR_LEN 256 + +/* workaround for older vm_sockets.h */ +#ifndef VMADDR_CID_LOCAL +#define VMADDR_CID_LOCAL 1 +#endif + +/* include/linux/cleanup.h */ +#define __get_and_null(p, nullvalue) \ + ({ \ + __auto_type __ptr = &(p); \ + __auto_type __val = *__ptr; \ + *__ptr = nullvalue; \ + __val; \ + }) + +#define take_fd(fd) __get_and_null(fd, -EBADF) + +/* Wrappers that fail the test on error and report it. */ + +#define _FAIL(errnum, fmt...) \ + ({ \ + error_at_line(0, (errnum), __func__, __LINE__, fmt); \ + CHECK_FAIL(true); \ + }) +#define FAIL(fmt...) _FAIL(0, fmt) +#define FAIL_ERRNO(fmt...) _FAIL(errno, fmt) +#define FAIL_LIBBPF(err, msg) \ + ({ \ + char __buf[MAX_STRERR_LEN]; \ + libbpf_strerror((err), __buf, sizeof(__buf)); \ + FAIL("%s: %s", (msg), __buf); \ + }) + + +#define xaccept_nonblock(fd, addr, len) \ + ({ \ + int __ret = \ + accept_timeout((fd), (addr), (len), IO_TIMEOUT_SEC); \ + if (__ret == -1) \ + FAIL_ERRNO("accept"); \ + __ret; \ + }) + +#define xbind(fd, addr, len) \ + ({ \ + int __ret = bind((fd), (addr), (len)); \ + if (__ret == -1) \ + FAIL_ERRNO("bind"); \ + __ret; \ + }) + +#define xclose(fd) \ + ({ \ + int __ret = close((fd)); \ + if (__ret == -1) \ + FAIL_ERRNO("close"); \ + __ret; \ + }) + +#define xconnect(fd, addr, len) \ + ({ \ + int __ret = connect((fd), (addr), (len)); \ + if (__ret == -1) \ + FAIL_ERRNO("connect"); \ + __ret; \ + }) + +#define xgetsockname(fd, addr, len) \ + ({ \ + int __ret = getsockname((fd), (addr), (len)); \ + if (__ret == -1) \ + FAIL_ERRNO("getsockname"); \ + __ret; \ + }) + +#define xgetsockopt(fd, level, name, val, len) \ + ({ \ + int __ret = getsockopt((fd), (level), (name), (val), (len)); \ + if (__ret == -1) \ + FAIL_ERRNO("getsockopt(" #name ")"); \ + __ret; \ + }) + +#define xlisten(fd, backlog) \ + ({ \ + int __ret = listen((fd), (backlog)); \ + if (__ret == -1) \ + FAIL_ERRNO("listen"); \ + __ret; \ + }) + +#define xsetsockopt(fd, level, name, val, len) \ + ({ \ + int __ret = setsockopt((fd), (level), (name), (val), (len)); \ + if (__ret == -1) \ + FAIL_ERRNO("setsockopt(" #name ")"); \ + __ret; \ + }) + +#define xsend(fd, buf, len, flags) \ + ({ \ + ssize_t __ret = send((fd), (buf), (len), (flags)); \ + if (__ret == -1) \ + FAIL_ERRNO("send"); \ + __ret; \ + }) + +#define xrecv_nonblock(fd, buf, len, flags) \ + ({ \ + ssize_t __ret = recv_timeout((fd), (buf), (len), (flags), \ + IO_TIMEOUT_SEC); \ + if (__ret == -1) \ + FAIL_ERRNO("recv"); \ + __ret; \ + }) + +#define xsocket(family, sotype, flags) \ + ({ \ + int __ret = socket(family, sotype, flags); \ + if (__ret == -1) \ + FAIL_ERRNO("socket"); \ + __ret; \ + }) + +static inline void close_fd(int *fd) +{ + if (*fd >= 0) + xclose(*fd); +} + +#define __close_fd __attribute__((cleanup(close_fd))) + +static inline struct sockaddr *sockaddr(struct sockaddr_storage *ss) +{ + return (struct sockaddr *)ss; +} + +static inline void init_addr_loopback4(struct sockaddr_storage *ss, + socklen_t *len) +{ + struct sockaddr_in *addr4 = memset(ss, 0, sizeof(*ss)); + + addr4->sin_family = AF_INET; + addr4->sin_port = 0; + addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + *len = sizeof(*addr4); +} + +static inline void init_addr_loopback6(struct sockaddr_storage *ss, + socklen_t *len) +{ + struct sockaddr_in6 *addr6 = memset(ss, 0, sizeof(*ss)); + + addr6->sin6_family = AF_INET6; + addr6->sin6_port = 0; + addr6->sin6_addr = in6addr_loopback; + *len = sizeof(*addr6); +} + +static inline void init_addr_loopback_vsock(struct sockaddr_storage *ss, + socklen_t *len) +{ + struct sockaddr_vm *addr = memset(ss, 0, sizeof(*ss)); + + addr->svm_family = AF_VSOCK; + addr->svm_port = VMADDR_PORT_ANY; + addr->svm_cid = VMADDR_CID_LOCAL; + *len = sizeof(*addr); +} + +static inline void init_addr_loopback(int family, struct sockaddr_storage *ss, + socklen_t *len) +{ + switch (family) { + case AF_INET: + init_addr_loopback4(ss, len); + return; + case AF_INET6: + init_addr_loopback6(ss, len); + return; + case AF_VSOCK: + init_addr_loopback_vsock(ss, len); + return; + default: + FAIL("unsupported address family %d", family); + } +} + +static inline int enable_reuseport(int s, int progfd) +{ + int err, one = 1; + + err = xsetsockopt(s, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)); + if (err) + return -1; + err = xsetsockopt(s, SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF, &progfd, + sizeof(progfd)); + if (err) + return -1; + + return 0; +} + +static inline int socket_loopback_reuseport(int family, int sotype, int progfd) +{ + struct sockaddr_storage addr; + socklen_t len = 0; + int err, s; + + init_addr_loopback(family, &addr, &len); + + s = xsocket(family, sotype, 0); + if (s == -1) + return -1; + + if (progfd >= 0) + enable_reuseport(s, progfd); + + err = xbind(s, sockaddr(&addr), len); + if (err) + goto close; + + if (sotype & SOCK_DGRAM) + return s; + + err = xlisten(s, SOMAXCONN); + if (err) + goto close; + + return s; +close: + xclose(s); + return -1; +} + +static inline int socket_loopback(int family, int sotype) +{ + return socket_loopback_reuseport(family, sotype, -1); +} + +static inline int poll_connect(int fd, unsigned int timeout_sec) +{ + struct timeval timeout = { .tv_sec = timeout_sec }; + fd_set wfds; + int r, eval; + socklen_t esize = sizeof(eval); + + FD_ZERO(&wfds); + FD_SET(fd, &wfds); + + r = select(fd + 1, NULL, &wfds, NULL, &timeout); + if (r == 0) + errno = ETIME; + if (r != 1) + return -1; + + if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &eval, &esize) < 0) + return -1; + if (eval != 0) { + errno = eval; + return -1; + } + + return 0; +} + +static inline int poll_read(int fd, unsigned int timeout_sec) +{ + struct timeval timeout = { .tv_sec = timeout_sec }; + fd_set rfds; + int r; + + FD_ZERO(&rfds); + FD_SET(fd, &rfds); + + r = select(fd + 1, &rfds, NULL, NULL, &timeout); + if (r == 0) + errno = ETIME; + + return r == 1 ? 0 : -1; +} + +static inline int accept_timeout(int fd, struct sockaddr *addr, socklen_t *len, + unsigned int timeout_sec) +{ + if (poll_read(fd, timeout_sec)) + return -1; + + return accept(fd, addr, len); +} + +static inline int recv_timeout(int fd, void *buf, size_t len, int flags, + unsigned int timeout_sec) +{ + if (poll_read(fd, timeout_sec)) + return -1; + + return recv(fd, buf, len, flags); +} + + +static inline int create_pair(int family, int sotype, int *p0, int *p1) +{ + __close_fd int s, c = -1, p = -1; + struct sockaddr_storage addr; + socklen_t len = sizeof(addr); + int err; + + s = socket_loopback(family, sotype); + if (s < 0) + return s; + + err = xgetsockname(s, sockaddr(&addr), &len); + if (err) + return err; + + c = xsocket(family, sotype, 0); + if (c < 0) + return c; + + err = connect(c, sockaddr(&addr), len); + if (err) { + if (errno != EINPROGRESS) { + FAIL_ERRNO("connect"); + return err; + } + + err = poll_connect(c, IO_TIMEOUT_SEC); + if (err) { + FAIL_ERRNO("poll_connect"); + return err; + } + } + + switch (sotype & SOCK_TYPE_MASK) { + case SOCK_DGRAM: + err = xgetsockname(c, sockaddr(&addr), &len); + if (err) + return err; + + err = xconnect(s, sockaddr(&addr), len); + if (err) + return err; + + *p0 = take_fd(s); + break; + case SOCK_STREAM: + case SOCK_SEQPACKET: + p = xaccept_nonblock(s, NULL, NULL); + if (p < 0) + return p; + + *p0 = take_fd(p); + break; + default: + FAIL("Unsupported socket type %#x", sotype); + return -EOPNOTSUPP; + } + + *p1 = take_fd(c); + return 0; +} + +static inline int create_socket_pairs(int family, int sotype, int *c0, int *c1, + int *p0, int *p1) +{ + int err; + + err = create_pair(family, sotype, c0, p0); + if (err) + return err; + + err = create_pair(family, sotype, c1, p1); + if (err) { + close(*c0); + close(*p0); + } + + return err; +} + +#endif // __SOCKET_HELPERS__ diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h b/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h index 38e35c72bdaa2..3e5571dd578d1 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h @@ -1,139 +1,12 @@ #ifndef __SOCKMAP_HELPERS__ #define __SOCKMAP_HELPERS__ -#include +#include "socket_helpers.h" -/* include/linux/net.h */ -#define SOCK_TYPE_MASK 0xf - -#define IO_TIMEOUT_SEC 30 -#define MAX_STRERR_LEN 256 #define MAX_TEST_NAME 80 -/* workaround for older vm_sockets.h */ -#ifndef VMADDR_CID_LOCAL -#define VMADDR_CID_LOCAL 1 -#endif - #define __always_unused __attribute__((__unused__)) -/* include/linux/cleanup.h */ -#define __get_and_null(p, nullvalue) \ - ({ \ - __auto_type __ptr = &(p); \ - __auto_type __val = *__ptr; \ - *__ptr = nullvalue; \ - __val; \ - }) - -#define take_fd(fd) __get_and_null(fd, -EBADF) - -#define _FAIL(errnum, fmt...) \ - ({ \ - error_at_line(0, (errnum), __func__, __LINE__, fmt); \ - CHECK_FAIL(true); \ - }) -#define FAIL(fmt...) _FAIL(0, fmt) -#define FAIL_ERRNO(fmt...) _FAIL(errno, fmt) -#define FAIL_LIBBPF(err, msg) \ - ({ \ - char __buf[MAX_STRERR_LEN]; \ - libbpf_strerror((err), __buf, sizeof(__buf)); \ - FAIL("%s: %s", (msg), __buf); \ - }) - -/* Wrappers that fail the test on error and report it. */ - -#define xaccept_nonblock(fd, addr, len) \ - ({ \ - int __ret = \ - accept_timeout((fd), (addr), (len), IO_TIMEOUT_SEC); \ - if (__ret == -1) \ - FAIL_ERRNO("accept"); \ - __ret; \ - }) - -#define xbind(fd, addr, len) \ - ({ \ - int __ret = bind((fd), (addr), (len)); \ - if (__ret == -1) \ - FAIL_ERRNO("bind"); \ - __ret; \ - }) - -#define xclose(fd) \ - ({ \ - int __ret = close((fd)); \ - if (__ret == -1) \ - FAIL_ERRNO("close"); \ - __ret; \ - }) - -#define xconnect(fd, addr, len) \ - ({ \ - int __ret = connect((fd), (addr), (len)); \ - if (__ret == -1) \ - FAIL_ERRNO("connect"); \ - __ret; \ - }) - -#define xgetsockname(fd, addr, len) \ - ({ \ - int __ret = getsockname((fd), (addr), (len)); \ - if (__ret == -1) \ - FAIL_ERRNO("getsockname"); \ - __ret; \ - }) - -#define xgetsockopt(fd, level, name, val, len) \ - ({ \ - int __ret = getsockopt((fd), (level), (name), (val), (len)); \ - if (__ret == -1) \ - FAIL_ERRNO("getsockopt(" #name ")"); \ - __ret; \ - }) - -#define xlisten(fd, backlog) \ - ({ \ - int __ret = listen((fd), (backlog)); \ - if (__ret == -1) \ - FAIL_ERRNO("listen"); \ - __ret; \ - }) - -#define xsetsockopt(fd, level, name, val, len) \ - ({ \ - int __ret = setsockopt((fd), (level), (name), (val), (len)); \ - if (__ret == -1) \ - FAIL_ERRNO("setsockopt(" #name ")"); \ - __ret; \ - }) - -#define xsend(fd, buf, len, flags) \ - ({ \ - ssize_t __ret = send((fd), (buf), (len), (flags)); \ - if (__ret == -1) \ - FAIL_ERRNO("send"); \ - __ret; \ - }) - -#define xrecv_nonblock(fd, buf, len, flags) \ - ({ \ - ssize_t __ret = recv_timeout((fd), (buf), (len), (flags), \ - IO_TIMEOUT_SEC); \ - if (__ret == -1) \ - FAIL_ERRNO("recv"); \ - __ret; \ - }) - -#define xsocket(family, sotype, flags) \ - ({ \ - int __ret = socket(family, sotype, flags); \ - if (__ret == -1) \ - FAIL_ERRNO("socket"); \ - __ret; \ - }) - #define xbpf_map_delete_elem(fd, key) \ ({ \ int __ret = bpf_map_delete_elem((fd), (key)); \ @@ -193,130 +66,6 @@ __ret; \ }) -static inline void close_fd(int *fd) -{ - if (*fd >= 0) - xclose(*fd); -} - -#define __close_fd __attribute__((cleanup(close_fd))) - -static inline int poll_connect(int fd, unsigned int timeout_sec) -{ - struct timeval timeout = { .tv_sec = timeout_sec }; - fd_set wfds; - int r, eval; - socklen_t esize = sizeof(eval); - - FD_ZERO(&wfds); - FD_SET(fd, &wfds); - - r = select(fd + 1, NULL, &wfds, NULL, &timeout); - if (r == 0) - errno = ETIME; - if (r != 1) - return -1; - - if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &eval, &esize) < 0) - return -1; - if (eval != 0) { - errno = eval; - return -1; - } - - return 0; -} - -static inline int poll_read(int fd, unsigned int timeout_sec) -{ - struct timeval timeout = { .tv_sec = timeout_sec }; - fd_set rfds; - int r; - - FD_ZERO(&rfds); - FD_SET(fd, &rfds); - - r = select(fd + 1, &rfds, NULL, NULL, &timeout); - if (r == 0) - errno = ETIME; - - return r == 1 ? 0 : -1; -} - -static inline int accept_timeout(int fd, struct sockaddr *addr, socklen_t *len, - unsigned int timeout_sec) -{ - if (poll_read(fd, timeout_sec)) - return -1; - - return accept(fd, addr, len); -} - -static inline int recv_timeout(int fd, void *buf, size_t len, int flags, - unsigned int timeout_sec) -{ - if (poll_read(fd, timeout_sec)) - return -1; - - return recv(fd, buf, len, flags); -} - -static inline void init_addr_loopback4(struct sockaddr_storage *ss, - socklen_t *len) -{ - struct sockaddr_in *addr4 = memset(ss, 0, sizeof(*ss)); - - addr4->sin_family = AF_INET; - addr4->sin_port = 0; - addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK); - *len = sizeof(*addr4); -} - -static inline void init_addr_loopback6(struct sockaddr_storage *ss, - socklen_t *len) -{ - struct sockaddr_in6 *addr6 = memset(ss, 0, sizeof(*ss)); - - addr6->sin6_family = AF_INET6; - addr6->sin6_port = 0; - addr6->sin6_addr = in6addr_loopback; - *len = sizeof(*addr6); -} - -static inline void init_addr_loopback_vsock(struct sockaddr_storage *ss, - socklen_t *len) -{ - struct sockaddr_vm *addr = memset(ss, 0, sizeof(*ss)); - - addr->svm_family = AF_VSOCK; - addr->svm_port = VMADDR_PORT_ANY; - addr->svm_cid = VMADDR_CID_LOCAL; - *len = sizeof(*addr); -} - -static inline void init_addr_loopback(int family, struct sockaddr_storage *ss, - socklen_t *len) -{ - switch (family) { - case AF_INET: - init_addr_loopback4(ss, len); - return; - case AF_INET6: - init_addr_loopback6(ss, len); - return; - case AF_VSOCK: - init_addr_loopback_vsock(ss, len); - return; - default: - FAIL("unsupported address family %d", family); - } -} - -static inline struct sockaddr *sockaddr(struct sockaddr_storage *ss) -{ - return (struct sockaddr *)ss; -} - static inline int add_to_sockmap(int sock_mapfd, int fd1, int fd2) { u64 value; @@ -334,136 +83,4 @@ static inline int add_to_sockmap(int sock_mapfd, int fd1, int fd2) return xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); } -static inline int enable_reuseport(int s, int progfd) -{ - int err, one = 1; - - err = xsetsockopt(s, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)); - if (err) - return -1; - err = xsetsockopt(s, SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF, &progfd, - sizeof(progfd)); - if (err) - return -1; - - return 0; -} - -static inline int socket_loopback_reuseport(int family, int sotype, int progfd) -{ - struct sockaddr_storage addr; - socklen_t len = 0; - int err, s; - - init_addr_loopback(family, &addr, &len); - - s = xsocket(family, sotype, 0); - if (s == -1) - return -1; - - if (progfd >= 0) - enable_reuseport(s, progfd); - - err = xbind(s, sockaddr(&addr), len); - if (err) - goto close; - - if (sotype & SOCK_DGRAM) - return s; - - err = xlisten(s, SOMAXCONN); - if (err) - goto close; - - return s; -close: - xclose(s); - return -1; -} - -static inline int socket_loopback(int family, int sotype) -{ - return socket_loopback_reuseport(family, sotype, -1); -} - -static inline int create_pair(int family, int sotype, int *p0, int *p1) -{ - __close_fd int s, c = -1, p = -1; - struct sockaddr_storage addr; - socklen_t len = sizeof(addr); - int err; - - s = socket_loopback(family, sotype); - if (s < 0) - return s; - - err = xgetsockname(s, sockaddr(&addr), &len); - if (err) - return err; - - c = xsocket(family, sotype, 0); - if (c < 0) - return c; - - err = connect(c, sockaddr(&addr), len); - if (err) { - if (errno != EINPROGRESS) { - FAIL_ERRNO("connect"); - return err; - } - - err = poll_connect(c, IO_TIMEOUT_SEC); - if (err) { - FAIL_ERRNO("poll_connect"); - return err; - } - } - - switch (sotype & SOCK_TYPE_MASK) { - case SOCK_DGRAM: - err = xgetsockname(c, sockaddr(&addr), &len); - if (err) - return err; - - err = xconnect(s, sockaddr(&addr), len); - if (err) - return err; - - *p0 = take_fd(s); - break; - case SOCK_STREAM: - case SOCK_SEQPACKET: - p = xaccept_nonblock(s, NULL, NULL); - if (p < 0) - return p; - - *p0 = take_fd(p); - break; - default: - FAIL("Unsupported socket type %#x", sotype); - return -EOPNOTSUPP; - } - - *p1 = take_fd(c); - return 0; -} - -static inline int create_socket_pairs(int family, int sotype, int *c0, int *c1, - int *p0, int *p1) -{ - int err; - - err = create_pair(family, sotype, c0, p0); - if (err) - return err; - - err = create_pair(family, sotype, c1, p1); - if (err) { - close(*c0); - close(*p0); - } - - return err; -} - #endif // __SOCKMAP_HELPERS__ From 4a58963d10fa3cb654b859e3f9a8aecbcf9f4982 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 12 Dec 2024 19:40:57 -0800 Subject: [PATCH 0905/1450] selftests/bpf: Test bpf_skb_change_tail() in TC ingress Similarly to the previous test, we also need a test case to cover positive offsets as well, TC is an excellent hook for this. Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Tested-by: Zijian Zhang Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20241213034057.246437-5-xiyou.wangcong@gmail.com --- .../selftests/bpf/prog_tests/tc_change_tail.c | 62 ++++++++++ .../selftests/bpf/progs/test_tc_change_tail.c | 106 ++++++++++++++++++ 2 files changed, 168 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/tc_change_tail.c create mode 100644 tools/testing/selftests/bpf/progs/test_tc_change_tail.c diff --git a/tools/testing/selftests/bpf/prog_tests/tc_change_tail.c b/tools/testing/selftests/bpf/prog_tests/tc_change_tail.c new file mode 100644 index 0000000000000..74752233e779d --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/tc_change_tail.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +#include "test_tc_change_tail.skel.h" +#include "socket_helpers.h" + +#define LO_IFINDEX 1 + +void test_tc_change_tail(void) +{ + LIBBPF_OPTS(bpf_tcx_opts, tcx_opts); + struct test_tc_change_tail *skel = NULL; + struct bpf_link *link; + int c1, p1; + char buf[2]; + int ret; + + skel = test_tc_change_tail__open_and_load(); + if (!ASSERT_OK_PTR(skel, "test_tc_change_tail__open_and_load")) + return; + + link = bpf_program__attach_tcx(skel->progs.change_tail, LO_IFINDEX, + &tcx_opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_tcx")) + goto destroy; + + skel->links.change_tail = link; + ret = create_pair(AF_INET, SOCK_DGRAM, &c1, &p1); + if (!ASSERT_OK(ret, "create_pair")) + goto destroy; + + ret = xsend(p1, "Tr", 2, 0); + ASSERT_EQ(ret, 2, "xsend(p1)"); + ret = recv(c1, buf, 2, 0); + ASSERT_EQ(ret, 2, "recv(c1)"); + ASSERT_EQ(skel->data->change_tail_ret, 0, "change_tail_ret"); + + ret = xsend(p1, "G", 1, 0); + ASSERT_EQ(ret, 1, "xsend(p1)"); + ret = recv(c1, buf, 2, 0); + ASSERT_EQ(ret, 1, "recv(c1)"); + ASSERT_EQ(skel->data->change_tail_ret, 0, "change_tail_ret"); + + ret = xsend(p1, "E", 1, 0); + ASSERT_EQ(ret, 1, "xsend(p1)"); + ret = recv(c1, buf, 1, 0); + ASSERT_EQ(ret, 1, "recv(c1)"); + ASSERT_EQ(skel->data->change_tail_ret, -EINVAL, "change_tail_ret"); + + ret = xsend(p1, "Z", 1, 0); + ASSERT_EQ(ret, 1, "xsend(p1)"); + ret = recv(c1, buf, 1, 0); + ASSERT_EQ(ret, 1, "recv(c1)"); + ASSERT_EQ(skel->data->change_tail_ret, -EINVAL, "change_tail_ret"); + + close(c1); + close(p1); +destroy: + test_tc_change_tail__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_tc_change_tail.c b/tools/testing/selftests/bpf/progs/test_tc_change_tail.c new file mode 100644 index 0000000000000..28edafe803f04 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_tc_change_tail.c @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include + +long change_tail_ret = 1; + +static __always_inline struct iphdr *parse_ip_header(struct __sk_buff *skb, int *ip_proto) +{ + void *data_end = (void *)(long)skb->data_end; + void *data = (void *)(long)skb->data; + struct ethhdr *eth = data; + struct iphdr *iph; + + /* Verify Ethernet header */ + if ((void *)(data + sizeof(*eth)) > data_end) + return NULL; + + /* Skip Ethernet header to get to IP header */ + iph = (void *)(data + sizeof(struct ethhdr)); + + /* Verify IP header */ + if ((void *)(data + sizeof(struct ethhdr) + sizeof(*iph)) > data_end) + return NULL; + + /* Basic IP header validation */ + if (iph->version != 4) /* Only support IPv4 */ + return NULL; + + if (iph->ihl < 5) /* Minimum IP header length */ + return NULL; + + *ip_proto = iph->protocol; + return iph; +} + +static __always_inline struct udphdr *parse_udp_header(struct __sk_buff *skb, struct iphdr *iph) +{ + void *data_end = (void *)(long)skb->data_end; + void *hdr = (void *)iph; + struct udphdr *udp; + + /* Calculate UDP header position */ + udp = hdr + (iph->ihl * 4); + hdr = (void *)udp; + + /* Verify UDP header bounds */ + if ((void *)(hdr + sizeof(*udp)) > data_end) + return NULL; + + return udp; +} + +SEC("tc/ingress") +int change_tail(struct __sk_buff *skb) +{ + int len = skb->len; + struct udphdr *udp; + struct iphdr *iph; + void *data_end; + char *payload; + int ip_proto; + + bpf_skb_pull_data(skb, len); + + data_end = (void *)(long)skb->data_end; + iph = parse_ip_header(skb, &ip_proto); + if (!iph) + return TCX_PASS; + + if (ip_proto != IPPROTO_UDP) + return TCX_PASS; + + udp = parse_udp_header(skb, iph); + if (!udp) + return TCX_PASS; + + payload = (char *)udp + (sizeof(struct udphdr)); + if (payload + 1 > (char *)data_end) + return TCX_PASS; + + if (payload[0] == 'T') { /* Trim the packet */ + change_tail_ret = bpf_skb_change_tail(skb, len - 1, 0); + if (!change_tail_ret) + bpf_skb_change_tail(skb, len, 0); + return TCX_PASS; + } else if (payload[0] == 'G') { /* Grow the packet */ + change_tail_ret = bpf_skb_change_tail(skb, len + 1, 0); + if (!change_tail_ret) + bpf_skb_change_tail(skb, len, 0); + return TCX_PASS; + } else if (payload[0] == 'E') { /* Error */ + change_tail_ret = bpf_skb_change_tail(skb, 65535, 0); + return TCX_PASS; + } else if (payload[0] == 'Z') { /* Zero */ + change_tail_ret = bpf_skb_change_tail(skb, 0, 0); + return TCX_PASS; + } + return TCX_DROP; +} + +char _license[] SEC("license") = "GPL"; From c384481006476ac65478fa3584c7245782e52f34 Mon Sep 17 00:00:00 2001 From: Nikolaus Voss Date: Thu, 19 Dec 2024 11:54:11 +0100 Subject: [PATCH 0906/1450] clk: clk-imx8mp-audiomix: fix function signature clk_imx8mp_audiomix_reset_controller_register() in the "if !CONFIG_RESET_CONTROLLER" branch had the first argument missing. It is an empty function for this branch so it wasn't immediately apparent. Fixes: 6f0e817175c5 ("clk: imx: clk-audiomix: Add reset controller") Cc: # 6.12.x Signed-off-by: Nikolaus Voss Link: https://lore.kernel.org/r/20241219105447.889CB11FE@mail.steuer-voss.de Reviewed-by: Daniel Baluta Acked-by: Shengjiu Wang Reviewed-by: Peng Fan Signed-off-by: Stephen Boyd --- drivers/clk/imx/clk-imx8mp-audiomix.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/clk/imx/clk-imx8mp-audiomix.c b/drivers/clk/imx/clk-imx8mp-audiomix.c index b2cb157703c57..c409fc7e06186 100644 --- a/drivers/clk/imx/clk-imx8mp-audiomix.c +++ b/drivers/clk/imx/clk-imx8mp-audiomix.c @@ -278,7 +278,8 @@ static int clk_imx8mp_audiomix_reset_controller_register(struct device *dev, #else /* !CONFIG_RESET_CONTROLLER */ -static int clk_imx8mp_audiomix_reset_controller_register(struct clk_imx8mp_audiomix_priv *priv) +static int clk_imx8mp_audiomix_reset_controller_register(struct device *dev, + struct clk_imx8mp_audiomix_priv *priv) { return 0; } From d67393f4d28ef0544eaf382f1123dcaf56495dc9 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 5 Dec 2024 14:20:43 +0100 Subject: [PATCH 0907/1450] kbuild: Drop support for include/asm- in headers_check.pl "include/asm-" was replaced by "arch//include/asm" a long time ago. All assembler header files are now included using "#include ", so there is no longer a need to rewrite paths. Signed-off-by: Geert Uytterhoeven Signed-off-by: Masahiro Yamada --- usr/include/Makefile | 2 +- usr/include/headers_check.pl | 9 ++------- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/usr/include/Makefile b/usr/include/Makefile index 771e32872b2ab..6c6de1b1622b1 100644 --- a/usr/include/Makefile +++ b/usr/include/Makefile @@ -78,7 +78,7 @@ quiet_cmd_hdrtest = HDRTEST $< cmd_hdrtest = \ $(CC) $(c_flags) -fsyntax-only -x c /dev/null \ $(if $(filter-out $(no-header-test), $*.h), -include $< -include $<); \ - $(PERL) $(src)/headers_check.pl $(obj) $(SRCARCH) $<; \ + $(PERL) $(src)/headers_check.pl $(obj) $<; \ touch $@ $(obj)/%.hdrtest: $(obj)/%.h FORCE diff --git a/usr/include/headers_check.pl b/usr/include/headers_check.pl index b6aec5e4365f9..2b70bfa5558e6 100755 --- a/usr/include/headers_check.pl +++ b/usr/include/headers_check.pl @@ -3,9 +3,8 @@ # # headers_check.pl execute a number of trivial consistency checks # -# Usage: headers_check.pl dir arch [files...] +# Usage: headers_check.pl dir [files...] # dir: dir to look for included files -# arch: architecture # files: list of files to check # # The script reads the supplied files line by line and: @@ -23,7 +22,7 @@ use strict; use File::Basename; -my ($dir, $arch, @files) = @ARGV; +my ($dir, @files) = @ARGV; my $ret = 0; my $line; @@ -54,10 +53,6 @@ sub check_include my $inc = $1; my $found; $found = stat($dir . "/" . $inc); - if (!$found) { - $inc =~ s#asm/#asm-$arch/#; - $found = stat($dir . "/" . $inc); - } if (!$found) { printf STDERR "$filename:$lineno: included file '$inc' is not exported\n"; $ret = 1; From a34e92d2e831729f0ed5df20d15b4df419cd0ba4 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 3 Dec 2024 20:14:45 +0900 Subject: [PATCH 0908/1450] kbuild: deb-pkg: add debarch for ARCH=um 'make ARCH=um bindeb-pkg' shows the following warning. $ make ARCH=um bindeb-pkg [snip] GEN debian ** ** ** WARNING ** ** ** Your architecture doesn't have its equivalent Debian userspace architecture defined! Falling back to the current host architecture (amd64). Please add support for um to ./scripts/package/mkdebian ... This commit hard-codes i386/amd64 because UML is only supported for x86. Signed-off-by: Masahiro Yamada Reviewed-by: Nicolas Schier --- scripts/package/mkdebian | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/package/mkdebian b/scripts/package/mkdebian index 4ffcc70f8e319..b038a1380b8af 100755 --- a/scripts/package/mkdebian +++ b/scripts/package/mkdebian @@ -70,6 +70,13 @@ set_debarch() { debarch=sh4$(if_enabled_echo CONFIG_CPU_BIG_ENDIAN eb) fi ;; + um) + if is_enabled CONFIG_64BIT; then + debarch=amd64 + else + debarch=i386 + fi + ;; esac if [ -z "$debarch" ]; then debarch=$(dpkg-architecture -qDEB_HOST_ARCH) From 54956567a055345d17438f08c895c68aff3f4cf2 Mon Sep 17 00:00:00 2001 From: Nicolas Schier Date: Thu, 12 Dec 2024 14:05:29 +0100 Subject: [PATCH 0909/1450] kbuild: deb-pkg: Do not install maint scripts for arch 'um' Stop installing Debian maintainer scripts when building a user-mode-linux Debian package. Debian maintainer scripts are used for e.g. requesting rebuilds of initrd, rebuilding DKMS modules and updating of grub configuration. As all of this is not relevant for UML but also may lead to failures while processing the kernel hooks, do no more install maintainer scripts for the UML package. Suggested-by: Masahiro Yamada Signed-off-by: Nicolas Schier Signed-off-by: Masahiro Yamada --- scripts/package/builddeb | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/package/builddeb b/scripts/package/builddeb index fb686fd3266f0..ad7aba0f268e1 100755 --- a/scripts/package/builddeb +++ b/scripts/package/builddeb @@ -63,6 +63,12 @@ install_linux_image () { esac cp "$(${MAKE} -s -f ${srctree}/Makefile image_name)" "${pdir}/${installed_image_path}" + if [ "${ARCH}" != um ]; then + install_maint_scripts "${pdir}" + fi +} + +install_maint_scripts () { # Install the maintainer scripts # Note: hook scripts under /etc/kernel are also executed by official Debian # kernel packages, as well as kernel packages built using make-kpkg. From 9435dc77a33fa20afec7cd35ceaae5f7f42dbbe2 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 13 Dec 2024 00:46:15 +0900 Subject: [PATCH 0910/1450] modpost: distinguish same module paths from different dump files Since commit 13b25489b6f8 ("kbuild: change working directory to external module directory with M="), module paths are always relative to the top of the external module tree. The module paths recorded in Module.symvers are no longer globally unique when they are passed via KBUILD_EXTRA_SYMBOLS for building other external modules, which may result in false-positive "exported twice" errors. Such errors should not occur because external modules should be able to override in-tree modules. To address this, record the dump file path in struct module and check it when searching for a module. Fixes: 13b25489b6f8 ("kbuild: change working directory to external module directory with M=") Reported-by: Jon Hunter Closes: https://lore.kernel.org/all/eb21a546-a19c-40df-b821-bbba80f19a3d@nvidia.com/ Signed-off-by: Masahiro Yamada Tested-by: Jon Hunter --- scripts/mod/modpost.c | 17 +++++++++-------- scripts/mod/modpost.h | 3 ++- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index fb787a5715f5e..94ee49207a45a 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -155,12 +155,13 @@ char *get_line(char **stringp) /* A list of all modules we processed */ LIST_HEAD(modules); -static struct module *find_module(const char *modname) +static struct module *find_module(const char *filename, const char *modname) { struct module *mod; list_for_each_entry(mod, &modules, list) { - if (strcmp(mod->name, modname) == 0) + if (!strcmp(mod->dump_file, filename) && + !strcmp(mod->name, modname)) return mod; } return NULL; @@ -2030,10 +2031,10 @@ static void read_dump(const char *fname) continue; } - mod = find_module(modname); + mod = find_module(fname, modname); if (!mod) { mod = new_module(modname, strlen(modname)); - mod->from_dump = true; + mod->dump_file = fname; } s = sym_add_exported(symname, mod, gpl_only, namespace); sym_set_crc(s, crc); @@ -2052,7 +2053,7 @@ static void write_dump(const char *fname) struct symbol *sym; list_for_each_entry(mod, &modules, list) { - if (mod->from_dump) + if (mod->dump_file) continue; list_for_each_entry(sym, &mod->exported_symbols, list) { if (trim_unused_exports && !sym->used) @@ -2076,7 +2077,7 @@ static void write_namespace_deps_files(const char *fname) list_for_each_entry(mod, &modules, list) { - if (mod->from_dump || list_empty(&mod->missing_namespaces)) + if (mod->dump_file || list_empty(&mod->missing_namespaces)) continue; buf_printf(&ns_deps_buf, "%s.ko:", mod->name); @@ -2194,7 +2195,7 @@ int main(int argc, char **argv) read_symbols_from_files(files_source); list_for_each_entry(mod, &modules, list) { - if (mod->from_dump || mod->is_vmlinux) + if (mod->dump_file || mod->is_vmlinux) continue; check_modname_len(mod); @@ -2205,7 +2206,7 @@ int main(int argc, char **argv) handle_white_list_exports(unused_exports_white_list); list_for_each_entry(mod, &modules, list) { - if (mod->from_dump) + if (mod->dump_file) continue; if (mod->is_vmlinux) diff --git a/scripts/mod/modpost.h b/scripts/mod/modpost.h index 49848fcbe2a12..8b72c227ebf47 100644 --- a/scripts/mod/modpost.h +++ b/scripts/mod/modpost.h @@ -95,14 +95,15 @@ struct module_alias { /** * struct module - represent a module (vmlinux or *.ko) * + * @dump_file: path to the .symvers file if loaded from a file * @aliases: list head for module_aliases */ struct module { struct list_head list; struct list_head exported_symbols; struct list_head unresolved_symbols; + const char *dump_file; bool is_gpl_compatible; - bool from_dump; /* true if module was loaded from *.symvers */ bool is_vmlinux; bool seen; bool has_init; From e84a3bf7f4aa669c05e3884497774148ac111468 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 17 Dec 2024 10:19:04 -0500 Subject: [PATCH 0911/1450] staging: gpib: Fix allyesconfig build failures My tests run an allyesconfig build and it failed with the following errors: LD [M] samples/kfifo/dma-example.ko ld.lld: error: undefined symbol: nec7210_board_reset ld.lld: error: undefined symbol: nec7210_read ld.lld: error: undefined symbol: nec7210_write It appears that some modules call the function nec7210_board_reset() that is defined in nec7210.c. In an allyesconfig build, these other modules are built in. But the file that holds nec7210_board_reset() has: obj-m += nec7210.o Where that "-m" means it only gets built as a module. With the other modules built in, they have no access to nec7210_board_reset() and the build fails. This isn't the only function. After fixing that one, I hit another: ld.lld: error: undefined symbol: push_gpib_event ld.lld: error: undefined symbol: gpib_match_device_path Where push_gpib_event() was also used outside of the file it was defined in, and that file too only was built as a module. Since the directory that nec7210.c is only traversed when CONFIG_GPIB_NEC7210 is set, and the directory with gpib_common.c is only traversed when CONFIG_GPIB_COMMON is set, use those configs as the option to build those modules. When it is an allyesconfig, then they will both be built in and their functions will be available to the other modules that are also built in. Fixes: 3ba84ac69b53e ("staging: gpib: Add nec7210 GPIB chip driver") Fixes: 9dde4559e9395 ("staging: gpib: Add GPIB common core driver") Signed-off-by: Steven Rostedt (Google) Reviewed-by: Palmer Dabbelt Signed-off-by: Linus Torvalds --- drivers/staging/gpib/common/Makefile | 2 +- drivers/staging/gpib/nec7210/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/staging/gpib/common/Makefile b/drivers/staging/gpib/common/Makefile index 0c4c77bea75b3..460586edb5741 100644 --- a/drivers/staging/gpib/common/Makefile +++ b/drivers/staging/gpib/common/Makefile @@ -1,5 +1,5 @@ -obj-m += gpib_common.o +obj-$(CONFIG_GPIB_COMMON) += gpib_common.o gpib_common-objs := gpib_os.o iblib.o diff --git a/drivers/staging/gpib/nec7210/Makefile b/drivers/staging/gpib/nec7210/Makefile index 8d4d90f211092..64330f2e89d10 100644 --- a/drivers/staging/gpib/nec7210/Makefile +++ b/drivers/staging/gpib/nec7210/Makefile @@ -1,4 +1,4 @@ -obj-m += nec7210.o +obj-$(CONFIG_GPIB_NEC7210) += nec7210.o From 37d1d99b8806b24ffe4a2b453620df932994a5c0 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 17 Dec 2024 08:05:40 +0100 Subject: [PATCH 0912/1450] KVM: VMX: don't include '' directly The header clearly states that it does not want to be included directly, only via ''. Replace the include accordingly. Signed-off-by: Wolfram Sang Message-ID: <20241217070539.2433-2-wsa+renesas@sang-engineering.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/posted_intr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index 1715d2ab07be5..ad9116a99bccb 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -2,7 +2,7 @@ #ifndef __KVM_X86_VMX_POSTED_INTR_H #define __KVM_X86_VMX_POSTED_INTR_H -#include +#include #include void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu); From 398b7b6cb9e046f137a188670da12f790492b56b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 19 Dec 2024 07:43:20 -0500 Subject: [PATCH 0913/1450] KVM: x86: let it be known that ignore_msrs is a bad idea When running KVM with ignore_msrs=1 and report_ignored_msrs=0, the user has no clue that that the guest is being lied to. This may cause bug reports such as https://gitlab.com/qemu-project/qemu/-/issues/2571, where enabling a CPUID bit in QEMU caused Linux guests to try reading MSR_CU_DEF_ERR; and being lied about the existence of MSR_CU_DEF_ERR caused the guest to assume other things about the local APIC which were not true: Sep 14 12:02:53 kernel: mce: [Firmware Bug]: Your BIOS is not setting up LVT offset 0x2 for deferred error IRQs correctly. Sep 14 12:02:53 kernel: unchecked MSR access error: RDMSR from 0x852 at rIP: 0xffffffffb548ffa7 (native_read_msr+0x7/0x40) Sep 14 12:02:53 kernel: Call Trace: ... Sep 14 12:02:53 kernel: native_apic_msr_read+0x20/0x30 Sep 14 12:02:53 kernel: setup_APIC_eilvt+0x47/0x110 Sep 14 12:02:53 kernel: mce_amd_feature_init+0x485/0x4e0 ... Sep 14 12:02:53 kernel: [Firmware Bug]: cpu 0, try to use APIC520 (LVT offset 2) for vector 0xf4, but the register is already in use for vector 0x0 on this cpu Without reported_ignored_msrs=0 at least the host kernel log will contain enough information to avoid going on a wild goose chase. But if reports about individual MSR accesses are being silenced too, at least complain loudly the first time a VM is started. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c8160baf38385..12fa68a069661 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12724,6 +12724,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_hv_init_vm(kvm); kvm_xen_init_vm(kvm); + if (ignore_msrs && !report_ignored_msrs) { + pr_warn_once("Running KVM with ignore_msrs=1 and report_ignored_msrs=0 is not a\n" + "a supported configuration. Lying to the guest about the existence of MSRs\n" + "may cause the guest operating system to hang or produce errors. If a guest\n" + "does not run without ignore_msrs=1, please report it to kvm@vger.kernel.org.\n"); + } + return 0; out_uninit_mmu: From 4bbf9020becbfd8fc2c3da790855b7042fad455b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 22 Dec 2024 13:22:21 -0800 Subject: [PATCH 0914/1450] Linux 6.13-rc4 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e5b8a8832c0ca..5c9b1d2d59b4d 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 13 SUBLEVEL = 0 -EXTRAVERSION = -rc3 +EXTRAVERSION = -rc4 NAME = Baby Opossum Posse # *DOCUMENTATION* From 0b7a66a2c864859fbf9bb16229c03172eef02c05 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 5 Dec 2024 17:06:02 +0100 Subject: [PATCH 0915/1450] preempt: Move PREEMPT_RT before PREEMPT in vermagic. Since the dynamic preemption has been enabled for PREEMPT_RT we have now CONFIG_PREEMPT and CONFIG_PREEMPT_RT set simultaneously. This affects the vermagic strings which comes now PREEMPT with PREEMPT_RT enabled. The PREEMPT_RT module usually can not be loaded on a PREEMPT kernel because some symbols are missing. However if the symbols are fine then it continues and it crashes later. The problem is that the struct module has a different layout and the num_exentries or init members are at a different position leading to a crash later on. This is not necessary caught by the size check in elf_validity_cache_index_mod() because the mem member has an alignment requirement of __module_memory_align which is big enough keep the total size unchanged. Therefore we should keep the string accurate instead of removing it. Move the PREEMPT_RT check before the PREEMPT so that it takes precedence if both symbols are enabled. Fixes: 35772d627b55c ("sched: Enable PREEMPT_DYNAMIC for PREEMPT_RT") Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Petr Pavlu Link: https://lore.kernel.org/r/20241205160602.3lIAsJRT@linutronix.de Signed-off-by: Petr Pavlu --- include/linux/vermagic.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/vermagic.h b/include/linux/vermagic.h index a54046bf37e55..939ceabcaf06f 100644 --- a/include/linux/vermagic.h +++ b/include/linux/vermagic.h @@ -15,10 +15,10 @@ #else #define MODULE_VERMAGIC_SMP "" #endif -#ifdef CONFIG_PREEMPT_BUILD -#define MODULE_VERMAGIC_PREEMPT "preempt " -#elif defined(CONFIG_PREEMPT_RT) +#ifdef CONFIG_PREEMPT_RT #define MODULE_VERMAGIC_PREEMPT "preempt_rt " +#elif defined(CONFIG_PREEMPT_BUILD) +#define MODULE_VERMAGIC_PREEMPT "preempt " #else #define MODULE_VERMAGIC_PREEMPT "" #endif From 36131b72fb1c62bc61e86068618de304763b8ac7 Mon Sep 17 00:00:00 2001 From: Sean Nyekjaer Date: Thu, 14 Nov 2024 10:14:50 +0100 Subject: [PATCH 0916/1450] can: tcan4x5x: add option for selecting nWKRQ voltage The nWKRQ pin supports an output voltage of either the internal reference voltage (3.6V) or the reference voltage of the digital interface 0-6V (VIO). Add the devicetree option ti,nwkrq-voltage-vio to set it to VIO. If this property is omitted the reset default, the internal reference voltage, is used. Signed-off-by: Sean Nyekjaer Reviewed-by: Marc Kleine-Budde Reviewed-by: Vincent Mailhol Link: https://patch.msgid.link/20241114-tcan-wkrqv-v5-2-a2d50833ed71@geanix.com [mkl: remove unused variable in tcan4x5x_get_dt_data()] Signed-off-by: Marc Kleine-Budde --- drivers/net/can/m_can/tcan4x5x-core.c | 19 +++++++++++++++++++ drivers/net/can/m_can/tcan4x5x.h | 2 ++ 2 files changed, 21 insertions(+) diff --git a/drivers/net/can/m_can/tcan4x5x-core.c b/drivers/net/can/m_can/tcan4x5x-core.c index 2f73bf3abad88..84b914056b7d0 100644 --- a/drivers/net/can/m_can/tcan4x5x-core.c +++ b/drivers/net/can/m_can/tcan4x5x-core.c @@ -92,6 +92,8 @@ #define TCAN4X5X_MODE_STANDBY BIT(6) #define TCAN4X5X_MODE_NORMAL BIT(7) +#define TCAN4X5X_NWKRQ_VOLTAGE_VIO BIT(19) + #define TCAN4X5X_DISABLE_WAKE_MSK (BIT(31) | BIT(30)) #define TCAN4X5X_DISABLE_INH_MSK BIT(9) @@ -267,6 +269,13 @@ static int tcan4x5x_init(struct m_can_classdev *cdev) if (ret) return ret; + if (tcan4x5x->nwkrq_voltage_vio) { + ret = regmap_set_bits(tcan4x5x->regmap, TCAN4X5X_CONFIG, + TCAN4X5X_NWKRQ_VOLTAGE_VIO); + if (ret) + return ret; + } + return ret; } @@ -318,6 +327,14 @@ static const struct tcan4x5x_version_info return &tcan4x5x_versions[TCAN4X5X]; } +static void tcan4x5x_get_dt_data(struct m_can_classdev *cdev) +{ + struct tcan4x5x_priv *tcan4x5x = cdev_to_priv(cdev); + + tcan4x5x->nwkrq_voltage_vio = + of_property_read_bool(cdev->dev->of_node, "ti,nwkrq-voltage-vio"); +} + static int tcan4x5x_get_gpios(struct m_can_classdev *cdev, const struct tcan4x5x_version_info *version_info) { @@ -453,6 +470,8 @@ static int tcan4x5x_can_probe(struct spi_device *spi) goto out_power; } + tcan4x5x_get_dt_data(mcan_class); + tcan4x5x_check_wake(priv); ret = tcan4x5x_write_tcan_reg(mcan_class, TCAN4X5X_INT_EN, 0); diff --git a/drivers/net/can/m_can/tcan4x5x.h b/drivers/net/can/m_can/tcan4x5x.h index e62c030d3e1e5..203399d5e8ccf 100644 --- a/drivers/net/can/m_can/tcan4x5x.h +++ b/drivers/net/can/m_can/tcan4x5x.h @@ -42,6 +42,8 @@ struct tcan4x5x_priv { struct tcan4x5x_map_buf map_buf_rx; struct tcan4x5x_map_buf map_buf_tx; + + bool nwkrq_voltage_vio; }; static inline void From bddad4fac9f73c14f57b111058dd0fa6d9ede228 Mon Sep 17 00:00:00 2001 From: Dario Binacchi Date: Fri, 22 Nov 2024 23:15:44 +0100 Subject: [PATCH 0917/1450] can: sun4i_can: continue to use likely() to check skb Throughout the sun4i_can_err() function, the likely() macro is used to check the skb buffer, except in one instance. This patch makes the code consistent by using the macro in that case as well. Signed-off-by: Dario Binacchi Link: https://patch.msgid.link/20241122221650.633981-4-dario.binacchi@amarulasolutions.com Signed-off-by: Marc Kleine-Budde --- drivers/net/can/sun4i_can.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/can/sun4i_can.c b/drivers/net/can/sun4i_can.c index 360158c295d34..48d31197adec2 100644 --- a/drivers/net/can/sun4i_can.c +++ b/drivers/net/can/sun4i_can.c @@ -570,7 +570,7 @@ static int sun4i_can_err(struct net_device *dev, u8 isrc, u8 status) else state = CAN_STATE_ERROR_ACTIVE; } - if (skb && state != CAN_STATE_BUS_OFF) { + if (likely(skb) && state != CAN_STATE_BUS_OFF) { cf->can_id |= CAN_ERR_CNT; cf->data[6] = txerr; cf->data[7] = rxerr; From 68d426da13fac6b1d3f5949a38d31ce2e3d88e49 Mon Sep 17 00:00:00 2001 From: Sean Nyekjaer Date: Thu, 28 Nov 2024 09:32:31 +0100 Subject: [PATCH 0918/1450] can: tcan4x5x: get rid of false clock errors tcan4x5x devices only requires the clock "cclk", so call devm_clk_get() directly. This is done to avoid m_can_class_get_clocks() that checks for both hclk and cclk and results in this warning message: | tcan4x5x spi0.0: no clock found Signed-off-by: Sean Nyekjaer Link: https://patch.msgid.link/20241128-mcancclk-v1-1-a93aac64dbae@geanix.com Signed-off-by: Marc Kleine-Budde --- drivers/net/can/m_can/tcan4x5x-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/can/m_can/tcan4x5x-core.c b/drivers/net/can/m_can/tcan4x5x-core.c index 84b914056b7d0..4c94541766072 100644 --- a/drivers/net/can/m_can/tcan4x5x-core.c +++ b/drivers/net/can/m_can/tcan4x5x-core.c @@ -409,7 +409,7 @@ static int tcan4x5x_can_probe(struct spi_device *spi) priv->power = NULL; } - m_can_class_get_clocks(mcan_class); + mcan_class->cclk = devm_clk_get(mcan_class->dev, "cclk"); if (IS_ERR(mcan_class->cclk)) { dev_err(&spi->dev, "no CAN clock source defined\n"); freq = TCAN4X5X_EXT_CLK_DEF; From 2351998fd833eb40358adf0b889637311b5bc6b4 Mon Sep 17 00:00:00 2001 From: Charan Pedumuru Date: Wed, 20 Nov 2024 13:58:08 +0530 Subject: [PATCH 0919/1450] dt-bindings: net: can: atmel: Convert to json schema Convert old text based binding to json schema. Changes during conversion: - Add a fallback for `microchip,sam9x60-can` as it is compatible with the CAN IP core on `atmel,at91sam9x5-can`. - Add the required properties `clock` and `clock-names`, which were missing in the original binding. - Update examples and include appropriate file directives to resolve errors identified by `dt_binding_check` and `dtbs_check`. Signed-off-by: Charan Pedumuru Reviewed-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20241120-can-v3-1-da5bb4f6128d@microchip.com [mkl: fixed indention in example] Signed-off-by: Marc Kleine-Budde --- .../net/can/atmel,at91sam9263-can.yaml | 58 +++++++++++++++++++ .../devicetree/bindings/net/can/atmel-can.txt | 15 ----- 2 files changed, 58 insertions(+), 15 deletions(-) create mode 100644 Documentation/devicetree/bindings/net/can/atmel,at91sam9263-can.yaml delete mode 100644 Documentation/devicetree/bindings/net/can/atmel-can.txt diff --git a/Documentation/devicetree/bindings/net/can/atmel,at91sam9263-can.yaml b/Documentation/devicetree/bindings/net/can/atmel,at91sam9263-can.yaml new file mode 100644 index 0000000000000..c818c01a718b0 --- /dev/null +++ b/Documentation/devicetree/bindings/net/can/atmel,at91sam9263-can.yaml @@ -0,0 +1,58 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/net/can/atmel,at91sam9263-can.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Microchip AT91 CAN Controller + +maintainers: + - Nicolas Ferre + +allOf: + - $ref: can-controller.yaml# + +properties: + compatible: + oneOf: + - enum: + - atmel,at91sam9263-can + - atmel,at91sam9x5-can + - items: + - enum: + - microchip,sam9x60-can + - const: atmel,at91sam9x5-can + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + clocks: + maxItems: 1 + + clock-names: + items: + - const: can_clk + +required: + - compatible + - reg + - interrupts + - clocks + - clock-names + +unevaluatedProperties: false + +examples: + - | + #include + #include + can@f000c000 { + compatible = "atmel,at91sam9263-can"; + reg = <0xf000c000 0x300>; + interrupts = <30 IRQ_TYPE_LEVEL_HIGH 3>; + clocks = <&pmc PMC_TYPE_PERIPHERAL 12>; + clock-names = "can_clk"; + }; diff --git a/Documentation/devicetree/bindings/net/can/atmel-can.txt b/Documentation/devicetree/bindings/net/can/atmel-can.txt deleted file mode 100644 index 218a3b3eb27ea..0000000000000 --- a/Documentation/devicetree/bindings/net/can/atmel-can.txt +++ /dev/null @@ -1,15 +0,0 @@ -* AT91 CAN * - -Required properties: - - compatible: Should be "atmel,at91sam9263-can", "atmel,at91sam9x5-can" or - "microchip,sam9x60-can" - - reg: Should contain CAN controller registers location and length - - interrupts: Should contain IRQ line for the CAN controller - -Example: - - can0: can@f000c000 { - compatible = "atmel,at91sam9x5-can"; - reg = <0xf000c000 0x300>; - interrupts = <40 4 5> - }; From 57769cb9ccbad92c5126264584346ebc8501b353 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Sat, 30 Nov 2024 18:08:34 +0100 Subject: [PATCH 0920/1450] mailmap: add an entry for Oliver Hartkopp Map my retired company address and an accidentally used personal mail address within mailmap. Signed-off-by: Oliver Hartkopp Link: https://patch.msgid.link/20241130170911.2828-1-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- .mailmap | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.mailmap b/.mailmap index 5ff0e5d681e7c..37ecf2632be38 100644 --- a/.mailmap +++ b/.mailmap @@ -529,6 +529,8 @@ Oleksij Rempel Oleksij Rempel Oleksij Rempel Oleksij Rempel +Oliver Hartkopp +Oliver Hartkopp Oliver Upton Ondřej Jirman Oza Pawandeep From 1263e69a7c47a68537476298f13f943fb954581e Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Thu, 19 Dec 2024 20:08:37 +0100 Subject: [PATCH 0921/1450] MAINTAINERS: assign em_canid.c additionally to CAN maintainers The extended match rule em_canid is used to classify CAN frames based on their CAN Identifier. To keep the CAN maintainers in the loop for relevant changes which might affect the CAN specific functionality add em_canid.c to the CAN NETWORK LAYER files. Signed-off-by: Oliver Hartkopp Link: https://patch.msgid.link/20241219190837.3087-1-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 0456a33ef6579..a7716f48f50c6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5065,6 +5065,7 @@ F: include/uapi/linux/can/gw.h F: include/uapi/linux/can/isotp.h F: include/uapi/linux/can/raw.h F: net/can/ +F: net/sched/em_canid.c CAN-J1939 NETWORK LAYER M: Robin van der Gracht From d50c837675a95f733e53a5e21eb168f8c9f5a73d Mon Sep 17 00:00:00 2001 From: Ariel Otilibili Date: Sat, 21 Dec 2024 12:06:49 +0100 Subject: [PATCH 0922/1450] can: dev: can_get_state_str(): Remove dead code The default switch case ends with a return; meaning this return is never reached. Coverity-ID: 1497123 Signed-off-by: Ariel Otilibili Link: https://patch.msgid.link/20241221111454.1074285-4-ariel.otilibili-anieli@eurecom.fr Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev/dev.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c index 6792c14fd7eb0..93035a7d50f34 100644 --- a/drivers/net/can/dev/dev.c +++ b/drivers/net/can/dev/dev.c @@ -85,8 +85,6 @@ const char *can_get_state_str(const enum can_state state) default: return ""; } - - return ""; } EXPORT_SYMBOL_GPL(can_get_state_str); From a502ea6fa94b1f7be72a24bcf9e3f5f6b7e6e90c Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Wed, 18 Dec 2024 17:21:16 +0100 Subject: [PATCH 0923/1450] udp: Deal with race between UDP socket address change and rehash If a UDP socket changes its local address while it's receiving datagrams, as a result of connect(), there is a period during which a lookup operation might fail to find it, after the address is changed but before the secondary hash (port and address) and the four-tuple hash (local and remote ports and addresses) are updated. Secondary hash chains were introduced by commit 30fff9231fad ("udp: bind() optimisation") and, as a result, a rehash operation became needed to make a bound socket reachable again after a connect(). This operation was introduced by commit 719f835853a9 ("udp: add rehash on connect()") which isn't however a complete fix: the socket will be found once the rehashing completes, but not while it's pending. This is noticeable with a socat(1) server in UDP4-LISTEN mode, and a client sending datagrams to it. After the server receives the first datagram (cf. _xioopen_ipdgram_listen()), it issues a connect() to the address of the sender, in order to set up a directed flow. Now, if the client, running on a different CPU thread, happens to send a (subsequent) datagram while the server's socket changes its address, but is not rehashed yet, this will result in a failed lookup and a port unreachable error delivered to the client, as apparent from the following reproducer: LEN=$(($(cat /proc/sys/net/core/wmem_default) / 4)) dd if=/dev/urandom bs=1 count=${LEN} of=tmp.in while :; do taskset -c 1 socat UDP4-LISTEN:1337,null-eof OPEN:tmp.out,create,trunc & sleep 0.1 || sleep 1 taskset -c 2 socat OPEN:tmp.in UDP4:localhost:1337,shut-null wait done where the client will eventually get ECONNREFUSED on a write() (typically the second or third one of a given iteration): 2024/11/13 21:28:23 socat[46901] E write(6, 0x556db2e3c000, 8192): Connection refused This issue was first observed as a seldom failure in Podman's tests checking UDP functionality while using pasta(1) to connect the container's network namespace, which leads us to a reproducer with the lookup error resulting in an ICMP packet on a tap device: LOCAL_ADDR="$(ip -j -4 addr show|jq -rM '.[] | .addr_info[0] | select(.scope == "global").local')" while :; do ./pasta --config-net -p pasta.pcap -u 1337 socat UDP4-LISTEN:1337,null-eof OPEN:tmp.out,create,trunc & sleep 0.2 || sleep 1 socat OPEN:tmp.in UDP4:${LOCAL_ADDR}:1337,shut-null wait cmp tmp.in tmp.out done Once this fails: tmp.in tmp.out differ: char 8193, line 29 we can finally have a look at what's going on: $ tshark -r pasta.pcap 1 0.000000 :: ? ff02::16 ICMPv6 110 Multicast Listener Report Message v2 2 0.168690 88.198.0.161 ? 88.198.0.164 UDP 8234 60260 ? 1337 Len=8192 3 0.168767 88.198.0.161 ? 88.198.0.164 UDP 8234 60260 ? 1337 Len=8192 4 0.168806 88.198.0.161 ? 88.198.0.164 UDP 8234 60260 ? 1337 Len=8192 5 0.168827 c6:47:05:8d:dc:04 ? Broadcast ARP 42 Who has 88.198.0.161? Tell 88.198.0.164 6 0.168851 9a:55:9a:55:9a:55 ? c6:47:05:8d:dc:04 ARP 42 88.198.0.161 is at 9a:55:9a:55:9a:55 7 0.168875 88.198.0.161 ? 88.198.0.164 UDP 8234 60260 ? 1337 Len=8192 8 0.168896 88.198.0.164 ? 88.198.0.161 ICMP 590 Destination unreachable (Port unreachable) 9 0.168926 88.198.0.161 ? 88.198.0.164 UDP 8234 60260 ? 1337 Len=8192 10 0.168959 88.198.0.161 ? 88.198.0.164 UDP 8234 60260 ? 1337 Len=8192 11 0.168989 88.198.0.161 ? 88.198.0.164 UDP 4138 60260 ? 1337 Len=4096 12 0.169010 88.198.0.161 ? 88.198.0.164 UDP 42 60260 ? 1337 Len=0 On the third datagram received, the network namespace of the container initiates an ARP lookup to deliver the ICMP message. In another variant of this reproducer, starting the client with: strace -f pasta --config-net -u 1337 socat UDP4-LISTEN:1337,null-eof OPEN:tmp.out,create,trunc 2>strace.log & and connecting to the socat server using a loopback address: socat OPEN:tmp.in UDP4:localhost:1337,shut-null we can more clearly observe a sendmmsg() call failing after the first datagram is delivered: [pid 278012] connect(173, 0x7fff96c95fc0, 16) = 0 [...] [pid 278012] recvmmsg(173, 0x7fff96c96020, 1024, MSG_DONTWAIT, NULL) = -1 EAGAIN (Resource temporarily unavailable) [pid 278012] sendmmsg(173, 0x561c5ad0a720, 1, MSG_NOSIGNAL) = 1 [...] [pid 278012] sendmmsg(173, 0x561c5ad0a720, 1, MSG_NOSIGNAL) = -1 ECONNREFUSED (Connection refused) and, somewhat confusingly, after a connect() on the same socket succeeded. Until commit 4cdeeee9252a ("net: udp: prefer listeners bound to an address"), the race between receive address change and lookup didn't actually cause visible issues, because, once the lookup based on the secondary hash chain failed, we would still attempt a lookup based on the primary hash (destination port only), and find the socket with the outdated secondary hash. That change, however, dropped port-only lookups altogether, as side effect, making the race visible. To fix this, while avoiding the need to make address changes and rehash atomic against lookups, reintroduce primary hash lookups as fallback, if lookups based on four-tuple and secondary hashes fail. To this end, introduce a simplified lookup implementation, which doesn't take care of SO_REUSEPORT groups: if we have one, there are multiple sockets that would match the four-tuple or secondary hash, meaning that we can't run into this race at all. v2: - instead of synchronising lookup operations against address change plus rehash, reintroduce a simplified version of the original primary hash lookup as fallback v1: - fix build with CONFIG_IPV6=n: add ifdef around sk_v6_rcv_saddr usage (Kuniyuki Iwashima) - directly use sk_rcv_saddr for IPv4 receive addresses instead of fetching inet_rcv_saddr (Kuniyuki Iwashima) - move inet_update_saddr() to inet_hashtables.h and use that to set IPv4/IPv6 addresses as suitable (Kuniyuki Iwashima) - rebase onto net-next, update commit message accordingly Reported-by: Ed Santiago Link: https://github.com/containers/podman/issues/24147 Analysed-by: David Gibson Fixes: 30fff9231fad ("udp: bind() optimisation") Signed-off-by: Stefano Brivio Reviewed-by: Eric Dumazet Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/udp.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv6/udp.c | 50 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index e8953e88efef9..4bc0a0686fcd8 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -420,6 +420,49 @@ u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, } EXPORT_SYMBOL(udp_ehashfn); +/** + * udp4_lib_lookup1() - Simplified lookup using primary hash (destination port) + * @net: Network namespace + * @saddr: Source address, network order + * @sport: Source port, network order + * @daddr: Destination address, network order + * @hnum: Destination port, host order + * @dif: Destination interface index + * @sdif: Destination bridge port index, if relevant + * @udptable: Set of UDP hash tables + * + * Simplified lookup to be used as fallback if no sockets are found due to a + * potential race between (receive) address change, and lookup happening before + * the rehash operation. This function ignores SO_REUSEPORT groups while scoring + * result sockets, because if we have one, we don't need the fallback at all. + * + * Called under rcu_read_lock(). + * + * Return: socket with highest matching score if any, NULL if none + */ +static struct sock *udp4_lib_lookup1(const struct net *net, + __be32 saddr, __be16 sport, + __be32 daddr, unsigned int hnum, + int dif, int sdif, + const struct udp_table *udptable) +{ + unsigned int slot = udp_hashfn(net, hnum, udptable->mask); + struct udp_hslot *hslot = &udptable->hash[slot]; + struct sock *sk, *result = NULL; + int score, badness = 0; + + sk_for_each_rcu(sk, &hslot->head) { + score = compute_score(sk, net, + saddr, sport, daddr, hnum, dif, sdif); + if (score > badness) { + result = sk; + badness = score; + } + } + + return result; +} + /* called with rcu_read_lock() */ static struct sock *udp4_lib_lookup2(const struct net *net, __be32 saddr, __be16 sport, @@ -683,6 +726,19 @@ struct sock *__udp4_lib_lookup(const struct net *net, __be32 saddr, result = udp4_lib_lookup2(net, saddr, sport, htonl(INADDR_ANY), hnum, dif, sdif, hslot2, skb); + if (!IS_ERR_OR_NULL(result)) + goto done; + + /* Primary hash (destination port) lookup as fallback for this race: + * 1. __ip4_datagram_connect() sets sk_rcv_saddr + * 2. lookup (this function): new sk_rcv_saddr, hashes not updated yet + * 3. rehash operation updating _secondary and four-tuple_ hashes + * The primary hash doesn't need an update after 1., so, thanks to this + * further step, 1. and 3. don't need to be atomic against the lookup. + */ + result = udp4_lib_lookup1(net, saddr, sport, daddr, hnum, dif, sdif, + udptable); + done: if (IS_ERR(result)) return NULL; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 7c14c449804cb..6671daa67f4fa 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -170,6 +170,49 @@ static int compute_score(struct sock *sk, const struct net *net, return score; } +/** + * udp6_lib_lookup1() - Simplified lookup using primary hash (destination port) + * @net: Network namespace + * @saddr: Source address, network order + * @sport: Source port, network order + * @daddr: Destination address, network order + * @hnum: Destination port, host order + * @dif: Destination interface index + * @sdif: Destination bridge port index, if relevant + * @udptable: Set of UDP hash tables + * + * Simplified lookup to be used as fallback if no sockets are found due to a + * potential race between (receive) address change, and lookup happening before + * the rehash operation. This function ignores SO_REUSEPORT groups while scoring + * result sockets, because if we have one, we don't need the fallback at all. + * + * Called under rcu_read_lock(). + * + * Return: socket with highest matching score if any, NULL if none + */ +static struct sock *udp6_lib_lookup1(const struct net *net, + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, + unsigned int hnum, int dif, int sdif, + const struct udp_table *udptable) +{ + unsigned int slot = udp_hashfn(net, hnum, udptable->mask); + struct udp_hslot *hslot = &udptable->hash[slot]; + struct sock *sk, *result = NULL; + int score, badness = 0; + + sk_for_each_rcu(sk, &hslot->head) { + score = compute_score(sk, net, + saddr, sport, daddr, hnum, dif, sdif); + if (score > badness) { + result = sk; + badness = score; + } + } + + return result; +} + /* called with rcu_read_lock() */ static struct sock *udp6_lib_lookup2(const struct net *net, const struct in6_addr *saddr, __be16 sport, @@ -347,6 +390,13 @@ struct sock *__udp6_lib_lookup(const struct net *net, result = udp6_lib_lookup2(net, saddr, sport, &in6addr_any, hnum, dif, sdif, hslot2, skb); + if (!IS_ERR_OR_NULL(result)) + goto done; + + /* Cover address change/lookup/rehash race: see __udp4_lib_lookup() */ + result = udp6_lib_lookup1(net, saddr, sport, daddr, hnum, dif, sdif, + udptable); + done: if (IS_ERR(result)) return NULL; From b8ea3b1ff544b47c1d64a22860f33b755638164e Mon Sep 17 00:00:00 2001 From: Bharath SM Date: Fri, 13 Dec 2024 22:50:21 +0530 Subject: [PATCH 0924/1450] smb: enable reuse of deferred file handles for write operations Previously, deferred file handles were reused only for read operations, this commit extends to reusing deferred handles for write operations. By reusing these handles we can reduce the need for open/close operations over the wire. Signed-off-by: Bharath SM Signed-off-by: Steve French --- fs/smb/client/file.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index a58a3333ecc30..3b2d33291a7e6 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -990,7 +990,11 @@ int cifs_open(struct inode *inode, struct file *file) } /* Get the cached handle as SMB2 close is deferred */ - rc = cifs_get_readable_path(tcon, full_path, &cfile); + if (OPEN_FMODE(file->f_flags) & FMODE_WRITE) { + rc = cifs_get_writable_path(tcon, full_path, FIND_WR_FSUID_ONLY, &cfile); + } else { + rc = cifs_get_readable_path(tcon, full_path, &cfile); + } if (rc == 0) { if (file->f_flags == cfile->f_flags) { file->private_data = cfile; From f17224c2a7bdc11a17c96d9d8cb2d829f54d40bb Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 20 Dec 2024 21:59:37 +0000 Subject: [PATCH 0925/1450] cifs: Remove unused is_server_using_iface() The last use of is_server_using_iface() was removed in 2022 by commit aa45dadd34e4 ("cifs: change iface_list from array to sorted linked list") Remove it. Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Steve French --- fs/smb/client/cifsproto.h | 2 -- fs/smb/client/sess.c | 25 ------------------------- 2 files changed, 27 deletions(-) diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index 754417cb3294e..d26f9bbb53829 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -614,8 +614,6 @@ int cifs_alloc_hash(const char *name, struct shash_desc **sdesc); void cifs_free_hash(struct shash_desc **sdesc); int cifs_try_adding_channels(struct cifs_ses *ses); -bool is_server_using_iface(struct TCP_Server_Info *server, - struct cifs_server_iface *iface); bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface); void cifs_ses_mark_for_reconnect(struct cifs_ses *ses); diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index 3306fb6551368..91d4d409cb1dc 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -27,31 +27,6 @@ static int cifs_ses_add_channel(struct cifs_ses *ses, struct cifs_server_iface *iface); -bool -is_server_using_iface(struct TCP_Server_Info *server, - struct cifs_server_iface *iface) -{ - struct sockaddr_in *i4 = (struct sockaddr_in *)&iface->sockaddr; - struct sockaddr_in6 *i6 = (struct sockaddr_in6 *)&iface->sockaddr; - struct sockaddr_in *s4 = (struct sockaddr_in *)&server->dstaddr; - struct sockaddr_in6 *s6 = (struct sockaddr_in6 *)&server->dstaddr; - - if (server->dstaddr.ss_family != iface->sockaddr.ss_family) - return false; - if (server->dstaddr.ss_family == AF_INET) { - if (s4->sin_addr.s_addr != i4->sin_addr.s_addr) - return false; - } else if (server->dstaddr.ss_family == AF_INET6) { - if (memcmp(&s6->sin6_addr, &i6->sin6_addr, - sizeof(i6->sin6_addr)) != 0) - return false; - } else { - /* unknown family.. */ - return false; - } - return true; -} - bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface) { int i; From 8673a6c2d9e483dfeeef83a1f06f59e05636f4d1 Mon Sep 17 00:00:00 2001 From: wenglianfa Date: Fri, 20 Dec 2024 13:52:46 +0800 Subject: [PATCH 0926/1450] RDMA/hns: Fix mapping error of zero-hop WQE buffer Due to HW limitation, the three region of WQE buffer must be mapped and set to HW in a fixed order: SQ buffer, SGE buffer, and RQ buffer. Currently when one region is zero-hop while the other two are not, the zero-hop region will not be mapped. This violate the limitation above and leads to address error. Fixes: 38389eaa4db1 ("RDMA/hns: Add mtr support for mixed multihop addressing") Signed-off-by: wenglianfa Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20241220055249.146943-2-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hem.c | 43 ++++++++++++++++-------- drivers/infiniband/hw/hns/hns_roce_mr.c | 5 --- 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c index f84521be3bea4..605562122ecce 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.c +++ b/drivers/infiniband/hw/hns/hns_roce_hem.c @@ -931,6 +931,7 @@ struct hns_roce_hem_item { size_t count; /* max ba numbers */ int start; /* start buf offset in this hem */ int end; /* end buf offset in this hem */ + bool exist_bt; }; /* All HEM items are linked in a tree structure */ @@ -959,6 +960,7 @@ hem_list_alloc_item(struct hns_roce_dev *hr_dev, int start, int end, int count, } } + hem->exist_bt = exist_bt; hem->count = count; hem->start = start; hem->end = end; @@ -969,22 +971,22 @@ hem_list_alloc_item(struct hns_roce_dev *hr_dev, int start, int end, int count, } static void hem_list_free_item(struct hns_roce_dev *hr_dev, - struct hns_roce_hem_item *hem, bool exist_bt) + struct hns_roce_hem_item *hem) { - if (exist_bt) + if (hem->exist_bt) dma_free_coherent(hr_dev->dev, hem->count * BA_BYTE_LEN, hem->addr, hem->dma_addr); kfree(hem); } static void hem_list_free_all(struct hns_roce_dev *hr_dev, - struct list_head *head, bool exist_bt) + struct list_head *head) { struct hns_roce_hem_item *hem, *temp_hem; list_for_each_entry_safe(hem, temp_hem, head, list) { list_del(&hem->list); - hem_list_free_item(hr_dev, hem, exist_bt); + hem_list_free_item(hr_dev, hem); } } @@ -1084,6 +1086,10 @@ int hns_roce_hem_list_calc_root_ba(const struct hns_roce_buf_region *regions, for (i = 0; i < region_cnt; i++) { r = (struct hns_roce_buf_region *)®ions[i]; + /* when r->hopnum = 0, the region should not occupy root_ba. */ + if (!r->hopnum) + continue; + if (r->hopnum > 1) { step = hem_list_calc_ba_range(r->hopnum, 1, unit); if (step > 0) @@ -1177,7 +1183,7 @@ static int hem_list_alloc_mid_bt(struct hns_roce_dev *hr_dev, err_exit: for (level = 1; level < hopnum; level++) - hem_list_free_all(hr_dev, &temp_list[level], true); + hem_list_free_all(hr_dev, &temp_list[level]); return ret; } @@ -1218,16 +1224,26 @@ static int alloc_fake_root_bt(struct hns_roce_dev *hr_dev, void *cpu_base, { struct hns_roce_hem_item *hem; + /* This is on the has_mtt branch, if r->hopnum + * is 0, there is no root_ba to reuse for the + * region's fake hem, so a dma_alloc request is + * necessary here. + */ hem = hem_list_alloc_item(hr_dev, r->offset, r->offset + r->count - 1, - r->count, false); + r->count, !r->hopnum); if (!hem) return -ENOMEM; - hem_list_assign_bt(hem, cpu_base, phy_base); + /* The root_ba can be reused only when r->hopnum > 0. */ + if (r->hopnum) + hem_list_assign_bt(hem, cpu_base, phy_base); list_add(&hem->list, branch_head); list_add(&hem->sibling, leaf_head); - return r->count; + /* If r->hopnum == 0, 0 is returned, + * so that the root_bt entry is not occupied. + */ + return r->hopnum ? r->count : 0; } static int setup_middle_bt(struct hns_roce_dev *hr_dev, void *cpu_base, @@ -1271,7 +1287,7 @@ setup_root_hem(struct hns_roce_dev *hr_dev, struct hns_roce_hem_list *hem_list, return -ENOMEM; total = 0; - for (i = 0; i < region_cnt && total < max_ba_num; i++) { + for (i = 0; i < region_cnt && total <= max_ba_num; i++) { r = ®ions[i]; if (!r->count) continue; @@ -1337,9 +1353,9 @@ static int hem_list_alloc_root_bt(struct hns_roce_dev *hr_dev, region_cnt); if (ret) { for (i = 0; i < region_cnt; i++) - hem_list_free_all(hr_dev, &head.branch[i], false); + hem_list_free_all(hr_dev, &head.branch[i]); - hem_list_free_all(hr_dev, &head.root, true); + hem_list_free_all(hr_dev, &head.root); } return ret; @@ -1402,10 +1418,9 @@ void hns_roce_hem_list_release(struct hns_roce_dev *hr_dev, for (i = 0; i < HNS_ROCE_MAX_BT_REGION; i++) for (j = 0; j < HNS_ROCE_MAX_BT_LEVEL; j++) - hem_list_free_all(hr_dev, &hem_list->mid_bt[i][j], - j != 0); + hem_list_free_all(hr_dev, &hem_list->mid_bt[i][j]); - hem_list_free_all(hr_dev, &hem_list->root_bt, true); + hem_list_free_all(hr_dev, &hem_list->root_bt); INIT_LIST_HEAD(&hem_list->btm_bt); hem_list->root_ba = 0; } diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index bf30b3a65a9ba..55b9283bfc6f0 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -814,11 +814,6 @@ int hns_roce_mtr_map(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, for (i = 0, mapped_cnt = 0; i < mtr->hem_cfg.region_count && mapped_cnt < page_cnt; i++) { r = &mtr->hem_cfg.region[i]; - /* if hopnum is 0, no need to map pages in this region */ - if (!r->hopnum) { - mapped_cnt += r->count; - continue; - } if (r->offset + r->count > page_cnt) { ret = -EINVAL; From 0572eccf239ce4bd89bd531767ec5ab20e249290 Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Fri, 20 Dec 2024 13:52:47 +0800 Subject: [PATCH 0927/1450] RDMA/hns: Fix accessing invalid dip_ctx during destroying QP If it fails to modify QP to RTR, dip_ctx will not be attached. And during detroying QP, the invalid dip_ctx pointer will be accessed. Fixes: faa62440a577 ("RDMA/hns: Fix different dgids mapping to the same dip_idx") Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20241220055249.146943-3-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 697b17cca02e7..6dddadb90e026 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -5619,6 +5619,9 @@ static void put_dip_ctx_idx(struct hns_roce_dev *hr_dev, { struct hns_roce_dip *hr_dip = hr_qp->dip; + if (!hr_dip) + return; + xa_lock(&hr_dev->qp_table.dip_xa); hr_dip->qp_cnt--; From fa5c4ba8cdbfd2c2d6422e001311c8213283ebbf Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Fri, 20 Dec 2024 13:52:48 +0800 Subject: [PATCH 0928/1450] RDMA/hns: Fix warning storm caused by invalid input in IO path WARN_ON() is called in the IO path. And it could lead to a warning storm. Use WARN_ON_ONCE() instead of WARN_ON(). Fixes: 12542f1de179 ("RDMA/hns: Refactor process about opcode in post_send()") Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20241220055249.146943-4-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 6dddadb90e026..d0469d27c63cb 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -468,7 +468,7 @@ static inline int set_ud_wqe(struct hns_roce_qp *qp, valid_num_sge = calc_wr_sge_num(wr, &msg_len); ret = set_ud_opcode(ud_sq_wqe, wr); - if (WARN_ON(ret)) + if (WARN_ON_ONCE(ret)) return ret; ud_sq_wqe->msg_len = cpu_to_le32(msg_len); @@ -572,7 +572,7 @@ static inline int set_rc_wqe(struct hns_roce_qp *qp, rc_sq_wqe->msg_len = cpu_to_le32(msg_len); ret = set_rc_opcode(hr_dev, rc_sq_wqe, wr); - if (WARN_ON(ret)) + if (WARN_ON_ONCE(ret)) return ret; hr_reg_write(rc_sq_wqe, RC_SEND_WQE_SO, From e3debdd48423d3d75b9d366399228d7225d902cd Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Fri, 20 Dec 2024 13:52:49 +0800 Subject: [PATCH 0929/1450] RDMA/hns: Fix missing flush CQE for DWQE Flush CQE handler has not been called if QP state gets into errored mode in DWQE path. So, the new added outstanding WQEs will never be flushed. It leads to a hung task timeout when using NFS over RDMA: __switch_to+0x7c/0xd0 __schedule+0x350/0x750 schedule+0x50/0xf0 schedule_timeout+0x2c8/0x340 wait_for_common+0xf4/0x2b0 wait_for_completion+0x20/0x40 __ib_drain_sq+0x140/0x1d0 [ib_core] ib_drain_sq+0x98/0xb0 [ib_core] rpcrdma_xprt_disconnect+0x68/0x270 [rpcrdma] xprt_rdma_close+0x20/0x60 [rpcrdma] xprt_autoclose+0x64/0x1cc [sunrpc] process_one_work+0x1d8/0x4e0 worker_thread+0x154/0x420 kthread+0x108/0x150 ret_from_fork+0x10/0x18 Fixes: 01584a5edcc4 ("RDMA/hns: Add support of direct wqe") Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20241220055249.146943-5-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index d0469d27c63cb..0144e7210d05a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -670,6 +670,10 @@ static void write_dwqe(struct hns_roce_dev *hr_dev, struct hns_roce_qp *qp, #define HNS_ROCE_SL_SHIFT 2 struct hns_roce_v2_rc_send_wqe *rc_sq_wqe = wqe; + if (unlikely(qp->state == IB_QPS_ERR)) { + flush_cqe(hr_dev, qp); + return; + } /* All kinds of DirectWQE have the same header field layout */ hr_reg_enable(rc_sq_wqe, RC_SEND_WQE_FLAG); hr_reg_write(rc_sq_wqe, RC_SEND_WQE_DB_SL_L, qp->sl); From d685d55dfc86b1a4bdcec77c3c1f8a83f181264e Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Wed, 11 Dec 2024 09:10:55 +0900 Subject: [PATCH 0930/1450] tracing/kprobe: Make trace_kprobe's module callback called after jump_label update Make sure the trace_kprobe's module notifer callback function is called after jump_label's callback is called. Since the trace_kprobe's callback eventually checks jump_label address during registering new kprobe on the loading module, jump_label must be updated before this registration happens. Link: https://lore.kernel.org/all/173387585556.995044.3157941002975446119.stgit@devnote2/ Fixes: 614243181050 ("tracing/kprobes: Support module init function probing") Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_kprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 263fac44d3ca3..935a886af40c9 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -725,7 +725,7 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, static struct notifier_block trace_kprobe_module_nb = { .notifier_call = trace_kprobe_module_callback, - .priority = 1 /* Invoked after kprobe module callback */ + .priority = 2 /* Invoked after kprobe and jump_label module callback */ }; static int trace_kprobe_register_module_notifier(void) { From a53da2fb25a31f4fb8eaeb93c7b1134fc14fd209 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Fri, 13 Dec 2024 09:28:33 -0800 Subject: [PATCH 0931/1450] drm/xe: Revert some changes that break a mesa debug tool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a mesa debug tool for decoding devcoredump files. Recent changes to improve the devcoredump output broke that tool. So revert the changes until the tool can be extended to support the new fields. Signed-off-by: John Harrison Fixes: c28fd6c358db ("drm/xe/devcoredump: Improve section headings and add tile info") Fixes: ec1455ce7e35 ("drm/xe/devcoredump: Add ASCII85 dump helper function") Cc: John Harrison Cc: Julia Filipchuk Cc: Lucas De Marchi Cc: Thomas Hellström Cc: Rodrigo Vivi Cc: intel-xe@lists.freedesktop.org Reviewed-by: Jonathan Cavitt Reviewed-by: José Roberto de Souza Link: https://patchwork.freedesktop.org/patch/msgid/20241213172833.1733376-1-John.C.Harrison@Intel.com Signed-off-by: Rodrigo Vivi (cherry picked from commit 70fb86a85dc9fd66014d7eb2fe356f50702ceeb6) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_devcoredump.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c index f8947e7e917ec..21a50d539426b 100644 --- a/drivers/gpu/drm/xe/xe_devcoredump.c +++ b/drivers/gpu/drm/xe/xe_devcoredump.c @@ -109,7 +109,11 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count, drm_puts(&p, "\n**** GuC CT ****\n"); xe_guc_ct_snapshot_print(ss->guc.ct, &p); - drm_puts(&p, "\n**** Contexts ****\n"); + /* + * Don't add a new section header here because the mesa debug decoder + * tool expects the context information to be in the 'GuC CT' section. + */ + /* drm_puts(&p, "\n**** Contexts ****\n"); */ xe_guc_exec_queue_snapshot_print(ss->ge, &p); drm_puts(&p, "\n**** Job ****\n"); @@ -363,6 +367,15 @@ void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, char buff[ASCII85_BUFSZ], *line_buff; size_t line_pos = 0; + /* + * Splitting blobs across multiple lines is not compatible with the mesa + * debug decoder tool. Note that even dropping the explicit '\n' below + * doesn't help because the GuC log is so big some underlying implementation + * still splits the lines at 512K characters. So just bail completely for + * the moment. + */ + return; + #define DMESG_MAX_LINE_LEN 800 #define MIN_SPACE (ASCII85_BUFSZ + 2) /* 85 + "\n\0" */ From 528cef1b4170f328d28d4e9b437380d8e5a2d18f Mon Sep 17 00:00:00 2001 From: Nirmoy Das Date: Fri, 13 Dec 2024 13:24:14 +0100 Subject: [PATCH 0932/1450] drm/xe: Use non-interruptible wait when moving BO to system MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ensure a non-interruptible wait is used when moving a bo to XE_PL_SYSTEM. This prevents dma_mappings from being removed prematurely while a GPU job is still in progress, even if the CPU receives a signal during the operation. Fixes: 75521e8b56e8 ("drm/xe: Perform dma_map when moving system buffer objects to TT") Cc: Thomas Hellström Cc: Matthew Brost Cc: Lucas De Marchi Cc: stable@vger.kernel.org # v6.11+ Suggested-by: Matthew Auld Reviewed-by: Matthew Auld Reviewed-by: Thomas Hellström Link: https://patchwork.freedesktop.org/patch/msgid/20241213122415.3880017-1-nirmoy.das@intel.com Signed-off-by: Nirmoy Das (cherry picked from commit dc5e20ae1f8a7c354dc9833faa2720254e5a5443) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_bo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c index ae6b337cdc54d..1aec4133008e9 100644 --- a/drivers/gpu/drm/xe/xe_bo.c +++ b/drivers/gpu/drm/xe/xe_bo.c @@ -724,7 +724,7 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict, new_mem->mem_type == XE_PL_SYSTEM) { long timeout = dma_resv_wait_timeout(ttm_bo->base.resv, DMA_RESV_USAGE_BOOKKEEP, - true, + false, MAX_SCHEDULE_TIMEOUT); if (timeout < 0) { ret = timeout; From 5e0a67fdb894d34c5f109e969320eef9ddae7480 Mon Sep 17 00:00:00 2001 From: Nirmoy Das Date: Fri, 13 Dec 2024 13:24:15 +0100 Subject: [PATCH 0933/1450] drm/xe: Wait for migration job before unmapping pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a potential GPU page fault during tt -> system moves by waiting for migration jobs to complete before unmapping SG. This ensures that IOMMU mappings are not prematurely torn down while a migration job is still in progress. v2: Use intr=false(Matt A) v3: Update commit message(Matt A) v4: s/DMA_RESV_USAGE_BOOKKEEP/DMA_RESV_USAGE_KERNEL(Thomas) Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/3466 Fixes: 75521e8b56e8 ("drm/xe: Perform dma_map when moving system buffer objects to TT") Cc: Thomas Hellström Cc: Matthew Brost Cc: Lucas De Marchi Cc: stable@vger.kernel.org # v6.11+ Cc: Matthew Auld Reviewed-by: Matthew Auld Reviewed-by: Thomas Hellström Link: https://patchwork.freedesktop.org/patch/msgid/20241213122415.3880017-2-nirmoy.das@intel.com Signed-off-by: Nirmoy Das (cherry picked from commit cda06412c06893a6f07a2fbf89d42a0972ec9e8e) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_bo.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c index 1aec4133008e9..f61a8ef380944 100644 --- a/drivers/gpu/drm/xe/xe_bo.c +++ b/drivers/gpu/drm/xe/xe_bo.c @@ -848,8 +848,16 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict, out: if ((!ttm_bo->resource || ttm_bo->resource->mem_type == XE_PL_SYSTEM) && - ttm_bo->ttm) + ttm_bo->ttm) { + long timeout = dma_resv_wait_timeout(ttm_bo->base.resv, + DMA_RESV_USAGE_KERNEL, + false, + MAX_SCHEDULE_TIMEOUT); + if (timeout < 0) + ret = timeout; + xe_tt_unmap_sg(ttm_bo->ttm); + } return ret; } From af12ba67d09ebe2b31ab997cea1a930864028562 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Mon, 16 Dec 2024 23:32:53 +0100 Subject: [PATCH 0934/1450] drm/xe/pf: Use correct function to check LMEM provisioning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a typo in function call and instead of VF LMEM we were looking at VF GGTT provisioning. Fix that. Fixes: 234670cea9a2 ("drm/xe/pf: Skip fair VFs provisioning if already provisioned") Signed-off-by: Michal Wajdeczko Cc: Piotr Piórkowski Reviewed-by: Himal Prasad Ghimiray Link: https://patchwork.freedesktop.org/patch/msgid/20241216223253.819-1-michal.wajdeczko@intel.com (cherry picked from commit a8d0aa0e7fcd20c9f1992688c0f0d07a68287403) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c index 192643d63d224..ca49860168f6d 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c @@ -2046,7 +2046,7 @@ static int pf_validate_vf_config(struct xe_gt *gt, unsigned int vfid) valid_any = valid_any || (valid_ggtt && is_primary); if (IS_DGFX(xe)) { - bool valid_lmem = pf_get_vf_config_ggtt(primary_gt, vfid); + bool valid_lmem = pf_get_vf_config_lmem(primary_gt, vfid); valid_any = valid_any || (valid_lmem && is_primary); valid_all = valid_all && valid_lmem; From fe39b222a4139354d32ff9d46b88757f63f71d63 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Tue, 17 Dec 2024 21:31:21 -0800 Subject: [PATCH 0935/1450] drm/xe: Fix fault on fd close after unbind MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If userspace holds an fd open, unbinds the device and then closes it, the driver shouldn't try to access the hardware. Protect it by using drm_dev_enter()/drm_dev_exit(). This fixes the following page fault: <6> [IGT] xe_wedged: exiting, ret=98 <1> BUG: unable to handle page fault for address: ffffc901bc5e508c <1> #PF: supervisor read access in kernel mode <1> #PF: error_code(0x0000) - not-present page ... <4> xe_lrc_update_timestamp+0x1c/0xd0 [xe] <4> xe_exec_queue_update_run_ticks+0x50/0xb0 [xe] <4> xe_exec_queue_fini+0x16/0xb0 [xe] <4> __guc_exec_queue_fini_async+0xc4/0x190 [xe] <4> guc_exec_queue_fini_async+0xa0/0xe0 [xe] <4> guc_exec_queue_fini+0x23/0x40 [xe] <4> xe_exec_queue_destroy+0xb3/0xf0 [xe] <4> xe_file_close+0xd4/0x1a0 [xe] <4> drm_file_free+0x210/0x280 [drm] <4> drm_close_helper.isra.0+0x6d/0x80 [drm] <4> drm_release_noglobal+0x20/0x90 [drm] Fixes: 514447a12190 ("drm/xe: Stop accumulating LRC timestamp on job_free") Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/3421 Reviewed-by: Umesh Nerlige Ramappa Link: https://patchwork.freedesktop.org/patch/msgid/20241218053122.2730195-1-lucas.demarchi@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 4ca1fd418338d4d135428a0eb1e16e3b3ce17ee8) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_exec_queue.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c index fd0f3b3c9101d..268cd3123be9d 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue.c +++ b/drivers/gpu/drm/xe/xe_exec_queue.c @@ -8,6 +8,7 @@ #include #include +#include #include #include @@ -762,9 +763,11 @@ bool xe_exec_queue_is_idle(struct xe_exec_queue *q) */ void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q) { + struct xe_device *xe = gt_to_xe(q->gt); struct xe_file *xef; struct xe_lrc *lrc; u32 old_ts, new_ts; + int idx; /* * Jobs that are run during driver load may use an exec_queue, but are @@ -774,6 +777,10 @@ void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q) if (!q->vm || !q->vm->xef) return; + /* Synchronize with unbind while holding the xe file open */ + if (!drm_dev_enter(&xe->drm, &idx)) + return; + xef = q->vm->xef; /* @@ -787,6 +794,8 @@ void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q) lrc = q->lrc[0]; new_ts = xe_lrc_update_timestamp(lrc, &old_ts); xef->run_ticks[q->class] += (new_ts - old_ts) * q->width; + + drm_dev_exit(idx); } /** From a072ffd896efa6a6c8a0334c712fbc98a63c789c Mon Sep 17 00:00:00 2001 From: Mohsin Bashir Date: Wed, 18 Dec 2024 15:25:58 -0800 Subject: [PATCH 0936/1450] eth: fbnic: fix csr boundary for RPM RAM section MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CSR dump support leverages the FBNIC_BOUNDS macro, which pads the end condition for each section by adding an offset of 1. However, the RPC RAM section, which is dumped differently from other sections, does not rely on this macro and instead directly uses end boundary address. Hence, subtracting 1 from the end address results in skipping a register. Fixes 3d12862b216d (“eth: fbnic: Add support to dump registers”) Signed-off-by: Mohsin Bashir Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/20241218232614.439329-1-mohsin.bashr@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/meta/fbnic/fbnic_csr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_csr.c b/drivers/net/ethernet/meta/fbnic/fbnic_csr.c index 2118901b25e91..aeb9f333f4c7a 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_csr.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_csr.c @@ -64,7 +64,7 @@ static void fbnic_csr_get_regs_rpc_ram(struct fbnic_dev *fbd, u32 **data_p) u32 i, j; *(data++) = start; - *(data++) = end - 1; + *(data++) = end; /* FBNIC_RPC_TCAM_ACT */ for (i = 0; i < FBNIC_RPC_TCAM_ACT_NUM_ENTRIES; i++) { From 2b6ffcd7873b7e8a62c3e15a6f305bfc747c466b Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Thu, 19 Dec 2024 11:41:19 +0900 Subject: [PATCH 0937/1450] net: stmmac: restructure the error path of stmmac_probe_config_dt() Current implementation of stmmac_probe_config_dt() does not release the OF node reference obtained by of_parse_phandle() in some error paths. The problem is that some error paths call stmmac_remove_config_dt() to clean up but others use and unwind ladder. These two types of error handling have not kept in sync and have been a recurring source of bugs. Re-write the error handling in stmmac_probe_config_dt() to use an unwind ladder. Consequently, stmmac_remove_config_dt() is not needed anymore, thus remove it. This bug was found by an experimental verification tool that I am developing. Fixes: 4838a5405028 ("net: stmmac: Fix wrapper drivers not detecting PHY") Signed-off-by: Joe Hattori Link: https://patch.msgid.link/20241219024119.2017012-1-joe@pf.is.s.u-tokyo.ac.jp Signed-off-by: Jakub Kicinski --- .../ethernet/stmicro/stmmac/stmmac_platform.c | 43 ++++++++----------- 1 file changed, 17 insertions(+), 26 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index 3ac32444e4922..dc9884130b912 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -405,22 +405,6 @@ static int stmmac_of_get_mac_mode(struct device_node *np) return -ENODEV; } -/** - * stmmac_remove_config_dt - undo the effects of stmmac_probe_config_dt() - * @pdev: platform_device structure - * @plat: driver data platform structure - * - * Release resources claimed by stmmac_probe_config_dt(). - */ -static void stmmac_remove_config_dt(struct platform_device *pdev, - struct plat_stmmacenet_data *plat) -{ - clk_disable_unprepare(plat->stmmac_clk); - clk_disable_unprepare(plat->pclk); - of_node_put(plat->phy_node); - of_node_put(plat->mdio_node); -} - /** * stmmac_probe_config_dt - parse device-tree driver parameters * @pdev: platform_device structure @@ -490,8 +474,10 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) dev_warn(&pdev->dev, "snps,phy-addr property is deprecated\n"); rc = stmmac_mdio_setup(plat, np, &pdev->dev); - if (rc) - return ERR_PTR(rc); + if (rc) { + ret = ERR_PTR(rc); + goto error_put_phy; + } of_property_read_u32(np, "tx-fifo-depth", &plat->tx_fifo_size); @@ -581,8 +567,8 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) dma_cfg = devm_kzalloc(&pdev->dev, sizeof(*dma_cfg), GFP_KERNEL); if (!dma_cfg) { - stmmac_remove_config_dt(pdev, plat); - return ERR_PTR(-ENOMEM); + ret = ERR_PTR(-ENOMEM); + goto error_put_mdio; } plat->dma_cfg = dma_cfg; @@ -610,8 +596,8 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) rc = stmmac_mtl_setup(pdev, plat); if (rc) { - stmmac_remove_config_dt(pdev, plat); - return ERR_PTR(rc); + ret = ERR_PTR(rc); + goto error_put_mdio; } /* clock setup */ @@ -663,6 +649,10 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) clk_disable_unprepare(plat->pclk); error_pclk_get: clk_disable_unprepare(plat->stmmac_clk); +error_put_mdio: + of_node_put(plat->mdio_node); +error_put_phy: + of_node_put(plat->phy_node); return ret; } @@ -671,16 +661,17 @@ static void devm_stmmac_remove_config_dt(void *data) { struct plat_stmmacenet_data *plat = data; - /* Platform data argument is unused */ - stmmac_remove_config_dt(NULL, plat); + clk_disable_unprepare(plat->stmmac_clk); + clk_disable_unprepare(plat->pclk); + of_node_put(plat->mdio_node); + of_node_put(plat->phy_node); } /** * devm_stmmac_probe_config_dt * @pdev: platform_device structure * @mac: MAC address to use - * Description: Devres variant of stmmac_probe_config_dt(). Does not require - * the user to call stmmac_remove_config_dt() at driver detach. + * Description: Devres variant of stmmac_probe_config_dt(). */ struct plat_stmmacenet_data * devm_stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) From d9a093d2d12aec87ed0a2ac660b3a62261bef966 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Thu, 19 Dec 2024 13:47:52 +0800 Subject: [PATCH 0938/1450] net: enetc: add Tx checksum offload for i.MX95 ENETC In addition to supporting Rx checksum offload, i.MX95 ENETC also supports Tx checksum offload. The transmit checksum offload is implemented through the Tx BD. To support Tx checksum offload, software needs to fill some auxiliary information in Tx BD, such as IP version, IP header offset and size, whether L4 is UDP or TCP, etc. Same as Rx checksum offload, Tx checksum offload capability isn't defined in register, so tx_csum bit is added to struct enetc_drvdata to indicate whether the device supports Tx checksum offload. Signed-off-by: Wei Fang Reviewed-by: Frank Li Reviewed-by: Claudiu Manoil Link: https://patch.msgid.link/20241219054755.1615626-2-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/enetc/enetc.c | 53 ++++++++++++++++--- drivers/net/ethernet/freescale/enetc/enetc.h | 2 + .../net/ethernet/freescale/enetc/enetc_hw.h | 15 ++++-- .../freescale/enetc/enetc_pf_common.c | 3 ++ 4 files changed, 63 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 535969fa0fdbe..88f12c88110ff 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -146,6 +146,27 @@ static int enetc_ptp_parse(struct sk_buff *skb, u8 *udp, return 0; } +static bool enetc_tx_csum_offload_check(struct sk_buff *skb) +{ + switch (skb->csum_offset) { + case offsetof(struct tcphdr, check): + case offsetof(struct udphdr, check): + return true; + default: + return false; + } +} + +static bool enetc_skb_is_ipv6(struct sk_buff *skb) +{ + return vlan_get_protocol(skb) == htons(ETH_P_IPV6); +} + +static bool enetc_skb_is_tcp(struct sk_buff *skb) +{ + return skb->csum_offset == offsetof(struct tcphdr, check); +} + static int enetc_map_tx_buffs(struct enetc_bdr *tx_ring, struct sk_buff *skb) { bool do_vlan, do_onestep_tstamp = false, do_twostep_tstamp = false; @@ -163,6 +184,29 @@ static int enetc_map_tx_buffs(struct enetc_bdr *tx_ring, struct sk_buff *skb) dma_addr_t dma; u8 flags = 0; + enetc_clear_tx_bd(&temp_bd); + if (skb->ip_summed == CHECKSUM_PARTIAL) { + /* Can not support TSD and checksum offload at the same time */ + if (priv->active_offloads & ENETC_F_TXCSUM && + enetc_tx_csum_offload_check(skb) && !tx_ring->tsd_enable) { + temp_bd.l3_aux0 = FIELD_PREP(ENETC_TX_BD_L3_START, + skb_network_offset(skb)); + temp_bd.l3_aux1 = FIELD_PREP(ENETC_TX_BD_L3_HDR_LEN, + skb_network_header_len(skb) / 4); + temp_bd.l3_aux1 |= FIELD_PREP(ENETC_TX_BD_L3T, + enetc_skb_is_ipv6(skb)); + if (enetc_skb_is_tcp(skb)) + temp_bd.l4_aux = FIELD_PREP(ENETC_TX_BD_L4T, + ENETC_TXBD_L4T_TCP); + else + temp_bd.l4_aux = FIELD_PREP(ENETC_TX_BD_L4T, + ENETC_TXBD_L4T_UDP); + flags |= ENETC_TXBD_FLAGS_CSUM_LSO | ENETC_TXBD_FLAGS_L4CS; + } else if (skb_checksum_help(skb)) { + return 0; + } + } + i = tx_ring->next_to_use; txbd = ENETC_TXBD(*tx_ring, i); prefetchw(txbd); @@ -173,7 +217,6 @@ static int enetc_map_tx_buffs(struct enetc_bdr *tx_ring, struct sk_buff *skb) temp_bd.addr = cpu_to_le64(dma); temp_bd.buf_len = cpu_to_le16(len); - temp_bd.lstatus = 0; tx_swbd = &tx_ring->tx_swbd[i]; tx_swbd->dma = dma; @@ -594,7 +637,7 @@ static netdev_tx_t enetc_start_xmit(struct sk_buff *skb, { struct enetc_ndev_priv *priv = netdev_priv(ndev); struct enetc_bdr *tx_ring; - int count, err; + int count; /* Queue one-step Sync packet if already locked */ if (skb->cb[0] & ENETC_F_TX_ONESTEP_SYNC_TSTAMP) { @@ -627,11 +670,6 @@ static netdev_tx_t enetc_start_xmit(struct sk_buff *skb, return NETDEV_TX_BUSY; } - if (skb->ip_summed == CHECKSUM_PARTIAL) { - err = skb_checksum_help(skb); - if (err) - goto drop_packet_err; - } enetc_lock_mdio(); count = enetc_map_tx_buffs(tx_ring, skb); enetc_unlock_mdio(); @@ -3274,6 +3312,7 @@ static const struct enetc_drvdata enetc_pf_data = { static const struct enetc_drvdata enetc4_pf_data = { .sysclk_freq = ENETC_CLK_333M, + .tx_csum = true, .pmac_offset = ENETC4_PMAC_OFFSET, .eth_ops = &enetc4_pf_ethtool_ops, }; diff --git a/drivers/net/ethernet/freescale/enetc/enetc.h b/drivers/net/ethernet/freescale/enetc/enetc.h index 72fa03dbc2ddb..e82eb9a9137c9 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.h +++ b/drivers/net/ethernet/freescale/enetc/enetc.h @@ -234,6 +234,7 @@ enum enetc_errata { struct enetc_drvdata { u32 pmac_offset; /* Only valid for PSI which supports 802.1Qbu */ + u8 tx_csum:1; u64 sysclk_freq; const struct ethtool_ops *eth_ops; }; @@ -341,6 +342,7 @@ enum enetc_active_offloads { ENETC_F_QBV = BIT(9), ENETC_F_QCI = BIT(10), ENETC_F_QBU = BIT(11), + ENETC_F_TXCSUM = BIT(12), }; enum enetc_flags_bit { diff --git a/drivers/net/ethernet/freescale/enetc/enetc_hw.h b/drivers/net/ethernet/freescale/enetc/enetc_hw.h index 55ba949230ffd..0e259baf36ee6 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_hw.h +++ b/drivers/net/ethernet/freescale/enetc/enetc_hw.h @@ -558,7 +558,16 @@ union enetc_tx_bd { __le16 frm_len; union { struct { - u8 reserved[3]; + u8 l3_aux0; +#define ENETC_TX_BD_L3_START GENMASK(6, 0) +#define ENETC_TX_BD_IPCS BIT(7) + u8 l3_aux1; +#define ENETC_TX_BD_L3_HDR_LEN GENMASK(6, 0) +#define ENETC_TX_BD_L3T BIT(7) + u8 l4_aux; +#define ENETC_TX_BD_L4T GENMASK(7, 5) +#define ENETC_TXBD_L4T_UDP 1 +#define ENETC_TXBD_L4T_TCP 2 u8 flags; }; /* default layout */ __le32 txstart; @@ -582,10 +591,10 @@ union enetc_tx_bd { }; enum enetc_txbd_flags { - ENETC_TXBD_FLAGS_RES0 = BIT(0), /* reserved */ + ENETC_TXBD_FLAGS_L4CS = BIT(0), /* For ENETC 4.1 and later */ ENETC_TXBD_FLAGS_TSE = BIT(1), ENETC_TXBD_FLAGS_W = BIT(2), - ENETC_TXBD_FLAGS_RES3 = BIT(3), /* reserved */ + ENETC_TXBD_FLAGS_CSUM_LSO = BIT(3), /* For ENETC 4.1 and later */ ENETC_TXBD_FLAGS_TXSTART = BIT(4), ENETC_TXBD_FLAGS_EX = BIT(6), ENETC_TXBD_FLAGS_F = BIT(7) diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf_common.c b/drivers/net/ethernet/freescale/enetc/enetc_pf_common.c index 0eecfc833164f..09f2d7ec44eb2 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_pf_common.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_pf_common.c @@ -119,6 +119,9 @@ void enetc_pf_netdev_setup(struct enetc_si *si, struct net_device *ndev, ndev->priv_flags |= IFF_UNICAST_FLT; + if (si->drvdata->tx_csum) + priv->active_offloads |= ENETC_F_TXCSUM; + /* TODO: currently, i.MX95 ENETC driver does not support advanced features */ if (!is_enetc_rev1(si)) { ndev->hw_features &= ~(NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_LOOPBACK); From 93c5d5a0ddf8ad39661a087edb45286a0a55f7e6 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Thu, 19 Dec 2024 13:47:53 +0800 Subject: [PATCH 0939/1450] net: enetc: update max chained Tx BD number for i.MX95 ENETC The max chained Tx BDs of latest ENETC (i.MX95 ENETC, rev 4.1) has been increased to 63, but since the range of MAX_SKB_FRAGS is 17~45, so for i.MX95 ENETC and later revision, it is better to set ENETC4_MAX_SKB_FRAGS to MAX_SKB_FRAGS. In addition, add max_frags in struct enetc_drvdata to indicate the max chained BDs supported by device. Because the max number of chained BDs supported by LS1028A and i.MX95 ENETC is different. Signed-off-by: Wei Fang Reviewed-by: Frank Li Reviewed-by: Claudiu Manoil Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241219054755.1615626-3-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/enetc/enetc.c | 13 +++++++++---- drivers/net/ethernet/freescale/enetc/enetc.h | 13 +++++++++++-- .../net/ethernet/freescale/enetc/enetc_pf_common.c | 1 + drivers/net/ethernet/freescale/enetc/enetc_vf.c | 1 + 4 files changed, 22 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 88f12c88110ff..76c33506991b2 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -534,6 +534,7 @@ static void enetc_tso_complete_csum(struct enetc_bdr *tx_ring, struct tso_t *tso static int enetc_map_tx_tso_buffs(struct enetc_bdr *tx_ring, struct sk_buff *skb) { + struct enetc_ndev_priv *priv = netdev_priv(tx_ring->ndev); int hdr_len, total_len, data_len; struct enetc_tx_swbd *tx_swbd; union enetc_tx_bd *txbd; @@ -599,7 +600,7 @@ static int enetc_map_tx_tso_buffs(struct enetc_bdr *tx_ring, struct sk_buff *skb bd_data_num++; tso_build_data(skb, &tso, size); - if (unlikely(bd_data_num >= ENETC_MAX_SKB_FRAGS && data_len)) + if (unlikely(bd_data_num >= priv->max_frags && data_len)) goto err_chained_bd; } @@ -660,7 +661,7 @@ static netdev_tx_t enetc_start_xmit(struct sk_buff *skb, count = enetc_map_tx_tso_buffs(tx_ring, skb); enetc_unlock_mdio(); } else { - if (unlikely(skb_shinfo(skb)->nr_frags > ENETC_MAX_SKB_FRAGS)) + if (unlikely(skb_shinfo(skb)->nr_frags > priv->max_frags)) if (unlikely(skb_linearize(skb))) goto drop_packet_err; @@ -678,7 +679,7 @@ static netdev_tx_t enetc_start_xmit(struct sk_buff *skb, if (unlikely(!count)) goto drop_packet_err; - if (enetc_bd_unused(tx_ring) < ENETC_TXBDS_MAX_NEEDED) + if (enetc_bd_unused(tx_ring) < ENETC_TXBDS_MAX_NEEDED(priv->max_frags)) netif_stop_subqueue(ndev, tx_ring->index); return NETDEV_TX_OK; @@ -946,7 +947,8 @@ static bool enetc_clean_tx_ring(struct enetc_bdr *tx_ring, int napi_budget) if (unlikely(tx_frm_cnt && netif_carrier_ok(ndev) && __netif_subqueue_stopped(ndev, tx_ring->index) && !test_bit(ENETC_TX_DOWN, &priv->flags) && - (enetc_bd_unused(tx_ring) >= ENETC_TXBDS_MAX_NEEDED))) { + (enetc_bd_unused(tx_ring) >= + ENETC_TXBDS_MAX_NEEDED(priv->max_frags)))) { netif_wake_subqueue(ndev, tx_ring->index); } @@ -3307,18 +3309,21 @@ EXPORT_SYMBOL_GPL(enetc_pci_remove); static const struct enetc_drvdata enetc_pf_data = { .sysclk_freq = ENETC_CLK_400M, .pmac_offset = ENETC_PMAC_OFFSET, + .max_frags = ENETC_MAX_SKB_FRAGS, .eth_ops = &enetc_pf_ethtool_ops, }; static const struct enetc_drvdata enetc4_pf_data = { .sysclk_freq = ENETC_CLK_333M, .tx_csum = true, + .max_frags = ENETC4_MAX_SKB_FRAGS, .pmac_offset = ENETC4_PMAC_OFFSET, .eth_ops = &enetc4_pf_ethtool_ops, }; static const struct enetc_drvdata enetc_vf_data = { .sysclk_freq = ENETC_CLK_400M, + .max_frags = ENETC_MAX_SKB_FRAGS, .eth_ops = &enetc_vf_ethtool_ops, }; diff --git a/drivers/net/ethernet/freescale/enetc/enetc.h b/drivers/net/ethernet/freescale/enetc/enetc.h index e82eb9a9137c9..1e680f0f51236 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.h +++ b/drivers/net/ethernet/freescale/enetc/enetc.h @@ -59,9 +59,16 @@ struct enetc_rx_swbd { /* ENETC overhead: optional extension BD + 1 BD gap */ #define ENETC_TXBDS_NEEDED(val) ((val) + 2) -/* max # of chained Tx BDs is 15, including head and extension BD */ +/* For LS1028A, max # of chained Tx BDs is 15, including head and + * extension BD. + */ #define ENETC_MAX_SKB_FRAGS 13 -#define ENETC_TXBDS_MAX_NEEDED ENETC_TXBDS_NEEDED(ENETC_MAX_SKB_FRAGS + 1) +/* For ENETC v4 and later versions, max # of chained Tx BDs is 63, + * including head and extension BD, but the range of MAX_SKB_FRAGS + * is 17 ~ 45, so set ENETC4_MAX_SKB_FRAGS to MAX_SKB_FRAGS. + */ +#define ENETC4_MAX_SKB_FRAGS MAX_SKB_FRAGS +#define ENETC_TXBDS_MAX_NEEDED(x) ENETC_TXBDS_NEEDED((x) + 1) struct enetc_ring_stats { unsigned int packets; @@ -235,6 +242,7 @@ enum enetc_errata { struct enetc_drvdata { u32 pmac_offset; /* Only valid for PSI which supports 802.1Qbu */ u8 tx_csum:1; + u8 max_frags; u64 sysclk_freq; const struct ethtool_ops *eth_ops; }; @@ -377,6 +385,7 @@ struct enetc_ndev_priv { u16 msg_enable; u8 preemptible_tcs; + u8 max_frags; /* The maximum number of BDs for fragments */ enum enetc_active_offloads active_offloads; diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf_common.c b/drivers/net/ethernet/freescale/enetc/enetc_pf_common.c index 09f2d7ec44eb2..00b73a9487466 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_pf_common.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_pf_common.c @@ -101,6 +101,7 @@ void enetc_pf_netdev_setup(struct enetc_si *si, struct net_device *ndev, priv->msg_enable = (NETIF_MSG_WOL << 1) - 1; priv->sysclk_freq = si->drvdata->sysclk_freq; + priv->max_frags = si->drvdata->max_frags; ndev->netdev_ops = ndev_ops; enetc_set_ethtool_ops(ndev); ndev->watchdog_timeo = 5 * HZ; diff --git a/drivers/net/ethernet/freescale/enetc/enetc_vf.c b/drivers/net/ethernet/freescale/enetc/enetc_vf.c index a5f8ce576b6e8..63d78b2b86709 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_vf.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_vf.c @@ -136,6 +136,7 @@ static void enetc_vf_netdev_setup(struct enetc_si *si, struct net_device *ndev, priv->msg_enable = (NETIF_MSG_IFUP << 1) - 1; priv->sysclk_freq = si->drvdata->sysclk_freq; + priv->max_frags = si->drvdata->max_frags; ndev->netdev_ops = ndev_ops; enetc_set_ethtool_ops(ndev); ndev->watchdog_timeo = 5 * HZ; From 69797ff888d3dbab035a0d2516b554285b094e3c Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Thu, 19 Dec 2024 13:47:54 +0800 Subject: [PATCH 0940/1450] net: enetc: add LSO support for i.MX95 ENETC PF ENETC rev 4.1 supports large send offload (LSO), segmenting large TCP and UDP transmit units into multiple Ethernet frames. To support LSO, software needs to fill some auxiliary information in Tx BD, such as LSO header length, frame length, LSO maximum segment size, etc. At 1Gbps link rate, TCP segmentation was tested using iperf3, and the CPU performance before and after applying the patch was compared through the top command. It can be seen that LSO saves a significant amount of CPU cycles compared to software TSO. Before applying the patch: %Cpu(s): 0.1 us, 4.1 sy, 0.0 ni, 85.7 id, 0.0 wa, 0.5 hi, 9.7 si After applying the patch: %Cpu(s): 0.1 us, 2.3 sy, 0.0 ni, 94.5 id, 0.0 wa, 0.4 hi, 2.6 si Signed-off-by: Wei Fang Reviewed-by: Frank Li Reviewed-by: Claudiu Manoil Link: https://patch.msgid.link/20241219054755.1615626-4-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/enetc/enetc.c | 264 +++++++++++++++++- drivers/net/ethernet/freescale/enetc/enetc.h | 14 + .../net/ethernet/freescale/enetc/enetc4_hw.h | 23 ++ .../net/ethernet/freescale/enetc/enetc_hw.h | 16 +- .../freescale/enetc/enetc_pf_common.c | 3 + 5 files changed, 310 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 76c33506991b2..6a6fc819dfdee 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -532,6 +532,230 @@ static void enetc_tso_complete_csum(struct enetc_bdr *tx_ring, struct tso_t *tso } } +static int enetc_lso_count_descs(const struct sk_buff *skb) +{ + /* 4 BDs: 1 BD for LSO header + 1 BD for extended BD + 1 BD + * for linear area data but not include LSO header, namely + * skb_headlen(skb) - lso_hdr_len (it may be 0, but that's + * okay, we only need to consider the worst case). And 1 BD + * for gap. + */ + return skb_shinfo(skb)->nr_frags + 4; +} + +static int enetc_lso_get_hdr_len(const struct sk_buff *skb) +{ + int hdr_len, tlen; + + tlen = skb_is_gso_tcp(skb) ? tcp_hdrlen(skb) : sizeof(struct udphdr); + hdr_len = skb_transport_offset(skb) + tlen; + + return hdr_len; +} + +static void enetc_lso_start(struct sk_buff *skb, struct enetc_lso_t *lso) +{ + lso->lso_seg_size = skb_shinfo(skb)->gso_size; + lso->ipv6 = enetc_skb_is_ipv6(skb); + lso->tcp = skb_is_gso_tcp(skb); + lso->l3_hdr_len = skb_network_header_len(skb); + lso->l3_start = skb_network_offset(skb); + lso->hdr_len = enetc_lso_get_hdr_len(skb); + lso->total_len = skb->len - lso->hdr_len; +} + +static void enetc_lso_map_hdr(struct enetc_bdr *tx_ring, struct sk_buff *skb, + int *i, struct enetc_lso_t *lso) +{ + union enetc_tx_bd txbd_tmp, *txbd; + struct enetc_tx_swbd *tx_swbd; + u16 frm_len, frm_len_ext; + u8 flags, e_flags = 0; + dma_addr_t addr; + char *hdr; + + /* Get the first BD of the LSO BDs chain */ + txbd = ENETC_TXBD(*tx_ring, *i); + tx_swbd = &tx_ring->tx_swbd[*i]; + prefetchw(txbd); + + /* Prepare LSO header: MAC + IP + TCP/UDP */ + hdr = tx_ring->tso_headers + *i * TSO_HEADER_SIZE; + memcpy(hdr, skb->data, lso->hdr_len); + addr = tx_ring->tso_headers_dma + *i * TSO_HEADER_SIZE; + + /* {frm_len_ext, frm_len} indicates the total length of + * large transmit data unit. frm_len contains the 16 least + * significant bits and frm_len_ext contains the 4 most + * significant bits. + */ + frm_len = lso->total_len & 0xffff; + frm_len_ext = (lso->total_len >> 16) & 0xf; + + /* Set the flags of the first BD */ + flags = ENETC_TXBD_FLAGS_EX | ENETC_TXBD_FLAGS_CSUM_LSO | + ENETC_TXBD_FLAGS_LSO | ENETC_TXBD_FLAGS_L4CS; + + enetc_clear_tx_bd(&txbd_tmp); + txbd_tmp.addr = cpu_to_le64(addr); + txbd_tmp.hdr_len = cpu_to_le16(lso->hdr_len); + + /* first BD needs frm_len and offload flags set */ + txbd_tmp.frm_len = cpu_to_le16(frm_len); + txbd_tmp.flags = flags; + + txbd_tmp.l3_aux0 = FIELD_PREP(ENETC_TX_BD_L3_START, lso->l3_start); + /* l3_hdr_size in 32-bits (4 bytes) */ + txbd_tmp.l3_aux1 = FIELD_PREP(ENETC_TX_BD_L3_HDR_LEN, + lso->l3_hdr_len / 4); + if (lso->ipv6) + txbd_tmp.l3_aux1 |= ENETC_TX_BD_L3T; + else + txbd_tmp.l3_aux0 |= ENETC_TX_BD_IPCS; + + txbd_tmp.l4_aux = FIELD_PREP(ENETC_TX_BD_L4T, lso->tcp ? + ENETC_TXBD_L4T_TCP : ENETC_TXBD_L4T_UDP); + + /* For the LSO header we do not set the dma address since + * we do not want it unmapped when we do cleanup. We still + * set len so that we count the bytes sent. + */ + tx_swbd->len = lso->hdr_len; + tx_swbd->do_twostep_tstamp = false; + tx_swbd->check_wb = false; + + /* Actually write the header in the BD */ + *txbd = txbd_tmp; + + /* Get the next BD, and the next BD is extended BD */ + enetc_bdr_idx_inc(tx_ring, i); + txbd = ENETC_TXBD(*tx_ring, *i); + tx_swbd = &tx_ring->tx_swbd[*i]; + prefetchw(txbd); + + enetc_clear_tx_bd(&txbd_tmp); + if (skb_vlan_tag_present(skb)) { + /* Setup the VLAN fields */ + txbd_tmp.ext.vid = cpu_to_le16(skb_vlan_tag_get(skb)); + txbd_tmp.ext.tpid = ENETC_TPID_8021Q; + e_flags = ENETC_TXBD_E_FLAGS_VLAN_INS; + } + + /* Write the BD */ + txbd_tmp.ext.e_flags = e_flags; + txbd_tmp.ext.lso_sg_size = cpu_to_le16(lso->lso_seg_size); + txbd_tmp.ext.frm_len_ext = cpu_to_le16(frm_len_ext); + *txbd = txbd_tmp; +} + +static int enetc_lso_map_data(struct enetc_bdr *tx_ring, struct sk_buff *skb, + int *i, struct enetc_lso_t *lso, int *count) +{ + union enetc_tx_bd txbd_tmp, *txbd = NULL; + struct enetc_tx_swbd *tx_swbd; + skb_frag_t *frag; + dma_addr_t dma; + u8 flags = 0; + int len, f; + + len = skb_headlen(skb) - lso->hdr_len; + if (len > 0) { + dma = dma_map_single(tx_ring->dev, skb->data + lso->hdr_len, + len, DMA_TO_DEVICE); + if (dma_mapping_error(tx_ring->dev, dma)) + return -ENOMEM; + + enetc_bdr_idx_inc(tx_ring, i); + txbd = ENETC_TXBD(*tx_ring, *i); + tx_swbd = &tx_ring->tx_swbd[*i]; + prefetchw(txbd); + *count += 1; + + enetc_clear_tx_bd(&txbd_tmp); + txbd_tmp.addr = cpu_to_le64(dma); + txbd_tmp.buf_len = cpu_to_le16(len); + + tx_swbd->dma = dma; + tx_swbd->len = len; + tx_swbd->is_dma_page = 0; + tx_swbd->dir = DMA_TO_DEVICE; + } + + frag = &skb_shinfo(skb)->frags[0]; + for (f = 0; f < skb_shinfo(skb)->nr_frags; f++, frag++) { + if (txbd) + *txbd = txbd_tmp; + + len = skb_frag_size(frag); + dma = skb_frag_dma_map(tx_ring->dev, frag); + if (dma_mapping_error(tx_ring->dev, dma)) + return -ENOMEM; + + /* Get the next BD */ + enetc_bdr_idx_inc(tx_ring, i); + txbd = ENETC_TXBD(*tx_ring, *i); + tx_swbd = &tx_ring->tx_swbd[*i]; + prefetchw(txbd); + *count += 1; + + enetc_clear_tx_bd(&txbd_tmp); + txbd_tmp.addr = cpu_to_le64(dma); + txbd_tmp.buf_len = cpu_to_le16(len); + + tx_swbd->dma = dma; + tx_swbd->len = len; + tx_swbd->is_dma_page = 1; + tx_swbd->dir = DMA_TO_DEVICE; + } + + /* Last BD needs 'F' bit set */ + flags |= ENETC_TXBD_FLAGS_F; + txbd_tmp.flags = flags; + *txbd = txbd_tmp; + + tx_swbd->is_eof = 1; + tx_swbd->skb = skb; + + return 0; +} + +static int enetc_lso_hw_offload(struct enetc_bdr *tx_ring, struct sk_buff *skb) +{ + struct enetc_tx_swbd *tx_swbd; + struct enetc_lso_t lso = {0}; + int err, i, count = 0; + + /* Initialize the LSO handler */ + enetc_lso_start(skb, &lso); + i = tx_ring->next_to_use; + + enetc_lso_map_hdr(tx_ring, skb, &i, &lso); + /* First BD and an extend BD */ + count += 2; + + err = enetc_lso_map_data(tx_ring, skb, &i, &lso, &count); + if (err) + goto dma_err; + + /* Go to the next BD */ + enetc_bdr_idx_inc(tx_ring, &i); + tx_ring->next_to_use = i; + enetc_update_tx_ring_tail(tx_ring); + + return count; + +dma_err: + do { + tx_swbd = &tx_ring->tx_swbd[i]; + enetc_free_tx_frame(tx_ring, tx_swbd); + if (i == 0) + i = tx_ring->bd_count; + i--; + } while (--count); + + return 0; +} + static int enetc_map_tx_tso_buffs(struct enetc_bdr *tx_ring, struct sk_buff *skb) { struct enetc_ndev_priv *priv = netdev_priv(tx_ring->ndev); @@ -652,14 +876,26 @@ static netdev_tx_t enetc_start_xmit(struct sk_buff *skb, tx_ring = priv->tx_ring[skb->queue_mapping]; if (skb_is_gso(skb)) { - if (enetc_bd_unused(tx_ring) < tso_count_descs(skb)) { - netif_stop_subqueue(ndev, tx_ring->index); - return NETDEV_TX_BUSY; - } + /* LSO data unit lengths of up to 256KB are supported */ + if (priv->active_offloads & ENETC_F_LSO && + (skb->len - enetc_lso_get_hdr_len(skb)) <= + ENETC_LSO_MAX_DATA_LEN) { + if (enetc_bd_unused(tx_ring) < enetc_lso_count_descs(skb)) { + netif_stop_subqueue(ndev, tx_ring->index); + return NETDEV_TX_BUSY; + } - enetc_lock_mdio(); - count = enetc_map_tx_tso_buffs(tx_ring, skb); - enetc_unlock_mdio(); + count = enetc_lso_hw_offload(tx_ring, skb); + } else { + if (enetc_bd_unused(tx_ring) < tso_count_descs(skb)) { + netif_stop_subqueue(ndev, tx_ring->index); + return NETDEV_TX_BUSY; + } + + enetc_lock_mdio(); + count = enetc_map_tx_tso_buffs(tx_ring, skb); + enetc_unlock_mdio(); + } } else { if (unlikely(skb_shinfo(skb)->nr_frags > priv->max_frags)) if (unlikely(skb_linearize(skb))) @@ -1799,6 +2035,9 @@ void enetc_get_si_caps(struct enetc_si *si) rss = enetc_rd(hw, ENETC_SIRSSCAPR); si->num_rss = ENETC_SIRSSCAPR_GET_NUM_RSS(rss); } + + if (val & ENETC_SIPCAPR0_LSO) + si->hw_features |= ENETC_SI_F_LSO; } EXPORT_SYMBOL_GPL(enetc_get_si_caps); @@ -2095,6 +2334,14 @@ static int enetc_setup_default_rss_table(struct enetc_si *si, int num_groups) return 0; } +static void enetc_set_lso_flags_mask(struct enetc_hw *hw) +{ + enetc_wr(hw, ENETC4_SILSOSFMR0, + SILSOSFMR0_VAL_SET(ENETC4_TCP_NL_SEG_FLAGS_DMASK, + ENETC4_TCP_NL_SEG_FLAGS_DMASK)); + enetc_wr(hw, ENETC4_SILSOSFMR1, 0); +} + int enetc_configure_si(struct enetc_ndev_priv *priv) { struct enetc_si *si = priv->si; @@ -2108,6 +2355,9 @@ int enetc_configure_si(struct enetc_ndev_priv *priv) /* enable SI */ enetc_wr(hw, ENETC_SIMR, ENETC_SIMR_EN); + if (si->hw_features & ENETC_SI_F_LSO) + enetc_set_lso_flags_mask(hw); + /* TODO: RSS support for i.MX95 will be supported later, and the * is_enetc_rev1() condition will be removed */ diff --git a/drivers/net/ethernet/freescale/enetc/enetc.h b/drivers/net/ethernet/freescale/enetc/enetc.h index 1e680f0f51236..4ad4eb5c5a747 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.h +++ b/drivers/net/ethernet/freescale/enetc/enetc.h @@ -41,6 +41,18 @@ struct enetc_tx_swbd { u8 qbv_en:1; }; +struct enetc_lso_t { + bool ipv6; + bool tcp; + u8 l3_hdr_len; + u8 hdr_len; /* LSO header length */ + u8 l3_start; + u16 lso_seg_size; + int total_len; /* total data length, not include LSO header */ +}; + +#define ENETC_LSO_MAX_DATA_LEN SZ_256K + #define ENETC_RX_MAXFRM_SIZE ENETC_MAC_MAXFRM_SIZE #define ENETC_RXB_TRUESIZE 2048 /* PAGE_SIZE >> 1 */ #define ENETC_RXB_PAD NET_SKB_PAD /* add extra space if needed */ @@ -238,6 +250,7 @@ enum enetc_errata { #define ENETC_SI_F_PSFP BIT(0) #define ENETC_SI_F_QBV BIT(1) #define ENETC_SI_F_QBU BIT(2) +#define ENETC_SI_F_LSO BIT(3) struct enetc_drvdata { u32 pmac_offset; /* Only valid for PSI which supports 802.1Qbu */ @@ -351,6 +364,7 @@ enum enetc_active_offloads { ENETC_F_QCI = BIT(10), ENETC_F_QBU = BIT(11), ENETC_F_TXCSUM = BIT(12), + ENETC_F_LSO = BIT(13), }; enum enetc_flags_bit { diff --git a/drivers/net/ethernet/freescale/enetc/enetc4_hw.h b/drivers/net/ethernet/freescale/enetc/enetc4_hw.h index 26b2206774481..695cb07c74bcf 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc4_hw.h +++ b/drivers/net/ethernet/freescale/enetc/enetc4_hw.h @@ -12,6 +12,29 @@ #define NXP_ENETC_VENDOR_ID 0x1131 #define NXP_ENETC_PF_DEV_ID 0xe101 +/**********************Station interface registers************************/ +/* Station interface LSO segmentation flag mask register 0/1 */ +#define ENETC4_SILSOSFMR0 0x1300 +#define SILSOSFMR0_TCP_MID_SEG GENMASK(27, 16) +#define SILSOSFMR0_TCP_1ST_SEG GENMASK(11, 0) +#define SILSOSFMR0_VAL_SET(first, mid) (FIELD_PREP(SILSOSFMR0_TCP_MID_SEG, mid) | \ + FIELD_PREP(SILSOSFMR0_TCP_1ST_SEG, first)) + +#define ENETC4_SILSOSFMR1 0x1304 +#define SILSOSFMR1_TCP_LAST_SEG GENMASK(11, 0) +#define ENETC4_TCP_FLAGS_FIN BIT(0) +#define ENETC4_TCP_FLAGS_SYN BIT(1) +#define ENETC4_TCP_FLAGS_RST BIT(2) +#define ENETC4_TCP_FLAGS_PSH BIT(3) +#define ENETC4_TCP_FLAGS_ACK BIT(4) +#define ENETC4_TCP_FLAGS_URG BIT(5) +#define ENETC4_TCP_FLAGS_ECE BIT(6) +#define ENETC4_TCP_FLAGS_CWR BIT(7) +#define ENETC4_TCP_FLAGS_NS BIT(8) +/* According to tso_build_hdr(), clear all special flags for not last packet. */ +#define ENETC4_TCP_NL_SEG_FLAGS_DMASK (ENETC4_TCP_FLAGS_FIN | \ + ENETC4_TCP_FLAGS_RST | ENETC4_TCP_FLAGS_PSH) + /***************************ENETC port registers**************************/ #define ENETC4_ECAPR0 0x0 #define ECAPR0_RFS BIT(2) diff --git a/drivers/net/ethernet/freescale/enetc/enetc_hw.h b/drivers/net/ethernet/freescale/enetc/enetc_hw.h index 0e259baf36ee6..4098f01479bc0 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_hw.h +++ b/drivers/net/ethernet/freescale/enetc/enetc_hw.h @@ -25,6 +25,7 @@ #define ENETC_SIPCAPR0 0x20 #define ENETC_SIPCAPR0_RSS BIT(8) #define ENETC_SIPCAPR0_RFS BIT(2) +#define ENETC_SIPCAPR0_LSO BIT(1) #define ENETC_SIPCAPR1 0x24 #define ENETC_SITGTGR 0x30 #define ENETC_SIRBGCR 0x38 @@ -554,7 +555,10 @@ static inline u64 _enetc_rd_reg64_wa(void __iomem *reg) union enetc_tx_bd { struct { __le64 addr; - __le16 buf_len; + union { + __le16 buf_len; + __le16 hdr_len; /* For LSO, ENETC 4.1 and later */ + }; __le16 frm_len; union { struct { @@ -578,13 +582,16 @@ union enetc_tx_bd { __le32 tstamp; __le16 tpid; __le16 vid; - u8 reserved[6]; + __le16 lso_sg_size; /* For ENETC 4.1 and later */ + __le16 frm_len_ext; /* For ENETC 4.1 and later */ + u8 reserved[2]; u8 e_flags; u8 flags; } ext; /* Tx BD extension */ struct { __le32 tstamp; - u8 reserved[10]; + u8 reserved[8]; + __le16 lso_err_count; /* For ENETC 4.1 and later */ u8 status; u8 flags; } wb; /* writeback descriptor */ @@ -593,6 +600,7 @@ union enetc_tx_bd { enum enetc_txbd_flags { ENETC_TXBD_FLAGS_L4CS = BIT(0), /* For ENETC 4.1 and later */ ENETC_TXBD_FLAGS_TSE = BIT(1), + ENETC_TXBD_FLAGS_LSO = BIT(1), /* For ENETC 4.1 and later */ ENETC_TXBD_FLAGS_W = BIT(2), ENETC_TXBD_FLAGS_CSUM_LSO = BIT(3), /* For ENETC 4.1 and later */ ENETC_TXBD_FLAGS_TXSTART = BIT(4), @@ -663,6 +671,8 @@ union enetc_rx_bd { #define ENETC_CBD_FLAGS_SF BIT(7) /* short format */ #define ENETC_CBD_STATUS_MASK 0xf +#define ENETC_TPID_8021Q 0 + struct enetc_cmd_rfse { u8 smac_h[6]; u8 smac_m[6]; diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf_common.c b/drivers/net/ethernet/freescale/enetc/enetc_pf_common.c index 00b73a9487466..31dedc665a165 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_pf_common.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_pf_common.c @@ -123,6 +123,9 @@ void enetc_pf_netdev_setup(struct enetc_si *si, struct net_device *ndev, if (si->drvdata->tx_csum) priv->active_offloads |= ENETC_F_TXCSUM; + if (si->hw_features & ENETC_SI_F_LSO) + priv->active_offloads |= ENETC_F_LSO; + /* TODO: currently, i.MX95 ENETC driver does not support advanced features */ if (!is_enetc_rev1(si)) { ndev->hw_features &= ~(NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_LOOPBACK); From c12e82c053f6f444a6644ae937b037a3272d6c5a Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Thu, 19 Dec 2024 13:47:55 +0800 Subject: [PATCH 0941/1450] net: enetc: add UDP segmentation offload support Set NETIF_F_GSO_UDP_L4 bit of hw_features and features because i.MX95 enetc and LS1028A driver implements UDP segmentation. - i.MX95 ENETC supports UDP segmentation via LSO. - LS1028A ENETC supports UDP segmentation since the commit 3d5b459ba0e3 ("net: tso: add UDP segmentation support"). Signed-off-by: Wei Fang Reviewed-by: Frank Li Link: https://patch.msgid.link/20241219054755.1615626-5-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/enetc/enetc_pf_common.c | 6 ++++-- drivers/net/ethernet/freescale/enetc/enetc_vf.c | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf_common.c b/drivers/net/ethernet/freescale/enetc/enetc_pf_common.c index 31dedc665a165..3fd9b07278757 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_pf_common.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_pf_common.c @@ -110,11 +110,13 @@ void enetc_pf_netdev_setup(struct enetc_si *si, struct net_device *ndev, ndev->hw_features = NETIF_F_SG | NETIF_F_RXCSUM | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_LOOPBACK | - NETIF_F_HW_CSUM | NETIF_F_TSO | NETIF_F_TSO6; + NETIF_F_HW_CSUM | NETIF_F_TSO | NETIF_F_TSO6 | + NETIF_F_GSO_UDP_L4; ndev->features = NETIF_F_HIGHDMA | NETIF_F_SG | NETIF_F_RXCSUM | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | - NETIF_F_HW_CSUM | NETIF_F_TSO | NETIF_F_TSO6; + NETIF_F_HW_CSUM | NETIF_F_TSO | NETIF_F_TSO6 | + NETIF_F_GSO_UDP_L4; ndev->vlan_features = NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_TSO | NETIF_F_TSO6; diff --git a/drivers/net/ethernet/freescale/enetc/enetc_vf.c b/drivers/net/ethernet/freescale/enetc/enetc_vf.c index 63d78b2b86709..3768752b6008b 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_vf.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_vf.c @@ -145,11 +145,13 @@ static void enetc_vf_netdev_setup(struct enetc_si *si, struct net_device *ndev, ndev->hw_features = NETIF_F_SG | NETIF_F_RXCSUM | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | - NETIF_F_HW_CSUM | NETIF_F_TSO | NETIF_F_TSO6; + NETIF_F_HW_CSUM | NETIF_F_TSO | NETIF_F_TSO6 | + NETIF_F_GSO_UDP_L4; ndev->features = NETIF_F_HIGHDMA | NETIF_F_SG | NETIF_F_RXCSUM | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | - NETIF_F_HW_CSUM | NETIF_F_TSO | NETIF_F_TSO6; + NETIF_F_HW_CSUM | NETIF_F_TSO | NETIF_F_TSO6 | + NETIF_F_GSO_UDP_L4; ndev->vlan_features = NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_TSO | NETIF_F_TSO6; From 4f4aa4aa28142d53f8b06585c478476cfe325cfc Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Thu, 19 Dec 2024 15:28:59 +0800 Subject: [PATCH 0942/1450] net: fix memory leak in tcp_conn_request() If inet_csk_reqsk_queue_hash_add() return false, tcp_conn_request() will return without free the dst memory, which allocated in af_ops->route_req. Here is the kmemleak stack: unreferenced object 0xffff8881198631c0 (size 240): comm "softirq", pid 0, jiffies 4299266571 (age 1802.392s) hex dump (first 32 bytes): 00 10 9b 03 81 88 ff ff 80 98 da bc ff ff ff ff ................ 81 55 18 bb ff ff ff ff 00 00 00 00 00 00 00 00 .U.............. backtrace: [] kmem_cache_alloc+0x60c/0xa80 [] dst_alloc+0x55/0x250 [] rt_dst_alloc+0x46/0x1d0 [] __mkroute_output+0x29a/0xa50 [] ip_route_output_key_hash+0x10b/0x240 [] ip_route_output_flow+0x1d/0x90 [] inet_csk_route_req+0x2c5/0x500 [] tcp_conn_request+0x691/0x12c0 [] tcp_rcv_state_process+0x3c8/0x11b0 [] tcp_v4_do_rcv+0x156/0x3b0 [] tcp_v4_rcv+0x1cf8/0x1d80 [] ip_protocol_deliver_rcu+0xf6/0x360 [] ip_local_deliver_finish+0xe6/0x1e0 [] ip_local_deliver+0xee/0x360 [] ip_rcv+0xad/0x2f0 [] __netif_receive_skb_one_core+0x123/0x140 Call dst_release() to free the dst memory when inet_csk_reqsk_queue_hash_add() return false in tcp_conn_request(). Fixes: ff46e3b44219 ("Fix race for duplicate reqsk on identical SYN") Signed-off-by: Wang Liang Link: https://patch.msgid.link/20241219072859.3783576-1-wangliang74@huawei.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5bdf13ac26ef0..4811727b8a022 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -7328,6 +7328,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (unlikely(!inet_csk_reqsk_queue_hash_add(sk, req, req->timeout))) { reqsk_free(req); + dst_release(dst); return 0; } From b5a7b661a073727219fedc35f5619f62418ffe72 Mon Sep 17 00:00:00 2001 From: Xiao Liang Date: Thu, 19 Dec 2024 21:03:36 +0800 Subject: [PATCH 0943/1450] net: Fix netns for ip_tunnel_init_flow() The device denoted by tunnel->parms.link resides in the underlay net namespace. Therefore pass tunnel->net to ip_tunnel_init_flow(). Fixes: db53cd3d88dc ("net: Handle l3mdev in ip_tunnel_init_flow") Signed-off-by: Xiao Liang Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20241219130336.103839-1-shaw.leon@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c | 3 +-- net/ipv4/ip_tunnel.c | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c index 4b5fd71c897dd..32d2e61f2b823 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c @@ -423,8 +423,7 @@ mlxsw_sp_span_gretap4_route(const struct net_device *to_dev, parms = mlxsw_sp_ipip_netdev_parms4(to_dev); ip_tunnel_init_flow(&fl4, parms.iph.protocol, *daddrp, *saddrp, - 0, 0, dev_net(to_dev), parms.link, tun->fwmark, 0, - 0); + 0, 0, tun->net, parms.link, tun->fwmark, 0, 0); rt = ip_route_output_key(tun->net, &fl4); if (IS_ERR(rt)) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 25505f9b724c3..09b73acf037ae 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -294,7 +294,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev) ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr, iph->saddr, tunnel->parms.o_key, - iph->tos & INET_DSCP_MASK, dev_net(dev), + iph->tos & INET_DSCP_MASK, tunnel->net, tunnel->parms.link, tunnel->fwmark, 0, 0); rt = ip_route_output_key(tunnel->net, &fl4); @@ -611,7 +611,7 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, tunnel_id_to_key32(key->tun_id), - tos & INET_DSCP_MASK, dev_net(dev), 0, skb->mark, + tos & INET_DSCP_MASK, tunnel->net, 0, skb->mark, skb_get_hash(skb), key->flow_flags); if (!tunnel_hlen) @@ -774,7 +774,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr, tunnel->parms.o_key, tos & INET_DSCP_MASK, - dev_net(dev), READ_ONCE(tunnel->parms.link), + tunnel->net, READ_ONCE(tunnel->parms.link), tunnel->fwmark, skb_get_hash(skb), 0); if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0) From a4fd163aed2edd967a244499754dec991d8b4c7d Mon Sep 17 00:00:00 2001 From: Ilya Shchipletsov Date: Thu, 19 Dec 2024 08:23:07 +0000 Subject: [PATCH 0944/1450] netrom: check buffer length before accessing it Syzkaller reports an uninit value read from ax25cmp when sending raw message through ieee802154 implementation. ===================================================== BUG: KMSAN: uninit-value in ax25cmp+0x3a5/0x460 net/ax25/ax25_addr.c:119 ax25cmp+0x3a5/0x460 net/ax25/ax25_addr.c:119 nr_dev_get+0x20e/0x450 net/netrom/nr_route.c:601 nr_route_frame+0x1a2/0xfc0 net/netrom/nr_route.c:774 nr_xmit+0x5a/0x1c0 net/netrom/nr_dev.c:144 __netdev_start_xmit include/linux/netdevice.h:4940 [inline] netdev_start_xmit include/linux/netdevice.h:4954 [inline] xmit_one net/core/dev.c:3548 [inline] dev_hard_start_xmit+0x247/0xa10 net/core/dev.c:3564 __dev_queue_xmit+0x33b8/0x5130 net/core/dev.c:4349 dev_queue_xmit include/linux/netdevice.h:3134 [inline] raw_sendmsg+0x654/0xc10 net/ieee802154/socket.c:299 ieee802154_sock_sendmsg+0x91/0xc0 net/ieee802154/socket.c:96 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg net/socket.c:745 [inline] ____sys_sendmsg+0x9c2/0xd60 net/socket.c:2584 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2638 __sys_sendmsg net/socket.c:2667 [inline] __do_sys_sendmsg net/socket.c:2676 [inline] __se_sys_sendmsg net/socket.c:2674 [inline] __x64_sys_sendmsg+0x307/0x490 net/socket.c:2674 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b Uninit was created at: slab_post_alloc_hook+0x129/0xa70 mm/slab.h:768 slab_alloc_node mm/slub.c:3478 [inline] kmem_cache_alloc_node+0x5e9/0xb10 mm/slub.c:3523 kmalloc_reserve+0x13d/0x4a0 net/core/skbuff.c:560 __alloc_skb+0x318/0x740 net/core/skbuff.c:651 alloc_skb include/linux/skbuff.h:1286 [inline] alloc_skb_with_frags+0xc8/0xbd0 net/core/skbuff.c:6334 sock_alloc_send_pskb+0xa80/0xbf0 net/core/sock.c:2780 sock_alloc_send_skb include/net/sock.h:1884 [inline] raw_sendmsg+0x36d/0xc10 net/ieee802154/socket.c:282 ieee802154_sock_sendmsg+0x91/0xc0 net/ieee802154/socket.c:96 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg net/socket.c:745 [inline] ____sys_sendmsg+0x9c2/0xd60 net/socket.c:2584 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2638 __sys_sendmsg net/socket.c:2667 [inline] __do_sys_sendmsg net/socket.c:2676 [inline] __se_sys_sendmsg net/socket.c:2674 [inline] __x64_sys_sendmsg+0x307/0x490 net/socket.c:2674 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b CPU: 0 PID: 5037 Comm: syz-executor166 Not tainted 6.7.0-rc7-syzkaller-00003-gfbafc3e621c3 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/17/2023 ===================================================== This issue occurs because the skb buffer is too small, and it's actual allocation is aligned. This hides an actual issue, which is that nr_route_frame does not validate the buffer size before using it. Fix this issue by checking skb->len before accessing any fields in skb->data. Found by Linux Verification Center (linuxtesting.org) with Syzkaller. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Co-developed-by: Nikita Marushkin Signed-off-by: Nikita Marushkin Signed-off-by: Ilya Shchipletsov Link: https://patch.msgid.link/20241219082308.3942-1-rabbelkin@mail.ru Signed-off-by: Jakub Kicinski --- net/netrom/nr_route.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c index 2b5e246b8d9a7..b94cb2ffbaf8f 100644 --- a/net/netrom/nr_route.c +++ b/net/netrom/nr_route.c @@ -754,6 +754,12 @@ int nr_route_frame(struct sk_buff *skb, ax25_cb *ax25) int ret; struct sk_buff *skbn; + /* + * Reject malformed packets early. Check that it contains at least 2 + * addresses and 1 byte more for Time-To-Live + */ + if (skb->len < 2 * sizeof(ax25_address) + 1) + return 0; nr_src = (ax25_address *)(skb->data + 0); nr_dest = (ax25_address *)(skb->data + 7); From 4e86729d1ff329815a6e8a920cb554a1d4cb5b8d Mon Sep 17 00:00:00 2001 From: Nikolay Kuratov Date: Thu, 19 Dec 2024 19:21:14 +0300 Subject: [PATCH 0945/1450] net/sctp: Prevent autoclose integer overflow in sctp_association_init() While by default max_autoclose equals to INT_MAX / HZ, one may set net.sctp.max_autoclose to UINT_MAX. There is code in sctp_association_init() that can consequently trigger overflow. Cc: stable@vger.kernel.org Fixes: 9f70f46bd4c7 ("sctp: properly latch and use autoclose value from sock to association") Signed-off-by: Nikolay Kuratov Acked-by: Xin Long Link: https://patch.msgid.link/20241219162114.2863827-1-kniv@yandex-team.ru Signed-off-by: Jakub Kicinski --- net/sctp/associola.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sctp/associola.c b/net/sctp/associola.c index c45c192b78787..0b0794f164cf2 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -137,7 +137,8 @@ static struct sctp_association *sctp_association_init( = 5 * asoc->rto_max; asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = asoc->sackdelay; - asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ; + asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = + (unsigned long)sp->autoclose * HZ; /* Initializes the timers */ for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) From 46e0ccfb88f02ab2eb20a41d519d6e4c028652f2 Mon Sep 17 00:00:00 2001 From: Radu Rendec Date: Thu, 19 Dec 2024 11:36:05 -0500 Subject: [PATCH 0946/1450] net: vxlan: rename SKB_DROP_REASON_VXLAN_NO_REMOTE The SKB_DROP_REASON_VXLAN_NO_REMOTE skb drop reason was introduced in the specific context of vxlan. As it turns out, there are similar cases when a packet needs to be dropped in other parts of the network stack, such as the bridge module. Rename SKB_DROP_REASON_VXLAN_NO_REMOTE and give it a more generic name, so that it can be used in other parts of the network stack. This is not a functional change, and the numeric value of the drop reason even remains unchanged. Signed-off-by: Radu Rendec Reviewed-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20241219163606.717758-2-rrendec@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_core.c | 4 ++-- drivers/net/vxlan/vxlan_mdb.c | 2 +- include/net/dropreason-core.h | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 0c356e0a61ef0..05c10acb2a57e 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -2798,7 +2798,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) dev_dstats_tx_dropped(dev); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); - kfree_skb_reason(skb, SKB_DROP_REASON_VXLAN_NO_REMOTE); + kfree_skb_reason(skb, SKB_DROP_REASON_NO_TX_TARGET); return NETDEV_TX_OK; } } @@ -2821,7 +2821,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) if (fdst) vxlan_xmit_one(skb, dev, vni, fdst, did_rsc); else - kfree_skb_reason(skb, SKB_DROP_REASON_VXLAN_NO_REMOTE); + kfree_skb_reason(skb, SKB_DROP_REASON_NO_TX_TARGET); } return NETDEV_TX_OK; diff --git a/drivers/net/vxlan/vxlan_mdb.c b/drivers/net/vxlan/vxlan_mdb.c index 8735891ee1286..816ab1aa05262 100644 --- a/drivers/net/vxlan/vxlan_mdb.c +++ b/drivers/net/vxlan/vxlan_mdb.c @@ -1712,7 +1712,7 @@ netdev_tx_t vxlan_mdb_xmit(struct vxlan_dev *vxlan, vxlan_xmit_one(skb, vxlan->dev, src_vni, rcu_dereference(fremote->rd), false); else - kfree_skb_reason(skb, SKB_DROP_REASON_VXLAN_NO_REMOTE); + kfree_skb_reason(skb, SKB_DROP_REASON_NO_TX_TARGET); return NETDEV_TX_OK; } diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index ead4170a1d0a9..6e32106d72294 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -106,7 +106,7 @@ FN(VXLAN_VNI_NOT_FOUND) \ FN(MAC_INVALID_SOURCE) \ FN(VXLAN_ENTRY_EXISTS) \ - FN(VXLAN_NO_REMOTE) \ + FN(NO_TX_TARGET) \ FN(IP_TUNNEL_ECN) \ FN(TUNNEL_TXINFO) \ FN(LOCAL_MAC) \ @@ -497,8 +497,8 @@ enum skb_drop_reason { * entry or an entry pointing to a nexthop. */ SKB_DROP_REASON_VXLAN_ENTRY_EXISTS, - /** @SKB_DROP_REASON_VXLAN_NO_REMOTE: no remote found for xmit */ - SKB_DROP_REASON_VXLAN_NO_REMOTE, + /** @SKB_DROP_REASON_NO_TX_TARGET: no target found for xmit */ + SKB_DROP_REASON_NO_TX_TARGET, /** * @SKB_DROP_REASON_IP_TUNNEL_ECN: skb is dropped according to * RFC 6040 4.2, see __INET_ECN_decapsulate() for detail. From 623e43c2f5023853cbf71d6a60898d448a06416a Mon Sep 17 00:00:00 2001 From: Radu Rendec Date: Thu, 19 Dec 2024 11:36:06 -0500 Subject: [PATCH 0947/1450] net: bridge: add skb drop reasons to the most common drop points The bridge input code may drop frames for various reasons and at various points in the ingress handling logic. Currently kfree_skb() is used everywhere, and therefore no drop reason is specified. Add drop reasons to the most common drop points. Drop reasons are not added exhaustively to the entire bridge code. The intention is to incrementally add drop reasons to the rest of the bridge code in follow up patches. Signed-off-by: Radu Rendec Reviewed-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20241219163606.717758-3-rrendec@redhat.com Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 12 ++++++++++++ net/bridge/br_forward.c | 16 ++++++++++++---- net/bridge/br_input.c | 20 +++++++++++++++----- 3 files changed, 39 insertions(+), 9 deletions(-) diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 6e32106d72294..3a6602f379783 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -111,6 +111,8 @@ FN(TUNNEL_TXINFO) \ FN(LOCAL_MAC) \ FN(ARP_PVLAN_DISABLE) \ + FN(MAC_IEEE_MAC_CONTROL) \ + FN(BRIDGE_INGRESS_STP_STATE) \ FNe(MAX) /** @@ -520,6 +522,16 @@ enum skb_drop_reason { * enabled. */ SKB_DROP_REASON_ARP_PVLAN_DISABLE, + /** + * @SKB_DROP_REASON_MAC_IEEE_MAC_CONTROL: the destination MAC address + * is an IEEE MAC Control address. + */ + SKB_DROP_REASON_MAC_IEEE_MAC_CONTROL, + /** + * @SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE: the STP state of the + * ingress bridge port does not allow frames to be forwarded. + */ + SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE, /** * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which * shouldn't be used as a real 'reason' - only for tracing code gen diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index e19b583ff2c6d..29097e984b4f7 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -201,6 +201,7 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb, enum br_pkt_type pkt_type, bool local_rcv, bool local_orig, u16 vid) { + enum skb_drop_reason reason = SKB_DROP_REASON_NO_TX_TARGET; struct net_bridge_port *prev = NULL; struct net_bridge_port *p; @@ -234,8 +235,11 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb, continue; prev = maybe_deliver(prev, p, skb, local_orig); - if (IS_ERR(prev)) + if (IS_ERR(prev)) { + reason = PTR_ERR(prev) == -ENOMEM ? SKB_DROP_REASON_NOMEM : + SKB_DROP_REASON_NOT_SPECIFIED; goto out; + } } if (!prev) @@ -249,7 +253,7 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb, out: if (!local_rcv) - kfree_skb(skb); + kfree_skb_reason(skb, reason); } #ifdef CONFIG_BRIDGE_IGMP_SNOOPING @@ -289,6 +293,7 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst, struct net_bridge_mcast *brmctx, bool local_rcv, bool local_orig) { + enum skb_drop_reason reason = SKB_DROP_REASON_NO_TX_TARGET; struct net_bridge_port *prev = NULL; struct net_bridge_port_group *p; bool allow_mode_include = true; @@ -329,8 +334,11 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst, } prev = maybe_deliver(prev, port, skb, local_orig); - if (IS_ERR(prev)) + if (IS_ERR(prev)) { + reason = PTR_ERR(prev) == -ENOMEM ? SKB_DROP_REASON_NOMEM : + SKB_DROP_REASON_NOT_SPECIFIED; goto out; + } delivered: if ((unsigned long)lport >= (unsigned long)port) p = rcu_dereference(p->next); @@ -349,6 +357,6 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst, out: if (!local_rcv) - kfree_skb(skb); + kfree_skb_reason(skb, reason); } #endif diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index ceaa5a89b947f..232133a0fd21c 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -75,6 +75,7 @@ static int br_pass_frame_up(struct sk_buff *skb, bool promisc) /* note: already called with rcu_read_lock */ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { + enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct net_bridge_port *p = br_port_get_rcu(skb->dev); enum br_pkt_type pkt_type = BR_PKT_UNICAST; struct net_bridge_fdb_entry *dst = NULL; @@ -96,8 +97,10 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if (br_mst_is_enabled(br)) { state = BR_STATE_FORWARDING; } else { - if (p->state == BR_STATE_DISABLED) + if (p->state == BR_STATE_DISABLED) { + reason = SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE; goto drop; + } state = p->state; } @@ -155,8 +158,10 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb } } - if (state == BR_STATE_LEARNING) + if (state == BR_STATE_LEARNING) { + reason = SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE; goto drop; + } BR_INPUT_SKB_CB(skb)->brdev = br->dev; BR_INPUT_SKB_CB(skb)->src_port_isolated = !!(p->flags & BR_ISOLATED); @@ -223,7 +228,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb out: return 0; drop: - kfree_skb(skb); + kfree_skb_reason(skb, reason); goto out; } EXPORT_SYMBOL_GPL(br_handle_frame_finish); @@ -324,6 +329,7 @@ static int br_process_frame_type(struct net_bridge_port *p, */ static rx_handler_result_t br_handle_frame(struct sk_buff **pskb) { + enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct net_bridge_port *p; struct sk_buff *skb = *pskb; const unsigned char *dest = eth_hdr(skb)->h_dest; @@ -331,8 +337,10 @@ static rx_handler_result_t br_handle_frame(struct sk_buff **pskb) if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) return RX_HANDLER_PASS; - if (!is_valid_ether_addr(eth_hdr(skb)->h_source)) + if (!is_valid_ether_addr(eth_hdr(skb)->h_source)) { + reason = SKB_DROP_REASON_MAC_INVALID_SOURCE; goto drop; + } skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) @@ -374,6 +382,7 @@ static rx_handler_result_t br_handle_frame(struct sk_buff **pskb) return RX_HANDLER_PASS; case 0x01: /* IEEE MAC (Pause) */ + reason = SKB_DROP_REASON_MAC_IEEE_MAC_CONTROL; goto drop; case 0x0E: /* 802.1AB LLDP */ @@ -423,8 +432,9 @@ static rx_handler_result_t br_handle_frame(struct sk_buff **pskb) return nf_hook_bridge_pre(skb, pskb); default: + reason = SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE; drop: - kfree_skb(skb); + kfree_skb_reason(skb, reason); } return RX_HANDLER_CONSUMED; } From 85101bda1387e85eabf77cd416bfc38e14f1bce6 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Thu, 19 Dec 2024 17:30:04 +0000 Subject: [PATCH 0948/1450] sfc: Use netdev refcount tracking in struct efx_async_filter_insertion I was debugging some netdev refcount issues in OpenOnload, and one of the places I was looking at was in the sfc driver. Only struct efx_async_filter_insertion was not using netdev refcount tracker, so add it here. GFP_ATOMIC because this code path is called by ndo_rx_flow_steer which holds RCU. This patch should be a no-op if !CONFIG_NET_DEV_REFCNT_TRACKER Signed-off-by: YiFei Zhu Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241219173004.2615655-1-zhuyifei@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/sfc/net_driver.h | 2 ++ drivers/net/ethernet/sfc/rx_common.c | 5 +++-- drivers/net/ethernet/sfc/siena/net_driver.h | 2 ++ drivers/net/ethernet/sfc/siena/rx_common.c | 5 +++-- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h index 620ba6ef3514b..f70a7b7d6345c 100644 --- a/drivers/net/ethernet/sfc/net_driver.h +++ b/drivers/net/ethernet/sfc/net_driver.h @@ -831,6 +831,7 @@ struct efx_arfs_rule { /** * struct efx_async_filter_insertion - Request to asynchronously insert a filter * @net_dev: Reference to the netdevice + * @net_dev_tracker: reference tracker entry for @net_dev * @spec: The filter to insert * @work: Workitem for this request * @rxq_index: Identifies the channel for which this request was made @@ -838,6 +839,7 @@ struct efx_arfs_rule { */ struct efx_async_filter_insertion { struct net_device *net_dev; + netdevice_tracker net_dev_tracker; struct efx_filter_spec spec; struct work_struct work; u16 rxq_index; diff --git a/drivers/net/ethernet/sfc/rx_common.c b/drivers/net/ethernet/sfc/rx_common.c index ab358fe13e1df..4cc83203e1880 100644 --- a/drivers/net/ethernet/sfc/rx_common.c +++ b/drivers/net/ethernet/sfc/rx_common.c @@ -897,7 +897,7 @@ static void efx_filter_rfs_work(struct work_struct *data) /* Release references */ clear_bit(slot_idx, &efx->rps_slot_map); - dev_put(req->net_dev); + netdev_put(req->net_dev, &req->net_dev_tracker); } int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb, @@ -989,7 +989,8 @@ int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb, } /* Queue the request */ - dev_hold(req->net_dev = net_dev); + req->net_dev = net_dev; + netdev_hold(req->net_dev, &req->net_dev_tracker, GFP_ATOMIC); INIT_WORK(&req->work, efx_filter_rfs_work); req->rxq_index = rxq_index; req->flow_id = flow_id; diff --git a/drivers/net/ethernet/sfc/siena/net_driver.h b/drivers/net/ethernet/sfc/siena/net_driver.h index 9785eff10607b..2be3bad3c9933 100644 --- a/drivers/net/ethernet/sfc/siena/net_driver.h +++ b/drivers/net/ethernet/sfc/siena/net_driver.h @@ -753,6 +753,7 @@ struct efx_arfs_rule { /** * struct efx_async_filter_insertion - Request to asynchronously insert a filter * @net_dev: Reference to the netdevice + * @net_dev_tracker: reference tracker entry for @net_dev * @spec: The filter to insert * @work: Workitem for this request * @rxq_index: Identifies the channel for which this request was made @@ -760,6 +761,7 @@ struct efx_arfs_rule { */ struct efx_async_filter_insertion { struct net_device *net_dev; + netdevice_tracker net_dev_tracker; struct efx_filter_spec spec; struct work_struct work; u16 rxq_index; diff --git a/drivers/net/ethernet/sfc/siena/rx_common.c b/drivers/net/ethernet/sfc/siena/rx_common.c index 082e35c6caaae..2839d0e0a9c16 100644 --- a/drivers/net/ethernet/sfc/siena/rx_common.c +++ b/drivers/net/ethernet/sfc/siena/rx_common.c @@ -888,7 +888,7 @@ static void efx_filter_rfs_work(struct work_struct *data) /* Release references */ clear_bit(slot_idx, &efx->rps_slot_map); - dev_put(req->net_dev); + netdev_put(req->net_dev, &req->net_dev_tracker); } int efx_siena_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb, @@ -980,7 +980,8 @@ int efx_siena_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb, } /* Queue the request */ - dev_hold(req->net_dev = net_dev); + req->net_dev = net_dev; + netdev_hold(req->net_dev, &req->net_dev_tracker, GFP_ATOMIC); INIT_WORK(&req->work, efx_filter_rfs_work); req->rxq_index = rxq_index; req->flow_id = flow_id; From f288c7a1ba268a9ed58a7971142a98a1e41a3c73 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 19 Dec 2024 16:31:16 -0800 Subject: [PATCH 0949/1450] selftests: drv-net: assume stats refresh is 0 if no ethtool -c support Tests using HW stats wait for them to stabilize, using data from ethtool -c as the delay. Not all drivers implement ethtool -c so handle the errors gracefully. Reviewed-by: Andrew Lunn Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20241220003116.1458863-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/lib/py/env.py | 9 +++++++-- tools/testing/selftests/net/lib/py/utils.py | 6 ++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py index 1ea9bb695e940..fea343f209eab 100644 --- a/tools/testing/selftests/drivers/net/lib/py/env.py +++ b/tools/testing/selftests/drivers/net/lib/py/env.py @@ -5,7 +5,7 @@ from pathlib import Path from lib.py import KsftSkipEx, KsftXfailEx from lib.py import ksft_setup -from lib.py import cmd, ethtool, ip +from lib.py import cmd, ethtool, ip, CmdExitFailure from lib.py import NetNS, NetdevSimDev from .remote import Remote @@ -234,7 +234,12 @@ def wait_hw_stats_settle(self): Good drivers will tell us via ethtool what their sync period is. """ if self._stats_settle_time is None: - data = ethtool("-c " + self.ifname, json=True)[0] + data = {} + try: + data = ethtool("-c " + self.ifname, json=True)[0] + except CmdExitFailure as e: + if "Operation not supported" not in e.cmd.stderr: + raise self._stats_settle_time = 0.025 + \ data.get('stats-block-usecs', 0) / 1000 / 1000 diff --git a/tools/testing/selftests/net/lib/py/utils.py b/tools/testing/selftests/net/lib/py/utils.py index 72590c3f90f19..9e3bcddcf3e83 100644 --- a/tools/testing/selftests/net/lib/py/utils.py +++ b/tools/testing/selftests/net/lib/py/utils.py @@ -10,7 +10,9 @@ class CmdExitFailure(Exception): - pass + def __init__(self, msg, cmd_obj): + super().__init__(msg) + self.cmd = cmd_obj class cmd: @@ -48,7 +50,7 @@ def process(self, terminate=True, fail=None, timeout=5): if len(stderr) > 0 and stderr[-1] == "\n": stderr = stderr[:-1] raise CmdExitFailure("Command failed: %s\nSTDOUT: %s\nSTDERR: %s" % - (self.proc.args, stdout, stderr)) + (self.proc.args, stdout, stderr), self) class bkg(cmd): From aa4ad7c3f283fa94b80cf84605661700aa39d708 Mon Sep 17 00:00:00 2001 From: Yuyang Huang Date: Sat, 21 Dec 2024 19:00:07 +0900 Subject: [PATCH 0950/1450] netlink: correct nlmsg size for multicast notifications MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Corrected the netlink message size calculation for multicast group join/leave notifications. The previous calculation did not account for the inclusion of both IPv4/IPv6 addresses and ifa_cacheinfo in the payload. This fix ensures that the allocated message size is sufficient to hold all necessary information. This patch also includes the following improvements: * Uses GFP_KERNEL instead of GFP_ATOMIC when holding the RTNL mutex. * Uses nla_total_size(sizeof(struct in6_addr)) instead of nla_total_size(16). * Removes unnecessary EXPORT_SYMBOL(). Fixes: 2c2b61d2138f ("netlink: add IGMP/MLD join/leave notifications") Cc: Maciej Żenczykowski Cc: Lorenzo Colitti Signed-off-by: Yuyang Huang Link: https://patch.msgid.link/20241221100007.1910089-1-yuyanghuang@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/igmp.c | 6 ++++-- net/ipv6/addrconf.c | 1 - net/ipv6/mcast.c | 6 ++++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 8a370ef37d3f6..3da126cea884d 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1473,7 +1473,9 @@ static void inet_ifmcaddr_notify(struct net_device *dev, int err = -ENOMEM; skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + - nla_total_size(sizeof(__be32)), GFP_ATOMIC); + nla_total_size(sizeof(__be32)) + + nla_total_size(sizeof(struct ifa_cacheinfo)), + GFP_KERNEL); if (!skb) goto error; @@ -1484,7 +1486,7 @@ static void inet_ifmcaddr_notify(struct net_device *dev, goto error; } - rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MCADDR, NULL, GFP_ATOMIC); + rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MCADDR, NULL, GFP_KERNEL); return; error: rtnl_set_sk_err(net, RTNLGRP_IPV4_MCADDR, err); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 2e2684886953d..4da409bc45777 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5239,7 +5239,6 @@ int inet6_fill_ifmcaddr(struct sk_buff *skb, nlmsg_end(skb, nlh); return 0; } -EXPORT_SYMBOL(inet6_fill_ifmcaddr); static int inet6_fill_ifacaddr(struct sk_buff *skb, const struct ifacaddr6 *ifaca, diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 587831c148dee..9dfdb40988b0f 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -920,7 +920,9 @@ static void inet6_ifmcaddr_notify(struct net_device *dev, int err = -ENOMEM; skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + - nla_total_size(16), GFP_ATOMIC); + nla_total_size(sizeof(struct in6_addr)) + + nla_total_size(sizeof(struct ifa_cacheinfo)), + GFP_KERNEL); if (!skb) goto error; @@ -931,7 +933,7 @@ static void inet6_ifmcaddr_notify(struct net_device *dev, goto error; } - rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MCADDR, NULL, GFP_ATOMIC); + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MCADDR, NULL, GFP_KERNEL); return; error: rtnl_set_sk_err(net, RTNLGRP_IPV6_MCADDR, err); From f3af3ba1083836d174ada619366783fa17272f66 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 19 Dec 2024 10:49:28 +0100 Subject: [PATCH 0951/1450] vsock/test: Use NSEC_PER_SEC Replace 1000000000ULL with NSEC_PER_SEC. No functional change intended. Reviewed-by: Luigi Leonardi Reviewed-by: Stefano Garzarella Signed-off-by: Michal Luczaj Link: https://patch.msgid.link/20241219-test-vsock-leaks-v4-1-a416e554d9d7@rbox.co Signed-off-by: Jakub Kicinski --- tools/testing/vsock/vsock_test.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index 48f17641ca504..38fd8d96eb83e 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "vsock_test_zerocopy.h" #include "timeout.h" @@ -559,7 +560,7 @@ static time_t current_nsec(void) exit(EXIT_FAILURE); } - return (ts.tv_sec * 1000000000ULL) + ts.tv_nsec; + return (ts.tv_sec * NSEC_PER_SEC) + ts.tv_nsec; } #define RCVTIMEO_TIMEOUT_SEC 1 @@ -599,7 +600,7 @@ static void test_seqpacket_timeout_client(const struct test_opts *opts) } read_overhead_ns = current_nsec() - read_enter_ns - - 1000000000ULL * RCVTIMEO_TIMEOUT_SEC; + NSEC_PER_SEC * RCVTIMEO_TIMEOUT_SEC; if (read_overhead_ns > READ_OVERHEAD_NSEC) { fprintf(stderr, From ef8bd18f475e969753b1b72588a4932195d420f3 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 19 Dec 2024 10:49:29 +0100 Subject: [PATCH 0952/1450] vsock/test: Introduce option to select tests Allow for selecting specific test IDs to be executed. Reviewed-by: Stefano Garzarella Signed-off-by: Michal Luczaj Link: https://patch.msgid.link/20241219-test-vsock-leaks-v4-2-a416e554d9d7@rbox.co Signed-off-by: Jakub Kicinski --- tools/testing/vsock/util.c | 29 +++++++++++++++++++++++++++-- tools/testing/vsock/util.h | 2 ++ tools/testing/vsock/vsock_test.c | 11 +++++++++++ 3 files changed, 40 insertions(+), 2 deletions(-) diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c index 34e9dac0a105f..81b9a31059d81 100644 --- a/tools/testing/vsock/util.c +++ b/tools/testing/vsock/util.c @@ -486,8 +486,7 @@ void list_tests(const struct test_case *test_cases) exit(EXIT_FAILURE); } -void skip_test(struct test_case *test_cases, size_t test_cases_len, - const char *test_id_str) +static unsigned long parse_test_id(const char *test_id_str, size_t test_cases_len) { unsigned long test_id; char *endptr = NULL; @@ -505,9 +504,35 @@ void skip_test(struct test_case *test_cases, size_t test_cases_len, exit(EXIT_FAILURE); } + return test_id; +} + +void skip_test(struct test_case *test_cases, size_t test_cases_len, + const char *test_id_str) +{ + unsigned long test_id = parse_test_id(test_id_str, test_cases_len); test_cases[test_id].skip = true; } +void pick_test(struct test_case *test_cases, size_t test_cases_len, + const char *test_id_str) +{ + static bool skip_all = true; + unsigned long test_id; + + if (skip_all) { + unsigned long i; + + for (i = 0; i < test_cases_len; ++i) + test_cases[i].skip = true; + + skip_all = false; + } + + test_id = parse_test_id(test_id_str, test_cases_len); + test_cases[test_id].skip = false; +} + unsigned long hash_djb2(const void *data, size_t len) { unsigned long hash = 5381; diff --git a/tools/testing/vsock/util.h b/tools/testing/vsock/util.h index ba84d296d8b71..e62f46b2b92a7 100644 --- a/tools/testing/vsock/util.h +++ b/tools/testing/vsock/util.h @@ -62,6 +62,8 @@ void run_tests(const struct test_case *test_cases, void list_tests(const struct test_case *test_cases); void skip_test(struct test_case *test_cases, size_t test_cases_len, const char *test_id_str); +void pick_test(struct test_case *test_cases, size_t test_cases_len, + const char *test_id_str); unsigned long hash_djb2(const void *data, size_t len); size_t iovec_bytes(const struct iovec *iov, size_t iovnum); unsigned long iovec_hash_djb2(const struct iovec *iov, size_t iovnum); diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index 38fd8d96eb83e..8bb2ab41c55f5 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -1644,6 +1644,11 @@ static const struct option longopts[] = { .has_arg = required_argument, .val = 's', }, + { + .name = "pick", + .has_arg = required_argument, + .val = 't', + }, { .name = "help", .has_arg = no_argument, @@ -1681,6 +1686,8 @@ static void usage(void) " --peer-cid CID of the other side\n" " --peer-port AF_VSOCK port used for the test [default: %d]\n" " --list List of tests that will be executed\n" + " --pick Test ID to execute selectively;\n" + " use multiple --pick options to select more tests\n" " --skip Test ID to skip;\n" " use multiple --skip options to skip more tests\n", DEFAULT_PEER_PORT @@ -1737,6 +1744,10 @@ int main(int argc, char **argv) skip_test(test_cases, ARRAY_SIZE(test_cases) - 1, optarg); break; + case 't': + pick_test(test_cases, ARRAY_SIZE(test_cases) - 1, + optarg); + break; case '?': default: usage(); From 50f9434463a0be5b972ee442ba6a9704c9afb02a Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 19 Dec 2024 10:49:30 +0100 Subject: [PATCH 0953/1450] vsock/test: Add README blurb about kmemleak usage Document the suggested use of kmemleak for memory leak detection. Suggested-by: Stefano Garzarella Reviewed-by: Stefano Garzarella Signed-off-by: Michal Luczaj Link: https://patch.msgid.link/20241219-test-vsock-leaks-v4-3-a416e554d9d7@rbox.co Signed-off-by: Jakub Kicinski --- tools/testing/vsock/README | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tools/testing/vsock/README b/tools/testing/vsock/README index 84ee217ba8eed..680ce666ceb56 100644 --- a/tools/testing/vsock/README +++ b/tools/testing/vsock/README @@ -36,6 +36,21 @@ Invoke test binaries in both directions as follows: --control-port=1234 \ --peer-cid=3 +Some tests are designed to produce kernel memory leaks. Leaks detection, +however, is deferred to Kernel Memory Leak Detector. It is recommended to enable +kmemleak (CONFIG_DEBUG_KMEMLEAK=y) and explicitly trigger a scan after each test +suite run, e.g. + + # echo clear > /sys/kernel/debug/kmemleak + # $TEST_BINARY ... + # echo "wait for any grace periods" && sleep 2 + # echo scan > /sys/kernel/debug/kmemleak + # echo "wait for kmemleak" && sleep 5 + # echo scan > /sys/kernel/debug/kmemleak + # cat /sys/kernel/debug/kmemleak + +For more information see Documentation/dev-tools/kmemleak.rst. + vsock_perf utility ------------------- 'vsock_perf' is a simple tool to measure vsock performance. It works in From f52e7f593b49344b9497c289cbb2ada213f60a7a Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 19 Dec 2024 10:49:31 +0100 Subject: [PATCH 0954/1450] vsock/test: Adapt send_byte()/recv_byte() to handle MSG_ZEROCOPY For a zerocopy send(), buffer (always byte 'A') needs to be preserved (thus it can not be on the stack) or the data recv()ed check in recv_byte() might fail. While there, change the printf format to 0x%02x so the '\0' bytes can be seen. Reviewed-by: Stefano Garzarella Signed-off-by: Michal Luczaj Link: https://patch.msgid.link/20241219-test-vsock-leaks-v4-4-a416e554d9d7@rbox.co Signed-off-by: Jakub Kicinski --- tools/testing/vsock/util.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c index 81b9a31059d81..7058dc614c25f 100644 --- a/tools/testing/vsock/util.c +++ b/tools/testing/vsock/util.c @@ -401,7 +401,7 @@ void recv_buf(int fd, void *buf, size_t len, int flags, ssize_t expected_ret) */ void send_byte(int fd, int expected_ret, int flags) { - const uint8_t byte = 'A'; + static const uint8_t byte = 'A'; send_buf(fd, &byte, sizeof(byte), flags, expected_ret); } @@ -420,7 +420,7 @@ void recv_byte(int fd, int expected_ret, int flags) recv_buf(fd, &byte, sizeof(byte), flags, expected_ret); if (byte != 'A') { - fprintf(stderr, "unexpected byte read %c\n", byte); + fprintf(stderr, "unexpected byte read 0x%02x\n", byte); exit(EXIT_FAILURE); } } From f66ef469a72d19764f943067307a570f83b00dca Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 19 Dec 2024 10:49:32 +0100 Subject: [PATCH 0955/1450] vsock/test: Add test for accept_queue memory leak Attempt to enqueue a child after the queue was flushed, but before SOCK_DONE flag has been set. Test tries to produce a memory leak, kmemleak should be employed. Dealing with a race condition, test by its very nature may lead to a false negative. Fixed by commit d7b0ff5a8667 ("virtio/vsock: Fix accept_queue memory leak"). Reviewed-by: Stefano Garzarella Signed-off-by: Michal Luczaj Link: https://patch.msgid.link/20241219-test-vsock-leaks-v4-5-a416e554d9d7@rbox.co Signed-off-by: Jakub Kicinski --- tools/testing/vsock/vsock_test.c | 52 ++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index 8bb2ab41c55f5..2a8fcb062d9d2 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -29,6 +29,10 @@ #include "control.h" #include "util.h" +/* Basic messages for control_writeulong(), control_readulong() */ +#define CONTROL_CONTINUE 1 +#define CONTROL_DONE 0 + static void test_stream_connection_reset(const struct test_opts *opts) { union { @@ -1474,6 +1478,49 @@ static void test_stream_cred_upd_on_set_rcvlowat(const struct test_opts *opts) test_stream_credit_update_test(opts, false); } +/* The goal of test leak_acceptq is to stress the race between connect() and + * close(listener). Implementation of client/server loops boils down to: + * + * client server + * ------ ------ + * write(CONTINUE) + * expect(CONTINUE) + * listen() + * write(LISTENING) + * expect(LISTENING) + * connect() close() + */ +#define ACCEPTQ_LEAK_RACE_TIMEOUT 2 /* seconds */ + +static void test_stream_leak_acceptq_client(const struct test_opts *opts) +{ + time_t tout; + int fd; + + tout = current_nsec() + ACCEPTQ_LEAK_RACE_TIMEOUT * NSEC_PER_SEC; + do { + control_writeulong(CONTROL_CONTINUE); + + fd = vsock_stream_connect(opts->peer_cid, opts->peer_port); + if (fd >= 0) + close(fd); + } while (current_nsec() < tout); + + control_writeulong(CONTROL_DONE); +} + +/* Test for a memory leak. User is expected to run kmemleak scan, see README. */ +static void test_stream_leak_acceptq_server(const struct test_opts *opts) +{ + int fd; + + while (control_readulong() == CONTROL_CONTINUE) { + fd = vsock_stream_listen(VMADDR_CID_ANY, opts->peer_port); + control_writeln("LISTENING"); + close(fd); + } +} + static struct test_case test_cases[] = { { .name = "SOCK_STREAM connection reset", @@ -1604,6 +1651,11 @@ static struct test_case test_cases[] = { .run_client = test_seqpacket_unsent_bytes_client, .run_server = test_seqpacket_unsent_bytes_server, }, + { + .name = "SOCK_STREAM leak accept queue", + .run_client = test_stream_leak_acceptq_client, + .run_server = test_stream_leak_acceptq_server, + }, {}, }; From ec50efee8cf814035d82f3b42dad916144d98b38 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 19 Dec 2024 10:49:33 +0100 Subject: [PATCH 0956/1450] vsock/test: Add test for sk_error_queue memory leak Ask for MSG_ZEROCOPY completion notification, but do not recv() it. Test attempts to create a memory leak, kmemleak should be employed. Fixed by commit fbf7085b3ad1 ("vsock: Fix sk_error_queue memory leak"). Reviewed-by: Stefano Garzarella Signed-off-by: Michal Luczaj Link: https://patch.msgid.link/20241219-test-vsock-leaks-v4-6-a416e554d9d7@rbox.co Signed-off-by: Jakub Kicinski --- tools/testing/vsock/vsock_test.c | 45 ++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index 2a8fcb062d9d2..2dec6290b075f 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -1521,6 +1521,46 @@ static void test_stream_leak_acceptq_server(const struct test_opts *opts) } } +/* Test for a memory leak. User is expected to run kmemleak scan, see README. */ +static void test_stream_msgzcopy_leak_errq_client(const struct test_opts *opts) +{ + struct pollfd fds = { 0 }; + int fd; + + fd = vsock_stream_connect(opts->peer_cid, opts->peer_port); + if (fd < 0) { + perror("connect"); + exit(EXIT_FAILURE); + } + + enable_so_zerocopy_check(fd); + send_byte(fd, 1, MSG_ZEROCOPY); + + fds.fd = fd; + fds.events = 0; + if (poll(&fds, 1, -1) < 0) { + perror("poll"); + exit(EXIT_FAILURE); + } + + close(fd); +} + +static void test_stream_msgzcopy_leak_errq_server(const struct test_opts *opts) +{ + int fd; + + fd = vsock_stream_accept(VMADDR_CID_ANY, opts->peer_port, NULL); + if (fd < 0) { + perror("accept"); + exit(EXIT_FAILURE); + } + + recv_byte(fd, 1, 0); + vsock_wait_remote_close(fd); + close(fd); +} + static struct test_case test_cases[] = { { .name = "SOCK_STREAM connection reset", @@ -1656,6 +1696,11 @@ static struct test_case test_cases[] = { .run_client = test_stream_leak_acceptq_client, .run_server = test_stream_leak_acceptq_server, }, + { + .name = "SOCK_STREAM MSG_ZEROCOPY leak MSG_ERRQUEUE", + .run_client = test_stream_msgzcopy_leak_errq_client, + .run_server = test_stream_msgzcopy_leak_errq_server, + }, {}, }; From d127ac8b1d4d3524d292b597100fef96dd909c9b Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 19 Dec 2024 10:49:34 +0100 Subject: [PATCH 0957/1450] vsock/test: Add test for MSG_ZEROCOPY completion memory leak Exercise the ENOMEM error path by attempting to hit net.core.optmem_max limit on send(). Test aims to create a memory leak, kmemleak should be employed. Fixed by commit 60cf6206a1f5 ("virtio/vsock: Improve MSG_ZEROCOPY error handling"). Reviewed-by: Stefano Garzarella Signed-off-by: Michal Luczaj Link: https://patch.msgid.link/20241219-test-vsock-leaks-v4-7-a416e554d9d7@rbox.co Signed-off-by: Jakub Kicinski --- tools/testing/vsock/vsock_test.c | 152 +++++++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index 2dec6290b075f..1eebbc0d5f616 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -1561,6 +1561,153 @@ static void test_stream_msgzcopy_leak_errq_server(const struct test_opts *opts) close(fd); } +/* Test msgzcopy_leak_zcskb is meant to exercise sendmsg() error handling path, + * that might leak an skb. The idea is to fail virtio_transport_init_zcopy_skb() + * by hitting net.core.optmem_max limit in sock_omalloc(), specifically + * + * vsock_connectible_sendmsg + * virtio_transport_stream_enqueue + * virtio_transport_send_pkt_info + * virtio_transport_init_zcopy_skb + * . msg_zerocopy_realloc + * . msg_zerocopy_alloc + * . sock_omalloc + * . sk_omem_alloc + size > sysctl_optmem_max + * return -ENOMEM + * + * We abuse the implementation detail of net/socket.c:____sys_sendmsg(). + * sk_omem_alloc can be precisely bumped by sock_kmalloc(), as it is used to + * fetch user-provided control data. + * + * While this approach works for now, it relies on assumptions regarding the + * implementation and configuration (for example, order of net.core.optmem_max + * can not exceed MAX_PAGE_ORDER), which may not hold in the future. A more + * resilient testing could be implemented by leveraging the Fault injection + * framework (CONFIG_FAULT_INJECTION), e.g. + * + * client# echo N > /sys/kernel/debug/failslab/ignore-gfp-wait + * client# echo 0 > /sys/kernel/debug/failslab/verbose + * + * void client(const struct test_opts *opts) + * { + * char buf[16]; + * int f, s, i; + * + * f = open("/proc/self/fail-nth", O_WRONLY); + * + * for (i = 1; i < 32; i++) { + * control_writeulong(CONTROL_CONTINUE); + * + * s = vsock_stream_connect(opts->peer_cid, opts->peer_port); + * enable_so_zerocopy_check(s); + * + * sprintf(buf, "%d", i); + * write(f, buf, strlen(buf)); + * + * send(s, &(char){ 0 }, 1, MSG_ZEROCOPY); + * + * write(f, "0", 1); + * close(s); + * } + * + * control_writeulong(CONTROL_DONE); + * close(f); + * } + * + * void server(const struct test_opts *opts) + * { + * int fd; + * + * while (control_readulong() == CONTROL_CONTINUE) { + * fd = vsock_stream_accept(VMADDR_CID_ANY, opts->peer_port, NULL); + * vsock_wait_remote_close(fd); + * close(fd); + * } + * } + * + * Refer to Documentation/fault-injection/fault-injection.rst. + */ +#define MAX_PAGE_ORDER 10 /* usually */ +#define PAGE_SIZE 4096 + +/* Test for a memory leak. User is expected to run kmemleak scan, see README. */ +static void test_stream_msgzcopy_leak_zcskb_client(const struct test_opts *opts) +{ + size_t optmem_max, ctl_len, chunk_size; + struct msghdr msg = { 0 }; + struct iovec iov; + char *chunk; + int fd, res; + FILE *f; + + f = fopen("/proc/sys/net/core/optmem_max", "r"); + if (!f) { + perror("fopen(optmem_max)"); + exit(EXIT_FAILURE); + } + + if (fscanf(f, "%zu", &optmem_max) != 1) { + fprintf(stderr, "fscanf(optmem_max) failed\n"); + exit(EXIT_FAILURE); + } + + fclose(f); + + fd = vsock_stream_connect(opts->peer_cid, opts->peer_port); + if (fd < 0) { + perror("connect"); + exit(EXIT_FAILURE); + } + + enable_so_zerocopy_check(fd); + + ctl_len = optmem_max - 1; + if (ctl_len > PAGE_SIZE << MAX_PAGE_ORDER) { + fprintf(stderr, "Try with net.core.optmem_max = 100000\n"); + exit(EXIT_FAILURE); + } + + chunk_size = CMSG_SPACE(ctl_len); + chunk = malloc(chunk_size); + if (!chunk) { + perror("malloc"); + exit(EXIT_FAILURE); + } + memset(chunk, 0, chunk_size); + + iov.iov_base = &(char){ 0 }; + iov.iov_len = 1; + + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = chunk; + msg.msg_controllen = ctl_len; + + errno = 0; + res = sendmsg(fd, &msg, MSG_ZEROCOPY); + if (res >= 0 || errno != ENOMEM) { + fprintf(stderr, "Expected ENOMEM, got errno=%d res=%d\n", + errno, res); + exit(EXIT_FAILURE); + } + + close(fd); +} + +static void test_stream_msgzcopy_leak_zcskb_server(const struct test_opts *opts) +{ + int fd; + + fd = vsock_stream_accept(VMADDR_CID_ANY, opts->peer_port, NULL); + if (fd < 0) { + perror("accept"); + exit(EXIT_FAILURE); + } + + vsock_wait_remote_close(fd); + close(fd); +} + static struct test_case test_cases[] = { { .name = "SOCK_STREAM connection reset", @@ -1701,6 +1848,11 @@ static struct test_case test_cases[] = { .run_client = test_stream_msgzcopy_leak_errq_client, .run_server = test_stream_msgzcopy_leak_errq_server, }, + { + .name = "SOCK_STREAM MSG_ZEROCOPY leak completion skb", + .run_client = test_stream_msgzcopy_leak_zcskb_client, + .run_server = test_stream_msgzcopy_leak_zcskb_server, + }, {}, }; From d46ef4ee381f0f73b13714f319662f48f0c8b471 Mon Sep 17 00:00:00 2001 From: Divya Koppera Date: Thu, 19 Dec 2024 18:03:07 +0530 Subject: [PATCH 0958/1450] net: phy: microchip_rds_ptp: Add header file for Microchip rds ptp library This rds ptp header file will cover ptp macros for future phys in Microchip where addresses will be same but base offset and mmd address may changes. Reviewed-by: Andrew Lunn Reviewed-by: Vadim Fedorenko Signed-off-by: Divya Koppera Link: https://patch.msgid.link/20241219123311.30213-2-divya.koppera@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/microchip_rds_ptp.h | 223 ++++++++++++++++++++++++++++ 1 file changed, 223 insertions(+) create mode 100644 drivers/net/phy/microchip_rds_ptp.h diff --git a/drivers/net/phy/microchip_rds_ptp.h b/drivers/net/phy/microchip_rds_ptp.h new file mode 100644 index 0000000000000..e95c065728b5b --- /dev/null +++ b/drivers/net/phy/microchip_rds_ptp.h @@ -0,0 +1,223 @@ +/* SPDX-License-Identifier: GPL-2.0 + * Copyright (C) 2024 Microchip Technology + */ + +#ifndef _MICROCHIP_RDS_PTP_H +#define _MICROCHIP_RDS_PTP_H + +#include +#include +#include +#include +#include +#include + +#define MCHP_RDS_PTP_CMD_CTL 0x0 +#define MCHP_RDS_PTP_CMD_CTL_LTC_STEP_NSEC BIT(6) +#define MCHP_RDS_PTP_CMD_CTL_LTC_STEP_SEC BIT(5) +#define MCHP_RDS_PTP_CMD_CTL_CLOCK_LOAD BIT(4) +#define MCHP_RDS_PTP_CMD_CTL_CLOCK_READ BIT(3) +#define MCHP_RDS_PTP_CMD_CTL_EN BIT(1) +#define MCHP_RDS_PTP_CMD_CTL_DIS BIT(0) + +#define MCHP_RDS_PTP_REF_CLK_CFG 0x2 +#define MCHP_RDS_PTP_REF_CLK_SRC_250MHZ 0x0 +#define MCHP_RDS_PTP_REF_CLK_PERIOD_OVERRIDE BIT(9) +#define MCHP_RDS_PTP_REF_CLK_PERIOD 4 +#define MCHP_RDS_PTP_REF_CLK_CFG_SET (MCHP_RDS_PTP_REF_CLK_SRC_250MHZ |\ + MCHP_RDS_PTP_REF_CLK_PERIOD_OVERRIDE |\ + MCHP_RDS_PTP_REF_CLK_PERIOD) + +#define MCHP_RDS_PTP_LTC_SEC_HI 0x5 +#define MCHP_RDS_PTP_LTC_SEC_MID 0x6 +#define MCHP_RDS_PTP_LTC_SEC_LO 0x7 +#define MCHP_RDS_PTP_LTC_NS_HI 0x8 +#define MCHP_RDS_PTP_LTC_NS_LO 0x9 +#define MCHP_RDS_PTP_LTC_RATE_ADJ_HI 0xc +#define MCHP_RDS_PTP_LTC_RATE_ADJ_HI_DIR BIT(15) +#define MCHP_RDS_PTP_LTC_RATE_ADJ_LO 0xd +#define MCHP_RDS_PTP_STEP_ADJ_HI 0x12 +#define MCHP_RDS_PTP_STEP_ADJ_HI_DIR BIT(15) +#define MCHP_RDS_PTP_STEP_ADJ_LO 0x13 +#define MCHP_RDS_PTP_LTC_READ_SEC_HI 0x29 +#define MCHP_RDS_PTP_LTC_READ_SEC_MID 0x2a +#define MCHP_RDS_PTP_LTC_READ_SEC_LO 0x2b +#define MCHP_RDS_PTP_LTC_READ_NS_HI 0x2c +#define MCHP_RDS_PTP_LTC_READ_NS_LO 0x2d +#define MCHP_RDS_PTP_OP_MODE 0x41 +#define MCHP_RDS_PTP_OP_MODE_DIS 0 +#define MCHP_RDS_PTP_OP_MODE_STANDALONE 1 +#define MCHP_RDS_PTP_LATENCY_CORRECTION_CTL 0x44 +#define MCHP_RDS_PTP_PREDICTOR_EN BIT(6) +#define MCHP_RDS_PTP_TX_PRED_DIS BIT(1) +#define MCHP_RDS_PTP_RX_PRED_DIS BIT(0) +#define MCHP_RDS_PTP_LATENCY_SETTING (MCHP_RDS_PTP_PREDICTOR_EN | \ + MCHP_RDS_PTP_TX_PRED_DIS | \ + MCHP_RDS_PTP_RX_PRED_DIS) + +#define MCHP_RDS_PTP_INT_EN 0x0 +#define MCHP_RDS_PTP_INT_STS 0x01 +#define MCHP_RDS_PTP_INT_TX_TS_OVRFL_EN BIT(3) +#define MCHP_RDS_PTP_INT_TX_TS_EN BIT(2) +#define MCHP_RDS_PTP_INT_RX_TS_OVRFL_EN BIT(1) +#define MCHP_RDS_PTP_INT_RX_TS_EN BIT(0) +#define MCHP_RDS_PTP_INT_ALL_MSK (MCHP_RDS_PTP_INT_TX_TS_OVRFL_EN | \ + MCHP_RDS_PTP_INT_TX_TS_EN | \ + MCHP_RDS_PTP_INT_RX_TS_OVRFL_EN |\ + MCHP_RDS_PTP_INT_RX_TS_EN) + +#define MCHP_RDS_PTP_CAP_INFO 0x2e +#define MCHP_RDS_PTP_TX_TS_CNT(v) (((v) & GENMASK(11, 8)) >> 8) +#define MCHP_RDS_PTP_RX_TS_CNT(v) ((v) & GENMASK(3, 0)) + +#define MCHP_RDS_PTP_RX_PARSE_CONFIG 0x42 +#define MCHP_RDS_PTP_RX_PARSE_L2_ADDR_EN 0x44 +#define MCHP_RDS_PTP_RX_PARSE_IPV4_ADDR_EN 0x45 + +#define MCHP_RDS_PTP_RX_TIMESTAMP_CONFIG 0x4e +#define MCHP_RDS_PTP_RX_TIMESTAMP_CONFIG_PTP_FCS_DIS BIT(0) + +#define MCHP_RDS_PTP_RX_VERSION 0x48 +#define MCHP_RDS_PTP_RX_TIMESTAMP_EN 0x4d + +#define MCHP_RDS_PTP_RX_INGRESS_NS_HI 0x54 +#define MCHP_RDS_PTP_RX_INGRESS_NS_HI_TS_VALID BIT(15) + +#define MCHP_RDS_PTP_RX_INGRESS_NS_LO 0x55 +#define MCHP_RDS_PTP_RX_INGRESS_SEC_HI 0x56 +#define MCHP_RDS_PTP_RX_INGRESS_SEC_LO 0x57 +#define MCHP_RDS_PTP_RX_MSG_HDR2 0x59 + +#define MCHP_RDS_PTP_TX_PARSE_CONFIG 0x82 +#define MCHP_RDS_PTP_PARSE_CONFIG_LAYER2_EN BIT(0) +#define MCHP_RDS_PTP_PARSE_CONFIG_IPV4_EN BIT(1) +#define MCHP_RDS_PTP_PARSE_CONFIG_IPV6_EN BIT(2) + +#define MCHP_RDS_PTP_TX_PARSE_L2_ADDR_EN 0x84 +#define MCHP_RDS_PTP_TX_PARSE_IPV4_ADDR_EN 0x85 + +#define MCHP_RDS_PTP_TX_VERSION 0x88 +#define MCHP_RDS_PTP_MAX_VERSION(x) (((x) & GENMASK(7, 0)) << 8) +#define MCHP_RDS_PTP_MIN_VERSION(x) ((x) & GENMASK(7, 0)) + +#define MCHP_RDS_PTP_TX_TIMESTAMP_EN 0x8d +#define MCHP_RDS_PTP_TIMESTAMP_EN_SYNC BIT(0) +#define MCHP_RDS_PTP_TIMESTAMP_EN_DREQ BIT(1) +#define MCHP_RDS_PTP_TIMESTAMP_EN_PDREQ BIT(2) +#define MCHP_RDS_PTP_TIMESTAMP_EN_PDRES BIT(3) +#define MCHP_RDS_PTP_TIMESTAMP_EN_ALL (MCHP_RDS_PTP_TIMESTAMP_EN_SYNC |\ + MCHP_RDS_PTP_TIMESTAMP_EN_DREQ |\ + MCHP_RDS_PTP_TIMESTAMP_EN_PDREQ |\ + MCHP_RDS_PTP_TIMESTAMP_EN_PDRES) + +#define MCHP_RDS_PTP_TX_TIMESTAMP_CONFIG 0x8e +#define MCHP_RDS_PTP_TX_TIMESTAMP_CONFIG_PTP_FCS_DIS BIT(0) + +#define MCHP_RDS_PTP_TX_MOD 0x8f +#define MCHP_RDS_TX_MOD_PTP_SYNC_TS_INSERT BIT(12) + +#define MCHP_RDS_PTP_TX_EGRESS_NS_HI 0x94 +#define MCHP_RDS_PTP_TX_EGRESS_NS_HI_TS_VALID BIT(15) + +#define MCHP_RDS_PTP_TX_EGRESS_NS_LO 0x95 +#define MCHP_RDS_PTP_TX_EGRESS_SEC_HI 0x96 +#define MCHP_RDS_PTP_TX_EGRESS_SEC_LO 0x97 +#define MCHP_RDS_PTP_TX_MSG_HDR2 0x99 + +#define MCHP_RDS_PTP_TSU_GEN_CONFIG 0xc0 +#define MCHP_RDS_PTP_TSU_GEN_CFG_TSU_EN BIT(0) + +#define MCHP_RDS_PTP_TSU_HARD_RESET 0xc1 +#define MCHP_RDS_PTP_TSU_HARDRESET BIT(0) + +/* Represents 1ppm adjustment in 2^32 format with + * each nsec contains 4 clock cycles in 250MHz. + * The value is calculated as following: (1/1000000)/((2^-32)/4) + */ +#define MCHP_RDS_PTP_1PPM_FORMAT 17179 +#define MCHP_RDS_PTP_FIFO_SIZE 8 +#define MCHP_RDS_PTP_MAX_ADJ 31249999 + +#define BASE_CLK(p) ((p)->clk_base_addr) +#define BASE_PORT(p) ((p)->port_base_addr) +#define PTP_MMD(p) ((p)->mmd) + +enum mchp_rds_ptp_base { + MCHP_RDS_PTP_PORT, + MCHP_RDS_PTP_CLOCK +}; + +enum mchp_rds_ptp_fifo_dir { + MCHP_RDS_PTP_INGRESS_FIFO, + MCHP_RDS_PTP_EGRESS_FIFO +}; + +struct mchp_rds_ptp_clock { + struct mii_timestamper mii_ts; + struct phy_device *phydev; + struct ptp_clock *ptp_clock; + + struct sk_buff_head tx_queue; + struct sk_buff_head rx_queue; + struct list_head rx_ts_list; + + struct ptp_clock_info caps; + + /* Lock for Rx ts fifo */ + spinlock_t rx_ts_lock; + int hwts_tx_type; + + enum hwtstamp_rx_filters rx_filter; + int layer; + int version; + u16 port_base_addr; + u16 clk_base_addr; + + /* Lock for phc */ + struct mutex ptp_lock; + u8 mmd; +}; + +struct mchp_rds_ptp_rx_ts { + struct list_head list; + u32 seconds; + u32 nsec; + u16 seq_id; +}; + +#if IS_ENABLED(CONFIG_MICROCHIP_PHY_RDS_PTP) + +struct mchp_rds_ptp_clock *mchp_rds_ptp_probe(struct phy_device *phydev, u8 mmd, + u16 clk_base, u16 port_base); + +int mchp_rds_ptp_top_config_intr(struct mchp_rds_ptp_clock *clock, + u16 reg, u16 val, bool enable); + +irqreturn_t mchp_rds_ptp_handle_interrupt(struct mchp_rds_ptp_clock *clock); + +#else + +static inline struct mchp_rds_ptp_clock *mchp_rds_ptp_probe(struct phy_device + *phydev, u8 mmd, + u16 clk_base, + u16 port_base) +{ + return NULL; +} + +static inline int mchp_rds_ptp_top_config_intr(struct mchp_rds_ptp_clock *clock, + u16 reg, u16 val, bool enable) +{ + return 0; +} + +static inline irqreturn_t mchp_rds_ptp_handle_interrupt(struct + mchp_rds_ptp_clock + * clock) +{ + return IRQ_NONE; +} + +#endif //CONFIG_MICROCHIP_PHY_RDS_PTP + +#endif //_MICROCHIP_RDS_PTP_H From fa51199c5f34172fc7fd248ca9105e4e0ca6d80a Mon Sep 17 00:00:00 2001 From: Divya Koppera Date: Thu, 19 Dec 2024 18:03:08 +0530 Subject: [PATCH 0959/1450] net: phy: microchip_rds_ptp : Add rds ptp library for Microchip phys Add rds ptp library for Microchip phys 1-step and 2-step modes are supported, over Ethernet and UDP(ipv4, ipv6) Reviewed-by: Vadim Fedorenko Signed-off-by: Divya Koppera Link: https://patch.msgid.link/20241219123311.30213-3-divya.koppera@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/microchip_rds_ptp.c | 1039 +++++++++++++++++++++++++++ 1 file changed, 1039 insertions(+) create mode 100644 drivers/net/phy/microchip_rds_ptp.c diff --git a/drivers/net/phy/microchip_rds_ptp.c b/drivers/net/phy/microchip_rds_ptp.c new file mode 100644 index 0000000000000..2936e46531cfc --- /dev/null +++ b/drivers/net/phy/microchip_rds_ptp.c @@ -0,0 +1,1039 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2024 Microchip Technology + +#include "microchip_rds_ptp.h" + +static int mchp_rds_phy_read_mmd(struct mchp_rds_ptp_clock *clock, + u32 offset, enum mchp_rds_ptp_base base) +{ + struct phy_device *phydev = clock->phydev; + u32 addr; + + addr = (offset + ((base == MCHP_RDS_PTP_PORT) ? BASE_PORT(clock) : + BASE_CLK(clock))); + + return phy_read_mmd(phydev, PTP_MMD(clock), addr); +} + +static int mchp_rds_phy_write_mmd(struct mchp_rds_ptp_clock *clock, + u32 offset, enum mchp_rds_ptp_base base, + u16 val) +{ + struct phy_device *phydev = clock->phydev; + u32 addr; + + addr = (offset + ((base == MCHP_RDS_PTP_PORT) ? BASE_PORT(clock) : + BASE_CLK(clock))); + + return phy_write_mmd(phydev, PTP_MMD(clock), addr, val); +} + +static int mchp_rds_phy_modify_mmd(struct mchp_rds_ptp_clock *clock, + u32 offset, enum mchp_rds_ptp_base base, + u16 mask, u16 val) +{ + struct phy_device *phydev = clock->phydev; + u32 addr; + + addr = (offset + ((base == MCHP_RDS_PTP_PORT) ? BASE_PORT(clock) : + BASE_CLK(clock))); + + return phy_modify_mmd(phydev, PTP_MMD(clock), addr, mask, val); +} + +static int mchp_rds_phy_set_bits_mmd(struct mchp_rds_ptp_clock *clock, + u32 offset, enum mchp_rds_ptp_base base, + u16 val) +{ + struct phy_device *phydev = clock->phydev; + u32 addr; + + addr = (offset + ((base == MCHP_RDS_PTP_PORT) ? BASE_PORT(clock) : + BASE_CLK(clock))); + + return phy_set_bits_mmd(phydev, PTP_MMD(clock), addr, val); +} + +static int mchp_rds_ptp_flush_fifo(struct mchp_rds_ptp_clock *clock, + enum mchp_rds_ptp_fifo_dir dir) +{ + int rc; + + if (dir == MCHP_RDS_PTP_EGRESS_FIFO) + skb_queue_purge(&clock->tx_queue); + else + skb_queue_purge(&clock->rx_queue); + + for (int i = 0; i < MCHP_RDS_PTP_FIFO_SIZE; ++i) { + rc = mchp_rds_phy_read_mmd(clock, + dir == MCHP_RDS_PTP_EGRESS_FIFO ? + MCHP_RDS_PTP_TX_MSG_HDR2 : + MCHP_RDS_PTP_RX_MSG_HDR2, + MCHP_RDS_PTP_PORT); + if (rc < 0) + return rc; + } + return mchp_rds_phy_read_mmd(clock, MCHP_RDS_PTP_INT_STS, + MCHP_RDS_PTP_PORT); +} + +static int mchp_rds_ptp_config_intr(struct mchp_rds_ptp_clock *clock, + bool enable) +{ + /* Enable or disable ptp interrupts */ + return mchp_rds_phy_write_mmd(clock, MCHP_RDS_PTP_INT_EN, + MCHP_RDS_PTP_PORT, + enable ? MCHP_RDS_PTP_INT_ALL_MSK : 0); +} + +static void mchp_rds_ptp_txtstamp(struct mii_timestamper *mii_ts, + struct sk_buff *skb, int type) +{ + struct mchp_rds_ptp_clock *clock = container_of(mii_ts, + struct mchp_rds_ptp_clock, + mii_ts); + + switch (clock->hwts_tx_type) { + case HWTSTAMP_TX_ONESTEP_SYNC: + if (ptp_msg_is_sync(skb, type)) { + kfree_skb(skb); + return; + } + fallthrough; + case HWTSTAMP_TX_ON: + skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; + skb_queue_tail(&clock->tx_queue, skb); + break; + case HWTSTAMP_TX_OFF: + default: + kfree_skb(skb); + break; + } +} + +static bool mchp_rds_ptp_get_sig_rx(struct sk_buff *skb, u16 *sig) +{ + struct ptp_header *ptp_header; + int type; + + skb_push(skb, ETH_HLEN); + type = ptp_classify_raw(skb); + if (type == PTP_CLASS_NONE) + return false; + + ptp_header = ptp_parse_header(skb, type); + if (!ptp_header) + return false; + + skb_pull_inline(skb, ETH_HLEN); + + *sig = (__force u16)(ntohs(ptp_header->sequence_id)); + + return true; +} + +static bool mchp_rds_ptp_match_skb(struct mchp_rds_ptp_clock *clock, + struct mchp_rds_ptp_rx_ts *rx_ts) +{ + struct skb_shared_hwtstamps *shhwtstamps; + struct sk_buff *skb, *skb_tmp; + unsigned long flags; + bool rc = false; + u16 skb_sig; + + spin_lock_irqsave(&clock->rx_queue.lock, flags); + skb_queue_walk_safe(&clock->rx_queue, skb, skb_tmp) { + if (!mchp_rds_ptp_get_sig_rx(skb, &skb_sig)) + continue; + + if (skb_sig != rx_ts->seq_id) + continue; + + __skb_unlink(skb, &clock->rx_queue); + + rc = true; + break; + } + spin_unlock_irqrestore(&clock->rx_queue.lock, flags); + + if (rc) { + shhwtstamps = skb_hwtstamps(skb); + shhwtstamps->hwtstamp = ktime_set(rx_ts->seconds, rx_ts->nsec); + netif_rx(skb); + } + + return rc; +} + +static void mchp_rds_ptp_match_rx_ts(struct mchp_rds_ptp_clock *clock, + struct mchp_rds_ptp_rx_ts *rx_ts) +{ + unsigned long flags; + + /* If we failed to match the skb add it to the queue for when + * the frame will come + */ + if (!mchp_rds_ptp_match_skb(clock, rx_ts)) { + spin_lock_irqsave(&clock->rx_ts_lock, flags); + list_add(&rx_ts->list, &clock->rx_ts_list); + spin_unlock_irqrestore(&clock->rx_ts_lock, flags); + } else { + kfree(rx_ts); + } +} + +static void mchp_rds_ptp_match_rx_skb(struct mchp_rds_ptp_clock *clock, + struct sk_buff *skb) +{ + struct mchp_rds_ptp_rx_ts *rx_ts, *tmp, *rx_ts_var = NULL; + struct skb_shared_hwtstamps *shhwtstamps; + unsigned long flags; + u16 skb_sig; + + if (!mchp_rds_ptp_get_sig_rx(skb, &skb_sig)) + return; + + /* Iterate over all RX timestamps and match it with the received skbs */ + spin_lock_irqsave(&clock->rx_ts_lock, flags); + list_for_each_entry_safe(rx_ts, tmp, &clock->rx_ts_list, list) { + /* Check if we found the signature we were looking for. */ + if (skb_sig != rx_ts->seq_id) + continue; + + shhwtstamps = skb_hwtstamps(skb); + shhwtstamps->hwtstamp = ktime_set(rx_ts->seconds, rx_ts->nsec); + netif_rx(skb); + + rx_ts_var = rx_ts; + + break; + } + spin_unlock_irqrestore(&clock->rx_ts_lock, flags); + + if (rx_ts_var) { + list_del(&rx_ts_var->list); + kfree(rx_ts_var); + } else { + skb_queue_tail(&clock->rx_queue, skb); + } +} + +static bool mchp_rds_ptp_rxtstamp(struct mii_timestamper *mii_ts, + struct sk_buff *skb, int type) +{ + struct mchp_rds_ptp_clock *clock = container_of(mii_ts, + struct mchp_rds_ptp_clock, + mii_ts); + + if (clock->rx_filter == HWTSTAMP_FILTER_NONE || + type == PTP_CLASS_NONE) + return false; + + if ((type & clock->version) == 0 || (type & clock->layer) == 0) + return false; + + /* Here if match occurs skb is sent to application, If not skb is added + * to queue and sending skb to application will get handled when + * interrupt occurs i.e., it get handles in interrupt handler. By + * any means skb will reach the application so we should not return + * false here if skb doesn't matches. + */ + mchp_rds_ptp_match_rx_skb(clock, skb); + + return true; +} + +static int mchp_rds_ptp_hwtstamp(struct mii_timestamper *mii_ts, + struct kernel_hwtstamp_config *config, + struct netlink_ext_ack *extack) +{ + struct mchp_rds_ptp_clock *clock = + container_of(mii_ts, struct mchp_rds_ptp_clock, + mii_ts); + struct mchp_rds_ptp_rx_ts *rx_ts, *tmp; + int txcfg = 0, rxcfg = 0; + unsigned long flags; + int rc; + + clock->hwts_tx_type = config->tx_type; + clock->rx_filter = config->rx_filter; + + switch (config->rx_filter) { + case HWTSTAMP_FILTER_NONE: + clock->layer = 0; + clock->version = 0; + break; + case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: + case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: + clock->layer = PTP_CLASS_L4; + clock->version = PTP_CLASS_V2; + break; + case HWTSTAMP_FILTER_PTP_V2_L2_EVENT: + case HWTSTAMP_FILTER_PTP_V2_L2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ: + clock->layer = PTP_CLASS_L2; + clock->version = PTP_CLASS_V2; + break; + case HWTSTAMP_FILTER_PTP_V2_EVENT: + case HWTSTAMP_FILTER_PTP_V2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: + clock->layer = PTP_CLASS_L4 | PTP_CLASS_L2; + clock->version = PTP_CLASS_V2; + break; + default: + return -ERANGE; + } + + /* Setup parsing of the frames and enable the timestamping for ptp + * frames + */ + if (clock->layer & PTP_CLASS_L2) { + rxcfg = MCHP_RDS_PTP_PARSE_CONFIG_LAYER2_EN; + txcfg = MCHP_RDS_PTP_PARSE_CONFIG_LAYER2_EN; + } + if (clock->layer & PTP_CLASS_L4) { + rxcfg |= MCHP_RDS_PTP_PARSE_CONFIG_IPV4_EN | + MCHP_RDS_PTP_PARSE_CONFIG_IPV6_EN; + txcfg |= MCHP_RDS_PTP_PARSE_CONFIG_IPV4_EN | + MCHP_RDS_PTP_PARSE_CONFIG_IPV6_EN; + } + rc = mchp_rds_phy_write_mmd(clock, MCHP_RDS_PTP_RX_PARSE_CONFIG, + MCHP_RDS_PTP_PORT, rxcfg); + if (rc < 0) + return rc; + + rc = mchp_rds_phy_write_mmd(clock, MCHP_RDS_PTP_TX_PARSE_CONFIG, + MCHP_RDS_PTP_PORT, txcfg); + if (rc < 0) + return rc; + + rc = mchp_rds_phy_write_mmd(clock, MCHP_RDS_PTP_RX_TIMESTAMP_EN, + MCHP_RDS_PTP_PORT, + MCHP_RDS_PTP_TIMESTAMP_EN_ALL); + if (rc < 0) + return rc; + + rc = mchp_rds_phy_write_mmd(clock, MCHP_RDS_PTP_TX_TIMESTAMP_EN, + MCHP_RDS_PTP_PORT, + MCHP_RDS_PTP_TIMESTAMP_EN_ALL); + if (rc < 0) + return rc; + + if (clock->hwts_tx_type == HWTSTAMP_TX_ONESTEP_SYNC) + /* Enable / disable of the TX timestamp in the SYNC frames */ + rc = mchp_rds_phy_modify_mmd(clock, MCHP_RDS_PTP_TX_MOD, + MCHP_RDS_PTP_PORT, + MCHP_RDS_TX_MOD_PTP_SYNC_TS_INSERT, + MCHP_RDS_TX_MOD_PTP_SYNC_TS_INSERT); + else + rc = mchp_rds_phy_modify_mmd(clock, MCHP_RDS_PTP_TX_MOD, + MCHP_RDS_PTP_PORT, + MCHP_RDS_TX_MOD_PTP_SYNC_TS_INSERT, + (u16)~MCHP_RDS_TX_MOD_PTP_SYNC_TS_INSERT); + + if (rc < 0) + return rc; + + /* In case of multiple starts and stops, these needs to be cleared */ + spin_lock_irqsave(&clock->rx_ts_lock, flags); + list_for_each_entry_safe(rx_ts, tmp, &clock->rx_ts_list, list) { + list_del(&rx_ts->list); + kfree(rx_ts); + } + spin_unlock_irqrestore(&clock->rx_ts_lock, flags); + + rc = mchp_rds_ptp_flush_fifo(clock, MCHP_RDS_PTP_INGRESS_FIFO); + if (rc < 0) + return rc; + + rc = mchp_rds_ptp_flush_fifo(clock, MCHP_RDS_PTP_EGRESS_FIFO); + if (rc < 0) + return rc; + + /* Now enable the timestamping interrupts */ + rc = mchp_rds_ptp_config_intr(clock, + config->rx_filter != HWTSTAMP_FILTER_NONE); + + return rc < 0 ? rc : 0; +} + +static int mchp_rds_ptp_ts_info(struct mii_timestamper *mii_ts, + struct kernel_ethtool_ts_info *info) +{ + struct mchp_rds_ptp_clock *clock = container_of(mii_ts, + struct mchp_rds_ptp_clock, + mii_ts); + + info->phc_index = ptp_clock_index(clock->ptp_clock); + + info->so_timestamping = SOF_TIMESTAMPING_TX_HARDWARE | + SOF_TIMESTAMPING_RX_HARDWARE | + SOF_TIMESTAMPING_RAW_HARDWARE; + + info->tx_types = BIT(HWTSTAMP_TX_OFF) | BIT(HWTSTAMP_TX_ON) | + BIT(HWTSTAMP_TX_ONESTEP_SYNC); + + info->rx_filters = BIT(HWTSTAMP_FILTER_NONE) | + BIT(HWTSTAMP_FILTER_PTP_V2_L4_EVENT) | + BIT(HWTSTAMP_FILTER_PTP_V2_L2_EVENT) | + BIT(HWTSTAMP_FILTER_PTP_V2_EVENT); + + return 0; +} + +static int mchp_rds_ptp_ltc_adjtime(struct ptp_clock_info *info, s64 delta) +{ + struct mchp_rds_ptp_clock *clock = container_of(info, + struct mchp_rds_ptp_clock, + caps); + struct timespec64 ts; + bool add = true; + int rc = 0; + u32 nsec; + s32 sec; + + /* The HW allows up to 15 sec to adjust the time, but here we limit to + * 10 sec the adjustment. The reason is, in case the adjustment is 14 + * sec and 999999999 nsec, then we add 8ns to compensate the actual + * increment so the value can be bigger than 15 sec. Therefore limit the + * possible adjustments so we will not have these corner cases + */ + if (delta > 10000000000LL || delta < -10000000000LL) { + /* The timeadjustment is too big, so fall back using set time */ + u64 now; + + info->gettime64(info, &ts); + + now = ktime_to_ns(timespec64_to_ktime(ts)); + ts = ns_to_timespec64(now + delta); + + info->settime64(info, &ts); + return 0; + } + sec = div_u64_rem(abs(delta), NSEC_PER_SEC, &nsec); + if (delta < 0 && nsec != 0) { + /* It is not allowed to adjust low the nsec part, therefore + * subtract more from second part and add to nanosecond such + * that would roll over, so the second part will increase + */ + sec--; + nsec = NSEC_PER_SEC - nsec; + } + + /* Calculate the adjustments and the direction */ + if (delta < 0) + add = false; + + if (nsec > 0) { + /* add 8 ns to cover the likely normal increment */ + nsec += 8; + + if (nsec >= NSEC_PER_SEC) { + /* carry into seconds */ + sec++; + nsec -= NSEC_PER_SEC; + } + } + + mutex_lock(&clock->ptp_lock); + if (sec) { + sec = abs(sec); + + rc = mchp_rds_phy_write_mmd(clock, MCHP_RDS_PTP_STEP_ADJ_LO, + MCHP_RDS_PTP_CLOCK, sec); + if (rc < 0) + goto out_unlock; + + rc = mchp_rds_phy_set_bits_mmd(clock, MCHP_RDS_PTP_STEP_ADJ_HI, + MCHP_RDS_PTP_CLOCK, + ((add ? + MCHP_RDS_PTP_STEP_ADJ_HI_DIR : + 0) | ((sec >> 16) & + GENMASK(13, 0)))); + if (rc < 0) + goto out_unlock; + + rc = mchp_rds_phy_set_bits_mmd(clock, MCHP_RDS_PTP_CMD_CTL, + MCHP_RDS_PTP_CLOCK, + MCHP_RDS_PTP_CMD_CTL_LTC_STEP_SEC); + if (rc < 0) + goto out_unlock; + } + + if (nsec) { + rc = mchp_rds_phy_write_mmd(clock, MCHP_RDS_PTP_STEP_ADJ_LO, + MCHP_RDS_PTP_CLOCK, + nsec & GENMASK(15, 0)); + if (rc < 0) + goto out_unlock; + + rc = mchp_rds_phy_write_mmd(clock, MCHP_RDS_PTP_STEP_ADJ_HI, + MCHP_RDS_PTP_CLOCK, + (nsec >> 16) & GENMASK(13, 0)); + if (rc < 0) + goto out_unlock; + + rc = mchp_rds_phy_set_bits_mmd(clock, MCHP_RDS_PTP_CMD_CTL, + MCHP_RDS_PTP_CLOCK, + MCHP_RDS_PTP_CMD_CTL_LTC_STEP_NSEC); + } + +out_unlock: + mutex_unlock(&clock->ptp_lock); + + return rc; +} + +static int mchp_rds_ptp_ltc_adjfine(struct ptp_clock_info *info, + long scaled_ppm) +{ + struct mchp_rds_ptp_clock *clock = container_of(info, + struct mchp_rds_ptp_clock, + caps); + u16 rate_lo, rate_hi; + bool faster = true; + u32 rate; + int rc; + + if (!scaled_ppm) + return 0; + + if (scaled_ppm < 0) { + scaled_ppm = -scaled_ppm; + faster = false; + } + + rate = MCHP_RDS_PTP_1PPM_FORMAT * (upper_16_bits(scaled_ppm)); + rate += (MCHP_RDS_PTP_1PPM_FORMAT * (lower_16_bits(scaled_ppm))) >> 16; + + rate_lo = rate & GENMASK(15, 0); + rate_hi = (rate >> 16) & GENMASK(13, 0); + + if (faster) + rate_hi |= MCHP_RDS_PTP_LTC_RATE_ADJ_HI_DIR; + + mutex_lock(&clock->ptp_lock); + rc = mchp_rds_phy_write_mmd(clock, MCHP_RDS_PTP_LTC_RATE_ADJ_HI, + MCHP_RDS_PTP_CLOCK, rate_hi); + if (rc < 0) + goto error; + + rc = mchp_rds_phy_write_mmd(clock, MCHP_RDS_PTP_LTC_RATE_ADJ_LO, + MCHP_RDS_PTP_CLOCK, rate_lo); + if (rc > 0) + rc = 0; +error: + mutex_unlock(&clock->ptp_lock); + + return rc; +} + +static int mchp_rds_ptp_ltc_gettime64(struct ptp_clock_info *info, + struct timespec64 *ts) +{ + struct mchp_rds_ptp_clock *clock = container_of(info, + struct mchp_rds_ptp_clock, + caps); + time64_t secs; + int rc = 0; + s64 nsecs; + + mutex_lock(&clock->ptp_lock); + /* Set read bit to 1 to save current values of 1588 local time counter + * into PTP LTC seconds and nanoseconds registers. + */ + rc = mchp_rds_phy_set_bits_mmd(clock, MCHP_RDS_PTP_CMD_CTL, + MCHP_RDS_PTP_CLOCK, + MCHP_RDS_PTP_CMD_CTL_CLOCK_READ); + if (rc < 0) + goto out_unlock; + + /* Get LTC clock values */ + rc = mchp_rds_phy_read_mmd(clock, MCHP_RDS_PTP_LTC_READ_SEC_HI, + MCHP_RDS_PTP_CLOCK); + if (rc < 0) + goto out_unlock; + secs = rc << 16; + + rc = mchp_rds_phy_read_mmd(clock, MCHP_RDS_PTP_LTC_READ_SEC_MID, + MCHP_RDS_PTP_CLOCK); + if (rc < 0) + goto out_unlock; + secs |= rc; + secs <<= 16; + + rc = mchp_rds_phy_read_mmd(clock, MCHP_RDS_PTP_LTC_READ_SEC_LO, + MCHP_RDS_PTP_CLOCK); + if (rc < 0) + goto out_unlock; + secs |= rc; + + rc = mchp_rds_phy_read_mmd(clock, MCHP_RDS_PTP_LTC_READ_NS_HI, + MCHP_RDS_PTP_CLOCK); + if (rc < 0) + goto out_unlock; + nsecs = (rc & GENMASK(13, 0)); + nsecs <<= 16; + + rc = mchp_rds_phy_read_mmd(clock, MCHP_RDS_PTP_LTC_READ_NS_LO, + MCHP_RDS_PTP_CLOCK); + if (rc < 0) + goto out_unlock; + nsecs |= rc; + + set_normalized_timespec64(ts, secs, nsecs); + + if (rc > 0) + rc = 0; +out_unlock: + mutex_unlock(&clock->ptp_lock); + + return rc; +} + +static int mchp_rds_ptp_ltc_settime64(struct ptp_clock_info *info, + const struct timespec64 *ts) +{ + struct mchp_rds_ptp_clock *clock = container_of(info, + struct mchp_rds_ptp_clock, + caps); + int rc; + + mutex_lock(&clock->ptp_lock); + rc = mchp_rds_phy_write_mmd(clock, MCHP_RDS_PTP_LTC_SEC_LO, + MCHP_RDS_PTP_CLOCK, + lower_16_bits(ts->tv_sec)); + if (rc < 0) + goto out_unlock; + + rc = mchp_rds_phy_write_mmd(clock, MCHP_RDS_PTP_LTC_SEC_MID, + MCHP_RDS_PTP_CLOCK, + upper_16_bits(ts->tv_sec)); + if (rc < 0) + goto out_unlock; + + rc = mchp_rds_phy_write_mmd(clock, MCHP_RDS_PTP_LTC_SEC_HI, + MCHP_RDS_PTP_CLOCK, + upper_32_bits(ts->tv_sec) & GENMASK(15, 0)); + if (rc < 0) + goto out_unlock; + + rc = mchp_rds_phy_write_mmd(clock, MCHP_RDS_PTP_LTC_NS_LO, + MCHP_RDS_PTP_CLOCK, + lower_16_bits(ts->tv_nsec)); + if (rc < 0) + goto out_unlock; + + rc = mchp_rds_phy_write_mmd(clock, MCHP_RDS_PTP_LTC_NS_HI, + MCHP_RDS_PTP_CLOCK, + upper_16_bits(ts->tv_nsec) & GENMASK(13, 0)); + if (rc < 0) + goto out_unlock; + + /* Set load bit to 1 to write PTP LTC seconds and nanoseconds + * registers to 1588 local time counter. + */ + rc = mchp_rds_phy_set_bits_mmd(clock, MCHP_RDS_PTP_CMD_CTL, + MCHP_RDS_PTP_CLOCK, + MCHP_RDS_PTP_CMD_CTL_CLOCK_LOAD); + if (rc > 0) + rc = 0; +out_unlock: + mutex_unlock(&clock->ptp_lock); + + return rc; +} + +static bool mchp_rds_ptp_get_sig_tx(struct sk_buff *skb, u16 *sig) +{ + struct ptp_header *ptp_header; + int type; + + type = ptp_classify_raw(skb); + if (type == PTP_CLASS_NONE) + return false; + + ptp_header = ptp_parse_header(skb, type); + if (!ptp_header) + return false; + + *sig = (__force u16)(ntohs(ptp_header->sequence_id)); + + return true; +} + +static void mchp_rds_ptp_match_tx_skb(struct mchp_rds_ptp_clock *clock, + u32 seconds, u32 nsec, u16 seq_id) +{ + struct skb_shared_hwtstamps shhwtstamps; + struct sk_buff *skb, *skb_tmp; + unsigned long flags; + bool rc = false; + u16 skb_sig; + + spin_lock_irqsave(&clock->tx_queue.lock, flags); + skb_queue_walk_safe(&clock->tx_queue, skb, skb_tmp) { + if (!mchp_rds_ptp_get_sig_tx(skb, &skb_sig)) + continue; + + if (skb_sig != seq_id) + continue; + + __skb_unlink(skb, &clock->tx_queue); + rc = true; + break; + } + spin_unlock_irqrestore(&clock->tx_queue.lock, flags); + + if (rc) { + shhwtstamps.hwtstamp = ktime_set(seconds, nsec); + skb_complete_tx_timestamp(skb, &shhwtstamps); + } +} + +static struct mchp_rds_ptp_rx_ts + *mchp_rds_ptp_get_rx_ts(struct mchp_rds_ptp_clock *clock) +{ + struct phy_device *phydev = clock->phydev; + struct mchp_rds_ptp_rx_ts *rx_ts = NULL; + u32 sec, nsec; + int rc; + + rc = mchp_rds_phy_read_mmd(clock, MCHP_RDS_PTP_RX_INGRESS_NS_HI, + MCHP_RDS_PTP_PORT); + if (rc < 0) + goto error; + if (!(rc & MCHP_RDS_PTP_RX_INGRESS_NS_HI_TS_VALID)) { + phydev_err(phydev, "RX Timestamp is not valid!\n"); + goto error; + } + nsec = (rc & GENMASK(13, 0)) << 16; + + rc = mchp_rds_phy_read_mmd(clock, MCHP_RDS_PTP_RX_INGRESS_NS_LO, + MCHP_RDS_PTP_PORT); + if (rc < 0) + goto error; + nsec |= rc; + + rc = mchp_rds_phy_read_mmd(clock, MCHP_RDS_PTP_RX_INGRESS_SEC_HI, + MCHP_RDS_PTP_PORT); + if (rc < 0) + goto error; + sec = rc << 16; + + rc = mchp_rds_phy_read_mmd(clock, MCHP_RDS_PTP_RX_INGRESS_SEC_LO, + MCHP_RDS_PTP_PORT); + if (rc < 0) + goto error; + sec |= rc; + + rc = mchp_rds_phy_read_mmd(clock, MCHP_RDS_PTP_RX_MSG_HDR2, + MCHP_RDS_PTP_PORT); + if (rc < 0) + goto error; + + rx_ts = kmalloc(sizeof(*rx_ts), GFP_KERNEL); + if (!rx_ts) + return NULL; + + rx_ts->seconds = sec; + rx_ts->nsec = nsec; + rx_ts->seq_id = rc; + +error: + return rx_ts; +} + +static void mchp_rds_ptp_process_rx_ts(struct mchp_rds_ptp_clock *clock) +{ + int caps; + + do { + struct mchp_rds_ptp_rx_ts *rx_ts; + + rx_ts = mchp_rds_ptp_get_rx_ts(clock); + if (rx_ts) + mchp_rds_ptp_match_rx_ts(clock, rx_ts); + + caps = mchp_rds_phy_read_mmd(clock, MCHP_RDS_PTP_CAP_INFO, + MCHP_RDS_PTP_PORT); + if (caps < 0) + return; + } while (MCHP_RDS_PTP_RX_TS_CNT(caps) > 0); +} + +static bool mchp_rds_ptp_get_tx_ts(struct mchp_rds_ptp_clock *clock, + u32 *sec, u32 *nsec, u16 *seq) +{ + int rc; + + rc = mchp_rds_phy_read_mmd(clock, MCHP_RDS_PTP_TX_EGRESS_NS_HI, + MCHP_RDS_PTP_PORT); + if (rc < 0) + return false; + if (!(rc & MCHP_RDS_PTP_TX_EGRESS_NS_HI_TS_VALID)) + return false; + *nsec = (rc & GENMASK(13, 0)) << 16; + + rc = mchp_rds_phy_read_mmd(clock, MCHP_RDS_PTP_TX_EGRESS_NS_LO, + MCHP_RDS_PTP_PORT); + if (rc < 0) + return false; + *nsec = *nsec | rc; + + rc = mchp_rds_phy_read_mmd(clock, MCHP_RDS_PTP_TX_EGRESS_SEC_HI, + MCHP_RDS_PTP_PORT); + if (rc < 0) + return false; + *sec = rc << 16; + + rc = mchp_rds_phy_read_mmd(clock, MCHP_RDS_PTP_TX_EGRESS_SEC_LO, + MCHP_RDS_PTP_PORT); + if (rc < 0) + return false; + *sec = *sec | rc; + + rc = mchp_rds_phy_read_mmd(clock, MCHP_RDS_PTP_TX_MSG_HDR2, + MCHP_RDS_PTP_PORT); + if (rc < 0) + return false; + + *seq = rc; + + return true; +} + +static void mchp_rds_ptp_process_tx_ts(struct mchp_rds_ptp_clock *clock) +{ + int caps; + + do { + u32 sec, nsec; + u16 seq; + + if (mchp_rds_ptp_get_tx_ts(clock, &sec, &nsec, &seq)) + mchp_rds_ptp_match_tx_skb(clock, sec, nsec, seq); + + caps = mchp_rds_phy_read_mmd(clock, MCHP_RDS_PTP_CAP_INFO, + MCHP_RDS_PTP_PORT); + if (caps < 0) + return; + } while (MCHP_RDS_PTP_TX_TS_CNT(caps) > 0); +} + +int mchp_rds_ptp_top_config_intr(struct mchp_rds_ptp_clock *clock, + u16 reg, u16 val, bool clear) +{ + if (clear) + return phy_clear_bits_mmd(clock->phydev, PTP_MMD(clock), reg, + val); + else + return phy_set_bits_mmd(clock->phydev, PTP_MMD(clock), reg, + val); +} +EXPORT_SYMBOL_GPL(mchp_rds_ptp_top_config_intr); + +irqreturn_t mchp_rds_ptp_handle_interrupt(struct mchp_rds_ptp_clock *clock) +{ + int irq_sts; + + /* To handle rogue interrupt scenarios */ + if (!clock) + return IRQ_NONE; + + do { + irq_sts = mchp_rds_phy_read_mmd(clock, MCHP_RDS_PTP_INT_STS, + MCHP_RDS_PTP_PORT); + if (irq_sts < 0) + return IRQ_NONE; + + if (irq_sts & MCHP_RDS_PTP_INT_RX_TS_EN) + mchp_rds_ptp_process_rx_ts(clock); + + if (irq_sts & MCHP_RDS_PTP_INT_TX_TS_EN) + mchp_rds_ptp_process_tx_ts(clock); + + if (irq_sts & MCHP_RDS_PTP_INT_TX_TS_OVRFL_EN) + mchp_rds_ptp_flush_fifo(clock, + MCHP_RDS_PTP_EGRESS_FIFO); + + if (irq_sts & MCHP_RDS_PTP_INT_RX_TS_OVRFL_EN) + mchp_rds_ptp_flush_fifo(clock, + MCHP_RDS_PTP_INGRESS_FIFO); + } while (irq_sts & (MCHP_RDS_PTP_INT_RX_TS_EN | + MCHP_RDS_PTP_INT_TX_TS_EN | + MCHP_RDS_PTP_INT_TX_TS_OVRFL_EN | + MCHP_RDS_PTP_INT_RX_TS_OVRFL_EN)); + + return IRQ_HANDLED; +} +EXPORT_SYMBOL_GPL(mchp_rds_ptp_handle_interrupt); + +static int mchp_rds_ptp_init(struct mchp_rds_ptp_clock *clock) +{ + int rc; + + /* Disable PTP */ + rc = mchp_rds_phy_write_mmd(clock, MCHP_RDS_PTP_CMD_CTL, + MCHP_RDS_PTP_CLOCK, + MCHP_RDS_PTP_CMD_CTL_DIS); + if (rc < 0) + return rc; + + /* Disable TSU */ + rc = mchp_rds_phy_write_mmd(clock, MCHP_RDS_PTP_TSU_GEN_CONFIG, + MCHP_RDS_PTP_PORT, 0); + if (rc < 0) + return rc; + + /* Clear PTP interrupt status registers */ + rc = mchp_rds_phy_write_mmd(clock, MCHP_RDS_PTP_TSU_HARD_RESET, + MCHP_RDS_PTP_PORT, + MCHP_RDS_PTP_TSU_HARDRESET); + if (rc < 0) + return rc; + + /* Predictor enable */ + rc = mchp_rds_phy_write_mmd(clock, MCHP_RDS_PTP_LATENCY_CORRECTION_CTL, + MCHP_RDS_PTP_CLOCK, + MCHP_RDS_PTP_LATENCY_SETTING); + if (rc < 0) + return rc; + + /* Configure PTP operational mode */ + rc = mchp_rds_phy_write_mmd(clock, MCHP_RDS_PTP_OP_MODE, + MCHP_RDS_PTP_CLOCK, + MCHP_RDS_PTP_OP_MODE_STANDALONE); + if (rc < 0) + return rc; + + /* Reference clock configuration */ + rc = mchp_rds_phy_write_mmd(clock, MCHP_RDS_PTP_REF_CLK_CFG, + MCHP_RDS_PTP_CLOCK, + MCHP_RDS_PTP_REF_CLK_CFG_SET); + if (rc < 0) + return rc; + + /* Classifier configurations */ + rc = mchp_rds_phy_write_mmd(clock, MCHP_RDS_PTP_RX_PARSE_CONFIG, + MCHP_RDS_PTP_PORT, 0); + if (rc < 0) + return rc; + + rc = mchp_rds_phy_write_mmd(clock, MCHP_RDS_PTP_TX_PARSE_CONFIG, + MCHP_RDS_PTP_PORT, 0); + if (rc < 0) + return rc; + + rc = mchp_rds_phy_write_mmd(clock, MCHP_RDS_PTP_TX_PARSE_L2_ADDR_EN, + MCHP_RDS_PTP_PORT, 0); + if (rc < 0) + return rc; + + rc = mchp_rds_phy_write_mmd(clock, MCHP_RDS_PTP_RX_PARSE_L2_ADDR_EN, + MCHP_RDS_PTP_PORT, 0); + if (rc < 0) + return rc; + + rc = mchp_rds_phy_write_mmd(clock, MCHP_RDS_PTP_RX_PARSE_IPV4_ADDR_EN, + MCHP_RDS_PTP_PORT, 0); + if (rc < 0) + return rc; + + rc = mchp_rds_phy_write_mmd(clock, MCHP_RDS_PTP_TX_PARSE_IPV4_ADDR_EN, + MCHP_RDS_PTP_PORT, 0); + if (rc < 0) + return rc; + + rc = mchp_rds_phy_write_mmd(clock, MCHP_RDS_PTP_RX_VERSION, + MCHP_RDS_PTP_PORT, + MCHP_RDS_PTP_MAX_VERSION(0xff) | + MCHP_RDS_PTP_MIN_VERSION(0x0)); + if (rc < 0) + return rc; + + rc = mchp_rds_phy_write_mmd(clock, MCHP_RDS_PTP_TX_VERSION, + MCHP_RDS_PTP_PORT, + MCHP_RDS_PTP_MAX_VERSION(0xff) | + MCHP_RDS_PTP_MIN_VERSION(0x0)); + if (rc < 0) + return rc; + + /* Enable TSU */ + rc = mchp_rds_phy_write_mmd(clock, MCHP_RDS_PTP_TSU_GEN_CONFIG, + MCHP_RDS_PTP_PORT, + MCHP_RDS_PTP_TSU_GEN_CFG_TSU_EN); + if (rc < 0) + return rc; + + /* Enable PTP */ + return mchp_rds_phy_write_mmd(clock, MCHP_RDS_PTP_CMD_CTL, + MCHP_RDS_PTP_CLOCK, + MCHP_RDS_PTP_CMD_CTL_EN); +} + +struct mchp_rds_ptp_clock *mchp_rds_ptp_probe(struct phy_device *phydev, u8 mmd, + u16 clk_base_addr, + u16 port_base_addr) +{ + struct mchp_rds_ptp_clock *clock; + int rc; + + clock = devm_kzalloc(&phydev->mdio.dev, sizeof(*clock), GFP_KERNEL); + if (!clock) + return ERR_PTR(-ENOMEM); + + clock->port_base_addr = port_base_addr; + clock->clk_base_addr = clk_base_addr; + clock->mmd = mmd; + + mutex_init(&clock->ptp_lock); + /* Register PTP clock */ + clock->caps.owner = THIS_MODULE; + snprintf(clock->caps.name, 30, "%s", phydev->drv->name); + clock->caps.max_adj = MCHP_RDS_PTP_MAX_ADJ; + clock->caps.n_ext_ts = 0; + clock->caps.pps = 0; + clock->caps.adjfine = mchp_rds_ptp_ltc_adjfine; + clock->caps.adjtime = mchp_rds_ptp_ltc_adjtime; + clock->caps.gettime64 = mchp_rds_ptp_ltc_gettime64; + clock->caps.settime64 = mchp_rds_ptp_ltc_settime64; + clock->ptp_clock = ptp_clock_register(&clock->caps, + &phydev->mdio.dev); + if (IS_ERR(clock->ptp_clock)) + return ERR_PTR(-EINVAL); + + /* Check if PHC support is missing at the configuration level */ + if (!clock->ptp_clock) + return NULL; + + /* Initialize the SW */ + skb_queue_head_init(&clock->tx_queue); + skb_queue_head_init(&clock->rx_queue); + INIT_LIST_HEAD(&clock->rx_ts_list); + spin_lock_init(&clock->rx_ts_lock); + + clock->mii_ts.rxtstamp = mchp_rds_ptp_rxtstamp; + clock->mii_ts.txtstamp = mchp_rds_ptp_txtstamp; + clock->mii_ts.hwtstamp = mchp_rds_ptp_hwtstamp; + clock->mii_ts.ts_info = mchp_rds_ptp_ts_info; + + phydev->mii_ts = &clock->mii_ts; + + /* Timestamp selected by default to keep legacy API */ + phydev->default_timestamp = true; + + clock->phydev = phydev; + + rc = mchp_rds_ptp_init(clock); + if (rc < 0) + return ERR_PTR(rc); + + return clock; +} +EXPORT_SYMBOL_GPL(mchp_rds_ptp_probe); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("MICROCHIP PHY RDS PTP driver"); +MODULE_AUTHOR("Divya Koppera"); From 2550afc61ef54274ac6f9355df9d33dad2910d3f Mon Sep 17 00:00:00 2001 From: Divya Koppera Date: Thu, 19 Dec 2024 18:03:09 +0530 Subject: [PATCH 0960/1450] net: phy: Kconfig: Add rds ptp library support and 1588 optional flag in Microchip phys Add ptp library support in Kconfig As some of Microchip T1 phys support ptp, add dependency of 1588 optional flag in Kconfig Reviewed-by: Andrew Lunn Reviewed-by: Vadim Fedorenko Signed-off-by: Divya Koppera Link: https://patch.msgid.link/20241219123311.30213-4-divya.koppera@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/Kconfig | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index 15828f4710a94..dc625f2b3ae44 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -287,8 +287,15 @@ config MICROCHIP_PHY config MICROCHIP_T1_PHY tristate "Microchip T1 PHYs" + select MICROCHIP_PHY_RDS_PTP if NETWORK_PHY_TIMESTAMPING && \ + PTP_1588_CLOCK_OPTIONAL help - Supports the LAN87XX PHYs. + Supports the LAN8XXX PHYs. + +config MICROCHIP_PHY_RDS_PTP + tristate + help + Currently supports LAN887X T1 PHY config MICROSEMI_PHY tristate "Microsemi PHYs" From 85b39f7593e1383b235f1e9b3d943cc2e91b6b10 Mon Sep 17 00:00:00 2001 From: Divya Koppera Date: Thu, 19 Dec 2024 18:03:10 +0530 Subject: [PATCH 0961/1450] net: phy: Makefile: Add makefile support for rds ptp in Microchip phys Add makefile support for rds ptp library. Reviewed-by: Andrew Lunn Reviewed-by: Vadim Fedorenko Signed-off-by: Divya Koppera Link: https://patch.msgid.link/20241219123311.30213-5-divya.koppera@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile index e6145153e837f..39b72b4642873 100644 --- a/drivers/net/phy/Makefile +++ b/drivers/net/phy/Makefile @@ -79,6 +79,7 @@ obj-$(CONFIG_MESON_GXL_PHY) += meson-gxl.o obj-$(CONFIG_MICREL_KS8995MA) += spi_ks8995.o obj-$(CONFIG_MICREL_PHY) += micrel.o obj-$(CONFIG_MICROCHIP_PHY) += microchip.o +obj-$(CONFIG_MICROCHIP_PHY_RDS_PTP) += microchip_rds_ptp.o obj-$(CONFIG_MICROCHIP_T1_PHY) += microchip_t1.o obj-$(CONFIG_MICROCHIP_T1S_PHY) += microchip_t1s.o obj-$(CONFIG_MICROSEMI_PHY) += mscc/ From 9fc3d6fe802923b026ecac16e59c0acdd6744d5d Mon Sep 17 00:00:00 2001 From: Divya Koppera Date: Thu, 19 Dec 2024 18:03:11 +0530 Subject: [PATCH 0962/1450] net: phy: microchip_t1 : Add initialization of ptp for lan887x Add initialization of ptp for lan887x. Reviewed-by: Andrew Lunn Signed-off-by: Divya Koppera Link: https://patch.msgid.link/20241219123311.30213-6-divya.koppera@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/microchip_t1.c | 41 +++++++++++++++++++++++++++++++--- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/drivers/net/phy/microchip_t1.c b/drivers/net/phy/microchip_t1.c index b17bf6708003e..73f28463bc356 100644 --- a/drivers/net/phy/microchip_t1.c +++ b/drivers/net/phy/microchip_t1.c @@ -10,11 +10,15 @@ #include #include #include +#include "microchip_rds_ptp.h" #define PHY_ID_LAN87XX 0x0007c150 #define PHY_ID_LAN937X 0x0007c180 #define PHY_ID_LAN887X 0x0007c1f0 +#define MCHP_RDS_PTP_LTC_BASE_ADDR 0xe000 +#define MCHP_RDS_PTP_PORT_BASE_ADDR (MCHP_RDS_PTP_LTC_BASE_ADDR + 0x800) + /* External Register Control Register */ #define LAN87XX_EXT_REG_CTL (0x14) #define LAN87XX_EXT_REG_CTL_RD_CTL (0x1000) @@ -229,6 +233,7 @@ #define LAN887X_INT_STS 0xf000 #define LAN887X_INT_MSK 0xf001 +#define LAN887X_INT_MSK_P1588_MOD_INT_MSK BIT(3) #define LAN887X_INT_MSK_T1_PHY_INT_MSK BIT(2) #define LAN887X_INT_MSK_LINK_UP_MSK BIT(1) #define LAN887X_INT_MSK_LINK_DOWN_MSK BIT(0) @@ -319,6 +324,8 @@ struct lan887x_regwr_map { struct lan887x_priv { u64 stats[ARRAY_SIZE(lan887x_hw_stats)]; + struct mchp_rds_ptp_clock *clock; + bool init_done; }; static int lan937x_dsp_workaround(struct phy_device *phydev, u16 ereg, u8 bank) @@ -1269,8 +1276,19 @@ static int lan887x_get_features(struct phy_device *phydev) static int lan887x_phy_init(struct phy_device *phydev) { + struct lan887x_priv *priv = phydev->priv; int ret; + if (!priv->init_done && phy_interrupt_is_valid(phydev)) { + priv->clock = mchp_rds_ptp_probe(phydev, MDIO_MMD_VEND1, + MCHP_RDS_PTP_LTC_BASE_ADDR, + MCHP_RDS_PTP_PORT_BASE_ADDR); + if (IS_ERR(priv->clock)) + return PTR_ERR(priv->clock); + + priv->init_done = true; + } + /* Clear loopback */ ret = phy_clear_bits_mmd(phydev, MDIO_MMD_VEND1, LAN887X_MIS_CFG_REG2, @@ -1470,6 +1488,7 @@ static int lan887x_probe(struct phy_device *phydev) if (!priv) return -ENOMEM; + priv->init_done = false; phydev->priv = priv; return lan887x_phy_setup(phydev); @@ -1518,6 +1537,7 @@ static void lan887x_get_strings(struct phy_device *phydev, u8 *data) static int lan887x_config_intr(struct phy_device *phydev) { + struct lan887x_priv *priv = phydev->priv; int rc; if (phydev->interrupts == PHY_INTERRUPT_ENABLED) { @@ -1537,12 +1557,24 @@ static int lan887x_config_intr(struct phy_device *phydev) rc = phy_read_mmd(phydev, MDIO_MMD_VEND1, LAN887X_INT_STS); } + if (rc < 0) + return rc; - return rc < 0 ? rc : 0; + if (phy_is_default_hwtstamp(phydev)) { + return mchp_rds_ptp_top_config_intr(priv->clock, + LAN887X_INT_MSK, + LAN887X_INT_MSK_P1588_MOD_INT_MSK, + (phydev->interrupts == + PHY_INTERRUPT_ENABLED)); + } + + return 0; } static irqreturn_t lan887x_handle_interrupt(struct phy_device *phydev) { + struct lan887x_priv *priv = phydev->priv; + int rc = IRQ_NONE; int irq_status; irq_status = phy_read_mmd(phydev, MDIO_MMD_VEND1, LAN887X_INT_STS); @@ -1553,10 +1585,13 @@ static irqreturn_t lan887x_handle_interrupt(struct phy_device *phydev) if (irq_status & LAN887X_MX_CHIP_TOP_LINK_MSK) { phy_trigger_machine(phydev); - return IRQ_HANDLED; + rc = IRQ_HANDLED; } - return IRQ_NONE; + if (irq_status & LAN887X_INT_MSK_P1588_MOD_INT_MSK) + rc = mchp_rds_ptp_handle_interrupt(priv->clock); + + return rc; } static int lan887x_cd_reset(struct phy_device *phydev, From ddbb5ddc43ad000a984149db5af1133433938404 Mon Sep 17 00:00:00 2001 From: Rongwei Liu Date: Thu, 19 Dec 2024 19:58:31 +0200 Subject: [PATCH 0963/1450] net/mlx5: LAG, Refactor lag logic Wrap the lag pf access into two new macros: 1. ldev_for_each() 2. ldev_for_each_reverse() The maximum number of lag ports and the index to `natvie_port_num` mapping will be handled by the two new macros. Users shouldn't use the for loop anymore. Signed-off-by: Rongwei Liu Reviewed-by: Saeed Mahameed Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241219175841.1094544-2-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../ethernet/mellanox/mlx5/core/lag/debugfs.c | 13 +- .../net/ethernet/mellanox/mlx5/core/lag/lag.c | 181 +++++++++--------- .../net/ethernet/mellanox/mlx5/core/lag/lag.h | 14 +- .../net/ethernet/mellanox/mlx5/core/lag/mp.c | 24 ++- .../ethernet/mellanox/mlx5/core/lag/mpesw.c | 10 +- .../mellanox/mlx5/core/lag/port_sel.c | 16 +- 6 files changed, 137 insertions(+), 121 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/debugfs.c index f4b777d4e1086..62b6faa4276aa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/debugfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/debugfs.c @@ -105,20 +105,20 @@ static int mapping_show(struct seq_file *file, void *priv) struct mlx5_lag *ldev; bool hash = false; bool lag_active; + int i, idx = 0; int num_ports; - int i; ldev = mlx5_lag_dev(dev); mutex_lock(&ldev->lock); lag_active = __mlx5_lag_is_active(ldev); if (lag_active) { if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &ldev->mode_flags)) { - mlx5_infer_tx_enabled(&ldev->tracker, ldev->ports, ports, + mlx5_infer_tx_enabled(&ldev->tracker, ldev, ports, &num_ports); hash = true; } else { - for (i = 0; i < ldev->ports; i++) - ports[i] = ldev->v2p_map[i]; + mlx5_ldev_for_each(i, 0, ldev) + ports[idx++] = ldev->v2p_map[i]; num_ports = ldev->ports; } } @@ -144,11 +144,8 @@ static int members_show(struct seq_file *file, void *priv) ldev = mlx5_lag_dev(dev); mutex_lock(&ldev->lock); - for (i = 0; i < ldev->ports; i++) { - if (!ldev->pf[i].dev) - continue; + mlx5_ldev_for_each(i, 0, ldev) seq_printf(file, "%s\n", dev_name(ldev->pf[i].dev->device)); - } mutex_unlock(&ldev->lock); return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index 7f68468c2e759..ed539ac4fef18 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -43,10 +43,6 @@ #include "mp.h" #include "mpesw.h" -enum { - MLX5_LAG_EGRESS_PORT_1 = 1, - MLX5_LAG_EGRESS_PORT_2, -}; /* General purpose, use for short periods of time. * Beware of lock dependencies (preferably, no locks should be acquired @@ -72,7 +68,7 @@ static u8 lag_active_port_bits(struct mlx5_lag *ldev) int num_enabled; int idx; - mlx5_infer_tx_enabled(&ldev->tracker, ldev->ports, enabled_ports, + mlx5_infer_tx_enabled(&ldev->tracker, ldev, enabled_ports, &num_enabled); for (idx = 0; idx < num_enabled; idx++) active_port |= BIT_MASK(enabled_ports[idx]); @@ -113,7 +109,7 @@ static int mlx5_cmd_create_lag(struct mlx5_core_dev *dev, u8 *ports, int mode, return mlx5_cmd_exec_in(dev, create_lag, in); } -static int mlx5_cmd_modify_lag(struct mlx5_core_dev *dev, u8 num_ports, +static int mlx5_cmd_modify_lag(struct mlx5_core_dev *dev, struct mlx5_lag *ldev, u8 *ports) { u32 in[MLX5_ST_SZ_DW(modify_lag_in)] = {}; @@ -148,33 +144,31 @@ int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev) } EXPORT_SYMBOL(mlx5_cmd_destroy_vport_lag); -static void mlx5_infer_tx_disabled(struct lag_tracker *tracker, u8 num_ports, +static void mlx5_infer_tx_disabled(struct lag_tracker *tracker, struct mlx5_lag *ldev, u8 *ports, int *num_disabled) { int i; *num_disabled = 0; - for (i = 0; i < num_ports; i++) { + mlx5_ldev_for_each(i, 0, ldev) if (!tracker->netdev_state[i].tx_enabled || !tracker->netdev_state[i].link_up) ports[(*num_disabled)++] = i; - } } -void mlx5_infer_tx_enabled(struct lag_tracker *tracker, u8 num_ports, +void mlx5_infer_tx_enabled(struct lag_tracker *tracker, struct mlx5_lag *ldev, u8 *ports, int *num_enabled) { int i; *num_enabled = 0; - for (i = 0; i < num_ports; i++) { + mlx5_ldev_for_each(i, 0, ldev) if (tracker->netdev_state[i].tx_enabled && tracker->netdev_state[i].link_up) ports[(*num_enabled)++] = i; - } if (*num_enabled == 0) - mlx5_infer_tx_disabled(tracker, num_ports, ports, num_enabled); + mlx5_infer_tx_disabled(tracker, ldev, ports, num_enabled); } static void mlx5_lag_print_mapping(struct mlx5_core_dev *dev, @@ -192,7 +186,7 @@ static void mlx5_lag_print_mapping(struct mlx5_core_dev *dev, int j; if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags)) { - mlx5_infer_tx_enabled(tracker, ldev->ports, enabled_ports, + mlx5_infer_tx_enabled(tracker, ldev, enabled_ports, &num_enabled); for (i = 0; i < num_enabled; i++) { err = scnprintf(buf + written, 4, "%d, ", enabled_ports[i] + 1); @@ -203,7 +197,7 @@ static void mlx5_lag_print_mapping(struct mlx5_core_dev *dev, buf[written - 2] = 0; mlx5_core_info(dev, "lag map active ports: %s\n", buf); } else { - for (i = 0; i < ldev->ports; i++) { + mlx5_ldev_for_each(i, 0, ldev) { for (j = 0; j < ldev->buckets; j++) { idx = i * ldev->buckets + j; err = scnprintf(buf + written, 10, @@ -286,7 +280,7 @@ int mlx5_lag_dev_get_netdev_idx(struct mlx5_lag *ldev, { int i; - for (i = 0; i < ldev->ports; i++) + mlx5_ldev_for_each(i, 0, ldev) if (ldev->pf[i].netdev == ndev) return i; @@ -310,7 +304,7 @@ static bool __mlx5_lag_is_sriov(struct mlx5_lag *ldev) * with mapping that points to active ports. */ static void mlx5_infer_tx_affinity_mapping(struct lag_tracker *tracker, - u8 num_ports, + struct mlx5_lag *ldev, u8 buckets, u8 *ports) { @@ -323,7 +317,7 @@ static void mlx5_infer_tx_affinity_mapping(struct lag_tracker *tracker, int i; int j; - for (i = 0; i < num_ports; i++) { + mlx5_ldev_for_each(i, 0, ldev) { if (tracker->netdev_state[i].tx_enabled && tracker->netdev_state[i].link_up) enabled[enabled_ports_num++] = i; @@ -334,15 +328,16 @@ static void mlx5_infer_tx_affinity_mapping(struct lag_tracker *tracker, /* Use native mapping by default where each port's buckets * point the native port: 1 1 1 .. 1 2 2 2 ... 2 3 3 3 ... 3 etc */ - for (i = 0; i < num_ports; i++) + mlx5_ldev_for_each(i, 0, ldev) { for (j = 0; j < buckets; j++) { idx = i * buckets + j; - ports[idx] = MLX5_LAG_EGRESS_PORT_1 + i; + ports[idx] = i + 1; } + } /* If all ports are disabled/enabled keep native mapping */ - if (enabled_ports_num == num_ports || - disabled_ports_num == num_ports) + if (enabled_ports_num == ldev->ports || + disabled_ports_num == ldev->ports) return; /* Go over the disabled ports and for each assign a random active port */ @@ -358,7 +353,7 @@ static bool mlx5_lag_has_drop_rule(struct mlx5_lag *ldev) { int i; - for (i = 0; i < ldev->ports; i++) + mlx5_ldev_for_each(i, 0, ldev) if (ldev->pf[i].has_drop) return true; return false; @@ -368,7 +363,7 @@ static void mlx5_lag_drop_rule_cleanup(struct mlx5_lag *ldev) { int i; - for (i = 0; i < ldev->ports; i++) { + mlx5_ldev_for_each(i, 0, ldev) { if (!ldev->pf[i].has_drop) continue; @@ -396,7 +391,7 @@ static void mlx5_lag_drop_rule_setup(struct mlx5_lag *ldev, if (!ldev->tracker.has_inactive) return; - mlx5_infer_tx_disabled(tracker, ldev->ports, disabled_ports, &num_disabled); + mlx5_infer_tx_disabled(tracker, ldev, disabled_ports, &num_disabled); for (i = 0; i < num_disabled; i++) { disabled_index = disabled_ports[i]; @@ -442,7 +437,7 @@ static int _mlx5_modify_lag(struct mlx5_lag *ldev, u8 *ports) return mlx5_cmd_modify_active_port(dev0, active_ports); } - return mlx5_cmd_modify_lag(dev0, ldev->ports, ports); + return mlx5_cmd_modify_lag(dev0, ldev, ports); } static struct net_device *mlx5_lag_active_backup_get_netdev(struct mlx5_core_dev *dev) @@ -458,7 +453,7 @@ static struct net_device *mlx5_lag_active_backup_get_netdev(struct mlx5_core_dev if (!ldev) goto unlock; - for (i = 0; i < ldev->ports; i++) + mlx5_ldev_for_each(i, 0, ldev) if (ldev->tracker.netdev_state[i].tx_enabled) ndev = ldev->pf[i].netdev; if (!ndev) @@ -483,9 +478,9 @@ void mlx5_modify_lag(struct mlx5_lag *ldev, int i; int j; - mlx5_infer_tx_affinity_mapping(tracker, ldev->ports, ldev->buckets, ports); + mlx5_infer_tx_affinity_mapping(tracker, ldev, ldev->buckets, ports); - for (i = 0; i < ldev->ports; i++) { + mlx5_ldev_for_each(i, 0, ldev) { for (j = 0; j < ldev->buckets; j++) { idx = i * ldev->buckets + j; if (ports[idx] == ldev->v2p_map[idx]) @@ -596,9 +591,9 @@ static int mlx5_lag_create_single_fdb(struct mlx5_lag *ldev) struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev; struct mlx5_eswitch *master_esw = dev0->priv.eswitch; int err; - int i; + int i, j; - for (i = MLX5_LAG_P1 + 1; i < ldev->ports; i++) { + mlx5_ldev_for_each(i, 1, ldev) { struct mlx5_eswitch *slave_esw = ldev->pf[i].dev->priv.eswitch; err = mlx5_eswitch_offloads_single_fdb_add_one(master_esw, @@ -608,9 +603,9 @@ static int mlx5_lag_create_single_fdb(struct mlx5_lag *ldev) } return 0; err: - for (; i > MLX5_LAG_P1; i--) + mlx5_ldev_for_each_reverse(j, i, 1, ldev) mlx5_eswitch_offloads_single_fdb_del_one(master_esw, - ldev->pf[i].dev->priv.eswitch); + ldev->pf[j].dev->priv.eswitch); return err; } @@ -671,7 +666,7 @@ int mlx5_activate_lag(struct mlx5_lag *ldev, return err; if (mode != MLX5_LAG_MODE_MPESW) { - mlx5_infer_tx_affinity_mapping(tracker, ldev->ports, ldev->buckets, ldev->v2p_map); + mlx5_infer_tx_affinity_mapping(tracker, ldev, ldev->buckets, ldev->v2p_map); if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags)) { err = mlx5_lag_port_sel_create(ldev, tracker->hash_type, ldev->v2p_map); @@ -722,7 +717,7 @@ int mlx5_deactivate_lag(struct mlx5_lag *ldev) mlx5_lag_mp_reset(ldev); if (test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &flags)) { - for (i = MLX5_LAG_P1 + 1; i < ldev->ports; i++) + mlx5_ldev_for_each(i, 1, ldev) mlx5_eswitch_offloads_single_fdb_del_one(master_esw, ldev->pf[i].dev->priv.eswitch); clear_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &flags); @@ -766,7 +761,7 @@ bool mlx5_lag_check_prereq(struct mlx5_lag *ldev) return false; #ifdef CONFIG_MLX5_ESWITCH - for (i = 0; i < ldev->ports; i++) { + mlx5_ldev_for_each(i, 0, ldev) { dev = ldev->pf[i].dev; if (mlx5_eswitch_num_vfs(dev->priv.eswitch) && !is_mdev_switchdev_mode(dev)) return false; @@ -774,17 +769,17 @@ bool mlx5_lag_check_prereq(struct mlx5_lag *ldev) dev = ldev->pf[MLX5_LAG_P1].dev; mode = mlx5_eswitch_mode(dev); - for (i = 0; i < ldev->ports; i++) + mlx5_ldev_for_each(i, 0, ldev) if (mlx5_eswitch_mode(ldev->pf[i].dev) != mode) return false; #else - for (i = 0; i < ldev->ports; i++) + mlx5_ldev_for_each(i, 0, ldev) if (mlx5_sriov_is_enabled(ldev->pf[i].dev)) return false; #endif roce_support = mlx5_get_roce_state(ldev->pf[MLX5_LAG_P1].dev); - for (i = 1; i < ldev->ports; i++) + mlx5_ldev_for_each(i, MLX5_LAG_P2, ldev) if (mlx5_get_roce_state(ldev->pf[i].dev) != roce_support) return false; @@ -795,10 +790,7 @@ void mlx5_lag_add_devices(struct mlx5_lag *ldev) { int i; - for (i = 0; i < ldev->ports; i++) { - if (!ldev->pf[i].dev) - continue; - + mlx5_ldev_for_each(i, 0, ldev) { if (ldev->pf[i].dev->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV) continue; @@ -812,10 +804,7 @@ void mlx5_lag_remove_devices(struct mlx5_lag *ldev) { int i; - for (i = 0; i < ldev->ports; i++) { - if (!ldev->pf[i].dev) - continue; - + mlx5_ldev_for_each(i, 0, ldev) { if (ldev->pf[i].dev->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV) continue; @@ -842,7 +831,7 @@ void mlx5_disable_lag(struct mlx5_lag *ldev) dev0->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV; mlx5_rescan_drivers_locked(dev0); } - for (i = 1; i < ldev->ports; i++) + mlx5_ldev_for_each(i, MLX5_LAG_P2, ldev) mlx5_nic_vport_disable_roce(ldev->pf[i].dev); } @@ -854,7 +843,7 @@ void mlx5_disable_lag(struct mlx5_lag *ldev) mlx5_lag_add_devices(ldev); if (shared_fdb) - for (i = 0; i < ldev->ports; i++) + mlx5_ldev_for_each(i, 0, ldev) if (!(ldev->pf[i].dev->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)) mlx5_eswitch_reload_ib_reps(ldev->pf[i].dev->priv.eswitch); } @@ -864,7 +853,7 @@ static bool mlx5_shared_fdb_supported(struct mlx5_lag *ldev) struct mlx5_core_dev *dev; int i; - for (i = MLX5_LAG_P1 + 1; i < ldev->ports; i++) { + mlx5_ldev_for_each(i, MLX5_LAG_P1 + 1, ldev) { dev = ldev->pf[i].dev; if (is_mdev_switchdev_mode(dev) && mlx5_eswitch_vport_match_metadata_enabled(dev->priv.eswitch) && @@ -892,11 +881,11 @@ static bool mlx5_lag_is_roce_lag(struct mlx5_lag *ldev) bool roce_lag = true; int i; - for (i = 0; i < ldev->ports; i++) + mlx5_ldev_for_each(i, 0, ldev) roce_lag = roce_lag && !mlx5_sriov_is_enabled(ldev->pf[i].dev); #ifdef CONFIG_MLX5_ESWITCH - for (i = 0; i < ldev->ports; i++) + mlx5_ldev_for_each(i, 0, ldev) roce_lag = roce_lag && is_mdev_legacy_mode(ldev->pf[i].dev); #endif @@ -956,7 +945,7 @@ static void mlx5_do_bond(struct mlx5_lag *ldev) } else if (roce_lag) { dev0->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV; mlx5_rescan_drivers_locked(dev0); - for (i = 1; i < ldev->ports; i++) { + mlx5_ldev_for_each(i, MLX5_LAG_P2, ldev) { if (mlx5_get_roce_state(ldev->pf[i].dev)) mlx5_nic_vport_enable_roce(ldev->pf[i].dev); } @@ -966,7 +955,7 @@ static void mlx5_do_bond(struct mlx5_lag *ldev) dev0->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV; mlx5_rescan_drivers_locked(dev0); - for (i = 0; i < ldev->ports; i++) { + mlx5_ldev_for_each(i, 0, ldev) { err = mlx5_eswitch_reload_ib_reps(ldev->pf[i].dev->priv.eswitch); if (err) break; @@ -977,7 +966,7 @@ static void mlx5_do_bond(struct mlx5_lag *ldev) mlx5_rescan_drivers_locked(dev0); mlx5_deactivate_lag(ldev); mlx5_lag_add_devices(ldev); - for (i = 0; i < ldev->ports; i++) + mlx5_ldev_for_each(i, 0, ldev) mlx5_eswitch_reload_ib_reps(ldev->pf[i].dev->priv.eswitch); mlx5_core_err(dev0, "Failed to enable lag\n"); return; @@ -1010,12 +999,9 @@ struct mlx5_devcom_comp_dev *mlx5_lag_get_devcom_comp(struct mlx5_lag *ldev) int i; mutex_lock(&ldev->lock); - for (i = 0; i < ldev->ports; i++) { - if (ldev->pf[i].dev) { - devcom = ldev->pf[i].dev->priv.hca_devcom_comp; - break; - } - } + i = mlx5_get_next_ldev_func(ldev, 0); + if (i < MLX5_MAX_PORTS) + devcom = ldev->pf[i].dev->priv.hca_devcom_comp; mutex_unlock(&ldev->lock); return devcom; } @@ -1068,7 +1054,7 @@ static int mlx5_handle_changeupper_event(struct mlx5_lag *ldev, u8 bond_status = 0; int num_slaves = 0; int changed = 0; - int idx; + int i, idx = -1; if (!netif_is_lag_master(upper)) return 0; @@ -1083,8 +1069,13 @@ static int mlx5_handle_changeupper_event(struct mlx5_lag *ldev, */ rcu_read_lock(); for_each_netdev_in_bond_rcu(upper, ndev_tmp) { - idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev_tmp); - if (idx >= 0) { + mlx5_ldev_for_each(i, 0, ldev) { + if (ldev->pf[i].netdev == ndev_tmp) { + idx++; + break; + } + } + if (i < MLX5_MAX_PORTS) { slave = bond_slave_get_rcu(ndev_tmp); if (slave) has_inactive |= bond_is_slave_inactive(slave); @@ -1234,15 +1225,12 @@ static int mlx5_lag_netdev_event(struct notifier_block *this, } static void mlx5_ldev_add_netdev(struct mlx5_lag *ldev, - struct mlx5_core_dev *dev, - struct net_device *netdev) + struct mlx5_core_dev *dev, + struct net_device *netdev) { unsigned int fn = mlx5_get_dev_index(dev); unsigned long flags; - if (fn >= ldev->ports) - return; - spin_lock_irqsave(&lag_lock, flags); ldev->pf[fn].netdev = netdev; ldev->tracker.netdev_state[fn].link_up = 0; @@ -1257,7 +1245,7 @@ static void mlx5_ldev_remove_netdev(struct mlx5_lag *ldev, int i; spin_lock_irqsave(&lag_lock, flags); - for (i = 0; i < ldev->ports; i++) { + mlx5_ldev_for_each(i, 0, ldev) { if (ldev->pf[i].netdev == netdev) { ldev->pf[i].netdev = NULL; break; @@ -1267,13 +1255,10 @@ static void mlx5_ldev_remove_netdev(struct mlx5_lag *ldev, } static void mlx5_ldev_add_mdev(struct mlx5_lag *ldev, - struct mlx5_core_dev *dev) + struct mlx5_core_dev *dev) { unsigned int fn = mlx5_get_dev_index(dev); - if (fn >= ldev->ports) - return; - ldev->pf[fn].dev = dev; dev->priv.lag = ldev; } @@ -1281,16 +1266,13 @@ static void mlx5_ldev_add_mdev(struct mlx5_lag *ldev, static void mlx5_ldev_remove_mdev(struct mlx5_lag *ldev, struct mlx5_core_dev *dev) { - int i; + int fn; - for (i = 0; i < ldev->ports; i++) - if (ldev->pf[i].dev == dev) - break; - - if (i == ldev->ports) + fn = mlx5_get_dev_index(dev); + if (ldev->pf[fn].dev != dev) return; - ldev->pf[i].dev = NULL; + ldev->pf[fn].dev = NULL; dev->priv.lag = NULL; } @@ -1406,7 +1388,6 @@ void mlx5_lag_add_netdev(struct mlx5_core_dev *dev, mutex_lock(&ldev->lock); mlx5_ldev_add_netdev(ldev, dev, netdev); - for (i = 0; i < ldev->ports; i++) if (!ldev->pf[i].netdev) break; @@ -1417,6 +1398,26 @@ void mlx5_lag_add_netdev(struct mlx5_core_dev *dev, mlx5_queue_bond_work(ldev, 0); } +int mlx5_get_pre_ldev_func(struct mlx5_lag *ldev, int start_idx, int end_idx) +{ + int i; + + for (i = start_idx; i >= end_idx; i--) + if (ldev->pf[i].dev) + return i; + return -1; +} + +int mlx5_get_next_ldev_func(struct mlx5_lag *ldev, int start_idx) +{ + int i; + + for (i = start_idx; i < MLX5_MAX_PORTS; i++) + if (ldev->pf[i].dev) + return i; + return MLX5_MAX_PORTS; +} + bool mlx5_lag_is_roce(struct mlx5_core_dev *dev) { struct mlx5_lag *ldev; @@ -1467,7 +1468,7 @@ bool mlx5_lag_is_master(struct mlx5_core_dev *dev) { struct mlx5_lag *ldev; unsigned long flags; - bool res; + bool res = false; spin_lock_irqsave(&lag_lock, flags); ldev = mlx5_lag_dev(dev); @@ -1555,7 +1556,7 @@ u8 mlx5_lag_get_slave_port(struct mlx5_core_dev *dev, if (!(ldev && __mlx5_lag_is_roce(ldev))) goto unlock; - for (i = 0; i < ldev->ports; i++) { + mlx5_ldev_for_each(i, 0, ldev) { if (ldev->pf[i].netdev == slave) { port = i; break; @@ -1594,13 +1595,13 @@ struct mlx5_core_dev *mlx5_lag_get_next_peer_mdev(struct mlx5_core_dev *dev, int if (!ldev) goto unlock; - if (*i == ldev->ports) + if (*i == MLX5_MAX_PORTS) goto unlock; - for (idx = *i; idx < ldev->ports; idx++) + mlx5_ldev_for_each(idx, *i, ldev) if (ldev->pf[idx].dev != dev) break; - if (idx == ldev->ports) { + if (idx == MLX5_MAX_PORTS) { *i = idx; goto unlock; } @@ -1621,10 +1622,10 @@ int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev, { int outlen = MLX5_ST_SZ_BYTES(query_cong_statistics_out); struct mlx5_core_dev **mdev; + int ret = 0, i, j, idx = 0; struct mlx5_lag *ldev; unsigned long flags; int num_ports; - int ret, i, j; void *out; out = kvzalloc(outlen, GFP_KERNEL); @@ -1643,8 +1644,8 @@ int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev, ldev = mlx5_lag_dev(dev); if (ldev && __mlx5_lag_is_active(ldev)) { num_ports = ldev->ports; - for (i = 0; i < ldev->ports; i++) - mdev[i] = ldev->pf[i].dev; + mlx5_ldev_for_each(i, 0, ldev) + mdev[idx++] = ldev->pf[i].dev; } else { num_ports = 1; mdev[MLX5_LAG_P1] = dev; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h index 50fcb1eee5748..1dada791815e4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h @@ -103,7 +103,7 @@ int mlx5_lag_dev_get_netdev_idx(struct mlx5_lag *ldev, struct net_device *ndev); char *mlx5_get_str_port_sel_mode(enum mlx5_lag_mode mode, unsigned long flags); -void mlx5_infer_tx_enabled(struct lag_tracker *tracker, u8 num_ports, +void mlx5_infer_tx_enabled(struct lag_tracker *tracker, struct mlx5_lag *ldev, u8 *ports, int *num_enabled); void mlx5_ldev_add_debugfs(struct mlx5_core_dev *dev); @@ -119,9 +119,21 @@ static inline bool mlx5_lag_is_supported(struct mlx5_core_dev *dev) if (!MLX5_CAP_GEN(dev, vport_group_manager) || !MLX5_CAP_GEN(dev, lag_master) || MLX5_CAP_GEN(dev, num_lag_ports) < 2 || + mlx5_get_dev_index(dev) >= MLX5_MAX_PORTS || MLX5_CAP_GEN(dev, num_lag_ports) > MLX5_MAX_PORTS) return false; return true; } +#define mlx5_ldev_for_each(i, start_index, ldev) \ + for (int tmp = start_index; tmp = mlx5_get_next_ldev_func(ldev, tmp), \ + i = tmp, tmp < MLX5_MAX_PORTS; tmp++) + +#define mlx5_ldev_for_each_reverse(i, start_index, end_index, ldev) \ + for (int tmp = start_index, tmp1 = end_index; \ + tmp = mlx5_get_pre_ldev_func(ldev, tmp, tmp1), \ + i = tmp, tmp >= tmp1; tmp--) + +int mlx5_get_pre_ldev_func(struct mlx5_lag *ldev, int start_idx, int end_idx); +int mlx5_get_next_ldev_func(struct mlx5_lag *ldev, int start_idx); #endif /* __MLX5_LAG_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c index b1aa494c76ba8..40406d04adc98 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c @@ -153,6 +153,7 @@ static void mlx5_lag_fib_route_event(struct mlx5_lag *ldev, unsigned long event, struct net_device *nh_dev0, *nh_dev1; struct fib_info *fi = fen_info->fi; struct lag_mp *mp = &ldev->lag_mp; + int i, dev_idx = 0; /* Handle delete event */ if (event == FIB_EVENT_ENTRY_DEL) { @@ -186,10 +187,12 @@ static void mlx5_lag_fib_route_event(struct mlx5_lag *ldev, unsigned long event, if (!nh_dev1) { if (__mlx5_lag_is_active(ldev)) { - int i = mlx5_lag_dev_get_netdev_idx(ldev, nh_dev0); - - i++; - mlx5_lag_set_port_affinity(ldev, i); + mlx5_ldev_for_each(i, 0, ldev) { + dev_idx++; + if (ldev->pf[i].netdev == nh_dev0) + break; + } + mlx5_lag_set_port_affinity(ldev, dev_idx); mlx5_lag_fib_set(mp, fi, fen_info->dst, fen_info->dst_len); } @@ -214,6 +217,7 @@ static void mlx5_lag_fib_nexthop_event(struct mlx5_lag *ldev, struct fib_info *fi) { struct lag_mp *mp = &ldev->lag_mp; + int i, dev_idx = 0; /* Check the nh event is related to the route */ if (!mp->fib.mfi || mp->fib.mfi != fi) @@ -221,11 +225,15 @@ static void mlx5_lag_fib_nexthop_event(struct mlx5_lag *ldev, /* nh added/removed */ if (event == FIB_EVENT_NH_DEL) { - int i = mlx5_lag_dev_get_netdev_idx(ldev, fib_nh->fib_nh_dev); + mlx5_ldev_for_each(i, 0, ldev) { + if (ldev->pf[i].netdev == fib_nh->fib_nh_dev) + break; + dev_idx++; + } - if (i >= 0) { - i = (i + 1) % 2 + 1; /* peer port */ - mlx5_lag_set_port_affinity(ldev, i); + if (dev_idx >= 0) { + dev_idx = (dev_idx + 1) % 2 + 1; /* peer port */ + mlx5_lag_set_port_affinity(ldev, dev_idx); } } else if (event == FIB_EVENT_NH_ADD && fib_info_num_path(fi) == 2) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c index 571ea26edd0ca..1123c8afcf9ef 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c @@ -15,7 +15,7 @@ static void mlx5_mpesw_metadata_cleanup(struct mlx5_lag *ldev) u32 pf_metadata; int i; - for (i = 0; i < ldev->ports; i++) { + mlx5_ldev_for_each(i, 0, ldev) { dev = ldev->pf[i].dev; esw = dev->priv.eswitch; pf_metadata = ldev->lag_mpesw.pf_metadata[i]; @@ -36,7 +36,7 @@ static int mlx5_mpesw_metadata_set(struct mlx5_lag *ldev) u32 pf_metadata; int i, err; - for (i = 0; i < ldev->ports; i++) { + mlx5_ldev_for_each(i, 0, ldev) { dev = ldev->pf[i].dev; esw = dev->priv.eswitch; pf_metadata = mlx5_esw_match_metadata_alloc(esw); @@ -52,7 +52,7 @@ static int mlx5_mpesw_metadata_set(struct mlx5_lag *ldev) goto err_metadata; } - for (i = 0; i < ldev->ports; i++) { + mlx5_ldev_for_each(i, 0, ldev) { dev = ldev->pf[i].dev; mlx5_notifier_call_chain(dev->priv.events, MLX5_DEV_EVENT_MULTIPORT_ESW, (void *)0); @@ -98,7 +98,7 @@ static int enable_mpesw(struct mlx5_lag *ldev) dev0->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV; mlx5_rescan_drivers_locked(dev0); - for (i = 0; i < ldev->ports; i++) { + mlx5_ldev_for_each(i, 0, ldev) { err = mlx5_eswitch_reload_ib_reps(ldev->pf[i].dev->priv.eswitch); if (err) goto err_rescan_drivers; @@ -112,7 +112,7 @@ static int enable_mpesw(struct mlx5_lag *ldev) mlx5_deactivate_lag(ldev); err_add_devices: mlx5_lag_add_devices(ldev); - for (i = 0; i < ldev->ports; i++) + mlx5_ldev_for_each(i, 0, ldev) mlx5_eswitch_reload_ib_reps(ldev->pf[i].dev->priv.eswitch); mlx5_mpesw_metadata_cleanup(ldev); return err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c index ab2717012b79b..f98f0735fce0f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c @@ -44,9 +44,7 @@ static int mlx5_lag_create_port_sel_table(struct mlx5_lag *ldev, struct mlx5_flow_destination dest = {}; MLX5_DECLARE_FLOW_ACT(flow_act); struct mlx5_flow_namespace *ns; - int err, i; - int idx; - int j; + int err, i, j, k, idx; ft_attr.max_fte = ldev->ports * ldev->buckets; ft_attr.level = MLX5_LAG_FT_LEVEL_DEFINER; @@ -74,7 +72,7 @@ static int mlx5_lag_create_port_sel_table(struct mlx5_lag *ldev, dest.type = MLX5_FLOW_DESTINATION_TYPE_UPLINK; dest.vport.flags |= MLX5_FLOW_DEST_VPORT_VHCA_ID; flow_act.flags |= FLOW_ACT_NO_APPEND; - for (i = 0; i < ldev->ports; i++) { + mlx5_ldev_for_each(i, 0, ldev) { for (j = 0; j < ldev->buckets; j++) { u8 affinity; @@ -88,13 +86,13 @@ static int mlx5_lag_create_port_sel_table(struct mlx5_lag *ldev, &dest, 1); if (IS_ERR(lag_definer->rules[idx])) { err = PTR_ERR(lag_definer->rules[idx]); - do { + mlx5_ldev_for_each_reverse(k, i, 0, ldev) { while (j--) { - idx = i * ldev->buckets + j; + idx = k * ldev->buckets + j; mlx5_del_flow_rules(lag_definer->rules[idx]); } j = ldev->buckets; - } while (i--); + }; goto destroy_fg; } } @@ -346,7 +344,7 @@ static void mlx5_lag_destroy_definer(struct mlx5_lag *ldev, int i; int j; - for (i = 0; i < ldev->ports; i++) { + mlx5_ldev_for_each(i, 0, ldev) { for (j = 0; j < ldev->buckets; j++) { idx = i * ldev->buckets + j; mlx5_del_flow_rules(lag_definer->rules[idx]); @@ -565,7 +563,7 @@ static int __mlx5_lag_modify_definers_destinations(struct mlx5_lag *ldev, dest.type = MLX5_FLOW_DESTINATION_TYPE_UPLINK; dest.vport.flags |= MLX5_FLOW_DEST_VPORT_VHCA_ID; - for (i = 0; i < ldev->ports; i++) { + mlx5_ldev_for_each(i, 0, ldev) { for (j = 0; j < ldev->buckets; j++) { idx = i * ldev->buckets + j; if (ldev->v2p_map[idx] == ports[idx]) From 60d01cc468fdb0fbd6b878d66ef86f7e946b0669 Mon Sep 17 00:00:00 2001 From: Rongwei Liu Date: Thu, 19 Dec 2024 19:58:32 +0200 Subject: [PATCH 0964/1450] net/mlx5: LAG, Support LAG over Multi-Host NICs New multi-host NICs provide each host with partial ports, allowing each host to maintain its unique LAG configuration. On these multi-host NICs, the 'native_port_num' capability is no longer continuous on each host and can exceed the 'num_lag_ports' capability. Therefore, it is necessary to skip the PFs with ldev->pf[i].dev == NULL when querying/modifying the lag devices' information. There is no need to check dev.native_port_num against ldev->ports. Signed-off-by: Rongwei Liu Reviewed-by: Saeed Mahameed Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241219175841.1094544-3-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/lag/lag.c | 200 ++++++++++++++---- .../net/ethernet/mellanox/mlx5/core/lag/lag.h | 3 + .../net/ethernet/mellanox/mlx5/core/lag/mp.c | 53 +++-- .../ethernet/mellanox/mlx5/core/lag/mpesw.c | 6 +- .../mellanox/mlx5/core/lag/port_sel.c | 41 +++- .../net/ethernet/mellanox/mlx5/core/main.c | 4 + 6 files changed, 229 insertions(+), 78 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index ed539ac4fef18..cea5aa314f6c5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -76,23 +76,30 @@ static u8 lag_active_port_bits(struct mlx5_lag *ldev) return active_port; } -static int mlx5_cmd_create_lag(struct mlx5_core_dev *dev, u8 *ports, int mode, - unsigned long flags) +static int mlx5_cmd_create_lag(struct mlx5_core_dev *dev, struct mlx5_lag *ldev, + int mode, unsigned long flags) { bool fdb_sel_mode = test_bit(MLX5_LAG_MODE_FLAG_FDB_SEL_MODE_NATIVE, &flags); int port_sel_mode = get_port_sel_mode(mode, flags); u32 in[MLX5_ST_SZ_DW(create_lag_in)] = {}; + u8 *ports = ldev->v2p_map; + int idx0, idx1; void *lag_ctx; lag_ctx = MLX5_ADDR_OF(create_lag_in, in, ctx); MLX5_SET(create_lag_in, in, opcode, MLX5_CMD_OP_CREATE_LAG); MLX5_SET(lagc, lag_ctx, fdb_selection_mode, fdb_sel_mode); + idx0 = mlx5_lag_get_dev_index_by_seq(ldev, 0); + idx1 = mlx5_lag_get_dev_index_by_seq(ldev, 1); + + if (idx0 < 0 || idx1 < 0) + return -EINVAL; switch (port_sel_mode) { case MLX5_LAG_PORT_SELECT_MODE_QUEUE_AFFINITY: - MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, ports[0]); - MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, ports[1]); + MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, ports[idx0]); + MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, ports[idx1]); break; case MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_FT: if (!MLX5_CAP_PORT_SELECTION(dev, port_select_flow_table_bypass)) @@ -114,12 +121,18 @@ static int mlx5_cmd_modify_lag(struct mlx5_core_dev *dev, struct mlx5_lag *ldev, { u32 in[MLX5_ST_SZ_DW(modify_lag_in)] = {}; void *lag_ctx = MLX5_ADDR_OF(modify_lag_in, in, ctx); + int idx0, idx1; + + idx0 = mlx5_lag_get_dev_index_by_seq(ldev, 0); + idx1 = mlx5_lag_get_dev_index_by_seq(ldev, 1); + if (idx0 < 0 || idx1 < 0) + return -EINVAL; MLX5_SET(modify_lag_in, in, opcode, MLX5_CMD_OP_MODIFY_LAG); MLX5_SET(modify_lag_in, in, field_select, 0x1); - MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, ports[0]); - MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, ports[1]); + MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, ports[idx0]); + MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, ports[idx1]); return mlx5_cmd_exec_in(dev, modify_lag, in); } @@ -287,6 +300,48 @@ int mlx5_lag_dev_get_netdev_idx(struct mlx5_lag *ldev, return -ENOENT; } +int mlx5_lag_get_dev_index_by_seq(struct mlx5_lag *ldev, int seq) +{ + int i, num = 0; + + if (!ldev) + return -ENOENT; + + mlx5_ldev_for_each(i, 0, ldev) { + if (num == seq) + return i; + num++; + } + return -ENOENT; +} + +int mlx5_lag_num_devs(struct mlx5_lag *ldev) +{ + int i, num = 0; + + if (!ldev) + return 0; + + mlx5_ldev_for_each(i, 0, ldev) { + (void)i; + num++; + } + return num; +} + +int mlx5_lag_num_netdevs(struct mlx5_lag *ldev) +{ + int i, num = 0; + + if (!ldev) + return 0; + + mlx5_ldev_for_each(i, 0, ldev) + if (ldev->pf[i].netdev) + num++; + return num; +} + static bool __mlx5_lag_is_roce(struct mlx5_lag *ldev) { return ldev->mode == MLX5_LAG_MODE_ROCE; @@ -423,10 +478,15 @@ static int mlx5_cmd_modify_active_port(struct mlx5_core_dev *dev, u8 ports) static int _mlx5_modify_lag(struct mlx5_lag *ldev, u8 *ports) { - struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev; + int idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); + struct mlx5_core_dev *dev0; u8 active_ports; int ret; + if (idx < 0) + return -EINVAL; + + dev0 = ldev->pf[idx].dev; if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &ldev->mode_flags)) { ret = mlx5_lag_port_sel_modify(ldev, ports); if (ret || @@ -445,7 +505,7 @@ static struct net_device *mlx5_lag_active_backup_get_netdev(struct mlx5_core_dev struct net_device *ndev = NULL; struct mlx5_lag *ldev; unsigned long flags; - int i; + int i, last_idx; spin_lock_irqsave(&lag_lock, flags); ldev = mlx5_lag_dev(dev); @@ -456,8 +516,12 @@ static struct net_device *mlx5_lag_active_backup_get_netdev(struct mlx5_core_dev mlx5_ldev_for_each(i, 0, ldev) if (ldev->tracker.netdev_state[i].tx_enabled) ndev = ldev->pf[i].netdev; - if (!ndev) - ndev = ldev->pf[ldev->ports - 1].netdev; + if (!ndev) { + last_idx = mlx5_lag_get_dev_index_by_seq(ldev, ldev->ports - 1); + if (last_idx < 0) + goto unlock; + ndev = ldev->pf[last_idx].netdev; + } if (ndev) dev_hold(ndev); @@ -471,13 +535,18 @@ static struct net_device *mlx5_lag_active_backup_get_netdev(struct mlx5_core_dev void mlx5_modify_lag(struct mlx5_lag *ldev, struct lag_tracker *tracker) { + int first_idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); u8 ports[MLX5_MAX_PORTS * MLX5_LAG_MAX_HASH_BUCKETS] = {}; - struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev; + struct mlx5_core_dev *dev0; int idx; int err; int i; int j; + if (first_idx < 0) + return; + + dev0 = ldev->pf[first_idx].dev; mlx5_infer_tx_affinity_mapping(tracker, ldev, ldev->buckets, ports); mlx5_ldev_for_each(i, 0, ldev) { @@ -518,8 +587,13 @@ void mlx5_modify_lag(struct mlx5_lag *ldev, static int mlx5_lag_set_port_sel_mode_roce(struct mlx5_lag *ldev, unsigned long *flags) { - struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev; + int first_idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); + struct mlx5_core_dev *dev0; + if (first_idx < 0) + return -EINVAL; + + dev0 = ldev->pf[first_idx].dev; if (!MLX5_CAP_PORT_SELECTION(dev0, port_select_flow_table)) { if (ldev->ports > 2) return -EINVAL; @@ -539,11 +613,13 @@ static void mlx5_lag_set_port_sel_mode_offloads(struct mlx5_lag *ldev, enum mlx5_lag_mode mode, unsigned long *flags) { - struct lag_func *dev0 = &ldev->pf[MLX5_LAG_P1]; + int first_idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); + struct lag_func *dev0; - if (mode == MLX5_LAG_MODE_MPESW) + if (first_idx < 0 || mode == MLX5_LAG_MODE_MPESW) return; + dev0 = &ldev->pf[first_idx]; if (MLX5_CAP_PORT_SELECTION(dev0->dev, port_select_flow_table) && tracker->tx_type == NETDEV_LAG_TX_TYPE_HASH) { if (ldev->ports > 2) @@ -588,12 +664,18 @@ char *mlx5_get_str_port_sel_mode(enum mlx5_lag_mode mode, unsigned long flags) static int mlx5_lag_create_single_fdb(struct mlx5_lag *ldev) { - struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev; - struct mlx5_eswitch *master_esw = dev0->priv.eswitch; - int err; + int first_idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); + struct mlx5_eswitch *master_esw; + struct mlx5_core_dev *dev0; int i, j; + int err; - mlx5_ldev_for_each(i, 1, ldev) { + if (first_idx < 0) + return -EINVAL; + + dev0 = ldev->pf[first_idx].dev; + master_esw = dev0->priv.eswitch; + mlx5_ldev_for_each(i, first_idx + 1, ldev) { struct mlx5_eswitch *slave_esw = ldev->pf[i].dev->priv.eswitch; err = mlx5_eswitch_offloads_single_fdb_add_one(master_esw, @@ -603,7 +685,7 @@ static int mlx5_lag_create_single_fdb(struct mlx5_lag *ldev) } return 0; err: - mlx5_ldev_for_each_reverse(j, i, 1, ldev) + mlx5_ldev_for_each_reverse(j, i, first_idx + 1, ldev) mlx5_eswitch_offloads_single_fdb_del_one(master_esw, ldev->pf[j].dev->priv.eswitch); return err; @@ -615,16 +697,21 @@ static int mlx5_create_lag(struct mlx5_lag *ldev, unsigned long flags) { bool shared_fdb = test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &flags); - struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev; + int first_idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); u32 in[MLX5_ST_SZ_DW(destroy_lag_in)] = {}; + struct mlx5_core_dev *dev0; int err; + if (first_idx < 0) + return -EINVAL; + + dev0 = ldev->pf[first_idx].dev; if (tracker) mlx5_lag_print_mapping(dev0, ldev, tracker, flags); mlx5_core_info(dev0, "shared_fdb:%d mode:%s\n", shared_fdb, mlx5_get_str_port_sel_mode(mode, flags)); - err = mlx5_cmd_create_lag(dev0, ldev->v2p_map, mode, flags); + err = mlx5_cmd_create_lag(dev0, ldev, mode, flags); if (err) { mlx5_core_err(dev0, "Failed to create LAG (%d)\n", @@ -656,11 +743,16 @@ int mlx5_activate_lag(struct mlx5_lag *ldev, enum mlx5_lag_mode mode, bool shared_fdb) { + int first_idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); bool roce_lag = mode == MLX5_LAG_MODE_ROCE; - struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev; + struct mlx5_core_dev *dev0; unsigned long flags = 0; int err; + if (first_idx < 0) + return -EINVAL; + + dev0 = ldev->pf[first_idx].dev; err = mlx5_lag_set_flags(ldev, mode, tracker, shared_fdb, &flags); if (err) return err; @@ -704,20 +796,26 @@ int mlx5_activate_lag(struct mlx5_lag *ldev, int mlx5_deactivate_lag(struct mlx5_lag *ldev) { - struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev; - struct mlx5_eswitch *master_esw = dev0->priv.eswitch; + int first_idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); u32 in[MLX5_ST_SZ_DW(destroy_lag_in)] = {}; bool roce_lag = __mlx5_lag_is_roce(ldev); unsigned long flags = ldev->mode_flags; + struct mlx5_eswitch *master_esw; + struct mlx5_core_dev *dev0; int err; int i; + if (first_idx < 0) + return -EINVAL; + + dev0 = ldev->pf[first_idx].dev; + master_esw = dev0->priv.eswitch; ldev->mode = MLX5_LAG_MODE_NONE; ldev->mode_flags = 0; mlx5_lag_mp_reset(ldev); if (test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &flags)) { - mlx5_ldev_for_each(i, 1, ldev) + mlx5_ldev_for_each(i, first_idx + 1, ldev) mlx5_eswitch_offloads_single_fdb_del_one(master_esw, ldev->pf[i].dev->priv.eswitch); clear_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &flags); @@ -749,6 +847,7 @@ int mlx5_deactivate_lag(struct mlx5_lag *ldev) bool mlx5_lag_check_prereq(struct mlx5_lag *ldev) { + int first_idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); #ifdef CONFIG_MLX5_ESWITCH struct mlx5_core_dev *dev; u8 mode; @@ -756,9 +855,8 @@ bool mlx5_lag_check_prereq(struct mlx5_lag *ldev) bool roce_support; int i; - for (i = 0; i < ldev->ports; i++) - if (!ldev->pf[i].dev) - return false; + if (first_idx < 0 || mlx5_lag_num_devs(ldev) != ldev->ports) + return false; #ifdef CONFIG_MLX5_ESWITCH mlx5_ldev_for_each(i, 0, ldev) { @@ -767,7 +865,7 @@ bool mlx5_lag_check_prereq(struct mlx5_lag *ldev) return false; } - dev = ldev->pf[MLX5_LAG_P1].dev; + dev = ldev->pf[first_idx].dev; mode = mlx5_eswitch_mode(dev); mlx5_ldev_for_each(i, 0, ldev) if (mlx5_eswitch_mode(ldev->pf[i].dev) != mode) @@ -778,8 +876,8 @@ bool mlx5_lag_check_prereq(struct mlx5_lag *ldev) if (mlx5_sriov_is_enabled(ldev->pf[i].dev)) return false; #endif - roce_support = mlx5_get_roce_state(ldev->pf[MLX5_LAG_P1].dev); - mlx5_ldev_for_each(i, MLX5_LAG_P2, ldev) + roce_support = mlx5_get_roce_state(ldev->pf[first_idx].dev); + mlx5_ldev_for_each(i, first_idx + 1, ldev) if (mlx5_get_roce_state(ldev->pf[i].dev) != roce_support) return false; @@ -817,11 +915,16 @@ void mlx5_lag_remove_devices(struct mlx5_lag *ldev) void mlx5_disable_lag(struct mlx5_lag *ldev) { bool shared_fdb = test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &ldev->mode_flags); - struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev; + int idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); + struct mlx5_core_dev *dev0; bool roce_lag; int err; int i; + if (idx < 0) + return; + + dev0 = ldev->pf[idx].dev; roce_lag = __mlx5_lag_is_roce(ldev); if (shared_fdb) { @@ -831,7 +934,7 @@ void mlx5_disable_lag(struct mlx5_lag *ldev) dev0->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV; mlx5_rescan_drivers_locked(dev0); } - mlx5_ldev_for_each(i, MLX5_LAG_P2, ldev) + mlx5_ldev_for_each(i, idx + 1, ldev) mlx5_nic_vport_disable_roce(ldev->pf[i].dev); } @@ -850,10 +953,14 @@ void mlx5_disable_lag(struct mlx5_lag *ldev) static bool mlx5_shared_fdb_supported(struct mlx5_lag *ldev) { + int idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); struct mlx5_core_dev *dev; int i; - mlx5_ldev_for_each(i, MLX5_LAG_P1 + 1, ldev) { + if (idx < 0) + return false; + + mlx5_ldev_for_each(i, idx + 1, ldev) { dev = ldev->pf[i].dev; if (is_mdev_switchdev_mode(dev) && mlx5_eswitch_vport_match_metadata_enabled(dev->priv.eswitch) && @@ -865,7 +972,7 @@ static bool mlx5_shared_fdb_supported(struct mlx5_lag *ldev) return false; } - dev = ldev->pf[MLX5_LAG_P1].dev; + dev = ldev->pf[idx].dev; if (is_mdev_switchdev_mode(dev) && mlx5_eswitch_vport_match_metadata_enabled(dev->priv.eswitch) && mlx5_esw_offloads_devcom_is_ready(dev->priv.eswitch) && @@ -906,13 +1013,18 @@ static bool mlx5_lag_should_disable_lag(struct mlx5_lag *ldev, bool do_bond) static void mlx5_do_bond(struct mlx5_lag *ldev) { - struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev; + int idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); struct lag_tracker tracker = { }; + struct mlx5_core_dev *dev0; struct net_device *ndev; bool do_bond, roce_lag; int err; int i; + if (idx < 0) + return; + + dev0 = ldev->pf[idx].dev; if (!mlx5_lag_is_ready(ldev)) { do_bond = false; } else { @@ -945,7 +1057,7 @@ static void mlx5_do_bond(struct mlx5_lag *ldev) } else if (roce_lag) { dev0->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV; mlx5_rescan_drivers_locked(dev0); - mlx5_ldev_for_each(i, MLX5_LAG_P2, ldev) { + mlx5_ldev_for_each(i, idx + 1, ldev) { if (mlx5_get_roce_state(ldev->pf[i].dev)) mlx5_nic_vport_enable_roce(ldev->pf[i].dev); } @@ -1380,7 +1492,7 @@ void mlx5_lag_add_netdev(struct mlx5_core_dev *dev, struct net_device *netdev) { struct mlx5_lag *ldev; - int i; + int num = 0; ldev = mlx5_lag_dev(dev); if (!ldev) @@ -1388,11 +1500,8 @@ void mlx5_lag_add_netdev(struct mlx5_core_dev *dev, mutex_lock(&ldev->lock); mlx5_ldev_add_netdev(ldev, dev, netdev); - for (i = 0; i < ldev->ports; i++) - if (!ldev->pf[i].netdev) - break; - - if (i >= ldev->ports) + num = mlx5_lag_num_netdevs(ldev); + if (num >= ldev->ports) set_bit(MLX5_LAG_FLAG_NDEVS_READY, &ldev->state_flags); mutex_unlock(&ldev->lock); mlx5_queue_bond_work(ldev, 0); @@ -1469,11 +1578,12 @@ bool mlx5_lag_is_master(struct mlx5_core_dev *dev) struct mlx5_lag *ldev; unsigned long flags; bool res = false; + int idx; spin_lock_irqsave(&lag_lock, flags); ldev = mlx5_lag_dev(dev); - res = ldev && __mlx5_lag_is_active(ldev) && - dev == ldev->pf[MLX5_LAG_P1].dev; + idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); + res = ldev && __mlx5_lag_is_active(ldev) && idx >= 0 && dev == ldev->pf[idx].dev; spin_unlock_irqrestore(&lag_lock, flags); return res; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h index 1dada791815e4..01cf723669471 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h @@ -136,4 +136,7 @@ static inline bool mlx5_lag_is_supported(struct mlx5_core_dev *dev) int mlx5_get_pre_ldev_func(struct mlx5_lag *ldev, int start_idx, int end_idx); int mlx5_get_next_ldev_func(struct mlx5_lag *ldev, int start_idx); +int mlx5_lag_get_dev_index_by_seq(struct mlx5_lag *ldev, int seq); +int mlx5_lag_num_devs(struct mlx5_lag *ldev); +int mlx5_lag_num_netdevs(struct mlx5_lag *ldev); #endif /* __MLX5_LAG_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c index 40406d04adc98..aee17fcf3b36c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c @@ -17,7 +17,10 @@ static bool __mlx5_lag_is_multipath(struct mlx5_lag *ldev) #define MLX5_LAG_MULTIPATH_OFFLOADS_SUPPORTED_PORTS 2 static bool mlx5_lag_multipath_check_prereq(struct mlx5_lag *ldev) { - if (!mlx5_lag_is_ready(ldev)) + int idx0 = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); + int idx1 = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P2); + + if (idx0 < 0 || idx1 < 0 || !mlx5_lag_is_ready(ldev)) return false; if (__mlx5_lag_is_active(ldev) && !__mlx5_lag_is_multipath(ldev)) @@ -26,8 +29,8 @@ static bool mlx5_lag_multipath_check_prereq(struct mlx5_lag *ldev) if (ldev->ports > MLX5_LAG_MULTIPATH_OFFLOADS_SUPPORTED_PORTS) return false; - return mlx5_esw_multipath_prereq(ldev->pf[MLX5_LAG_P1].dev, - ldev->pf[MLX5_LAG_P2].dev); + return mlx5_esw_multipath_prereq(ldev->pf[idx0].dev, + ldev->pf[idx1].dev); } bool mlx5_lag_is_multipath(struct mlx5_core_dev *dev) @@ -50,43 +53,45 @@ bool mlx5_lag_is_multipath(struct mlx5_core_dev *dev) static void mlx5_lag_set_port_affinity(struct mlx5_lag *ldev, enum mlx5_lag_port_affinity port) { + int idx0 = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); + int idx1 = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P2); struct lag_tracker tracker = {}; - if (!__mlx5_lag_is_multipath(ldev)) + if (idx0 < 0 || idx1 < 0 || !__mlx5_lag_is_multipath(ldev)) return; switch (port) { case MLX5_LAG_NORMAL_AFFINITY: - tracker.netdev_state[MLX5_LAG_P1].tx_enabled = true; - tracker.netdev_state[MLX5_LAG_P2].tx_enabled = true; - tracker.netdev_state[MLX5_LAG_P1].link_up = true; - tracker.netdev_state[MLX5_LAG_P2].link_up = true; + tracker.netdev_state[idx0].tx_enabled = true; + tracker.netdev_state[idx1].tx_enabled = true; + tracker.netdev_state[idx0].link_up = true; + tracker.netdev_state[idx1].link_up = true; break; case MLX5_LAG_P1_AFFINITY: - tracker.netdev_state[MLX5_LAG_P1].tx_enabled = true; - tracker.netdev_state[MLX5_LAG_P1].link_up = true; - tracker.netdev_state[MLX5_LAG_P2].tx_enabled = false; - tracker.netdev_state[MLX5_LAG_P2].link_up = false; + tracker.netdev_state[idx0].tx_enabled = true; + tracker.netdev_state[idx0].link_up = true; + tracker.netdev_state[idx1].tx_enabled = false; + tracker.netdev_state[idx1].link_up = false; break; case MLX5_LAG_P2_AFFINITY: - tracker.netdev_state[MLX5_LAG_P1].tx_enabled = false; - tracker.netdev_state[MLX5_LAG_P1].link_up = false; - tracker.netdev_state[MLX5_LAG_P2].tx_enabled = true; - tracker.netdev_state[MLX5_LAG_P2].link_up = true; + tracker.netdev_state[idx0].tx_enabled = false; + tracker.netdev_state[idx0].link_up = false; + tracker.netdev_state[idx1].tx_enabled = true; + tracker.netdev_state[idx1].link_up = true; break; default: - mlx5_core_warn(ldev->pf[MLX5_LAG_P1].dev, + mlx5_core_warn(ldev->pf[idx0].dev, "Invalid affinity port %d", port); return; } - if (tracker.netdev_state[MLX5_LAG_P1].tx_enabled) - mlx5_notifier_call_chain(ldev->pf[MLX5_LAG_P1].dev->priv.events, + if (tracker.netdev_state[idx0].tx_enabled) + mlx5_notifier_call_chain(ldev->pf[idx0].dev->priv.events, MLX5_DEV_EVENT_PORT_AFFINITY, (void *)0); - if (tracker.netdev_state[MLX5_LAG_P2].tx_enabled) - mlx5_notifier_call_chain(ldev->pf[MLX5_LAG_P2].dev->priv.events, + if (tracker.netdev_state[idx1].tx_enabled) + mlx5_notifier_call_chain(ldev->pf[idx1].dev->priv.events, MLX5_DEV_EVENT_PORT_AFFINITY, (void *)0); @@ -150,11 +155,15 @@ mlx5_lag_get_next_fib_dev(struct mlx5_lag *ldev, static void mlx5_lag_fib_route_event(struct mlx5_lag *ldev, unsigned long event, struct fib_entry_notifier_info *fen_info) { + int idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); struct net_device *nh_dev0, *nh_dev1; struct fib_info *fi = fen_info->fi; struct lag_mp *mp = &ldev->lag_mp; int i, dev_idx = 0; + if (idx < 0) + return; + /* Handle delete event */ if (event == FIB_EVENT_ENTRY_DEL) { /* stop track */ @@ -180,7 +189,7 @@ static void mlx5_lag_fib_route_event(struct mlx5_lag *ldev, unsigned long event, } if (nh_dev0 == nh_dev1) { - mlx5_core_warn(ldev->pf[MLX5_LAG_P1].dev, + mlx5_core_warn(ldev->pf[idx].dev, "Multipath offload doesn't support routes with multiple nexthops of the same device"); return; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c index 1123c8afcf9ef..ffac0bd6c8952 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c @@ -68,13 +68,15 @@ static int mlx5_mpesw_metadata_set(struct mlx5_lag *ldev) #define MLX5_LAG_MPESW_OFFLOADS_SUPPORTED_PORTS 4 static int enable_mpesw(struct mlx5_lag *ldev) { - struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev; + int idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); + struct mlx5_core_dev *dev0; int err; int i; - if (ldev->mode != MLX5_LAG_MODE_NONE) + if (idx < 0 || ldev->mode != MLX5_LAG_MODE_NONE) return -EINVAL; + dev0 = ldev->pf[idx].dev; if (ldev->ports > MLX5_LAG_MPESW_OFFLOADS_SUPPORTED_PORTS) return -EOPNOTSUPP; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c index f98f0735fce0f..22241f52716ce 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c @@ -39,13 +39,18 @@ static int mlx5_lag_create_port_sel_table(struct mlx5_lag *ldev, struct mlx5_lag_definer *lag_definer, u8 *ports) { - struct mlx5_core_dev *dev = ldev->pf[MLX5_LAG_P1].dev; + int first_idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_flow_destination dest = {}; MLX5_DECLARE_FLOW_ACT(flow_act); struct mlx5_flow_namespace *ns; + struct mlx5_core_dev *dev; int err, i, j, k, idx; + if (first_idx < 0) + return -EINVAL; + + dev = ldev->pf[first_idx].dev; ft_attr.max_fte = ldev->ports * ldev->buckets; ft_attr.level = MLX5_LAG_FT_LEVEL_DEFINER; @@ -293,11 +298,16 @@ static struct mlx5_lag_definer * mlx5_lag_create_definer(struct mlx5_lag *ldev, enum netdev_lag_hash hash, enum mlx5_traffic_types tt, bool tunnel, u8 *ports) { - struct mlx5_core_dev *dev = ldev->pf[MLX5_LAG_P1].dev; + int first_idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); struct mlx5_lag_definer *lag_definer; + struct mlx5_core_dev *dev; u32 *match_definer_mask; int format_id, err; + if (first_idx < 0) + return ERR_PTR(-EINVAL); + + dev = ldev->pf[first_idx].dev; lag_definer = kzalloc(sizeof(*lag_definer), GFP_KERNEL); if (!lag_definer) return ERR_PTR(-ENOMEM); @@ -339,12 +349,15 @@ mlx5_lag_create_definer(struct mlx5_lag *ldev, enum netdev_lag_hash hash, static void mlx5_lag_destroy_definer(struct mlx5_lag *ldev, struct mlx5_lag_definer *lag_definer) { - struct mlx5_core_dev *dev = ldev->pf[MLX5_LAG_P1].dev; - int idx; - int i; - int j; + int first_idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); + struct mlx5_core_dev *dev; + int idx, i, j; - mlx5_ldev_for_each(i, 0, ldev) { + if (first_idx < 0) + return; + + dev = ldev->pf[first_idx].dev; + mlx5_ldev_for_each(i, first_idx, ldev) { for (j = 0; j < ldev->buckets; j++) { idx = i * ldev->buckets + j; mlx5_del_flow_rules(lag_definer->rules[idx]); @@ -499,10 +512,15 @@ static void mlx5_lag_set_outer_ttc_params(struct mlx5_lag *ldev, static int mlx5_lag_create_ttc_table(struct mlx5_lag *ldev) { - struct mlx5_core_dev *dev = ldev->pf[MLX5_LAG_P1].dev; + int first_idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); struct mlx5_lag_port_sel *port_sel = &ldev->port_sel; struct ttc_params ttc_params = {}; + struct mlx5_core_dev *dev; + if (first_idx < 0) + return -EINVAL; + + dev = ldev->pf[first_idx].dev; mlx5_lag_set_outer_ttc_params(ldev, &ttc_params); port_sel->outer.ttc = mlx5_create_ttc_table(dev, &ttc_params); return PTR_ERR_OR_ZERO(port_sel->outer.ttc); @@ -510,10 +528,15 @@ static int mlx5_lag_create_ttc_table(struct mlx5_lag *ldev) static int mlx5_lag_create_inner_ttc_table(struct mlx5_lag *ldev) { - struct mlx5_core_dev *dev = ldev->pf[MLX5_LAG_P1].dev; + int first_idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); struct mlx5_lag_port_sel *port_sel = &ldev->port_sel; struct ttc_params ttc_params = {}; + struct mlx5_core_dev *dev; + + if (first_idx < 0) + return -EINVAL; + dev = ldev->pf[first_idx].dev; mlx5_lag_set_inner_ttc_params(ldev, &ttc_params); port_sel->inner.ttc = mlx5_create_inner_ttc_table(dev, &ttc_params); return PTR_ERR_OR_ZERO(port_sel->inner.ttc); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 220a9ac75c8ba..869bfecdd8ffa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -664,6 +664,10 @@ static int handle_hca_cap(struct mlx5_core_dev *dev, void *set_ctx) MLX5_SET(cmd_hca_cap, set_hca_cap, log_max_current_uc_list, ilog2(max_uc_list)); + /* enable absolute native port num */ + if (MLX5_CAP_GEN_MAX(dev, abs_native_port_num)) + MLX5_SET(cmd_hca_cap, set_hca_cap, abs_native_port_num, 1); + return set_caps(dev, set_ctx, MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE); } From 95f68e06b41b9e88291796efa3969409d13fdd4c Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Thu, 19 Dec 2024 19:58:33 +0200 Subject: [PATCH 0965/1450] net/mlx5: fs, add counter object to flow destination Currently mlx5_flow_destination includes counter_id which is assigned in case we use flow counter on the flow steering rule. However, counter_id is not enough data in case of using HW Steering. Thus, have mlx5_fc object as part of mlx5_flow_destination instead of counter_id and assign it where needed. In case counter_id is received from user space, create a local counter object to represent it. Signed-off-by: Moshe Shemesh Reviewed-by: Yevgeny Kliteynik Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241219175841.1094544-4-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/infiniband/hw/mlx5/fs.c | 37 +++++++++---- .../mellanox/mlx5/core/diag/fs_tracepoint.h | 2 +- .../mellanox/mlx5/core/en_accel/ipsec_fs.c | 20 +++---- .../net/ethernet/mellanox/mlx5/core/en_tc.c | 2 +- .../mellanox/mlx5/core/esw/acl/egress_lgcy.c | 2 +- .../mellanox/mlx5/core/esw/acl/ingress_lgcy.c | 2 +- .../ethernet/mellanox/mlx5/core/esw/bridge.c | 20 +++---- .../mellanox/mlx5/core/eswitch_offloads.c | 2 +- .../net/ethernet/mellanox/mlx5/core/fs_cmd.c | 2 +- .../net/ethernet/mellanox/mlx5/core/fs_core.c | 1 + .../ethernet/mellanox/mlx5/core/fs_counters.c | 53 +++++++++++++++++++ .../mellanox/mlx5/core/lib/macsec_fs.c | 8 +-- .../mellanox/mlx5/core/steering/sws/fs_dr.c | 2 +- drivers/vdpa/mlx5/net/mlx5_vnet.c | 4 +- include/linux/mlx5/fs.h | 4 +- 15 files changed, 117 insertions(+), 44 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c index 520034acf73aa..162814ae8cb4e 100644 --- a/drivers/infiniband/hw/mlx5/fs.c +++ b/drivers/infiniband/hw/mlx5/fs.c @@ -943,7 +943,7 @@ int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num, } dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; - dst.counter_id = mlx5_fc_id(opfc->fc); + dst.counter = opfc->fc; flow_act.action = MLX5_FLOW_CONTEXT_ACTION_COUNT | MLX5_FLOW_CONTEXT_ACTION_ALLOW; @@ -1113,8 +1113,8 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev, handler->ibcounters = flow_act.counters; dest_arr[dest_num].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; - dest_arr[dest_num].counter_id = - mlx5_fc_id(mcounters->hw_cntrs_hndl); + dest_arr[dest_num].counter = + mcounters->hw_cntrs_hndl; dest_num++; } @@ -1603,7 +1603,7 @@ static bool raw_fs_is_multicast(struct mlx5_ib_flow_matcher *fs_matcher, static struct mlx5_ib_flow_handler *raw_fs_rule_add( struct mlx5_ib_dev *dev, struct mlx5_ib_flow_matcher *fs_matcher, struct mlx5_flow_context *flow_context, struct mlx5_flow_act *flow_act, - u32 counter_id, void *cmd_in, int inlen, int dest_id, int dest_type) + struct mlx5_fc *counter, void *cmd_in, int inlen, int dest_id, int dest_type) { struct mlx5_flow_destination *dst; struct mlx5_ib_flow_prio *ft_prio; @@ -1652,8 +1652,12 @@ static struct mlx5_ib_flow_handler *raw_fs_rule_add( } if (flow_act->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { + if (WARN_ON(!counter)) { + err = -EINVAL; + goto unlock; + } dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; - dst[dst_num].counter_id = counter_id; + dst[dst_num].counter = counter; dst_num++; } @@ -1878,7 +1882,8 @@ static int get_dests(struct uverbs_attr_bundle *attrs, return 0; } -static bool is_flow_counter(void *obj, u32 offset, u32 *counter_id) +static bool +is_flow_counter(void *obj, u32 offset, u32 *counter_id, u32 *fc_bulk_size) { struct devx_obj *devx_obj = obj; u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, devx_obj->dinbox, opcode); @@ -1888,6 +1893,7 @@ static bool is_flow_counter(void *obj, u32 offset, u32 *counter_id) if (offset && offset >= devx_obj->flow_counter_bulk_size) return false; + *fc_bulk_size = devx_obj->flow_counter_bulk_size; *counter_id = MLX5_GET(dealloc_flow_counter_in, devx_obj->dinbox, flow_counter_id); @@ -1904,13 +1910,13 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( { struct mlx5_flow_context flow_context = {.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG}; - u32 *offset_attr, offset = 0, counter_id = 0; int dest_id, dest_type = -1, inlen, len, ret, i; struct mlx5_ib_flow_handler *flow_handler; struct mlx5_ib_flow_matcher *fs_matcher; struct ib_uobject **arr_flow_actions; struct ib_uflow_resources *uflow_res; struct mlx5_flow_act flow_act = {}; + struct mlx5_fc *counter = NULL; struct ib_qp *qp = NULL; void *devx_obj, *cmd_in; struct ib_uobject *uobj; @@ -1937,6 +1943,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( len = uverbs_attr_get_uobjs_arr(attrs, MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX, &arr_flow_actions); if (len) { + u32 *offset_attr, fc_bulk_size, offset = 0, counter_id = 0; devx_obj = arr_flow_actions[0]->object; if (uverbs_attr_is_valid(attrs, @@ -1956,8 +1963,11 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( offset = *offset_attr; } - if (!is_flow_counter(devx_obj, offset, &counter_id)) + if (!is_flow_counter(devx_obj, offset, &counter_id, &fc_bulk_size)) return -EINVAL; + counter = mlx5_fc_local_create(counter_id, offset, fc_bulk_size); + if (IS_ERR(counter)) + return PTR_ERR(counter); flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; } @@ -1968,8 +1978,10 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE); uflow_res = flow_resources_alloc(MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS); - if (!uflow_res) - return -ENOMEM; + if (!uflow_res) { + ret = -ENOMEM; + goto destroy_counter; + } len = uverbs_attr_get_uobjs_arr(attrs, MLX5_IB_ATTR_CREATE_FLOW_ARR_FLOW_ACTIONS, &arr_flow_actions); @@ -1996,7 +2008,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( flow_handler = raw_fs_rule_add(dev, fs_matcher, &flow_context, &flow_act, - counter_id, cmd_in, inlen, dest_id, dest_type); + counter, cmd_in, inlen, dest_id, dest_type); if (IS_ERR(flow_handler)) { ret = PTR_ERR(flow_handler); goto err_out; @@ -2007,6 +2019,9 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( return 0; err_out: ib_uverbs_flow_resources_free(uflow_res); +destroy_counter: + if (counter) + mlx5_fc_local_destroy(counter); return ret; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h index 9aed29fa49006..d6e736c1fb246 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h @@ -292,7 +292,7 @@ TRACE_EVENT(mlx5_fs_add_rule, if (rule->dest_attr.type & MLX5_FLOW_DESTINATION_TYPE_COUNTER) __entry->counter_id = - rule->dest_attr.counter_id; + mlx5_fc_id(rule->dest_attr.counter); ), TP_printk("rule=%p fte=%p index=%u sw_action=<%s> [dst] %s\n", __entry->rule, __entry->fte, __entry->index, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c index e51b03d4c717f..687bd95d2c3e2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c @@ -194,7 +194,7 @@ static int rx_add_rule_drop_auth_trailer(struct mlx5e_ipsec_sa_entry *sa_entry, flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP | MLX5_FLOW_CONTEXT_ACTION_COUNT; flow_act.flags = FLOW_ACT_NO_APPEND; dest.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; - dest.counter_id = mlx5_fc_id(flow_counter); + dest.counter = flow_counter; if (rx == ipsec->rx_esw) spec->flow_context.flow_source = MLX5_FLOW_CONTEXT_FLOW_SOURCE_UPLINK; @@ -223,7 +223,7 @@ static int rx_add_rule_drop_auth_trailer(struct mlx5e_ipsec_sa_entry *sa_entry, } sa_entry->ipsec_rule.trailer.fc = flow_counter; - dest.counter_id = mlx5_fc_id(flow_counter); + dest.counter = flow_counter; MLX5_SET(fte_match_param, spec->match_value, misc_parameters_2.ipsec_syndrome, 2); rule = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 1); if (IS_ERR(rule)) { @@ -275,7 +275,7 @@ static int rx_add_rule_drop_replay(struct mlx5e_ipsec_sa_entry *sa_entry, struct flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP | MLX5_FLOW_CONTEXT_ACTION_COUNT; flow_act.flags = FLOW_ACT_NO_APPEND; dest.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; - dest.counter_id = mlx5_fc_id(flow_counter); + dest.counter = flow_counter; if (rx == ipsec->rx_esw) spec->flow_context.flow_source = MLX5_FLOW_CONTEXT_FLOW_SOURCE_UPLINK; @@ -348,7 +348,7 @@ static int ipsec_rx_status_drop_all_create(struct mlx5e_ipsec *ipsec, flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP | MLX5_FLOW_CONTEXT_ACTION_COUNT; dest.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; - dest.counter_id = mlx5_fc_id(flow_counter); + dest.counter = flow_counter; if (rx == ipsec->rx_esw) spec->flow_context.flow_source = MLX5_FLOW_CONTEXT_FLOW_SOURCE_UPLINK; rule = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 1); @@ -686,7 +686,7 @@ static int rx_create(struct mlx5_core_dev *mdev, struct mlx5e_ipsec *ipsec, rx->ft.status = ft; dest[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; - dest[1].counter_id = mlx5_fc_id(rx->fc->cnt); + dest[1].counter = rx->fc->cnt; err = mlx5_ipsec_rx_status_create(ipsec, rx, dest); if (err) goto err_add; @@ -873,7 +873,7 @@ static int ipsec_counter_rule_tx(struct mlx5_core_dev *mdev, struct mlx5e_ipsec_ flow_act.action = MLX5_FLOW_CONTEXT_ACTION_ALLOW | MLX5_FLOW_CONTEXT_ACTION_COUNT; dest.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; - dest.counter_id = mlx5_fc_id(tx->fc->cnt); + dest.counter = tx->fc->cnt; fte = mlx5_add_flow_rules(tx->ft.status, spec, &flow_act, &dest, 1); if (IS_ERR(fte)) { err = PTR_ERR(fte); @@ -1649,7 +1649,7 @@ static int rx_add_rule(struct mlx5e_ipsec_sa_entry *sa_entry) dest[0].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; dest[0].ft = rx->ft.status; dest[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; - dest[1].counter_id = mlx5_fc_id(counter); + dest[1].counter = counter; rule = mlx5_add_flow_rules(rx->ft.sa, spec, &flow_act, dest, 2); if (IS_ERR(rule)) { err = PTR_ERR(rule); @@ -1762,7 +1762,7 @@ static int tx_add_rule(struct mlx5e_ipsec_sa_entry *sa_entry) dest[0].ft = tx->ft.status; dest[0].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; dest[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; - dest[1].counter_id = mlx5_fc_id(counter); + dest[1].counter = counter; rule = mlx5_add_flow_rules(tx->ft.sa, spec, &flow_act, dest, 2); if (IS_ERR(rule)) { err = PTR_ERR(rule); @@ -1835,7 +1835,7 @@ static int tx_add_policy(struct mlx5e_ipsec_pol_entry *pol_entry) flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_DROP | MLX5_FLOW_CONTEXT_ACTION_COUNT; dest[dstn].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; - dest[dstn].counter_id = mlx5_fc_id(tx->fc->drop); + dest[dstn].counter = tx->fc->drop; dstn++; break; default: @@ -1913,7 +1913,7 @@ static int rx_add_policy(struct mlx5e_ipsec_pol_entry *pol_entry) case XFRM_POLICY_BLOCK: flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_DROP | MLX5_FLOW_CONTEXT_ACTION_COUNT; dest[dstn].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; - dest[dstn].counter_id = mlx5_fc_id(rx->fc->drop); + dest[dstn].counter = rx->fc->drop; dstn++; break; default: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 6b3b1afe83121..9ba99609999f4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1282,7 +1282,7 @@ mlx5e_add_offloaded_nic_rule(struct mlx5e_priv *priv, if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { dest[dest_ix].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; - dest[dest_ix].counter_id = mlx5_fc_id(attr->counter); + dest[dest_ix].counter = attr->counter; dest_ix++; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c index 6b4c9ffad95b2..7dd1dc3f77c75 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c @@ -135,7 +135,7 @@ int esw_acl_egress_lgcy_setup(struct mlx5_eswitch *esw, if (drop_counter) { flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; drop_ctr_dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; - drop_ctr_dst.counter_id = mlx5_fc_id(drop_counter); + drop_ctr_dst.counter = drop_counter; dst = &drop_ctr_dst; dest_num++; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c index 093ed86a0acd8..1c37098e09ea5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c @@ -260,7 +260,7 @@ int esw_acl_ingress_lgcy_setup(struct mlx5_eswitch *esw, if (counter) { flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; drop_ctr_dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; - drop_ctr_dst.counter_id = mlx5_fc_id(counter); + drop_ctr_dst.counter = counter; dst = &drop_ctr_dst; dest_num++; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c index c5ea1d1d2b035..5f647358a05ca 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c @@ -570,7 +570,8 @@ mlx5_esw_bridge_egress_table_cleanup(struct mlx5_esw_bridge *bridge) static struct mlx5_flow_handle * mlx5_esw_bridge_ingress_flow_with_esw_create(u16 vport_num, const unsigned char *addr, - struct mlx5_esw_bridge_vlan *vlan, u32 counter_id, + struct mlx5_esw_bridge_vlan *vlan, + struct mlx5_fc *counter, struct mlx5_esw_bridge *bridge, struct mlx5_eswitch *esw) { @@ -628,7 +629,7 @@ mlx5_esw_bridge_ingress_flow_with_esw_create(u16 vport_num, const unsigned char dests[0].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; dests[0].ft = bridge->egress_ft; dests[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; - dests[1].counter_id = counter_id; + dests[1].counter = counter; handle = mlx5_add_flow_rules(br_offloads->ingress_ft, rule_spec, &flow_act, dests, ARRAY_SIZE(dests)); @@ -639,17 +640,19 @@ mlx5_esw_bridge_ingress_flow_with_esw_create(u16 vport_num, const unsigned char static struct mlx5_flow_handle * mlx5_esw_bridge_ingress_flow_create(u16 vport_num, const unsigned char *addr, - struct mlx5_esw_bridge_vlan *vlan, u32 counter_id, + struct mlx5_esw_bridge_vlan *vlan, + struct mlx5_fc *counter, struct mlx5_esw_bridge *bridge) { - return mlx5_esw_bridge_ingress_flow_with_esw_create(vport_num, addr, vlan, counter_id, + return mlx5_esw_bridge_ingress_flow_with_esw_create(vport_num, addr, vlan, counter, bridge, bridge->br_offloads->esw); } static struct mlx5_flow_handle * mlx5_esw_bridge_ingress_flow_peer_create(u16 vport_num, u16 esw_owner_vhca_id, const unsigned char *addr, - struct mlx5_esw_bridge_vlan *vlan, u32 counter_id, + struct mlx5_esw_bridge_vlan *vlan, + struct mlx5_fc *counter, struct mlx5_esw_bridge *bridge) { struct mlx5_devcom_comp_dev *devcom = bridge->br_offloads->esw->devcom, *pos; @@ -671,7 +674,7 @@ mlx5_esw_bridge_ingress_flow_peer_create(u16 vport_num, u16 esw_owner_vhca_id, goto out; } - handle = mlx5_esw_bridge_ingress_flow_with_esw_create(vport_num, addr, vlan, counter_id, + handle = mlx5_esw_bridge_ingress_flow_with_esw_create(vport_num, addr, vlan, counter, bridge, peer_esw); out: @@ -1385,10 +1388,9 @@ mlx5_esw_bridge_fdb_entry_init(struct net_device *dev, u16 vport_num, u16 esw_ow handle = peer ? mlx5_esw_bridge_ingress_flow_peer_create(vport_num, esw_owner_vhca_id, - addr, vlan, mlx5_fc_id(counter), - bridge) : + addr, vlan, counter, bridge) : mlx5_esw_bridge_ingress_flow_create(vport_num, addr, vlan, - mlx5_fc_id(counter), bridge); + counter, bridge); if (IS_ERR(handle)) { err = PTR_ERR(handle); esw_warn(esw->dev, "Failed to create ingress flow(vport=%u,err=%d,peer=%d)\n", diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index d5b42b3a19fdf..8636f0485800d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -724,7 +724,7 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw, if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { dest[i].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; - dest[i].counter_id = mlx5_fc_id(attr->counter); + dest[i].counter = attr->counter; i++; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c index 676005854dad4..6bf0aade69d75 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -718,7 +718,7 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev, continue; MLX5_SET(flow_counter_list, in_dests, flow_counter_id, - dst->dest_attr.counter_id); + mlx5_fc_id(dst->dest_attr.counter)); in_dests += dst_cnt_size; list_size++; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 2eabfcc247c6a..f781f8f169b9a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -658,6 +658,7 @@ static void del_sw_hw_rule(struct fs_node *node) BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_ACTION) | BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_COUNTERS); fte->act_dests.action.action &= ~MLX5_FLOW_CONTEXT_ACTION_COUNT; + mlx5_fc_local_destroy(rule->dest_attr.counter); goto out; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c index 62d0c689796b9..7d56deaa46097 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c @@ -43,6 +43,11 @@ #define MLX5_FC_POOL_MAX_THRESHOLD BIT(18) #define MLX5_FC_POOL_USED_BUFF_RATIO 10 +enum mlx5_fc_type { + MLX5_FC_TYPE_ACQUIRED = 0, + MLX5_FC_TYPE_LOCAL, +}; + struct mlx5_fc_cache { u64 packets; u64 bytes; @@ -52,6 +57,7 @@ struct mlx5_fc_cache { struct mlx5_fc { u32 id; bool aging; + enum mlx5_fc_type type; struct mlx5_fc_bulk *bulk; struct mlx5_fc_cache cache; /* last{packets,bytes} are used for calculating deltas since last reading. */ @@ -186,6 +192,9 @@ static void mlx5_fc_release(struct mlx5_core_dev *dev, struct mlx5_fc *counter) { struct mlx5_fc_stats *fc_stats = dev->priv.fc_stats; + if (WARN_ON(counter->type == MLX5_FC_TYPE_LOCAL)) + return; + if (counter->bulk) mlx5_fc_pool_release_counter(&fc_stats->fc_pool, counter); else @@ -536,6 +545,50 @@ static int mlx5_fc_bulk_release_fc(struct mlx5_fc_bulk *bulk, struct mlx5_fc *fc return 0; } +/** + * mlx5_fc_local_create - Allocate mlx5_fc struct for a counter which + * was already acquired using its counter id and bulk data. + * + * @counter_id: counter acquired counter id + * @offset: counter offset from bulk base + * @bulk_size: counter's bulk size as was allocated + * + * Return: Pointer to mlx5_fc on success, ERR_PTR otherwise. + */ +struct mlx5_fc * +mlx5_fc_local_create(u32 counter_id, u32 offset, u32 bulk_size) +{ + struct mlx5_fc_bulk *fc_bulk; + struct mlx5_fc *counter; + + counter = kzalloc(sizeof(*counter), GFP_KERNEL); + if (!counter) + return ERR_PTR(-ENOMEM); + fc_bulk = kzalloc(sizeof(*fc_bulk), GFP_KERNEL); + if (!fc_bulk) { + kfree(counter); + return ERR_PTR(-ENOMEM); + } + + counter->type = MLX5_FC_TYPE_LOCAL; + counter->id = counter_id; + fc_bulk->base_id = counter_id - offset; + fc_bulk->bulk_len = bulk_size; + counter->bulk = fc_bulk; + return counter; +} +EXPORT_SYMBOL(mlx5_fc_local_create); + +void mlx5_fc_local_destroy(struct mlx5_fc *counter) +{ + if (!counter || counter->type != MLX5_FC_TYPE_LOCAL) + return; + + kfree(counter->bulk); + kfree(counter); +} +EXPORT_SYMBOL(mlx5_fc_local_destroy); + /* Flow counters pool API */ static void mlx5_fc_pool_init(struct mlx5_fc_pool *fc_pool, struct mlx5_core_dev *dev) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.c index 4a078113e292d..762d55ba9e513 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.c @@ -497,7 +497,7 @@ static int macsec_fs_tx_create(struct mlx5_macsec_fs *macsec_fs) memset(&dest, 0, sizeof(struct mlx5_flow_destination)); memset(&flow_act, 0, sizeof(flow_act)); dest.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; - dest.counter_id = mlx5_fc_id(tx_tables->check_miss_rule_counter); + dest.counter = tx_tables->check_miss_rule_counter; flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP | MLX5_FLOW_CONTEXT_ACTION_COUNT; rule = mlx5_add_flow_rules(tx_tables->ft_check, NULL, &flow_act, &dest, 1); if (IS_ERR(rule)) { @@ -519,7 +519,7 @@ static int macsec_fs_tx_create(struct mlx5_macsec_fs *macsec_fs) flow_act.flags = FLOW_ACT_NO_APPEND; flow_act.action = MLX5_FLOW_CONTEXT_ACTION_ALLOW | MLX5_FLOW_CONTEXT_ACTION_COUNT; dest.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; - dest.counter_id = mlx5_fc_id(tx_tables->check_rule_counter); + dest.counter = tx_tables->check_rule_counter; rule = mlx5_add_flow_rules(tx_tables->ft_check, spec, &flow_act, &dest, 1); if (IS_ERR(rule)) { err = PTR_ERR(rule); @@ -1200,7 +1200,7 @@ static int macsec_fs_rx_create_check_decap_rule(struct mlx5_macsec_fs *macsec_fs flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT | MLX5_FLOW_CONTEXT_ACTION_COUNT; roce_dest[dstn].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; - roce_dest[dstn].counter_id = mlx5_fc_id(rx_tables->check_rule_counter); + roce_dest[dstn].counter = rx_tables->check_rule_counter; rule = mlx5_add_flow_rules(rx_tables->ft_check, spec, flow_act, roce_dest, dstn + 1); if (IS_ERR(rule)) { @@ -1592,7 +1592,7 @@ static int macsec_fs_rx_create(struct mlx5_macsec_fs *macsec_fs) memset(&flow_act, 0, sizeof(flow_act)); dest.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; - dest.counter_id = mlx5_fc_id(rx_tables->check_miss_rule_counter); + dest.counter = rx_tables->check_miss_rule_counter; flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP | MLX5_FLOW_CONTEXT_ACTION_COUNT; rule = mlx5_add_flow_rules(rx_tables->ft_check, NULL, &flow_act, &dest, 1); if (IS_ERR(rule)) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/fs_dr.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/fs_dr.c index 4b349d4005e44..8007d3f523c94 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/fs_dr.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/fs_dr.c @@ -521,7 +521,7 @@ static int mlx5_cmd_dr_create_fte(struct mlx5_flow_root_namespace *ns, goto free_actions; } - id = dst->dest_attr.counter_id; + id = mlx5_fc_id(dst->dest_attr.counter); tmp_action = mlx5dr_action_create_flow_counter(id); if (!tmp_action) { diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 5f581e71e2010..36099047560df 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1952,7 +1952,7 @@ static int mlx5_vdpa_add_mac_vlan_rules(struct mlx5_vdpa_net *ndev, u8 *mac, goto out_free; #if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG) - dests[1].counter_id = mlx5_fc_id(node->ucast_counter.counter); + dests[1].counter = node->ucast_counter.counter; #endif node->ucast_rule = mlx5_add_flow_rules(ndev->rxft, spec, &flow_act, dests, NUM_DESTS); if (IS_ERR(node->ucast_rule)) { @@ -1961,7 +1961,7 @@ static int mlx5_vdpa_add_mac_vlan_rules(struct mlx5_vdpa_net *ndev, u8 *mac, } #if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG) - dests[1].counter_id = mlx5_fc_id(node->mcast_counter.counter); + dests[1].counter = node->mcast_counter.counter; #endif memset(dmac_c, 0, ETH_ALEN); diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 438db888bde0d..2a69d9d71276d 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -163,7 +163,7 @@ struct mlx5_flow_destination { u32 tir_num; u32 ft_num; struct mlx5_flow_table *ft; - u32 counter_id; + struct mlx5_fc *counter; struct { u16 num; u16 vhca_id; @@ -299,6 +299,8 @@ int mlx5_modify_rule_destination(struct mlx5_flow_handle *handler, struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging); void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter); +struct mlx5_fc *mlx5_fc_local_create(u32 counter_id, u32 offset, u32 bulk_size); +void mlx5_fc_local_destroy(struct mlx5_fc *counter); u64 mlx5_fc_query_lastuse(struct mlx5_fc *counter); void mlx5_fc_query_cached(struct mlx5_fc *counter, u64 *bytes, u64 *packets, u64 *lastuse); From 31d1356b8fdcdb7fe845874b598cce552a151c64 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Thu, 19 Dec 2024 19:58:34 +0200 Subject: [PATCH 0966/1450] net/mlx5: fs, add mlx5_fs_pool API Refactor fc_pool API to create generic fs_pool API, as HW steering has more flow steering elements which can take advantage of the same pool of bulks API. Change fs_counters code to use the fs_pool API. Note, removed __counted_by from struct mlx5_fc_bulk as bulk_len is now inner struct member. It will be added back once __counted_by can support inner struct members. Signed-off-by: Moshe Shemesh Reviewed-by: Yevgeny Kliteynik Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241219175841.1094544-5-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/Makefile | 2 +- .../ethernet/mellanox/mlx5/core/fs_counters.c | 286 +++++------------- .../net/ethernet/mellanox/mlx5/core/fs_pool.c | 194 ++++++++++++ .../net/ethernet/mellanox/mlx5/core/fs_pool.h | 54 ++++ 4 files changed, 327 insertions(+), 209 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/fs_pool.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/fs_pool.h diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index be3d0876c521d..79fe09de0a9fd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -17,7 +17,7 @@ mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \ fs_counters.o fs_ft_pool.o rl.o lag/debugfs.o lag/lag.o dev.o events.o wq.o lib/gid.o \ lib/devcom.o lib/pci_vsc.o lib/dm.o lib/fs_ttc.o diag/fs_tracepoint.o \ diag/fw_tracer.o diag/crdump.o devlink.o diag/rsc_dump.o diag/reporter_vnic.o \ - fw_reset.o qos.o lib/tout.o lib/aso.o wc.o + fw_reset.o qos.o lib/tout.o lib/aso.o wc.o fs_pool.o # # Netdev basic diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c index 7d56deaa46097..d8e1c4ebd3646 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c @@ -34,6 +34,7 @@ #include #include "mlx5_core.h" #include "fs_core.h" +#include "fs_pool.h" #include "fs_cmd.h" #define MLX5_FC_STATS_PERIOD msecs_to_jiffies(1000) @@ -65,17 +66,6 @@ struct mlx5_fc { u64 lastbytes; }; -struct mlx5_fc_pool { - struct mlx5_core_dev *dev; - struct mutex pool_lock; /* protects pool lists */ - struct list_head fully_used; - struct list_head partially_used; - struct list_head unused; - int available_fcs; - int used_fcs; - int threshold; -}; - struct mlx5_fc_stats { struct xarray counters; @@ -86,13 +76,13 @@ struct mlx5_fc_stats { int bulk_query_len; bool bulk_query_alloc_failed; unsigned long next_bulk_query_alloc; - struct mlx5_fc_pool fc_pool; + struct mlx5_fs_pool fc_pool; }; -static void mlx5_fc_pool_init(struct mlx5_fc_pool *fc_pool, struct mlx5_core_dev *dev); -static void mlx5_fc_pool_cleanup(struct mlx5_fc_pool *fc_pool); -static struct mlx5_fc *mlx5_fc_pool_acquire_counter(struct mlx5_fc_pool *fc_pool); -static void mlx5_fc_pool_release_counter(struct mlx5_fc_pool *fc_pool, struct mlx5_fc *fc); +static void mlx5_fc_pool_init(struct mlx5_fs_pool *fc_pool, struct mlx5_core_dev *dev); +static void mlx5_fc_pool_cleanup(struct mlx5_fs_pool *fc_pool); +static struct mlx5_fc *mlx5_fc_pool_acquire_counter(struct mlx5_fs_pool *fc_pool); +static void mlx5_fc_pool_release_counter(struct mlx5_fs_pool *fc_pool, struct mlx5_fc *fc); static int get_init_bulk_query_len(struct mlx5_core_dev *dev) { @@ -447,11 +437,9 @@ void mlx5_fc_update_sampling_interval(struct mlx5_core_dev *dev, /* Flow counter bluks */ struct mlx5_fc_bulk { - struct list_head pool_list; + struct mlx5_fs_bulk fs_bulk; u32 base_id; - int bulk_len; - unsigned long *bitmask; - struct mlx5_fc fcs[] __counted_by(bulk_len); + struct mlx5_fc fcs[]; }; static void mlx5_fc_init(struct mlx5_fc *counter, struct mlx5_fc_bulk *bulk, @@ -461,16 +449,10 @@ static void mlx5_fc_init(struct mlx5_fc *counter, struct mlx5_fc_bulk *bulk, counter->id = id; } -static int mlx5_fc_bulk_get_free_fcs_amount(struct mlx5_fc_bulk *bulk) -{ - return bitmap_weight(bulk->bitmask, bulk->bulk_len); -} - -static struct mlx5_fc_bulk *mlx5_fc_bulk_create(struct mlx5_core_dev *dev) +static struct mlx5_fs_bulk *mlx5_fc_bulk_create(struct mlx5_core_dev *dev) { enum mlx5_fc_bulk_alloc_bitmask alloc_bitmask; - struct mlx5_fc_bulk *bulk; - int err = -ENOMEM; + struct mlx5_fc_bulk *fc_bulk; int bulk_len; u32 base_id; int i; @@ -478,71 +460,97 @@ static struct mlx5_fc_bulk *mlx5_fc_bulk_create(struct mlx5_core_dev *dev) alloc_bitmask = MLX5_CAP_GEN(dev, flow_counter_bulk_alloc); bulk_len = alloc_bitmask > 0 ? MLX5_FC_BULK_NUM_FCS(alloc_bitmask) : 1; - bulk = kvzalloc(struct_size(bulk, fcs, bulk_len), GFP_KERNEL); - if (!bulk) - goto err_alloc_bulk; + fc_bulk = kvzalloc(struct_size(fc_bulk, fcs, bulk_len), GFP_KERNEL); + if (!fc_bulk) + return NULL; - bulk->bitmask = kvcalloc(BITS_TO_LONGS(bulk_len), sizeof(unsigned long), - GFP_KERNEL); - if (!bulk->bitmask) - goto err_alloc_bitmask; + if (mlx5_fs_bulk_init(dev, &fc_bulk->fs_bulk, bulk_len)) + goto fc_bulk_free; - err = mlx5_cmd_fc_bulk_alloc(dev, alloc_bitmask, &base_id); - if (err) - goto err_mlx5_cmd_bulk_alloc; - - bulk->base_id = base_id; - bulk->bulk_len = bulk_len; - for (i = 0; i < bulk_len; i++) { - mlx5_fc_init(&bulk->fcs[i], bulk, base_id + i); - set_bit(i, bulk->bitmask); - } + if (mlx5_cmd_fc_bulk_alloc(dev, alloc_bitmask, &base_id)) + goto fs_bulk_cleanup; + fc_bulk->base_id = base_id; + for (i = 0; i < bulk_len; i++) + mlx5_fc_init(&fc_bulk->fcs[i], fc_bulk, base_id + i); - return bulk; + return &fc_bulk->fs_bulk; -err_mlx5_cmd_bulk_alloc: - kvfree(bulk->bitmask); -err_alloc_bitmask: - kvfree(bulk); -err_alloc_bulk: - return ERR_PTR(err); +fs_bulk_cleanup: + mlx5_fs_bulk_cleanup(&fc_bulk->fs_bulk); +fc_bulk_free: + kvfree(fc_bulk); + return NULL; } static int -mlx5_fc_bulk_destroy(struct mlx5_core_dev *dev, struct mlx5_fc_bulk *bulk) +mlx5_fc_bulk_destroy(struct mlx5_core_dev *dev, struct mlx5_fs_bulk *fs_bulk) { - if (mlx5_fc_bulk_get_free_fcs_amount(bulk) < bulk->bulk_len) { + struct mlx5_fc_bulk *fc_bulk = container_of(fs_bulk, + struct mlx5_fc_bulk, + fs_bulk); + + if (mlx5_fs_bulk_get_free_amount(fs_bulk) < fs_bulk->bulk_len) { mlx5_core_err(dev, "Freeing bulk before all counters were released\n"); return -EBUSY; } - mlx5_cmd_fc_free(dev, bulk->base_id); - kvfree(bulk->bitmask); - kvfree(bulk); + mlx5_cmd_fc_free(dev, fc_bulk->base_id); + mlx5_fs_bulk_cleanup(fs_bulk); + kvfree(fc_bulk); return 0; } -static struct mlx5_fc *mlx5_fc_bulk_acquire_fc(struct mlx5_fc_bulk *bulk) +static void mlx5_fc_pool_update_threshold(struct mlx5_fs_pool *fc_pool) { - int free_fc_index = find_first_bit(bulk->bitmask, bulk->bulk_len); + fc_pool->threshold = min_t(int, MLX5_FC_POOL_MAX_THRESHOLD, + fc_pool->used_units / MLX5_FC_POOL_USED_BUFF_RATIO); +} - if (free_fc_index >= bulk->bulk_len) - return ERR_PTR(-ENOSPC); +/* Flow counters pool API */ - clear_bit(free_fc_index, bulk->bitmask); - return &bulk->fcs[free_fc_index]; +static const struct mlx5_fs_pool_ops mlx5_fc_pool_ops = { + .bulk_destroy = mlx5_fc_bulk_destroy, + .bulk_create = mlx5_fc_bulk_create, + .update_threshold = mlx5_fc_pool_update_threshold, +}; + +static void +mlx5_fc_pool_init(struct mlx5_fs_pool *fc_pool, struct mlx5_core_dev *dev) +{ + mlx5_fs_pool_init(fc_pool, dev, &mlx5_fc_pool_ops); } -static int mlx5_fc_bulk_release_fc(struct mlx5_fc_bulk *bulk, struct mlx5_fc *fc) +static void mlx5_fc_pool_cleanup(struct mlx5_fs_pool *fc_pool) { - int fc_index = fc->id - bulk->base_id; + mlx5_fs_pool_cleanup(fc_pool); +} - if (test_bit(fc_index, bulk->bitmask)) - return -EINVAL; +static struct mlx5_fc * +mlx5_fc_pool_acquire_counter(struct mlx5_fs_pool *fc_pool) +{ + struct mlx5_fs_pool_index pool_index = {}; + struct mlx5_fc_bulk *fc_bulk; + int err; - set_bit(fc_index, bulk->bitmask); - return 0; + err = mlx5_fs_pool_acquire_index(fc_pool, &pool_index); + if (err) + return ERR_PTR(err); + fc_bulk = container_of(pool_index.fs_bulk, struct mlx5_fc_bulk, fs_bulk); + return &fc_bulk->fcs[pool_index.index]; +} + +static void +mlx5_fc_pool_release_counter(struct mlx5_fs_pool *fc_pool, struct mlx5_fc *fc) +{ + struct mlx5_fs_bulk *fs_bulk = &fc->bulk->fs_bulk; + struct mlx5_fs_pool_index pool_index = {}; + struct mlx5_core_dev *dev = fc_pool->dev; + + pool_index.fs_bulk = fs_bulk; + pool_index.index = fc->id - fc->bulk->base_id; + if (mlx5_fs_pool_release_index(fc_pool, &pool_index)) + mlx5_core_warn(dev, "Attempted to release a counter which is not acquired\n"); } /** @@ -573,7 +581,7 @@ mlx5_fc_local_create(u32 counter_id, u32 offset, u32 bulk_size) counter->type = MLX5_FC_TYPE_LOCAL; counter->id = counter_id; fc_bulk->base_id = counter_id - offset; - fc_bulk->bulk_len = bulk_size; + fc_bulk->fs_bulk.bulk_len = bulk_size; counter->bulk = fc_bulk; return counter; } @@ -588,141 +596,3 @@ void mlx5_fc_local_destroy(struct mlx5_fc *counter) kfree(counter); } EXPORT_SYMBOL(mlx5_fc_local_destroy); - -/* Flow counters pool API */ - -static void mlx5_fc_pool_init(struct mlx5_fc_pool *fc_pool, struct mlx5_core_dev *dev) -{ - fc_pool->dev = dev; - mutex_init(&fc_pool->pool_lock); - INIT_LIST_HEAD(&fc_pool->fully_used); - INIT_LIST_HEAD(&fc_pool->partially_used); - INIT_LIST_HEAD(&fc_pool->unused); - fc_pool->available_fcs = 0; - fc_pool->used_fcs = 0; - fc_pool->threshold = 0; -} - -static void mlx5_fc_pool_cleanup(struct mlx5_fc_pool *fc_pool) -{ - struct mlx5_core_dev *dev = fc_pool->dev; - struct mlx5_fc_bulk *bulk; - struct mlx5_fc_bulk *tmp; - - list_for_each_entry_safe(bulk, tmp, &fc_pool->fully_used, pool_list) - mlx5_fc_bulk_destroy(dev, bulk); - list_for_each_entry_safe(bulk, tmp, &fc_pool->partially_used, pool_list) - mlx5_fc_bulk_destroy(dev, bulk); - list_for_each_entry_safe(bulk, tmp, &fc_pool->unused, pool_list) - mlx5_fc_bulk_destroy(dev, bulk); -} - -static void mlx5_fc_pool_update_threshold(struct mlx5_fc_pool *fc_pool) -{ - fc_pool->threshold = min_t(int, MLX5_FC_POOL_MAX_THRESHOLD, - fc_pool->used_fcs / MLX5_FC_POOL_USED_BUFF_RATIO); -} - -static struct mlx5_fc_bulk * -mlx5_fc_pool_alloc_new_bulk(struct mlx5_fc_pool *fc_pool) -{ - struct mlx5_core_dev *dev = fc_pool->dev; - struct mlx5_fc_bulk *new_bulk; - - new_bulk = mlx5_fc_bulk_create(dev); - if (!IS_ERR(new_bulk)) - fc_pool->available_fcs += new_bulk->bulk_len; - mlx5_fc_pool_update_threshold(fc_pool); - return new_bulk; -} - -static void -mlx5_fc_pool_free_bulk(struct mlx5_fc_pool *fc_pool, struct mlx5_fc_bulk *bulk) -{ - struct mlx5_core_dev *dev = fc_pool->dev; - - fc_pool->available_fcs -= bulk->bulk_len; - mlx5_fc_bulk_destroy(dev, bulk); - mlx5_fc_pool_update_threshold(fc_pool); -} - -static struct mlx5_fc * -mlx5_fc_pool_acquire_from_list(struct list_head *src_list, - struct list_head *next_list, - bool move_non_full_bulk) -{ - struct mlx5_fc_bulk *bulk; - struct mlx5_fc *fc; - - if (list_empty(src_list)) - return ERR_PTR(-ENODATA); - - bulk = list_first_entry(src_list, struct mlx5_fc_bulk, pool_list); - fc = mlx5_fc_bulk_acquire_fc(bulk); - if (move_non_full_bulk || mlx5_fc_bulk_get_free_fcs_amount(bulk) == 0) - list_move(&bulk->pool_list, next_list); - return fc; -} - -static struct mlx5_fc * -mlx5_fc_pool_acquire_counter(struct mlx5_fc_pool *fc_pool) -{ - struct mlx5_fc_bulk *new_bulk; - struct mlx5_fc *fc; - - mutex_lock(&fc_pool->pool_lock); - - fc = mlx5_fc_pool_acquire_from_list(&fc_pool->partially_used, - &fc_pool->fully_used, false); - if (IS_ERR(fc)) - fc = mlx5_fc_pool_acquire_from_list(&fc_pool->unused, - &fc_pool->partially_used, - true); - if (IS_ERR(fc)) { - new_bulk = mlx5_fc_pool_alloc_new_bulk(fc_pool); - if (IS_ERR(new_bulk)) { - fc = ERR_CAST(new_bulk); - goto out; - } - fc = mlx5_fc_bulk_acquire_fc(new_bulk); - list_add(&new_bulk->pool_list, &fc_pool->partially_used); - } - fc_pool->available_fcs--; - fc_pool->used_fcs++; - -out: - mutex_unlock(&fc_pool->pool_lock); - return fc; -} - -static void -mlx5_fc_pool_release_counter(struct mlx5_fc_pool *fc_pool, struct mlx5_fc *fc) -{ - struct mlx5_core_dev *dev = fc_pool->dev; - struct mlx5_fc_bulk *bulk = fc->bulk; - int bulk_free_fcs_amount; - - mutex_lock(&fc_pool->pool_lock); - - if (mlx5_fc_bulk_release_fc(bulk, fc)) { - mlx5_core_warn(dev, "Attempted to release a counter which is not acquired\n"); - goto unlock; - } - - fc_pool->available_fcs++; - fc_pool->used_fcs--; - - bulk_free_fcs_amount = mlx5_fc_bulk_get_free_fcs_amount(bulk); - if (bulk_free_fcs_amount == 1) - list_move_tail(&bulk->pool_list, &fc_pool->partially_used); - if (bulk_free_fcs_amount == bulk->bulk_len) { - list_del(&bulk->pool_list); - if (fc_pool->available_fcs > fc_pool->threshold) - mlx5_fc_pool_free_bulk(fc_pool, bulk); - else - list_add(&bulk->pool_list, &fc_pool->unused); - } - -unlock: - mutex_unlock(&fc_pool->pool_lock); -} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_pool.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_pool.c new file mode 100644 index 0000000000000..b891d7b9e3e0c --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_pool.c @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2024 NVIDIA Corporation & Affiliates */ + +#include +#include "fs_pool.h" + +int mlx5_fs_bulk_init(struct mlx5_core_dev *dev, struct mlx5_fs_bulk *fs_bulk, + int bulk_len) +{ + int i; + + fs_bulk->bitmask = kvcalloc(BITS_TO_LONGS(bulk_len), sizeof(unsigned long), + GFP_KERNEL); + if (!fs_bulk->bitmask) + return -ENOMEM; + + fs_bulk->bulk_len = bulk_len; + for (i = 0; i < bulk_len; i++) + set_bit(i, fs_bulk->bitmask); + + return 0; +} + +void mlx5_fs_bulk_cleanup(struct mlx5_fs_bulk *fs_bulk) +{ + kvfree(fs_bulk->bitmask); +} + +int mlx5_fs_bulk_get_free_amount(struct mlx5_fs_bulk *bulk) +{ + return bitmap_weight(bulk->bitmask, bulk->bulk_len); +} + +static int mlx5_fs_bulk_acquire_index(struct mlx5_fs_bulk *fs_bulk, + struct mlx5_fs_pool_index *pool_index) +{ + int free_index = find_first_bit(fs_bulk->bitmask, fs_bulk->bulk_len); + + WARN_ON_ONCE(!pool_index || !fs_bulk); + if (free_index >= fs_bulk->bulk_len) + return -ENOSPC; + + clear_bit(free_index, fs_bulk->bitmask); + pool_index->fs_bulk = fs_bulk; + pool_index->index = free_index; + return 0; +} + +static int mlx5_fs_bulk_release_index(struct mlx5_fs_bulk *fs_bulk, int index) +{ + if (test_bit(index, fs_bulk->bitmask)) + return -EINVAL; + + set_bit(index, fs_bulk->bitmask); + return 0; +} + +void mlx5_fs_pool_init(struct mlx5_fs_pool *pool, struct mlx5_core_dev *dev, + const struct mlx5_fs_pool_ops *ops) +{ + WARN_ON_ONCE(!ops || !ops->bulk_destroy || !ops->bulk_create || + !ops->update_threshold); + pool->dev = dev; + mutex_init(&pool->pool_lock); + INIT_LIST_HEAD(&pool->fully_used); + INIT_LIST_HEAD(&pool->partially_used); + INIT_LIST_HEAD(&pool->unused); + pool->available_units = 0; + pool->used_units = 0; + pool->threshold = 0; + pool->ops = ops; +} + +void mlx5_fs_pool_cleanup(struct mlx5_fs_pool *pool) +{ + struct mlx5_core_dev *dev = pool->dev; + struct mlx5_fs_bulk *bulk; + struct mlx5_fs_bulk *tmp; + + list_for_each_entry_safe(bulk, tmp, &pool->fully_used, pool_list) + pool->ops->bulk_destroy(dev, bulk); + list_for_each_entry_safe(bulk, tmp, &pool->partially_used, pool_list) + pool->ops->bulk_destroy(dev, bulk); + list_for_each_entry_safe(bulk, tmp, &pool->unused, pool_list) + pool->ops->bulk_destroy(dev, bulk); +} + +static struct mlx5_fs_bulk * +mlx5_fs_pool_alloc_new_bulk(struct mlx5_fs_pool *fs_pool) +{ + struct mlx5_core_dev *dev = fs_pool->dev; + struct mlx5_fs_bulk *new_bulk; + + new_bulk = fs_pool->ops->bulk_create(dev); + if (new_bulk) + fs_pool->available_units += new_bulk->bulk_len; + fs_pool->ops->update_threshold(fs_pool); + return new_bulk; +} + +static void +mlx5_fs_pool_free_bulk(struct mlx5_fs_pool *fs_pool, struct mlx5_fs_bulk *bulk) +{ + struct mlx5_core_dev *dev = fs_pool->dev; + + fs_pool->available_units -= bulk->bulk_len; + fs_pool->ops->bulk_destroy(dev, bulk); + fs_pool->ops->update_threshold(fs_pool); +} + +static int +mlx5_fs_pool_acquire_from_list(struct list_head *src_list, + struct list_head *next_list, + bool move_non_full_bulk, + struct mlx5_fs_pool_index *pool_index) +{ + struct mlx5_fs_bulk *fs_bulk; + int err; + + if (list_empty(src_list)) + return -ENODATA; + + fs_bulk = list_first_entry(src_list, struct mlx5_fs_bulk, pool_list); + err = mlx5_fs_bulk_acquire_index(fs_bulk, pool_index); + if (move_non_full_bulk || mlx5_fs_bulk_get_free_amount(fs_bulk) == 0) + list_move(&fs_bulk->pool_list, next_list); + return err; +} + +int mlx5_fs_pool_acquire_index(struct mlx5_fs_pool *fs_pool, + struct mlx5_fs_pool_index *pool_index) +{ + struct mlx5_fs_bulk *new_bulk; + int err; + + mutex_lock(&fs_pool->pool_lock); + + err = mlx5_fs_pool_acquire_from_list(&fs_pool->partially_used, + &fs_pool->fully_used, false, + pool_index); + if (err) + err = mlx5_fs_pool_acquire_from_list(&fs_pool->unused, + &fs_pool->partially_used, + true, pool_index); + if (err) { + new_bulk = mlx5_fs_pool_alloc_new_bulk(fs_pool); + if (!new_bulk) { + err = -ENOENT; + goto out; + } + err = mlx5_fs_bulk_acquire_index(new_bulk, pool_index); + WARN_ON_ONCE(err); + list_add(&new_bulk->pool_list, &fs_pool->partially_used); + } + fs_pool->available_units--; + fs_pool->used_units++; + +out: + mutex_unlock(&fs_pool->pool_lock); + return err; +} + +int mlx5_fs_pool_release_index(struct mlx5_fs_pool *fs_pool, + struct mlx5_fs_pool_index *pool_index) +{ + struct mlx5_fs_bulk *bulk = pool_index->fs_bulk; + int bulk_free_amount; + int err; + + mutex_lock(&fs_pool->pool_lock); + + /* TBD would rather return void if there was no warn here in original code */ + err = mlx5_fs_bulk_release_index(bulk, pool_index->index); + if (err) + goto unlock; + + fs_pool->available_units++; + fs_pool->used_units--; + + bulk_free_amount = mlx5_fs_bulk_get_free_amount(bulk); + if (bulk_free_amount == 1) + list_move_tail(&bulk->pool_list, &fs_pool->partially_used); + if (bulk_free_amount == bulk->bulk_len) { + list_del(&bulk->pool_list); + if (fs_pool->available_units > fs_pool->threshold) + mlx5_fs_pool_free_bulk(fs_pool, bulk); + else + list_add(&bulk->pool_list, &fs_pool->unused); + } + +unlock: + mutex_unlock(&fs_pool->pool_lock); + return err; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_pool.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_pool.h new file mode 100644 index 0000000000000..3b149863260c7 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_pool.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2024 NVIDIA Corporation & Affiliates */ + +#ifndef __MLX5_FS_POOL_H__ +#define __MLX5_FS_POOL_H__ + +#include + +struct mlx5_fs_bulk { + struct list_head pool_list; + int bulk_len; + unsigned long *bitmask; +}; + +struct mlx5_fs_pool_index { + struct mlx5_fs_bulk *fs_bulk; + int index; +}; + +struct mlx5_fs_pool; + +struct mlx5_fs_pool_ops { + int (*bulk_destroy)(struct mlx5_core_dev *dev, struct mlx5_fs_bulk *bulk); + struct mlx5_fs_bulk * (*bulk_create)(struct mlx5_core_dev *dev); + void (*update_threshold)(struct mlx5_fs_pool *pool); +}; + +struct mlx5_fs_pool { + struct mlx5_core_dev *dev; + void *pool_ctx; + const struct mlx5_fs_pool_ops *ops; + struct mutex pool_lock; /* protects pool lists */ + struct list_head fully_used; + struct list_head partially_used; + struct list_head unused; + int available_units; + int used_units; + int threshold; +}; + +int mlx5_fs_bulk_init(struct mlx5_core_dev *dev, struct mlx5_fs_bulk *fs_bulk, + int bulk_len); +void mlx5_fs_bulk_cleanup(struct mlx5_fs_bulk *fs_bulk); +int mlx5_fs_bulk_get_free_amount(struct mlx5_fs_bulk *bulk); + +void mlx5_fs_pool_init(struct mlx5_fs_pool *pool, struct mlx5_core_dev *dev, + const struct mlx5_fs_pool_ops *ops); +void mlx5_fs_pool_cleanup(struct mlx5_fs_pool *pool); +int mlx5_fs_pool_acquire_index(struct mlx5_fs_pool *fs_pool, + struct mlx5_fs_pool_index *pool_index); +int mlx5_fs_pool_release_index(struct mlx5_fs_pool *fs_pool, + struct mlx5_fs_pool_index *pool_index); + +#endif /* __MLX5_FS_POOL_H__ */ From 586face88106481e8c527675a837da8a3ab6677d Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Thu, 19 Dec 2024 19:58:35 +0200 Subject: [PATCH 0967/1450] net/mlx5: fs, retry insertion to hash table on EBUSY When inserting into an rhashtable faster than it can grow, an -EBUSY error may be encountered. Modify the insertion logic to retry on -EBUSY until either a successful insertion or a genuine error is returned. Signed-off-by: Mark Bloch Signed-off-by: Tariq Toukan Reviewed-by: Przemek Kitszel Link: https://patch.msgid.link/20241219175841.1094544-6-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index f781f8f169b9a..ae1a5705b26dd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -821,11 +821,17 @@ static int insert_fte(struct mlx5_flow_group *fg, struct fs_fte *fte) return index; fte->index = index + fg->start_index; +retry_insert: ret = rhashtable_insert_fast(&fg->ftes_hash, &fte->hash, rhash_fte); - if (ret) + if (ret) { + if (ret == -EBUSY) { + cond_resched(); + goto retry_insert; + } goto err_ida_remove; + } tree_add_node(&fte->node, &fg->node); list_add_tail(&fte->node.list, &fg->node.children); From 9a0155a709fadaab468a24abca7996c5fdf0507b Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Thu, 19 Dec 2024 19:58:36 +0200 Subject: [PATCH 0968/1450] net/mlx5: HWS, no need to expose mlx5hws_send_queues_open/close No need to have mlx5hws_send_queues_open/close in header. Make them static and remove from header. Signed-off-by: Yevgeny Kliteynik Reviewed-by: Itamar Gozlan Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241219175841.1094544-7-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/steering/hws/send.c | 8 ++++---- .../net/ethernet/mellanox/mlx5/core/steering/hws/send.h | 6 ------ 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c index 883b4ed30892c..b68b0c368771a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c @@ -896,15 +896,15 @@ static int mlx5hws_send_ring_open(struct mlx5hws_context *ctx, return err; } -void mlx5hws_send_queue_close(struct mlx5hws_send_engine *queue) +static void mlx5hws_send_queue_close(struct mlx5hws_send_engine *queue) { hws_send_ring_close(queue); kfree(queue->completed.entries); } -int mlx5hws_send_queue_open(struct mlx5hws_context *ctx, - struct mlx5hws_send_engine *queue, - u16 queue_size) +static int mlx5hws_send_queue_open(struct mlx5hws_context *ctx, + struct mlx5hws_send_engine *queue, + u16 queue_size) { int err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.h index b50825d6dc53d..f833092235c19 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.h @@ -189,12 +189,6 @@ void mlx5hws_send_abort_new_dep_wqe(struct mlx5hws_send_engine *queue); void mlx5hws_send_all_dep_wqe(struct mlx5hws_send_engine *queue); -void mlx5hws_send_queue_close(struct mlx5hws_send_engine *queue); - -int mlx5hws_send_queue_open(struct mlx5hws_context *ctx, - struct mlx5hws_send_engine *queue, - u16 queue_size); - void mlx5hws_send_queues_close(struct mlx5hws_context *ctx); int mlx5hws_send_queues_open(struct mlx5hws_context *ctx, From 429776b6019bbdcf04dcd49706fe7de6a280078b Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Thu, 19 Dec 2024 19:58:37 +0200 Subject: [PATCH 0969/1450] net/mlx5: HWS, do not initialize native API queues HWS has two types of APIs: - Native: fastest and slimmest, async API. The user of this API is required to manage rule handles memory, and to poll for completion for each rule. - BWC: backward compatible API, similar semantics to SWS API. This layer is implemented above native API and it does all the work for the user, so that it is easy to switch between SWS and HWS. Right now the existing users of HWS require only BWC API. Therefore, in order to not waste resources, this patch disables send queues allocation for native API. If in the future support for faster HWS rule insertion will be required (such as for Connection Tracking), native queues can be enabled. Signed-off-by: Yevgeny Kliteynik Reviewed-by: Itamar Gozlan Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241219175841.1094544-8-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../ethernet/mellanox/mlx5/core/steering/hws/bwc.h | 6 ++++-- .../mellanox/mlx5/core/steering/hws/context.c | 6 ++++-- .../mellanox/mlx5/core/steering/hws/context.h | 6 ++++++ .../mellanox/mlx5/core/steering/hws/mlx5hws.h | 1 - .../ethernet/mellanox/mlx5/core/steering/hws/send.c | 13 +++++++++++-- 5 files changed, 25 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h index 0b745968e21e1..3d4965213b012 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h @@ -60,9 +60,11 @@ void mlx5hws_bwc_rule_fill_attr(struct mlx5hws_bwc_matcher *bwc_matcher, static inline u16 mlx5hws_bwc_queues(struct mlx5hws_context *ctx) { /* Besides the control queue, half of the queues are - * reguler HWS queues, and the other half are BWC queues. + * regular HWS queues, and the other half are BWC queues. */ - return (ctx->queues - 1) / 2; + if (mlx5hws_context_bwc_supported(ctx)) + return (ctx->queues - 1) / 2; + return 0; } static inline u16 mlx5hws_bwc_get_queue_id(struct mlx5hws_context *ctx, u16 idx) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c index fd48b05e91e05..4a8928f33bb99 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c @@ -161,8 +161,10 @@ static int hws_context_init_hws(struct mlx5hws_context *ctx, if (ret) goto uninit_pd; - if (attr->bwc) - ctx->flags |= MLX5HWS_CONTEXT_FLAG_BWC_SUPPORT; + /* Context has support for backward compatible API, + * and does not have support for native HWS API. + */ + ctx->flags |= MLX5HWS_CONTEXT_FLAG_BWC_SUPPORT; ret = mlx5hws_send_queues_open(ctx, attr->queues, attr->queue_size); if (ret) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h index 47f5cc8de73fa..1c9cc4fba083d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h @@ -8,6 +8,7 @@ enum mlx5hws_context_flags { MLX5HWS_CONTEXT_FLAG_HWS_SUPPORT = 1 << 0, MLX5HWS_CONTEXT_FLAG_PRIVATE_PD = 1 << 1, MLX5HWS_CONTEXT_FLAG_BWC_SUPPORT = 1 << 2, + MLX5HWS_CONTEXT_FLAG_NATIVE_SUPPORT = 1 << 3, }; enum mlx5hws_context_shared_stc_type { @@ -58,6 +59,11 @@ static inline bool mlx5hws_context_bwc_supported(struct mlx5hws_context *ctx) return ctx->flags & MLX5HWS_CONTEXT_FLAG_BWC_SUPPORT; } +static inline bool mlx5hws_context_native_supported(struct mlx5hws_context *ctx) +{ + return ctx->flags & MLX5HWS_CONTEXT_FLAG_NATIVE_SUPPORT; +} + bool mlx5hws_context_cap_dynamic_reparse(struct mlx5hws_context *ctx); u8 mlx5hws_context_get_reparse_mode(struct mlx5hws_context *ctx); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h index f39d636ff39ae..5121951f2778a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h @@ -70,7 +70,6 @@ enum mlx5hws_send_queue_actions { struct mlx5hws_context_attr { u16 queues; u16 queue_size; - bool bwc; /* add support for backward compatible API*/ }; struct mlx5hws_table_attr { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c index b68b0c368771a..20fe126ffd221 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c @@ -898,6 +898,9 @@ static int mlx5hws_send_ring_open(struct mlx5hws_context *ctx, static void mlx5hws_send_queue_close(struct mlx5hws_send_engine *queue) { + if (!queue->num_entries) + return; /* this queue wasn't initialized */ + hws_send_ring_close(queue); kfree(queue->completed.entries); } @@ -1005,7 +1008,7 @@ int mlx5hws_send_queues_open(struct mlx5hws_context *ctx, u16 queue_size) { int err = 0; - u32 i; + int i = 0; /* Open one extra queue for control path */ ctx->queues = queues + 1; @@ -1021,7 +1024,13 @@ int mlx5hws_send_queues_open(struct mlx5hws_context *ctx, goto free_bwc_locks; } - for (i = 0; i < ctx->queues; i++) { + /* If native API isn't supported, skip the unused native queues: + * initialize BWC queues and control queue only. + */ + if (!mlx5hws_context_native_supported(ctx)) + i = mlx5hws_bwc_get_queue_id(ctx, 0); + + for (; i < ctx->queues; i++) { err = mlx5hws_send_queue_open(ctx, &ctx->send_queue[i], queue_size); if (err) goto close_send_queues; From aa90a30804a563763eb78f00f56f759b72b91cb0 Mon Sep 17 00:00:00 2001 From: Itamar Gozlan Date: Thu, 19 Dec 2024 19:58:38 +0200 Subject: [PATCH 0970/1450] net/mlx5: DR, expand SWS STE callbacks and consolidate common structs Expand SWS STE callbacks to support ConnectX-8 hardware. Move common enums and structures to a shared header file. Signed-off-by: Itamar Gozlan Signed-off-by: Yevgeny Kliteynik Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241219175841.1094544-9-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlx5/core/steering/sws/dr_ste.c | 4 +- .../mellanox/mlx5/core/steering/sws/dr_ste.h | 18 +- .../mlx5/core/steering/sws/dr_ste_v0.c | 6 +- .../mlx5/core/steering/sws/dr_ste_v1.c | 207 ++++-------------- .../mlx5/core/steering/sws/dr_ste_v1.h | 147 ++++++++++++- .../mlx5/core/steering/sws/dr_ste_v2.c | 169 +------------- .../mlx5/core/steering/sws/dr_ste_v2.h | 168 ++++++++++++++ 7 files changed, 377 insertions(+), 342 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste_v2.h diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste.c index e94fbb015efad..01ba8eae2983d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste.c @@ -555,7 +555,7 @@ void mlx5dr_ste_set_actions_tx(struct mlx5dr_ste_ctx *ste_ctx, struct mlx5dr_ste_actions_attr *attr, u32 *added_stes) { - ste_ctx->set_actions_tx(dmn, action_type_set, ste_ctx->actions_caps, + ste_ctx->set_actions_tx(ste_ctx, dmn, action_type_set, ste_ctx->actions_caps, hw_ste_arr, attr, added_stes); } @@ -566,7 +566,7 @@ void mlx5dr_ste_set_actions_rx(struct mlx5dr_ste_ctx *ste_ctx, struct mlx5dr_ste_actions_attr *attr, u32 *added_stes) { - ste_ctx->set_actions_rx(dmn, action_type_set, ste_ctx->actions_caps, + ste_ctx->set_actions_rx(ste_ctx, dmn, action_type_set, ste_ctx->actions_caps, hw_ste_arr, attr, added_stes); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste.h index 54a6619c3ecbf..b6ec8d30d9903 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste.h @@ -160,13 +160,15 @@ struct mlx5dr_ste_ctx { /* Actions */ u32 actions_caps; - void (*set_actions_rx)(struct mlx5dr_domain *dmn, + void (*set_actions_rx)(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_domain *dmn, u8 *action_type_set, u32 actions_caps, u8 *hw_ste_arr, struct mlx5dr_ste_actions_attr *attr, u32 *added_stes); - void (*set_actions_tx)(struct mlx5dr_domain *dmn, + void (*set_actions_tx)(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_domain *dmn, u8 *action_type_set, u32 actions_caps, u8 *hw_ste_arr, @@ -197,7 +199,17 @@ struct mlx5dr_ste_ctx { u16 *used_hw_action_num); int (*alloc_modify_hdr_chunk)(struct mlx5dr_action *action); void (*dealloc_modify_hdr_chunk)(struct mlx5dr_action *action); - + /* Actions bit set */ + void (*set_encap)(u8 *hw_ste_p, u8 *d_action, + u32 reformat_id, int size); + void (*set_push_vlan)(u8 *ste, u8 *d_action, + u32 vlan_hdr); + void (*set_pop_vlan)(u8 *hw_ste_p, u8 *s_action, + u8 vlans_num); + void (*set_rx_decap)(u8 *hw_ste_p, u8 *s_action); + void (*set_encap_l3)(u8 *hw_ste_p, u8 *frst_s_action, + u8 *scnd_d_action, u32 reformat_id, + int size); /* Send */ void (*prepare_for_postsend)(u8 *hw_ste_p, u32 ste_size); }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste_v0.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste_v0.c index e9f6c7ed7a7be..42536bee55e28 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste_v0.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste_v0.c @@ -406,7 +406,8 @@ static void dr_ste_v0_arr_init_next(u8 **last_ste, } static void -dr_ste_v0_set_actions_tx(struct mlx5dr_domain *dmn, +dr_ste_v0_set_actions_tx(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_domain *dmn, u8 *action_type_set, u32 actions_caps, u8 *last_ste, @@ -476,7 +477,8 @@ dr_ste_v0_set_actions_tx(struct mlx5dr_domain *dmn, } static void -dr_ste_v0_set_actions_rx(struct mlx5dr_domain *dmn, +dr_ste_v0_set_actions_rx(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_domain *dmn, u8 *action_type_set, u32 actions_caps, u8 *last_ste, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste_v1.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste_v1.c index 1d49704b95427..7f83d77c43ef0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste_v1.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste_v1.c @@ -5,136 +5,6 @@ #include "mlx5_ifc_dr_ste_v1.h" #include "dr_ste_v1.h" -#define DR_STE_CALC_DFNR_TYPE(lookup_type, inner) \ - ((inner) ? DR_STE_V1_LU_TYPE_##lookup_type##_I : \ - DR_STE_V1_LU_TYPE_##lookup_type##_O) - -enum dr_ste_v1_entry_format { - DR_STE_V1_TYPE_BWC_BYTE = 0x0, - DR_STE_V1_TYPE_BWC_DW = 0x1, - DR_STE_V1_TYPE_MATCH = 0x2, - DR_STE_V1_TYPE_MATCH_RANGES = 0x7, -}; - -/* Lookup type is built from 2B: [ Definer mode 1B ][ Definer index 1B ] */ -enum { - DR_STE_V1_LU_TYPE_NOP = 0x0000, - DR_STE_V1_LU_TYPE_ETHL2_TNL = 0x0002, - DR_STE_V1_LU_TYPE_IBL3_EXT = 0x0102, - DR_STE_V1_LU_TYPE_ETHL2_O = 0x0003, - DR_STE_V1_LU_TYPE_IBL4 = 0x0103, - DR_STE_V1_LU_TYPE_ETHL2_I = 0x0004, - DR_STE_V1_LU_TYPE_SRC_QP_GVMI = 0x0104, - DR_STE_V1_LU_TYPE_ETHL2_SRC_O = 0x0005, - DR_STE_V1_LU_TYPE_ETHL2_HEADERS_O = 0x0105, - DR_STE_V1_LU_TYPE_ETHL2_SRC_I = 0x0006, - DR_STE_V1_LU_TYPE_ETHL2_HEADERS_I = 0x0106, - DR_STE_V1_LU_TYPE_ETHL3_IPV4_5_TUPLE_O = 0x0007, - DR_STE_V1_LU_TYPE_IPV6_DES_O = 0x0107, - DR_STE_V1_LU_TYPE_ETHL3_IPV4_5_TUPLE_I = 0x0008, - DR_STE_V1_LU_TYPE_IPV6_DES_I = 0x0108, - DR_STE_V1_LU_TYPE_ETHL4_O = 0x0009, - DR_STE_V1_LU_TYPE_IPV6_SRC_O = 0x0109, - DR_STE_V1_LU_TYPE_ETHL4_I = 0x000a, - DR_STE_V1_LU_TYPE_IPV6_SRC_I = 0x010a, - DR_STE_V1_LU_TYPE_ETHL2_SRC_DST_O = 0x000b, - DR_STE_V1_LU_TYPE_MPLS_O = 0x010b, - DR_STE_V1_LU_TYPE_ETHL2_SRC_DST_I = 0x000c, - DR_STE_V1_LU_TYPE_MPLS_I = 0x010c, - DR_STE_V1_LU_TYPE_ETHL3_IPV4_MISC_O = 0x000d, - DR_STE_V1_LU_TYPE_GRE = 0x010d, - DR_STE_V1_LU_TYPE_FLEX_PARSER_TNL_HEADER = 0x000e, - DR_STE_V1_LU_TYPE_GENERAL_PURPOSE = 0x010e, - DR_STE_V1_LU_TYPE_ETHL3_IPV4_MISC_I = 0x000f, - DR_STE_V1_LU_TYPE_STEERING_REGISTERS_0 = 0x010f, - DR_STE_V1_LU_TYPE_STEERING_REGISTERS_1 = 0x0110, - DR_STE_V1_LU_TYPE_FLEX_PARSER_OK = 0x0011, - DR_STE_V1_LU_TYPE_FLEX_PARSER_0 = 0x0111, - DR_STE_V1_LU_TYPE_FLEX_PARSER_1 = 0x0112, - DR_STE_V1_LU_TYPE_ETHL4_MISC_O = 0x0113, - DR_STE_V1_LU_TYPE_ETHL4_MISC_I = 0x0114, - DR_STE_V1_LU_TYPE_INVALID = 0x00ff, - DR_STE_V1_LU_TYPE_DONT_CARE = MLX5DR_STE_LU_TYPE_DONT_CARE, -}; - -enum dr_ste_v1_header_anchors { - DR_STE_HEADER_ANCHOR_START_OUTER = 0x00, - DR_STE_HEADER_ANCHOR_1ST_VLAN = 0x02, - DR_STE_HEADER_ANCHOR_IPV6_IPV4 = 0x07, - DR_STE_HEADER_ANCHOR_INNER_MAC = 0x13, - DR_STE_HEADER_ANCHOR_INNER_IPV6_IPV4 = 0x19, -}; - -enum dr_ste_v1_action_size { - DR_STE_ACTION_SINGLE_SZ = 4, - DR_STE_ACTION_DOUBLE_SZ = 8, - DR_STE_ACTION_TRIPLE_SZ = 12, -}; - -enum dr_ste_v1_action_insert_ptr_attr { - DR_STE_V1_ACTION_INSERT_PTR_ATTR_NONE = 0, /* Regular push header (e.g. push vlan) */ - DR_STE_V1_ACTION_INSERT_PTR_ATTR_ENCAP = 1, /* Encapsulation / Tunneling */ - DR_STE_V1_ACTION_INSERT_PTR_ATTR_ESP = 2, /* IPsec */ -}; - -enum dr_ste_v1_action_id { - DR_STE_V1_ACTION_ID_NOP = 0x00, - DR_STE_V1_ACTION_ID_COPY = 0x05, - DR_STE_V1_ACTION_ID_SET = 0x06, - DR_STE_V1_ACTION_ID_ADD = 0x07, - DR_STE_V1_ACTION_ID_REMOVE_BY_SIZE = 0x08, - DR_STE_V1_ACTION_ID_REMOVE_HEADER_TO_HEADER = 0x09, - DR_STE_V1_ACTION_ID_INSERT_INLINE = 0x0a, - DR_STE_V1_ACTION_ID_INSERT_POINTER = 0x0b, - DR_STE_V1_ACTION_ID_FLOW_TAG = 0x0c, - DR_STE_V1_ACTION_ID_QUEUE_ID_SEL = 0x0d, - DR_STE_V1_ACTION_ID_ACCELERATED_LIST = 0x0e, - DR_STE_V1_ACTION_ID_MODIFY_LIST = 0x0f, - DR_STE_V1_ACTION_ID_ASO = 0x12, - DR_STE_V1_ACTION_ID_TRAILER = 0x13, - DR_STE_V1_ACTION_ID_COUNTER_ID = 0x14, - DR_STE_V1_ACTION_ID_MAX = 0x21, - /* use for special cases */ - DR_STE_V1_ACTION_ID_SPECIAL_ENCAP_L3 = 0x22, -}; - -enum { - DR_STE_V1_ACTION_MDFY_FLD_L2_OUT_0 = 0x00, - DR_STE_V1_ACTION_MDFY_FLD_L2_OUT_1 = 0x01, - DR_STE_V1_ACTION_MDFY_FLD_L2_OUT_2 = 0x02, - DR_STE_V1_ACTION_MDFY_FLD_SRC_L2_OUT_0 = 0x08, - DR_STE_V1_ACTION_MDFY_FLD_SRC_L2_OUT_1 = 0x09, - DR_STE_V1_ACTION_MDFY_FLD_L3_OUT_0 = 0x0e, - DR_STE_V1_ACTION_MDFY_FLD_L4_OUT_0 = 0x18, - DR_STE_V1_ACTION_MDFY_FLD_L4_OUT_1 = 0x19, - DR_STE_V1_ACTION_MDFY_FLD_IPV4_OUT_0 = 0x40, - DR_STE_V1_ACTION_MDFY_FLD_IPV4_OUT_1 = 0x41, - DR_STE_V1_ACTION_MDFY_FLD_IPV6_DST_OUT_0 = 0x44, - DR_STE_V1_ACTION_MDFY_FLD_IPV6_DST_OUT_1 = 0x45, - DR_STE_V1_ACTION_MDFY_FLD_IPV6_DST_OUT_2 = 0x46, - DR_STE_V1_ACTION_MDFY_FLD_IPV6_DST_OUT_3 = 0x47, - DR_STE_V1_ACTION_MDFY_FLD_IPV6_SRC_OUT_0 = 0x4c, - DR_STE_V1_ACTION_MDFY_FLD_IPV6_SRC_OUT_1 = 0x4d, - DR_STE_V1_ACTION_MDFY_FLD_IPV6_SRC_OUT_2 = 0x4e, - DR_STE_V1_ACTION_MDFY_FLD_IPV6_SRC_OUT_3 = 0x4f, - DR_STE_V1_ACTION_MDFY_FLD_TCP_MISC_0 = 0x5e, - DR_STE_V1_ACTION_MDFY_FLD_TCP_MISC_1 = 0x5f, - DR_STE_V1_ACTION_MDFY_FLD_CFG_HDR_0_0 = 0x6f, - DR_STE_V1_ACTION_MDFY_FLD_CFG_HDR_0_1 = 0x70, - DR_STE_V1_ACTION_MDFY_FLD_METADATA_2_CQE = 0x7b, - DR_STE_V1_ACTION_MDFY_FLD_GNRL_PURPOSE = 0x7c, - DR_STE_V1_ACTION_MDFY_FLD_REGISTER_2_0 = 0x8c, - DR_STE_V1_ACTION_MDFY_FLD_REGISTER_2_1 = 0x8d, - DR_STE_V1_ACTION_MDFY_FLD_REGISTER_1_0 = 0x8e, - DR_STE_V1_ACTION_MDFY_FLD_REGISTER_1_1 = 0x8f, - DR_STE_V1_ACTION_MDFY_FLD_REGISTER_0_0 = 0x90, - DR_STE_V1_ACTION_MDFY_FLD_REGISTER_0_1 = 0x91, -}; - -enum dr_ste_v1_aso_ctx_type { - DR_STE_V1_ASO_CTX_TYPE_POLICERS = 0x2, -}; - static const struct mlx5dr_ste_action_modify_field dr_ste_v1_action_modify_field_arr[] = { [MLX5_ACTION_IN_FIELD_OUT_SMAC_47_16] = { .hw_field = DR_STE_V1_ACTION_MDFY_FLD_SRC_L2_OUT_0, .start = 0, .end = 31, @@ -379,13 +249,12 @@ static void dr_ste_v1_set_counter_id(u8 *hw_ste_p, u32 ctr_id) MLX5_SET(ste_match_bwc_v1, hw_ste_p, counter_id, ctr_id); } -static void dr_ste_v1_set_reparse(u8 *hw_ste_p) +void dr_ste_v1_set_reparse(u8 *hw_ste_p) { MLX5_SET(ste_match_bwc_v1, hw_ste_p, reparse, 1); } -static void dr_ste_v1_set_encap(u8 *hw_ste_p, u8 *d_action, - u32 reformat_id, int size) +void dr_ste_v1_set_encap(u8 *hw_ste_p, u8 *d_action, u32 reformat_id, int size) { MLX5_SET(ste_double_action_insert_with_ptr_v1, d_action, action_id, DR_STE_V1_ACTION_ID_INSERT_POINTER); @@ -432,8 +301,7 @@ static void dr_ste_v1_set_remove_hdr(u8 *hw_ste_p, u8 *s_action, dr_ste_v1_set_reparse(hw_ste_p); } -static void dr_ste_v1_set_push_vlan(u8 *hw_ste_p, u8 *d_action, - u32 vlan_hdr) +void dr_ste_v1_set_push_vlan(u8 *hw_ste_p, u8 *d_action, u32 vlan_hdr) { MLX5_SET(ste_double_action_insert_with_inline_v1, d_action, action_id, DR_STE_V1_ACTION_ID_INSERT_INLINE); @@ -446,7 +314,7 @@ static void dr_ste_v1_set_push_vlan(u8 *hw_ste_p, u8 *d_action, dr_ste_v1_set_reparse(hw_ste_p); } -static void dr_ste_v1_set_pop_vlan(u8 *hw_ste_p, u8 *s_action, u8 vlans_num) +void dr_ste_v1_set_pop_vlan(u8 *hw_ste_p, u8 *s_action, u8 vlans_num) { MLX5_SET(ste_single_action_remove_header_size_v1, s_action, action_id, DR_STE_V1_ACTION_ID_REMOVE_BY_SIZE); @@ -459,11 +327,8 @@ static void dr_ste_v1_set_pop_vlan(u8 *hw_ste_p, u8 *s_action, u8 vlans_num) dr_ste_v1_set_reparse(hw_ste_p); } -static void dr_ste_v1_set_encap_l3(u8 *hw_ste_p, - u8 *frst_s_action, - u8 *scnd_d_action, - u32 reformat_id, - int size) +void dr_ste_v1_set_encap_l3(u8 *hw_ste_p, u8 *frst_s_action, u8 *scnd_d_action, + u32 reformat_id, int size) { /* Remove L2 headers */ MLX5_SET(ste_single_action_remove_header_v1, frst_s_action, action_id, @@ -483,7 +348,7 @@ static void dr_ste_v1_set_encap_l3(u8 *hw_ste_p, dr_ste_v1_set_reparse(hw_ste_p); } -static void dr_ste_v1_set_rx_decap(u8 *hw_ste_p, u8 *s_action) +void dr_ste_v1_set_rx_decap(u8 *hw_ste_p, u8 *s_action) { MLX5_SET(ste_single_action_remove_header_v1, s_action, action_id, DR_STE_V1_ACTION_ID_REMOVE_HEADER_TO_HEADER); @@ -620,7 +485,8 @@ static void dr_ste_v1_arr_init_next_match_range(u8 **last_ste, dr_ste_v1_set_entry_type(*last_ste, DR_STE_V1_TYPE_MATCH_RANGES); } -void dr_ste_v1_set_actions_tx(struct mlx5dr_domain *dmn, +void dr_ste_v1_set_actions_tx(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_domain *dmn, u8 *action_type_set, u32 actions_caps, u8 *last_ste, @@ -640,7 +506,7 @@ void dr_ste_v1_set_actions_tx(struct mlx5dr_domain *dmn, last_ste, action); action_sz = DR_STE_ACTION_TRIPLE_SZ; } - dr_ste_v1_set_pop_vlan(last_ste, action, attr->vlans.count); + ste_ctx->set_pop_vlan(last_ste, action, attr->vlans.count); action_sz -= DR_STE_ACTION_SINGLE_SZ; action += DR_STE_ACTION_SINGLE_SZ; @@ -677,8 +543,8 @@ void dr_ste_v1_set_actions_tx(struct mlx5dr_domain *dmn, action_sz = DR_STE_ACTION_TRIPLE_SZ; allow_encap = true; } - dr_ste_v1_set_push_vlan(last_ste, action, - attr->vlans.headers[i]); + ste_ctx->set_push_vlan(last_ste, action, + attr->vlans.headers[i]); action_sz -= DR_STE_ACTION_DOUBLE_SZ; action += DR_STE_ACTION_DOUBLE_SZ; } @@ -691,9 +557,9 @@ void dr_ste_v1_set_actions_tx(struct mlx5dr_domain *dmn, action_sz = DR_STE_ACTION_TRIPLE_SZ; allow_encap = true; } - dr_ste_v1_set_encap(last_ste, action, - attr->reformat.id, - attr->reformat.size); + ste_ctx->set_encap(last_ste, action, + attr->reformat.id, + attr->reformat.size); action_sz -= DR_STE_ACTION_DOUBLE_SZ; action += DR_STE_ACTION_DOUBLE_SZ; } else if (action_type_set[DR_ACTION_TYP_L2_TO_TNL_L3]) { @@ -706,10 +572,10 @@ void dr_ste_v1_set_actions_tx(struct mlx5dr_domain *dmn, } d_action = action + DR_STE_ACTION_SINGLE_SZ; - dr_ste_v1_set_encap_l3(last_ste, - action, d_action, - attr->reformat.id, - attr->reformat.size); + ste_ctx->set_encap_l3(last_ste, + action, d_action, + attr->reformat.id, + attr->reformat.size); action_sz -= DR_STE_ACTION_TRIPLE_SZ; action += DR_STE_ACTION_TRIPLE_SZ; } else if (action_type_set[DR_ACTION_TYP_INSERT_HDR]) { @@ -776,7 +642,8 @@ void dr_ste_v1_set_actions_tx(struct mlx5dr_domain *dmn, dr_ste_v1_set_hit_addr(last_ste, attr->final_icm_addr, 1); } -void dr_ste_v1_set_actions_rx(struct mlx5dr_domain *dmn, +void dr_ste_v1_set_actions_rx(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_domain *dmn, u8 *action_type_set, u32 actions_caps, u8 *last_ste, @@ -799,7 +666,7 @@ void dr_ste_v1_set_actions_rx(struct mlx5dr_domain *dmn, allow_modify_hdr = false; allow_ctr = false; } else if (action_type_set[DR_ACTION_TYP_TNL_L2_TO_L2]) { - dr_ste_v1_set_rx_decap(last_ste, action); + ste_ctx->set_rx_decap(last_ste, action); action_sz -= DR_STE_ACTION_SINGLE_SZ; action += DR_STE_ACTION_SINGLE_SZ; allow_modify_hdr = false; @@ -827,7 +694,7 @@ void dr_ste_v1_set_actions_rx(struct mlx5dr_domain *dmn, action_sz = DR_STE_ACTION_TRIPLE_SZ; } - dr_ste_v1_set_pop_vlan(last_ste, action, attr->vlans.count); + ste_ctx->set_pop_vlan(last_ste, action, attr->vlans.count); action_sz -= DR_STE_ACTION_SINGLE_SZ; action += DR_STE_ACTION_SINGLE_SZ; allow_ctr = false; @@ -868,8 +735,8 @@ void dr_ste_v1_set_actions_rx(struct mlx5dr_domain *dmn, last_ste, action); action_sz = DR_STE_ACTION_TRIPLE_SZ; } - dr_ste_v1_set_push_vlan(last_ste, action, - attr->vlans.headers[i]); + ste_ctx->set_push_vlan(last_ste, action, + attr->vlans.headers[i]); action_sz -= DR_STE_ACTION_DOUBLE_SZ; action += DR_STE_ACTION_DOUBLE_SZ; } @@ -895,9 +762,9 @@ void dr_ste_v1_set_actions_rx(struct mlx5dr_domain *dmn, action = MLX5_ADDR_OF(ste_mask_and_match_v1, last_ste, action); action_sz = DR_STE_ACTION_TRIPLE_SZ; } - dr_ste_v1_set_encap(last_ste, action, - attr->reformat.id, - attr->reformat.size); + ste_ctx->set_encap(last_ste, action, + attr->reformat.id, + attr->reformat.size); action_sz -= DR_STE_ACTION_DOUBLE_SZ; action += DR_STE_ACTION_DOUBLE_SZ; allow_modify_hdr = false; @@ -912,10 +779,10 @@ void dr_ste_v1_set_actions_rx(struct mlx5dr_domain *dmn, d_action = action + DR_STE_ACTION_SINGLE_SZ; - dr_ste_v1_set_encap_l3(last_ste, - action, d_action, - attr->reformat.id, - attr->reformat.size); + ste_ctx->set_encap_l3(last_ste, + action, d_action, + attr->reformat.id, + attr->reformat.size); action_sz -= DR_STE_ACTION_TRIPLE_SZ; allow_modify_hdr = false; } else if (action_type_set[DR_ACTION_TYP_INSERT_HDR]) { @@ -1027,9 +894,6 @@ void dr_ste_v1_set_action_copy(u8 *d_action, MLX5_SET(ste_double_action_copy_v1, d_action, source_right_shifter, src_shifter); } -#define DR_STE_DECAP_L3_ACTION_NUM 8 -#define DR_STE_L2_HDR_MAX_SZ 20 - int dr_ste_v1_set_action_decap_l3_list(void *data, u32 data_sz, u8 *hw_action, @@ -2330,7 +2194,12 @@ static struct mlx5dr_ste_ctx ste_ctx_v1 = { .set_action_decap_l3_list = &dr_ste_v1_set_action_decap_l3_list, .alloc_modify_hdr_chunk = &dr_ste_v1_alloc_modify_hdr_ptrn_arg, .dealloc_modify_hdr_chunk = &dr_ste_v1_free_modify_hdr_ptrn_arg, - + /* Actions bit set */ + .set_encap = &dr_ste_v1_set_encap, + .set_push_vlan = &dr_ste_v1_set_push_vlan, + .set_pop_vlan = &dr_ste_v1_set_pop_vlan, + .set_rx_decap = &dr_ste_v1_set_rx_decap, + .set_encap_l3 = &dr_ste_v1_set_encap_l3, /* Send */ .prepare_for_postsend = &dr_ste_v1_prepare_for_postsend, }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste_v1.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste_v1.h index e2fc698670880..a8d9e308d3392 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste_v1.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste_v1.h @@ -7,6 +7,138 @@ #include "dr_types.h" #include "dr_ste.h" +#define DR_STE_DECAP_L3_ACTION_NUM 8 +#define DR_STE_L2_HDR_MAX_SZ 20 +#define DR_STE_CALC_DFNR_TYPE(lookup_type, inner) \ + ((inner) ? DR_STE_V1_LU_TYPE_##lookup_type##_I : \ + DR_STE_V1_LU_TYPE_##lookup_type##_O) + +enum dr_ste_v1_entry_format { + DR_STE_V1_TYPE_BWC_BYTE = 0x0, + DR_STE_V1_TYPE_BWC_DW = 0x1, + DR_STE_V1_TYPE_MATCH = 0x2, + DR_STE_V1_TYPE_MATCH_RANGES = 0x7, +}; + +/* Lookup type is built from 2B: [ Definer mode 1B ][ Definer index 1B ] */ +enum { + DR_STE_V1_LU_TYPE_NOP = 0x0000, + DR_STE_V1_LU_TYPE_ETHL2_TNL = 0x0002, + DR_STE_V1_LU_TYPE_IBL3_EXT = 0x0102, + DR_STE_V1_LU_TYPE_ETHL2_O = 0x0003, + DR_STE_V1_LU_TYPE_IBL4 = 0x0103, + DR_STE_V1_LU_TYPE_ETHL2_I = 0x0004, + DR_STE_V1_LU_TYPE_SRC_QP_GVMI = 0x0104, + DR_STE_V1_LU_TYPE_ETHL2_SRC_O = 0x0005, + DR_STE_V1_LU_TYPE_ETHL2_HEADERS_O = 0x0105, + DR_STE_V1_LU_TYPE_ETHL2_SRC_I = 0x0006, + DR_STE_V1_LU_TYPE_ETHL2_HEADERS_I = 0x0106, + DR_STE_V1_LU_TYPE_ETHL3_IPV4_5_TUPLE_O = 0x0007, + DR_STE_V1_LU_TYPE_IPV6_DES_O = 0x0107, + DR_STE_V1_LU_TYPE_ETHL3_IPV4_5_TUPLE_I = 0x0008, + DR_STE_V1_LU_TYPE_IPV6_DES_I = 0x0108, + DR_STE_V1_LU_TYPE_ETHL4_O = 0x0009, + DR_STE_V1_LU_TYPE_IPV6_SRC_O = 0x0109, + DR_STE_V1_LU_TYPE_ETHL4_I = 0x000a, + DR_STE_V1_LU_TYPE_IPV6_SRC_I = 0x010a, + DR_STE_V1_LU_TYPE_ETHL2_SRC_DST_O = 0x000b, + DR_STE_V1_LU_TYPE_MPLS_O = 0x010b, + DR_STE_V1_LU_TYPE_ETHL2_SRC_DST_I = 0x000c, + DR_STE_V1_LU_TYPE_MPLS_I = 0x010c, + DR_STE_V1_LU_TYPE_ETHL3_IPV4_MISC_O = 0x000d, + DR_STE_V1_LU_TYPE_GRE = 0x010d, + DR_STE_V1_LU_TYPE_FLEX_PARSER_TNL_HEADER = 0x000e, + DR_STE_V1_LU_TYPE_GENERAL_PURPOSE = 0x010e, + DR_STE_V1_LU_TYPE_ETHL3_IPV4_MISC_I = 0x000f, + DR_STE_V1_LU_TYPE_STEERING_REGISTERS_0 = 0x010f, + DR_STE_V1_LU_TYPE_STEERING_REGISTERS_1 = 0x0110, + DR_STE_V1_LU_TYPE_FLEX_PARSER_OK = 0x0011, + DR_STE_V1_LU_TYPE_FLEX_PARSER_0 = 0x0111, + DR_STE_V1_LU_TYPE_FLEX_PARSER_1 = 0x0112, + DR_STE_V1_LU_TYPE_ETHL4_MISC_O = 0x0113, + DR_STE_V1_LU_TYPE_ETHL4_MISC_I = 0x0114, + DR_STE_V1_LU_TYPE_INVALID = 0x00ff, + DR_STE_V1_LU_TYPE_DONT_CARE = MLX5DR_STE_LU_TYPE_DONT_CARE, +}; + +enum dr_ste_v1_header_anchors { + DR_STE_HEADER_ANCHOR_START_OUTER = 0x00, + DR_STE_HEADER_ANCHOR_1ST_VLAN = 0x02, + DR_STE_HEADER_ANCHOR_IPV6_IPV4 = 0x07, + DR_STE_HEADER_ANCHOR_INNER_MAC = 0x13, + DR_STE_HEADER_ANCHOR_INNER_IPV6_IPV4 = 0x19, +}; + +enum dr_ste_v1_action_size { + DR_STE_ACTION_SINGLE_SZ = 4, + DR_STE_ACTION_DOUBLE_SZ = 8, + DR_STE_ACTION_TRIPLE_SZ = 12, +}; + +enum dr_ste_v1_action_insert_ptr_attr { + DR_STE_V1_ACTION_INSERT_PTR_ATTR_NONE = 0, /* Regular push header (e.g. push vlan) */ + DR_STE_V1_ACTION_INSERT_PTR_ATTR_ENCAP = 1, /* Encapsulation / Tunneling */ + DR_STE_V1_ACTION_INSERT_PTR_ATTR_ESP = 2, /* IPsec */ +}; + +enum dr_ste_v1_action_id { + DR_STE_V1_ACTION_ID_NOP = 0x00, + DR_STE_V1_ACTION_ID_COPY = 0x05, + DR_STE_V1_ACTION_ID_SET = 0x06, + DR_STE_V1_ACTION_ID_ADD = 0x07, + DR_STE_V1_ACTION_ID_REMOVE_BY_SIZE = 0x08, + DR_STE_V1_ACTION_ID_REMOVE_HEADER_TO_HEADER = 0x09, + DR_STE_V1_ACTION_ID_INSERT_INLINE = 0x0a, + DR_STE_V1_ACTION_ID_INSERT_POINTER = 0x0b, + DR_STE_V1_ACTION_ID_FLOW_TAG = 0x0c, + DR_STE_V1_ACTION_ID_QUEUE_ID_SEL = 0x0d, + DR_STE_V1_ACTION_ID_ACCELERATED_LIST = 0x0e, + DR_STE_V1_ACTION_ID_MODIFY_LIST = 0x0f, + DR_STE_V1_ACTION_ID_ASO = 0x12, + DR_STE_V1_ACTION_ID_TRAILER = 0x13, + DR_STE_V1_ACTION_ID_COUNTER_ID = 0x14, + DR_STE_V1_ACTION_ID_MAX = 0x21, + /* use for special cases */ + DR_STE_V1_ACTION_ID_SPECIAL_ENCAP_L3 = 0x22, +}; + +enum { + DR_STE_V1_ACTION_MDFY_FLD_L2_OUT_0 = 0x00, + DR_STE_V1_ACTION_MDFY_FLD_L2_OUT_1 = 0x01, + DR_STE_V1_ACTION_MDFY_FLD_L2_OUT_2 = 0x02, + DR_STE_V1_ACTION_MDFY_FLD_SRC_L2_OUT_0 = 0x08, + DR_STE_V1_ACTION_MDFY_FLD_SRC_L2_OUT_1 = 0x09, + DR_STE_V1_ACTION_MDFY_FLD_L3_OUT_0 = 0x0e, + DR_STE_V1_ACTION_MDFY_FLD_L4_OUT_0 = 0x18, + DR_STE_V1_ACTION_MDFY_FLD_L4_OUT_1 = 0x19, + DR_STE_V1_ACTION_MDFY_FLD_IPV4_OUT_0 = 0x40, + DR_STE_V1_ACTION_MDFY_FLD_IPV4_OUT_1 = 0x41, + DR_STE_V1_ACTION_MDFY_FLD_IPV6_DST_OUT_0 = 0x44, + DR_STE_V1_ACTION_MDFY_FLD_IPV6_DST_OUT_1 = 0x45, + DR_STE_V1_ACTION_MDFY_FLD_IPV6_DST_OUT_2 = 0x46, + DR_STE_V1_ACTION_MDFY_FLD_IPV6_DST_OUT_3 = 0x47, + DR_STE_V1_ACTION_MDFY_FLD_IPV6_SRC_OUT_0 = 0x4c, + DR_STE_V1_ACTION_MDFY_FLD_IPV6_SRC_OUT_1 = 0x4d, + DR_STE_V1_ACTION_MDFY_FLD_IPV6_SRC_OUT_2 = 0x4e, + DR_STE_V1_ACTION_MDFY_FLD_IPV6_SRC_OUT_3 = 0x4f, + DR_STE_V1_ACTION_MDFY_FLD_TCP_MISC_0 = 0x5e, + DR_STE_V1_ACTION_MDFY_FLD_TCP_MISC_1 = 0x5f, + DR_STE_V1_ACTION_MDFY_FLD_CFG_HDR_0_0 = 0x6f, + DR_STE_V1_ACTION_MDFY_FLD_CFG_HDR_0_1 = 0x70, + DR_STE_V1_ACTION_MDFY_FLD_METADATA_2_CQE = 0x7b, + DR_STE_V1_ACTION_MDFY_FLD_GNRL_PURPOSE = 0x7c, + DR_STE_V1_ACTION_MDFY_FLD_REGISTER_2_0 = 0x8c, + DR_STE_V1_ACTION_MDFY_FLD_REGISTER_2_1 = 0x8d, + DR_STE_V1_ACTION_MDFY_FLD_REGISTER_1_0 = 0x8e, + DR_STE_V1_ACTION_MDFY_FLD_REGISTER_1_1 = 0x8f, + DR_STE_V1_ACTION_MDFY_FLD_REGISTER_0_0 = 0x90, + DR_STE_V1_ACTION_MDFY_FLD_REGISTER_0_1 = 0x91, +}; + +enum dr_ste_v1_aso_ctx_type { + DR_STE_V1_ASO_CTX_TYPE_POLICERS = 0x2, +}; + bool dr_ste_v1_is_miss_addr_set(u8 *hw_ste_p); void dr_ste_v1_set_miss_addr(u8 *hw_ste_p, u64 miss_addr); u64 dr_ste_v1_get_miss_addr(u8 *hw_ste_p); @@ -17,11 +149,18 @@ u16 dr_ste_v1_get_next_lu_type(u8 *hw_ste_p); void dr_ste_v1_set_hit_addr(u8 *hw_ste_p, u64 icm_addr, u32 ht_size); void dr_ste_v1_init(u8 *hw_ste_p, u16 lu_type, bool is_rx, u16 gvmi); void dr_ste_v1_prepare_for_postsend(u8 *hw_ste_p, u32 ste_size); -void dr_ste_v1_set_actions_tx(struct mlx5dr_domain *dmn, u8 *action_type_set, - u32 actions_caps, u8 *last_ste, +void dr_ste_v1_set_reparse(u8 *hw_ste_p); +void dr_ste_v1_set_encap(u8 *hw_ste_p, u8 *d_action, u32 reformat_id, int size); +void dr_ste_v1_set_push_vlan(u8 *hw_ste_p, u8 *d_action, u32 vlan_hdr); +void dr_ste_v1_set_pop_vlan(u8 *hw_ste_p, u8 *s_action, u8 vlans_num); +void dr_ste_v1_set_encap_l3(u8 *hw_ste_p, u8 *frst_s_action, u8 *scnd_d_action, + u32 reformat_id, int size); +void dr_ste_v1_set_rx_decap(u8 *hw_ste_p, u8 *s_action); +void dr_ste_v1_set_actions_tx(struct mlx5dr_ste_ctx *ste_ctx, struct mlx5dr_domain *dmn, + u8 *action_type_set, u32 actions_caps, u8 *last_ste, struct mlx5dr_ste_actions_attr *attr, u32 *added_stes); -void dr_ste_v1_set_actions_rx(struct mlx5dr_domain *dmn, u8 *action_type_set, - u32 actions_caps, u8 *last_ste, +void dr_ste_v1_set_actions_rx(struct mlx5dr_ste_ctx *ste_ctx, struct mlx5dr_domain *dmn, + u8 *action_type_set, u32 actions_caps, u8 *last_ste, struct mlx5dr_ste_actions_attr *attr, u32 *added_stes); void dr_ste_v1_set_action_set(u8 *d_action, u8 hw_field, u8 shifter, u8 length, u32 data); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste_v2.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste_v2.c index 808b013cf48cb..0882dba0f64b9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste_v2.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste_v2.c @@ -2,167 +2,7 @@ /* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ #include "dr_ste_v1.h" - -enum { - DR_STE_V2_ACTION_MDFY_FLD_L2_OUT_0 = 0x00, - DR_STE_V2_ACTION_MDFY_FLD_L2_OUT_1 = 0x01, - DR_STE_V2_ACTION_MDFY_FLD_L2_OUT_2 = 0x02, - DR_STE_V2_ACTION_MDFY_FLD_SRC_L2_OUT_0 = 0x08, - DR_STE_V2_ACTION_MDFY_FLD_SRC_L2_OUT_1 = 0x09, - DR_STE_V2_ACTION_MDFY_FLD_L3_OUT_0 = 0x0e, - DR_STE_V2_ACTION_MDFY_FLD_L4_OUT_0 = 0x18, - DR_STE_V2_ACTION_MDFY_FLD_L4_OUT_1 = 0x19, - DR_STE_V2_ACTION_MDFY_FLD_IPV4_OUT_0 = 0x40, - DR_STE_V2_ACTION_MDFY_FLD_IPV4_OUT_1 = 0x41, - DR_STE_V2_ACTION_MDFY_FLD_IPV6_DST_OUT_0 = 0x44, - DR_STE_V2_ACTION_MDFY_FLD_IPV6_DST_OUT_1 = 0x45, - DR_STE_V2_ACTION_MDFY_FLD_IPV6_DST_OUT_2 = 0x46, - DR_STE_V2_ACTION_MDFY_FLD_IPV6_DST_OUT_3 = 0x47, - DR_STE_V2_ACTION_MDFY_FLD_IPV6_SRC_OUT_0 = 0x4c, - DR_STE_V2_ACTION_MDFY_FLD_IPV6_SRC_OUT_1 = 0x4d, - DR_STE_V2_ACTION_MDFY_FLD_IPV6_SRC_OUT_2 = 0x4e, - DR_STE_V2_ACTION_MDFY_FLD_IPV6_SRC_OUT_3 = 0x4f, - DR_STE_V2_ACTION_MDFY_FLD_TCP_MISC_0 = 0x5e, - DR_STE_V2_ACTION_MDFY_FLD_TCP_MISC_1 = 0x5f, - DR_STE_V2_ACTION_MDFY_FLD_CFG_HDR_0_0 = 0x6f, - DR_STE_V2_ACTION_MDFY_FLD_CFG_HDR_0_1 = 0x70, - DR_STE_V2_ACTION_MDFY_FLD_METADATA_2_CQE = 0x7b, - DR_STE_V2_ACTION_MDFY_FLD_GNRL_PURPOSE = 0x7c, - DR_STE_V2_ACTION_MDFY_FLD_REGISTER_2_0 = 0x90, - DR_STE_V2_ACTION_MDFY_FLD_REGISTER_2_1 = 0x91, - DR_STE_V2_ACTION_MDFY_FLD_REGISTER_1_0 = 0x92, - DR_STE_V2_ACTION_MDFY_FLD_REGISTER_1_1 = 0x93, - DR_STE_V2_ACTION_MDFY_FLD_REGISTER_0_0 = 0x94, - DR_STE_V2_ACTION_MDFY_FLD_REGISTER_0_1 = 0x95, -}; - -static const struct mlx5dr_ste_action_modify_field dr_ste_v2_action_modify_field_arr[] = { - [MLX5_ACTION_IN_FIELD_OUT_SMAC_47_16] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_SRC_L2_OUT_0, .start = 0, .end = 31, - }, - [MLX5_ACTION_IN_FIELD_OUT_SMAC_15_0] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_SRC_L2_OUT_1, .start = 16, .end = 31, - }, - [MLX5_ACTION_IN_FIELD_OUT_ETHERTYPE] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L2_OUT_1, .start = 0, .end = 15, - }, - [MLX5_ACTION_IN_FIELD_OUT_DMAC_47_16] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L2_OUT_0, .start = 0, .end = 31, - }, - [MLX5_ACTION_IN_FIELD_OUT_DMAC_15_0] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L2_OUT_1, .start = 16, .end = 31, - }, - [MLX5_ACTION_IN_FIELD_OUT_IP_DSCP] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L3_OUT_0, .start = 18, .end = 23, - }, - [MLX5_ACTION_IN_FIELD_OUT_TCP_FLAGS] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L4_OUT_1, .start = 16, .end = 24, - .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_TCP, - }, - [MLX5_ACTION_IN_FIELD_OUT_TCP_SPORT] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L4_OUT_0, .start = 16, .end = 31, - .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_TCP, - }, - [MLX5_ACTION_IN_FIELD_OUT_TCP_DPORT] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L4_OUT_0, .start = 0, .end = 15, - .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_TCP, - }, - [MLX5_ACTION_IN_FIELD_OUT_IP_TTL] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L3_OUT_0, .start = 8, .end = 15, - .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV4, - }, - [MLX5_ACTION_IN_FIELD_OUT_IPV6_HOPLIMIT] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L3_OUT_0, .start = 8, .end = 15, - .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, - }, - [MLX5_ACTION_IN_FIELD_OUT_UDP_SPORT] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L4_OUT_0, .start = 16, .end = 31, - .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_UDP, - }, - [MLX5_ACTION_IN_FIELD_OUT_UDP_DPORT] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L4_OUT_0, .start = 0, .end = 15, - .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_UDP, - }, - [MLX5_ACTION_IN_FIELD_OUT_SIPV6_127_96] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV6_SRC_OUT_0, .start = 0, .end = 31, - .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, - }, - [MLX5_ACTION_IN_FIELD_OUT_SIPV6_95_64] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV6_SRC_OUT_1, .start = 0, .end = 31, - .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, - }, - [MLX5_ACTION_IN_FIELD_OUT_SIPV6_63_32] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV6_SRC_OUT_2, .start = 0, .end = 31, - .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, - }, - [MLX5_ACTION_IN_FIELD_OUT_SIPV6_31_0] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV6_SRC_OUT_3, .start = 0, .end = 31, - .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, - }, - [MLX5_ACTION_IN_FIELD_OUT_DIPV6_127_96] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV6_DST_OUT_0, .start = 0, .end = 31, - .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, - }, - [MLX5_ACTION_IN_FIELD_OUT_DIPV6_95_64] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV6_DST_OUT_1, .start = 0, .end = 31, - .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, - }, - [MLX5_ACTION_IN_FIELD_OUT_DIPV6_63_32] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV6_DST_OUT_2, .start = 0, .end = 31, - .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, - }, - [MLX5_ACTION_IN_FIELD_OUT_DIPV6_31_0] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV6_DST_OUT_3, .start = 0, .end = 31, - .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, - }, - [MLX5_ACTION_IN_FIELD_OUT_SIPV4] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV4_OUT_0, .start = 0, .end = 31, - .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV4, - }, - [MLX5_ACTION_IN_FIELD_OUT_DIPV4] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV4_OUT_1, .start = 0, .end = 31, - .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV4, - }, - [MLX5_ACTION_IN_FIELD_METADATA_REG_A] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_GNRL_PURPOSE, .start = 0, .end = 31, - }, - [MLX5_ACTION_IN_FIELD_METADATA_REG_B] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_METADATA_2_CQE, .start = 0, .end = 31, - }, - [MLX5_ACTION_IN_FIELD_METADATA_REG_C_0] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_REGISTER_0_0, .start = 0, .end = 31, - }, - [MLX5_ACTION_IN_FIELD_METADATA_REG_C_1] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_REGISTER_0_1, .start = 0, .end = 31, - }, - [MLX5_ACTION_IN_FIELD_METADATA_REG_C_2] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_REGISTER_1_0, .start = 0, .end = 31, - }, - [MLX5_ACTION_IN_FIELD_METADATA_REG_C_3] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_REGISTER_1_1, .start = 0, .end = 31, - }, - [MLX5_ACTION_IN_FIELD_METADATA_REG_C_4] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_REGISTER_2_0, .start = 0, .end = 31, - }, - [MLX5_ACTION_IN_FIELD_METADATA_REG_C_5] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_REGISTER_2_1, .start = 0, .end = 31, - }, - [MLX5_ACTION_IN_FIELD_OUT_TCP_SEQ_NUM] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_TCP_MISC_0, .start = 0, .end = 31, - }, - [MLX5_ACTION_IN_FIELD_OUT_TCP_ACK_NUM] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_TCP_MISC_1, .start = 0, .end = 31, - }, - [MLX5_ACTION_IN_FIELD_OUT_FIRST_VID] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L2_OUT_2, .start = 0, .end = 15, - }, - [MLX5_ACTION_IN_FIELD_OUT_EMD_31_0] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_CFG_HDR_0_1, .start = 0, .end = 31, - }, - [MLX5_ACTION_IN_FIELD_OUT_EMD_47_32] = { - .hw_field = DR_STE_V2_ACTION_MDFY_FLD_CFG_HDR_0_0, .start = 0, .end = 15, - }, -}; +#include "dr_ste_v2.h" static struct mlx5dr_ste_ctx ste_ctx_v2 = { /* Builders */ @@ -223,7 +63,12 @@ static struct mlx5dr_ste_ctx ste_ctx_v2 = { .set_action_decap_l3_list = &dr_ste_v1_set_action_decap_l3_list, .alloc_modify_hdr_chunk = &dr_ste_v1_alloc_modify_hdr_ptrn_arg, .dealloc_modify_hdr_chunk = &dr_ste_v1_free_modify_hdr_ptrn_arg, - + /* Actions bit set */ + .set_encap = &dr_ste_v1_set_encap, + .set_push_vlan = &dr_ste_v1_set_push_vlan, + .set_pop_vlan = &dr_ste_v1_set_pop_vlan, + .set_rx_decap = &dr_ste_v1_set_rx_decap, + .set_encap_l3 = &dr_ste_v1_set_encap_l3, /* Send */ .prepare_for_postsend = &dr_ste_v1_prepare_for_postsend, }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste_v2.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste_v2.h new file mode 100644 index 0000000000000..d853fde49cfc6 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste_v2.h @@ -0,0 +1,168 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#ifndef _DR_STE_V2_ +#define _DR_STE_V2_ + +enum { + DR_STE_V2_ACTION_MDFY_FLD_L2_OUT_0 = 0x00, + DR_STE_V2_ACTION_MDFY_FLD_L2_OUT_1 = 0x01, + DR_STE_V2_ACTION_MDFY_FLD_L2_OUT_2 = 0x02, + DR_STE_V2_ACTION_MDFY_FLD_SRC_L2_OUT_0 = 0x08, + DR_STE_V2_ACTION_MDFY_FLD_SRC_L2_OUT_1 = 0x09, + DR_STE_V2_ACTION_MDFY_FLD_L3_OUT_0 = 0x0e, + DR_STE_V2_ACTION_MDFY_FLD_L4_OUT_0 = 0x18, + DR_STE_V2_ACTION_MDFY_FLD_L4_OUT_1 = 0x19, + DR_STE_V2_ACTION_MDFY_FLD_IPV4_OUT_0 = 0x40, + DR_STE_V2_ACTION_MDFY_FLD_IPV4_OUT_1 = 0x41, + DR_STE_V2_ACTION_MDFY_FLD_IPV6_DST_OUT_0 = 0x44, + DR_STE_V2_ACTION_MDFY_FLD_IPV6_DST_OUT_1 = 0x45, + DR_STE_V2_ACTION_MDFY_FLD_IPV6_DST_OUT_2 = 0x46, + DR_STE_V2_ACTION_MDFY_FLD_IPV6_DST_OUT_3 = 0x47, + DR_STE_V2_ACTION_MDFY_FLD_IPV6_SRC_OUT_0 = 0x4c, + DR_STE_V2_ACTION_MDFY_FLD_IPV6_SRC_OUT_1 = 0x4d, + DR_STE_V2_ACTION_MDFY_FLD_IPV6_SRC_OUT_2 = 0x4e, + DR_STE_V2_ACTION_MDFY_FLD_IPV6_SRC_OUT_3 = 0x4f, + DR_STE_V2_ACTION_MDFY_FLD_TCP_MISC_0 = 0x5e, + DR_STE_V2_ACTION_MDFY_FLD_TCP_MISC_1 = 0x5f, + DR_STE_V2_ACTION_MDFY_FLD_CFG_HDR_0_0 = 0x6f, + DR_STE_V2_ACTION_MDFY_FLD_CFG_HDR_0_1 = 0x70, + DR_STE_V2_ACTION_MDFY_FLD_METADATA_2_CQE = 0x7b, + DR_STE_V2_ACTION_MDFY_FLD_GNRL_PURPOSE = 0x7c, + DR_STE_V2_ACTION_MDFY_FLD_REGISTER_2_0 = 0x90, + DR_STE_V2_ACTION_MDFY_FLD_REGISTER_2_1 = 0x91, + DR_STE_V2_ACTION_MDFY_FLD_REGISTER_1_0 = 0x92, + DR_STE_V2_ACTION_MDFY_FLD_REGISTER_1_1 = 0x93, + DR_STE_V2_ACTION_MDFY_FLD_REGISTER_0_0 = 0x94, + DR_STE_V2_ACTION_MDFY_FLD_REGISTER_0_1 = 0x95, +}; + +static const struct mlx5dr_ste_action_modify_field dr_ste_v2_action_modify_field_arr[] = { + [MLX5_ACTION_IN_FIELD_OUT_SMAC_47_16] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_SRC_L2_OUT_0, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_SMAC_15_0] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_SRC_L2_OUT_1, .start = 16, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_ETHERTYPE] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L2_OUT_1, .start = 0, .end = 15, + }, + [MLX5_ACTION_IN_FIELD_OUT_DMAC_47_16] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L2_OUT_0, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_DMAC_15_0] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L2_OUT_1, .start = 16, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_IP_DSCP] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L3_OUT_0, .start = 18, .end = 23, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_FLAGS] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L4_OUT_1, .start = 16, .end = 24, + .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_TCP, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_SPORT] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L4_OUT_0, .start = 16, .end = 31, + .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_TCP, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_DPORT] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L4_OUT_0, .start = 0, .end = 15, + .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_TCP, + }, + [MLX5_ACTION_IN_FIELD_OUT_IP_TTL] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L3_OUT_0, .start = 8, .end = 15, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV4, + }, + [MLX5_ACTION_IN_FIELD_OUT_IPV6_HOPLIMIT] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L3_OUT_0, .start = 8, .end = 15, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_UDP_SPORT] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L4_OUT_0, .start = 16, .end = 31, + .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_UDP, + }, + [MLX5_ACTION_IN_FIELD_OUT_UDP_DPORT] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L4_OUT_0, .start = 0, .end = 15, + .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_UDP, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV6_127_96] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV6_SRC_OUT_0, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV6_95_64] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV6_SRC_OUT_1, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV6_63_32] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV6_SRC_OUT_2, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV6_31_0] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV6_SRC_OUT_3, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV6_127_96] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV6_DST_OUT_0, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV6_95_64] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV6_DST_OUT_1, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV6_63_32] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV6_DST_OUT_2, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV6_31_0] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV6_DST_OUT_3, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV4] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV4_OUT_0, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV4, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV4] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV4_OUT_1, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV4, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_A] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_GNRL_PURPOSE, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_B] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_METADATA_2_CQE, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_C_0] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_REGISTER_0_0, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_C_1] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_REGISTER_0_1, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_C_2] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_REGISTER_1_0, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_C_3] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_REGISTER_1_1, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_C_4] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_REGISTER_2_0, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_C_5] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_REGISTER_2_1, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_SEQ_NUM] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_TCP_MISC_0, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_ACK_NUM] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_TCP_MISC_1, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_FIRST_VID] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L2_OUT_2, .start = 0, .end = 15, + }, + [MLX5_ACTION_IN_FIELD_OUT_EMD_31_0] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_CFG_HDR_0_1, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_EMD_47_32] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_CFG_HDR_0_0, .start = 0, .end = 15, + }, +}; + +#endif /* _DR_STE_V2_ */ From 4d617b57574f8ac04c997bdf9127a4c703a5f1f0 Mon Sep 17 00:00:00 2001 From: Itamar Gozlan Date: Thu, 19 Dec 2024 19:58:39 +0200 Subject: [PATCH 0971/1450] net/mlx5: DR, add support for ConnectX-8 steering Add support for a new steering format version that is implemented by ConnectX-8. Except for several differences, the STEv3 is identical to STEv2, so for most callbacks STEv3 context struct will call STEv2 functions. Signed-off-by: Itamar Gozlan Signed-off-by: Yevgeny Kliteynik Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241219175841.1094544-10-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/Makefile | 1 + .../mlx5/core/steering/sws/dr_domain.c | 2 +- .../mellanox/mlx5/core/steering/sws/dr_ste.c | 2 + .../mellanox/mlx5/core/steering/sws/dr_ste.h | 1 + .../mlx5/core/steering/sws/dr_ste_v3.c | 221 ++++++++++++++++++ .../mlx5/core/steering/sws/mlx5_ifc_dr.h | 40 ++++ .../mellanox/mlx5/core/steering/sws/mlx5dr.h | 2 +- 7 files changed, 267 insertions(+), 2 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste_v3.c diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index 79fe09de0a9fd..10a763e668ed6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -123,6 +123,7 @@ mlx5_core-$(CONFIG_MLX5_SW_STEERING) += steering/sws/dr_domain.o \ steering/sws/dr_ste_v0.o \ steering/sws/dr_ste_v1.o \ steering/sws/dr_ste_v2.o \ + steering/sws/dr_ste_v3.o \ steering/sws/dr_cmd.o \ steering/sws/dr_fw.o \ steering/sws/dr_action.o \ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_domain.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_domain.c index 49f22cad92bfd..60cb4527588a5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_domain.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_domain.c @@ -8,7 +8,7 @@ #define DR_DOMAIN_SW_STEERING_SUPPORTED(dmn, dmn_type) \ ((dmn)->info.caps.dmn_type##_sw_owner || \ ((dmn)->info.caps.dmn_type##_sw_owner_v2 && \ - (dmn)->info.caps.sw_format_ver <= MLX5_STEERING_FORMAT_CONNECTX_7)) + (dmn)->info.caps.sw_format_ver <= MLX5_STEERING_FORMAT_CONNECTX_8)) bool mlx5dr_domain_is_support_ptrn_arg(struct mlx5dr_domain *dmn) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste.c index 01ba8eae2983d..c8b8ff80c7c70 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste.c @@ -1458,6 +1458,8 @@ struct mlx5dr_ste_ctx *mlx5dr_ste_get_ctx(u8 version) return mlx5dr_ste_get_ctx_v1(); else if (version == MLX5_STEERING_FORMAT_CONNECTX_7) return mlx5dr_ste_get_ctx_v2(); + else if (version == MLX5_STEERING_FORMAT_CONNECTX_8) + return mlx5dr_ste_get_ctx_v3(); return NULL; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste.h index b6ec8d30d9903..5f409dc30aca8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste.h @@ -217,5 +217,6 @@ struct mlx5dr_ste_ctx { struct mlx5dr_ste_ctx *mlx5dr_ste_get_ctx_v0(void); struct mlx5dr_ste_ctx *mlx5dr_ste_get_ctx_v1(void); struct mlx5dr_ste_ctx *mlx5dr_ste_get_ctx_v2(void); +struct mlx5dr_ste_ctx *mlx5dr_ste_get_ctx_v3(void); #endif /* _DR_STE_ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste_v3.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste_v3.c new file mode 100644 index 0000000000000..cc60ce1d274ef --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_ste_v3.c @@ -0,0 +1,221 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#include "dr_ste_v1.h" +#include "dr_ste_v2.h" + +static void dr_ste_v3_set_encap(u8 *hw_ste_p, u8 *d_action, + u32 reformat_id, int size) +{ + MLX5_SET(ste_double_action_insert_with_ptr_v3, d_action, action_id, + DR_STE_V1_ACTION_ID_INSERT_POINTER); + /* The hardware expects here size in words (2 byte) */ + MLX5_SET(ste_double_action_insert_with_ptr_v3, d_action, size, size / 2); + MLX5_SET(ste_double_action_insert_with_ptr_v3, d_action, pointer, reformat_id); + MLX5_SET(ste_double_action_insert_with_ptr_v3, d_action, attributes, + DR_STE_V1_ACTION_INSERT_PTR_ATTR_ENCAP); + dr_ste_v1_set_reparse(hw_ste_p); +} + +static void dr_ste_v3_set_push_vlan(u8 *ste, u8 *d_action, + u32 vlan_hdr) +{ + MLX5_SET(ste_double_action_insert_with_inline_v3, d_action, action_id, + DR_STE_V1_ACTION_ID_INSERT_INLINE); + /* The hardware expects here offset to vlan header in words (2 byte) */ + MLX5_SET(ste_double_action_insert_with_inline_v3, d_action, start_offset, + HDR_LEN_L2_MACS >> 1); + MLX5_SET(ste_double_action_insert_with_inline_v3, d_action, inline_data, vlan_hdr); + dr_ste_v1_set_reparse(ste); +} + +static void dr_ste_v3_set_pop_vlan(u8 *hw_ste_p, u8 *s_action, + u8 vlans_num) +{ + MLX5_SET(ste_single_action_remove_header_size_v3, s_action, + action_id, DR_STE_V1_ACTION_ID_REMOVE_BY_SIZE); + MLX5_SET(ste_single_action_remove_header_size_v3, s_action, + start_anchor, DR_STE_HEADER_ANCHOR_1ST_VLAN); + /* The hardware expects here size in words (2 byte) */ + MLX5_SET(ste_single_action_remove_header_size_v3, s_action, + remove_size, (HDR_LEN_L2_VLAN >> 1) * vlans_num); + + dr_ste_v1_set_reparse(hw_ste_p); +} + +static void dr_ste_v3_set_encap_l3(u8 *hw_ste_p, + u8 *frst_s_action, + u8 *scnd_d_action, + u32 reformat_id, + int size) +{ + /* Remove L2 headers */ + MLX5_SET(ste_single_action_remove_header_v3, frst_s_action, action_id, + DR_STE_V1_ACTION_ID_REMOVE_HEADER_TO_HEADER); + MLX5_SET(ste_single_action_remove_header_v3, frst_s_action, end_anchor, + DR_STE_HEADER_ANCHOR_IPV6_IPV4); + + /* Encapsulate with given reformat ID */ + MLX5_SET(ste_double_action_insert_with_ptr_v3, scnd_d_action, action_id, + DR_STE_V1_ACTION_ID_INSERT_POINTER); + /* The hardware expects here size in words (2 byte) */ + MLX5_SET(ste_double_action_insert_with_ptr_v3, scnd_d_action, size, size / 2); + MLX5_SET(ste_double_action_insert_with_ptr_v3, scnd_d_action, pointer, reformat_id); + MLX5_SET(ste_double_action_insert_with_ptr_v3, scnd_d_action, attributes, + DR_STE_V1_ACTION_INSERT_PTR_ATTR_ENCAP); + + dr_ste_v1_set_reparse(hw_ste_p); +} + +static void dr_ste_v3_set_rx_decap(u8 *hw_ste_p, u8 *s_action) +{ + MLX5_SET(ste_single_action_remove_header_v3, s_action, action_id, + DR_STE_V1_ACTION_ID_REMOVE_HEADER_TO_HEADER); + MLX5_SET(ste_single_action_remove_header_v3, s_action, decap, 1); + MLX5_SET(ste_single_action_remove_header_v3, s_action, vni_to_cqe, 1); + MLX5_SET(ste_single_action_remove_header_v3, s_action, end_anchor, + DR_STE_HEADER_ANCHOR_INNER_MAC); + + dr_ste_v1_set_reparse(hw_ste_p); +} + +static int +dr_ste_v3_set_action_decap_l3_list(void *data, u32 data_sz, + u8 *hw_action, u32 hw_action_sz, + uint16_t *used_hw_action_num) +{ + u8 padded_data[DR_STE_L2_HDR_MAX_SZ] = {}; + void *data_ptr = padded_data; + u16 used_actions = 0; + u32 inline_data_sz; + u32 i; + + if (hw_action_sz / DR_STE_ACTION_DOUBLE_SZ < DR_STE_DECAP_L3_ACTION_NUM) + return -EINVAL; + + inline_data_sz = + MLX5_FLD_SZ_BYTES(ste_double_action_insert_with_inline_v3, inline_data); + + /* Add an alignment padding */ + memcpy(padded_data + data_sz % inline_data_sz, data, data_sz); + + /* Remove L2L3 outer headers */ + MLX5_SET(ste_single_action_remove_header_v3, hw_action, action_id, + DR_STE_V1_ACTION_ID_REMOVE_HEADER_TO_HEADER); + MLX5_SET(ste_single_action_remove_header_v3, hw_action, decap, 1); + MLX5_SET(ste_single_action_remove_header_v3, hw_action, vni_to_cqe, 1); + MLX5_SET(ste_single_action_remove_header_v3, hw_action, end_anchor, + DR_STE_HEADER_ANCHOR_INNER_IPV6_IPV4); + hw_action += DR_STE_ACTION_DOUBLE_SZ; + used_actions++; /* Remove and NOP are a single double action */ + + /* Point to the last dword of the header */ + data_ptr += (data_sz / inline_data_sz) * inline_data_sz; + + /* Add the new header using inline action 4Byte at a time, the header + * is added in reversed order to the beginning of the packet to avoid + * incorrect parsing by the HW. Since header is 14B or 18B an extra + * two bytes are padded and later removed. + */ + for (i = 0; i < data_sz / inline_data_sz + 1; i++) { + void *addr_inline; + + MLX5_SET(ste_double_action_insert_with_inline_v3, hw_action, action_id, + DR_STE_V1_ACTION_ID_INSERT_INLINE); + /* The hardware expects here offset to words (2 bytes) */ + MLX5_SET(ste_double_action_insert_with_inline_v3, hw_action, start_offset, 0); + + /* Copy bytes one by one to avoid endianness problem */ + addr_inline = MLX5_ADDR_OF(ste_double_action_insert_with_inline_v3, + hw_action, inline_data); + memcpy(addr_inline, data_ptr - i * inline_data_sz, inline_data_sz); + hw_action += DR_STE_ACTION_DOUBLE_SZ; + used_actions++; + } + + /* Remove first 2 extra bytes */ + MLX5_SET(ste_single_action_remove_header_size_v3, hw_action, action_id, + DR_STE_V1_ACTION_ID_REMOVE_BY_SIZE); + MLX5_SET(ste_single_action_remove_header_size_v3, hw_action, start_offset, 0); + /* The hardware expects here size in words (2 bytes) */ + MLX5_SET(ste_single_action_remove_header_size_v3, hw_action, remove_size, 1); + used_actions++; + + *used_hw_action_num = used_actions; + + return 0; +} + +static struct mlx5dr_ste_ctx ste_ctx_v3 = { + /* Builders */ + .build_eth_l2_src_dst_init = &dr_ste_v1_build_eth_l2_src_dst_init, + .build_eth_l3_ipv6_src_init = &dr_ste_v1_build_eth_l3_ipv6_src_init, + .build_eth_l3_ipv6_dst_init = &dr_ste_v1_build_eth_l3_ipv6_dst_init, + .build_eth_l3_ipv4_5_tuple_init = &dr_ste_v1_build_eth_l3_ipv4_5_tuple_init, + .build_eth_l2_src_init = &dr_ste_v1_build_eth_l2_src_init, + .build_eth_l2_dst_init = &dr_ste_v1_build_eth_l2_dst_init, + .build_eth_l2_tnl_init = &dr_ste_v1_build_eth_l2_tnl_init, + .build_eth_l3_ipv4_misc_init = &dr_ste_v1_build_eth_l3_ipv4_misc_init, + .build_eth_ipv6_l3_l4_init = &dr_ste_v1_build_eth_ipv6_l3_l4_init, + .build_mpls_init = &dr_ste_v1_build_mpls_init, + .build_tnl_gre_init = &dr_ste_v1_build_tnl_gre_init, + .build_tnl_mpls_init = &dr_ste_v1_build_tnl_mpls_init, + .build_tnl_mpls_over_udp_init = &dr_ste_v1_build_tnl_mpls_over_udp_init, + .build_tnl_mpls_over_gre_init = &dr_ste_v1_build_tnl_mpls_over_gre_init, + .build_icmp_init = &dr_ste_v1_build_icmp_init, + .build_general_purpose_init = &dr_ste_v1_build_general_purpose_init, + .build_eth_l4_misc_init = &dr_ste_v1_build_eth_l4_misc_init, + .build_tnl_vxlan_gpe_init = &dr_ste_v1_build_flex_parser_tnl_vxlan_gpe_init, + .build_tnl_geneve_init = &dr_ste_v1_build_flex_parser_tnl_geneve_init, + .build_tnl_geneve_tlv_opt_init = &dr_ste_v1_build_flex_parser_tnl_geneve_tlv_opt_init, + .build_tnl_geneve_tlv_opt_exist_init = + &dr_ste_v1_build_flex_parser_tnl_geneve_tlv_opt_exist_init, + .build_register_0_init = &dr_ste_v1_build_register_0_init, + .build_register_1_init = &dr_ste_v1_build_register_1_init, + .build_src_gvmi_qpn_init = &dr_ste_v1_build_src_gvmi_qpn_init, + .build_flex_parser_0_init = &dr_ste_v1_build_flex_parser_0_init, + .build_flex_parser_1_init = &dr_ste_v1_build_flex_parser_1_init, + .build_tnl_gtpu_init = &dr_ste_v1_build_flex_parser_tnl_gtpu_init, + .build_tnl_header_0_1_init = &dr_ste_v1_build_tnl_header_0_1_init, + .build_tnl_gtpu_flex_parser_0_init = &dr_ste_v1_build_tnl_gtpu_flex_parser_0_init, + .build_tnl_gtpu_flex_parser_1_init = &dr_ste_v1_build_tnl_gtpu_flex_parser_1_init, + + /* Getters and Setters */ + .ste_init = &dr_ste_v1_init, + .set_next_lu_type = &dr_ste_v1_set_next_lu_type, + .get_next_lu_type = &dr_ste_v1_get_next_lu_type, + .is_miss_addr_set = &dr_ste_v1_is_miss_addr_set, + .set_miss_addr = &dr_ste_v1_set_miss_addr, + .get_miss_addr = &dr_ste_v1_get_miss_addr, + .set_hit_addr = &dr_ste_v1_set_hit_addr, + .set_byte_mask = &dr_ste_v1_set_byte_mask, + .get_byte_mask = &dr_ste_v1_get_byte_mask, + + /* Actions */ + .actions_caps = DR_STE_CTX_ACTION_CAP_TX_POP | + DR_STE_CTX_ACTION_CAP_RX_PUSH | + DR_STE_CTX_ACTION_CAP_RX_ENCAP, + .set_actions_rx = &dr_ste_v1_set_actions_rx, + .set_actions_tx = &dr_ste_v1_set_actions_tx, + .modify_field_arr_sz = ARRAY_SIZE(dr_ste_v2_action_modify_field_arr), + .modify_field_arr = dr_ste_v2_action_modify_field_arr, + .set_action_set = &dr_ste_v1_set_action_set, + .set_action_add = &dr_ste_v1_set_action_add, + .set_action_copy = &dr_ste_v1_set_action_copy, + .set_action_decap_l3_list = &dr_ste_v3_set_action_decap_l3_list, + .alloc_modify_hdr_chunk = &dr_ste_v1_alloc_modify_hdr_ptrn_arg, + .dealloc_modify_hdr_chunk = &dr_ste_v1_free_modify_hdr_ptrn_arg, + /* Actions bit set */ + .set_encap = &dr_ste_v3_set_encap, + .set_push_vlan = &dr_ste_v3_set_push_vlan, + .set_pop_vlan = &dr_ste_v3_set_pop_vlan, + .set_rx_decap = &dr_ste_v3_set_rx_decap, + .set_encap_l3 = &dr_ste_v3_set_encap_l3, + /* Send */ + .prepare_for_postsend = &dr_ste_v1_prepare_for_postsend, +}; + +struct mlx5dr_ste_ctx *mlx5dr_ste_get_ctx_v3(void) +{ + return &ste_ctx_v3; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/mlx5_ifc_dr.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/mlx5_ifc_dr.h index fb078fa0f0cc8..898c3618ff263 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/mlx5_ifc_dr.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/mlx5_ifc_dr.h @@ -600,4 +600,44 @@ struct mlx5_ifc_ste_double_action_aso_v1_bits { }; }; +struct mlx5_ifc_ste_single_action_remove_header_v3_bits { + u8 action_id[0x8]; + u8 start_anchor[0x7]; + u8 end_anchor[0x7]; + u8 reserved_at_16[0x1]; + u8 outer_l4_remove[0x1]; + u8 reserved_at_18[0x4]; + u8 decap[0x1]; + u8 vni_to_cqe[0x1]; + u8 qos_profile[0x2]; +}; + +struct mlx5_ifc_ste_single_action_remove_header_size_v3_bits { + u8 action_id[0x8]; + u8 start_anchor[0x7]; + u8 start_offset[0x8]; + u8 outer_l4_remove[0x1]; + u8 reserved_at_18[0x2]; + u8 remove_size[0x6]; +}; + +struct mlx5_ifc_ste_double_action_insert_with_inline_v3_bits { + u8 action_id[0x8]; + u8 start_anchor[0x7]; + u8 start_offset[0x8]; + u8 reserved_at_17[0x9]; + + u8 inline_data[0x20]; +}; + +struct mlx5_ifc_ste_double_action_insert_with_ptr_v3_bits { + u8 action_id[0x8]; + u8 start_anchor[0x7]; + u8 start_offset[0x8]; + u8 size[0x6]; + u8 attributes[0x3]; + + u8 pointer[0x20]; +}; + #endif /* MLX5_IFC_DR_H */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/mlx5dr.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/mlx5dr.h index 3ac7dc67509fc..0bb3724c10c26 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/mlx5dr.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/mlx5dr.h @@ -160,7 +160,7 @@ mlx5dr_is_supported(struct mlx5_core_dev *dev) (MLX5_CAP_ESW_FLOWTABLE_FDB(dev, sw_owner) || (MLX5_CAP_ESW_FLOWTABLE_FDB(dev, sw_owner_v2) && (MLX5_CAP_GEN(dev, steering_format_version) <= - MLX5_STEERING_FORMAT_CONNECTX_7))); + MLX5_STEERING_FORMAT_CONNECTX_8))); } /* buddy functions & structure */ From f440d69a21f75af1acfdad16d3804750a360613c Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Thu, 19 Dec 2024 19:58:40 +0200 Subject: [PATCH 0972/1450] net/mlx5: Remove PTM support log message The absence of Precision Time Measurement support should not emit a message, as it can be misleading in contexts where PTM is not required. Remove the log message indicating the lack of PCIe PTM support. Signed-off-by: Carolina Jubran Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241219175841.1094544-11-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 869bfecdd8ffa..a108d8c726f82 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -945,9 +945,7 @@ static int mlx5_pci_init(struct mlx5_core_dev *dev, struct pci_dev *pdev, mlx5_pci_vsc_init(dev); - err = pci_enable_ptm(pdev, NULL); - if (err) - mlx5_core_info(dev, "PTM is not supported by PCIe\n"); + pci_enable_ptm(pdev, NULL); return 0; From ef1749d5066984881b3af7a3007c2af91668dd5b Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Thu, 19 Dec 2024 19:58:41 +0200 Subject: [PATCH 0973/1450] net/mlx5: fs, Add support for RDMA RX steering over IB link layer Relax the capability check for creating the RDMA RX steering domain by considering only the capabilities reported by the firmware as necessary for its creation, which in turn allows RDMA RX creation over devices with IB link layer as well. The table_miss_action_domain capability is required only for a specific priority, which is handled in mlx5_rdma_enable_roce_steering(). The additional capability check for this case is already in place. Signed-off-by: Patrisious Haddad Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241219175841.1094544-12-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 3 ++- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c index 6bf0aade69d75..ae20c061e0fb2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -217,7 +217,8 @@ static int mlx5_cmd_update_root_ft(struct mlx5_flow_root_namespace *ns, int err; if ((MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_IB) && - underlay_qpn == 0) + underlay_qpn == 0 && + (ft->type != FS_FT_RDMA_RX && ft->type != FS_FT_RDMA_TX)) return 0; if (ft->type == FS_FT_FDB && diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index ae1a5705b26dd..41b5e98a04952 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -3665,8 +3665,7 @@ int mlx5_fs_core_init(struct mlx5_core_dev *dev) goto err; } - if (MLX5_CAP_FLOWTABLE_RDMA_RX(dev, ft_support) && - MLX5_CAP_FLOWTABLE_RDMA_RX(dev, table_miss_action_domain)) { + if (MLX5_CAP_FLOWTABLE_RDMA_RX(dev, ft_support)) { err = init_rdma_rx_root_ns(steering); if (err) goto err; From 7d0bf493b1352ba269f5fefe02dda2b06013f8b5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 19 Dec 2024 18:52:32 -0800 Subject: [PATCH 0974/1450] eth: fbnic: reorder ethtool code Define ethtool callback handlers in order in which they are defined in the ops struct. It doesn't really matter what the order is, but it's good to have an order. Reviewed-by: Larysa Zaremba Link: https://patch.msgid.link/20241220025241.1522781-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- .../net/ethernet/meta/fbnic/fbnic_ethtool.c | 158 +++++++++--------- 1 file changed, 79 insertions(+), 79 deletions(-) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c index cc8ca94529ca1..777e083acae9b 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c @@ -40,35 +40,6 @@ static const struct fbnic_stat fbnic_gstrings_hw_stats[] = { #define FBNIC_HW_FIXED_STATS_LEN ARRAY_SIZE(fbnic_gstrings_hw_stats) #define FBNIC_HW_STATS_LEN FBNIC_HW_FIXED_STATS_LEN -static int -fbnic_get_ts_info(struct net_device *netdev, - struct kernel_ethtool_ts_info *tsinfo) -{ - struct fbnic_net *fbn = netdev_priv(netdev); - - tsinfo->phc_index = ptp_clock_index(fbn->fbd->ptp); - - tsinfo->so_timestamping = - SOF_TIMESTAMPING_TX_SOFTWARE | - SOF_TIMESTAMPING_TX_HARDWARE | - SOF_TIMESTAMPING_RX_HARDWARE | - SOF_TIMESTAMPING_RAW_HARDWARE; - - tsinfo->tx_types = - BIT(HWTSTAMP_TX_OFF) | - BIT(HWTSTAMP_TX_ON); - - tsinfo->rx_filters = - BIT(HWTSTAMP_FILTER_NONE) | - BIT(HWTSTAMP_FILTER_PTP_V1_L4_EVENT) | - BIT(HWTSTAMP_FILTER_PTP_V2_L4_EVENT) | - BIT(HWTSTAMP_FILTER_PTP_V2_L2_EVENT) | - BIT(HWTSTAMP_FILTER_PTP_V2_EVENT) | - BIT(HWTSTAMP_FILTER_ALL); - - return 0; -} - static void fbnic_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo) { @@ -79,10 +50,19 @@ fbnic_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo) sizeof(drvinfo->fw_version)); } -static void fbnic_set_counter(u64 *stat, struct fbnic_stat_counter *counter) +static int fbnic_get_regs_len(struct net_device *netdev) { - if (counter->reported) - *stat = counter->value; + struct fbnic_net *fbn = netdev_priv(netdev); + + return fbnic_csr_regs_len(fbn->fbd) * sizeof(u32); +} + +static void fbnic_get_regs(struct net_device *netdev, + struct ethtool_regs *regs, void *data) +{ + struct fbnic_net *fbn = netdev_priv(netdev); + + fbnic_csr_get_regs(fbn->fbd, data, ®s->version); } static void fbnic_get_strings(struct net_device *dev, u32 sset, u8 *data) @@ -97,6 +77,21 @@ static void fbnic_get_strings(struct net_device *dev, u32 sset, u8 *data) } } +static void fbnic_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats *stats, u64 *data) +{ + struct fbnic_net *fbn = netdev_priv(dev); + const struct fbnic_stat *stat; + int i; + + fbnic_get_hw_stats(fbn->fbd); + + for (i = 0; i < FBNIC_HW_STATS_LEN; i++) { + stat = &fbnic_gstrings_hw_stats[i]; + data[i] = *(u64 *)((u8 *)&fbn->fbd->hw_stats + stat->offset); + } +} + static int fbnic_get_sset_count(struct net_device *dev, int sset) { switch (sset) { @@ -107,21 +102,64 @@ static int fbnic_get_sset_count(struct net_device *dev, int sset) } } -static void fbnic_get_ethtool_stats(struct net_device *dev, - struct ethtool_stats *stats, u64 *data) +static int +fbnic_get_ts_info(struct net_device *netdev, + struct kernel_ethtool_ts_info *tsinfo) { - struct fbnic_net *fbn = netdev_priv(dev); - const struct fbnic_stat *stat; - int i; + struct fbnic_net *fbn = netdev_priv(netdev); - fbnic_get_hw_stats(fbn->fbd); + tsinfo->phc_index = ptp_clock_index(fbn->fbd->ptp); - for (i = 0; i < FBNIC_HW_STATS_LEN; i++) { - stat = &fbnic_gstrings_hw_stats[i]; - data[i] = *(u64 *)((u8 *)&fbn->fbd->hw_stats + stat->offset); + tsinfo->so_timestamping = + SOF_TIMESTAMPING_TX_SOFTWARE | + SOF_TIMESTAMPING_TX_HARDWARE | + SOF_TIMESTAMPING_RX_HARDWARE | + SOF_TIMESTAMPING_RAW_HARDWARE; + + tsinfo->tx_types = + BIT(HWTSTAMP_TX_OFF) | + BIT(HWTSTAMP_TX_ON); + + tsinfo->rx_filters = + BIT(HWTSTAMP_FILTER_NONE) | + BIT(HWTSTAMP_FILTER_PTP_V1_L4_EVENT) | + BIT(HWTSTAMP_FILTER_PTP_V2_L4_EVENT) | + BIT(HWTSTAMP_FILTER_PTP_V2_L2_EVENT) | + BIT(HWTSTAMP_FILTER_PTP_V2_EVENT) | + BIT(HWTSTAMP_FILTER_ALL); + + return 0; +} + +static void fbnic_get_ts_stats(struct net_device *netdev, + struct ethtool_ts_stats *ts_stats) +{ + struct fbnic_net *fbn = netdev_priv(netdev); + u64 ts_packets, ts_lost; + struct fbnic_ring *ring; + unsigned int start; + int i; + + ts_stats->pkts = fbn->tx_stats.ts_packets; + ts_stats->lost = fbn->tx_stats.ts_lost; + for (i = 0; i < fbn->num_tx_queues; i++) { + ring = fbn->tx[i]; + do { + start = u64_stats_fetch_begin(&ring->stats.syncp); + ts_packets = ring->stats.ts_packets; + ts_lost = ring->stats.ts_lost; + } while (u64_stats_fetch_retry(&ring->stats.syncp, start)); + ts_stats->pkts += ts_packets; + ts_stats->lost += ts_lost; } } +static void fbnic_set_counter(u64 *stat, struct fbnic_stat_counter *counter) +{ + if (counter->reported) + *stat = counter->value; +} + static void fbnic_get_eth_mac_stats(struct net_device *netdev, struct ethtool_eth_mac_stats *eth_mac_stats) @@ -164,44 +202,6 @@ fbnic_get_eth_mac_stats(struct net_device *netdev, &mac_stats->eth_mac.FrameTooLongErrors); } -static void fbnic_get_ts_stats(struct net_device *netdev, - struct ethtool_ts_stats *ts_stats) -{ - struct fbnic_net *fbn = netdev_priv(netdev); - u64 ts_packets, ts_lost; - struct fbnic_ring *ring; - unsigned int start; - int i; - - ts_stats->pkts = fbn->tx_stats.ts_packets; - ts_stats->lost = fbn->tx_stats.ts_lost; - for (i = 0; i < fbn->num_tx_queues; i++) { - ring = fbn->tx[i]; - do { - start = u64_stats_fetch_begin(&ring->stats.syncp); - ts_packets = ring->stats.ts_packets; - ts_lost = ring->stats.ts_lost; - } while (u64_stats_fetch_retry(&ring->stats.syncp, start)); - ts_stats->pkts += ts_packets; - ts_stats->lost += ts_lost; - } -} - -static void fbnic_get_regs(struct net_device *netdev, - struct ethtool_regs *regs, void *data) -{ - struct fbnic_net *fbn = netdev_priv(netdev); - - fbnic_csr_get_regs(fbn->fbd, data, ®s->version); -} - -static int fbnic_get_regs_len(struct net_device *netdev) -{ - struct fbnic_net *fbn = netdev_priv(netdev); - - return fbnic_csr_regs_len(fbn->fbd) * sizeof(u32); -} - static const struct ethtool_ops fbnic_ethtool_ops = { .get_drvinfo = fbnic_get_drvinfo, .get_regs_len = fbnic_get_regs_len, From 7cb06a6a777cf5a98d6f4edcde5b3937f324efb5 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 19 Dec 2024 18:52:33 -0800 Subject: [PATCH 0975/1450] eth: fbnic: support querying RSS config The initial driver submission already added all the RSS state, as part of multi-queue support. Expose the configuration via the ethtool APIs. Signed-off-by: Alexander Duyck Reviewed-by: Przemek Kitszel Link: https://patch.msgid.link/20241220025241.1522781-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- .../net/ethernet/meta/fbnic/fbnic_ethtool.c | 103 ++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c index 777e083acae9b..e71ae6abb0f52 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c @@ -102,6 +102,105 @@ static int fbnic_get_sset_count(struct net_device *dev, int sset) } } +static int fbnic_get_rss_hash_idx(u32 flow_type) +{ + switch (flow_type & ~(FLOW_EXT | FLOW_MAC_EXT | FLOW_RSS)) { + case TCP_V4_FLOW: + return FBNIC_TCP4_HASH_OPT; + case TCP_V6_FLOW: + return FBNIC_TCP6_HASH_OPT; + case UDP_V4_FLOW: + return FBNIC_UDP4_HASH_OPT; + case UDP_V6_FLOW: + return FBNIC_UDP6_HASH_OPT; + case AH_V4_FLOW: + case ESP_V4_FLOW: + case AH_ESP_V4_FLOW: + case SCTP_V4_FLOW: + case IPV4_FLOW: + case IPV4_USER_FLOW: + return FBNIC_IPV4_HASH_OPT; + case AH_V6_FLOW: + case ESP_V6_FLOW: + case AH_ESP_V6_FLOW: + case SCTP_V6_FLOW: + case IPV6_FLOW: + case IPV6_USER_FLOW: + return FBNIC_IPV6_HASH_OPT; + case ETHER_FLOW: + return FBNIC_ETHER_HASH_OPT; + } + + return -1; +} + +static int +fbnic_get_rss_hash_opts(struct fbnic_net *fbn, struct ethtool_rxnfc *cmd) +{ + int hash_opt_idx = fbnic_get_rss_hash_idx(cmd->flow_type); + + if (hash_opt_idx < 0) + return -EINVAL; + + /* Report options from rss_en table in fbn */ + cmd->data = fbn->rss_flow_hash[hash_opt_idx]; + + return 0; +} + +static int fbnic_get_rxnfc(struct net_device *netdev, + struct ethtool_rxnfc *cmd, u32 *rule_locs) +{ + struct fbnic_net *fbn = netdev_priv(netdev); + int ret = -EOPNOTSUPP; + + switch (cmd->cmd) { + case ETHTOOL_GRXRINGS: + cmd->data = fbn->num_rx_queues; + ret = 0; + break; + case ETHTOOL_GRXFH: + ret = fbnic_get_rss_hash_opts(fbn, cmd); + break; + } + + return ret; +} + +static u32 fbnic_get_rxfh_key_size(struct net_device *netdev) +{ + return FBNIC_RPC_RSS_KEY_BYTE_LEN; +} + +static u32 fbnic_get_rxfh_indir_size(struct net_device *netdev) +{ + return FBNIC_RPC_RSS_TBL_SIZE; +} + +static int +fbnic_get_rxfh(struct net_device *netdev, struct ethtool_rxfh_param *rxfh) +{ + struct fbnic_net *fbn = netdev_priv(netdev); + unsigned int i; + + rxfh->hfunc = ETH_RSS_HASH_TOP; + + if (rxfh->key) { + for (i = 0; i < FBNIC_RPC_RSS_KEY_BYTE_LEN; i++) { + u32 rss_key = fbn->rss_key[i / 4] << ((i % 4) * 8); + + rxfh->key[i] = rss_key >> 24; + } + } + + if (rxfh->indir) { + for (i = 0; i < FBNIC_RPC_RSS_TBL_SIZE; i++) + rxfh->indir[i] = fbn->indir_tbl[0][i]; + } + + return 0; +} + static int fbnic_get_ts_info(struct net_device *netdev, struct kernel_ethtool_ts_info *tsinfo) @@ -209,6 +308,10 @@ static const struct ethtool_ops fbnic_ethtool_ops = { .get_strings = fbnic_get_strings, .get_ethtool_stats = fbnic_get_ethtool_stats, .get_sset_count = fbnic_get_sset_count, + .get_rxnfc = fbnic_get_rxnfc, + .get_rxfh_key_size = fbnic_get_rxfh_key_size, + .get_rxfh_indir_size = fbnic_get_rxfh_indir_size, + .get_rxfh = fbnic_get_rxfh, .get_ts_info = fbnic_get_ts_info, .get_ts_stats = fbnic_get_ts_stats, .get_eth_mac_stats = fbnic_get_eth_mac_stats, From ef1c28817bf90aab3a6365ec81c30c09a3b18ece Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 19 Dec 2024 18:52:34 -0800 Subject: [PATCH 0976/1450] eth: fbnic: don't reset the secondary RSS indir table Secondary RSS indirection table is for additional contexts. It can / should be initialized when such context is created. Since we don't support creating RSS contexts, yet, this change has no user visible effect. Link: https://patch.msgid.link/20241220025241.1522781-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/meta/fbnic/fbnic_rpc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_rpc.c b/drivers/net/ethernet/meta/fbnic/fbnic_rpc.c index 908c098cd59e7..b99c890ac43f6 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_rpc.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_rpc.c @@ -13,10 +13,8 @@ void fbnic_reset_indir_tbl(struct fbnic_net *fbn) unsigned int num_rx = fbn->num_rx_queues; unsigned int i; - for (i = 0; i < FBNIC_RPC_RSS_TBL_SIZE; i++) { + for (i = 0; i < FBNIC_RPC_RSS_TBL_SIZE; i++) fbn->indir_tbl[0][i] = ethtool_rxfh_indir_default(i, num_rx); - fbn->indir_tbl[1][i] = ethtool_rxfh_indir_default(i, num_rx); - } } void fbnic_rss_key_fill(u32 *buffer) From 31ab733e999edbc4070d8386c608d9f0b73267c5 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 19 Dec 2024 18:52:35 -0800 Subject: [PATCH 0977/1450] eth: fbnic: support setting RSS configuration Let the user program the RSS indirection table and the RSS key. Straightforward implementation. Track the changes and don't bother poking the HW if user asked for a config identical to what's already programmed. The device only supports Toeplitz hash. Similarly to the GET support - all the real code that does the programming was part of initial driver submission, already. Signed-off-by: Alexander Duyck Link: https://patch.msgid.link/20241220025241.1522781-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- .../net/ethernet/meta/fbnic/fbnic_ethtool.c | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c index e71ae6abb0f52..5523803c8edda 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c @@ -201,6 +201,60 @@ fbnic_get_rxfh(struct net_device *netdev, struct ethtool_rxfh_param *rxfh) return 0; } +static unsigned int +fbnic_set_indir(struct fbnic_net *fbn, unsigned int idx, const u32 *indir) +{ + unsigned int i, changes = 0; + + for (i = 0; i < FBNIC_RPC_RSS_TBL_SIZE; i++) { + if (fbn->indir_tbl[idx][i] == indir[i]) + continue; + + fbn->indir_tbl[idx][i] = indir[i]; + changes++; + } + + return changes; +} + +static int +fbnic_set_rxfh(struct net_device *netdev, struct ethtool_rxfh_param *rxfh, + struct netlink_ext_ack *extack) +{ + struct fbnic_net *fbn = netdev_priv(netdev); + unsigned int i, changes = 0; + + if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE && + rxfh->hfunc != ETH_RSS_HASH_TOP) + return -EINVAL; + + if (rxfh->key) { + u32 rss_key = 0; + + for (i = FBNIC_RPC_RSS_KEY_BYTE_LEN; i--;) { + rss_key >>= 8; + rss_key |= (u32)(rxfh->key[i]) << 24; + + if (i % 4) + continue; + + if (fbn->rss_key[i / 4] == rss_key) + continue; + + fbn->rss_key[i / 4] = rss_key; + changes++; + } + } + + if (rxfh->indir) + changes += fbnic_set_indir(fbn, 0, rxfh->indir); + + if (changes && netif_running(netdev)) + fbnic_rss_reinit_hw(fbn->fbd, fbn); + + return 0; +} + static int fbnic_get_ts_info(struct net_device *netdev, struct kernel_ethtool_ts_info *tsinfo) @@ -312,6 +366,7 @@ static const struct ethtool_ops fbnic_ethtool_ops = { .get_rxfh_key_size = fbnic_get_rxfh_key_size, .get_rxfh_indir_size = fbnic_get_rxfh_indir_size, .get_rxfh = fbnic_get_rxfh, + .set_rxfh = fbnic_set_rxfh, .get_ts_info = fbnic_get_ts_info, .get_ts_stats = fbnic_get_ts_stats, .get_eth_mac_stats = fbnic_get_eth_mac_stats, From c23a1461bfee0a6f158795a58c768911c49d6cd0 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 19 Dec 2024 18:52:36 -0800 Subject: [PATCH 0978/1450] eth: fbnic: let user control the RSS hash fields Support setting the fields over which RSS computes its hash. Signed-off-by: Alexander Duyck Link: https://patch.msgid.link/20241220025241.1522781-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- .../net/ethernet/meta/fbnic/fbnic_ethtool.c | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c index 5523803c8edda..d1be8fc304046 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c @@ -167,6 +167,55 @@ static int fbnic_get_rxnfc(struct net_device *netdev, return ret; } +#define FBNIC_L2_HASH_OPTIONS \ + (RXH_L2DA | RXH_DISCARD) +#define FBNIC_L3_HASH_OPTIONS \ + (FBNIC_L2_HASH_OPTIONS | RXH_IP_SRC | RXH_IP_DST) +#define FBNIC_L4_HASH_OPTIONS \ + (FBNIC_L3_HASH_OPTIONS | RXH_L4_B_0_1 | RXH_L4_B_2_3) + +static int +fbnic_set_rss_hash_opts(struct fbnic_net *fbn, const struct ethtool_rxnfc *cmd) +{ + int hash_opt_idx; + + /* Verify the type requested is correct */ + hash_opt_idx = fbnic_get_rss_hash_idx(cmd->flow_type); + if (hash_opt_idx < 0) + return -EINVAL; + + /* Verify the fields asked for can actually be assigned based on type */ + if (cmd->data & ~FBNIC_L4_HASH_OPTIONS || + (hash_opt_idx > FBNIC_L4_HASH_OPT && + cmd->data & ~FBNIC_L3_HASH_OPTIONS) || + (hash_opt_idx > FBNIC_IP_HASH_OPT && + cmd->data & ~FBNIC_L2_HASH_OPTIONS)) + return -EINVAL; + + fbn->rss_flow_hash[hash_opt_idx] = cmd->data; + + if (netif_running(fbn->netdev)) { + fbnic_rss_reinit(fbn->fbd, fbn); + fbnic_write_rules(fbn->fbd); + } + + return 0; +} + +static int fbnic_set_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd) +{ + struct fbnic_net *fbn = netdev_priv(netdev); + int ret = -EOPNOTSUPP; + + switch (cmd->cmd) { + case ETHTOOL_SRXFH: + ret = fbnic_set_rss_hash_opts(fbn, cmd); + break; + } + + return ret; +} + static u32 fbnic_get_rxfh_key_size(struct net_device *netdev) { return FBNIC_RPC_RSS_KEY_BYTE_LEN; @@ -363,6 +412,7 @@ static const struct ethtool_ops fbnic_ethtool_ops = { .get_ethtool_stats = fbnic_get_ethtool_stats, .get_sset_count = fbnic_get_sset_count, .get_rxnfc = fbnic_get_rxnfc, + .set_rxnfc = fbnic_set_rxnfc, .get_rxfh_key_size = fbnic_get_rxfh_key_size, .get_rxfh_indir_size = fbnic_get_rxfh_indir_size, .get_rxfh = fbnic_get_rxfh, From db7159c400ffbbf3e0df1f3ef6b847b7b62186a3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 19 Dec 2024 18:52:37 -0800 Subject: [PATCH 0979/1450] eth: fbnic: store NAPIs in an array instead of the list We will need an array for storing NAPIs in the upcoming IRQ handler reuse rework. Replace the current list we have, so that we are able to reuse it later. In a few places replace i as the iterator with t when we iterate over triads, this seems slightly less confusing than having i, j, k variables. Link: https://patch.msgid.link/20241220025241.1522781-7-kuba@kernel.org Signed-off-by: Jakub Kicinski --- .../net/ethernet/meta/fbnic/fbnic_netdev.c | 1 - .../net/ethernet/meta/fbnic/fbnic_netdev.h | 6 +- drivers/net/ethernet/meta/fbnic/fbnic_txrx.c | 117 ++++++++++-------- drivers/net/ethernet/meta/fbnic/fbnic_txrx.h | 7 +- 4 files changed, 71 insertions(+), 60 deletions(-) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c index fc7d80db5fa6d..558644c49a4be 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c @@ -615,7 +615,6 @@ struct net_device *fbnic_netdev_alloc(struct fbnic_dev *fbd) fbn->netdev = netdev; fbn->fbd = fbd; - INIT_LIST_HEAD(&fbn->napis); fbn->txq_size = FBNIC_TXQ_SIZE_DEFAULT; fbn->hpq_size = FBNIC_HPQ_SIZE_DEFAULT; diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.h b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.h index b8417b3007788..0986c8f120a80 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.h @@ -11,10 +11,14 @@ #include "fbnic_rpc.h" #include "fbnic_txrx.h" +#define FBNIC_MAX_NAPI_VECTORS 128u + struct fbnic_net { struct fbnic_ring *tx[FBNIC_MAX_TXQS]; struct fbnic_ring *rx[FBNIC_MAX_RXQS]; + struct fbnic_napi_vector *napi[FBNIC_MAX_NAPI_VECTORS]; + struct net_device *netdev; struct fbnic_dev *fbd; @@ -56,8 +60,6 @@ struct fbnic_net { /* Time stampinn filter config */ struct kernel_hwtstamp_config hwtstamp_config; - - struct list_head napis; }; int __fbnic_open(struct fbnic_net *fbn); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c index b5050fabe8fe8..87e4eb03d9912 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c @@ -1116,16 +1116,17 @@ static void fbnic_free_napi_vector(struct fbnic_net *fbn, fbnic_free_irq(fbd, v_idx, nv); page_pool_destroy(nv->page_pool); netif_napi_del(&nv->napi); - list_del(&nv->napis); + fbn->napi[fbnic_napi_idx(nv)] = NULL; kfree(nv); } void fbnic_free_napi_vectors(struct fbnic_net *fbn) { - struct fbnic_napi_vector *nv, *temp; + int i; - list_for_each_entry_safe(nv, temp, &fbn->napis, napis) - fbnic_free_napi_vector(fbn, nv); + for (i = 0; i < fbn->num_napi; i++) + if (fbn->napi[i]) + fbnic_free_napi_vector(fbn, fbn->napi[i]); } static void fbnic_name_napi_vector(struct fbnic_napi_vector *nv) @@ -1222,7 +1223,7 @@ static int fbnic_alloc_napi_vector(struct fbnic_dev *fbd, struct fbnic_net *fbn, nv->v_idx = v_idx; /* Tie napi to netdev */ - list_add(&nv->napis, &fbn->napis); + fbn->napi[fbnic_napi_idx(nv)] = nv; netif_napi_add(fbn->netdev, &nv->napi, fbnic_poll); /* Record IRQ to NAPI struct */ @@ -1307,7 +1308,7 @@ static int fbnic_alloc_napi_vector(struct fbnic_dev *fbd, struct fbnic_net *fbn, page_pool_destroy(nv->page_pool); napi_del: netif_napi_del(&nv->napi); - list_del(&nv->napis); + fbn->napi[fbnic_napi_idx(nv)] = NULL; kfree(nv); return err; } @@ -1612,19 +1613,18 @@ static int fbnic_alloc_nv_resources(struct fbnic_net *fbn, void fbnic_free_resources(struct fbnic_net *fbn) { - struct fbnic_napi_vector *nv; + int i; - list_for_each_entry(nv, &fbn->napis, napis) - fbnic_free_nv_resources(fbn, nv); + for (i = 0; i < fbn->num_napi; i++) + fbnic_free_nv_resources(fbn, fbn->napi[i]); } int fbnic_alloc_resources(struct fbnic_net *fbn) { - struct fbnic_napi_vector *nv; - int err = -ENODEV; + int i, err = -ENODEV; - list_for_each_entry(nv, &fbn->napis, napis) { - err = fbnic_alloc_nv_resources(fbn, nv); + for (i = 0; i < fbn->num_napi; i++) { + err = fbnic_alloc_nv_resources(fbn, fbn->napi[i]); if (err) goto free_resources; } @@ -1632,8 +1632,8 @@ int fbnic_alloc_resources(struct fbnic_net *fbn) return 0; free_resources: - list_for_each_entry_continue_reverse(nv, &fbn->napis, napis) - fbnic_free_nv_resources(fbn, nv); + while (i--) + fbnic_free_nv_resources(fbn, fbn->napi[i]); return err; } @@ -1670,33 +1670,34 @@ static void fbnic_disable_rcq(struct fbnic_ring *rxr) void fbnic_napi_disable(struct fbnic_net *fbn) { - struct fbnic_napi_vector *nv; + int i; - list_for_each_entry(nv, &fbn->napis, napis) { - napi_disable(&nv->napi); + for (i = 0; i < fbn->num_napi; i++) { + napi_disable(&fbn->napi[i]->napi); - fbnic_nv_irq_disable(nv); + fbnic_nv_irq_disable(fbn->napi[i]); } } void fbnic_disable(struct fbnic_net *fbn) { struct fbnic_dev *fbd = fbn->fbd; - struct fbnic_napi_vector *nv; - int i, j; + int i, j, t; + + for (i = 0; i < fbn->num_napi; i++) { + struct fbnic_napi_vector *nv = fbn->napi[i]; - list_for_each_entry(nv, &fbn->napis, napis) { /* Disable Tx queue triads */ - for (i = 0; i < nv->txt_count; i++) { - struct fbnic_q_triad *qt = &nv->qt[i]; + for (t = 0; t < nv->txt_count; t++) { + struct fbnic_q_triad *qt = &nv->qt[t]; fbnic_disable_twq0(&qt->sub0); fbnic_disable_tcq(&qt->cmpl); } /* Disable Rx queue triads */ - for (j = 0; j < nv->rxt_count; j++, i++) { - struct fbnic_q_triad *qt = &nv->qt[i]; + for (j = 0; j < nv->rxt_count; j++, t++) { + struct fbnic_q_triad *qt = &nv->qt[t]; fbnic_disable_bdq(&qt->sub0, &qt->sub1); fbnic_disable_rcq(&qt->cmpl); @@ -1792,14 +1793,15 @@ int fbnic_wait_all_queues_idle(struct fbnic_dev *fbd, bool may_fail) void fbnic_flush(struct fbnic_net *fbn) { - struct fbnic_napi_vector *nv; + int i; - list_for_each_entry(nv, &fbn->napis, napis) { - int i, j; + for (i = 0; i < fbn->num_napi; i++) { + struct fbnic_napi_vector *nv = fbn->napi[i]; + int j, t; /* Flush any processed Tx Queue Triads and drop the rest */ - for (i = 0; i < nv->txt_count; i++) { - struct fbnic_q_triad *qt = &nv->qt[i]; + for (t = 0; t < nv->txt_count; t++) { + struct fbnic_q_triad *qt = &nv->qt[t]; struct netdev_queue *tx_queue; /* Clean the work queues of unprocessed work */ @@ -1823,8 +1825,8 @@ void fbnic_flush(struct fbnic_net *fbn) } /* Flush any processed Rx Queue Triads and drop the rest */ - for (j = 0; j < nv->rxt_count; j++, i++) { - struct fbnic_q_triad *qt = &nv->qt[i]; + for (j = 0; j < nv->rxt_count; j++, t++) { + struct fbnic_q_triad *qt = &nv->qt[t]; /* Clean the work queues of unprocessed work */ fbnic_clean_bdq(nv, 0, &qt->sub0, qt->sub0.tail); @@ -1845,14 +1847,15 @@ void fbnic_flush(struct fbnic_net *fbn) void fbnic_fill(struct fbnic_net *fbn) { - struct fbnic_napi_vector *nv; + int i; - list_for_each_entry(nv, &fbn->napis, napis) { - int i, j; + for (i = 0; i < fbn->num_napi; i++) { + struct fbnic_napi_vector *nv = fbn->napi[i]; + int j, t; /* Configure NAPI mapping for Tx */ - for (i = 0; i < nv->txt_count; i++) { - struct fbnic_q_triad *qt = &nv->qt[i]; + for (t = 0; t < nv->txt_count; t++) { + struct fbnic_q_triad *qt = &nv->qt[t]; /* Nothing to do if Tx queue is disabled */ if (qt->sub0.flags & FBNIC_RING_F_DISABLED) @@ -1866,8 +1869,8 @@ void fbnic_fill(struct fbnic_net *fbn) /* Configure NAPI mapping and populate pages * in the BDQ rings to use for Rx */ - for (j = 0; j < nv->rxt_count; j++, i++) { - struct fbnic_q_triad *qt = &nv->qt[i]; + for (j = 0; j < nv->rxt_count; j++, t++) { + struct fbnic_q_triad *qt = &nv->qt[t]; /* Associate Rx queue with NAPI */ netif_queue_set_napi(nv->napi.dev, qt->cmpl.q_idx, @@ -2025,21 +2028,23 @@ static void fbnic_enable_rcq(struct fbnic_napi_vector *nv, void fbnic_enable(struct fbnic_net *fbn) { struct fbnic_dev *fbd = fbn->fbd; - struct fbnic_napi_vector *nv; - int i, j; + int i; + + for (i = 0; i < fbn->num_napi; i++) { + struct fbnic_napi_vector *nv = fbn->napi[i]; + int j, t; - list_for_each_entry(nv, &fbn->napis, napis) { /* Setup Tx Queue Triads */ - for (i = 0; i < nv->txt_count; i++) { - struct fbnic_q_triad *qt = &nv->qt[i]; + for (t = 0; t < nv->txt_count; t++) { + struct fbnic_q_triad *qt = &nv->qt[t]; fbnic_enable_twq0(&qt->sub0); fbnic_enable_tcq(nv, &qt->cmpl); } /* Setup Rx Queue Triads */ - for (j = 0; j < nv->rxt_count; j++, i++) { - struct fbnic_q_triad *qt = &nv->qt[i]; + for (j = 0; j < nv->rxt_count; j++, t++) { + struct fbnic_q_triad *qt = &nv->qt[t]; fbnic_enable_bdq(&qt->sub0, &qt->sub1); fbnic_config_drop_mode_rcq(nv, &qt->cmpl); @@ -2064,10 +2069,11 @@ void fbnic_napi_enable(struct fbnic_net *fbn) { u32 irqs[FBNIC_MAX_MSIX_VECS / 32] = {}; struct fbnic_dev *fbd = fbn->fbd; - struct fbnic_napi_vector *nv; int i; - list_for_each_entry(nv, &fbn->napis, napis) { + for (i = 0; i < fbn->num_napi; i++) { + struct fbnic_napi_vector *nv = fbn->napi[i]; + napi_enable(&nv->napi); fbnic_nv_irq_enable(nv); @@ -2096,17 +2102,18 @@ void fbnic_napi_depletion_check(struct net_device *netdev) struct fbnic_net *fbn = netdev_priv(netdev); u32 irqs[FBNIC_MAX_MSIX_VECS / 32] = {}; struct fbnic_dev *fbd = fbn->fbd; - struct fbnic_napi_vector *nv; - int i, j; + int i, j, t; + + for (i = 0; i < fbn->num_napi; i++) { + struct fbnic_napi_vector *nv = fbn->napi[i]; - list_for_each_entry(nv, &fbn->napis, napis) { /* Find RQs which are completely out of pages */ - for (i = nv->txt_count, j = 0; j < nv->rxt_count; j++, i++) { + for (t = nv->txt_count, j = 0; j < nv->rxt_count; j++, t++) { /* Assume 4 pages is always enough to fit a packet * and therefore generate a completion and an IRQ. */ - if (fbnic_desc_used(&nv->qt[i].sub0) < 4 || - fbnic_desc_used(&nv->qt[i].sub1) < 4) + if (fbnic_desc_used(&nv->qt[t].sub0) < 4 || + fbnic_desc_used(&nv->qt[t].sub1) < 4) irqs[nv->v_idx / 32] |= BIT(nv->v_idx % 32); } } diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.h b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.h index 8d626287c3f47..1965d1fa38a2a 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.h @@ -110,8 +110,6 @@ struct fbnic_napi_vector { u8 txt_count; u8 rxt_count; - struct list_head napis; - struct fbnic_q_triad qt[]; }; @@ -137,4 +135,9 @@ void fbnic_fill(struct fbnic_net *fbn); void fbnic_napi_depletion_check(struct net_device *netdev); int fbnic_wait_all_queues_idle(struct fbnic_dev *fbd, bool may_fail); +static inline int fbnic_napi_idx(const struct fbnic_napi_vector *nv) +{ + return nv->v_idx - FBNIC_NON_NAPI_VECTORS; +} + #endif /* _FBNIC_TXRX_H_ */ From 3a856ab347261870d2bb6f3cab95325f27eee104 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 19 Dec 2024 18:52:38 -0800 Subject: [PATCH 0980/1450] eth: fbnic: add IRQ reuse support Change our method of swapping NAPIs without disturbing existing config. This is primarily needed for "live reconfiguration" such as changing the channel count when interface is already up. Previously we were planning to use a trick of using shared interrupts. We would install a second IRQ handler for the new NAPI, and make it return IRQ_NONE until we were ready for it to take over. This works fine functionally but breaks IRQ naming. The IRQ subsystem uses the IRQ name to create the procfs entry, since both handlers used the same name the second handler wouldn't get a proc directory registered. When first one gets removed on success full ring count change it would remove its directory and we would be left with none. New approach uses a double pointer to the NAPI. The IRQ handler needs to know how to locate the NAPI to schedule. We register a single IRQ handler and give it a pointer to a pointer. We can then change what it points to without re-registering. This may have a tiny perf impact, but really really negligible. Link: https://patch.msgid.link/20241220025241.1522781-8-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/meta/fbnic/fbnic.h | 14 +++++++ drivers/net/ethernet/meta/fbnic/fbnic_irq.c | 42 +++++++++++++++++++ .../net/ethernet/meta/fbnic/fbnic_netdev.c | 2 + drivers/net/ethernet/meta/fbnic/fbnic_txrx.c | 25 ++--------- drivers/net/ethernet/meta/fbnic/fbnic_txrx.h | 2 +- 5 files changed, 63 insertions(+), 22 deletions(-) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic.h b/drivers/net/ethernet/meta/fbnic/fbnic.h index 706ae6104c8e2..ed527209b30cf 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic.h @@ -16,6 +16,10 @@ #include "fbnic_mac.h" #include "fbnic_rpc.h" +struct fbnic_napi_vector; + +#define FBNIC_MAX_NAPI_VECTORS 128u + struct fbnic_dev { struct device *dev; struct net_device *netdev; @@ -29,6 +33,11 @@ struct fbnic_dev { unsigned int pcs_msix_vector; unsigned short num_irqs; + struct { + u8 users; + char name[IFNAMSIZ + 9]; + } napi_irq[FBNIC_MAX_NAPI_VECTORS]; + struct delayed_work service_task; struct fbnic_fw_mbx mbx[FBNIC_IPC_MBX_INDICES]; @@ -148,6 +157,11 @@ void fbnic_hwmon_unregister(struct fbnic_dev *fbd); int fbnic_pcs_irq_enable(struct fbnic_dev *fbd); void fbnic_pcs_irq_disable(struct fbnic_dev *fbd); +void fbnic_napi_name_irqs(struct fbnic_dev *fbd); +int fbnic_napi_request_irq(struct fbnic_dev *fbd, + struct fbnic_napi_vector *nv); +void fbnic_napi_free_irq(struct fbnic_dev *fbd, + struct fbnic_napi_vector *nv); int fbnic_request_irq(struct fbnic_dev *dev, int nr, irq_handler_t handler, unsigned long flags, const char *name, void *data); void fbnic_free_irq(struct fbnic_dev *dev, int nr, void *data); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_irq.c b/drivers/net/ethernet/meta/fbnic/fbnic_irq.c index 9143621959203..a8ea7b6774a83 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_irq.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_irq.c @@ -169,6 +169,48 @@ void fbnic_free_irq(struct fbnic_dev *fbd, int nr, void *data) free_irq(irq, data); } +void fbnic_napi_name_irqs(struct fbnic_dev *fbd) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(fbd->napi_irq); i++) + snprintf(fbd->napi_irq[i].name, + sizeof(fbd->napi_irq[i].name), + "%s-TxRx-%u", fbd->netdev->name, i); +} + +int fbnic_napi_request_irq(struct fbnic_dev *fbd, + struct fbnic_napi_vector *nv) +{ + struct fbnic_net *fbn = netdev_priv(fbd->netdev); + int i = fbnic_napi_idx(nv); + int err; + + if (!fbd->napi_irq[i].users) { + err = fbnic_request_irq(fbd, nv->v_idx, + fbnic_msix_clean_rings, 0, + fbd->napi_irq[i].name, + &fbn->napi[i]); + if (err) + return err; + } + + fbd->napi_irq[i].users++; + return 0; +} + +void fbnic_napi_free_irq(struct fbnic_dev *fbd, + struct fbnic_napi_vector *nv) +{ + struct fbnic_net *fbn = netdev_priv(fbd->netdev); + int i = fbnic_napi_idx(nv); + + if (--fbd->napi_irq[i].users) + return; + + fbnic_free_irq(fbd, nv->v_idx, &fbn->napi[i]); +} + void fbnic_free_irqs(struct fbnic_dev *fbd) { struct pci_dev *pdev = to_pci_dev(fbd->dev); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c index 558644c49a4be..2f19144e44105 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c @@ -74,6 +74,8 @@ static int fbnic_open(struct net_device *netdev) struct fbnic_net *fbn = netdev_priv(netdev); int err; + fbnic_napi_name_irqs(fbn->fbd); + err = __fbnic_open(fbn); if (!err) fbnic_up(fbn); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c index 87e4eb03d9912..75b491b8e1ca7 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c @@ -1036,9 +1036,9 @@ static int fbnic_poll(struct napi_struct *napi, int budget) return 0; } -static irqreturn_t fbnic_msix_clean_rings(int __always_unused irq, void *data) +irqreturn_t fbnic_msix_clean_rings(int __always_unused irq, void *data) { - struct fbnic_napi_vector *nv = data; + struct fbnic_napi_vector *nv = *(void **)data; napi_schedule_irqoff(&nv->napi); @@ -1099,7 +1099,6 @@ static void fbnic_free_napi_vector(struct fbnic_net *fbn, struct fbnic_napi_vector *nv) { struct fbnic_dev *fbd = nv->fbd; - u32 v_idx = nv->v_idx; int i, j; for (i = 0; i < nv->txt_count; i++) { @@ -1113,7 +1112,7 @@ static void fbnic_free_napi_vector(struct fbnic_net *fbn, fbnic_remove_rx_ring(fbn, &nv->qt[i].cmpl); } - fbnic_free_irq(fbd, v_idx, nv); + fbnic_napi_free_irq(fbd, nv); page_pool_destroy(nv->page_pool); netif_napi_del(&nv->napi); fbn->napi[fbnic_napi_idx(nv)] = NULL; @@ -1129,18 +1128,6 @@ void fbnic_free_napi_vectors(struct fbnic_net *fbn) fbnic_free_napi_vector(fbn, fbn->napi[i]); } -static void fbnic_name_napi_vector(struct fbnic_napi_vector *nv) -{ - unsigned char *dev_name = nv->napi.dev->name; - - if (!nv->rxt_count) - snprintf(nv->name, sizeof(nv->name), "%s-Tx-%u", dev_name, - nv->v_idx - FBNIC_NON_NAPI_VECTORS); - else - snprintf(nv->name, sizeof(nv->name), "%s-TxRx-%u", dev_name, - nv->v_idx - FBNIC_NON_NAPI_VECTORS); -} - #define FBNIC_PAGE_POOL_FLAGS \ (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV) @@ -1240,12 +1227,8 @@ static int fbnic_alloc_napi_vector(struct fbnic_dev *fbd, struct fbnic_net *fbn, goto napi_del; } - /* Initialize vector name */ - fbnic_name_napi_vector(nv); - /* Request the IRQ for napi vector */ - err = fbnic_request_irq(fbd, v_idx, &fbnic_msix_clean_rings, - IRQF_SHARED, nv->name, nv); + err = fbnic_napi_request_irq(fbd, nv); if (err) goto pp_destroy; diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.h b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.h index 1965d1fa38a2a..c8d908860ab0c 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.h @@ -104,7 +104,6 @@ struct fbnic_napi_vector { struct device *dev; /* Device for DMA unmapping */ struct page_pool *page_pool; struct fbnic_dev *fbd; - char name[IFNAMSIZ + 9]; u16 v_idx; u8 txt_count; @@ -125,6 +124,7 @@ int fbnic_alloc_napi_vectors(struct fbnic_net *fbn); void fbnic_free_napi_vectors(struct fbnic_net *fbn); int fbnic_alloc_resources(struct fbnic_net *fbn); void fbnic_free_resources(struct fbnic_net *fbn); +irqreturn_t fbnic_msix_clean_rings(int irq, void *data); void fbnic_napi_enable(struct fbnic_net *fbn); void fbnic_napi_disable(struct fbnic_net *fbn); void fbnic_enable(struct fbnic_net *fbn); From 557d02238e05eb66b9aba9a1f90f3a2131c6c887 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 19 Dec 2024 18:52:39 -0800 Subject: [PATCH 0981/1450] eth: fbnic: centralize the queue count and NAPI<>queue setting To simplify dealing with RTNL_ASSERT() requirements further down the line, move setting queue count and NAPI<>queue association to their own helpers. Signed-off-by: Alexander Duyck Link: https://patch.msgid.link/20241220025241.1522781-9-kuba@kernel.org Signed-off-by: Jakub Kicinski --- .../net/ethernet/meta/fbnic/fbnic_netdev.c | 9 +- drivers/net/ethernet/meta/fbnic/fbnic_txrx.c | 92 +++++++++++++------ drivers/net/ethernet/meta/fbnic/fbnic_txrx.h | 2 + 3 files changed, 70 insertions(+), 33 deletions(-) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c index 2f19144e44105..7a96b6ee773f3 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c @@ -23,13 +23,7 @@ int __fbnic_open(struct fbnic_net *fbn) if (err) goto free_napi_vectors; - err = netif_set_real_num_tx_queues(fbn->netdev, - fbn->num_tx_queues); - if (err) - goto free_resources; - - err = netif_set_real_num_rx_queues(fbn->netdev, - fbn->num_rx_queues); + err = fbnic_set_netif_queues(fbn); if (err) goto free_resources; @@ -93,6 +87,7 @@ static int fbnic_stop(struct net_device *netdev) fbnic_time_stop(fbn); fbnic_fw_xmit_ownership_msg(fbn->fbd, false); + fbnic_reset_netif_queues(fbn); fbnic_free_resources(fbn); fbnic_free_napi_vectors(fbn); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c index 75b491b8e1ca7..92fc1ad6ed6f9 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c @@ -1621,6 +1621,71 @@ int fbnic_alloc_resources(struct fbnic_net *fbn) return err; } +static void fbnic_set_netif_napi(struct fbnic_napi_vector *nv) +{ + int i, j; + + /* Associate Tx queue with NAPI */ + for (i = 0; i < nv->txt_count; i++) { + struct fbnic_q_triad *qt = &nv->qt[i]; + + netif_queue_set_napi(nv->napi.dev, qt->sub0.q_idx, + NETDEV_QUEUE_TYPE_TX, &nv->napi); + } + + /* Associate Rx queue with NAPI */ + for (j = 0; j < nv->rxt_count; j++, i++) { + struct fbnic_q_triad *qt = &nv->qt[i]; + + netif_queue_set_napi(nv->napi.dev, qt->cmpl.q_idx, + NETDEV_QUEUE_TYPE_RX, &nv->napi); + } +} + +static void fbnic_reset_netif_napi(struct fbnic_napi_vector *nv) +{ + int i, j; + + /* Disassociate Tx queue from NAPI */ + for (i = 0; i < nv->txt_count; i++) { + struct fbnic_q_triad *qt = &nv->qt[i]; + + netif_queue_set_napi(nv->napi.dev, qt->sub0.q_idx, + NETDEV_QUEUE_TYPE_TX, NULL); + } + + /* Disassociate Rx queue from NAPI */ + for (j = 0; j < nv->rxt_count; j++, i++) { + struct fbnic_q_triad *qt = &nv->qt[i]; + + netif_queue_set_napi(nv->napi.dev, qt->cmpl.q_idx, + NETDEV_QUEUE_TYPE_RX, NULL); + } +} + +int fbnic_set_netif_queues(struct fbnic_net *fbn) +{ + int i, err; + + err = netif_set_real_num_queues(fbn->netdev, fbn->num_tx_queues, + fbn->num_rx_queues); + if (err) + return err; + + for (i = 0; i < fbn->num_napi; i++) + fbnic_set_netif_napi(fbn->napi[i]); + + return 0; +} + +void fbnic_reset_netif_queues(struct fbnic_net *fbn) +{ + int i; + + for (i = 0; i < fbn->num_napi; i++) + fbnic_reset_netif_napi(fbn->napi[i]); +} + static void fbnic_disable_twq0(struct fbnic_ring *txr) { u32 twq_ctl = fbnic_ring_rd32(txr, FBNIC_QUEUE_TWQ0_CTL); @@ -1801,10 +1866,6 @@ void fbnic_flush(struct fbnic_net *fbn) tx_queue = netdev_get_tx_queue(nv->napi.dev, qt->sub0.q_idx); netdev_tx_reset_queue(tx_queue); - - /* Disassociate Tx queue from NAPI */ - netif_queue_set_napi(nv->napi.dev, qt->sub0.q_idx, - NETDEV_QUEUE_TYPE_TX, NULL); } /* Flush any processed Rx Queue Triads and drop the rest */ @@ -1820,10 +1881,6 @@ void fbnic_flush(struct fbnic_net *fbn) fbnic_put_pkt_buff(nv, qt->cmpl.pkt, 0); qt->cmpl.pkt->buff.data_hard_start = NULL; - - /* Disassociate Rx queue from NAPI */ - netif_queue_set_napi(nv->napi.dev, qt->cmpl.q_idx, - NETDEV_QUEUE_TYPE_RX, NULL); } } } @@ -1836,29 +1893,12 @@ void fbnic_fill(struct fbnic_net *fbn) struct fbnic_napi_vector *nv = fbn->napi[i]; int j, t; - /* Configure NAPI mapping for Tx */ - for (t = 0; t < nv->txt_count; t++) { - struct fbnic_q_triad *qt = &nv->qt[t]; - - /* Nothing to do if Tx queue is disabled */ - if (qt->sub0.flags & FBNIC_RING_F_DISABLED) - continue; - - /* Associate Tx queue with NAPI */ - netif_queue_set_napi(nv->napi.dev, qt->sub0.q_idx, - NETDEV_QUEUE_TYPE_TX, &nv->napi); - } - /* Configure NAPI mapping and populate pages * in the BDQ rings to use for Rx */ - for (j = 0; j < nv->rxt_count; j++, t++) { + for (j = 0, t = nv->txt_count; j < nv->rxt_count; j++, t++) { struct fbnic_q_triad *qt = &nv->qt[t]; - /* Associate Rx queue with NAPI */ - netif_queue_set_napi(nv->napi.dev, qt->cmpl.q_idx, - NETDEV_QUEUE_TYPE_RX, &nv->napi); - /* Populate the header and payload BDQs */ fbnic_fill_bdq(nv, &qt->sub0); fbnic_fill_bdq(nv, &qt->sub1); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.h b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.h index c8d908860ab0c..92c671135ad73 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.h @@ -124,6 +124,8 @@ int fbnic_alloc_napi_vectors(struct fbnic_net *fbn); void fbnic_free_napi_vectors(struct fbnic_net *fbn); int fbnic_alloc_resources(struct fbnic_net *fbn); void fbnic_free_resources(struct fbnic_net *fbn); +int fbnic_set_netif_queues(struct fbnic_net *fbn); +void fbnic_reset_netif_queues(struct fbnic_net *fbn); irqreturn_t fbnic_msix_clean_rings(int irq, void *data); void fbnic_napi_enable(struct fbnic_net *fbn); void fbnic_napi_disable(struct fbnic_net *fbn); From 3a481cc72673b2fbb18271acf2d9b43f6a920ec4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 19 Dec 2024 18:52:40 -0800 Subject: [PATCH 0982/1450] eth: fbnic: support ring channel get and set while down Trivial implementation of ethtool channel get and set. Set is only supported when device is closed, next patch will add code for live reconfig. Asymmetric configurations are supported (combined + extra Tx or Rx), so are configurations with independent IRQs for Rx and Tx. Having all 3 NAPI types (combined, Tx, Rx) is not supported. We used to only call fbnic_reset_indir_tbl() during init. Now that we call it after device had been register must be careful not to override user config. Link: https://patch.msgid.link/20241220025241.1522781-10-kuba@kernel.org Signed-off-by: Jakub Kicinski --- .../net/ethernet/meta/fbnic/fbnic_ethtool.c | 64 +++++++++++++++++++ drivers/net/ethernet/meta/fbnic/fbnic_rpc.c | 3 + 2 files changed, 67 insertions(+) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c index d1be8fc304046..d2fe97ae6a716 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c @@ -304,6 +304,68 @@ fbnic_set_rxfh(struct net_device *netdev, struct ethtool_rxfh_param *rxfh, return 0; } +static void fbnic_get_channels(struct net_device *netdev, + struct ethtool_channels *ch) +{ + struct fbnic_net *fbn = netdev_priv(netdev); + struct fbnic_dev *fbd = fbn->fbd; + + ch->max_rx = fbd->max_num_queues; + ch->max_tx = fbd->max_num_queues; + ch->max_combined = min(ch->max_rx, ch->max_tx); + ch->max_other = FBNIC_NON_NAPI_VECTORS; + + if (fbn->num_rx_queues > fbn->num_napi || + fbn->num_tx_queues > fbn->num_napi) + ch->combined_count = min(fbn->num_rx_queues, + fbn->num_tx_queues); + else + ch->combined_count = + fbn->num_rx_queues + fbn->num_tx_queues - fbn->num_napi; + ch->rx_count = fbn->num_rx_queues - ch->combined_count; + ch->tx_count = fbn->num_tx_queues - ch->combined_count; + ch->other_count = FBNIC_NON_NAPI_VECTORS; +} + +static void fbnic_set_queues(struct fbnic_net *fbn, struct ethtool_channels *ch, + unsigned int max_napis) +{ + fbn->num_rx_queues = ch->rx_count + ch->combined_count; + fbn->num_tx_queues = ch->tx_count + ch->combined_count; + fbn->num_napi = min(ch->rx_count + ch->tx_count + ch->combined_count, + max_napis); +} + +static int fbnic_set_channels(struct net_device *netdev, + struct ethtool_channels *ch) +{ + struct fbnic_net *fbn = netdev_priv(netdev); + unsigned int max_napis, standalone; + struct fbnic_dev *fbd = fbn->fbd; + + max_napis = fbd->num_irqs - FBNIC_NON_NAPI_VECTORS; + standalone = ch->rx_count + ch->tx_count; + + /* Limits for standalone queues: + * - each queue has it's own NAPI (num_napi >= rx + tx + combined) + * - combining queues (combined not 0, rx or tx must be 0) + */ + if ((ch->rx_count && ch->tx_count && ch->combined_count) || + (standalone && standalone + ch->combined_count > max_napis) || + ch->rx_count + ch->combined_count > fbd->max_num_queues || + ch->tx_count + ch->combined_count > fbd->max_num_queues || + ch->other_count != FBNIC_NON_NAPI_VECTORS) + return -EINVAL; + + if (!netif_running(netdev)) { + fbnic_set_queues(fbn, ch, max_napis); + fbnic_reset_indir_tbl(fbn); + return 0; + } + + return -EBUSY; +} + static int fbnic_get_ts_info(struct net_device *netdev, struct kernel_ethtool_ts_info *tsinfo) @@ -417,6 +479,8 @@ static const struct ethtool_ops fbnic_ethtool_ops = { .get_rxfh_indir_size = fbnic_get_rxfh_indir_size, .get_rxfh = fbnic_get_rxfh, .set_rxfh = fbnic_set_rxfh, + .get_channels = fbnic_get_channels, + .set_channels = fbnic_set_channels, .get_ts_info = fbnic_get_ts_info, .get_ts_stats = fbnic_get_ts_stats, .get_eth_mac_stats = fbnic_get_eth_mac_stats, diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_rpc.c b/drivers/net/ethernet/meta/fbnic/fbnic_rpc.c index b99c890ac43f6..c25bd300b9024 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_rpc.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_rpc.c @@ -13,6 +13,9 @@ void fbnic_reset_indir_tbl(struct fbnic_net *fbn) unsigned int num_rx = fbn->num_rx_queues; unsigned int i; + if (netif_is_rxfh_configured(fbn->netdev)) + return; + for (i = 0; i < FBNIC_RPC_RSS_TBL_SIZE; i++) fbn->indir_tbl[0][i] = ethtool_rxfh_indir_default(i, num_rx); } From 52dc722db0d98bcdf40927dd1719468f7d08bd59 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 19 Dec 2024 18:52:41 -0800 Subject: [PATCH 0983/1450] eth: fbnic: support ring channel set while up Implement the channel count changes. Copy the netdev priv, allocate new channels using it. Stop, swap, start. Then free the copy of the priv along with the channels it holds, which are now the channels that used to be on the real priv. Reviewed-by: Przemek Kitszel Link: https://patch.msgid.link/20241220025241.1522781-11-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/meta/fbnic/fbnic.h | 1 + .../net/ethernet/meta/fbnic/fbnic_ethtool.c | 121 +++++++++++++++++- drivers/net/ethernet/meta/fbnic/fbnic_irq.c | 11 ++ .../net/ethernet/meta/fbnic/fbnic_netdev.h | 1 + drivers/net/ethernet/meta/fbnic/fbnic_pci.c | 2 +- drivers/net/ethernet/meta/fbnic/fbnic_txrx.c | 8 +- drivers/net/ethernet/meta/fbnic/fbnic_txrx.h | 5 + 7 files changed, 143 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic.h b/drivers/net/ethernet/meta/fbnic/fbnic.h index ed527209b30cf..14751f16e1252 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic.h @@ -162,6 +162,7 @@ int fbnic_napi_request_irq(struct fbnic_dev *fbd, struct fbnic_napi_vector *nv); void fbnic_napi_free_irq(struct fbnic_dev *fbd, struct fbnic_napi_vector *nv); +void fbnic_synchronize_irq(struct fbnic_dev *fbd, int nr); int fbnic_request_irq(struct fbnic_dev *dev, int nr, irq_handler_t handler, unsigned long flags, const char *name, void *data); void fbnic_free_irq(struct fbnic_dev *dev, int nr, void *data); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c index d2fe97ae6a716..20cd9f5f89e27 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c @@ -65,6 +65,76 @@ static void fbnic_get_regs(struct net_device *netdev, fbnic_csr_get_regs(fbn->fbd, data, ®s->version); } +static struct fbnic_net *fbnic_clone_create(struct fbnic_net *orig) +{ + struct fbnic_net *clone; + + clone = kmemdup(orig, sizeof(*orig), GFP_KERNEL); + if (!clone) + return NULL; + + memset(clone->tx, 0, sizeof(clone->tx)); + memset(clone->rx, 0, sizeof(clone->rx)); + memset(clone->napi, 0, sizeof(clone->napi)); + return clone; +} + +static void fbnic_clone_swap_cfg(struct fbnic_net *orig, + struct fbnic_net *clone) +{ + swap(clone->rcq_size, orig->rcq_size); + swap(clone->hpq_size, orig->hpq_size); + swap(clone->ppq_size, orig->ppq_size); + swap(clone->txq_size, orig->txq_size); + swap(clone->num_rx_queues, orig->num_rx_queues); + swap(clone->num_tx_queues, orig->num_tx_queues); + swap(clone->num_napi, orig->num_napi); +} + +static void fbnic_aggregate_vector_counters(struct fbnic_net *fbn, + struct fbnic_napi_vector *nv) +{ + int i, j; + + for (i = 0; i < nv->txt_count; i++) { + fbnic_aggregate_ring_tx_counters(fbn, &nv->qt[i].sub0); + fbnic_aggregate_ring_tx_counters(fbn, &nv->qt[i].sub1); + fbnic_aggregate_ring_tx_counters(fbn, &nv->qt[i].cmpl); + } + + for (j = 0; j < nv->rxt_count; j++, i++) { + fbnic_aggregate_ring_rx_counters(fbn, &nv->qt[i].sub0); + fbnic_aggregate_ring_rx_counters(fbn, &nv->qt[i].sub1); + fbnic_aggregate_ring_rx_counters(fbn, &nv->qt[i].cmpl); + } +} + +static void fbnic_clone_swap(struct fbnic_net *orig, + struct fbnic_net *clone) +{ + struct fbnic_dev *fbd = orig->fbd; + unsigned int i; + + for (i = 0; i < max(clone->num_napi, orig->num_napi); i++) + fbnic_synchronize_irq(fbd, FBNIC_NON_NAPI_VECTORS + i); + for (i = 0; i < orig->num_napi; i++) + fbnic_aggregate_vector_counters(orig, orig->napi[i]); + + fbnic_clone_swap_cfg(orig, clone); + + for (i = 0; i < ARRAY_SIZE(orig->napi); i++) + swap(clone->napi[i], orig->napi[i]); + for (i = 0; i < ARRAY_SIZE(orig->tx); i++) + swap(clone->tx[i], orig->tx[i]); + for (i = 0; i < ARRAY_SIZE(orig->rx); i++) + swap(clone->rx[i], orig->rx[i]); +} + +static void fbnic_clone_free(struct fbnic_net *clone) +{ + kfree(clone); +} + static void fbnic_get_strings(struct net_device *dev, u32 sset, u8 *data) { int i; @@ -342,6 +412,8 @@ static int fbnic_set_channels(struct net_device *netdev, struct fbnic_net *fbn = netdev_priv(netdev); unsigned int max_napis, standalone; struct fbnic_dev *fbd = fbn->fbd; + struct fbnic_net *clone; + int err; max_napis = fbd->num_irqs - FBNIC_NON_NAPI_VECTORS; standalone = ch->rx_count + ch->tx_count; @@ -363,7 +435,54 @@ static int fbnic_set_channels(struct net_device *netdev, return 0; } - return -EBUSY; + clone = fbnic_clone_create(fbn); + if (!clone) + return -ENOMEM; + + fbnic_set_queues(clone, ch, max_napis); + + err = fbnic_alloc_napi_vectors(clone); + if (err) + goto err_free_clone; + + err = fbnic_alloc_resources(clone); + if (err) + goto err_free_napis; + + fbnic_down_noidle(fbn); + err = fbnic_wait_all_queues_idle(fbn->fbd, true); + if (err) + goto err_start_stack; + + err = fbnic_set_netif_queues(clone); + if (err) + goto err_start_stack; + + /* Nothing can fail past this point */ + fbnic_flush(fbn); + + fbnic_clone_swap(fbn, clone); + + /* Reset RSS indirection table */ + fbnic_reset_indir_tbl(fbn); + + fbnic_up(fbn); + + fbnic_free_resources(clone); + fbnic_free_napi_vectors(clone); + fbnic_clone_free(clone); + + return 0; + +err_start_stack: + fbnic_flush(fbn); + fbnic_up(fbn); + fbnic_free_resources(clone); +err_free_napis: + fbnic_free_napi_vectors(clone); +err_free_clone: + fbnic_clone_free(clone); + return err; } static int diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_irq.c b/drivers/net/ethernet/meta/fbnic/fbnic_irq.c index a8ea7b6774a83..1bbc0e56f3a03 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_irq.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_irq.c @@ -146,6 +146,17 @@ void fbnic_pcs_irq_disable(struct fbnic_dev *fbd) free_irq(fbd->pcs_msix_vector, fbd); } +void fbnic_synchronize_irq(struct fbnic_dev *fbd, int nr) +{ + struct pci_dev *pdev = to_pci_dev(fbd->dev); + int irq = pci_irq_vector(pdev, nr); + + if (irq < 0) + return; + + synchronize_irq(irq); +} + int fbnic_request_irq(struct fbnic_dev *fbd, int nr, irq_handler_t handler, unsigned long flags, const char *name, void *data) { diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.h b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.h index 0986c8f120a80..a392ac1cc4f2f 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.h @@ -65,6 +65,7 @@ struct fbnic_net { int __fbnic_open(struct fbnic_net *fbn); void fbnic_up(struct fbnic_net *fbn); void fbnic_down(struct fbnic_net *fbn); +void fbnic_down_noidle(struct fbnic_net *fbn); struct net_device *fbnic_netdev_alloc(struct fbnic_dev *fbd); void fbnic_netdev_free(struct fbnic_dev *fbd); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c index 32702dc4a0667..6cbbc2ee3e1f9 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c @@ -145,7 +145,7 @@ void fbnic_up(struct fbnic_net *fbn) fbnic_service_task_start(fbn); } -static void fbnic_down_noidle(struct fbnic_net *fbn) +void fbnic_down_noidle(struct fbnic_net *fbn) { fbnic_service_task_stop(fbn); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c index 92fc1ad6ed6f9..bb54ce5f57871 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c @@ -1045,8 +1045,8 @@ irqreturn_t fbnic_msix_clean_rings(int __always_unused irq, void *data) return IRQ_HANDLED; } -static void fbnic_aggregate_ring_rx_counters(struct fbnic_net *fbn, - struct fbnic_ring *rxr) +void fbnic_aggregate_ring_rx_counters(struct fbnic_net *fbn, + struct fbnic_ring *rxr) { struct fbnic_queue_stats *stats = &rxr->stats; @@ -1056,8 +1056,8 @@ static void fbnic_aggregate_ring_rx_counters(struct fbnic_net *fbn, fbn->rx_stats.dropped += stats->dropped; } -static void fbnic_aggregate_ring_tx_counters(struct fbnic_net *fbn, - struct fbnic_ring *txr) +void fbnic_aggregate_ring_tx_counters(struct fbnic_net *fbn, + struct fbnic_ring *txr) { struct fbnic_queue_stats *stats = &txr->stats; diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.h b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.h index 92c671135ad73..c2a94f31f71bd 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.h @@ -120,6 +120,11 @@ netdev_features_t fbnic_features_check(struct sk_buff *skb, struct net_device *dev, netdev_features_t features); +void fbnic_aggregate_ring_rx_counters(struct fbnic_net *fbn, + struct fbnic_ring *rxr); +void fbnic_aggregate_ring_tx_counters(struct fbnic_net *fbn, + struct fbnic_ring *txr); + int fbnic_alloc_napi_vectors(struct fbnic_net *fbn); void fbnic_free_napi_vectors(struct fbnic_net *fbn); int fbnic_alloc_resources(struct fbnic_net *fbn); From 4a4d38ace1fb0586bffd2aab03caaa05d6011748 Mon Sep 17 00:00:00 2001 From: Siddharth Vadapalli Date: Fri, 20 Dec 2024 13:26:14 +0530 Subject: [PATCH 0984/1450] net: ethernet: ti: am65-cpsw: default to round-robin for host port receive The Host Port (i.e. CPU facing port) of CPSW receives traffic from Linux via TX DMA Channels which are Hardware Queues consisting of traffic categorized according to their priority. The Host Port is configured to dequeue traffic from these Hardware Queues on the basis of priority i.e. as long as traffic exists on a Hardware Queue of a higher priority, the traffic on Hardware Queues of lower priority isn't dequeued. An alternate operation is also supported wherein traffic can be dequeued by the Host Port in a Round-Robin manner. Until commit under Fixes, the am65-cpsw driver enabled a single TX DMA Channel, due to which, unless modified by user via "ethtool", all traffic from Linux is transmitted on DMA Channel 0. Therefore, configuring the Host Port for priority based dequeuing or Round-Robin operation is identical since there is a single DMA Channel. Since commit under Fixes, all 8 TX DMA Channels are enabled by default. Additionally, the default "tc mapping" doesn't take into account the possibility of different traffic profiles which various users might have. This results in traffic starvation at the Host Port due to the priority based dequeuing which has been enabled by default since the inception of the driver. The traffic starvation triggers NETDEV WATCHDOG timeout for all TX DMA Channels that haven't been serviced due to the presence of traffic on the higher priority TX DMA Channels. Fix this by defaulting to Round-Robin dequeuing at the Host Port, which shall ensure that traffic is dequeued from all TX DMA Channels irrespective of the traffic profile. This will address the NETDEV WATCHDOG timeouts. At the same time, users can still switch from Round-Robin to Priority based dequeuing at the Host Port with the help of the "p0-rx-ptype-rrobin" private flag of "ethtool". Users are expected to setup an appropriate "tc mapping" that suits their traffic profile when switching to priority based dequeuing at the Host Port. Fixes: be397ea3473d ("net: ethernet: am65-cpsw: Set default TX channels to maximum") Cc: Signed-off-by: Siddharth Vadapalli Link: https://patch.msgid.link/20241220075618.228202-1-s-vadapalli@ti.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 14e1df721f2e8..5465bf872734a 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -3551,7 +3551,7 @@ static int am65_cpsw_nuss_probe(struct platform_device *pdev) init_completion(&common->tdown_complete); common->tx_ch_num = AM65_CPSW_DEFAULT_TX_CHNS; common->rx_ch_num_flows = AM65_CPSW_DEFAULT_RX_CHN_FLOWS; - common->pf_p0_rx_ptype_rrobin = false; + common->pf_p0_rx_ptype_rrobin = true; common->default_vlan = 1; common->ports = devm_kcalloc(dev, common->port_num, From 4c61d809cf608842112c77880f50810a564cd9cb Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Fri, 20 Dec 2024 09:37:40 +0100 Subject: [PATCH 0985/1450] net: ethtool: Fix suspicious rcu_dereference usage The __ethtool_get_ts_info function can be called with or without the rtnl lock held. When the rtnl lock is not held, using rtnl_dereference() triggers a warning due to the lack of lock context. Add an rcu_read_lock() to ensure the lock is acquired and to maintain synchronization. Reported-by: syzbot+a344326c05c98ba19682@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/676147f8.050a0220.37aaf.0154.GAE@google.com/ Fixes: b9e3f7dc9ed9 ("net: ethtool: tsinfo: Enhance tsinfo to support several hwtstamp by net topology") Signed-off-by: Kory Maincent Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241220083741.175329-1-kory.maincent@bootlin.com Signed-off-by: Jakub Kicinski --- net/ethtool/common.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 02f941f667dd5..2607aea1fbfb2 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -870,7 +870,8 @@ int __ethtool_get_ts_info(struct net_device *dev, { struct hwtstamp_provider *hwprov; - hwprov = rtnl_dereference(dev->hwprov); + rcu_read_lock(); + hwprov = rcu_dereference(dev->hwprov); /* No provider specified, use default behavior */ if (!hwprov) { const struct ethtool_ops *ops = dev->ethtool_ops; @@ -887,9 +888,11 @@ int __ethtool_get_ts_info(struct net_device *dev, info->so_timestamping |= SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE; + rcu_read_unlock(); return err; } + rcu_read_unlock(); return ethtool_get_ts_info_by_phc(dev, info, &hwprov->desc); } From 75221e96101fa93390d3db5c23e026f5e3565d9b Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Fri, 20 Dec 2024 18:04:00 +0100 Subject: [PATCH 0986/1450] net: pse-pd: tps23881: Fix power on/off issue An issue was present in the initial driver implementation. The driver read the power status of all channels before toggling the bit of the desired one. Using the power status register as a base value introduced a problem, because only the bit corresponding to the concerned channel ID should be set in the write-only power enable register. This led to cases where disabling power for one channel also powered off other channels. This patch removes the power status read and ensures the value is limited to the bit matching the channel index of the PI. Fixes: 20e6d190ffe1 ("net: pse-pd: Add TI TPS23881 PSE controller driver") Signed-off-by: Kory Maincent Acked-by: Oleksij Rempel Link: https://patch.msgid.link/20241220170400.291705-1-kory.maincent@bootlin.com Signed-off-by: Jakub Kicinski --- drivers/net/pse-pd/tps23881.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/drivers/net/pse-pd/tps23881.c b/drivers/net/pse-pd/tps23881.c index 5c4e88be46ee3..8797ca1a8a219 100644 --- a/drivers/net/pse-pd/tps23881.c +++ b/drivers/net/pse-pd/tps23881.c @@ -64,15 +64,11 @@ static int tps23881_pi_enable(struct pse_controller_dev *pcdev, int id) if (id >= TPS23881_MAX_CHANS) return -ERANGE; - ret = i2c_smbus_read_word_data(client, TPS23881_REG_PW_STATUS); - if (ret < 0) - return ret; - chan = priv->port[id].chan[0]; if (chan < 4) - val = (u16)(ret | BIT(chan)); + val = BIT(chan); else - val = (u16)(ret | BIT(chan + 4)); + val = BIT(chan + 4); if (priv->port[id].is_4p) { chan = priv->port[id].chan[1]; @@ -100,15 +96,11 @@ static int tps23881_pi_disable(struct pse_controller_dev *pcdev, int id) if (id >= TPS23881_MAX_CHANS) return -ERANGE; - ret = i2c_smbus_read_word_data(client, TPS23881_REG_PW_STATUS); - if (ret < 0) - return ret; - chan = priv->port[id].chan[0]; if (chan < 4) - val = (u16)(ret | BIT(chan + 4)); + val = BIT(chan + 4); else - val = (u16)(ret | BIT(chan + 8)); + val = BIT(chan + 8); if (priv->port[id].is_4p) { chan = priv->port[id].chan[1]; From 050a4c011b0dfeb91664a5d7bd3647ff38db08ce Mon Sep 17 00:00:00 2001 From: Shahar Shitrit Date: Fri, 20 Dec 2024 10:15:02 +0200 Subject: [PATCH 0987/1450] net/mlx5: DR, select MSIX vector 0 for completion queue creation When creating a software steering completion queue (CQ), an arbitrary MSIX vector n is selected. This results in the CQ sharing the same Ethernet traffic channel n associated with the chosen vector. However, the value of n is often unpredictable, which can introduce complications for interrupt monitoring and verification tools. Moreover, SW steering uses polling rather than event-driven interrupts. Therefore, there is no need to select any MSIX vector other than the existing vector 0 for CQ creation. In light of these factors, and to enhance predictability, we modify the code to consistently select MSIX vector 0 for CQ creation. Fixes: 297cccebdc5a ("net/mlx5: DR, Expose an internal API to issue RDMA operations") Signed-off-by: Shahar Shitrit Reviewed-by: Yevgeny Kliteynik Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241220081505.1286093-2-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c index 6fa06ba2d3465..f57c84e5128bc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c @@ -1067,7 +1067,6 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, int inlen, err, eqn; void *cqc, *in; __be64 *pas; - int vector; u32 i; cq = kzalloc(sizeof(*cq), GFP_KERNEL); @@ -1096,8 +1095,7 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, if (!in) goto err_cqwq; - vector = raw_smp_processor_id() % mlx5_comp_vectors_max(mdev); - err = mlx5_comp_eqn_get(mdev, vector, &eqn); + err = mlx5_comp_eqn_get(mdev, 0, &eqn); if (err) { kvfree(in); goto err_cqwq; From 8c6254479b3d5bd788d2b5fefaa48fb194331ed0 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Fri, 20 Dec 2024 10:15:03 +0200 Subject: [PATCH 0988/1450] net/mlx5e: macsec: Maintain TX SA from encoding_sa In MACsec, it is possible to create multiple active TX SAs on a SC, but only one such SA can be used at a time for transmission. This SA is selected through the encoding_sa link parameter. When there are 2 or more active TX SAs configured (encoding_sa=0): ip macsec add macsec0 tx sa 0 pn 1 on key 00 ip macsec add macsec0 tx sa 1 pn 1 on key 00 ... the traffic should be still sent via TX SA 0 as the encoding_sa was not changed. However, the driver ignores the encoding_sa and overrides it to SA 1 by installing the flow steering id of the newly created TX SA into the SCI -> flow steering id hash map. The future packet tx descriptors will point to the incorrect flow steering rule (SA 1). This patch fixes the issue by avoiding the creation of the flow steering rule for an active TX SA that is not the encoding_sa. The driver side tx_sa object and the FW side macsec object are still created. When the encoding_sa link parameter is changed to another active TX SA, only the new flow steering rule will be created in the mlx5e_macsec_upd_txsa() handler. Fixes: 8ff0ac5be144 ("net/mlx5: Add MACsec offload Tx command support") Signed-off-by: Dragos Tatulea Reviewed-by: Cosmin Ratiu Reviewed-by: Lior Nahmanson Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241220081505.1286093-3-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c index cc9bcc4200324..6ab02f3fc2912 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c @@ -339,9 +339,13 @@ static int mlx5e_macsec_init_sa_fs(struct macsec_context *ctx, { struct mlx5e_priv *priv = macsec_netdev_priv(ctx->netdev); struct mlx5_macsec_fs *macsec_fs = priv->mdev->macsec_fs; + const struct macsec_tx_sc *tx_sc = &ctx->secy->tx_sc; struct mlx5_macsec_rule_attrs rule_attrs; union mlx5_macsec_rule *macsec_rule; + if (is_tx && tx_sc->encoding_sa != sa->assoc_num) + return 0; + rule_attrs.macsec_obj_id = sa->macsec_obj_id; rule_attrs.sci = sa->sci; rule_attrs.assoc_num = sa->assoc_num; From 5a03b368562a7ff5f5f1f63b5adf8309cbdbd5be Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Fri, 20 Dec 2024 10:15:04 +0200 Subject: [PATCH 0989/1450] net/mlx5e: Skip restore TC rules for vport rep without loaded flag During driver unload, unregister_netdev is called after unloading vport rep. So, the mlx5e_rep_priv is already freed while trying to get rpriv->netdev, or walk rpriv->tc_ht, which results in use-after-free. So add the checking to make sure access the data of vport rep which is still loaded. Fixes: d1569537a837 ("net/mlx5e: Modify and restore TC rules for IPSec TX rules") Signed-off-by: Jianbo Liu Reviewed-by: Saeed Mahameed Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241220081505.1286093-4-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec_fs.c | 6 +++--- drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 3 +++ drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 3 --- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec_fs.c index 5a0047bdcb510..ed977ae75fab8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec_fs.c @@ -150,11 +150,11 @@ void mlx5_esw_ipsec_restore_dest_uplink(struct mlx5_core_dev *mdev) unsigned long i; int err; - xa_for_each(&esw->offloads.vport_reps, i, rep) { - rpriv = rep->rep_data[REP_ETH].priv; - if (!rpriv || !rpriv->netdev) + mlx5_esw_for_each_rep(esw, i, rep) { + if (atomic_read(&rep->rep_data[REP_ETH].state) != REP_LOADED) continue; + rpriv = rep->rep_data[REP_ETH].priv; rhashtable_walk_enter(&rpriv->tc_ht, &iter); rhashtable_walk_start(&iter); while ((flow = rhashtable_walk_next(&iter)) != NULL) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index a83d41121db67..8573d36785f42 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -714,6 +714,9 @@ void mlx5e_tc_clean_fdb_peer_flows(struct mlx5_eswitch *esw); MLX5_CAP_GEN_2((esw->dev), ec_vf_vport_base) +\ (last) - 1) +#define mlx5_esw_for_each_rep(esw, i, rep) \ + xa_for_each(&((esw)->offloads.vport_reps), i, rep) + struct mlx5_eswitch *__must_check mlx5_devlink_eswitch_get(struct devlink *devlink); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index d5b42b3a19fdf..40359f3207248 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -53,9 +53,6 @@ #include "lag/lag.h" #include "en/tc/post_meter.h" -#define mlx5_esw_for_each_rep(esw, i, rep) \ - xa_for_each(&((esw)->offloads.vport_reps), i, rep) - /* There are two match-all miss flows, one for unicast dst mac and * one for multicast. */ From 2a4f56fbcc473d8faeb29b73082df39efbe5893c Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Fri, 20 Dec 2024 10:15:05 +0200 Subject: [PATCH 0990/1450] net/mlx5e: Keep netdev when leave switchdev for devlink set legacy only In the cited commit, when changing from switchdev to legacy mode, uplink representor's netdev is kept, and its profile is replaced with nic profile, so netdev is detached from old profile, then attach to new profile. During profile change, the hardware resources allocated by the old profile will be cleaned up. However, the cleanup is relying on the related kernel modules. And they may need to flush themselves first, which is triggered by netdev events, for example, NETDEV_UNREGISTER. However, netdev is kept, or netdev_register is called after the cleanup, which may cause troubles because the resources are still referred by kernel modules. The same process applies to all the caes when uplink is leaving switchdev mode, including devlink eswitch mode set legacy, driver unload and devlink reload. For the first one, it can be blocked and returns failure to users, whenever possible. But it's hard for the others. Besides, the attachment to nic profile is unnecessary as the netdev will be unregistered anyway for such cases. So in this patch, the original behavior is kept only for devlink eswitch set mode legacy. For the others, moves netdev unregistration before the profile change. Fixes: 7a9fb35e8c3a ("net/mlx5e: Do not reload ethernet ports when changing eswitch mode") Signed-off-by: Jianbo Liu Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241220081505.1286093-5-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/en_main.c | 19 +++++++++++++++++-- .../net/ethernet/mellanox/mlx5/core/en_rep.c | 15 +++++++++++++++ .../mellanox/mlx5/core/eswitch_offloads.c | 2 ++ include/linux/mlx5/driver.h | 1 + 4 files changed, 35 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index dd16d73000c31..0ec17c276bdd2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -6542,8 +6542,23 @@ static void _mlx5e_remove(struct auxiliary_device *adev) mlx5_core_uplink_netdev_set(mdev, NULL); mlx5e_dcbnl_delete_app(priv); - unregister_netdev(priv->netdev); - _mlx5e_suspend(adev, false); + /* When unload driver, the netdev is in registered state + * if it's from legacy mode. If from switchdev mode, it + * is already unregistered before changing to NIC profile. + */ + if (priv->netdev->reg_state == NETREG_REGISTERED) { + unregister_netdev(priv->netdev); + _mlx5e_suspend(adev, false); + } else { + struct mlx5_core_dev *pos; + int i; + + if (test_bit(MLX5E_STATE_DESTROYING, &priv->state)) + mlx5_sd_for_each_dev(i, mdev, pos) + mlx5e_destroy_mdev_resources(pos); + else + _mlx5e_suspend(adev, true); + } /* Avoid cleanup if profile rollback failed. */ if (priv->profile) priv->profile->cleanup(priv); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 554f9cb5b53fd..fdff9fd8a89ec 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -1509,6 +1509,21 @@ mlx5e_vport_uplink_rep_unload(struct mlx5e_rep_priv *rpriv) priv = netdev_priv(netdev); + /* This bit is set when using devlink to change eswitch mode from + * switchdev to legacy. As need to keep uplink netdev ifindex, we + * detach uplink representor profile and attach NIC profile only. + * The netdev will be unregistered later when unload NIC auxiliary + * driver for this case. + * We explicitly block devlink eswitch mode change if any IPSec rules + * offloaded, but can't block other cases, such as driver unload + * and devlink reload. We have to unregister netdev before profile + * change for those cases. This is to avoid resource leak because + * the offloaded rules don't have the chance to be unoffloaded before + * cleanup which is triggered by detach uplink representor profile. + */ + if (!(priv->mdev->priv.flags & MLX5_PRIV_FLAGS_SWITCH_LEGACY)) + unregister_netdev(netdev); + mlx5e_netdev_attach_nic_profile(priv); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 40359f3207248..06076dd9ec644 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -3777,6 +3777,8 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode, esw->eswitch_operation_in_progress = true; up_write(&esw->mode_lock); + if (mode == DEVLINK_ESWITCH_MODE_LEGACY) + esw->dev->priv.flags |= MLX5_PRIV_FLAGS_SWITCH_LEGACY; mlx5_eswitch_disable_locked(esw); if (mode == DEVLINK_ESWITCH_MODE_SWITCHDEV) { if (mlx5_devlink_trap_get_num_active(esw->dev)) { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index fc7e6153b73d9..8f5991168ccdf 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -524,6 +524,7 @@ enum { * creation/deletion on drivers rescan. Unset during device attach. */ MLX5_PRIV_FLAGS_DETACH = 1 << 2, + MLX5_PRIV_FLAGS_SWITCH_LEGACY = 1 << 3, }; struct mlx5_adev { From c71b59690aa12daf3edbb4dd02b8821490dc727e Mon Sep 17 00:00:00 2001 From: Daniel Machon Date: Fri, 20 Dec 2024 14:48:40 +0100 Subject: [PATCH 0991/1450] net: sparx5: do some preparation work The sparx5_port_init() does initial configuration of a variety of different features and options for each port. Some are shared for all types of devices, some are not. As it is now, common configuration is done after configuration of low-speed devices. This will not work when adding RGMII support in a subsequent patch. In preparation for lan969x RGMII support, move a block of code, that configures 2g5 devices, down. This ensures that the configuration common to all devices is done before configuration of 2g5, 5g, 10g and 25g devices. Reviewed-by: Steen Hegelund Reviewed-by: Horatiu Vultur Tested-by: Robert Marko Signed-off-by: Daniel Machon Link: https://patch.msgid.link/20241220-sparx5-lan969x-switch-driver-4-v5-1-fa8ba5dff732@microchip.com Signed-off-by: Jakub Kicinski --- .../ethernet/microchip/sparx5/sparx5_port.c | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_port.c b/drivers/net/ethernet/microchip/sparx5/sparx5_port.c index f9d1a6bb9bfff..f39bf4878e11c 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_port.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_port.c @@ -1067,24 +1067,6 @@ int sparx5_port_init(struct sparx5 *sparx5, if (err) return err; - /* Configure MAC vlan awareness */ - err = sparx5_port_max_tags_set(sparx5, port); - if (err) - return err; - - /* Set Max Length */ - spx5_rmw(DEV2G5_MAC_MAXLEN_CFG_MAX_LEN_SET(ETH_MAXLEN), - DEV2G5_MAC_MAXLEN_CFG_MAX_LEN, - sparx5, - DEV2G5_MAC_MAXLEN_CFG(port->portno)); - - /* 1G/2G5: Signal Detect configuration */ - spx5_wr(DEV2G5_PCS1G_SD_CFG_SD_POL_SET(sd_pol) | - DEV2G5_PCS1G_SD_CFG_SD_SEL_SET(sd_sel) | - DEV2G5_PCS1G_SD_CFG_SD_ENA_SET(sd_ena), - sparx5, - DEV2G5_PCS1G_SD_CFG(port->portno)); - /* Set Pause WM hysteresis */ spx5_rmw(QSYS_PAUSE_CFG_PAUSE_START_SET(pause_start) | QSYS_PAUSE_CFG_PAUSE_STOP_SET(pause_stop) | @@ -1108,6 +1090,24 @@ int sparx5_port_init(struct sparx5 *sparx5, ANA_CL_FILTER_CTRL_FILTER_SMAC_MC_DIS, sparx5, ANA_CL_FILTER_CTRL(port->portno)); + /* Configure MAC vlan awareness */ + err = sparx5_port_max_tags_set(sparx5, port); + if (err) + return err; + + /* Set Max Length */ + spx5_rmw(DEV2G5_MAC_MAXLEN_CFG_MAX_LEN_SET(ETH_MAXLEN), + DEV2G5_MAC_MAXLEN_CFG_MAX_LEN, + sparx5, + DEV2G5_MAC_MAXLEN_CFG(port->portno)); + + /* 1G/2G5: Signal Detect configuration */ + spx5_wr(DEV2G5_PCS1G_SD_CFG_SD_POL_SET(sd_pol) | + DEV2G5_PCS1G_SD_CFG_SD_SEL_SET(sd_sel) | + DEV2G5_PCS1G_SD_CFG_SD_ENA_SET(sd_ena), + sparx5, + DEV2G5_PCS1G_SD_CFG(port->portno)); + if (conf->portmode == PHY_INTERFACE_MODE_QSGMII || conf->portmode == PHY_INTERFACE_MODE_SGMII) { err = sparx5_serdes_set(sparx5, port, conf); From dd2baee1084034b8666290cbcc02cb32fe5a8666 Mon Sep 17 00:00:00 2001 From: Daniel Machon Date: Fri, 20 Dec 2024 14:48:41 +0100 Subject: [PATCH 0992/1450] net: sparx5: add function for RGMII port check The lan969x device contains two RGMII port interfaces, sitting at port 28 and 29. Add function: is_port_rgmii() to the match data ops, that checks if a given port is an RGMII port or not. For Sparx5, this function always returns false. Reviewed-by: Steen Hegelund Reviewed-by: Horatiu Vultur Tested-by: Robert Marko Signed-off-by: Daniel Machon Link: https://patch.msgid.link/20241220-sparx5-lan969x-switch-driver-4-v5-2-fa8ba5dff732@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/sparx5/lan969x/lan969x.c | 1 + drivers/net/ethernet/microchip/sparx5/lan969x/lan969x.h | 5 +++++ drivers/net/ethernet/microchip/sparx5/sparx5_main.c | 1 + drivers/net/ethernet/microchip/sparx5/sparx5_main.h | 1 + drivers/net/ethernet/microchip/sparx5/sparx5_port.h | 5 +++++ 5 files changed, 13 insertions(+) diff --git a/drivers/net/ethernet/microchip/sparx5/lan969x/lan969x.c b/drivers/net/ethernet/microchip/sparx5/lan969x/lan969x.c index c2afa2176b08f..76f0c8635eb98 100644 --- a/drivers/net/ethernet/microchip/sparx5/lan969x/lan969x.c +++ b/drivers/net/ethernet/microchip/sparx5/lan969x/lan969x.c @@ -329,6 +329,7 @@ static const struct sparx5_ops lan969x_ops = { .is_port_5g = &lan969x_port_is_5g, .is_port_10g = &lan969x_port_is_10g, .is_port_25g = &lan969x_port_is_25g, + .is_port_rgmii = &lan969x_port_is_rgmii, .get_port_dev_index = &lan969x_port_dev_mapping, .get_port_dev_bit = &lan969x_get_dev_mode_bit, .get_hsch_max_group_rate = &lan969x_get_hsch_max_group_rate, diff --git a/drivers/net/ethernet/microchip/sparx5/lan969x/lan969x.h b/drivers/net/ethernet/microchip/sparx5/lan969x/lan969x.h index 2489d0d32dfd1..4b91c47d6d216 100644 --- a/drivers/net/ethernet/microchip/sparx5/lan969x/lan969x.h +++ b/drivers/net/ethernet/microchip/sparx5/lan969x/lan969x.h @@ -59,6 +59,11 @@ static inline bool lan969x_port_is_25g(int portno) return false; } +static inline bool lan969x_port_is_rgmii(int portno) +{ + return portno == 28 || portno == 29; +} + /* lan969x_calendar.c */ int lan969x_dsm_calendar_calc(struct sparx5 *sparx5, u32 taxi, struct sparx5_calendar_data *data); diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_main.c b/drivers/net/ethernet/microchip/sparx5/sparx5_main.c index f61aa15beab7c..4be717ba7d373 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_main.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_main.c @@ -1072,6 +1072,7 @@ static const struct sparx5_ops sparx5_ops = { .is_port_5g = &sparx5_port_is_5g, .is_port_10g = &sparx5_port_is_10g, .is_port_25g = &sparx5_port_is_25g, + .is_port_rgmii = &sparx5_port_is_rgmii, .get_port_dev_index = &sparx5_port_dev_mapping, .get_port_dev_bit = &sparx5_port_dev_mapping, .get_hsch_max_group_rate = &sparx5_get_hsch_max_group_rate, diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_main.h b/drivers/net/ethernet/microchip/sparx5/sparx5_main.h index d5dd953b0a71d..c58d7841638ef 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_main.h +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_main.h @@ -313,6 +313,7 @@ struct sparx5_ops { bool (*is_port_5g)(int portno); bool (*is_port_10g)(int portno); bool (*is_port_25g)(int portno); + bool (*is_port_rgmii)(int portno); u32 (*get_port_dev_index)(struct sparx5 *sparx5, int port); u32 (*get_port_dev_bit)(struct sparx5 *sparx5, int port); u32 (*get_hsch_max_group_rate)(int grp); diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_port.h b/drivers/net/ethernet/microchip/sparx5/sparx5_port.h index 9b9bcc6834bcb..c8a37468a3d1d 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_port.h +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_port.h @@ -40,6 +40,11 @@ static inline bool sparx5_port_is_25g(int portno) return portno >= 56 && portno <= 63; } +static inline bool sparx5_port_is_rgmii(int portno) +{ + return false; +} + static inline u32 sparx5_to_high_dev(struct sparx5 *sparx5, int port) { const struct sparx5_ops *ops = sparx5->data->ops; From 05bda8a1bdedd5dfc21522d732c3bf9413d70eb3 Mon Sep 17 00:00:00 2001 From: Daniel Machon Date: Fri, 20 Dec 2024 14:48:42 +0100 Subject: [PATCH 0993/1450] net: sparx5: use is_port_rgmii() throughout Now that we can check if a given port is an RGMII port, use it in the following cases: - To set RGMII PHY modes for RGMII port devices. - To avoid checking for a SerDes node in the devicetree, when the port is an RGMII port. - To bail out of sparx5_port_init() when the common configuration is done. Reviewed-by: Steen Hegelund Reviewed-by: Horatiu Vultur Tested-by: Robert Marko Signed-off-by: Daniel Machon Link: https://patch.msgid.link/20241220-sparx5-lan969x-switch-driver-4-v5-3-fa8ba5dff732@microchip.com Signed-off-by: Jakub Kicinski --- .../ethernet/microchip/sparx5/sparx5_main.c | 28 +++++++++++++------ .../ethernet/microchip/sparx5/sparx5_port.c | 3 ++ 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_main.c b/drivers/net/ethernet/microchip/sparx5/sparx5_main.c index 4be717ba7d373..e68277c38adc3 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_main.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_main.c @@ -313,10 +313,13 @@ static int sparx5_create_port(struct sparx5 *sparx5, struct initial_port_config *config) { struct sparx5_port *spx5_port; + const struct sparx5_ops *ops; struct net_device *ndev; struct phylink *phylink; int err; + ops = sparx5->data->ops; + ndev = sparx5_create_netdev(sparx5, config->portno); if (IS_ERR(ndev)) { dev_err(sparx5->dev, "Could not create net device: %02u\n", @@ -357,6 +360,9 @@ static int sparx5_create_port(struct sparx5 *sparx5, MAC_SYM_PAUSE | MAC_10 | MAC_100 | MAC_1000FD | MAC_2500FD | MAC_5000FD | MAC_10000FD | MAC_25000FD; + if (ops->is_port_rgmii(spx5_port->portno)) + phy_interface_set_rgmii(spx5_port->phylink_config.supported_interfaces); + __set_bit(PHY_INTERFACE_MODE_SGMII, spx5_port->phylink_config.supported_interfaces); __set_bit(PHY_INTERFACE_MODE_QSGMII, @@ -830,6 +836,7 @@ static int mchp_sparx5_probe(struct platform_device *pdev) struct initial_port_config *configs, *config; struct device_node *np = pdev->dev.of_node; struct device_node *ports, *portnp; + const struct sparx5_ops *ops; struct reset_control *reset; struct sparx5 *sparx5; int idx = 0, err = 0; @@ -851,6 +858,7 @@ static int mchp_sparx5_probe(struct platform_device *pdev) return -EINVAL; regs = sparx5->data->regs; + ops = sparx5->data->ops; /* Do switch core reset if available */ reset = devm_reset_control_get_optional_shared(&pdev->dev, "switch"); @@ -880,7 +888,7 @@ static int mchp_sparx5_probe(struct platform_device *pdev) for_each_available_child_of_node(ports, portnp) { struct sparx5_port_config *conf; - struct phy *serdes; + struct phy *serdes = NULL; u32 portno; err = of_property_read_u32(portnp, "reg", &portno); @@ -910,13 +918,17 @@ static int mchp_sparx5_probe(struct platform_device *pdev) conf->sd_sgpio = ~0; else sparx5->sd_sgpio_remapping = true; - serdes = devm_of_phy_get(sparx5->dev, portnp, NULL); - if (IS_ERR(serdes)) { - err = dev_err_probe(sparx5->dev, PTR_ERR(serdes), - "port %u: missing serdes\n", - portno); - of_node_put(portnp); - goto cleanup_config; + /* There is no SerDes node for RGMII ports. */ + if (!ops->is_port_rgmii(portno)) { + serdes = devm_of_phy_get(sparx5->dev, portnp, NULL); + if (IS_ERR(serdes)) { + err = dev_err_probe(sparx5->dev, + PTR_ERR(serdes), + "port %u: missing serdes\n", + portno); + of_node_put(portnp); + goto cleanup_config; + } } config->portno = portno; config->node = portnp; diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_port.c b/drivers/net/ethernet/microchip/sparx5/sparx5_port.c index f39bf4878e11c..996dc43430191 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_port.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_port.c @@ -1090,6 +1090,9 @@ int sparx5_port_init(struct sparx5 *sparx5, ANA_CL_FILTER_CTRL_FILTER_SMAC_MC_DIS, sparx5, ANA_CL_FILTER_CTRL(port->portno)); + if (ops->is_port_rgmii(port->portno)) + return 0; /* RGMII device - nothing more to configure */ + /* Configure MAC vlan awareness */ err = sparx5_port_max_tags_set(sparx5, port); if (err) From d9450934f915a97b09f035866acd5da302f0dc12 Mon Sep 17 00:00:00 2001 From: Daniel Machon Date: Fri, 20 Dec 2024 14:48:43 +0100 Subject: [PATCH 0994/1450] net: sparx5: skip low-speed configuration when port is RGMII When doing a port config, we configure low-speed port devices, among other things. We have a check to ensure, that the device is indeed a low-speed device, an not a high-speed device. Add an additional check, to ensure that the device is not an RGMII device. Reviewed-by: Steen Hegelund Reviewed-by: Horatiu Vultur Tested-by: Robert Marko Signed-off-by: Daniel Machon Link: https://patch.msgid.link/20241220-sparx5-lan969x-switch-driver-4-v5-4-fa8ba5dff732@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/sparx5/sparx5_port.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_port.c b/drivers/net/ethernet/microchip/sparx5/sparx5_port.c index 996dc43430191..0a1374422ccbd 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_port.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_port.c @@ -994,6 +994,7 @@ int sparx5_port_config(struct sparx5 *sparx5, struct sparx5_port *port, struct sparx5_port_config *conf) { + bool rgmii = phy_interface_mode_is_rgmii(conf->phy_mode); bool high_speed_dev = sparx5_is_baser(conf->portmode); const struct sparx5_ops *ops = sparx5->data->ops; int err, urgency, stop_wm; @@ -1003,7 +1004,7 @@ int sparx5_port_config(struct sparx5 *sparx5, return err; /* high speed device is already configured */ - if (!high_speed_dev) + if (!rgmii && !high_speed_dev) sparx5_port_config_low_set(sparx5, port, conf); /* Configure flow control */ From 9b8d70ecfef7abcabe265be4faeb07e552383520 Mon Sep 17 00:00:00 2001 From: Daniel Machon Date: Fri, 20 Dec 2024 14:48:44 +0100 Subject: [PATCH 0995/1450] net: sparx5: only return PCS for modes that require it The RGMII ports have no PCS to configure. Make sure we only return the PCS for port modes that require it. Reviewed-by: Russell King (Oracle) Tested-by: Robert Marko Signed-off-by: Daniel Machon Link: https://patch.msgid.link/20241220-sparx5-lan969x-switch-driver-4-v5-5-fa8ba5dff732@microchip.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/microchip/sparx5/sparx5_phylink.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_phylink.c b/drivers/net/ethernet/microchip/sparx5/sparx5_phylink.c index f8562c1a894da..035d2f1bea0d0 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_phylink.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_phylink.c @@ -32,7 +32,19 @@ sparx5_phylink_mac_select_pcs(struct phylink_config *config, { struct sparx5_port *port = netdev_priv(to_net_dev(config->dev)); - return &port->phylink_pcs; + /* Return the PCS for all the modes that require it. */ + switch (interface) { + case PHY_INTERFACE_MODE_SGMII: + case PHY_INTERFACE_MODE_QSGMII: + case PHY_INTERFACE_MODE_1000BASEX: + case PHY_INTERFACE_MODE_2500BASEX: + case PHY_INTERFACE_MODE_5GBASER: + case PHY_INTERFACE_MODE_10GBASER: + case PHY_INTERFACE_MODE_25GBASER: + return &port->phylink_pcs; + default: + return NULL; + } } static void sparx5_phylink_mac_config(struct phylink_config *config, From 95e467b85e6930d34093b7770c7ed964113589b0 Mon Sep 17 00:00:00 2001 From: Daniel Machon Date: Fri, 20 Dec 2024 14:48:45 +0100 Subject: [PATCH 0996/1450] net: sparx5: verify RGMII speeds When doing a port config, we verify the port speed against the PHY mode and supported speeds of that PHY mode. Add checks for the four RGMII phy modes: RGMII, RGMII_ID, RGMII_TXID and RGMII_RXID. Reviewed-by: Steen Hegelund Reviewed-by: Horatiu Vultur Tested-by: Robert Marko Signed-off-by: Daniel Machon Link: https://patch.msgid.link/20241220-sparx5-lan969x-switch-driver-4-v5-6-fa8ba5dff732@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/sparx5/sparx5_port.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_port.c b/drivers/net/ethernet/microchip/sparx5/sparx5_port.c index 0a1374422ccbd..86d6c9e9ec7c1 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_port.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_port.c @@ -257,6 +257,15 @@ static int sparx5_port_verify_speed(struct sparx5 *sparx5, conf->speed != SPEED_25000)) return sparx5_port_error(port, conf, SPX5_PERR_SPEED); break; + case PHY_INTERFACE_MODE_RGMII: + case PHY_INTERFACE_MODE_RGMII_ID: + case PHY_INTERFACE_MODE_RGMII_TXID: + case PHY_INTERFACE_MODE_RGMII_RXID: + if (conf->speed != SPEED_1000 && + conf->speed != SPEED_100 && + conf->speed != SPEED_10) + return sparx5_port_error(port, conf, SPX5_PERR_SPEED); + break; default: return sparx5_port_error(port, conf, SPX5_PERR_IFTYPE); } From fb6ac1829bb5865768e75517aefb416a3a19569e Mon Sep 17 00:00:00 2001 From: Daniel Machon Date: Fri, 20 Dec 2024 14:48:46 +0100 Subject: [PATCH 0997/1450] net: lan969x: add RGMII registers Configuration of RGMII is done by configuring the GPIO and clock settings in the HSIOWRAP target, and configuring the RGMII port devices in the DEVRGMII target. Both targets contain registers replicated for the number of RGMII port devices, which is two. Add said targets and register macros required to configure RGMII. Reviewed-by: Steen Hegelund Reviewed-by: Horatiu Vultur Tested-by: Robert Marko Signed-off-by: Daniel Machon Link: https://patch.msgid.link/20241220-sparx5-lan969x-switch-driver-4-v5-7-fa8ba5dff732@microchip.com Signed-off-by: Jakub Kicinski --- .../microchip/sparx5/lan969x/lan969x.c | 3 + .../microchip/sparx5/sparx5_main_regs.h | 145 ++++++++++++++++++ 2 files changed, 148 insertions(+) diff --git a/drivers/net/ethernet/microchip/sparx5/lan969x/lan969x.c b/drivers/net/ethernet/microchip/sparx5/lan969x/lan969x.c index 76f0c8635eb98..be49a99556fe9 100644 --- a/drivers/net/ethernet/microchip/sparx5/lan969x/lan969x.c +++ b/drivers/net/ethernet/microchip/sparx5/lan969x/lan969x.c @@ -90,9 +90,12 @@ static const struct sparx5_main_io_resource lan969x_main_iomap[] = { { TARGET_DEV2G5 + 27, 0x30d8000, 1 }, /* 0xe30d8000 */ { TARGET_DEV10G + 9, 0x30dc000, 1 }, /* 0xe30dc000 */ { TARGET_PCS10G_BR + 9, 0x30e0000, 1 }, /* 0xe30e0000 */ + { TARGET_DEVRGMII, 0x30e4000, 1 }, /* 0xe30e4000 */ + { TARGET_DEVRGMII + 1, 0x30e8000, 1 }, /* 0xe30e8000 */ { TARGET_DSM, 0x30ec000, 1 }, /* 0xe30ec000 */ { TARGET_PORT_CONF, 0x30f0000, 1 }, /* 0xe30f0000 */ { TARGET_ASM, 0x3200000, 1 }, /* 0xe3200000 */ + { TARGET_HSIO_WRAP, 0x3408000, 1 }, /* 0xe3408000 */ }; static struct sparx5_sdlb_group lan969x_sdlb_groups[LAN969X_SDLB_GRP_CNT] = { diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_main_regs.h b/drivers/net/ethernet/microchip/sparx5/sparx5_main_regs.h index 561344f190627..d9ef4ef137b80 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_main_regs.h +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_main_regs.h @@ -37,6 +37,7 @@ enum sparx5_target { TARGET_FDMA = 117, TARGET_GCB = 118, TARGET_HSCH = 119, + TARGET_HSIO_WRAP = 120, TARGET_LRN = 122, TARGET_PCEP = 129, TARGET_PCS10G_BR = 132, @@ -54,6 +55,7 @@ enum sparx5_target { TARGET_VCAP_SUPER = 326, TARGET_VOP = 327, TARGET_XQS = 331, + TARGET_DEVRGMII = 392, NUM_TARGETS = 517 }; @@ -5367,6 +5369,69 @@ extern const struct sparx5_regs *regs; #define HSCH_TAS_STATEMACHINE_CFG_REVISIT_DLY_GET(x)\ FIELD_GET(HSCH_TAS_STATEMACHINE_CFG_REVISIT_DLY, x) +/* LAN969X ONLY */ +/* HSIOWRAP:XMII_CFG:XMII_CFG */ +#define HSIO_WRAP_XMII_CFG(g) \ + __REG(TARGET_HSIO_WRAP, 0, 1, 116, g, 2, 20, 0, 0, 1, 4) + +#define HSIO_WRAP_XMII_CFG_GPIO_XMII_CFG GENMASK(2, 1) +#define HSIO_WRAP_XMII_CFG_GPIO_XMII_CFG_SET(x)\ + FIELD_PREP(HSIO_WRAP_XMII_CFG_GPIO_XMII_CFG, x) +#define HSIO_WRAP_XMII_CFG_GPIO_XMII_CFG_GET(x)\ + FIELD_GET(HSIO_WRAP_XMII_CFG_GPIO_XMII_CFG, x) + +/* LAN969X ONLY */ +/* HSIOWRAP:XMII_CFG:RGMII_CFG */ +#define HSIO_WRAP_RGMII_CFG(g) \ + __REG(TARGET_HSIO_WRAP, 0, 1, 116, g, 2, 20, 4, 0, 1, 4) + +#define HSIO_WRAP_RGMII_CFG_TX_CLK_CFG GENMASK(4, 2) +#define HSIO_WRAP_RGMII_CFG_TX_CLK_CFG_SET(x)\ + FIELD_PREP(HSIO_WRAP_RGMII_CFG_TX_CLK_CFG, x) +#define HSIO_WRAP_RGMII_CFG_TX_CLK_CFG_GET(x)\ + FIELD_GET(HSIO_WRAP_RGMII_CFG_TX_CLK_CFG, x) + +#define HSIO_WRAP_RGMII_CFG_RGMII_TX_RST BIT(1) +#define HSIO_WRAP_RGMII_CFG_RGMII_TX_RST_SET(x)\ + FIELD_PREP(HSIO_WRAP_RGMII_CFG_RGMII_TX_RST, x) +#define HSIO_WRAP_RGMII_CFG_RGMII_TX_RST_GET(x)\ + FIELD_GET(HSIO_WRAP_RGMII_CFG_RGMII_TX_RST, x) + +#define HSIO_WRAP_RGMII_CFG_RGMII_RX_RST BIT(0) +#define HSIO_WRAP_RGMII_CFG_RGMII_RX_RST_SET(x)\ + FIELD_PREP(HSIO_WRAP_RGMII_CFG_RGMII_RX_RST, x) +#define HSIO_WRAP_RGMII_CFG_RGMII_RX_RST_GET(x)\ + FIELD_GET(HSIO_WRAP_RGMII_CFG_RGMII_RX_RST, x) + +/* LAN969X ONLY */ +/* HSIOWRAP:XMII_CFG:DLL_CFG */ +#define HSIO_WRAP_DLL_CFG(g, r) \ + __REG(TARGET_HSIO_WRAP, 0, 1, 116, g, 2, 20, 12, r, 2, 4) + +#define HSIO_WRAP_DLL_CFG_DLL_ENA BIT(19) +#define HSIO_WRAP_DLL_CFG_DLL_ENA_SET(x)\ + FIELD_PREP(HSIO_WRAP_DLL_CFG_DLL_ENA, x) +#define HSIO_WRAP_DLL_CFG_DLL_ENA_GET(x)\ + FIELD_GET(HSIO_WRAP_DLL_CFG_DLL_ENA, x) + +#define HSIO_WRAP_DLL_CFG_DLL_CLK_ENA BIT(18) +#define HSIO_WRAP_DLL_CFG_DLL_CLK_ENA_SET(x)\ + FIELD_PREP(HSIO_WRAP_DLL_CFG_DLL_CLK_ENA, x) +#define HSIO_WRAP_DLL_CFG_DLL_CLK_ENA_GET(x)\ + FIELD_GET(HSIO_WRAP_DLL_CFG_DLL_CLK_ENA, x) + +#define HSIO_WRAP_DLL_CFG_DLL_CLK_SEL GENMASK(17, 15) +#define HSIO_WRAP_DLL_CFG_DLL_CLK_SEL_SET(x)\ + FIELD_PREP(HSIO_WRAP_DLL_CFG_DLL_CLK_SEL, x) +#define HSIO_WRAP_DLL_CFG_DLL_CLK_SEL_GET(x)\ + FIELD_GET(HSIO_WRAP_DLL_CFG_DLL_CLK_SEL, x) + +#define HSIO_WRAP_DLL_CFG_DLL_RST BIT(0) +#define HSIO_WRAP_DLL_CFG_DLL_RST_SET(x)\ + FIELD_PREP(HSIO_WRAP_DLL_CFG_DLL_RST, x) +#define HSIO_WRAP_DLL_CFG_DLL_RST_GET(x)\ + FIELD_GET(HSIO_WRAP_DLL_CFG_DLL_RST, x) + /* LRN:COMMON:COMMON_ACCESS_CTRL */ #define LRN_COMMON_ACCESS_CTRL \ __REG(TARGET_LRN, 0, 1, 0, 0, 1, 72, 0, 0, 1, 4) @@ -8110,4 +8175,84 @@ extern const struct sparx5_regs *regs; #define XQS_CNT(g) \ __REG(TARGET_XQS, 0, 1, 0, g, 1024, 4, 0, 0, 1, 4) +/* LAN969X ONLY */ +/* DEV1G:DEV_CFG_STATUS:DEV_RST_CTRL */ +#define DEVRGMII_DEV_RST_CTRL(t) \ + __REG(TARGET_DEVRGMII, t, 2, 0, 0, 1, 36, 0, 0, 1, 4) + +#define DEVRGMII_DEV_RST_CTRL_SPEED_SEL GENMASK(22, 20) +#define DEVRGMII_DEV_RST_CTRL_SPEED_SEL_SET(x)\ + FIELD_PREP(DEVRGMII_DEV_RST_CTRL_SPEED_SEL, x) +#define DEVRGMII_DEV_RST_CTRL_SPEED_SEL_GET(x)\ + FIELD_GET(DEVRGMII_DEV_RST_CTRL_SPEED_SEL, x) + +/* LAN969X ONLY */ +/* DEV1G:MAC_CFG_STATUS:MAC_ENA_CFG */ +#define DEVRGMII_MAC_ENA_CFG(t) \ + __REG(TARGET_DEVRGMII, t, 2, 36, 0, 1, 36, 0, 0, 1, 4) + +#define DEVRGMII_MAC_ENA_CFG_RX_ENA BIT(4) +#define DEVRGMII_MAC_ENA_CFG_RX_ENA_SET(x)\ + FIELD_PREP(DEVRGMII_MAC_ENA_CFG_RX_ENA, x) +#define DEVRGMII_MAC_ENA_CFG_RX_ENA_GET(x)\ + FIELD_GET(DEVRGMII_MAC_ENA_CFG_RX_ENA, x) + +#define DEVRGMII_MAC_ENA_CFG_TX_ENA BIT(0) +#define DEVRGMII_MAC_ENA_CFG_TX_ENA_SET(x)\ + FIELD_PREP(DEVRGMII_MAC_ENA_CFG_TX_ENA, x) +#define DEVRGMII_MAC_ENA_CFG_TX_ENA_GET(x)\ + FIELD_GET(DEVRGMII_MAC_ENA_CFG_TX_ENA, x) + +/* LAN969X ONLY */ +/* DEV1G:MAC_CFG_STATUS:MAC_TAGS_CFG */ +#define DEVRGMII_MAC_TAGS_CFG(t) \ + __REG(TARGET_DEVRGMII, t, 2, 36, 0, 1, 36, 12, 0, 1, 4) + +#define DEVRGMII_MAC_TAGS_CFG_TAG_ID GENMASK(31, 16) +#define DEVRGMII_MAC_TAGS_CFG_TAG_ID_SET(x)\ + FIELD_PREP(DEVRGMII_MAC_TAGS_CFG_TAG_ID, x) +#define DEVRGMII_MAC_TAGS_CFG_TAG_ID_GET(x)\ + FIELD_GET(DEVRGMII_MAC_TAGS_CFG_TAG_ID, x) + +#define DEVRGMII_MAC_TAGS_CFG_VLAN_LEN_AWR_ENA BIT(3) +#define DEVRGMII_MAC_TAGS_CFG_VLAN_LEN_AWR_ENA_SET(x)\ + FIELD_PREP(DEVRGMII_MAC_TAGS_CFG_VLAN_LEN_AWR_ENA, x) +#define DEVRGMII_MAC_TAGS_CFG_VLAN_LEN_AWR_ENA_GET(x)\ + FIELD_GET(DEVRGMII_MAC_TAGS_CFG_VLAN_LEN_AWR_ENA, x) + +#define DEVRGMII_MAC_TAGS_CFG_PB_ENA GENMASK(2, 1) +#define DEVRGMII_MAC_TAGS_CFG_PB_ENA_SET(x)\ + FIELD_PREP(DEVRGMII_MAC_TAGS_CFG_PB_ENA, x) +#define DEVRGMII_MAC_TAGS_CFG_PB_ENA_GET(x)\ + FIELD_GET(DEVRGMII_MAC_TAGS_CFG_PB_ENA, x) + +#define DEVRGMII_MAC_TAGS_CFG_VLAN_AWR_ENA BIT(0) +#define DEVRGMII_MAC_TAGS_CFG_VLAN_AWR_ENA_SET(x)\ + FIELD_PREP(DEVRGMII_MAC_TAGS_CFG_VLAN_AWR_ENA, x) +#define DEVRGMII_MAC_TAGS_CFG_VLAN_AWR_ENA_GET(x)\ + FIELD_GET(DEVRGMII_MAC_TAGS_CFG_VLAN_AWR_ENA, x) + +/* LAN969X ONLY */ +/* DEV1G:MAC_CFG_STATUS:MAC_IFG_CFG */ +#define DEVRGMII_MAC_IFG_CFG(t) \ + __REG(TARGET_DEVRGMII, t, 2, 36, 0, 1, 36, 24, 0, 1, 4) + +#define DEVRGMII_MAC_IFG_CFG_TX_IFG GENMASK(12, 8) +#define DEVRGMII_MAC_IFG_CFG_TX_IFG_SET(x)\ + FIELD_PREP(DEVRGMII_MAC_IFG_CFG_TX_IFG, x) +#define DEVRGMII_MAC_IFG_CFG_TX_IFG_GET(x)\ + FIELD_GET(DEVRGMII_MAC_IFG_CFG_TX_IFG, x) + +#define DEVRGMII_MAC_IFG_CFG_RX_IFG2 GENMASK(7, 4) +#define DEVRGMII_MAC_IFG_CFG_RX_IFG2_SET(x)\ + FIELD_PREP(DEVRGMII_MAC_IFG_CFG_RX_IFG2, x) +#define DEVRGMII_MAC_IFG_CFG_RX_IFG2_GET(x)\ + FIELD_GET(DEVRGMII_MAC_IFG_CFG_RX_IFG2, x) + +#define DEVRGMII_MAC_IFG_CFG_RX_IFG1 GENMASK(3, 0) +#define DEVRGMII_MAC_IFG_CFG_RX_IFG1_SET(x)\ + FIELD_PREP(DEVRGMII_MAC_IFG_CFG_RX_IFG1, x) +#define DEVRGMII_MAC_IFG_CFG_RX_IFG1_GET(x)\ + FIELD_GET(DEVRGMII_MAC_IFG_CFG_RX_IFG1, x) + #endif /* _SPARX5_MAIN_REGS_H_ */ From 010fe5dff1644f60520302fd43776a54402b623f Mon Sep 17 00:00:00 2001 From: Daniel Machon Date: Fri, 20 Dec 2024 14:48:47 +0100 Subject: [PATCH 0998/1450] net: lan969x: add RGMII implementation The lan969x switch device includes two RGMII port interfaces (port 28 and 29) supporting data speeds of 1 Gbps, 100 Mbps and 10 Mbps. MAC level delays are configurable through the HSIO_WRAP target, by choosing a phase shift selector, corresponding to a certain time delay in nano seconds. Add new file: lan969x_rgmii.c that contains the implementation for configuring the RGMII port devices. MAC level delays are configured using the "{rx,tx}-internal-delay-ps" properties. These properties must be specified independently of the phy-mode. If missing, or set to zero, the MAC will not apply any delay. Reviewed-by: Steen Hegelund Reviewed-by: Horatiu Vultur Tested-by: Robert Marko Signed-off-by: Daniel Machon Link: https://patch.msgid.link/20241220-sparx5-lan969x-switch-driver-4-v5-8-fa8ba5dff732@microchip.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/microchip/sparx5/Makefile | 3 +- .../microchip/sparx5/lan969x/lan969x.c | 1 + .../microchip/sparx5/lan969x/lan969x.h | 5 + .../microchip/sparx5/lan969x/lan969x_rgmii.c | 224 ++++++++++++++++++ .../ethernet/microchip/sparx5/sparx5_main.h | 2 + .../ethernet/microchip/sparx5/sparx5_port.c | 6 + 6 files changed, 240 insertions(+), 1 deletion(-) create mode 100644 drivers/net/ethernet/microchip/sparx5/lan969x/lan969x_rgmii.c diff --git a/drivers/net/ethernet/microchip/sparx5/Makefile b/drivers/net/ethernet/microchip/sparx5/Makefile index 4bf2a885a9da4..3f34e83246a0e 100644 --- a/drivers/net/ethernet/microchip/sparx5/Makefile +++ b/drivers/net/ethernet/microchip/sparx5/Makefile @@ -20,7 +20,8 @@ sparx5-switch-$(CONFIG_LAN969X_SWITCH) += lan969x/lan969x_regs.o \ lan969x/lan969x.o \ lan969x/lan969x_calendar.o \ lan969x/lan969x_vcap_ag_api.o \ - lan969x/lan969x_vcap_impl.o + lan969x/lan969x_vcap_impl.o \ + lan969x/lan969x_rgmii.o # Provide include files ccflags-y += -I$(srctree)/drivers/net/ethernet/microchip/vcap diff --git a/drivers/net/ethernet/microchip/sparx5/lan969x/lan969x.c b/drivers/net/ethernet/microchip/sparx5/lan969x/lan969x.c index be49a99556fe9..396f76b6eea5d 100644 --- a/drivers/net/ethernet/microchip/sparx5/lan969x/lan969x.c +++ b/drivers/net/ethernet/microchip/sparx5/lan969x/lan969x.c @@ -340,6 +340,7 @@ static const struct sparx5_ops lan969x_ops = { .set_port_mux = &lan969x_port_mux_set, .ptp_irq_handler = &lan969x_ptp_irq_handler, .dsm_calendar_calc = &lan969x_dsm_calendar_calc, + .port_config_rgmii = &lan969x_port_config_rgmii, }; const struct sparx5_match_data lan969x_desc = { diff --git a/drivers/net/ethernet/microchip/sparx5/lan969x/lan969x.h b/drivers/net/ethernet/microchip/sparx5/lan969x/lan969x.h index 4b91c47d6d216..9a7ddebecf1e4 100644 --- a/drivers/net/ethernet/microchip/sparx5/lan969x/lan969x.h +++ b/drivers/net/ethernet/microchip/sparx5/lan969x/lan969x.h @@ -67,4 +67,9 @@ static inline bool lan969x_port_is_rgmii(int portno) /* lan969x_calendar.c */ int lan969x_dsm_calendar_calc(struct sparx5 *sparx5, u32 taxi, struct sparx5_calendar_data *data); + +/* lan969x_rgmii.c */ +int lan969x_port_config_rgmii(struct sparx5_port *port, + struct sparx5_port_config *conf); + #endif diff --git a/drivers/net/ethernet/microchip/sparx5/lan969x/lan969x_rgmii.c b/drivers/net/ethernet/microchip/sparx5/lan969x/lan969x_rgmii.c new file mode 100644 index 0000000000000..4e422ca50828f --- /dev/null +++ b/drivers/net/ethernet/microchip/sparx5/lan969x/lan969x_rgmii.c @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* Microchip lan969x Switch driver + * + * Copyright (c) 2024 Microchip Technology Inc. and its subsidiaries. + */ + +#include "lan969x.h" + +/* Tx clock selectors */ +#define LAN969X_RGMII_TX_CLK_SEL_125MHZ 1 /* 1000Mbps */ +#define LAN969X_RGMII_TX_CLK_SEL_25MHZ 2 /* 100Mbps */ +#define LAN969X_RGMII_TX_CLK_SEL_2M5MHZ 3 /* 10Mbps */ + +/* Port speed selectors */ +#define LAN969X_RGMII_SPEED_SEL_10 0 /* Select 10Mbps speed */ +#define LAN969X_RGMII_SPEED_SEL_100 1 /* Select 100Mbps speed */ +#define LAN969X_RGMII_SPEED_SEL_1000 2 /* Select 1000Mbps speed */ + +/* Clock delay selectors */ +#define LAN969X_RGMII_CLK_DELAY_SEL_1_0_NS 2 /* Phase shift 45deg */ +#define LAN969X_RGMII_CLK_DELAY_SEL_1_7_NS 3 /* Phase shift 77deg */ +#define LAN969X_RGMII_CLK_DELAY_SEL_2_0_NS 4 /* Phase shift 90deg */ +#define LAN969X_RGMII_CLK_DELAY_SEL_2_5_NS 5 /* Phase shift 112deg */ +#define LAN969X_RGMII_CLK_DELAY_SEL_3_0_NS 6 /* Phase shift 135deg */ +#define LAN969X_RGMII_CLK_DELAY_SEL_3_3_NS 7 /* Phase shift 147deg */ + +#define LAN969X_RGMII_PORT_START_IDX 28 /* Index of the first RGMII port */ +#define LAN969X_RGMII_IFG_TX 4 /* TX Inter Frame Gap value */ +#define LAN969X_RGMII_IFG_RX1 5 /* RX1 Inter Frame Gap value */ +#define LAN969X_RGMII_IFG_RX2 1 /* RX2 Inter Frame Gap value */ + +#define RGMII_PORT_IDX(port) ((port)->portno - LAN969X_RGMII_PORT_START_IDX) + +/* Get the tx clock selector based on the port speed. */ +static int lan969x_rgmii_get_clk_sel(int speed) +{ + return (speed == SPEED_10 ? LAN969X_RGMII_TX_CLK_SEL_2M5MHZ : + speed == SPEED_100 ? LAN969X_RGMII_TX_CLK_SEL_25MHZ : + LAN969X_RGMII_TX_CLK_SEL_125MHZ); +} + +/* Get the port speed selector based on the port speed. */ +static int lan969x_rgmii_get_speed_sel(int speed) +{ + return (speed == SPEED_10 ? LAN969X_RGMII_SPEED_SEL_10 : + speed == SPEED_100 ? LAN969X_RGMII_SPEED_SEL_100 : + LAN969X_RGMII_SPEED_SEL_1000); +} + +/* Get the clock delay selector based on the clock delay in picoseconds. */ +static int lan969x_rgmii_get_clk_delay_sel(struct sparx5_port *port, + u32 delay_ps, u32 *clk_delay_sel) +{ + switch (delay_ps) { + case 0: + /* Hardware default selector. */ + *clk_delay_sel = LAN969X_RGMII_CLK_DELAY_SEL_2_5_NS; + break; + case 1000: + *clk_delay_sel = LAN969X_RGMII_CLK_DELAY_SEL_1_0_NS; + break; + case 1700: + *clk_delay_sel = LAN969X_RGMII_CLK_DELAY_SEL_1_7_NS; + break; + case 2000: + *clk_delay_sel = LAN969X_RGMII_CLK_DELAY_SEL_2_0_NS; + break; + case 2500: + *clk_delay_sel = LAN969X_RGMII_CLK_DELAY_SEL_2_5_NS; + break; + case 3000: + *clk_delay_sel = LAN969X_RGMII_CLK_DELAY_SEL_3_0_NS; + break; + case 3300: + *clk_delay_sel = LAN969X_RGMII_CLK_DELAY_SEL_3_3_NS; + break; + default: + dev_err(port->sparx5->dev, "Invalid RGMII delay: %u", delay_ps); + return -EINVAL; + } + + return 0; +} + +/* Configure the RGMII tx clock frequency. */ +static void lan969x_rgmii_tx_clk_config(struct sparx5_port *port, + struct sparx5_port_config *conf) +{ + u32 clk_sel = lan969x_rgmii_get_clk_sel(conf->speed); + u32 idx = RGMII_PORT_IDX(port); + + /* Take the RGMII clock domain out of reset and set tx clock + * frequency. + */ + spx5_rmw(HSIO_WRAP_RGMII_CFG_TX_CLK_CFG_SET(clk_sel) | + HSIO_WRAP_RGMII_CFG_RGMII_TX_RST_SET(0) | + HSIO_WRAP_RGMII_CFG_RGMII_RX_RST_SET(0), + HSIO_WRAP_RGMII_CFG_TX_CLK_CFG | + HSIO_WRAP_RGMII_CFG_RGMII_TX_RST | + HSIO_WRAP_RGMII_CFG_RGMII_RX_RST, + port->sparx5, HSIO_WRAP_RGMII_CFG(idx)); +} + +/* Configure the RGMII port device. */ +static void lan969x_rgmii_port_device_config(struct sparx5_port *port, + struct sparx5_port_config *conf) +{ + u32 dtag, dotag, etype, speed_sel, idx = RGMII_PORT_IDX(port); + + speed_sel = lan969x_rgmii_get_speed_sel(conf->speed); + + etype = (port->vlan_type == SPX5_VLAN_PORT_TYPE_S_CUSTOM ? + port->custom_etype : + port->vlan_type == SPX5_VLAN_PORT_TYPE_C ? + ETH_P_8021Q : ETH_P_8021AD); + + dtag = port->max_vlan_tags == SPX5_PORT_MAX_TAGS_TWO; + dotag = port->max_vlan_tags != SPX5_PORT_MAX_TAGS_NONE; + + /* Enable the MAC. */ + spx5_wr(DEVRGMII_MAC_ENA_CFG_RX_ENA_SET(1) | + DEVRGMII_MAC_ENA_CFG_TX_ENA_SET(1), + port->sparx5, DEVRGMII_MAC_ENA_CFG(idx)); + + /* Configure the Inter Frame Gap. */ + spx5_wr(DEVRGMII_MAC_IFG_CFG_TX_IFG_SET(LAN969X_RGMII_IFG_TX) | + DEVRGMII_MAC_IFG_CFG_RX_IFG1_SET(LAN969X_RGMII_IFG_RX1) | + DEVRGMII_MAC_IFG_CFG_RX_IFG2_SET(LAN969X_RGMII_IFG_RX2), + port->sparx5, DEVRGMII_MAC_IFG_CFG(idx)); + + /* Configure port data rate. */ + spx5_wr(DEVRGMII_DEV_RST_CTRL_SPEED_SEL_SET(speed_sel), + port->sparx5, DEVRGMII_DEV_RST_CTRL(idx)); + + /* Configure VLAN awareness. */ + spx5_wr(DEVRGMII_MAC_TAGS_CFG_TAG_ID_SET(etype) | + DEVRGMII_MAC_TAGS_CFG_PB_ENA_SET(dtag) | + DEVRGMII_MAC_TAGS_CFG_VLAN_AWR_ENA_SET(dotag) | + DEVRGMII_MAC_TAGS_CFG_VLAN_LEN_AWR_ENA_SET(dotag), + port->sparx5, + DEVRGMII_MAC_TAGS_CFG(idx)); +} + +/* Configure the RGMII delay lines in the MAC. + * + * We use the rx-internal-delay-ps" and "tx-internal-delay-ps" properties to + * configure the rx and tx delays for the MAC. If these properties are missing + * or set to zero, the MAC will not apply any delay. + * + * The PHY side delays are determined by the PHY mode + * (e.g. PHY_INTERFACE_MODE_RGMII_{ID, RXID, TXID}), and ignored by the MAC side + * entirely. + */ +static int lan969x_rgmii_delay_config(struct sparx5_port *port, + struct sparx5_port_config *conf) +{ + u32 tx_clk_sel, rx_clk_sel, tx_delay_ps = 0, rx_delay_ps = 0; + u32 idx = RGMII_PORT_IDX(port); + int err; + + of_property_read_u32(port->of_node, "rx-internal-delay-ps", + &rx_delay_ps); + + of_property_read_u32(port->of_node, "tx-internal-delay-ps", + &tx_delay_ps); + + err = lan969x_rgmii_get_clk_delay_sel(port, rx_delay_ps, &rx_clk_sel); + if (err) + return err; + + err = lan969x_rgmii_get_clk_delay_sel(port, tx_delay_ps, &tx_clk_sel); + if (err) + return err; + + /* Configure rx delay. */ + spx5_rmw(HSIO_WRAP_DLL_CFG_DLL_RST_SET(0) | + HSIO_WRAP_DLL_CFG_DLL_ENA_SET(1) | + HSIO_WRAP_DLL_CFG_DLL_CLK_ENA_SET(!!rx_delay_ps) | + HSIO_WRAP_DLL_CFG_DLL_CLK_SEL_SET(rx_clk_sel), + HSIO_WRAP_DLL_CFG_DLL_RST | + HSIO_WRAP_DLL_CFG_DLL_ENA | + HSIO_WRAP_DLL_CFG_DLL_CLK_ENA | + HSIO_WRAP_DLL_CFG_DLL_CLK_SEL, + port->sparx5, HSIO_WRAP_DLL_CFG(idx, 0)); + + /* Configure tx delay. */ + spx5_rmw(HSIO_WRAP_DLL_CFG_DLL_RST_SET(0) | + HSIO_WRAP_DLL_CFG_DLL_ENA_SET(1) | + HSIO_WRAP_DLL_CFG_DLL_CLK_ENA_SET(!!tx_delay_ps) | + HSIO_WRAP_DLL_CFG_DLL_CLK_SEL_SET(tx_clk_sel), + HSIO_WRAP_DLL_CFG_DLL_RST | + HSIO_WRAP_DLL_CFG_DLL_ENA | + HSIO_WRAP_DLL_CFG_DLL_CLK_ENA | + HSIO_WRAP_DLL_CFG_DLL_CLK_SEL, + port->sparx5, HSIO_WRAP_DLL_CFG(idx, 1)); + + return 0; +} + +/* Configure GPIO's to be used as RGMII interface. */ +static void lan969x_rgmii_gpio_config(struct sparx5_port *port) +{ + u32 idx = RGMII_PORT_IDX(port); + + /* Enable the RGMII on the GPIOs. */ + spx5_wr(HSIO_WRAP_XMII_CFG_GPIO_XMII_CFG_SET(1), port->sparx5, + HSIO_WRAP_XMII_CFG(!idx)); +} + +int lan969x_port_config_rgmii(struct sparx5_port *port, + struct sparx5_port_config *conf) +{ + int err; + + err = lan969x_rgmii_delay_config(port, conf); + if (err) + return err; + + lan969x_rgmii_tx_clk_config(port, conf); + lan969x_rgmii_gpio_config(port); + lan969x_rgmii_port_device_config(port, conf); + + return 0; +} diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_main.h b/drivers/net/ethernet/microchip/sparx5/sparx5_main.h index c58d7841638ef..3ae760da17e2b 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_main.h +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_main.h @@ -324,6 +324,8 @@ struct sparx5_ops { irqreturn_t (*ptp_irq_handler)(int irq, void *args); int (*dsm_calendar_calc)(struct sparx5 *sparx5, u32 taxi, struct sparx5_calendar_data *data); + int (*port_config_rgmii)(struct sparx5_port *port, + struct sparx5_port_config *conf); }; struct sparx5_main_io_resource { diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_port.c b/drivers/net/ethernet/microchip/sparx5/sparx5_port.c index 86d6c9e9ec7c1..04bc8fffaf961 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_port.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_port.c @@ -1012,6 +1012,12 @@ int sparx5_port_config(struct sparx5 *sparx5, if (err) return err; + if (rgmii) { + err = ops->port_config_rgmii(port, conf); + if (err) + return err; + } + /* high speed device is already configured */ if (!rgmii && !high_speed_dev) sparx5_port_config_low_set(sparx5, port, conf); From f0706c04721becee4e0576f0c56e871c11b1e84e Mon Sep 17 00:00:00 2001 From: Daniel Machon Date: Fri, 20 Dec 2024 14:48:48 +0100 Subject: [PATCH 0999/1450] dt-bindings: net: sparx5: document RGMII delays The lan969x switch device supports two RGMII port interfaces that can be configured for MAC level rx and tx delays. Document two new properties {rx,tx}-internal-delay-ps in the bindings, used to select these delays. Tested-by: Robert Marko Reviewed-by: Rob Herring (Arm) Signed-off-by: Daniel Machon Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241220-sparx5-lan969x-switch-driver-4-v5-9-fa8ba5dff732@microchip.com Signed-off-by: Jakub Kicinski --- .../bindings/net/microchip,sparx5-switch.yaml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/Documentation/devicetree/bindings/net/microchip,sparx5-switch.yaml b/Documentation/devicetree/bindings/net/microchip,sparx5-switch.yaml index dedfad526666c..a73fc50369057 100644 --- a/Documentation/devicetree/bindings/net/microchip,sparx5-switch.yaml +++ b/Documentation/devicetree/bindings/net/microchip,sparx5-switch.yaml @@ -129,6 +129,24 @@ properties: minimum: 0 maximum: 383 + rx-internal-delay-ps: + description: + RGMII Receive Clock Delay defined in pico seconds, used to select + the DLL phase shift between 1000 ps (45 degree shift at 1Gbps) and + 3300 ps (147 degree shift at 1Gbps). A value of 0 ps will disable + any delay. The Default is no delay. + enum: [0, 1000, 1700, 2000, 2500, 3000, 3300] + default: 0 + + tx-internal-delay-ps: + description: + RGMII Transmit Clock Delay defined in pico seconds, used to select + the DLL phase shift between 1000 ps (45 degree shift at 1Gbps) and + 3300 ps (147 degree shift at 1Gbps). A value of 0 ps will disable + any delay. The Default is no delay. + enum: [0, 1000, 1700, 2000, 2500, 3000, 3300] + default: 0 + required: - reg - phys From d29662695ed7c015521e5fc9387df25aab192a2e Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 13 Nov 2024 18:16:49 +0100 Subject: [PATCH 1000/1450] btrfs: fix use-after-free waiting for encoded read endios Fix a use-after-free in the I/O completion path for encoded reads by using a completion instead of a wait_queue for synchronizing the destruction of 'struct btrfs_encoded_read_private'. Fixes: 1881fba89bd5 ("btrfs: add BTRFS_IOC_ENCODED_READ ioctl") CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Filipe Manana Reviewed-by: Qu Wenruo Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 94c8809e81705..6baa0269a85b0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9078,9 +9078,9 @@ static ssize_t btrfs_encoded_read_inline( } struct btrfs_encoded_read_private { - wait_queue_head_t wait; + struct completion done; void *uring_ctx; - atomic_t pending; + refcount_t pending_refs; blk_status_t status; }; @@ -9099,14 +9099,14 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) */ WRITE_ONCE(priv->status, bbio->bio.bi_status); } - if (atomic_dec_and_test(&priv->pending)) { + if (refcount_dec_and_test(&priv->pending_refs)) { int err = blk_status_to_errno(READ_ONCE(priv->status)); if (priv->uring_ctx) { btrfs_uring_read_extent_endio(priv->uring_ctx, err); kfree(priv); } else { - wake_up(&priv->wait); + complete(&priv->done); } } bio_put(&bbio->bio); @@ -9126,8 +9126,8 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, if (!priv) return -ENOMEM; - init_waitqueue_head(&priv->wait); - atomic_set(&priv->pending, 1); + init_completion(&priv->done); + refcount_set(&priv->pending_refs, 1); priv->status = 0; priv->uring_ctx = uring_ctx; @@ -9140,7 +9140,7 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE); if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) { - atomic_inc(&priv->pending); + refcount_inc(&priv->pending_refs); btrfs_submit_bbio(bbio, 0); bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, @@ -9155,11 +9155,11 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, disk_io_size -= bytes; } while (disk_io_size); - atomic_inc(&priv->pending); + refcount_inc(&priv->pending_refs); btrfs_submit_bbio(bbio, 0); if (uring_ctx) { - if (atomic_dec_return(&priv->pending) == 0) { + if (refcount_dec_and_test(&priv->pending_refs)) { ret = blk_status_to_errno(READ_ONCE(priv->status)); btrfs_uring_read_extent_endio(uring_ctx, ret); kfree(priv); @@ -9168,8 +9168,8 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, return -EIOCBQUEUED; } else { - if (atomic_dec_return(&priv->pending) != 0) - io_wait_event(priv->wait, !atomic_read(&priv->pending)); + if (!refcount_dec_and_test(&priv->pending_refs)) + wait_for_completion_io(&priv->done); /* See btrfs_encoded_read_endio() for ordering. */ ret = blk_status_to_errno(READ_ONCE(priv->status)); kfree(priv); From 44f52bbe96dfdbe4aca3818a2534520082a07040 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 11 Dec 2024 16:08:07 +0000 Subject: [PATCH 1001/1450] btrfs: fix use-after-free when COWing tree bock and tracing is enabled When a COWing a tree block, at btrfs_cow_block(), and we have the tracepoint trace_btrfs_cow_block() enabled and preemption is also enabled (CONFIG_PREEMPT=y), we can trigger a use-after-free in the COWed extent buffer while inside the tracepoint code. This is because in some paths that call btrfs_cow_block(), such as btrfs_search_slot(), we are holding the last reference on the extent buffer @buf so btrfs_force_cow_block() drops the last reference on the @buf extent buffer when it calls free_extent_buffer_stale(buf), which schedules the release of the extent buffer with RCU. This means that if we are on a kernel with preemption, the current task may be preempted before calling trace_btrfs_cow_block() and the extent buffer already released by the time trace_btrfs_cow_block() is called, resulting in a use-after-free. Fix this by moving the trace_btrfs_cow_block() from btrfs_cow_block() to btrfs_force_cow_block() before the COWed extent buffer is freed. This also has a side effect of invoking the tracepoint in the tree defrag code, at defrag.c:btrfs_realloc_node(), since btrfs_force_cow_block() is called there, but this is fine and it was actually missing there. Reported-by: syzbot+8517da8635307182c8a5@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/6759a9b9.050a0220.1ac542.000d.GAE@google.com/ CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 693dc27ffb897..185985a337b30 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -654,6 +654,8 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, goto error_unlock_cow; } } + + trace_btrfs_cow_block(root, buf, cow); if (unlock_orig) btrfs_tree_unlock(buf); free_extent_buffer_stale(buf); @@ -710,7 +712,6 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; u64 search_start; - int ret; if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) { btrfs_abort_transaction(trans, -EUCLEAN); @@ -751,12 +752,8 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans, * Also We don't care about the error, as it's handled internally. */ btrfs_qgroup_trace_subtree_after_cow(trans, root, buf); - ret = btrfs_force_cow_block(trans, root, buf, parent, parent_slot, - cow_ret, search_start, 0, nest); - - trace_btrfs_cow_block(root, buf, *cow_ret); - - return ret; + return btrfs_force_cow_block(trans, root, buf, parent, parent_slot, + cow_ret, search_start, 0, nest); } ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO); From 3e74859ee35edc33a022c3f3971df066ea0ca6b9 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Fri, 13 Dec 2024 12:22:32 -0800 Subject: [PATCH 1002/1450] btrfs: check folio mapping after unlock in relocate_one_folio() When we call btrfs_read_folio() to bring a folio uptodate, we unlock the folio. The result of that is that a different thread can modify the mapping (like remove it with invalidate) before we call folio_lock(). This results in an invalid page and we need to try again. In particular, if we are relocating concurrently with aborting a transaction, this can result in a crash like the following: BUG: kernel NULL pointer dereference, address: 0000000000000000 PGD 0 P4D 0 Oops: 0000 [#1] SMP CPU: 76 PID: 1411631 Comm: kworker/u322:5 Workqueue: events_unbound btrfs_reclaim_bgs_work RIP: 0010:set_page_extent_mapped+0x20/0xb0 RSP: 0018:ffffc900516a7be8 EFLAGS: 00010246 RAX: ffffea009e851d08 RBX: ffffea009e0b1880 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffc900516a7b90 RDI: ffffea009e0b1880 RBP: 0000000003573000 R08: 0000000000000001 R09: ffff88c07fd2f3f0 R10: 0000000000000000 R11: 0000194754b575be R12: 0000000003572000 R13: 0000000003572fff R14: 0000000000100cca R15: 0000000005582fff FS: 0000000000000000(0000) GS:ffff88c07fd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000407d00f002 CR4: 00000000007706f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ? __die+0x78/0xc0 ? page_fault_oops+0x2a8/0x3a0 ? __switch_to+0x133/0x530 ? wq_worker_running+0xa/0x40 ? exc_page_fault+0x63/0x130 ? asm_exc_page_fault+0x22/0x30 ? set_page_extent_mapped+0x20/0xb0 relocate_file_extent_cluster+0x1a7/0x940 relocate_data_extent+0xaf/0x120 relocate_block_group+0x20f/0x480 btrfs_relocate_block_group+0x152/0x320 btrfs_relocate_chunk+0x3d/0x120 btrfs_reclaim_bgs_work+0x2ae/0x4e0 process_scheduled_works+0x184/0x370 worker_thread+0xc6/0x3e0 ? blk_add_timer+0xb0/0xb0 kthread+0xae/0xe0 ? flush_tlb_kernel_range+0x90/0x90 ret_from_fork+0x2f/0x40 ? flush_tlb_kernel_range+0x90/0x90 ret_from_fork_asm+0x11/0x20 This occurs because cleanup_one_transaction() calls destroy_delalloc_inodes() which calls invalidate_inode_pages2() which takes the folio_lock before setting mapping to NULL. We fail to check this, and subsequently call set_extent_mapping(), which assumes that mapping != NULL (in fact it asserts that in debug mode) Note that the "fixes" patch here is not the one that introduced the race (the very first iteration of this code from 2009) but a more recent change that made this particular crash happen in practice. Fixes: e7f1326cc24e ("btrfs: set page extent mapped after read_folio in relocate_one_page") CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index bf267bdfa8f81..db8b42f674b7c 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2902,6 +2902,7 @@ static int relocate_one_folio(struct reloc_control *rc, const bool use_rst = btrfs_need_stripe_tree_update(fs_info, rc->block_group->flags); ASSERT(index <= last_index); +again: folio = filemap_lock_folio(inode->i_mapping, index); if (IS_ERR(folio)) { @@ -2937,6 +2938,11 @@ static int relocate_one_folio(struct reloc_control *rc, ret = -EIO; goto release_folio; } + if (folio->mapping != inode->i_mapping) { + folio_unlock(folio); + folio_put(folio); + goto again; + } } /* From 0fba7be1ca6df2881e68386e5575fe096f33c4ca Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Fri, 13 Dec 2024 12:33:22 -0800 Subject: [PATCH 1003/1450] btrfs: check folio mapping after unlock in put_file_data() When we call btrfs_read_folio() we get an unlocked folio, so it is possible for a different thread to concurrently modify folio->mapping. We must check that this hasn't happened once we do have the lock. CC: stable@vger.kernel.org # 6.12+ Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/send.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 7254279c3cc92..498c843232536 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5280,6 +5280,7 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) unsigned cur_len = min_t(unsigned, len, PAGE_SIZE - pg_offset); +again: folio = filemap_lock_folio(mapping, index); if (IS_ERR(folio)) { page_cache_sync_readahead(mapping, @@ -5312,6 +5313,11 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) ret = -EIO; break; } + if (folio->mapping != mapping) { + folio_unlock(folio); + folio_put(folio); + goto again; + } } memcpy_from_folio(sctx->send_buf + sctx->send_size, folio, From 0525064bb82e50d59543b62b9d41a606198a4a44 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 29 Nov 2024 12:25:30 +0000 Subject: [PATCH 1004/1450] btrfs: fix race with memory mapped writes when activating swap file When activating the swap file we flush all delalloc and wait for ordered extent completion, so that we don't miss any delalloc and extents before we check that the file's extent layout is usable for a swap file and activate the swap file. We are called with the inode's VFS lock acquired, so we won't race with buffered and direct IO writes, however we can still race with memory mapped writes since they don't acquire the inode's VFS lock. The race window is between flushing all delalloc and locking the whole file's extent range, since memory mapped writes lock an extent range with the length of a page. Fix this by acquiring the inode's mmap lock before we flush delalloc. CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6baa0269a85b0..b2abc0aa53003 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9809,6 +9809,15 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, u64 isize; u64 start; + /* + * Acquire the inode's mmap lock to prevent races with memory mapped + * writes, as they could happen after we flush delalloc below and before + * we lock the extent range further below. The inode was already locked + * up in the call chain. + */ + btrfs_assert_inode_locked(BTRFS_I(inode)); + down_write(&BTRFS_I(inode)->i_mmap_lock); + /* * If the swap file was just created, make sure delalloc is done. If the * file changes again after this, the user is doing something stupid and @@ -9816,22 +9825,25 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, */ ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); if (ret) - return ret; + goto out_unlock_mmap; /* * The inode is locked, so these flags won't change after we check them. */ if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) { btrfs_warn(fs_info, "swapfile must not be compressed"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) { btrfs_warn(fs_info, "swapfile must not be copy-on-write"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { btrfs_warn(fs_info, "swapfile must not be checksummed"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } /* @@ -9846,7 +9858,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) { btrfs_warn(fs_info, "cannot activate swapfile while exclusive operation is running"); - return -EBUSY; + ret = -EBUSY; + goto out_unlock_mmap; } /* @@ -9860,7 +9873,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, btrfs_exclop_finish(fs_info); btrfs_warn(fs_info, "cannot activate swapfile because snapshot creation is in progress"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } /* * Snapshots can create extents which require COW even if NODATACOW is @@ -9881,7 +9895,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, btrfs_warn(fs_info, "cannot activate swapfile because subvolume %llu is being deleted", btrfs_root_id(root)); - return -EPERM; + ret = -EPERM; + goto out_unlock_mmap; } atomic_inc(&root->nr_swapfiles); spin_unlock(&root->root_item_lock); @@ -10036,6 +10051,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, btrfs_exclop_finish(fs_info); +out_unlock_mmap: + up_write(&BTRFS_I(inode)->i_mmap_lock); if (ret) return ret; From 03018e5d8508254534511d40fb57bc150e6a87f2 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 9 Dec 2024 12:54:14 +0000 Subject: [PATCH 1005/1450] btrfs: fix swap file activation failure due to extents that used to be shared When activating a swap file, to determine if an extent is shared we use can_nocow_extent(), which ends up at btrfs_cross_ref_exist(). That helper is meant to be quick because it's used in the NOCOW write path, when flushing delalloc and when doing a direct IO write, however it does return some false positives, meaning it may indicate that an extent is shared even if it's no longer the case. For the write path this is fine, we just do a unnecessary COW operation instead of doing a more rigorous check which would be too heavy (calling btrfs_is_data_extent_shared()). However when activating a swap file, the false positives simply result in a failure, which is confusing for users/applications. One particular case where this happens is when a data extent only has 1 reference but that reference is not inlined in the extent item located in the extent tree - this happens when we create more than 33 references for an extent and then delete those 33 references plus every other non-inline reference except one. The function check_committed_ref() assumes that if the size of an extent item doesn't match the size of struct btrfs_extent_item plus the size of an inline reference (plus an owner reference in case simple quotas are enabled), then the extent is shared - that is not the case however, we can have a single reference but it's not inlined - the reason we do this is to be fast and avoid inspecting non-inline references which may be located in another leaf of the extent tree, slowing down write paths. The following test script reproduces the bug: $ cat test.sh #!/bin/bash DEV=/dev/sdi MNT=/mnt/sdi NUM_CLONES=50 umount $DEV &> /dev/null run_test() { local sync_after_add_reflinks=$1 local sync_after_remove_reflinks=$2 mkfs.btrfs -f $DEV > /dev/null #mkfs.xfs -f $DEV > /dev/null mount $DEV $MNT touch $MNT/foo chmod 0600 $MNT/foo # On btrfs the file must be NOCOW. chattr +C $MNT/foo &> /dev/null xfs_io -s -c "pwrite -b 1M 0 1M" $MNT/foo mkswap $MNT/foo for ((i = 1; i <= $NUM_CLONES; i++)); do touch $MNT/foo_clone_$i chmod 0600 $MNT/foo_clone_$i # On btrfs the file must be NOCOW. chattr +C $MNT/foo_clone_$i &> /dev/null cp --reflink=always $MNT/foo $MNT/foo_clone_$i done if [ $sync_after_add_reflinks -ne 0 ]; then # Flush delayed refs and commit current transaction. sync -f $MNT fi # Remove the original file and all clones except the last. rm -f $MNT/foo for ((i = 1; i < $NUM_CLONES; i++)); do rm -f $MNT/foo_clone_$i done if [ $sync_after_remove_reflinks -ne 0 ]; then # Flush delayed refs and commit current transaction. sync -f $MNT fi # Now use the last clone as a swap file. It should work since # its extent are not shared anymore. swapon $MNT/foo_clone_${NUM_CLONES} swapoff $MNT/foo_clone_${NUM_CLONES} umount $MNT } echo -e "\nTest without sync after creating and removing clones" run_test 0 0 echo -e "\nTest with sync after creating clones" run_test 1 0 echo -e "\nTest with sync after removing clones" run_test 0 1 echo -e "\nTest with sync after creating and removing clones" run_test 1 1 Running the test: $ ./test.sh Test without sync after creating and removing clones wrote 1048576/1048576 bytes at offset 0 1 MiB, 1 ops; 0.0017 sec (556.793 MiB/sec and 556.7929 ops/sec) Setting up swapspace version 1, size = 1020 KiB (1044480 bytes) no label, UUID=a6b9c29e-5ef4-4689-a8ac-bc199c750f02 swapon: /mnt/sdi/foo_clone_50: swapon failed: Invalid argument swapoff: /mnt/sdi/foo_clone_50: swapoff failed: Invalid argument Test with sync after creating clones wrote 1048576/1048576 bytes at offset 0 1 MiB, 1 ops; 0.0036 sec (271.739 MiB/sec and 271.7391 ops/sec) Setting up swapspace version 1, size = 1020 KiB (1044480 bytes) no label, UUID=5e9008d6-1f7a-4948-a1b4-3f30aba20a33 swapon: /mnt/sdi/foo_clone_50: swapon failed: Invalid argument swapoff: /mnt/sdi/foo_clone_50: swapoff failed: Invalid argument Test with sync after removing clones wrote 1048576/1048576 bytes at offset 0 1 MiB, 1 ops; 0.0103 sec (96.665 MiB/sec and 96.6651 ops/sec) Setting up swapspace version 1, size = 1020 KiB (1044480 bytes) no label, UUID=916c2740-fa9f-4385-9f06-29c3f89e4764 Test with sync after creating and removing clones wrote 1048576/1048576 bytes at offset 0 1 MiB, 1 ops; 0.0031 sec (314.268 MiB/sec and 314.2678 ops/sec) Setting up swapspace version 1, size = 1020 KiB (1044480 bytes) no label, UUID=06aab1dd-4d90-49c0-bd9f-3a8db4e2f912 swapon: /mnt/sdi/foo_clone_50: swapon failed: Invalid argument swapoff: /mnt/sdi/foo_clone_50: swapoff failed: Invalid argument Fix this by reworking btrfs_swap_activate() to instead of using extent maps and checking for shared extents with can_nocow_extent(), iterate over the inode's file extent items and use the accurate btrfs_is_data_extent_shared(). CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 96 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 69 insertions(+), 27 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b2abc0aa53003..b87f19630b009 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9799,15 +9799,16 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; - struct extent_map *em = NULL; struct btrfs_chunk_map *map = NULL; struct btrfs_device *device = NULL; struct btrfs_swap_info bsi = { .lowest_ppage = (sector_t)-1ULL, }; + struct btrfs_backref_share_check_ctx *backref_ctx = NULL; + struct btrfs_path *path = NULL; int ret = 0; u64 isize; - u64 start; + u64 prev_extent_end = 0; /* * Acquire the inode's mmap lock to prevent races with memory mapped @@ -9846,6 +9847,13 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, goto out_unlock_mmap; } + path = btrfs_alloc_path(); + backref_ctx = btrfs_alloc_backref_share_check_ctx(); + if (!path || !backref_ctx) { + ret = -ENOMEM; + goto out_unlock_mmap; + } + /* * Balance or device remove/replace/resize can move stuff around from * under us. The exclop protection makes sure they aren't running/won't @@ -9904,24 +9912,39 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); lock_extent(io_tree, 0, isize - 1, &cached_state); - start = 0; - while (start < isize) { - u64 logical_block_start, physical_block_start; + while (prev_extent_end < isize) { + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *ei; struct btrfs_block_group *bg; - u64 len = isize - start; + u64 logical_block_start; + u64 physical_block_start; + u64 extent_gen; + u64 disk_bytenr; + u64 len; - em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); + key.objectid = btrfs_ino(BTRFS_I(inode)); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = prev_extent_end; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) goto out; - } - if (em->disk_bytenr == EXTENT_MAP_HOLE) { + /* + * If key not found it means we have an implicit hole (NO_HOLES + * is enabled). + */ + if (ret > 0) { btrfs_warn(fs_info, "swapfile must not have holes"); ret = -EINVAL; goto out; } - if (em->disk_bytenr == EXTENT_MAP_INLINE) { + + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) { /* * It's unlikely we'll ever actually find ourselves * here, as a file small enough to fit inline won't be @@ -9933,23 +9956,45 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, ret = -EINVAL; goto out; } - if (extent_map_is_compressed(em)) { + + if (btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) { btrfs_warn(fs_info, "swapfile must not be compressed"); ret = -EINVAL; goto out; } - logical_block_start = extent_map_block_start(em) + (start - em->start); - len = min(len, em->len - (start - em->start)); - free_extent_map(em); - em = NULL; + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + if (disk_bytenr == 0) { + btrfs_warn(fs_info, "swapfile must not have holes"); + ret = -EINVAL; + goto out; + } + + logical_block_start = disk_bytenr + btrfs_file_extent_offset(leaf, ei); + extent_gen = btrfs_file_extent_generation(leaf, ei); + prev_extent_end = btrfs_file_extent_end(path); + + if (prev_extent_end > isize) + len = isize - key.offset; + else + len = btrfs_file_extent_num_bytes(leaf, ei); - ret = can_nocow_extent(inode, start, &len, NULL, false, true); + backref_ctx->curr_leaf_bytenr = leaf->start; + + /* + * Don't need the path anymore, release to avoid deadlocks when + * calling btrfs_is_data_extent_shared() because when joining a + * transaction it can block waiting for the current one's commit + * which in turn may be trying to lock the same leaf to flush + * delayed items for example. + */ + btrfs_release_path(path); + + ret = btrfs_is_data_extent_shared(BTRFS_I(inode), disk_bytenr, + extent_gen, backref_ctx); if (ret < 0) { goto out; - } else if (ret) { - ret = 0; - } else { + } else if (ret > 0) { btrfs_warn(fs_info, "swapfile must not be copy-on-write"); ret = -EINVAL; @@ -9984,7 +10029,6 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, physical_block_start = (map->stripes[0].physical + (logical_block_start - map->start)); - len = min(len, map->chunk_len - (logical_block_start - map->start)); btrfs_free_chunk_map(map); map = NULL; @@ -10025,20 +10069,16 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, if (ret) goto out; } - bsi.start = start; + bsi.start = key.offset; bsi.block_start = physical_block_start; bsi.block_len = len; } - - start += len; } if (bsi.block_len) ret = btrfs_add_swap_extent(sis, &bsi); out: - if (!IS_ERR_OR_NULL(em)) - free_extent_map(em); if (!IS_ERR_OR_NULL(map)) btrfs_free_chunk_map(map); @@ -10053,6 +10093,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, out_unlock_mmap: up_write(&BTRFS_I(inode)->i_mmap_lock); + btrfs_free_backref_share_ctx(backref_ctx); + btrfs_free_path(path); if (ret) return ret; From 9a45022a0efadd99bcc58f7f1cc2b6fb3b808c40 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 9 Dec 2024 16:31:41 +0000 Subject: [PATCH 1006/1450] btrfs: allow swap activation to be interruptible During swap activation we iterate over the extents of a file, then do several checks for each extent, some of which may take some significant time such as checking if an extent is shared. Since a file can have many thousands of extents, this can be a very slow operation and it's currently not interruptible. I had a bug during development of a previous patch that resulted in an infinite loop when iterating the extents, so a core was busy looping and I couldn't cancel the operation, which is very annoying and requires a reboot. So make the loop interruptible by checking for fatal signals at the end of each iteration and stopping immediately if there is one. CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b87f19630b009..c4675f4345fd7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10073,6 +10073,11 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, bsi.block_start = physical_block_start; bsi.block_len = len; } + + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out; + } } if (bsi.block_len) From 2c8507c63f5498d4ee4af404a8e44ceae4345056 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 9 Dec 2024 16:43:44 +0000 Subject: [PATCH 1007/1450] btrfs: avoid monopolizing a core when activating a swap file During swap activation we iterate over the extents of a file and we can have many thousands of them, so we can end up in a busy loop monopolizing a core. Avoid this by doing a voluntary reschedule after processing each extent. CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c4675f4345fd7..623d9d7ab4803 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10078,6 +10078,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, ret = -EINTR; goto out; } + + cond_resched(); } if (bsi.block_len) From f2363e6fcc7938c5f0f6ac066fad0dd247598b51 Mon Sep 17 00:00:00 2001 From: Julian Sun Date: Wed, 11 Dec 2024 19:13:15 +0800 Subject: [PATCH 1008/1450] btrfs: fix transaction atomicity bug when enabling simple quotas Set squota incompat bit before committing the transaction that enables the feature. With the config CONFIG_BTRFS_ASSERT enabled, an assertion failure occurs regarding the simple quota feature. [5.596534] assertion failed: btrfs_fs_incompat(fs_info, SIMPLE_QUOTA), in fs/btrfs/qgroup.c:365 [5.597098] ------------[ cut here ]------------ [5.597371] kernel BUG at fs/btrfs/qgroup.c:365! [5.597946] CPU: 1 UID: 0 PID: 268 Comm: mount Not tainted 6.13.0-rc2-00031-gf92f4749861b #146 [5.598450] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014 [5.599008] RIP: 0010:btrfs_read_qgroup_config+0x74d/0x7a0 [5.604303] [5.605230] ? btrfs_read_qgroup_config+0x74d/0x7a0 [5.605538] ? exc_invalid_op+0x56/0x70 [5.605775] ? btrfs_read_qgroup_config+0x74d/0x7a0 [5.606066] ? asm_exc_invalid_op+0x1f/0x30 [5.606441] ? btrfs_read_qgroup_config+0x74d/0x7a0 [5.606741] ? btrfs_read_qgroup_config+0x74d/0x7a0 [5.607038] ? try_to_wake_up+0x317/0x760 [5.607286] open_ctree+0xd9c/0x1710 [5.607509] btrfs_get_tree+0x58a/0x7e0 [5.608002] vfs_get_tree+0x2e/0x100 [5.608224] fc_mount+0x16/0x60 [5.608420] btrfs_get_tree+0x2f8/0x7e0 [5.608897] vfs_get_tree+0x2e/0x100 [5.609121] path_mount+0x4c8/0xbc0 [5.609538] __x64_sys_mount+0x10d/0x150 The issue can be easily reproduced using the following reproducer: root@q:linux# cat repro.sh set -e mkfs.btrfs -q -f /dev/sdb mount /dev/sdb /mnt/btrfs btrfs quota enable -s /mnt/btrfs umount /mnt/btrfs mount /dev/sdb /mnt/btrfs The issue is that when enabling quotas, at btrfs_quota_enable(), we set BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE at fs_info->qgroup_flags and persist it in the quota root in the item with the key BTRFS_QGROUP_STATUS_KEY, but we only set the incompat bit BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA after we commit the transaction used to enable simple quotas. This means that if after that transaction commit we unmount the filesystem without starting and committing any other transaction, or we have a power failure, the next time we mount the filesystem we will find the flag BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE set in the item with the key BTRFS_QGROUP_STATUS_KEY but we will not find the incompat bit BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA set in the superblock, triggering an assertion failure at: btrfs_read_qgroup_config() -> qgroup_read_enable_gen() To fix this issue, set the BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA flag immediately after setting the BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE. This ensures that both flags are flushed to disk within the same transaction. Fixes: 182940f4f4db ("btrfs: qgroup: add new quota mode for simple quotas") CC: stable@vger.kernel.org # 6.6+ Reviewed-by: Filipe Manana Signed-off-by: Julian Sun Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index a6f92836c9b11..f9b214992212d 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1121,6 +1121,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON; if (simple) { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE; + btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA); btrfs_set_qgroup_status_enable_gen(leaf, ptr, trans->transid); } else { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; @@ -1254,8 +1255,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->qgroup_lock); fs_info->quota_root = quota_root; set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); - if (simple) - btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA); spin_unlock(&fs_info->qgroup_lock); /* Skip rescan for simple qgroups. */ From fca432e73db2bec0fdbfbf6d98d3ebcd5388a977 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 18 Dec 2024 17:00:56 +1030 Subject: [PATCH 1009/1450] btrfs: sysfs: fix direct super block member reads The following sysfs entries are reading super block member directly, which can have a different endian and cause wrong values: - sys/fs/btrfs//nodesize - sys/fs/btrfs//sectorsize - sys/fs/btrfs//clone_alignment Thankfully those values (nodesize and sectorsize) are always aligned inside the btrfs_super_block, so it won't trigger unaligned read errors, just endian problems. Fix them by using the native cached members instead. Fixes: df93589a1737 ("btrfs: export more from FS_INFO to sysfs") CC: stable@vger.kernel.org Reviewed-by: Naohiro Aota Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index fdcbf650ac31f..7f09b6c9cc2d3 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1118,7 +1118,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return sysfs_emit(buf, "%u\n", fs_info->super_copy->nodesize); + return sysfs_emit(buf, "%u\n", fs_info->nodesize); } BTRFS_ATTR(, nodesize, btrfs_nodesize_show); @@ -1128,7 +1128,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize); + return sysfs_emit(buf, "%u\n", fs_info->sectorsize); } BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show); @@ -1180,7 +1180,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize); + return sysfs_emit(buf, "%u\n", fs_info->sectorsize); } BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show); From 452f4b31e3f70a52b97890888eeb9eaa9a87139a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20G=C3=B6ttsche?= Date: Mon, 25 Nov 2024 11:50:25 +0100 Subject: [PATCH 1010/1450] tracing: Constify string literal data member in struct trace_event_call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The name member of the struct trace_event_call is assigned with generated string literals; declare them pointer to read-only. Reported by clang: security/landlock/syscalls.c:179:1: warning: initializing 'char *' with an expression of type 'const char[34]' discards qualifiers [-Wincompatible-pointer-types-discards-qualifiers] 179 | SYSCALL_DEFINE3(landlock_create_ruleset, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 180 | const struct landlock_ruleset_attr __user *const, attr, | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 181 | const size_t, size, const __u32, flags) | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/syscalls.h:226:36: note: expanded from macro 'SYSCALL_DEFINE3' 226 | #define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/syscalls.h:234:2: note: expanded from macro 'SYSCALL_DEFINEx' 234 | SYSCALL_METADATA(sname, x, __VA_ARGS__) \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/syscalls.h:184:2: note: expanded from macro 'SYSCALL_METADATA' 184 | SYSCALL_TRACE_ENTER_EVENT(sname); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/syscalls.h:151:30: note: expanded from macro 'SYSCALL_TRACE_ENTER_EVENT' 151 | .name = "sys_enter"#sname, \ | ^~~~~~~~~~~~~~~~~ Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Mickaël Salaün Cc: Günther Noack Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Bill Wendling Cc: Justin Stitt Link: https://lore.kernel.org/20241125105028.42807-1-cgoettsche@seltendoof.de Fixes: b77e38aa240c3 ("tracing: add event trace infrastructure") Signed-off-by: Christian Göttsche Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 91b8ffbdfa8c4..58ad4ead33fcd 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -364,7 +364,7 @@ struct trace_event_call { struct list_head list; struct trace_event_class *class; union { - char *name; + const char *name; /* Set TRACE_EVENT_FL_TRACEPOINT flag when using "tp" */ struct tracepoint *tp; }; From 98feccbf32cfdde8c722bc4587aaa60ee5ac33f0 Mon Sep 17 00:00:00 2001 From: Lizhi Xu Date: Mon, 16 Dec 2024 15:32:38 +0800 Subject: [PATCH 1011/1450] tracing: Prevent bad count for tracing_cpumask_write If a large count is provided, it will trigger a warning in bitmap_parse_user. Also check zero for it. Cc: stable@vger.kernel.org Fixes: 9e01c1b74c953 ("cpumask: convert kernel trace functions") Link: https://lore.kernel.org/20241216073238.2573704-1-lizhi.xu@windriver.com Reported-by: syzbot+0aecfd34fb878546f3fd@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=0aecfd34fb878546f3fd Tested-by: syzbot+0aecfd34fb878546f3fd@syzkaller.appspotmail.com Signed-off-by: Lizhi Xu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 957f941a08e74..f8aebcb01e62c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5087,6 +5087,9 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, cpumask_var_t tracing_cpumask_new; int err; + if (count == 0 || count > KMALLOC_MAX_SIZE) + return -EINVAL; + if (!zalloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) return -ENOMEM; From ccfa3131d4a0347988e73638edea5c8281b6d2c7 Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Sat, 21 Dec 2024 16:57:12 +0900 Subject: [PATCH 1012/1450] dmaengine: fsl-edma: implement the cleanup path of fsl_edma3_attach_pd() Current implementation of fsl_edma3_attach_pd() does not provide a cleanup path, resulting in a memory leak. For example, dev_pm_domain_detach() is not called after dev_pm_domain_attach_by_id(), and the device link created with the DL_FLAG_STATELESS is not released explicitly. Therefore, provide a cleanup function fsl_edma3_detach_pd() and call it upon failure. Also add a devm_add_action_or_reset() call with this function after a successful fsl_edma3_attach_pd(). Fixes: 72f5801a4e2b ("dmaengine: fsl-edma: integrate v3 support") Signed-off-by: Joe Hattori Link: https://lore.kernel.org/r/20241221075712.3297200-1-joe@pf.is.s.u-tokyo.ac.jp Signed-off-by: Vinod Koul --- drivers/dma/fsl-edma-common.h | 1 + drivers/dma/fsl-edma-main.c | 41 ++++++++++++++++++++++++++++++----- 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/drivers/dma/fsl-edma-common.h b/drivers/dma/fsl-edma-common.h index ce37e1ee9c462..fe8f103d4a637 100644 --- a/drivers/dma/fsl-edma-common.h +++ b/drivers/dma/fsl-edma-common.h @@ -166,6 +166,7 @@ struct fsl_edma_chan { struct work_struct issue_worker; struct platform_device *pdev; struct device *pd_dev; + struct device_link *pd_dev_link; u32 srcid; struct clk *clk; int priority; diff --git a/drivers/dma/fsl-edma-main.c b/drivers/dma/fsl-edma-main.c index 60de1003193aa..1a613236b3e41 100644 --- a/drivers/dma/fsl-edma-main.c +++ b/drivers/dma/fsl-edma-main.c @@ -417,10 +417,33 @@ static const struct of_device_id fsl_edma_dt_ids[] = { }; MODULE_DEVICE_TABLE(of, fsl_edma_dt_ids); +static void fsl_edma3_detach_pd(struct fsl_edma_engine *fsl_edma) +{ + struct fsl_edma_chan *fsl_chan; + int i; + + for (i = 0; i < fsl_edma->n_chans; i++) { + if (fsl_edma->chan_masked & BIT(i)) + continue; + fsl_chan = &fsl_edma->chans[i]; + if (fsl_chan->pd_dev_link) + device_link_del(fsl_chan->pd_dev_link); + if (fsl_chan->pd_dev) { + dev_pm_domain_detach(fsl_chan->pd_dev, false); + pm_runtime_dont_use_autosuspend(fsl_chan->pd_dev); + pm_runtime_set_suspended(fsl_chan->pd_dev); + } + } +} + +static void devm_fsl_edma3_detach_pd(void *data) +{ + fsl_edma3_detach_pd(data); +} + static int fsl_edma3_attach_pd(struct platform_device *pdev, struct fsl_edma_engine *fsl_edma) { struct fsl_edma_chan *fsl_chan; - struct device_link *link; struct device *pd_chan; struct device *dev; int i; @@ -436,15 +459,16 @@ static int fsl_edma3_attach_pd(struct platform_device *pdev, struct fsl_edma_eng pd_chan = dev_pm_domain_attach_by_id(dev, i); if (IS_ERR_OR_NULL(pd_chan)) { dev_err(dev, "Failed attach pd %d\n", i); - return -EINVAL; + goto detach; } - link = device_link_add(dev, pd_chan, DL_FLAG_STATELESS | + fsl_chan->pd_dev_link = device_link_add(dev, pd_chan, DL_FLAG_STATELESS | DL_FLAG_PM_RUNTIME | DL_FLAG_RPM_ACTIVE); - if (!link) { + if (!fsl_chan->pd_dev_link) { dev_err(dev, "Failed to add device_link to %d\n", i); - return -EINVAL; + dev_pm_domain_detach(pd_chan, false); + goto detach; } fsl_chan->pd_dev = pd_chan; @@ -455,6 +479,10 @@ static int fsl_edma3_attach_pd(struct platform_device *pdev, struct fsl_edma_eng } return 0; + +detach: + fsl_edma3_detach_pd(fsl_edma); + return -EINVAL; } static int fsl_edma_probe(struct platform_device *pdev) @@ -544,6 +572,9 @@ static int fsl_edma_probe(struct platform_device *pdev) ret = fsl_edma3_attach_pd(pdev, fsl_edma); if (ret) return ret; + ret = devm_add_action_or_reset(&pdev->dev, devm_fsl_edma3_detach_pd, fsl_edma); + if (ret) + return ret; } if (drvdata->flags & FSL_EDMA_DRV_TCD64) From 2ac5415022d16d63d912a39a06f32f1f51140261 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Fri, 20 Dec 2024 23:23:25 +0100 Subject: [PATCH 1013/1450] RDMA/rxe: Remove the direct link to net_device The similar patch in siw is in the link: https://git.kernel.org/rdma/rdma/c/16b87037b48889 This problem also occurred in RXE. The following analyze this problem. In the following Call Traces: " BUG: KASAN: slab-use-after-free in dev_get_flags+0x188/0x1d0 net/core/dev.c:8782 Read of size 4 at addr ffff8880554640b0 by task kworker/1:4/5295 CPU: 1 UID: 0 PID: 5295 Comm: kworker/1:4 Not tainted 6.12.0-rc3-syzkaller-00399-g9197b73fd7bb #0 Hardware name: Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Workqueue: infiniband ib_cache_event_task Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:377 [inline] print_report+0x169/0x550 mm/kasan/report.c:488 kasan_report+0x143/0x180 mm/kasan/report.c:601 dev_get_flags+0x188/0x1d0 net/core/dev.c:8782 rxe_query_port+0x12d/0x260 drivers/infiniband/sw/rxe/rxe_verbs.c:60 __ib_query_port drivers/infiniband/core/device.c:2111 [inline] ib_query_port+0x168/0x7d0 drivers/infiniband/core/device.c:2143 ib_cache_update+0x1a9/0xb80 drivers/infiniband/core/cache.c:1494 ib_cache_event_task+0xf3/0x1e0 drivers/infiniband/core/cache.c:1568 process_one_work kernel/workqueue.c:3229 [inline] process_scheduled_works+0xa65/0x1850 kernel/workqueue.c:3310 worker_thread+0x870/0xd30 kernel/workqueue.c:3391 kthread+0x2f2/0x390 kernel/kthread.c:389 ret_from_fork+0x4d/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 " 1). In the link [1], " infiniband syz2: set down " This means that on 839.350575, the event ib_cache_event_task was sent andi queued in ib_wq. 2). In the link [1], " team0 (unregistering): Port device team_slave_0 removed " It indicates that before 843.251853, the net device should be freed. 3). In the link [1], " BUG: KASAN: slab-use-after-free in dev_get_flags+0x188/0x1d0 " This means that on 850.559070, this slab-use-after-free problem occurred. In all, on 839.350575, the event ib_cache_event_task was sent and queued in ib_wq, before 843.251853, the net device veth was freed. on 850.559070, this event was executed, and the mentioned freed net device was called. Thus, the above call trace occurred. [1] https://syzkaller.appspot.com/x/log.txt?x=12e7025f980000 Reported-by: syzbot+4b87489410b4efd181bf@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=4b87489410b4efd181bf Fixes: 8700e3e7c485 ("Soft RoCE driver") Signed-off-by: Zhu Yanjun Link: https://patch.msgid.link/20241220222325.2487767-1-yanjun.zhu@linux.dev Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe.c | 23 +++++++++++++++++++---- drivers/infiniband/sw/rxe/rxe.h | 3 ++- drivers/infiniband/sw/rxe/rxe_mcast.c | 22 ++++++++++++++++++++-- drivers/infiniband/sw/rxe/rxe_net.c | 24 ++++++++++++++++++++---- drivers/infiniband/sw/rxe/rxe_verbs.c | 26 +++++++++++++++++++++----- drivers/infiniband/sw/rxe/rxe_verbs.h | 11 ++++++++--- 6 files changed, 90 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index 255677bc12b2a..1ba4a0c8726ae 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -40,6 +40,8 @@ void rxe_dealloc(struct ib_device *ib_dev) /* initialize rxe device parameters */ static void rxe_init_device_param(struct rxe_dev *rxe) { + struct net_device *ndev; + rxe->max_inline_data = RXE_MAX_INLINE_DATA; rxe->attr.vendor_id = RXE_VENDOR_ID; @@ -71,8 +73,15 @@ static void rxe_init_device_param(struct rxe_dev *rxe) rxe->attr.max_fast_reg_page_list_len = RXE_MAX_FMR_PAGE_LIST_LEN; rxe->attr.max_pkeys = RXE_MAX_PKEYS; rxe->attr.local_ca_ack_delay = RXE_LOCAL_CA_ACK_DELAY; + + ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); + if (!ndev) + return; + addrconf_addr_eui48((unsigned char *)&rxe->attr.sys_image_guid, - rxe->ndev->dev_addr); + ndev->dev_addr); + + dev_put(ndev); rxe->max_ucontext = RXE_MAX_UCONTEXT; } @@ -109,10 +118,15 @@ static void rxe_init_port_param(struct rxe_port *port) static void rxe_init_ports(struct rxe_dev *rxe) { struct rxe_port *port = &rxe->port; + struct net_device *ndev; rxe_init_port_param(port); + ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); + if (!ndev) + return; addrconf_addr_eui48((unsigned char *)&port->port_guid, - rxe->ndev->dev_addr); + ndev->dev_addr); + dev_put(ndev); spin_lock_init(&port->port_lock); } @@ -167,12 +181,13 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu) /* called by ifc layer to create new rxe device. * The caller should allocate memory for rxe by calling ib_alloc_device. */ -int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name) +int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name, + struct net_device *ndev) { rxe_init(rxe); rxe_set_mtu(rxe, mtu); - return rxe_register_device(rxe, ibdev_name); + return rxe_register_device(rxe, ibdev_name, ndev); } static int rxe_newlink(const char *ibdev_name, struct net_device *ndev) diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h index d8fb2c7af30a7..fe7f970667325 100644 --- a/drivers/infiniband/sw/rxe/rxe.h +++ b/drivers/infiniband/sw/rxe/rxe.h @@ -139,7 +139,8 @@ enum resp_states { void rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu); -int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name); +int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name, + struct net_device *ndev); void rxe_rcv(struct sk_buff *skb); diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c index 86cc2e18a7fda..07ff47bae31df 100644 --- a/drivers/infiniband/sw/rxe/rxe_mcast.c +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c @@ -31,10 +31,19 @@ static int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid) { unsigned char ll_addr[ETH_ALEN]; + struct net_device *ndev; + int ret; + + ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); + if (!ndev) + return -ENODEV; ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); - return dev_mc_add(rxe->ndev, ll_addr); + ret = dev_mc_add(ndev, ll_addr); + dev_put(ndev); + + return ret; } /** @@ -47,10 +56,19 @@ static int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid) static int rxe_mcast_del(struct rxe_dev *rxe, union ib_gid *mgid) { unsigned char ll_addr[ETH_ALEN]; + struct net_device *ndev; + int ret; + + ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); + if (!ndev) + return -ENODEV; ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); - return dev_mc_del(rxe->ndev, ll_addr); + ret = dev_mc_del(ndev, ll_addr); + dev_put(ndev); + + return ret; } /** diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index 75d1407db52d4..8cc64ceeb3569 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -524,7 +524,16 @@ struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av, */ const char *rxe_parent_name(struct rxe_dev *rxe, unsigned int port_num) { - return rxe->ndev->name; + struct net_device *ndev; + char *ndev_name; + + ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); + if (!ndev) + return NULL; + ndev_name = ndev->name; + dev_put(ndev); + + return ndev_name; } int rxe_net_add(const char *ibdev_name, struct net_device *ndev) @@ -536,10 +545,9 @@ int rxe_net_add(const char *ibdev_name, struct net_device *ndev) if (!rxe) return -ENOMEM; - rxe->ndev = ndev; ib_mark_name_assigned_by_user(&rxe->ib_dev); - err = rxe_add(rxe, ndev->mtu, ibdev_name); + err = rxe_add(rxe, ndev->mtu, ibdev_name, ndev); if (err) { ib_dealloc_device(&rxe->ib_dev); return err; @@ -587,10 +595,18 @@ void rxe_port_down(struct rxe_dev *rxe) void rxe_set_port_state(struct rxe_dev *rxe) { - if (netif_running(rxe->ndev) && netif_carrier_ok(rxe->ndev)) + struct net_device *ndev; + + ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); + if (!ndev) + return; + + if (netif_running(ndev) && netif_carrier_ok(ndev)) rxe_port_up(rxe); else rxe_port_down(rxe); + + dev_put(ndev); } static int rxe_notify(struct notifier_block *not_blk, diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 5c18f7e342f29..8a5fc20fd1869 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -41,6 +41,7 @@ static int rxe_query_port(struct ib_device *ibdev, u32 port_num, struct ib_port_attr *attr) { struct rxe_dev *rxe = to_rdev(ibdev); + struct net_device *ndev; int err, ret; if (port_num != 1) { @@ -49,6 +50,12 @@ static int rxe_query_port(struct ib_device *ibdev, goto err_out; } + ndev = rxe_ib_device_get_netdev(ibdev); + if (!ndev) { + err = -ENODEV; + goto err_out; + } + memcpy(attr, &rxe->port.attr, sizeof(*attr)); mutex_lock(&rxe->usdev_lock); @@ -57,13 +64,14 @@ static int rxe_query_port(struct ib_device *ibdev, if (attr->state == IB_PORT_ACTIVE) attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; - else if (dev_get_flags(rxe->ndev) & IFF_UP) + else if (dev_get_flags(ndev) & IFF_UP) attr->phys_state = IB_PORT_PHYS_STATE_POLLING; else attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; mutex_unlock(&rxe->usdev_lock); + dev_put(ndev); return ret; err_out: @@ -1425,9 +1433,16 @@ static const struct attribute_group rxe_attr_group = { static int rxe_enable_driver(struct ib_device *ib_dev) { struct rxe_dev *rxe = container_of(ib_dev, struct rxe_dev, ib_dev); + struct net_device *ndev; + + ndev = rxe_ib_device_get_netdev(ib_dev); + if (!ndev) + return -ENODEV; rxe_set_port_state(rxe); - dev_info(&rxe->ib_dev.dev, "added %s\n", netdev_name(rxe->ndev)); + dev_info(&rxe->ib_dev.dev, "added %s\n", netdev_name(ndev)); + + dev_put(ndev); return 0; } @@ -1495,7 +1510,8 @@ static const struct ib_device_ops rxe_dev_ops = { INIT_RDMA_OBJ_SIZE(ib_mw, rxe_mw, ibmw), }; -int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name) +int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name, + struct net_device *ndev) { int err; struct ib_device *dev = &rxe->ib_dev; @@ -1507,13 +1523,13 @@ int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name) dev->num_comp_vectors = num_possible_cpus(); dev->local_dma_lkey = 0; addrconf_addr_eui48((unsigned char *)&dev->node_guid, - rxe->ndev->dev_addr); + ndev->dev_addr); dev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POST_SEND) | BIT_ULL(IB_USER_VERBS_CMD_REQ_NOTIFY_CQ); ib_set_device_ops(dev, &rxe_dev_ops); - err = ib_device_set_netdev(&rxe->ib_dev, rxe->ndev, 1); + err = ib_device_set_netdev(&rxe->ib_dev, ndev, 1); if (err) return err; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 3c1354f82283e..6573ceec0ef58 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -370,6 +370,7 @@ struct rxe_port { u32 qp_gsi_index; }; +#define RXE_PORT 1 struct rxe_dev { struct ib_device ib_dev; struct ib_device_attr attr; @@ -377,8 +378,6 @@ struct rxe_dev { int max_inline_data; struct mutex usdev_lock; - struct net_device *ndev; - struct rxe_pool uc_pool; struct rxe_pool pd_pool; struct rxe_pool ah_pool; @@ -406,6 +405,11 @@ struct rxe_dev { struct crypto_shash *tfm; }; +static inline struct net_device *rxe_ib_device_get_netdev(struct ib_device *dev) +{ + return ib_device_get_netdev(dev, RXE_PORT); +} + static inline void rxe_counter_inc(struct rxe_dev *rxe, enum rxe_counters index) { atomic64_inc(&rxe->stats_counters[index]); @@ -471,6 +475,7 @@ static inline struct rxe_pd *rxe_mw_pd(struct rxe_mw *mw) return to_rpd(mw->ibmw.pd); } -int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name); +int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name, + struct net_device *ndev); #endif /* RXE_VERBS_H */ From 385a95cc72941c7f88630a7bc4176048cc03b395 Mon Sep 17 00:00:00 2001 From: Suraj Kandpal Date: Mon, 16 Dec 2024 23:45:54 +0530 Subject: [PATCH 1014/1450] drm/i915/cx0_phy: Fix C10 pll programming sequence According to spec VDR_CUSTOM_WIDTH register gets programmed after pll specific VDR registers and TX Lane programming registers are done. Moreover we only program into C10_VDR_CONTROL1 to update config and setup master lane once all VDR registers are written into. Bspec: 67636 Fixes: 51390cc0e00a ("drm/i915/mtl: Add Support for C10 PHY message bus and pll programming") Signed-off-by: Suraj Kandpal Reviewed-by: Ankit Nautiyal Link: https://patchwork.freedesktop.org/patch/msgid/20241216181554.2861381-1-suraj.kandpal@intel.com (cherry picked from commit f9d418552ba1e3a0e92487ff82eb515dab7516c0) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/intel_cx0_phy.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_cx0_phy.c b/drivers/gpu/drm/i915/display/intel_cx0_phy.c index 71dc659228ab9..0c7aee13495a3 100644 --- a/drivers/gpu/drm/i915/display/intel_cx0_phy.c +++ b/drivers/gpu/drm/i915/display/intel_cx0_phy.c @@ -2115,14 +2115,6 @@ static void intel_c10_pll_program(struct intel_display *display, 0, C10_VDR_CTRL_MSGBUS_ACCESS, MB_WRITE_COMMITTED); - /* Custom width needs to be programmed to 0 for both the phy lanes */ - intel_cx0_rmw(encoder, INTEL_CX0_BOTH_LANES, PHY_C10_VDR_CUSTOM_WIDTH, - C10_VDR_CUSTOM_WIDTH_MASK, C10_VDR_CUSTOM_WIDTH_8_10, - MB_WRITE_COMMITTED); - intel_cx0_rmw(encoder, INTEL_CX0_BOTH_LANES, PHY_C10_VDR_CONTROL(1), - 0, C10_VDR_CTRL_UPDATE_CFG, - MB_WRITE_COMMITTED); - /* Program the pll values only for the master lane */ for (i = 0; i < ARRAY_SIZE(pll_state->pll); i++) intel_cx0_write(encoder, INTEL_CX0_LANE0, PHY_C10_VDR_PLL(i), @@ -2132,6 +2124,10 @@ static void intel_c10_pll_program(struct intel_display *display, intel_cx0_write(encoder, INTEL_CX0_LANE0, PHY_C10_VDR_CMN(0), pll_state->cmn, MB_WRITE_COMMITTED); intel_cx0_write(encoder, INTEL_CX0_LANE0, PHY_C10_VDR_TX(0), pll_state->tx, MB_WRITE_COMMITTED); + /* Custom width needs to be programmed to 0 for both the phy lanes */ + intel_cx0_rmw(encoder, INTEL_CX0_BOTH_LANES, PHY_C10_VDR_CUSTOM_WIDTH, + C10_VDR_CUSTOM_WIDTH_MASK, C10_VDR_CUSTOM_WIDTH_8_10, + MB_WRITE_COMMITTED); intel_cx0_rmw(encoder, INTEL_CX0_LANE0, PHY_C10_VDR_CONTROL(1), 0, C10_VDR_CTRL_MASTER_LANE | C10_VDR_CTRL_UPDATE_CFG, MB_WRITE_COMMITTED); From 20e7c5313ffbf11c34a46395345677adbe890bee Mon Sep 17 00:00:00 2001 From: Rodrigo Vivi Date: Thu, 19 Dec 2024 16:00:19 -0500 Subject: [PATCH 1015/1450] drm/i915/dg1: Fix power gate sequence. sub-pipe PG is not present on DG1. Setting these bits can disable other power gates and cause GPU hangs on video playbacks. VLK: 16314, 4304 Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/13381 Fixes: 85a12d7eb8fe ("drm/i915/tgl: Fix Media power gate sequence.") Cc: Vinay Belgaumkar Cc: Himal Prasad Ghimiray Reviewed-by: Vinay Belgaumkar Reviewed-by: Himal Prasad Ghimiray Link: https://patchwork.freedesktop.org/patch/msgid/20241219210019.70532-1-rodrigo.vivi@intel.com Signed-off-by: Rodrigo Vivi (cherry picked from commit de7061947b4ed4be857d452c60d5fb795831d79e) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/gt/intel_rc6.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/intel_rc6.c b/drivers/gpu/drm/i915/gt/intel_rc6.c index c864d101faf94..9378d5901c493 100644 --- a/drivers/gpu/drm/i915/gt/intel_rc6.c +++ b/drivers/gpu/drm/i915/gt/intel_rc6.c @@ -133,7 +133,7 @@ static void gen11_rc6_enable(struct intel_rc6 *rc6) GEN9_MEDIA_PG_ENABLE | GEN11_MEDIA_SAMPLER_PG_ENABLE; - if (GRAPHICS_VER(gt->i915) >= 12) { + if (GRAPHICS_VER(gt->i915) >= 12 && !IS_DG1(gt->i915)) { for (i = 0; i < I915_MAX_VCS; i++) if (HAS_ENGINE(gt, _VCS(i))) pg_enable |= (VDN_HCP_POWERGATE_ENABLE(i) | From 362f1bf98a3ecb5a2a4fcbdaa9718c8403beceb2 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Fri, 11 Oct 2024 22:57:59 +0200 Subject: [PATCH 1016/1450] dmaengine: mv_xor: fix child node refcount handling in early exit The for_each_child_of_node() loop requires explicit calls to of_node_put() to decrement the child's refcount upon early exits (break, goto, return). Add the missing calls in the two early exits before the goto instructions. Cc: stable@vger.kernel.org Fixes: f7d12ef53ddf ("dma: mv_xor: add Device Tree binding") Signed-off-by: Javier Carrasco Link: https://lore.kernel.org/r/20241011-dma_mv_xor_of_node_put-v1-1-3c2de819f463@gmail.com Signed-off-by: Vinod Koul --- drivers/dma/mv_xor.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c index 43efce77bb577..40b76b40bc30c 100644 --- a/drivers/dma/mv_xor.c +++ b/drivers/dma/mv_xor.c @@ -1388,6 +1388,7 @@ static int mv_xor_probe(struct platform_device *pdev) irq = irq_of_parse_and_map(np, 0); if (!irq) { ret = -ENODEV; + of_node_put(np); goto err_channel_add; } @@ -1396,6 +1397,7 @@ static int mv_xor_probe(struct platform_device *pdev) if (IS_ERR(chan)) { ret = PTR_ERR(chan); irq_dispose_mapping(irq); + of_node_put(np); goto err_channel_add; } From ebc008699fd95701c9af5ebaeb0793eef81a71d5 Mon Sep 17 00:00:00 2001 From: Akhil R Date: Thu, 12 Dec 2024 18:14:12 +0530 Subject: [PATCH 1017/1450] dmaengine: tegra: Return correct DMA status when paused Currently, the driver does not return the correct DMA status when a DMA pause is issued by the client drivers. This causes GPCDMA users to assume that DMA is still running, while in reality, the DMA is paused. Return DMA_PAUSED for tx_status() if the channel is paused in the middle of a transfer. Fixes: ee17028009d4 ("dmaengine: tegra: Add tegra gpcdma driver") Cc: stable@vger.kernel.org Signed-off-by: Akhil R Signed-off-by: Kartik Rajput Link: https://lore.kernel.org/r/20241212124412.5650-1-kkartik@nvidia.com Signed-off-by: Vinod Koul --- drivers/dma/tegra186-gpc-dma.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/dma/tegra186-gpc-dma.c b/drivers/dma/tegra186-gpc-dma.c index cacf3757adc2e..4d6fe0efa76e4 100644 --- a/drivers/dma/tegra186-gpc-dma.c +++ b/drivers/dma/tegra186-gpc-dma.c @@ -231,6 +231,7 @@ struct tegra_dma_channel { bool config_init; char name[30]; enum dma_transfer_direction sid_dir; + enum dma_status status; int id; int irq; int slave_id; @@ -393,6 +394,8 @@ static int tegra_dma_pause(struct tegra_dma_channel *tdc) tegra_dma_dump_chan_regs(tdc); } + tdc->status = DMA_PAUSED; + return ret; } @@ -419,6 +422,8 @@ static void tegra_dma_resume(struct tegra_dma_channel *tdc) val = tdc_read(tdc, TEGRA_GPCDMA_CHAN_CSRE); val &= ~TEGRA_GPCDMA_CHAN_CSRE_PAUSE; tdc_write(tdc, TEGRA_GPCDMA_CHAN_CSRE, val); + + tdc->status = DMA_IN_PROGRESS; } static int tegra_dma_device_resume(struct dma_chan *dc) @@ -544,6 +549,7 @@ static void tegra_dma_xfer_complete(struct tegra_dma_channel *tdc) tegra_dma_sid_free(tdc); tdc->dma_desc = NULL; + tdc->status = DMA_COMPLETE; } static void tegra_dma_chan_decode_error(struct tegra_dma_channel *tdc, @@ -716,6 +722,7 @@ static int tegra_dma_terminate_all(struct dma_chan *dc) tdc->dma_desc = NULL; } + tdc->status = DMA_COMPLETE; tegra_dma_sid_free(tdc); vchan_get_all_descriptors(&tdc->vc, &head); spin_unlock_irqrestore(&tdc->vc.lock, flags); @@ -769,6 +776,9 @@ static enum dma_status tegra_dma_tx_status(struct dma_chan *dc, if (ret == DMA_COMPLETE) return ret; + if (tdc->status == DMA_PAUSED) + ret = DMA_PAUSED; + spin_lock_irqsave(&tdc->vc.lock, flags); vd = vchan_find_desc(&tdc->vc, cookie); if (vd) { From fe4bfa9b6d7bd752bfe4700c937f235aa8ce997b Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 13 Dec 2024 20:36:41 +0800 Subject: [PATCH 1018/1450] phy: core: Fix that API devm_phy_put() fails to release the phy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For devm_phy_put(), its comment says it needs to invoke phy_put() to release the phy, but it will not actually invoke the function since devres_destroy() does not call devm_phy_release(), and the missing phy_put() call will cause: - The phy fails to be released. - devm_phy_put() can not fully undo what API devm_phy_get() does. - Leak refcount of both the module and device for below typical usage: devm_phy_get(); // or its variant ... err = do_something(); if (err) goto err_out; ... err_out: devm_phy_put(); // leak refcount here The file(s) affected by this issue are shown below since they have such typical usage. drivers/pci/controller/cadence/pcie-cadence.c drivers/net/ethernet/ti/am65-cpsw-nuss.c Fix by using devres_release() instead of devres_destroy() within the API. Fixes: ff764963479a ("drivers: phy: add generic PHY framework") Cc: stable@vger.kernel.org Cc: Lorenzo Pieralisi Cc: Krzysztof Wilczyński Cc: Bjorn Helgaas Cc: David S. Miller Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Reviewed-by: Johan Hovold Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20241213-phy_core_fix-v6-1-40ae28f5015a@quicinc.com Signed-off-by: Vinod Koul --- drivers/phy/phy-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/phy/phy-core.c b/drivers/phy/phy-core.c index f053b525ccffa..f190d7126613a 100644 --- a/drivers/phy/phy-core.c +++ b/drivers/phy/phy-core.c @@ -737,7 +737,7 @@ void devm_phy_put(struct device *dev, struct phy *phy) if (!phy) return; - r = devres_destroy(dev, devm_phy_release, devm_phy_match, phy); + r = devres_release(dev, devm_phy_release, devm_phy_match, phy); dev_WARN_ONCE(dev, r, "couldn't find PHY resource\n"); } EXPORT_SYMBOL_GPL(devm_phy_put); From c0b82ab95b4f1fbc3e3aeab9d829d012669524b6 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 13 Dec 2024 20:36:42 +0800 Subject: [PATCH 1019/1450] phy: core: Fix that API devm_of_phy_provider_unregister() fails to unregister the phy provider For devm_of_phy_provider_unregister(), its comment says it needs to invoke of_phy_provider_unregister() to unregister the phy provider, but it will not actually invoke the function since devres_destroy() does not call devm_phy_provider_release(), and the missing of_phy_provider_unregister() call will cause: - The phy provider fails to be unregistered. - Leak both memory and the OF node refcount. Fortunately, the faulty API has not been used by current kernel tree. Fix by using devres_release() instead of devres_destroy() within the API. Fixes: ff764963479a ("drivers: phy: add generic PHY framework") Reviewed-by: Johan Hovold Signed-off-by: Zijun Hu Link: https://lore.kernel.org/stable/20241213-phy_core_fix-v6-2-40ae28f5015a%40quicinc.com Link: https://lore.kernel.org/r/20241213-phy_core_fix-v6-2-40ae28f5015a@quicinc.com Signed-off-by: Vinod Koul --- drivers/phy/phy-core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/phy/phy-core.c b/drivers/phy/phy-core.c index f190d7126613a..de07e1616b34d 100644 --- a/drivers/phy/phy-core.c +++ b/drivers/phy/phy-core.c @@ -1259,12 +1259,12 @@ EXPORT_SYMBOL_GPL(of_phy_provider_unregister); * of_phy_provider_unregister to unregister the phy provider. */ void devm_of_phy_provider_unregister(struct device *dev, - struct phy_provider *phy_provider) + struct phy_provider *phy_provider) { int r; - r = devres_destroy(dev, devm_phy_provider_release, devm_phy_match, - phy_provider); + r = devres_release(dev, devm_phy_provider_release, devm_phy_match, + phy_provider); dev_WARN_ONCE(dev, r, "couldn't find PHY provider device resource\n"); } EXPORT_SYMBOL_GPL(devm_of_phy_provider_unregister); From 4dc48c88fcf82b89fdebd83a906aaa64f40fb8a9 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 13 Dec 2024 20:36:43 +0800 Subject: [PATCH 1020/1450] phy: core: Fix that API devm_phy_destroy() fails to destroy the phy For devm_phy_destroy(), its comment says it needs to invoke phy_destroy() to destroy the phy, but it will not actually invoke the function since devres_destroy() does not call devm_phy_consume(), and the missing phy_destroy() call will cause that the phy fails to be destroyed. Fortunately, the faulty API has not been used by current kernel tree. Fix by using devres_release() instead of devres_destroy() within the API. Fixes: ff764963479a ("drivers: phy: add generic PHY framework") Reviewed-by: Johan Hovold Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20241213-phy_core_fix-v6-3-40ae28f5015a@quicinc.com Signed-off-by: Vinod Koul --- drivers/phy/phy-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/phy/phy-core.c b/drivers/phy/phy-core.c index de07e1616b34d..52ca590a58b9c 100644 --- a/drivers/phy/phy-core.c +++ b/drivers/phy/phy-core.c @@ -1121,7 +1121,7 @@ void devm_phy_destroy(struct device *dev, struct phy *phy) { int r; - r = devres_destroy(dev, devm_phy_consume, devm_phy_match, phy); + r = devres_release(dev, devm_phy_consume, devm_phy_match, phy); dev_WARN_ONCE(dev, r, "couldn't find PHY resource\n"); } EXPORT_SYMBOL_GPL(devm_phy_destroy); From 5ebdc6be16c2000e37fcb8b4072d442d268ad492 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 13 Dec 2024 20:36:44 +0800 Subject: [PATCH 1021/1450] phy: core: Fix an OF node refcount leakage in _of_phy_get() _of_phy_get() will directly return when suffers of_device_is_compatible() error, but it forgets to decrease refcount of OF node @args.np before error return, the refcount was increased by previous of_parse_phandle_with_args() so causes the OF node's refcount leakage. Fix by decreasing the refcount via of_node_put() before the error return. Fixes: b7563e2796f8 ("phy: work around 'phys' references to usb-nop-xceiv devices") Cc: stable@vger.kernel.org Reviewed-by: Johan Hovold Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20241213-phy_core_fix-v6-4-40ae28f5015a@quicinc.com Signed-off-by: Vinod Koul --- drivers/phy/phy-core.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/phy/phy-core.c b/drivers/phy/phy-core.c index 52ca590a58b9c..b88fbda6c0461 100644 --- a/drivers/phy/phy-core.c +++ b/drivers/phy/phy-core.c @@ -629,8 +629,10 @@ static struct phy *_of_phy_get(struct device_node *np, int index) return ERR_PTR(-ENODEV); /* This phy type handled by the usb-phy subsystem for now */ - if (of_device_is_compatible(args.np, "usb-nop-xceiv")) - return ERR_PTR(-ENODEV); + if (of_device_is_compatible(args.np, "usb-nop-xceiv")) { + phy = ERR_PTR(-ENODEV); + goto out_put_node; + } mutex_lock(&phy_provider_mutex); phy_provider = of_phy_provider_lookup(args.np); @@ -652,6 +654,7 @@ static struct phy *_of_phy_get(struct device_node *np, int index) out_unlock: mutex_unlock(&phy_provider_mutex); +out_put_node: of_node_put(args.np); return phy; From a2d633cb1421e679b56f1a9fe1f42f089706f1ed Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 13 Dec 2024 20:36:45 +0800 Subject: [PATCH 1022/1450] phy: core: Fix an OF node refcount leakage in of_phy_provider_lookup() For macro for_each_child_of_node(parent, child), refcount of @child has been increased before entering its loop body, so normally needs to call of_node_put(@child) before returning from the loop body to avoid refcount leakage. of_phy_provider_lookup() has such usage but does not call of_node_put() before returning, so cause leakage of the OF node refcount. Fix by simply calling of_node_put() before returning from the loop body. The APIs affected by this issue are shown below since they indirectly invoke problematic of_phy_provider_lookup(). phy_get() of_phy_get() devm_phy_get() devm_of_phy_get() devm_of_phy_get_by_index() Fixes: 2a4c37016ca9 ("phy: core: Fix of_phy_provider_lookup to return PHY provider for sub node") Cc: stable@vger.kernel.org Reviewed-by: Johan Hovold Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20241213-phy_core_fix-v6-5-40ae28f5015a@quicinc.com Signed-off-by: Vinod Koul --- drivers/phy/phy-core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/phy/phy-core.c b/drivers/phy/phy-core.c index b88fbda6c0461..413f76e2d1744 100644 --- a/drivers/phy/phy-core.c +++ b/drivers/phy/phy-core.c @@ -145,8 +145,10 @@ static struct phy_provider *of_phy_provider_lookup(struct device_node *node) return phy_provider; for_each_child_of_node(phy_provider->children, child) - if (child == node) + if (child == node) { + of_node_put(child); return phy_provider; + } } return ERR_PTR(-EPROBE_DEFER); From 739214dd1c209e34323814fb815fb17cccb9f95b Mon Sep 17 00:00:00 2001 From: Adam Ford Date: Sun, 15 Dec 2024 16:05:55 -0600 Subject: [PATCH 1023/1450] phy: freescale: fsl-samsung-hdmi: Fix 64-by-32 division cocci warnings The Kernel test robot returns the following warning: do_div() does a 64-by-32 division, please consider using div64_ul instead. To prevent the 64-by-32 divsion, consolidate both the multiplication and the do_div into one line which explicitly uses u64 sizes. Fixes: 1951dbb41d1d ("phy: freescale: fsl-samsung-hdmi: Support dynamic integer") Signed-off-by: Adam Ford Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202412091243.fSObwwPi-lkp@intel.com/ Link: https://lore.kernel.org/r/20241215220555.99113-1-aford173@gmail.com Signed-off-by: Vinod Koul --- drivers/phy/freescale/phy-fsl-samsung-hdmi.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/phy/freescale/phy-fsl-samsung-hdmi.c b/drivers/phy/freescale/phy-fsl-samsung-hdmi.c index 2c8038864357b..d3ccf547ba1c2 100644 --- a/drivers/phy/freescale/phy-fsl-samsung-hdmi.c +++ b/drivers/phy/freescale/phy-fsl-samsung-hdmi.c @@ -424,8 +424,7 @@ static unsigned long fsl_samsung_hdmi_phy_find_pms(unsigned long fout, u8 *p, u1 * Fvco = (M * f_ref) / P, * where f_ref is 24MHz. */ - tmp = (u64)_m * 24 * MHZ; - do_div(tmp, _p); + tmp = div64_ul((u64)_m * 24 * MHZ, _p); if (tmp < 750 * MHZ || tmp > 3000 * MHZ) continue; From 17194c2998d39ab366a2ecbc4d1f3281e00d6a05 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Dec 2024 09:30:51 +0100 Subject: [PATCH 1024/1450] phy: mediatek: phy-mtk-hdmi: add regulator dependency The driver no longer builds when regulator support is unavailable: arm-linux-gnueabi-ld: drivers/phy/mediatek/phy-mtk-hdmi.o: in function `mtk_hdmi_phy_register_regulators': phy-mtk-hdmi.c:(.text.unlikely+0x3e): undefined reference to `devm_regulator_register' arm-linux-gnueabi-ld: drivers/phy/mediatek/phy-mtk-hdmi-mt8195.o: in function `mtk_hdmi_phy_pwr5v_is_enabled': phy-mtk-hdmi-mt8195.c:(.text+0x326): undefined reference to `rdev_get_drvdata' arm-linux-gnueabi-ld: drivers/phy/mediatek/phy-mtk-hdmi-mt8195.o: in function `mtk_hdmi_phy_pwr5v_disable': phy-mtk-hdmi-mt8195.c:(.text+0x346): undefined reference to `rdev_get_drvdata' arm-linux-gnueabi-ld: drivers/phy/mediatek/phy-mtk-hdmi-mt8195.o: in function `mtk_hdmi_phy_pwr5v_enable': Fixes: 49393b2da1cd ("phy: mediatek: phy-mtk-hdmi: Register PHY provided regulator") Signed-off-by: Arnd Bergmann Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20241213083056.2596499-1-arnd@kernel.org Signed-off-by: Vinod Koul --- drivers/phy/mediatek/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/phy/mediatek/Kconfig b/drivers/phy/mediatek/Kconfig index 60e00057e8bc7..ba6461350951c 100644 --- a/drivers/phy/mediatek/Kconfig +++ b/drivers/phy/mediatek/Kconfig @@ -65,6 +65,7 @@ config PHY_MTK_HDMI depends on ARCH_MEDIATEK || COMPILE_TEST depends on COMMON_CLK depends on OF + depends on REGULATOR select GENERIC_PHY help Support HDMI PHY for Mediatek SoCs. From d57212f281fda9056412cd6cca983d9d2eb89f53 Mon Sep 17 00:00:00 2001 From: Su Hui Date: Tue, 24 Dec 2024 12:43:58 +0800 Subject: [PATCH 1025/1450] workqueue: add printf attribute to __alloc_workqueue() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a compiler warning with W=1: kernel/workqueue.c: error: function ‘__alloc_workqueue’ might be a candidate for ‘gnu_printf’ format attribute[-Werror=suggest-attribute=format] 5657 | name_len = vsnprintf(wq->name, sizeof(wq->name), fmt, args); | ^~~~~~~~ Fixes: 9b59a85a84dc ("workqueue: Don't call va_start / va_end twice") Signed-off-by: Su Hui Signed-off-by: Tejun Heo --- kernel/workqueue.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8336218ec4b86..f7d8fc2045795 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5645,6 +5645,7 @@ static void wq_adjust_max_active(struct workqueue_struct *wq) } while (activated); } +__printf(1, 0) static struct workqueue_struct *__alloc_workqueue(const char *fmt, unsigned int flags, int max_active, va_list args) From 35bf430e08a18fdab6eb94492a06d9ad14c6179b Mon Sep 17 00:00:00 2001 From: Henry Huang Date: Sun, 22 Dec 2024 23:43:16 +0800 Subject: [PATCH 1026/1450] sched_ext: initialize kit->cursor.flags struct bpf_iter_scx_dsq *it maybe not initialized. If we didn't call scx_bpf_dsq_move_set_vtime and scx_bpf_dsq_move_set_slice before scx_bpf_dsq_move, it would cause unexpected behaviors: 1. Assign a huge slice into p->scx.slice 2. Assign a invalid vtime into p->scx.dsq_vtime Signed-off-by: Henry Huang Fixes: 6462dd53a260 ("sched_ext: Compact struct bpf_iter_scx_dsq_kern") Cc: stable@vger.kernel.org # v6.12 Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 98519e6d0dcd0..19d2699cf6383 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -7013,7 +7013,7 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, return -ENOENT; INIT_LIST_HEAD(&kit->cursor.node); - kit->cursor.flags |= SCX_DSQ_LNODE_ITER_CURSOR | flags; + kit->cursor.flags = SCX_DSQ_LNODE_ITER_CURSOR | flags; kit->cursor.priv = READ_ONCE(kit->dsq->seq); return 0; From 542ed8145e6f9392e3d0a86a0e9027d2ffd183e4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 21 Dec 2024 00:29:20 +0100 Subject: [PATCH 1027/1450] netfilter: nft_set_hash: unaligned atomic read on struct nft_set_ext Access to genmask field in struct nft_set_ext results in unaligned atomic read: [ 72.130109] Unable to handle kernel paging request at virtual address ffff0000c2bb708c [ 72.131036] Mem abort info: [ 72.131213] ESR = 0x0000000096000021 [ 72.131446] EC = 0x25: DABT (current EL), IL = 32 bits [ 72.132209] SET = 0, FnV = 0 [ 72.133216] EA = 0, S1PTW = 0 [ 72.134080] FSC = 0x21: alignment fault [ 72.135593] Data abort info: [ 72.137194] ISV = 0, ISS = 0x00000021, ISS2 = 0x00000000 [ 72.142351] CM = 0, WnR = 0, TnD = 0, TagAccess = 0 [ 72.145989] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [ 72.150115] swapper pgtable: 4k pages, 48-bit VAs, pgdp=0000000237d27000 [ 72.154893] [ffff0000c2bb708c] pgd=0000000000000000, p4d=180000023ffff403, pud=180000023f84b403, pmd=180000023f835403, +pte=0068000102bb7707 [ 72.163021] Internal error: Oops: 0000000096000021 [#1] SMP [...] [ 72.170041] CPU: 7 UID: 0 PID: 54 Comm: kworker/7:0 Tainted: G E 6.13.0-rc3+ #2 [ 72.170509] Tainted: [E]=UNSIGNED_MODULE [ 72.170720] Hardware name: QEMU QEMU Virtual Machine, BIOS edk2-stable202302-for-qemu 03/01/2023 [ 72.171192] Workqueue: events_power_efficient nft_rhash_gc [nf_tables] [ 72.171552] pstate: 21400005 (nzCv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--) [ 72.171915] pc : nft_rhash_gc+0x200/0x2d8 [nf_tables] [ 72.172166] lr : nft_rhash_gc+0x128/0x2d8 [nf_tables] [ 72.172546] sp : ffff800081f2bce0 [ 72.172724] x29: ffff800081f2bd40 x28: ffff0000c2bb708c x27: 0000000000000038 [ 72.173078] x26: ffff0000c6780ef0 x25: ffff0000c643df00 x24: ffff0000c6778f78 [ 72.173431] x23: 000000000000001a x22: ffff0000c4b1f000 x21: ffff0000c6780f78 [ 72.173782] x20: ffff0000c2bb70dc x19: ffff0000c2bb7080 x18: 0000000000000000 [ 72.174135] x17: ffff0000c0a4e1c0 x16: 0000000000003000 x15: 0000ac26d173b978 [ 72.174485] x14: ffffffffffffffff x13: 0000000000000030 x12: ffff0000c6780ef0 [ 72.174841] x11: 0000000000000000 x10: ffff800081f2bcf8 x9 : ffff0000c3000000 [ 72.175193] x8 : 00000000000004be x7 : 0000000000000000 x6 : 0000000000000000 [ 72.175544] x5 : 0000000000000040 x4 : ffff0000c3000010 x3 : 0000000000000000 [ 72.175871] x2 : 0000000000003a98 x1 : ffff0000c2bb708c x0 : 0000000000000004 [ 72.176207] Call trace: [ 72.176316] nft_rhash_gc+0x200/0x2d8 [nf_tables] (P) [ 72.176653] process_one_work+0x178/0x3d0 [ 72.176831] worker_thread+0x200/0x3f0 [ 72.176995] kthread+0xe8/0xf8 [ 72.177130] ret_from_fork+0x10/0x20 [ 72.177289] Code: 54fff984 d503201f d2800080 91003261 (f820303f) [ 72.177557] ---[ end trace 0000000000000000 ]--- Align struct nft_set_ext to word size to address this and documentation it. pahole reports that this increases the size of elements for rhash and pipapo in 8 bytes on x86_64. Fixes: 7ffc7481153b ("netfilter: nft_set_hash: skip duplicated elements pending gc run") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 4afa64c813045..0027beca5cd50 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -733,15 +733,18 @@ struct nft_set_ext_tmpl { /** * struct nft_set_ext - set extensions * - * @genmask: generation mask + * @genmask: generation mask, but also flags (see NFT_SET_ELEM_DEAD_BIT) * @offset: offsets of individual extension types * @data: beginning of extension data + * + * This structure must be aligned to word size, otherwise atomic bitops + * on genmask field can cause alignment failure on some archs. */ struct nft_set_ext { u8 genmask; u8 offset[NFT_SET_EXT_NUM]; char data[]; -}; +} __aligned(BITS_PER_LONG / 8); static inline void nft_set_ext_prepare(struct nft_set_ext_tmpl *tmpl) { From ce2b93fc1dfa1c82f2576aa571731c4e5dcc8dd7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Dec 2024 14:09:15 -1000 Subject: [PATCH 1028/1450] sched_ext: Fix dsq_local_on selftest The dsp_local_on selftest expects the scheduler to fail by trying to schedule an e.g. CPU-affine task to the wrong CPU. However, this isn't guaranteed to happen in the 1 second window that the test is running. Besides, it's odd to have this particular exception path tested when there are no other tests that verify that the interface is working at all - e.g. the test would pass if dsp_local_on interface is completely broken and fails on any attempt. Flip the test so that it verifies that the feature works. While at it, fix a typo in the info message. Signed-off-by: Tejun Heo Reported-by: Ihor Solodrai Link: http://lkml.kernel.org/r/Z1n9v7Z6iNJ-wKmq@slm.duckdns.org Signed-off-by: Tejun Heo --- tools/testing/selftests/sched_ext/dsp_local_on.bpf.c | 5 ++++- tools/testing/selftests/sched_ext/dsp_local_on.c | 5 +++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c index 6325bf76f47ee..fbda6bf546712 100644 --- a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c +++ b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c @@ -43,7 +43,10 @@ void BPF_STRUCT_OPS(dsp_local_on_dispatch, s32 cpu, struct task_struct *prev) if (!p) return; - target = bpf_get_prandom_u32() % nr_cpus; + if (p->nr_cpus_allowed == nr_cpus) + target = bpf_get_prandom_u32() % nr_cpus; + else + target = scx_bpf_task_cpu(p); scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | target, SCX_SLICE_DFL, 0); bpf_task_release(p); diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.c b/tools/testing/selftests/sched_ext/dsp_local_on.c index 472851b568548..0ff27e57fe430 100644 --- a/tools/testing/selftests/sched_ext/dsp_local_on.c +++ b/tools/testing/selftests/sched_ext/dsp_local_on.c @@ -34,9 +34,10 @@ static enum scx_test_status run(void *ctx) /* Just sleeping is fine, plenty of scheduling events happening */ sleep(1); - SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); bpf_link__destroy(link); + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG)); + return SCX_TEST_PASS; } @@ -50,7 +51,7 @@ static void cleanup(void *ctx) struct scx_test dsp_local_on = { .name = "dsp_local_on", .description = "Verify we can directly dispatch tasks to a local DSQs " - "from osp.dispatch()", + "from ops.dispatch()", .setup = setup, .run = run, .cleanup = cleanup, From dcd59d0d7d51b2a4b768fc132b0d74a97dfd6d6a Mon Sep 17 00:00:00 2001 From: "Dustin L. Howett" Date: Tue, 24 Dec 2024 12:55:58 -0600 Subject: [PATCH 1029/1450] platform/chrome: cros_ec_lpc: fix product identity for early Framework Laptops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The product names for the Framework Laptop (12th and 13th Generation Intel Core) are incorrect as of 62be134abf42. Fixes: 62be134abf42 ("platform/chrome: cros_ec_lpc: switch primary DMI data for Framework Laptop") Cc: stable@vger.kernel.org # 6.12.x Signed-off-by: Dustin L. Howett Reviewed-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20241224-platform-chrome-cros_ec_lpc-fix-product-identity-for-early-framework-laptops-v1-1-0d31d6e1d22c@howett.net Signed-off-by: Tzung-Bi Shih --- drivers/platform/chrome/cros_ec_lpc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/platform/chrome/cros_ec_lpc.c b/drivers/platform/chrome/cros_ec_lpc.c index 924bf4d3cc77b..8470b7f2b1358 100644 --- a/drivers/platform/chrome/cros_ec_lpc.c +++ b/drivers/platform/chrome/cros_ec_lpc.c @@ -707,7 +707,7 @@ static const struct dmi_system_id cros_ec_lpc_dmi_table[] __initconst = { /* Framework Laptop (12th Gen Intel Core) */ .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Framework"), - DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "12th Gen Intel Core"), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "Laptop (12th Gen Intel Core)"), }, .driver_data = (void *)&framework_laptop_mec_lpc_driver_data, }, @@ -715,7 +715,7 @@ static const struct dmi_system_id cros_ec_lpc_dmi_table[] __initconst = { /* Framework Laptop (13th Gen Intel Core) */ .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Framework"), - DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "13th Gen Intel Core"), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "Laptop (13th Gen Intel Core)"), }, .driver_data = (void *)&framework_laptop_mec_lpc_driver_data, }, From 768776dd4efc681cdca33a79e29bb508d6de9bc0 Mon Sep 17 00:00:00 2001 From: Stefan Eichenberger Date: Mon, 16 Dec 2024 16:16:40 +0100 Subject: [PATCH 1030/1450] i2c: imx: fix missing stop condition in single-master mode A regression was introduced with the implementation of single-master mode, preventing proper stop conditions from being generated. Devices that require a valid stop condition, such as EEPROMs, fail to function correctly as a result. The issue only affects devices with the single-master property enabled. This commit resolves the issue by re-enabling I2C bus busy bit (IBB) polling for single-master mode when generating a stop condition. The fix further ensures that the i2c_imx->stopped flag is cleared at the start of each transfer, allowing the stop condition to be correctly generated in i2c_imx_stop(). According to the reference manual (IMX8MMRM, Rev. 2, 09/2019, page 5270), polling the IBB bit to determine if the bus is free is only necessary in multi-master mode. Consequently, the IBB bit is not polled for the start condition in single-master mode. Fixes: 6692694aca86 ("i2c: imx: do not poll for bus busy in single master mode") Signed-off-by: Stefan Eichenberger Reviewed-by: Frank Li Reviewed-by: Francesco Dolcini Link: https://lore.kernel.org/r/20241216151829.74056-1-eichest@gmail.com Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-imx.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c index f751d231ded87..488ee35113145 100644 --- a/drivers/i2c/busses/i2c-imx.c +++ b/drivers/i2c/busses/i2c-imx.c @@ -532,22 +532,20 @@ static void i2c_imx_dma_free(struct imx_i2c_struct *i2c_imx) static int i2c_imx_bus_busy(struct imx_i2c_struct *i2c_imx, int for_busy, bool atomic) { + bool multi_master = i2c_imx->multi_master; unsigned long orig_jiffies = jiffies; unsigned int temp; - if (!i2c_imx->multi_master) - return 0; - while (1) { temp = imx_i2c_read_reg(i2c_imx, IMX_I2C_I2SR); /* check for arbitration lost */ - if (temp & I2SR_IAL) { + if (multi_master && (temp & I2SR_IAL)) { i2c_imx_clear_irq(i2c_imx, I2SR_IAL); return -EAGAIN; } - if (for_busy && (temp & I2SR_IBB)) { + if (for_busy && (!multi_master || (temp & I2SR_IBB))) { i2c_imx->stopped = 0; break; } From e0cec363197e41af870613e8e17b30bf0e3d41b5 Mon Sep 17 00:00:00 2001 From: Carlos Song Date: Wed, 18 Dec 2024 12:42:38 +0800 Subject: [PATCH 1031/1450] i2c: imx: add imx7d compatible string for applying erratum ERR007805 Compatible string "fsl,imx7d-i2c" is not exited at i2c-imx driver compatible string table, at the result, "fsl,imx21-i2c" will be matched, but it will cause erratum ERR007805 not be applied in fact. So Add "fsl,imx7d-i2c" compatible string in i2c-imx driver to apply the erratum ERR007805(https://www.nxp.com/docs/en/errata/IMX7DS_3N09P.pdf). " ERR007805 I2C: When the I2C clock speed is configured for 400 kHz, the SCL low period violates the I2C spec of 1.3 uS min Description: When the I2C module is programmed to operate at the maximum clock speed of 400 kHz (as defined by the I2C spec), the SCL clock low period violates the I2C spec of 1.3 uS min. The user must reduce the clock speed to obtain the SCL low time to meet the 1.3us I2C minimum required. This behavior means the SoC is not compliant to the I2C spec at 400kHz. Workaround: To meet the clock low period requirement in fast speed mode, SCL must be configured to 384KHz or less. " "fsl,imx7d-i2c" already is documented in binding doc. This erratum fix has been included in imx6_i2c_hwdata and it is the same in all I.MX6/7/8, so just reuse it. Fixes: 39c025721d70 ("i2c: imx: Implement errata ERR007805 or e7805 bus frequency limit") Cc: stable@vger.kernel.org # v5.18+ Signed-off-by: Carlos Song Signed-off-by: Haibo Chen Reviewed-by: Frank Li Fixes: 39c025721d70 ("i2c: imx: Implement errata ERR007805 or e7805 bus frequency limit") Acked-by: Oleksij Rempel Link: https://lore.kernel.org/r/20241218044238.143414-1-carlos.song@nxp.com Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-imx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c index 488ee35113145..5c9a8dfbc4a00 100644 --- a/drivers/i2c/busses/i2c-imx.c +++ b/drivers/i2c/busses/i2c-imx.c @@ -335,6 +335,7 @@ static const struct of_device_id i2c_imx_dt_ids[] = { { .compatible = "fsl,imx6sll-i2c", .data = &imx6_i2c_hwdata, }, { .compatible = "fsl,imx6sx-i2c", .data = &imx6_i2c_hwdata, }, { .compatible = "fsl,imx6ul-i2c", .data = &imx6_i2c_hwdata, }, + { .compatible = "fsl,imx7d-i2c", .data = &imx6_i2c_hwdata, }, { .compatible = "fsl,imx7s-i2c", .data = &imx6_i2c_hwdata, }, { .compatible = "fsl,imx8mm-i2c", .data = &imx6_i2c_hwdata, }, { .compatible = "fsl,imx8mn-i2c", .data = &imx6_i2c_hwdata, }, From 9a8f9320d67b27ddd7f1ee88d91820197a0e908f Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Wed, 18 Dec 2024 12:07:40 +0000 Subject: [PATCH 1032/1450] i2c: microchip-core: actually use repeated sends At present, where repeated sends are intended to be used, the i2c-microchip-core driver sends a stop followed by a start. Lots of i2c devices must not malfunction in the face of this behaviour, because the driver has operated like this for years! Try to keep track of whether or not a repeated send is required, and suppress sending a stop in these cases. CC: stable@vger.kernel.org Fixes: 64a6f1c4987e ("i2c: add support for microchip fpga i2c controllers") Signed-off-by: Conor Dooley Reviewed-by: Andi Shyti Link: https://lore.kernel.org/r/20241218-football-composure-e56df2461461@spud Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-microchip-corei2c.c | 120 ++++++++++++++++----- 1 file changed, 94 insertions(+), 26 deletions(-) diff --git a/drivers/i2c/busses/i2c-microchip-corei2c.c b/drivers/i2c/busses/i2c-microchip-corei2c.c index d1543e7d83808..6a124e903c666 100644 --- a/drivers/i2c/busses/i2c-microchip-corei2c.c +++ b/drivers/i2c/busses/i2c-microchip-corei2c.c @@ -93,27 +93,35 @@ * @base: pointer to register struct * @dev: device reference * @i2c_clk: clock reference for i2c input clock + * @msg_queue: pointer to the messages requiring sending * @buf: pointer to msg buffer for easier use * @msg_complete: xfer completion object * @adapter: core i2c abstraction * @msg_err: error code for completed message * @bus_clk_rate: current i2c bus clock rate * @isr_status: cached copy of local ISR status + * @total_num: total number of messages to be sent/received + * @current_num: index of the current message being sent/received * @msg_len: number of bytes transferred in msg * @addr: address of the current slave + * @restart_needed: whether or not a repeated start is required after current message */ struct mchp_corei2c_dev { void __iomem *base; struct device *dev; struct clk *i2c_clk; + struct i2c_msg *msg_queue; u8 *buf; struct completion msg_complete; struct i2c_adapter adapter; int msg_err; + int total_num; + int current_num; u32 bus_clk_rate; u32 isr_status; u16 msg_len; u8 addr; + bool restart_needed; }; static void mchp_corei2c_core_disable(struct mchp_corei2c_dev *idev) @@ -222,6 +230,47 @@ static int mchp_corei2c_fill_tx(struct mchp_corei2c_dev *idev) return 0; } +static void mchp_corei2c_next_msg(struct mchp_corei2c_dev *idev) +{ + struct i2c_msg *this_msg; + u8 ctrl; + + if (idev->current_num >= idev->total_num) { + complete(&idev->msg_complete); + return; + } + + /* + * If there's been an error, the isr needs to return control + * to the "main" part of the driver, so as not to keep sending + * messages once it completes and clears the SI bit. + */ + if (idev->msg_err) { + complete(&idev->msg_complete); + return; + } + + this_msg = idev->msg_queue++; + + if (idev->current_num < (idev->total_num - 1)) { + struct i2c_msg *next_msg = idev->msg_queue; + + idev->restart_needed = next_msg->flags & I2C_M_RD; + } else { + idev->restart_needed = false; + } + + idev->addr = i2c_8bit_addr_from_msg(this_msg); + idev->msg_len = this_msg->len; + idev->buf = this_msg->buf; + + ctrl = readb(idev->base + CORE_I2C_CTRL); + ctrl |= CTRL_STA; + writeb(ctrl, idev->base + CORE_I2C_CTRL); + + idev->current_num++; +} + static irqreturn_t mchp_corei2c_handle_isr(struct mchp_corei2c_dev *idev) { u32 status = idev->isr_status; @@ -247,10 +296,14 @@ static irqreturn_t mchp_corei2c_handle_isr(struct mchp_corei2c_dev *idev) break; case STATUS_M_SLAW_ACK: case STATUS_M_TX_DATA_ACK: - if (idev->msg_len > 0) + if (idev->msg_len > 0) { mchp_corei2c_fill_tx(idev); - else - last_byte = true; + } else { + if (idev->restart_needed) + finished = true; + else + last_byte = true; + } break; case STATUS_M_TX_DATA_NACK: case STATUS_M_SLAR_NACK: @@ -287,7 +340,7 @@ static irqreturn_t mchp_corei2c_handle_isr(struct mchp_corei2c_dev *idev) mchp_corei2c_stop(idev); if (last_byte || finished) - complete(&idev->msg_complete); + mchp_corei2c_next_msg(idev); return IRQ_HANDLED; } @@ -311,21 +364,48 @@ static irqreturn_t mchp_corei2c_isr(int irq, void *_dev) return ret; } -static int mchp_corei2c_xfer_msg(struct mchp_corei2c_dev *idev, - struct i2c_msg *msg) +static int mchp_corei2c_xfer(struct i2c_adapter *adap, struct i2c_msg *msgs, + int num) { - u8 ctrl; + struct mchp_corei2c_dev *idev = i2c_get_adapdata(adap); + struct i2c_msg *this_msg = msgs; unsigned long time_left; + u8 ctrl; + + mchp_corei2c_core_enable(idev); + + /* + * The isr controls the flow of a transfer, this info needs to be saved + * to a location that it can access the queue information from. + */ + idev->restart_needed = false; + idev->msg_queue = msgs; + idev->total_num = num; + idev->current_num = 0; - idev->addr = i2c_8bit_addr_from_msg(msg); - idev->msg_len = msg->len; - idev->buf = msg->buf; + /* + * But the first entry to the isr is triggered by the start in this + * function, so the first message needs to be "dequeued". + */ + idev->addr = i2c_8bit_addr_from_msg(this_msg); + idev->msg_len = this_msg->len; + idev->buf = this_msg->buf; idev->msg_err = 0; - reinit_completion(&idev->msg_complete); + if (idev->total_num > 1) { + struct i2c_msg *next_msg = msgs + 1; - mchp_corei2c_core_enable(idev); + idev->restart_needed = next_msg->flags & I2C_M_RD; + } + idev->current_num++; + idev->msg_queue++; + + reinit_completion(&idev->msg_complete); + + /* + * Send the first start to pass control to the isr + */ ctrl = readb(idev->base + CORE_I2C_CTRL); ctrl |= CTRL_STA; writeb(ctrl, idev->base + CORE_I2C_CTRL); @@ -335,20 +415,8 @@ static int mchp_corei2c_xfer_msg(struct mchp_corei2c_dev *idev, if (!time_left) return -ETIMEDOUT; - return idev->msg_err; -} - -static int mchp_corei2c_xfer(struct i2c_adapter *adap, struct i2c_msg *msgs, - int num) -{ - struct mchp_corei2c_dev *idev = i2c_get_adapdata(adap); - int i, ret; - - for (i = 0; i < num; i++) { - ret = mchp_corei2c_xfer_msg(idev, msgs++); - if (ret) - return ret; - } + if (idev->msg_err) + return idev->msg_err; return num; } From 49e1f0fd0d4cb03a16b8526c4e683e1958f71490 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Wed, 18 Dec 2024 12:07:42 +0000 Subject: [PATCH 1033/1450] i2c: microchip-core: fix "ghost" detections Running i2c-detect currently produces an output akin to: 0 1 2 3 4 5 6 7 8 9 a b c d e f 00: 08 -- 0a -- 0c -- 0e -- 10: 10 -- 12 -- 14 -- 16 -- UU 19 -- 1b -- 1d -- 1f 20: -- 21 -- 23 -- 25 -- 27 -- 29 -- 2b -- 2d -- 2f 30: -- -- -- -- -- -- -- -- 38 -- 3a -- 3c -- 3e -- 40: 40 -- 42 -- 44 -- 46 -- 48 -- 4a -- 4c -- 4e -- 50: -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 60: 60 -- 62 -- 64 -- 66 -- 68 -- 6a -- 6c -- 6e -- 70: 70 -- 72 -- 74 -- 76 -- This happens because for an i2c_msg with a len of 0 the driver will mark the transmission of the message as a success once the START has been sent, without waiting for the devices on the bus to respond with an ACK/NAK. Since i2cdetect seems to run in a tight loop over all addresses the NAK is treated as part of the next test for the next address. Delete the fast path that marks a message as complete when idev->msg_len is zero after sending a START/RESTART since this isn't a valid scenario. CC: stable@vger.kernel.org Fixes: 64a6f1c4987e ("i2c: add support for microchip fpga i2c controllers") Signed-off-by: Conor Dooley Reviewed-by: Andi Shyti Link: https://lore.kernel.org/r/20241218-outbid-encounter-b2e78b1cc707@spud Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-microchip-corei2c.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-microchip-corei2c.c b/drivers/i2c/busses/i2c-microchip-corei2c.c index 6a124e903c666..5db73429125c0 100644 --- a/drivers/i2c/busses/i2c-microchip-corei2c.c +++ b/drivers/i2c/busses/i2c-microchip-corei2c.c @@ -287,8 +287,6 @@ static irqreturn_t mchp_corei2c_handle_isr(struct mchp_corei2c_dev *idev) ctrl &= ~CTRL_STA; writeb(idev->addr, idev->base + CORE_I2C_DATA); writeb(ctrl, idev->base + CORE_I2C_CTRL); - if (idev->msg_len == 0) - finished = true; break; case STATUS_M_ARB_LOST: idev->msg_err = -EAGAIN; From 75cd4005da5492129917a4a4ee45e81660556104 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 25 Dec 2024 19:06:40 +0800 Subject: [PATCH 1034/1450] ublk: detach gendisk from ublk device if add_disk() fails Inside ublk_abort_requests(), gendisk is grabbed for aborting all inflight requests. And ublk_abort_requests() is called when exiting the uring context or handling timeout. If add_disk() fails, the gendisk may have been freed when calling ublk_abort_requests(), so use-after-free can be caused when getting disk's reference in ublk_abort_requests(). Fixes the bug by detaching gendisk from ublk device if add_disk() fails. Fixes: bd23f6c2c2d0 ("ublk: quiesce request queue when aborting queue") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20241225110640.351531-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index d4aed12dd436b..934ab9332c80a 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1618,6 +1618,21 @@ static void ublk_unquiesce_dev(struct ublk_device *ub) blk_mq_kick_requeue_list(ub->ub_disk->queue); } +static struct gendisk *ublk_detach_disk(struct ublk_device *ub) +{ + struct gendisk *disk; + + /* Sync with ublk_abort_queue() by holding the lock */ + spin_lock(&ub->lock); + disk = ub->ub_disk; + ub->dev_info.state = UBLK_S_DEV_DEAD; + ub->dev_info.ublksrv_pid = -1; + ub->ub_disk = NULL; + spin_unlock(&ub->lock); + + return disk; +} + static void ublk_stop_dev(struct ublk_device *ub) { struct gendisk *disk; @@ -1631,14 +1646,7 @@ static void ublk_stop_dev(struct ublk_device *ub) ublk_unquiesce_dev(ub); } del_gendisk(ub->ub_disk); - - /* Sync with ublk_abort_queue() by holding the lock */ - spin_lock(&ub->lock); - disk = ub->ub_disk; - ub->dev_info.state = UBLK_S_DEV_DEAD; - ub->dev_info.ublksrv_pid = -1; - ub->ub_disk = NULL; - spin_unlock(&ub->lock); + disk = ublk_detach_disk(ub); put_disk(disk); unlock: mutex_unlock(&ub->mutex); @@ -2336,7 +2344,7 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) out_put_cdev: if (ret) { - ub->dev_info.state = UBLK_S_DEV_DEAD; + ublk_detach_disk(ub); ublk_put_device(ub); } if (ret) From e33ac68e5e21ec1292490dfe061e75c0dbdd3bd4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 26 Dec 2024 16:49:23 +0000 Subject: [PATCH 1035/1450] io_uring/sqpoll: fix sqpoll error handling races BUG: KASAN: slab-use-after-free in __lock_acquire+0x370b/0x4a10 kernel/locking/lockdep.c:5089 Call Trace: ... _raw_spin_lock_irqsave+0x3d/0x60 kernel/locking/spinlock.c:162 class_raw_spinlock_irqsave_constructor include/linux/spinlock.h:551 [inline] try_to_wake_up+0xb5/0x23c0 kernel/sched/core.c:4205 io_sq_thread_park+0xac/0xe0 io_uring/sqpoll.c:55 io_sq_thread_finish+0x6b/0x310 io_uring/sqpoll.c:96 io_sq_offload_create+0x162/0x11d0 io_uring/sqpoll.c:497 io_uring_create io_uring/io_uring.c:3724 [inline] io_uring_setup+0x1728/0x3230 io_uring/io_uring.c:3806 ... Kun Hu reports that the SQPOLL creating error path has UAF, which happens if io_uring_alloc_task_context() fails and then io_sq_thread() manages to run and complete before the rest of error handling code, which means io_sq_thread_finish() is looking at already killed task. Note that this is mostly theoretical, requiring fault injection on the allocation side to trigger in practice. Cc: stable@vger.kernel.org Reported-by: Kun Hu Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0f2f1aa5729332612bd01fe0f2f385fd1f06ce7c.1735231717.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/sqpoll.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index 6df5e649c413e..9e5bd79fd2b5f 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -405,6 +405,7 @@ void io_sqpoll_wait_sq(struct io_ring_ctx *ctx) __cold int io_sq_offload_create(struct io_ring_ctx *ctx, struct io_uring_params *p) { + struct task_struct *task_to_put = NULL; int ret; /* Retain compatibility with failing for an invalid attach attempt */ @@ -480,6 +481,7 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, } sqd->thread = tsk; + task_to_put = get_task_struct(tsk); ret = io_uring_alloc_task_context(tsk, ctx); wake_up_new_task(tsk); if (ret) @@ -490,11 +492,15 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, goto err; } + if (task_to_put) + put_task_struct(task_to_put); return 0; err_sqpoll: complete(&ctx->sq_data->exited); err: io_sq_thread_finish(ctx); + if (task_to_put) + put_task_struct(task_to_put); return ret; } From 6cc45f8c1f898570916044f606be9890d295e129 Mon Sep 17 00:00:00 2001 From: Tomas Glozar Date: Wed, 27 Nov 2024 14:41:30 +0100 Subject: [PATCH 1036/1450] rtla/timerlat: Fix histogram ALL for zero samples rtla timerlat hist currently computers the minimum, maximum and average latency even in cases when there are zero samples. This leads to nonsensical values being calculated for maximum and minimum, and to divide by zero for average. A similar bug is fixed by 01b05fc0e5f3 ("rtla/timerlat: Fix histogram report when a cpu count is 0") but the bug still remains for printing the sum over all CPUs in timerlat_print_stats_all. The issue can be reproduced with this command: $ rtla timerlat hist -U -d 1s Index over: count: min: avg: max: Floating point exception (core dumped) (There are always no samples with -U unless the user workload is created.) Fix the bug by omitting max/min/avg when sample count is zero, displaying a dash instead, just like we already do for the individual CPUs. The logic is moved into a new function called format_summary_value, which is used for both the individual CPUs and for the overall summary. Cc: stable@vger.kernel.org Link: https://lore.kernel.org/20241127134130.51171-1-tglozar@redhat.com Fixes: 1462501c7a8 ("rtla/timerlat: Add a summary for hist mode") Signed-off-by: Tomas Glozar Signed-off-by: Steven Rostedt (Google) --- tools/tracing/rtla/src/timerlat_hist.c | 177 ++++++++++++++----------- 1 file changed, 96 insertions(+), 81 deletions(-) diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c index 8b66387e5f35c..4403cc4eba302 100644 --- a/tools/tracing/rtla/src/timerlat_hist.c +++ b/tools/tracing/rtla/src/timerlat_hist.c @@ -281,6 +281,21 @@ static void timerlat_hist_header(struct osnoise_tool *tool) trace_seq_reset(s); } +/* + * format_summary_value - format a line of summary value (min, max or avg) + * of hist data + */ +static void format_summary_value(struct trace_seq *seq, + int count, + unsigned long long val, + bool avg) +{ + if (count) + trace_seq_printf(seq, "%9llu ", avg ? val / count : val); + else + trace_seq_printf(seq, "%9c ", '-'); +} + /* * timerlat_print_summary - print the summary of the hist data to the output */ @@ -328,29 +343,23 @@ timerlat_print_summary(struct timerlat_hist_params *params, if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) continue; - if (!params->no_irq) { - if (data->hist[cpu].irq_count) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].min_irq); - else - trace_seq_printf(trace->seq, " - "); - } + if (!params->no_irq) + format_summary_value(trace->seq, + data->hist[cpu].irq_count, + data->hist[cpu].min_irq, + false); - if (!params->no_thread) { - if (data->hist[cpu].thread_count) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].min_thread); - else - trace_seq_printf(trace->seq, " - "); - } + if (!params->no_thread) + format_summary_value(trace->seq, + data->hist[cpu].thread_count, + data->hist[cpu].min_thread, + false); - if (params->user_hist) { - if (data->hist[cpu].user_count) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].min_user); - else - trace_seq_printf(trace->seq, " - "); - } + if (params->user_hist) + format_summary_value(trace->seq, + data->hist[cpu].user_count, + data->hist[cpu].min_user, + false); } trace_seq_printf(trace->seq, "\n"); @@ -364,29 +373,23 @@ timerlat_print_summary(struct timerlat_hist_params *params, if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) continue; - if (!params->no_irq) { - if (data->hist[cpu].irq_count) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].sum_irq / data->hist[cpu].irq_count); - else - trace_seq_printf(trace->seq, " - "); - } + if (!params->no_irq) + format_summary_value(trace->seq, + data->hist[cpu].irq_count, + data->hist[cpu].sum_irq, + true); - if (!params->no_thread) { - if (data->hist[cpu].thread_count) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].sum_thread / data->hist[cpu].thread_count); - else - trace_seq_printf(trace->seq, " - "); - } + if (!params->no_thread) + format_summary_value(trace->seq, + data->hist[cpu].thread_count, + data->hist[cpu].sum_thread, + true); - if (params->user_hist) { - if (data->hist[cpu].user_count) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].sum_user / data->hist[cpu].user_count); - else - trace_seq_printf(trace->seq, " - "); - } + if (params->user_hist) + format_summary_value(trace->seq, + data->hist[cpu].user_count, + data->hist[cpu].sum_user, + true); } trace_seq_printf(trace->seq, "\n"); @@ -400,29 +403,23 @@ timerlat_print_summary(struct timerlat_hist_params *params, if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) continue; - if (!params->no_irq) { - if (data->hist[cpu].irq_count) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].max_irq); - else - trace_seq_printf(trace->seq, " - "); - } + if (!params->no_irq) + format_summary_value(trace->seq, + data->hist[cpu].irq_count, + data->hist[cpu].max_irq, + false); - if (!params->no_thread) { - if (data->hist[cpu].thread_count) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].max_thread); - else - trace_seq_printf(trace->seq, " - "); - } + if (!params->no_thread) + format_summary_value(trace->seq, + data->hist[cpu].thread_count, + data->hist[cpu].max_thread, + false); - if (params->user_hist) { - if (data->hist[cpu].user_count) - trace_seq_printf(trace->seq, "%9llu ", - data->hist[cpu].max_user); - else - trace_seq_printf(trace->seq, " - "); - } + if (params->user_hist) + format_summary_value(trace->seq, + data->hist[cpu].user_count, + data->hist[cpu].max_user, + false); } trace_seq_printf(trace->seq, "\n"); trace_seq_do_printf(trace->seq); @@ -506,16 +503,22 @@ timerlat_print_stats_all(struct timerlat_hist_params *params, trace_seq_printf(trace->seq, "min: "); if (!params->no_irq) - trace_seq_printf(trace->seq, "%9llu ", - sum.min_irq); + format_summary_value(trace->seq, + sum.irq_count, + sum.min_irq, + false); if (!params->no_thread) - trace_seq_printf(trace->seq, "%9llu ", - sum.min_thread); + format_summary_value(trace->seq, + sum.thread_count, + sum.min_thread, + false); if (params->user_hist) - trace_seq_printf(trace->seq, "%9llu ", - sum.min_user); + format_summary_value(trace->seq, + sum.user_count, + sum.min_user, + false); trace_seq_printf(trace->seq, "\n"); @@ -523,16 +526,22 @@ timerlat_print_stats_all(struct timerlat_hist_params *params, trace_seq_printf(trace->seq, "avg: "); if (!params->no_irq) - trace_seq_printf(trace->seq, "%9llu ", - sum.sum_irq / sum.irq_count); + format_summary_value(trace->seq, + sum.irq_count, + sum.sum_irq, + true); if (!params->no_thread) - trace_seq_printf(trace->seq, "%9llu ", - sum.sum_thread / sum.thread_count); + format_summary_value(trace->seq, + sum.thread_count, + sum.sum_thread, + true); if (params->user_hist) - trace_seq_printf(trace->seq, "%9llu ", - sum.sum_user / sum.user_count); + format_summary_value(trace->seq, + sum.user_count, + sum.sum_user, + true); trace_seq_printf(trace->seq, "\n"); @@ -540,16 +549,22 @@ timerlat_print_stats_all(struct timerlat_hist_params *params, trace_seq_printf(trace->seq, "max: "); if (!params->no_irq) - trace_seq_printf(trace->seq, "%9llu ", - sum.max_irq); + format_summary_value(trace->seq, + sum.irq_count, + sum.max_irq, + false); if (!params->no_thread) - trace_seq_printf(trace->seq, "%9llu ", - sum.max_thread); + format_summary_value(trace->seq, + sum.thread_count, + sum.max_thread, + false); if (params->user_hist) - trace_seq_printf(trace->seq, "%9llu ", - sum.max_user); + format_summary_value(trace->seq, + sum.user_count, + sum.max_user, + false); trace_seq_printf(trace->seq, "\n"); trace_seq_do_printf(trace->seq); From 6b830c6a023ff6e8fe05dbe47a9e5cd276df09ee Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Sat, 21 Dec 2024 12:09:14 +0100 Subject: [PATCH 1037/1450] netlink: specs: mptcp: add missing 'server-side' attr This attribute is added with the 'created' and 'established' events, but the documentation didn't mention it. The documentation in the UAPI header has been auto-generated by: ./tools/net/ynl/ynl-regen.sh Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241221-net-mptcp-netlink-specs-pm-doc-fixes-v2-1-e54f2db3f844@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/mptcp_pm.yaml | 6 ++++-- include/uapi/linux/mptcp_pm.h | 11 ++++++----- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml index dc190bf838fec..fc0603f51665a 100644 --- a/Documentation/netlink/specs/mptcp_pm.yaml +++ b/Documentation/netlink/specs/mptcp_pm.yaml @@ -23,7 +23,8 @@ definitions: - name: created doc: - token, family, saddr4 | saddr6, daddr4 | daddr6, sport, dport + token, family, saddr4 | saddr6, daddr4 | daddr6, sport, dport, + server-side A new MPTCP connection has been created. It is the good time to allocate memory and send ADD_ADDR if needed. Depending on the traffic-patterns it can take a long time until the @@ -31,7 +32,8 @@ definitions: - name: established doc: - token, family, saddr4 | saddr6, daddr4 | daddr6, sport, dport + token, family, saddr4 | saddr6, daddr4 | daddr6, sport, dport, + server-side A MPTCP connection is established (can start new subflows). - name: closed diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h index 50589e5dd6a38..b34fd95b6f841 100644 --- a/include/uapi/linux/mptcp_pm.h +++ b/include/uapi/linux/mptcp_pm.h @@ -13,12 +13,13 @@ * enum mptcp_event_type * @MPTCP_EVENT_UNSPEC: unused event * @MPTCP_EVENT_CREATED: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport A new MPTCP connection has been created. It is the good time - * to allocate memory and send ADD_ADDR if needed. Depending on the - * traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED - * is sent. + * sport, dport, server-side A new MPTCP connection has been created. It is + * the good time to allocate memory and send ADD_ADDR if needed. Depending on + * the traffic-patterns it can take a long time until the + * MPTCP_EVENT_ESTABLISHED is sent. * @MPTCP_EVENT_ESTABLISHED: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport A MPTCP connection is established (can start new subflows). + * sport, dport, server-side A MPTCP connection is established (can start new + * subflows). * @MPTCP_EVENT_CLOSED: token A MPTCP connection has stopped. * @MPTCP_EVENT_ANNOUNCED: token, rem_id, family, daddr4 | daddr6 [, dport] A * new address has been announced by the peer. From bea87657b5ee8e6f18af2833ee4b88212ef52d28 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Sat, 21 Dec 2024 12:09:15 +0100 Subject: [PATCH 1038/1450] netlink: specs: mptcp: clearly mention attributes The rendered version of the MPTCP events [1] looked strange, because the whole content of the 'doc' was displayed in the same block. It was then not clear that the first words, not even ended by a period, were the attributes that are defined when such events are emitted. These attributes have now been moved to the end, prefixed by 'Attributes:' and ended with a period. Note that '>-' has been added after 'doc:' to allow ':' in the text below. The documentation in the UAPI header has been auto-generated by: ./tools/net/ynl/ynl-regen.sh Link: https://docs.kernel.org/networking/netlink_spec/mptcp_pm.html#event-type [1] Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241221-net-mptcp-netlink-specs-pm-doc-fixes-v2-2-e54f2db3f844@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/mptcp_pm.yaml | 50 ++++++++++----------- include/uapi/linux/mptcp_pm.h | 53 ++++++++++++----------- 2 files changed, 52 insertions(+), 51 deletions(-) diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml index fc0603f51665a..59087a2305651 100644 --- a/Documentation/netlink/specs/mptcp_pm.yaml +++ b/Documentation/netlink/specs/mptcp_pm.yaml @@ -22,67 +22,67 @@ definitions: doc: unused event - name: created - doc: - token, family, saddr4 | saddr6, daddr4 | daddr6, sport, dport, - server-side + doc: >- A new MPTCP connection has been created. It is the good time to allocate memory and send ADD_ADDR if needed. Depending on the traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED is sent. + Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, sport, + dport, server-side. - name: established - doc: - token, family, saddr4 | saddr6, daddr4 | daddr6, sport, dport, - server-side + doc: >- A MPTCP connection is established (can start new subflows). + Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, sport, + dport, server-side. - name: closed - doc: - token + doc: >- A MPTCP connection has stopped. + Attribute: token. - name: announced value: 6 - doc: - token, rem_id, family, daddr4 | daddr6 [, dport] + doc: >- A new address has been announced by the peer. + Attributes: token, rem_id, family, daddr4 | daddr6 [, dport]. - name: removed - doc: - token, rem_id + doc: >- An address has been lost by the peer. + Attributes: token, rem_id. - name: sub-established value: 10 - doc: - token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | daddr6, sport, - dport, backup, if_idx [, error] + doc: >- A new subflow has been established. 'error' should not be set. + Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | + daddr6, sport, dport, backup, if_idx [, error]. - name: sub-closed - doc: - token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | daddr6, sport, - dport, backup, if_idx [, error] + doc: >- A subflow has been closed. An error (copy of sk_err) could be set if an error has been detected for this subflow. + Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | + daddr6, sport, dport, backup, if_idx [, error]. - name: sub-priority value: 13 - doc: - token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | daddr6, sport, - dport, backup, if_idx [, error] + doc: >- The priority of a subflow has changed. 'error' should not be set. + Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | + daddr6, sport, dport, backup, if_idx [, error]. - name: listener-created value: 15 - doc: - family, sport, saddr4 | saddr6 + doc: >- A new PM listener is created. + Attributes: family, sport, saddr4 | saddr6. - name: listener-closed - doc: - family, sport, saddr4 | saddr6 + doc: >- A PM listener is closed. + Attributes: family, sport, saddr4 | saddr6. attribute-sets: - diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h index b34fd95b6f841..84fa8a21dfd02 100644 --- a/include/uapi/linux/mptcp_pm.h +++ b/include/uapi/linux/mptcp_pm.h @@ -12,32 +12,33 @@ /** * enum mptcp_event_type * @MPTCP_EVENT_UNSPEC: unused event - * @MPTCP_EVENT_CREATED: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport, server-side A new MPTCP connection has been created. It is - * the good time to allocate memory and send ADD_ADDR if needed. Depending on - * the traffic-patterns it can take a long time until the - * MPTCP_EVENT_ESTABLISHED is sent. - * @MPTCP_EVENT_ESTABLISHED: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport, server-side A MPTCP connection is established (can start new - * subflows). - * @MPTCP_EVENT_CLOSED: token A MPTCP connection has stopped. - * @MPTCP_EVENT_ANNOUNCED: token, rem_id, family, daddr4 | daddr6 [, dport] A - * new address has been announced by the peer. - * @MPTCP_EVENT_REMOVED: token, rem_id An address has been lost by the peer. - * @MPTCP_EVENT_SUB_ESTABLISHED: token, family, loc_id, rem_id, saddr4 | - * saddr6, daddr4 | daddr6, sport, dport, backup, if_idx [, error] A new - * subflow has been established. 'error' should not be set. - * @MPTCP_EVENT_SUB_CLOSED: token, family, loc_id, rem_id, saddr4 | saddr6, - * daddr4 | daddr6, sport, dport, backup, if_idx [, error] A subflow has been - * closed. An error (copy of sk_err) could be set if an error has been - * detected for this subflow. - * @MPTCP_EVENT_SUB_PRIORITY: token, family, loc_id, rem_id, saddr4 | saddr6, - * daddr4 | daddr6, sport, dport, backup, if_idx [, error] The priority of a - * subflow has changed. 'error' should not be set. - * @MPTCP_EVENT_LISTENER_CREATED: family, sport, saddr4 | saddr6 A new PM - * listener is created. - * @MPTCP_EVENT_LISTENER_CLOSED: family, sport, saddr4 | saddr6 A PM listener - * is closed. + * @MPTCP_EVENT_CREATED: A new MPTCP connection has been created. It is the + * good time to allocate memory and send ADD_ADDR if needed. Depending on the + * traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED + * is sent. Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, + * sport, dport, server-side. + * @MPTCP_EVENT_ESTABLISHED: A MPTCP connection is established (can start new + * subflows). Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, + * sport, dport, server-side. + * @MPTCP_EVENT_CLOSED: A MPTCP connection has stopped. Attribute: token. + * @MPTCP_EVENT_ANNOUNCED: A new address has been announced by the peer. + * Attributes: token, rem_id, family, daddr4 | daddr6 [, dport]. + * @MPTCP_EVENT_REMOVED: An address has been lost by the peer. Attributes: + * token, rem_id. + * @MPTCP_EVENT_SUB_ESTABLISHED: A new subflow has been established. 'error' + * should not be set. Attributes: token, family, loc_id, rem_id, saddr4 | + * saddr6, daddr4 | daddr6, sport, dport, backup, if_idx [, error]. + * @MPTCP_EVENT_SUB_CLOSED: A subflow has been closed. An error (copy of + * sk_err) could be set if an error has been detected for this subflow. + * Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | + * daddr6, sport, dport, backup, if_idx [, error]. + * @MPTCP_EVENT_SUB_PRIORITY: The priority of a subflow has changed. 'error' + * should not be set. Attributes: token, family, loc_id, rem_id, saddr4 | + * saddr6, daddr4 | daddr6, sport, dport, backup, if_idx [, error]. + * @MPTCP_EVENT_LISTENER_CREATED: A new PM listener is created. Attributes: + * family, sport, saddr4 | saddr6. + * @MPTCP_EVENT_LISTENER_CLOSED: A PM listener is closed. Attributes: family, + * sport, saddr4 | saddr6. */ enum mptcp_event_type { MPTCP_EVENT_UNSPEC, From 4f363fe9f6b28ed9b714cd7fe5ce880171927dab Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Sat, 21 Dec 2024 12:09:16 +0100 Subject: [PATCH 1039/1450] netlink: specs: mptcp: fix missing doc Two operations didn't have a small description. It looks like something that has been missed in the original commit introducing this file. Replace the two "todo" by a small and simple description: Create/Destroy subflow. While at it, also uniform the capital letters, avoid double spaces, and fix the "announce" event description: a new "address" has been announced, not a new "subflow". Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241221-net-mptcp-netlink-specs-pm-doc-fixes-v2-3-e54f2db3f844@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/mptcp_pm.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml index 59087a2305651..dfd017780d2f9 100644 --- a/Documentation/netlink/specs/mptcp_pm.yaml +++ b/Documentation/netlink/specs/mptcp_pm.yaml @@ -308,8 +308,8 @@ operations: attributes: - addr - - name: flush-addrs - doc: flush addresses + name: flush-addrs + doc: Flush addresses attribute-set: endpoint dont-validate: [ strict ] flags: [ uns-admin-perm ] @@ -353,7 +353,7 @@ operations: - addr-remote - name: announce - doc: announce new sf + doc: Announce new address attribute-set: attr dont-validate: [ strict ] flags: [ uns-admin-perm ] @@ -364,7 +364,7 @@ operations: - token - name: remove - doc: announce removal + doc: Announce removal attribute-set: attr dont-validate: [ strict ] flags: [ uns-admin-perm ] @@ -375,7 +375,7 @@ operations: - loc-id - name: subflow-create - doc: todo + doc: Create subflow attribute-set: attr dont-validate: [ strict ] flags: [ uns-admin-perm ] @@ -387,7 +387,7 @@ operations: - addr-remote - name: subflow-destroy - doc: todo + doc: Destroy subflow attribute-set: attr dont-validate: [ strict ] flags: [ uns-admin-perm ] From a024e377efed31ecfb39210bed562932321345b3 Mon Sep 17 00:00:00 2001 From: Antonio Pastor Date: Tue, 24 Dec 2024 20:07:20 -0500 Subject: [PATCH 1040/1450] net: llc: reset skb->transport_header 802.2+LLC+SNAP frames received by napi_complete_done with GRO and DSA have skb->transport_header set two bytes short, or pointing 2 bytes before network_header & skb->data. As snap_rcv expects transport_header to point to SNAP header (OID:PID) after LLC processing advances offset over LLC header (llc_rcv & llc_fixup_skb), code doesn't find a match and packet is dropped. Between napi_complete_done and snap_rcv, transport_header is not used until __netif_receive_skb_core, where originally it was being reset. Commit fda55eca5a33 ("net: introduce skb_transport_header_was_set()") only does so if not set, on the assumption the value was set correctly by GRO (and also on assumption that "network stacks usually reset the transport header anyway"). Afterwards it is moved forward by llc_fixup_skb. Locally generated traffic shows up at __netif_receive_skb_core with no transport_header set and is processed without issue. On a setup with GRO but no DSA, transport_header and network_header are both set to point to skb->data which is also correct. As issue is LLC specific, to avoid impacting non-LLC traffic, and to follow up on original assumption made on previous code change, llc_fixup_skb to reset the offset after skb pull. llc_fixup_skb assumes the LLC header is at skb->data, and by definition SNAP header immediately follows. Fixes: fda55eca5a33 ("net: introduce skb_transport_header_was_set()") Signed-off-by: Antonio Pastor Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241225010723.2830290-1-antonio.pastor@gmail.com Signed-off-by: Jakub Kicinski --- net/llc/llc_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c index 51bccfb00a9cd..61b0159b2fbee 100644 --- a/net/llc/llc_input.c +++ b/net/llc/llc_input.c @@ -124,8 +124,8 @@ static inline int llc_fixup_skb(struct sk_buff *skb) if (unlikely(!pskb_may_pull(skb, llc_len))) return 0; - skb->transport_header += llc_len; skb_pull(skb, llc_len); + skb_reset_transport_header(skb); if (skb->protocol == htons(ETH_P_802_2)) { __be16 pdulen; s32 data_size; From 4db3d750ac7e894278ef1cb1c53cc7d883060496 Mon Sep 17 00:00:00 2001 From: Leo Stone Date: Wed, 18 Dec 2024 10:49:57 -0800 Subject: [PATCH 1041/1450] nvmet: Don't overflow subsysnqn nvmet_root_discovery_nqn_store treats the subsysnqn string like a fixed size buffer, even though it is dynamically allocated to the size of the string. Create a new string with kstrndup instead of using the old buffer. Reported-by: syzbot+ff4aab278fa7e27e0f9e@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=ff4aab278fa7e27e0f9e Fixes: 95409e277d83 ("nvmet: implement unique discovery NQN") Signed-off-by: Leo Stone Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/target/configfs.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index eeee9e9b854c1..9c109b93ffbf8 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -2254,12 +2254,17 @@ static ssize_t nvmet_root_discovery_nqn_store(struct config_item *item, const char *page, size_t count) { struct list_head *entry; + char *old_nqn, *new_nqn; size_t len; len = strcspn(page, "\n"); if (!len || len > NVMF_NQN_FIELD_LEN - 1) return -EINVAL; + new_nqn = kstrndup(page, len, GFP_KERNEL); + if (!new_nqn) + return -ENOMEM; + down_write(&nvmet_config_sem); list_for_each(entry, &nvmet_subsystems_group.cg_children) { struct config_item *item = @@ -2268,13 +2273,15 @@ static ssize_t nvmet_root_discovery_nqn_store(struct config_item *item, if (!strncmp(config_item_name(item), page, len)) { pr_err("duplicate NQN %s\n", config_item_name(item)); up_write(&nvmet_config_sem); + kfree(new_nqn); return -EINVAL; } } - memset(nvmet_disc_subsys->subsysnqn, 0, NVMF_NQN_FIELD_LEN); - memcpy(nvmet_disc_subsys->subsysnqn, page, len); + old_nqn = nvmet_disc_subsys->subsysnqn; + nvmet_disc_subsys->subsysnqn = new_nqn; up_write(&nvmet_config_sem); + kfree(old_nqn); return len; } From b579d6fdc3a9149bb4d2b3133cc0767130ed13e6 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Tue, 17 Dec 2024 18:33:25 -0800 Subject: [PATCH 1042/1450] nvmet: propagate npwg topology Ensure we propagate npwg to the target as well instead of assuming its the same logical blocks per physical block. This ensures devices with large IUs information properly propagated on the target. Signed-off-by: Luis Chamberlain Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/io-cmd-bdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 0bda83d0fc3e0..eaf31c823cbe8 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -36,7 +36,7 @@ void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id) */ id->nsfeat |= 1 << 4; /* NPWG = Namespace Preferred Write Granularity. 0's based */ - id->npwg = lpp0b; + id->npwg = to0based(bdev_io_min(bdev) / bdev_logical_block_size(bdev)); /* NPWA = Namespace Preferred Write Alignment. 0's based */ id->npwa = id->npwg; /* NPDG = Namespace Preferred Deallocate Granularity. 0's based */ From 74d16965d7ac378d28ebd833ae6d6a097186a4ec Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Wed, 11 Dec 2024 14:28:06 +0530 Subject: [PATCH 1043/1450] nvmet-loop: avoid using mutex in IO hotpath Using mutex lock in IO hot path causes the kernel BUG sleeping while atomic. Shinichiro[1], first encountered this issue while running blktest nvme/052 shown below: BUG: sleeping function called from invalid context at kernel/locking/mutex.c:585 in_atomic(): 0, irqs_disabled(): 0, non_block: 0, pid: 996, name: (udev-worker) preempt_count: 0, expected: 0 RCU nest depth: 1, expected: 0 2 locks held by (udev-worker)/996: #0: ffff8881004570c8 (mapping.invalidate_lock){.+.+}-{3:3}, at: page_cache_ra_unbounded+0x155/0x5c0 #1: ffffffff8607eaa0 (rcu_read_lock){....}-{1:2}, at: blk_mq_flush_plug_list+0xa75/0x1950 CPU: 2 UID: 0 PID: 996 Comm: (udev-worker) Not tainted 6.12.0-rc3+ #339 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-2.fc40 04/01/2014 Call Trace: dump_stack_lvl+0x6a/0x90 __might_resched.cold+0x1f7/0x23d ? __pfx___might_resched+0x10/0x10 ? vsnprintf+0xdeb/0x18f0 __mutex_lock+0xf4/0x1220 ? nvmet_subsys_nsid_exists+0xb9/0x150 [nvmet] ? __pfx_vsnprintf+0x10/0x10 ? __pfx___mutex_lock+0x10/0x10 ? snprintf+0xa5/0xe0 ? xas_load+0x1ce/0x3f0 ? nvmet_subsys_nsid_exists+0xb9/0x150 [nvmet] nvmet_subsys_nsid_exists+0xb9/0x150 [nvmet] ? __pfx_nvmet_subsys_nsid_exists+0x10/0x10 [nvmet] nvmet_req_find_ns+0x24e/0x300 [nvmet] nvmet_req_init+0x694/0xd40 [nvmet] ? blk_mq_start_request+0x11c/0x750 ? nvme_setup_cmd+0x369/0x990 [nvme_core] nvme_loop_queue_rq+0x2a7/0x7a0 [nvme_loop] ? __pfx___lock_acquire+0x10/0x10 ? __pfx_nvme_loop_queue_rq+0x10/0x10 [nvme_loop] __blk_mq_issue_directly+0xe2/0x1d0 ? __pfx___blk_mq_issue_directly+0x10/0x10 ? blk_mq_request_issue_directly+0xc2/0x140 blk_mq_plug_issue_direct+0x13f/0x630 ? lock_acquire+0x2d/0xc0 ? blk_mq_flush_plug_list+0xa75/0x1950 blk_mq_flush_plug_list+0xa9d/0x1950 ? __pfx_blk_mq_flush_plug_list+0x10/0x10 ? __pfx_mpage_readahead+0x10/0x10 __blk_flush_plug+0x278/0x4d0 ? __pfx___blk_flush_plug+0x10/0x10 ? lock_release+0x460/0x7a0 blk_finish_plug+0x4e/0x90 read_pages+0x51b/0xbc0 ? __pfx_read_pages+0x10/0x10 ? lock_release+0x460/0x7a0 page_cache_ra_unbounded+0x326/0x5c0 force_page_cache_ra+0x1ea/0x2f0 filemap_get_pages+0x59e/0x17b0 ? __pfx_filemap_get_pages+0x10/0x10 ? lock_is_held_type+0xd5/0x130 ? __pfx___might_resched+0x10/0x10 ? find_held_lock+0x2d/0x110 filemap_read+0x317/0xb70 ? up_write+0x1ba/0x510 ? __pfx_filemap_read+0x10/0x10 ? inode_security+0x54/0xf0 ? selinux_file_permission+0x36d/0x420 blkdev_read_iter+0x143/0x3b0 vfs_read+0x6ac/0xa20 ? __pfx_vfs_read+0x10/0x10 ? __pfx_vm_mmap_pgoff+0x10/0x10 ? __pfx___seccomp_filter+0x10/0x10 ksys_read+0xf7/0x1d0 ? __pfx_ksys_read+0x10/0x10 do_syscall_64+0x93/0x180 ? lockdep_hardirqs_on_prepare+0x16d/0x400 ? do_syscall_64+0x9f/0x180 ? lockdep_hardirqs_on+0x78/0x100 ? do_syscall_64+0x9f/0x180 ? lockdep_hardirqs_on_prepare+0x16d/0x400 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7f565bd1ce11 Code: 00 48 8b 15 09 90 0d 00 f7 d8 64 89 02 b8 ff ff ff ff eb bd e8 d0 ad 01 00 f3 0f 1e fa 80 3d 35 12 0e 00 00 74 13 31 c0 0f 05 <48> 3d 00 f0 ff ff 77 4f c3 66 0f 1f 44 00 00 55 48 89 e5 48 83 ec RSP: 002b:00007ffd6e7a20c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 0000000000001000 RCX: 00007f565bd1ce11 RDX: 0000000000001000 RSI: 00007f565babb000 RDI: 0000000000000014 RBP: 00007ffd6e7a2130 R08: 00000000ffffffff R09: 0000000000000000 R10: 0000556000bfa610 R11: 0000000000000246 R12: 000000003ffff000 R13: 0000556000bfa5b0 R14: 0000000000000e00 R15: 0000556000c07328 Apparently, the above issue is caused due to using mutex lock while we're in IO hot path. It's a regression caused with commit 505363957fad ("nvmet: fix nvme status code when namespace is disabled"). The mutex ->su_mutex is used to find whether a disabled nsid exists in the config group or not. This is to differentiate between a nsid that is disabled vs non-existent. To mitigate the above issue, we've worked upon a fix[2] where we now insert nsid in subsys Xarray as soon as it's created under config group and later when that nsid is enabled, we add an Xarray mark on it and set ns->enabled to true. The Xarray mark is useful while we need to loop through all enabled namepsaces under a subsystem using xa_for_each_marked() API. If later a nsid is disabled then we clear Xarray mark from it and also set ns->enabled to false. It's only when nsid is deleted from the config group we delete it from the Xarray. So with this change, now we could easily differentiate a nsid is disabled (i.e. Xarray entry for ns exists but ns->enabled is set to false) vs non- existent (i.e.Xarray entry for ns doesn't exist). Link: https://lore.kernel.org/linux-nvme/20241022070252.GA11389@lst.de/ [2] Reported-by: Shinichiro Kawasaki Closes: https://lore.kernel.org/linux-nvme/tqcy3sveity7p56v7ywp7ssyviwcb3w4623cnxj3knoobfcanq@yxgt2mjkbkam/ [1] Fixes: 505363957fad ("nvmet: fix nvme status code when namespace is disabled") Fix-suggested-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Nilay Shroff Signed-off-by: Keith Busch --- drivers/nvme/target/admin-cmd.c | 9 +-- drivers/nvme/target/configfs.c | 12 ---- drivers/nvme/target/core.c | 108 +++++++++++++++++++------------- drivers/nvme/target/nvmet.h | 7 +++ drivers/nvme/target/pr.c | 8 +-- 5 files changed, 79 insertions(+), 65 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 2962794ce881e..fa89b0549c36c 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -139,7 +139,7 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req, unsigned long idx; ctrl = req->sq->ctrl; - xa_for_each(&ctrl->subsys->namespaces, idx, ns) { + nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns) { /* we don't have the right data for file backed ns */ if (!ns->bdev) continue; @@ -331,9 +331,10 @@ static u32 nvmet_format_ana_group(struct nvmet_req *req, u32 grpid, u32 count = 0; if (!(req->cmd->get_log_page.lsp & NVME_ANA_LOG_RGO)) { - xa_for_each(&ctrl->subsys->namespaces, idx, ns) + nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns) { if (ns->anagrpid == grpid) desc->nsids[count++] = cpu_to_le32(ns->nsid); + } } desc->grpid = cpu_to_le32(grpid); @@ -772,7 +773,7 @@ static void nvmet_execute_identify_endgrp_list(struct nvmet_req *req) goto out; } - xa_for_each(&ctrl->subsys->namespaces, idx, ns) { + nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns) { if (ns->nsid <= min_endgid) continue; @@ -815,7 +816,7 @@ static void nvmet_execute_identify_nslist(struct nvmet_req *req, bool match_css) goto out; } - xa_for_each(&ctrl->subsys->namespaces, idx, ns) { + nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns) { if (ns->nsid <= min_nsid) continue; if (match_css && req->ns->csi != req->cmd->identify.csi) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 9c109b93ffbf8..2b030f0efc386 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -810,18 +810,6 @@ static struct configfs_attribute *nvmet_ns_attrs[] = { NULL, }; -bool nvmet_subsys_nsid_exists(struct nvmet_subsys *subsys, u32 nsid) -{ - struct config_item *ns_item; - char name[12]; - - snprintf(name, sizeof(name), "%u", nsid); - mutex_lock(&subsys->namespaces_group.cg_subsys->su_mutex); - ns_item = config_group_find_item(&subsys->namespaces_group, name); - mutex_unlock(&subsys->namespaces_group.cg_subsys->su_mutex); - return ns_item != NULL; -} - static void nvmet_ns_release(struct config_item *item) { struct nvmet_ns *ns = to_nvmet_ns(item); diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 1f4e9989663be..fde6c555af619 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -127,7 +127,7 @@ static u32 nvmet_max_nsid(struct nvmet_subsys *subsys) unsigned long idx; u32 nsid = 0; - xa_for_each(&subsys->namespaces, idx, cur) + nvmet_for_each_enabled_ns(&subsys->namespaces, idx, cur) nsid = cur->nsid; return nsid; @@ -441,11 +441,14 @@ u16 nvmet_req_find_ns(struct nvmet_req *req) struct nvmet_subsys *subsys = nvmet_req_subsys(req); req->ns = xa_load(&subsys->namespaces, nsid); - if (unlikely(!req->ns)) { + if (unlikely(!req->ns || !req->ns->enabled)) { req->error_loc = offsetof(struct nvme_common_command, nsid); - if (nvmet_subsys_nsid_exists(subsys, nsid)) - return NVME_SC_INTERNAL_PATH_ERROR; - return NVME_SC_INVALID_NS | NVME_STATUS_DNR; + if (!req->ns) /* ns doesn't exist! */ + return NVME_SC_INVALID_NS | NVME_STATUS_DNR; + + /* ns exists but it's disabled */ + req->ns = NULL; + return NVME_SC_INTERNAL_PATH_ERROR; } percpu_ref_get(&req->ns->ref); @@ -583,8 +586,6 @@ int nvmet_ns_enable(struct nvmet_ns *ns) goto out_unlock; ret = -EMFILE; - if (subsys->nr_namespaces == NVMET_MAX_NAMESPACES) - goto out_unlock; ret = nvmet_bdev_ns_enable(ns); if (ret == -ENOTBLK) @@ -599,38 +600,19 @@ int nvmet_ns_enable(struct nvmet_ns *ns) list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) nvmet_p2pmem_ns_add_p2p(ctrl, ns); - ret = percpu_ref_init(&ns->ref, nvmet_destroy_namespace, - 0, GFP_KERNEL); - if (ret) - goto out_dev_put; - - if (ns->nsid > subsys->max_nsid) - subsys->max_nsid = ns->nsid; - - ret = xa_insert(&subsys->namespaces, ns->nsid, ns, GFP_KERNEL); - if (ret) - goto out_restore_subsys_maxnsid; - if (ns->pr.enable) { ret = nvmet_pr_init_ns(ns); if (ret) - goto out_remove_from_subsys; + goto out_dev_put; } - subsys->nr_namespaces++; - nvmet_ns_changed(subsys, ns->nsid); ns->enabled = true; + xa_set_mark(&subsys->namespaces, ns->nsid, NVMET_NS_ENABLED); ret = 0; out_unlock: mutex_unlock(&subsys->lock); return ret; - -out_remove_from_subsys: - xa_erase(&subsys->namespaces, ns->nsid); -out_restore_subsys_maxnsid: - subsys->max_nsid = nvmet_max_nsid(subsys); - percpu_ref_exit(&ns->ref); out_dev_put: list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid)); @@ -649,15 +631,37 @@ void nvmet_ns_disable(struct nvmet_ns *ns) goto out_unlock; ns->enabled = false; - xa_erase(&ns->subsys->namespaces, ns->nsid); - if (ns->nsid == subsys->max_nsid) - subsys->max_nsid = nvmet_max_nsid(subsys); + xa_clear_mark(&subsys->namespaces, ns->nsid, NVMET_NS_ENABLED); list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid)); mutex_unlock(&subsys->lock); + if (ns->pr.enable) + nvmet_pr_exit_ns(ns); + + mutex_lock(&subsys->lock); + nvmet_ns_changed(subsys, ns->nsid); + nvmet_ns_dev_disable(ns); +out_unlock: + mutex_unlock(&subsys->lock); +} + +void nvmet_ns_free(struct nvmet_ns *ns) +{ + struct nvmet_subsys *subsys = ns->subsys; + + nvmet_ns_disable(ns); + + mutex_lock(&subsys->lock); + + xa_erase(&subsys->namespaces, ns->nsid); + if (ns->nsid == subsys->max_nsid) + subsys->max_nsid = nvmet_max_nsid(subsys); + + mutex_unlock(&subsys->lock); + /* * Now that we removed the namespaces from the lookup list, we * can kill the per_cpu ref and wait for any remaining references @@ -671,21 +675,9 @@ void nvmet_ns_disable(struct nvmet_ns *ns) wait_for_completion(&ns->disable_done); percpu_ref_exit(&ns->ref); - if (ns->pr.enable) - nvmet_pr_exit_ns(ns); - mutex_lock(&subsys->lock); - subsys->nr_namespaces--; - nvmet_ns_changed(subsys, ns->nsid); - nvmet_ns_dev_disable(ns); -out_unlock: mutex_unlock(&subsys->lock); -} - -void nvmet_ns_free(struct nvmet_ns *ns) -{ - nvmet_ns_disable(ns); down_write(&nvmet_ana_sem); nvmet_ana_group_enabled[ns->anagrpid]--; @@ -699,15 +691,33 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid) { struct nvmet_ns *ns; + mutex_lock(&subsys->lock); + + if (subsys->nr_namespaces == NVMET_MAX_NAMESPACES) + goto out_unlock; + ns = kzalloc(sizeof(*ns), GFP_KERNEL); if (!ns) - return NULL; + goto out_unlock; init_completion(&ns->disable_done); ns->nsid = nsid; ns->subsys = subsys; + if (percpu_ref_init(&ns->ref, nvmet_destroy_namespace, 0, GFP_KERNEL)) + goto out_free; + + if (ns->nsid > subsys->max_nsid) + subsys->max_nsid = nsid; + + if (xa_insert(&subsys->namespaces, ns->nsid, ns, GFP_KERNEL)) + goto out_exit; + + subsys->nr_namespaces++; + + mutex_unlock(&subsys->lock); + down_write(&nvmet_ana_sem); ns->anagrpid = NVMET_DEFAULT_ANA_GRPID; nvmet_ana_group_enabled[ns->anagrpid]++; @@ -718,6 +728,14 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid) ns->csi = NVME_CSI_NVM; return ns; +out_exit: + subsys->max_nsid = nvmet_max_nsid(subsys); + percpu_ref_exit(&ns->ref); +out_free: + kfree(ns); +out_unlock: + mutex_unlock(&subsys->lock); + return NULL; } static void nvmet_update_sq_head(struct nvmet_req *req) @@ -1394,7 +1412,7 @@ static void nvmet_setup_p2p_ns_map(struct nvmet_ctrl *ctrl, ctrl->p2p_client = get_device(req->p2p_client); - xa_for_each(&ctrl->subsys->namespaces, idx, ns) + nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns) nvmet_p2pmem_ns_add_p2p(ctrl, ns); } diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 58328b35dc968..7233549f7c8a0 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -24,6 +24,7 @@ #define NVMET_DEFAULT_VS NVME_VS(2, 1, 0) +#define NVMET_NS_ENABLED XA_MARK_1 #define NVMET_ASYNC_EVENTS 4 #define NVMET_ERROR_LOG_SLOTS 128 #define NVMET_NO_ERROR_LOC ((u16)-1) @@ -33,6 +34,12 @@ #define NVMET_FR_MAX_SIZE 8 #define NVMET_PR_LOG_QUEUE_SIZE 64 +#define nvmet_for_each_ns(xa, index, entry) \ + xa_for_each(xa, index, entry) + +#define nvmet_for_each_enabled_ns(xa, index, entry) \ + xa_for_each_marked(xa, index, entry, NVMET_NS_ENABLED) + /* * Supported optional AENs: */ diff --git a/drivers/nvme/target/pr.c b/drivers/nvme/target/pr.c index 90e9f5bbe5815..cd22d8333314e 100644 --- a/drivers/nvme/target/pr.c +++ b/drivers/nvme/target/pr.c @@ -60,7 +60,7 @@ u16 nvmet_set_feat_resv_notif_mask(struct nvmet_req *req, u32 mask) goto success; } - xa_for_each(&ctrl->subsys->namespaces, idx, ns) { + nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns) { if (ns->pr.enable) WRITE_ONCE(ns->pr.notify_mask, mask); } @@ -1056,7 +1056,7 @@ int nvmet_ctrl_init_pr(struct nvmet_ctrl *ctrl) * nvmet_pr_init_ns(), see more details in nvmet_ns_enable(). * So just check ns->pr.enable. */ - xa_for_each(&subsys->namespaces, idx, ns) { + nvmet_for_each_enabled_ns(&subsys->namespaces, idx, ns) { if (ns->pr.enable) { ret = nvmet_pr_alloc_and_insert_pc_ref(ns, ctrl->cntlid, &ctrl->hostid); @@ -1067,7 +1067,7 @@ int nvmet_ctrl_init_pr(struct nvmet_ctrl *ctrl) return 0; free_per_ctrl_refs: - xa_for_each(&subsys->namespaces, idx, ns) { + nvmet_for_each_enabled_ns(&subsys->namespaces, idx, ns) { if (ns->pr.enable) { pc_ref = xa_erase(&ns->pr_per_ctrl_refs, ctrl->cntlid); if (pc_ref) @@ -1087,7 +1087,7 @@ void nvmet_ctrl_destroy_pr(struct nvmet_ctrl *ctrl) kfifo_free(&ctrl->pr_log_mgr.log_queue); mutex_destroy(&ctrl->pr_log_mgr.lock); - xa_for_each(&ctrl->subsys->namespaces, idx, ns) { + nvmet_for_each_enabled_ns(&ctrl->subsys->namespaces, idx, ns) { if (ns->pr.enable) { pc_ref = xa_erase(&ns->pr_per_ctrl_refs, ctrl->cntlid); if (pc_ref) From 36e3b1f9abe359b2bc25e81bc47b64354e42c9b1 Mon Sep 17 00:00:00 2001 From: "Chunguang.xu" Date: Tue, 3 Dec 2024 11:39:55 +0800 Subject: [PATCH 1044/1450] nvme-tcp: remove nvme_tcp_destroy_io_queues() Now when destroying the IO queue we call nvme_tcp_stop_io_queues() twice, nvme_tcp_destroy_io_queues() has an unnecessary call. Here we try to remove nvme_tcp_destroy_io_queues() and merge it into nvme_tcp_teardown_io_queues(), simplify the code and align with nvme-rdma, make it easy to maintaince. Signed-off-by: Chunguang.xu Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 28c76a3e1bd22..b127d41dbbfee 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2024,14 +2024,6 @@ static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) return __nvme_tcp_alloc_io_queues(ctrl); } -static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove) -{ - nvme_tcp_stop_io_queues(ctrl); - if (remove) - nvme_remove_io_tag_set(ctrl); - nvme_tcp_free_io_queues(ctrl); -} - static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) { int ret, nr_queues; @@ -2176,9 +2168,11 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, nvme_sync_io_queues(ctrl); nvme_tcp_stop_io_queues(ctrl); nvme_cancel_tagset(ctrl); - if (remove) + if (remove) { nvme_unquiesce_io_queues(ctrl); - nvme_tcp_destroy_io_queues(ctrl, remove); + nvme_remove_io_tag_set(ctrl); + } + nvme_tcp_free_io_queues(ctrl); } static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl, @@ -2267,7 +2261,9 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) nvme_sync_io_queues(ctrl); nvme_tcp_stop_io_queues(ctrl); nvme_cancel_tagset(ctrl); - nvme_tcp_destroy_io_queues(ctrl, new); + if (new) + nvme_remove_io_tag_set(ctrl); + nvme_tcp_free_io_queues(ctrl); } destroy_admin: nvme_stop_keep_alive(ctrl); From 7a6c355b55c051eb37cb15d191241da3aa3d6cba Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Mon, 23 Dec 2024 12:44:53 +0000 Subject: [PATCH 1045/1450] scripts/mksysmap: Fix escape chars '$' Commit b18b047002b7 ("kbuild: change scripts/mksysmap into sed script") changed the invocation of the script, to call sed directly without shell. That means, the current extra escape that was added in: commit ec336aa83162 ("scripts/mksysmap: Fix badly escaped '$'") for the shell is not correct any more, at the moment the stack traces for nvhe are corrupted: [ 22.840904] kvm [190]: [] __kvm_nvhe_$x.220+0x58/0x9c [ 22.842913] kvm [190]: [] __kvm_nvhe_$x.9+0x44/0x50 [ 22.844112] kvm [190]: [] __kvm_nvhe___skip_pauth_save+0x4/0x4 With this patch: [ 25.793513] kvm [192]: nVHE call trace: [ 25.794141] kvm [192]: [] __kvm_nvhe_hyp_panic+0xb0/0xf4 [ 25.796590] kvm [192]: [] __kvm_nvhe_handle_trap+0xe4/0x188 [ 25.797553] kvm [192]: [] __kvm_nvhe___skip_pauth_save+0x4/0x4 Fixes: b18b047002b7 ("kbuild: change scripts/mksysmap into sed script") Signed-off-by: Mostafa Saleh Reviewed-by: Nathan Chancellor Signed-off-by: Masahiro Yamada --- scripts/mksysmap | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/mksysmap b/scripts/mksysmap index c12723a046556..3accbdb269ac7 100755 --- a/scripts/mksysmap +++ b/scripts/mksysmap @@ -26,7 +26,7 @@ # (do not forget a space before each pattern) # local symbols for ARM, MIPS, etc. -/ \\$/d +/ \$/d # local labels, .LBB, .Ltmpxxx, .L__unnamed_xx, .LASANPC, etc. / \.L/d @@ -39,7 +39,7 @@ / __pi_\.L/d # arm64 local symbols in non-VHE KVM namespace -/ __kvm_nvhe_\\$/d +/ __kvm_nvhe_\$/d / __kvm_nvhe_\.L/d # lld arm/aarch64/mips thunks From bf36b4bf1b9a7a0015610e2f038ee84ddb085de2 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 26 Dec 2024 00:33:35 +0900 Subject: [PATCH 1046/1450] modpost: fix the missed iteration for the max bit in do_input() This loop should iterate over the range from 'min' to 'max' inclusively. The last interation is missed. Fixes: 1d8f430c15b3 ("[PATCH] Input: add modalias support") Signed-off-by: Masahiro Yamada Tested-by: John Paul Adrian Glaubitz --- scripts/mod/file2alias.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index 5b5745f00eb30..ff263c2859778 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -656,7 +656,7 @@ static void do_input(char *alias, for (i = min / BITS_PER_LONG; i < max / BITS_PER_LONG + 1; i++) arr[i] = TO_NATIVE(arr[i]); - for (i = min; i < max; i++) + for (i = min; i <= max; i++) if (arr[i / BITS_PER_LONG] & (1ULL << (i%BITS_PER_LONG))) sprintf(alias + strlen(alias), "%X,*", i); } From e1352d7ead2b8803689823cd4059c1ec72609ed4 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 26 Dec 2024 00:33:36 +0900 Subject: [PATCH 1047/1450] modpost: refactor do_vmbus_entry() Optimize the size of guid_name[], as it only requires 1 additional byte for '\0' instead of 2. Simplify the loop by incrementing the iterator by 1 instead of 2. Remove the unnecessary TO_NATIVE() call, as the guid is represented as a byte stream. Signed-off-by: Masahiro Yamada Tested-by: John Paul Adrian Glaubitz --- scripts/mod/file2alias.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index ff263c2859778..2c7b76d4e8ece 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -812,15 +812,13 @@ static void do_virtio_entry(struct module *mod, void *symval) * Each byte of the guid will be represented by two hex characters * in the name. */ - static void do_vmbus_entry(struct module *mod, void *symval) { - int i; DEF_FIELD_ADDR(symval, hv_vmbus_device_id, guid); - char guid_name[(sizeof(*guid) + 1) * 2]; + char guid_name[sizeof(*guid) * 2 + 1]; - for (i = 0; i < (sizeof(*guid) * 2); i += 2) - sprintf(&guid_name[i], "%02x", TO_NATIVE((guid->b)[i/2])); + for (int i = 0; i < sizeof(*guid); i++) + sprintf(&guid_name[i * 2], "%02x", guid->b[i]); module_alias_printf(mod, false, "vmbus:%s", guid_name); } From 8fe1a63d3d99d86f1bdc034505aad6fc70424737 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 26 Dec 2024 00:33:37 +0900 Subject: [PATCH 1048/1450] modpost: work around unaligned data access error With the latest binutils, modpost fails with a bus error on some architectures such as ARM and sparc64. Since binutils commit 1f1b5e506bf0 ("bfd/ELF: restrict file alignment for object files"), the byte offset to each section (sh_offset) in relocatable ELF is no longer guaranteed to be aligned. modpost parses MODULE_DEVICE_TABLE() data structures, which are usually located in the .rodata section. If it is not properly aligned, unaligned access errors may occur. To address the issue, this commit imports the get_unaligned() helper from include/linux/unaligned.h. The get_unaligned_native() helper caters to the endianness in addition to handling the unaligned access. I slightly refactored do_pcmcia_entry() and do_input() to avoid writing back to an unaligned address. (We would need the put_unaligned() helper to do that.) The addend_*_rel() functions need similar adjustments because the .text sections are not aligned either. It seems that the .symtab, .rel.* and .rela.* sections are still aligned. Keep normal pointer access for these sections to avoid unnecessary performance costs. Reported-by: Paulo Pisati Reported-by: Matthias Klose Closes: https://sourceware.org/bugzilla/show_bug.cgi?id=32435 Reported-by: John Paul Adrian Glaubitz Closes: https://sourceware.org/bugzilla/show_bug.cgi?id=32493 Signed-off-by: Masahiro Yamada Tested-by: John Paul Adrian Glaubitz --- scripts/mod/file2alias.c | 26 +++++++++++++------------- scripts/mod/modpost.c | 24 ++++++++++++------------ scripts/mod/modpost.h | 14 ++++++++++++++ 3 files changed, 39 insertions(+), 25 deletions(-) diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index 2c7b76d4e8ece..19ec72a69e90c 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -132,7 +132,8 @@ struct devtable { * based at address m. */ #define DEF_FIELD(m, devid, f) \ - typeof(((struct devid *)0)->f) f = TO_NATIVE(*(typeof(f) *)((m) + OFF_##devid##_##f)) + typeof(((struct devid *)0)->f) f = \ + get_unaligned_native((typeof(f) *)((m) + OFF_##devid##_##f)) /* Define a variable f that holds the address of field f of struct devid * based at address m. Due to the way typeof works, for a field of type @@ -600,7 +601,7 @@ static void do_pnp_card_entry(struct module *mod, void *symval) static void do_pcmcia_entry(struct module *mod, void *symval) { char alias[256] = {}; - unsigned int i; + DEF_FIELD(symval, pcmcia_device_id, match_flags); DEF_FIELD(symval, pcmcia_device_id, manf_id); DEF_FIELD(symval, pcmcia_device_id, card_id); @@ -609,10 +610,6 @@ static void do_pcmcia_entry(struct module *mod, void *symval) DEF_FIELD(symval, pcmcia_device_id, device_no); DEF_FIELD_ADDR(symval, pcmcia_device_id, prod_id_hash); - for (i=0; i<4; i++) { - (*prod_id_hash)[i] = TO_NATIVE((*prod_id_hash)[i]); - } - ADD(alias, "m", match_flags & PCMCIA_DEV_ID_MATCH_MANF_ID, manf_id); ADD(alias, "c", match_flags & PCMCIA_DEV_ID_MATCH_CARD_ID, @@ -623,10 +620,14 @@ static void do_pcmcia_entry(struct module *mod, void *symval) function); ADD(alias, "pfn", match_flags & PCMCIA_DEV_ID_MATCH_DEVICE_NO, device_no); - ADD(alias, "pa", match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID1, (*prod_id_hash)[0]); - ADD(alias, "pb", match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID2, (*prod_id_hash)[1]); - ADD(alias, "pc", match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID3, (*prod_id_hash)[2]); - ADD(alias, "pd", match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID4, (*prod_id_hash)[3]); + ADD(alias, "pa", match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID1, + get_unaligned_native(*prod_id_hash + 0)); + ADD(alias, "pb", match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID2, + get_unaligned_native(*prod_id_hash + 1)); + ADD(alias, "pc", match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID3, + get_unaligned_native(*prod_id_hash + 2)); + ADD(alias, "pd", match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID4, + get_unaligned_native(*prod_id_hash + 3)); module_alias_printf(mod, true, "pcmcia:%s", alias); } @@ -654,10 +655,9 @@ static void do_input(char *alias, { unsigned int i; - for (i = min / BITS_PER_LONG; i < max / BITS_PER_LONG + 1; i++) - arr[i] = TO_NATIVE(arr[i]); for (i = min; i <= max; i++) - if (arr[i / BITS_PER_LONG] & (1ULL << (i%BITS_PER_LONG))) + if (get_unaligned_native(arr + i / BITS_PER_LONG) & + (1ULL << (i % BITS_PER_LONG))) sprintf(alias + strlen(alias), "%X,*", i); } diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 94ee49207a45a..7ea59dc4926b3 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -1138,9 +1138,9 @@ static Elf_Addr addend_386_rel(uint32_t *location, unsigned int r_type) { switch (r_type) { case R_386_32: - return TO_NATIVE(*location); + return get_unaligned_native(location); case R_386_PC32: - return TO_NATIVE(*location) + 4; + return get_unaligned_native(location) + 4; } return (Elf_Addr)(-1); @@ -1161,24 +1161,24 @@ static Elf_Addr addend_arm_rel(void *loc, Elf_Sym *sym, unsigned int r_type) switch (r_type) { case R_ARM_ABS32: case R_ARM_REL32: - inst = TO_NATIVE(*(uint32_t *)loc); + inst = get_unaligned_native((uint32_t *)loc); return inst + sym->st_value; case R_ARM_MOVW_ABS_NC: case R_ARM_MOVT_ABS: - inst = TO_NATIVE(*(uint32_t *)loc); + inst = get_unaligned_native((uint32_t *)loc); offset = sign_extend32(((inst & 0xf0000) >> 4) | (inst & 0xfff), 15); return offset + sym->st_value; case R_ARM_PC24: case R_ARM_CALL: case R_ARM_JUMP24: - inst = TO_NATIVE(*(uint32_t *)loc); + inst = get_unaligned_native((uint32_t *)loc); offset = sign_extend32((inst & 0x00ffffff) << 2, 25); return offset + sym->st_value + 8; case R_ARM_THM_MOVW_ABS_NC: case R_ARM_THM_MOVT_ABS: - upper = TO_NATIVE(*(uint16_t *)loc); - lower = TO_NATIVE(*((uint16_t *)loc + 1)); + upper = get_unaligned_native((uint16_t *)loc); + lower = get_unaligned_native((uint16_t *)loc + 1); offset = sign_extend32(((upper & 0x000f) << 12) | ((upper & 0x0400) << 1) | ((lower & 0x7000) >> 4) | @@ -1195,8 +1195,8 @@ static Elf_Addr addend_arm_rel(void *loc, Elf_Sym *sym, unsigned int r_type) * imm11 = lower[10:0] * imm32 = SignExtend(S:J2:J1:imm6:imm11:'0') */ - upper = TO_NATIVE(*(uint16_t *)loc); - lower = TO_NATIVE(*((uint16_t *)loc + 1)); + upper = get_unaligned_native((uint16_t *)loc); + lower = get_unaligned_native((uint16_t *)loc + 1); sign = (upper >> 10) & 1; j1 = (lower >> 13) & 1; @@ -1219,8 +1219,8 @@ static Elf_Addr addend_arm_rel(void *loc, Elf_Sym *sym, unsigned int r_type) * I2 = NOT(J2 XOR S) * imm32 = SignExtend(S:I1:I2:imm10:imm11:'0') */ - upper = TO_NATIVE(*(uint16_t *)loc); - lower = TO_NATIVE(*((uint16_t *)loc + 1)); + upper = get_unaligned_native((uint16_t *)loc); + lower = get_unaligned_native((uint16_t *)loc + 1); sign = (upper >> 10) & 1; j1 = (lower >> 13) & 1; @@ -1241,7 +1241,7 @@ static Elf_Addr addend_mips_rel(uint32_t *location, unsigned int r_type) { uint32_t inst; - inst = TO_NATIVE(*location); + inst = get_unaligned_native(location); switch (r_type) { case R_MIPS_LO16: return inst & 0xffff; diff --git a/scripts/mod/modpost.h b/scripts/mod/modpost.h index 8b72c227ebf47..ffd0a52a606ef 100644 --- a/scripts/mod/modpost.h +++ b/scripts/mod/modpost.h @@ -65,6 +65,20 @@ #define TO_NATIVE(x) \ (target_is_big_endian == host_is_big_endian ? x : bswap(x)) +#define __get_unaligned_t(type, ptr) ({ \ + const struct { type x; } __attribute__((__packed__)) *__pptr = \ + (typeof(__pptr))(ptr); \ + __pptr->x; \ +}) + +#define get_unaligned(ptr) __get_unaligned_t(typeof(*(ptr)), (ptr)) + +#define get_unaligned_native(ptr) \ +({ \ + typeof(*(ptr)) _val = get_unaligned(ptr); \ + TO_NATIVE(_val); \ +}) + #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) #define strstarts(str, prefix) (strncmp(str, prefix, strlen(prefix)) == 0) From 38fc96a58ce40257aec79b32e9b310c86907c63c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 28 Dec 2024 17:44:52 +0000 Subject: [PATCH 1049/1450] io_uring/rw: fix downgraded mshot read The io-wq path can downgrade a multishot request to oneshot mode, however io_read_mshot() doesn't handle that and would still post multiple CQEs. That's not allowed, because io_req_post_cqe() requires stricter context requirements. The described can only happen with pollable files that don't support FMODE_NOWAIT, which is an odd combination, so if even allowed it should be fairly rare. Cc: stable@vger.kernel.org Reported-by: chase xd Fixes: bee1d5becdf5b ("io_uring: disable io-wq execution of multishot NOWAIT requests") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c5c8c4a50a882fd581257b81bf52eee260ac29fd.1735407848.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rw.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/io_uring/rw.c b/io_uring/rw.c index 0bcb83e4ce3cf..29bb3010f9c06 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -983,6 +983,8 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) io_kbuf_recycle(req, issue_flags); if (ret < 0) req_set_fail(req); + } else if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { + cflags = io_put_kbuf(req, ret, issue_flags); } else { /* * Any successful return value will keep the multishot read From b06a6187ef983f501e93faa56209169752d3bde3 Mon Sep 17 00:00:00 2001 From: Tanya Agarwal Date: Sun, 29 Dec 2024 11:32:42 +0530 Subject: [PATCH 1050/1450] ALSA: usb-audio: US16x08: Initialize array before use Initialize meter_urb array before use in mixer_us16x08.c. CID 1410197: (#1 of 1): Uninitialized scalar variable (UNINIT) uninit_use_in_call: Using uninitialized value *meter_urb when calling get_meter_levels_from_urb. Coverity Link: https://scan7.scan.coverity.com/#/project-view/52849/11354?selectedIssue=1410197 Fixes: d2bb390a2081 ("ALSA: usb-audio: Tascam US-16x08 DSP mixer quirk") Signed-off-by: Tanya Agarwal Link: https://patch.msgid.link/20241229060240.1642-1-tanyaagarwal25699@gmail.com Signed-off-by: Takashi Iwai --- sound/usb/mixer_us16x08.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/usb/mixer_us16x08.c b/sound/usb/mixer_us16x08.c index 6eb7d93b358d9..20ac32635f1f5 100644 --- a/sound/usb/mixer_us16x08.c +++ b/sound/usb/mixer_us16x08.c @@ -687,7 +687,7 @@ static int snd_us16x08_meter_get(struct snd_kcontrol *kcontrol, struct usb_mixer_elem_info *elem = kcontrol->private_data; struct snd_usb_audio *chip = elem->head.mixer->chip; struct snd_us16x08_meter_store *store = elem->private_data; - u8 meter_urb[64]; + u8 meter_urb[64] = {0}; switch (kcontrol->private_value) { case 0: { From 31ad36a271290648e7c2288a03d7b933d20254d6 Mon Sep 17 00:00:00 2001 From: chenchangcheng Date: Fri, 20 Dec 2024 15:48:47 +0800 Subject: [PATCH 1051/1450] objtool: Add bch2_trans_unlocked_error() to bcachefs noreturns Fix the following objtool warning during build time: fs/bcachefs/btree_trans_commit.o: warning: objtool: bch2_trans_commit_write_locked.isra.0() falls through to next function do_bch2_trans_commit.isra.0() fs/bcachefs/btree_trans_commit.o: warning: objtool: .text: unexpected end of section ...... fs/bcachefs/btree_update.o: warning: objtool: bch2_trans_update_get_key_cache() falls through to next function flush_new_cached_update() fs/bcachefs/btree_update.o: warning: objtool: flush_new_cached_update() falls through to next function bch2_trans_update_by_path() bch2_trans_unlocked_error() is an Obviously Correct (tm) panic() wrapper, add it to the list of known noreturns. [ mingo: Improved the changelog ] Fixes: fd104e2967b7 ("bcachefs: bch2_trans_verify_not_unlocked()") Signed-off-by: chenchangcheng Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20241220074847.3418134-1-ccc194101@163.com --- tools/objtool/noreturns.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h index f37614cc2c1b2..b2174894f9f71 100644 --- a/tools/objtool/noreturns.h +++ b/tools/objtool/noreturns.h @@ -19,6 +19,7 @@ NORETURN(__x64_sys_exit_group) NORETURN(arch_cpu_idle_dead) NORETURN(bch2_trans_in_restart_error) NORETURN(bch2_trans_restart_error) +NORETURN(bch2_trans_unlocked_error) NORETURN(cpu_bringup_and_idle) NORETURN(cpu_startup_entry) NORETURN(do_exit) From f718faf3940e95d5d34af9041f279f598396ab7d Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Tue, 17 Dec 2024 00:48:18 +0000 Subject: [PATCH 1052/1450] freezer, sched: Report frozen tasks as 'D' instead of 'R' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before commit: f5d39b020809 ("freezer,sched: Rewrite core freezer logic") the frozen task stat was reported as 'D' in cgroup v1. However, after rewriting the core freezer logic, the frozen task stat is reported as 'R'. This is confusing, especially when a task with stat of 'S' is frozen. This bug can be reproduced with these steps: $ cd /sys/fs/cgroup/freezer/ $ mkdir test $ sleep 1000 & [1] 739 // task whose stat is 'S' $ echo 739 > test/cgroup.procs $ echo FROZEN > test/freezer.state $ ps -aux | grep 739 root 739 0.1 0.0 8376 1812 pts/0 R 10:56 0:00 sleep 1000 As shown above, a task whose stat is 'S' was changed to 'R' when it was frozen. To solve this regression, simply maintain the same reported state as before the rewrite. [ mingo: Enhanced the changelog and comments ] Fixes: f5d39b020809 ("freezer,sched: Rewrite core freezer logic") Signed-off-by: Chen Ridong Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Tejun Heo Acked-by: Michal Koutný Link: https://lore.kernel.org/r/20241217004818.3200515-1-chenridong@huaweicloud.com --- include/linux/sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 66b311fbd5d60..64934e0830af3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1637,8 +1637,9 @@ static inline unsigned int __task_state_index(unsigned int tsk_state, * We're lying here, but rather than expose a completely new task state * to userspace, we can make this appear as if the task has gone through * a regular rt_mutex_lock() call. + * Report frozen tasks as uninterruptible. */ - if (tsk_state & TASK_RTLOCK_WAIT) + if ((tsk_state & TASK_RTLOCK_WAIT) || (tsk_state & TASK_FROZEN)) state = TASK_UNINTERRUPTIBLE; return fls(state); From dc81e556f2a017d681251ace21bf06c126d5a192 Mon Sep 17 00:00:00 2001 From: "Xin Li (Intel)" Date: Wed, 13 Nov 2024 09:59:34 -0800 Subject: [PATCH 1053/1450] x86/fred: Clear WFE in missing-ENDBRANCH #CPs An indirect branch instruction sets the CPU indirect branch tracker (IBT) into WAIT_FOR_ENDBRANCH (WFE) state and WFE stays asserted across the instruction boundary. When the decoder finds an inappropriate instruction while WFE is set ENDBR, the CPU raises a #CP fault. For the "kernel IBT no ENDBR" selftest where #CPs are deliberately triggered, the WFE state of the interrupted context needs to be cleared to let execution continue. Otherwise when the CPU resumes from the instruction that just caused the previous #CP, another missing-ENDBRANCH #CP is raised and the CPU enters a dead loop. This is not a problem with IDT because it doesn't preserve WFE and IRET doesn't set WFE. But FRED provides space on the entry stack (in an expanded CS area) to save and restore the WFE state, thus the WFE state is no longer clobbered, so software must clear it. Clear WFE to avoid dead looping in ibt_clear_fred_wfe() and the !ibt_fatal code path when execution is allowed to continue. Clobbering WFE in any other circumstance is a security-relevant bug. [ dhansen: changelog rewording ] Fixes: a5f6c2ace997 ("x86/shstk: Add user control-protection fault handler") Signed-off-by: Xin Li (Intel) Signed-off-by: Dave Hansen Signed-off-by: Ingo Molnar Acked-by: Dave Hansen Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20241113175934.3897541-1-xin%40zytor.com --- arch/x86/kernel/cet.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c index d2c732a34e5d9..303bf74d175b3 100644 --- a/arch/x86/kernel/cet.c +++ b/arch/x86/kernel/cet.c @@ -81,6 +81,34 @@ static void do_user_cp_fault(struct pt_regs *regs, unsigned long error_code) static __ro_after_init bool ibt_fatal = true; +/* + * By definition, all missing-ENDBRANCH #CPs are a result of WFE && !ENDBR. + * + * For the kernel IBT no ENDBR selftest where #CPs are deliberately triggered, + * the WFE state of the interrupted context needs to be cleared to let execution + * continue. Otherwise when the CPU resumes from the instruction that just + * caused the previous #CP, another missing-ENDBRANCH #CP is raised and the CPU + * enters a dead loop. + * + * This is not a problem with IDT because it doesn't preserve WFE and IRET doesn't + * set WFE. But FRED provides space on the entry stack (in an expanded CS area) + * to save and restore the WFE state, thus the WFE state is no longer clobbered, + * so software must clear it. + */ +static void ibt_clear_fred_wfe(struct pt_regs *regs) +{ + /* + * No need to do any FRED checks. + * + * For IDT event delivery, the high-order 48 bits of CS are pushed + * as 0s into the stack, and later IRET ignores these bits. + * + * For FRED, a test to check if fred_cs.wfe is set would be dropped + * by compilers. + */ + regs->fred_cs.wfe = 0; +} + static void do_kernel_cp_fault(struct pt_regs *regs, unsigned long error_code) { if ((error_code & CP_EC) != CP_ENDBR) { @@ -90,6 +118,7 @@ static void do_kernel_cp_fault(struct pt_regs *regs, unsigned long error_code) if (unlikely(regs->ip == (unsigned long)&ibt_selftest_noendbr)) { regs->ax = 0; + ibt_clear_fred_wfe(regs); return; } @@ -97,6 +126,7 @@ static void do_kernel_cp_fault(struct pt_regs *regs, unsigned long error_code) if (!ibt_fatal) { printk(KERN_DEFAULT CUT_HERE); __warn(__FILE__, __LINE__, (void *)regs->ip, TAINT_WARN, regs, NULL); + ibt_clear_fred_wfe(regs); return; } BUG(); From 27834971f616c5e154423c578fa95e0444444ce1 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Wed, 19 Jun 2024 19:18:01 +0800 Subject: [PATCH 1054/1450] virt: tdx-guest: Just leak decrypted memory on unrecoverable errors In CoCo VMs it is possible for the untrusted host to cause set_memory_decrypted() to fail such that an error is returned and the resulting memory is shared. Callers need to take care to handle these errors to avoid returning decrypted (shared) memory to the page allocator, which could lead to functional or security issues. Leak the decrypted memory when set_memory_decrypted() fails, and don't need to print an error since set_memory_decrypted() will call WARN_ONCE(). Fixes: f4738f56d1dc ("virt: tdx-guest: Add Quote generation support using TSM_REPORTS") Signed-off-by: Li RongQing Signed-off-by: Dave Hansen Signed-off-by: Ingo Molnar Reviewed-by: Rick Edgecombe Reviewed-by: Kirill A. Shutemov Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20240619111801.25630-1-lirongqing%40baidu.com --- drivers/virt/coco/tdx-guest/tdx-guest.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/virt/coco/tdx-guest/tdx-guest.c b/drivers/virt/coco/tdx-guest/tdx-guest.c index d7db6c824e13d..224e7dde9cdee 100644 --- a/drivers/virt/coco/tdx-guest/tdx-guest.c +++ b/drivers/virt/coco/tdx-guest/tdx-guest.c @@ -124,10 +124,8 @@ static void *alloc_quote_buf(void) if (!addr) return NULL; - if (set_memory_decrypted((unsigned long)addr, count)) { - free_pages_exact(addr, len); + if (set_memory_decrypted((unsigned long)addr, count)) return NULL; - } return addr; } From 032fe9b0516702599c2dd990a4703f783d5716b8 Mon Sep 17 00:00:00 2001 From: Mingcong Bai Date: Thu, 26 Dec 2024 14:22:05 +0800 Subject: [PATCH 1055/1450] platform/x86: hp-wmi: mark 8A15 board for timed OMEN thermal profile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The HP OMEN 8 (2022), corresponding to a board ID of 8A15, supports OMEN thermal profile and requires the timed profile quirk. Upon adding this ID to both the omen_thermal_profile_boards and omen_timed_thermal_profile_boards, significant bump in performance can be observed. For instance, SilverBench (https://silver.urih.com/) results improved from ~56,000 to ~69,000, as a result of higher power draws (and thus core frequencies) whilst under load: Package Power: - Before the patch: ~65W (dropping to about 55W under sustained load). - After the patch: ~115W (dropping to about 105W under sustained load). Core Power: - Before: ~60W (ditto above). - After: ~108W (ditto above). Add 8A15 to omen_thermal_profile_boards and omen_timed_thermal_profile_boards to improve performance. Signed-off-by: Xi Xiao <1577912515@qq.com> Signed-off-by: Mingcong Bai Link: https://lore.kernel.org/r/20241226062207.3352629-1-jeffbai@aosc.io Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/hp/hp-wmi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/hp/hp-wmi.c b/drivers/platform/x86/hp/hp-wmi.c index 81ccc96ffe40a..20c55bab3b8cc 100644 --- a/drivers/platform/x86/hp/hp-wmi.c +++ b/drivers/platform/x86/hp/hp-wmi.c @@ -64,7 +64,7 @@ static const char * const omen_thermal_profile_boards[] = { "874A", "8603", "8604", "8748", "886B", "886C", "878A", "878B", "878C", "88C8", "88CB", "8786", "8787", "8788", "88D1", "88D2", "88F4", "88FD", "88F5", "88F6", "88F7", "88FE", "88FF", "8900", "8901", "8902", "8912", - "8917", "8918", "8949", "894A", "89EB", "8BAD", "8A42" + "8917", "8918", "8949", "894A", "89EB", "8BAD", "8A42", "8A15" }; /* DMI Board names of Omen laptops that are specifically set to be thermal @@ -80,7 +80,7 @@ static const char * const omen_thermal_profile_force_v0_boards[] = { * "balanced" when reaching zero. */ static const char * const omen_timed_thermal_profile_boards[] = { - "8BAD", "8A42" + "8BAD", "8A42", "8A15" }; /* DMI Board names of Victus laptops */ From 7e16ae558a87ac9099b6a93a43f19b42d809fd78 Mon Sep 17 00:00:00 2001 From: Vishnu Sankar Date: Sat, 28 Dec 2024 08:18:40 +0900 Subject: [PATCH 1056/1450] platform/x86: thinkpad-acpi: Add support for hotkey 0x1401 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit F8 mode key on Lenovo 2025 platforms use a different key code. Adding support for the new keycode 0x1401. Tested on X1 Carbon Gen 13 and X1 2-in-1 Gen 10. Signed-off-by: Vishnu Sankar Reviewed-by: Mark Pearson Link: https://lore.kernel.org/r/20241227231840.21334-1-vishnuocv@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- Documentation/admin-guide/laptops/thinkpad-acpi.rst | 10 +++++++--- drivers/platform/x86/thinkpad_acpi.c | 4 +++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/laptops/thinkpad-acpi.rst b/Documentation/admin-guide/laptops/thinkpad-acpi.rst index 7f674a6cfa8a7..4ab0fef7d440d 100644 --- a/Documentation/admin-guide/laptops/thinkpad-acpi.rst +++ b/Documentation/admin-guide/laptops/thinkpad-acpi.rst @@ -445,8 +445,10 @@ event code Key Notes 0x1008 0x07 FN+F8 IBM: toggle screen expand Lenovo: configure UltraNav, or toggle screen expand. - On newer platforms (2024+) - replaced by 0x131f (see below) + On 2024 platforms replaced by + 0x131f (see below) and on newer + platforms (2025 +) keycode is + replaced by 0x1401 (see below). 0x1009 0x08 FN+F9 - @@ -506,9 +508,11 @@ event code Key Notes 0x1019 0x18 unknown -0x131f ... FN+F8 Platform Mode change. +0x131f ... FN+F8 Platform Mode change (2024 systems). Implemented in driver. +0x1401 ... FN+F8 Platform Mode change (2025 + systems). + Implemented in driver. ... ... ... 0x1020 0x1F unknown diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 6371a9f765c13..2cfb2ac3f465a 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -184,7 +184,8 @@ enum tpacpi_hkey_event_t { */ TP_HKEY_EV_AMT_TOGGLE = 0x131a, /* Toggle AMT on/off */ TP_HKEY_EV_DOUBLETAP_TOGGLE = 0x131c, /* Toggle trackpoint doubletap on/off */ - TP_HKEY_EV_PROFILE_TOGGLE = 0x131f, /* Toggle platform profile */ + TP_HKEY_EV_PROFILE_TOGGLE = 0x131f, /* Toggle platform profile in 2024 systems */ + TP_HKEY_EV_PROFILE_TOGGLE2 = 0x1401, /* Toggle platform profile in 2025 + systems */ /* Reasons for waking up from S3/S4 */ TP_HKEY_EV_WKUP_S3_UNDOCK = 0x2304, /* undock requested, S3 */ @@ -11200,6 +11201,7 @@ static bool tpacpi_driver_event(const unsigned int hkey_event) tp_features.trackpoint_doubletap = !tp_features.trackpoint_doubletap; return true; case TP_HKEY_EV_PROFILE_TOGGLE: + case TP_HKEY_EV_PROFILE_TOGGLE2: platform_profile_cycle(); return true; } From fc033cf25e612e840e545f8d5ad2edd6ba613ed5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 29 Dec 2024 13:15:45 -0800 Subject: [PATCH 1057/1450] Linux 6.13-rc5 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 5c9b1d2d59b4d..48e89108aa58c 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 13 SUBLEVEL = 0 -EXTRAVERSION = -rc4 +EXTRAVERSION = -rc5 NAME = Baby Opossum Posse # *DOCUMENTATION* From 6a451e2c5c03e27aa3ec36be424fccaa286c3ccd Mon Sep 17 00:00:00 2001 From: Baojun Xu Date: Mon, 30 Dec 2024 14:49:10 +0800 Subject: [PATCH 1058/1450] ALSA: hda/tas2781: Ignore SUBSYS_ID not found for tas2563 projects Driver will return error if no SUBSYS_ID found in BIOS(acpi). It will cause error in tas2563 projects, which have no SUBSYS_ID. Fixes: 4e7035a75da9 ("ALSA: hda/tas2781: Add speaker id check for ASUS projects") Signed-off-by: Baojun Xu Link: https://lore.kernel.org/20241223225442.1358491-1-stuart.a.hayhurst@gmail.com Link: https://patch.msgid.link/20241230064910.1583-1-baojun.xu@ti.com Signed-off-by: Takashi Iwai --- sound/pci/hda/tas2781_hda_i2c.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/pci/hda/tas2781_hda_i2c.c b/sound/pci/hda/tas2781_hda_i2c.c index 0af015806aba5..0e42b87dadb8a 100644 --- a/sound/pci/hda/tas2781_hda_i2c.c +++ b/sound/pci/hda/tas2781_hda_i2c.c @@ -142,6 +142,9 @@ static int tas2781_read_acpi(struct tasdevice_priv *p, const char *hid) } sub = acpi_get_subsystem_id(ACPI_HANDLE(physdev)); if (IS_ERR(sub)) { + /* No subsys id in older tas2563 projects. */ + if (!strncmp(hid, "INT8866", sizeof("INT8866"))) + goto end_2563; dev_err(p->dev, "Failed to get SUBSYS ID.\n"); ret = PTR_ERR(sub); goto err; @@ -164,6 +167,7 @@ static int tas2781_read_acpi(struct tasdevice_priv *p, const char *hid) p->speaker_id = NULL; } +end_2563: acpi_dev_free_resource_list(&resources); strscpy(p->dev_name, hid, sizeof(p->dev_name)); put_device(physdev); From ac9fae799eda81e24bbf2e0d5cb9e5c33fc9bdcb Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sun, 29 Dec 2024 09:39:13 +0100 Subject: [PATCH 1059/1450] ALSA: compress_offload: Drop unneeded no_free_ptr() The error path for memdup_user() no longer needs the tricky wrap with no_free_ptr() and we can safely return the error pointer directly. Fixes: 04177158cf98 ("ALSA: compress_offload: introduce accel operation mode") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202412290846.cncnpGaw-lkp@intel.com/ Link: https://patch.msgid.link/20241229083917.14912-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/compress_offload.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/core/compress_offload.c b/sound/core/compress_offload.c index edf5aadf38e5a..4ed6cec5fd5c2 100644 --- a/sound/core/compress_offload.c +++ b/sound/core/compress_offload.c @@ -1077,7 +1077,7 @@ static int snd_compr_task_create(struct snd_compr_stream *stream, unsigned long return -EPERM; task = memdup_user((void __user *)arg, sizeof(*task)); if (IS_ERR(task)) - return PTR_ERR(no_free_ptr(task)); + return PTR_ERR(task); retval = snd_compr_task_new(stream, task); if (retval >= 0) if (copy_to_user((void __user *)arg, task, sizeof(*task))) @@ -1138,7 +1138,7 @@ static int snd_compr_task_start_ioctl(struct snd_compr_stream *stream, unsigned return -EPERM; task = memdup_user((void __user *)arg, sizeof(*task)); if (IS_ERR(task)) - return PTR_ERR(no_free_ptr(task)); + return PTR_ERR(task); retval = snd_compr_task_start(stream, task); if (retval >= 0) if (copy_to_user((void __user *)arg, task, sizeof(*task))) @@ -1229,7 +1229,7 @@ static int snd_compr_task_status_ioctl(struct snd_compr_stream *stream, unsigned return -EPERM; status = memdup_user((void __user *)arg, sizeof(*status)); if (IS_ERR(status)) - return PTR_ERR(no_free_ptr(status)); + return PTR_ERR(status); retval = snd_compr_task_status(stream, status); if (retval >= 0) if (copy_to_user((void __user *)arg, status, sizeof(*status))) From 7439b395211874e20c24b2fe0e4903864357a3f5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 29 Dec 2024 18:52:32 +0000 Subject: [PATCH 1060/1450] ALSA: compress_offload: fix remaining descriptor races in sound/core/compress_offload.c 3d3f43fab4cf ("ALSA: compress_offload: improve file descriptors installation for dma-buf") fixed some of descriptor races in snd_compr_task_new(), but there's a couple more left. We need to grab the references to dmabuf before moving them into descriptor table - trying to do that by descriptor afterwards might end up getting a different object, with a dangling reference left in task->{input,output} Fixes: 3d3f43fab4cf ("ALSA: compress_offload: improve file descriptors installation for dma-buf") Signed-off-by: Al Viro Link: https://patch.msgid.link/20241229185232.GA1977892@ZenIV Signed-off-by: Takashi Iwai --- sound/core/compress_offload.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/core/compress_offload.c b/sound/core/compress_offload.c index 4ed6cec5fd5c2..840bb9cfe7890 100644 --- a/sound/core/compress_offload.c +++ b/sound/core/compress_offload.c @@ -1053,13 +1053,13 @@ static int snd_compr_task_new(struct snd_compr_stream *stream, struct snd_compr_ put_unused_fd(fd_i); goto cleanup; } + /* keep dmabuf reference until freed with task free ioctl */ + get_dma_buf(task->input); + get_dma_buf(task->output); fd_install(fd_i, task->input->file); fd_install(fd_o, task->output->file); utask->input_fd = fd_i; utask->output_fd = fd_o; - /* keep dmabuf reference until freed with task free ioctl */ - dma_buf_get(utask->input_fd); - dma_buf_get(utask->output_fd); list_add_tail(&task->list, &stream->runtime->tasks); stream->runtime->total_tasks++; return 0; From 0179488ca992d79908b8e26b9213f1554fc5bacc Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 30 Dec 2024 12:05:35 +0100 Subject: [PATCH 1061/1450] ALSA: seq: oss: Fix races at processing SysEx messages OSS sequencer handles the SysEx messages split in 6 bytes packets, and ALSA sequencer OSS layer tries to combine those. It stores the data in the internal buffer and this access is racy as of now, which may lead to the out-of-bounds access. As a temporary band-aid fix, introduce a mutex for serializing the process of the SysEx message packets. Reported-by: Kun Hu Closes: https://lore.kernel.org/2B7E93E4-B13A-4AE4-8E87-306A8EE9BBB7@m.fudan.edu.cn Cc: Link: https://patch.msgid.link/20241230110543.32454-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/seq/oss/seq_oss_synth.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/core/seq/oss/seq_oss_synth.c b/sound/core/seq/oss/seq_oss_synth.c index e3394919daa09..51ee4c00a8431 100644 --- a/sound/core/seq/oss/seq_oss_synth.c +++ b/sound/core/seq/oss/seq_oss_synth.c @@ -66,6 +66,7 @@ static struct seq_oss_synth midi_synth_dev = { }; static DEFINE_SPINLOCK(register_lock); +static DEFINE_MUTEX(sysex_mutex); /* * prototypes @@ -497,6 +498,7 @@ snd_seq_oss_synth_sysex(struct seq_oss_devinfo *dp, int dev, unsigned char *buf, if (!info) return -ENXIO; + guard(mutex)(&sysex_mutex); sysex = info->sysex; if (sysex == NULL) { sysex = kzalloc(sizeof(*sysex), GFP_KERNEL); From abbff41b6932cde359589fd51f4024b7c85f366b Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 30 Dec 2024 12:40:22 +0100 Subject: [PATCH 1062/1450] Revert "ALSA: ump: Don't enumeration invalid groups for legacy rawmidi" This reverts commit c2d188e137e77294323132a760a4608321a36a70. Although it's fine to filter the invalid UMP groups at the first probe time, this will become a problem when UMP groups are updated and (re-)activated. Then there is no way to re-add the substreams properly for the legacy rawmidi, and the new active groups will be still invisible. So let's revert the change. This will move back to showing the full 16 groups, but it's better than forever lost. Link: https://patch.msgid.link/20241230114023.3787-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/ump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/core/ump.c b/sound/core/ump.c index fe4d39ae11593..9198bff4768ce 100644 --- a/sound/core/ump.c +++ b/sound/core/ump.c @@ -1244,7 +1244,7 @@ static int fill_legacy_mapping(struct snd_ump_endpoint *ump) num = 0; for (i = 0; i < SNDRV_UMP_MAX_GROUPS; i++) - if ((group_maps & (1U << i)) && ump->groups[i].valid) + if (group_maps & (1U << i)) ump->legacy_mapping[num++] = i; return num; From cc0dc9e871a91aadf5b26a2d7760fb762e0d9203 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Le=20Goffic?= Date: Wed, 18 Dec 2024 10:22:27 +0100 Subject: [PATCH 1063/1450] watchdog: stm32_iwdg: fix error message during driver probe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit 3ab1663af6c1 ("watchdog: stm32_iwdg: Add pretimeout support") introduces the support for the pre-timeout interrupt. The support for this interrupt is optional but the driver uses the platform_get_irq() which produces an error message during the driver probe if we don't have any `interrupts` property in the DT. Use the platform_get_irq_optional() API to get rid of the error message as this property is optional. Fixes: 3ab1663af6c1 ("watchdog: stm32_iwdg: Add pretimeout support") Signed-off-by: Clément Le Goffic Reviewed-by: Marek Vasut Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20241218092227.771133-1-clement.legoffic@foss.st.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/stm32_iwdg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/watchdog/stm32_iwdg.c b/drivers/watchdog/stm32_iwdg.c index d700e0d49bb95..8ad06b54c5adc 100644 --- a/drivers/watchdog/stm32_iwdg.c +++ b/drivers/watchdog/stm32_iwdg.c @@ -286,7 +286,7 @@ static int stm32_iwdg_irq_init(struct platform_device *pdev, if (!wdt->data->has_early_wakeup) return 0; - irq = platform_get_irq(pdev, 0); + irq = platform_get_irq_optional(pdev, 0); if (irq <= 0) return 0; From bddfe23be8f84e66b1920140a6e11400fae4f74a Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Wed, 25 Dec 2024 01:24:23 +0000 Subject: [PATCH 1064/1450] net: mac802154: Remove unused ieee802154_mlme_tx_one ieee802154_mlme_tx_one() was added in 2022 by commit ddd9ee7cda12 ("net: mac802154: Introduce a synchronous API for MLME commands") but has remained unused. Remove it. Note, there's still a ieee802154_mlme_tx_one_locked() variant that is used. Signed-off-by: Dr. David Alan Gilbert Acked-by: Miquel Raynal Link: https://lore.kernel.org/20241225012423.439229-1-linux@treblig.org Signed-off-by: Stefan Schmidt --- net/mac802154/ieee802154_i.h | 3 --- net/mac802154/tx.c | 13 ------------- 2 files changed, 16 deletions(-) diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h index 08dd521a51a5c..8f2bff268392b 100644 --- a/net/mac802154/ieee802154_i.h +++ b/net/mac802154/ieee802154_i.h @@ -194,9 +194,6 @@ int ieee802154_mlme_tx_locked(struct ieee802154_local *local, struct ieee802154_sub_if_data *sdata, struct sk_buff *skb); void ieee802154_mlme_op_post(struct ieee802154_local *local); -int ieee802154_mlme_tx_one(struct ieee802154_local *local, - struct ieee802154_sub_if_data *sdata, - struct sk_buff *skb); int ieee802154_mlme_tx_one_locked(struct ieee802154_local *local, struct ieee802154_sub_if_data *sdata, struct sk_buff *skb); diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c index 337d6faf0d2af..4d13f18f6f2ca 100644 --- a/net/mac802154/tx.c +++ b/net/mac802154/tx.c @@ -178,19 +178,6 @@ void ieee802154_mlme_op_post(struct ieee802154_local *local) ieee802154_release_queue(local); } -int ieee802154_mlme_tx_one(struct ieee802154_local *local, - struct ieee802154_sub_if_data *sdata, - struct sk_buff *skb) -{ - int ret; - - ieee802154_mlme_op_pre(local); - ret = ieee802154_mlme_tx(local, sdata, skb); - ieee802154_mlme_op_post(local); - - return ret; -} - int ieee802154_mlme_tx_one_locked(struct ieee802154_local *local, struct ieee802154_sub_if_data *sdata, struct sk_buff *skb) From de30d74f58cbecb3894c7738985bd0086d04bec1 Mon Sep 17 00:00:00 2001 From: Steven Davis Date: Mon, 30 Dec 2024 19:34:31 +0000 Subject: [PATCH 1065/1450] cdrom: Fix typo, 'devicen' to 'device' Fix typo in cd_dbg line to add trailing newline character. Signed-off-by: Steven Davis Link: https://lore.kernel.org/lkml/20241229165744.21725-1-goldside000@outlook.com Reviewed-by: Phillip Potter Link: https://lore.kernel.org/lkml/Z3GV2W_MUOw5BrtR@equinox Signed-off-by: Phillip Potter Link: https://lore.kernel.org/r/20241230193431.441120-2-phil@philpotter.co.uk Signed-off-by: Jens Axboe --- drivers/cdrom/cdrom.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 6a99a459b80b2..51745ed1bbabc 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -1106,7 +1106,7 @@ int open_for_data(struct cdrom_device_info *cdi) } } - cd_dbg(CD_OPEN, "all seems well, opening the devicen"); + cd_dbg(CD_OPEN, "all seems well, opening the device\n"); /* all seems well, we can open the device */ ret = cdo->open(cdi, 0); /* open for data */ From a9c83a0ab66a5b02e914daed502fb8d3a8d3d619 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 30 Dec 2024 14:15:17 -0700 Subject: [PATCH 1066/1450] io_uring/timeout: flush timeouts outside of the timeout lock syzbot reports that a recent fix causes nesting issues between the (now) raw timeoutlock and the eventfd locking: ============================= [ BUG: Invalid wait context ] 6.13.0-rc4-00080-g9828a4c0901f #29 Not tainted ----------------------------- kworker/u32:0/68094 is trying to lock: ffff000014d7a520 (&ctx->wqh#2){..-.}-{3:3}, at: eventfd_signal_mask+0x64/0x180 other info that might help us debug this: context-{5:5} 6 locks held by kworker/u32:0/68094: #0: ffff0000c1d98148 ((wq_completion)iou_exit){+.+.}-{0:0}, at: process_one_work+0x4e8/0xfc0 #1: ffff80008d927c78 ((work_completion)(&ctx->exit_work)){+.+.}-{0:0}, at: process_one_work+0x53c/0xfc0 #2: ffff0000c59bc3d8 (&ctx->completion_lock){+.+.}-{3:3}, at: io_kill_timeouts+0x40/0x180 #3: ffff0000c59bc358 (&ctx->timeout_lock){-.-.}-{2:2}, at: io_kill_timeouts+0x48/0x180 #4: ffff800085127aa0 (rcu_read_lock){....}-{1:3}, at: rcu_lock_acquire+0x8/0x38 #5: ffff800085127aa0 (rcu_read_lock){....}-{1:3}, at: rcu_lock_acquire+0x8/0x38 stack backtrace: CPU: 7 UID: 0 PID: 68094 Comm: kworker/u32:0 Not tainted 6.13.0-rc4-00080-g9828a4c0901f #29 Hardware name: linux,dummy-virt (DT) Workqueue: iou_exit io_ring_exit_work Call trace: show_stack+0x1c/0x30 (C) __dump_stack+0x24/0x30 dump_stack_lvl+0x60/0x80 dump_stack+0x14/0x20 __lock_acquire+0x19f8/0x60c8 lock_acquire+0x1a4/0x540 _raw_spin_lock_irqsave+0x90/0xd0 eventfd_signal_mask+0x64/0x180 io_eventfd_signal+0x64/0x108 io_req_local_work_add+0x294/0x430 __io_req_task_work_add+0x1c0/0x270 io_kill_timeout+0x1f0/0x288 io_kill_timeouts+0xd4/0x180 io_uring_try_cancel_requests+0x2e8/0x388 io_ring_exit_work+0x150/0x550 process_one_work+0x5e8/0xfc0 worker_thread+0x7ec/0xc80 kthread+0x24c/0x300 ret_from_fork+0x10/0x20 because after the preempt-rt fix for the timeout lock nesting inside the io-wq lock, we now have the eventfd spinlock nesting inside the raw timeout spinlock. Rather than play whack-a-mole with other nesting on the timeout lock, split the deletion and killing of timeouts so queueing the task_work for the timeout cancelations can get done outside of the timeout lock. Reported-by: syzbot+b1fc199a40b65d601b65@syzkaller.appspotmail.com Fixes: 020b40f35624 ("io_uring: make ctx->timeout_lock a raw spinlock") Signed-off-by: Jens Axboe --- io_uring/timeout.c | 45 +++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/io_uring/timeout.c b/io_uring/timeout.c index bbe58638eca79..362689b17cccf 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -85,7 +85,27 @@ static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) io_req_task_complete(req, ts); } -static bool io_kill_timeout(struct io_kiocb *req, int status) +static __cold bool io_flush_killed_timeouts(struct list_head *list, int err) +{ + if (list_empty(list)) + return false; + + while (!list_empty(list)) { + struct io_timeout *timeout; + struct io_kiocb *req; + + timeout = list_first_entry(list, struct io_timeout, list); + list_del_init(&timeout->list); + req = cmd_to_io_kiocb(timeout); + if (err) + req_set_fail(req); + io_req_queue_tw_complete(req, err); + } + + return true; +} + +static void io_kill_timeout(struct io_kiocb *req, struct list_head *list) __must_hold(&req->ctx->timeout_lock) { struct io_timeout_data *io = req->async_data; @@ -93,21 +113,17 @@ static bool io_kill_timeout(struct io_kiocb *req, int status) if (hrtimer_try_to_cancel(&io->timer) != -1) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); - if (status) - req_set_fail(req); atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); - list_del_init(&timeout->list); - io_req_queue_tw_complete(req, status); - return true; + list_move_tail(&timeout->list, list); } - return false; } __cold void io_flush_timeouts(struct io_ring_ctx *ctx) { - u32 seq; struct io_timeout *timeout, *tmp; + LIST_HEAD(list); + u32 seq; raw_spin_lock_irq(&ctx->timeout_lock); seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); @@ -131,10 +147,11 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx) if (events_got < events_needed) break; - io_kill_timeout(req, 0); + io_kill_timeout(req, &list); } ctx->cq_last_tm_flush = seq; raw_spin_unlock_irq(&ctx->timeout_lock); + io_flush_killed_timeouts(&list, 0); } static void io_req_tw_fail_links(struct io_kiocb *link, struct io_tw_state *ts) @@ -661,7 +678,7 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct io_uring_task *tctx bool cancel_all) { struct io_timeout *timeout, *tmp; - int canceled = 0; + LIST_HEAD(list); /* * completion_lock is needed for io_match_task(). Take it before @@ -672,11 +689,11 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct io_uring_task *tctx list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { struct io_kiocb *req = cmd_to_io_kiocb(timeout); - if (io_match_task(req, tctx, cancel_all) && - io_kill_timeout(req, -ECANCELED)) - canceled++; + if (io_match_task(req, tctx, cancel_all)) + io_kill_timeout(req, &list); } raw_spin_unlock_irq(&ctx->timeout_lock); spin_unlock(&ctx->completion_lock); - return canceled != 0; + + return io_flush_killed_timeouts(&list, -ECANCELED); } From b255ef45fcc2141c1bf98456796abb956d843a27 Mon Sep 17 00:00:00 2001 From: Vitalii Mordan Date: Fri, 27 Dec 2024 15:30:07 +0300 Subject: [PATCH 1067/1450] eth: bcmsysport: fix call balance of priv->clk handling routines Check the return value of clk_prepare_enable to ensure that priv->clk has been successfully enabled. If priv->clk was not enabled during bcm_sysport_probe, bcm_sysport_resume, or bcm_sysport_open, it must not be disabled in any subsequent execution paths. Fixes: 31bc72d97656 ("net: systemport: fetch and use clock resources") Signed-off-by: Vitalii Mordan Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20241227123007.2333397-1-mordan@ispras.ru Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bcmsysport.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index 42672c63f1086..bc4e1f3b3752d 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -1933,7 +1933,11 @@ static int bcm_sysport_open(struct net_device *dev) unsigned int i; int ret; - clk_prepare_enable(priv->clk); + ret = clk_prepare_enable(priv->clk); + if (ret) { + netdev_err(dev, "could not enable priv clock\n"); + return ret; + } /* Reset UniMAC */ umac_reset(priv); @@ -2591,7 +2595,11 @@ static int bcm_sysport_probe(struct platform_device *pdev) goto err_deregister_notifier; } - clk_prepare_enable(priv->clk); + ret = clk_prepare_enable(priv->clk); + if (ret) { + dev_err(&pdev->dev, "could not enable priv clock\n"); + goto err_deregister_netdev; + } priv->rev = topctrl_readl(priv, REV_CNTL) & REV_MASK; dev_info(&pdev->dev, @@ -2605,6 +2613,8 @@ static int bcm_sysport_probe(struct platform_device *pdev) return 0; +err_deregister_netdev: + unregister_netdev(dev); err_deregister_notifier: unregister_netdevice_notifier(&priv->netdev_notifier); err_deregister_fixed_link: @@ -2774,7 +2784,12 @@ static int __maybe_unused bcm_sysport_resume(struct device *d) if (!netif_running(dev)) return 0; - clk_prepare_enable(priv->clk); + ret = clk_prepare_enable(priv->clk); + if (ret) { + netdev_err(dev, "could not enable priv clock\n"); + return ret; + } + if (priv->wolopts) clk_disable_unprepare(priv->wol_clk); From fb3a9a1165cea104b5ab3753e88218e4497b01c1 Mon Sep 17 00:00:00 2001 From: Joshua Washington Date: Fri, 20 Dec 2024 19:28:06 -0800 Subject: [PATCH 1068/1450] gve: trigger RX NAPI instead of TX NAPI in gve_xsk_wakeup Commit ba0925c34e0f ("gve: process XSK TX descriptors as part of RX NAPI") moved XSK TX processing to be part of the RX NAPI. However, that commit did not include triggering the RX NAPI in gve_xsk_wakeup. This is necessary because the TX NAPI only processes TX completions, meaning that a TX wakeup would not actually trigger XSK descriptor processing. Also, the branch on XDP_WAKEUP_TX was supposed to have been removed, as the NAPI should be scheduled whether the wakeup is for RX or TX. Fixes: ba0925c34e0f ("gve: process XSK TX descriptors as part of RX NAPI") Cc: stable@vger.kernel.org Signed-off-by: Joshua Washington Signed-off-by: Praveen Kaligineedi Link: https://patch.msgid.link/20241221032807.302244-1-pkaligineedi@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/google/gve/gve_main.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 09fb7f16f73e2..8a8f6ab12a980 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -1714,7 +1714,7 @@ static int gve_xsk_pool_disable(struct net_device *dev, static int gve_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags) { struct gve_priv *priv = netdev_priv(dev); - int tx_queue_id = gve_xdp_tx_queue_id(priv, queue_id); + struct napi_struct *napi; if (!gve_get_napi_enabled(priv)) return -ENETDOWN; @@ -1722,19 +1722,12 @@ static int gve_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags) if (queue_id >= priv->rx_cfg.num_queues || !priv->xdp_prog) return -EINVAL; - if (flags & XDP_WAKEUP_TX) { - struct gve_tx_ring *tx = &priv->tx[tx_queue_id]; - struct napi_struct *napi = - &priv->ntfy_blocks[tx->ntfy_id].napi; - - if (!napi_if_scheduled_mark_missed(napi)) { - /* Call local_bh_enable to trigger SoftIRQ processing */ - local_bh_disable(); - napi_schedule(napi); - local_bh_enable(); - } - - tx->xdp_xsk_wakeup++; + napi = &priv->ntfy_blocks[gve_rx_idx_to_ntfy(priv, queue_id)].napi; + if (!napi_if_scheduled_mark_missed(napi)) { + /* Call local_bh_enable to trigger SoftIRQ processing */ + local_bh_disable(); + napi_schedule(napi); + local_bh_enable(); } return 0; From ad5c318086e2e23b577eca33559c5ebf89bc7eb9 Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Sat, 21 Dec 2024 17:14:48 +0900 Subject: [PATCH 1069/1450] net: mv643xx_eth: fix an OF node reference leak Current implementation of mv643xx_eth_shared_of_add_port() calls of_parse_phandle(), but does not release the refcount on error. Call of_node_put() in the error path and in mv643xx_eth_shared_of_remove(). This bug was found by an experimental verification tool that I am developing. Fixes: 76723bca2802 ("net: mv643xx_eth: add DT parsing support") Signed-off-by: Joe Hattori Link: https://patch.msgid.link/20241221081448.3313163-1-joe@pf.is.s.u-tokyo.ac.jp Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/mv643xx_eth.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c index a06048719e844..67a6ff07c83d8 100644 --- a/drivers/net/ethernet/marvell/mv643xx_eth.c +++ b/drivers/net/ethernet/marvell/mv643xx_eth.c @@ -2704,9 +2704,15 @@ static struct platform_device *port_platdev[3]; static void mv643xx_eth_shared_of_remove(void) { + struct mv643xx_eth_platform_data *pd; int n; for (n = 0; n < 3; n++) { + if (!port_platdev[n]) + continue; + pd = dev_get_platdata(&port_platdev[n]->dev); + if (pd) + of_node_put(pd->phy_node); platform_device_del(port_platdev[n]); port_platdev[n] = NULL; } @@ -2769,8 +2775,10 @@ static int mv643xx_eth_shared_of_add_port(struct platform_device *pdev, } ppdev = platform_device_alloc(MV643XX_ETH_NAME, dev_num); - if (!ppdev) - return -ENOMEM; + if (!ppdev) { + ret = -ENOMEM; + goto put_err; + } ppdev->dev.coherent_dma_mask = DMA_BIT_MASK(32); ppdev->dev.of_node = pnp; @@ -2792,6 +2800,8 @@ static int mv643xx_eth_shared_of_add_port(struct platform_device *pdev, port_err: platform_device_put(ppdev); +put_err: + of_node_put(ppd.phy_node); return ret; } From cbb26f7d8451fe56ccac802c6db48d16240feebd Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Sat, 21 Dec 2024 09:51:46 +0100 Subject: [PATCH 1070/1450] mptcp: fix TCP options overflow. Syzbot reported the following splat: Oops: general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] PREEMPT SMP KASAN PTI KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f] CPU: 1 UID: 0 PID: 5836 Comm: sshd Not tainted 6.13.0-rc3-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/25/2024 RIP: 0010:_compound_head include/linux/page-flags.h:242 [inline] RIP: 0010:put_page+0x23/0x260 include/linux/mm.h:1552 Code: 90 90 90 90 90 90 90 55 41 57 41 56 53 49 89 fe 48 bd 00 00 00 00 00 fc ff df e8 f8 5e 12 f8 49 8d 5e 08 48 89 d8 48 c1 e8 03 <80> 3c 28 00 74 08 48 89 df e8 8f c7 78 f8 48 8b 1b 48 89 de 48 83 RSP: 0000:ffffc90003916c90 EFLAGS: 00010202 RAX: 0000000000000001 RBX: 0000000000000008 RCX: ffff888030458000 RDX: 0000000000000100 RSI: 0000000000000000 RDI: 0000000000000000 RBP: dffffc0000000000 R08: ffffffff898ca81d R09: 1ffff110054414ac R10: dffffc0000000000 R11: ffffed10054414ad R12: 0000000000000007 R13: ffff88802a20a542 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f34f496e800(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f9d6ec9ec28 CR3: 000000004d260000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: skb_page_unref include/linux/skbuff_ref.h:43 [inline] __skb_frag_unref include/linux/skbuff_ref.h:56 [inline] skb_release_data+0x483/0x8a0 net/core/skbuff.c:1119 skb_release_all net/core/skbuff.c:1190 [inline] __kfree_skb+0x55/0x70 net/core/skbuff.c:1204 tcp_clean_rtx_queue net/ipv4/tcp_input.c:3436 [inline] tcp_ack+0x2442/0x6bc0 net/ipv4/tcp_input.c:4032 tcp_rcv_state_process+0x8eb/0x44e0 net/ipv4/tcp_input.c:6805 tcp_v4_do_rcv+0x77d/0xc70 net/ipv4/tcp_ipv4.c:1939 tcp_v4_rcv+0x2dc0/0x37f0 net/ipv4/tcp_ipv4.c:2351 ip_protocol_deliver_rcu+0x22e/0x440 net/ipv4/ip_input.c:205 ip_local_deliver_finish+0x341/0x5f0 net/ipv4/ip_input.c:233 NF_HOOK+0x3a4/0x450 include/linux/netfilter.h:314 NF_HOOK+0x3a4/0x450 include/linux/netfilter.h:314 __netif_receive_skb_one_core net/core/dev.c:5672 [inline] __netif_receive_skb+0x2bf/0x650 net/core/dev.c:5785 process_backlog+0x662/0x15b0 net/core/dev.c:6117 __napi_poll+0xcb/0x490 net/core/dev.c:6883 napi_poll net/core/dev.c:6952 [inline] net_rx_action+0x89b/0x1240 net/core/dev.c:7074 handle_softirqs+0x2d4/0x9b0 kernel/softirq.c:561 __do_softirq kernel/softirq.c:595 [inline] invoke_softirq kernel/softirq.c:435 [inline] __irq_exit_rcu+0xf7/0x220 kernel/softirq.c:662 irq_exit_rcu+0x9/0x30 kernel/softirq.c:678 instr_sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1049 [inline] sysvec_apic_timer_interrupt+0x57/0xc0 arch/x86/kernel/apic/apic.c:1049 asm_sysvec_apic_timer_interrupt+0x1a/0x20 arch/x86/include/asm/idtentry.h:702 RIP: 0033:0x7f34f4519ad5 Code: 85 d2 74 0d 0f 10 02 48 8d 54 24 20 0f 11 44 24 20 64 8b 04 25 18 00 00 00 85 c0 75 27 41 b8 08 00 00 00 b8 0f 01 00 00 0f 05 <48> 3d 00 f0 ff ff 76 75 48 8b 15 24 73 0d 00 f7 d8 64 89 02 48 83 RSP: 002b:00007ffec5b32ce0 EFLAGS: 00000246 RAX: 0000000000000001 RBX: 00000000000668a0 RCX: 00007f34f4519ad5 RDX: 00007ffec5b32d00 RSI: 0000000000000004 RDI: 0000564f4bc6cae0 RBP: 0000564f4bc6b5a0 R08: 0000000000000008 R09: 0000000000000000 R10: 00007ffec5b32de8 R11: 0000000000000246 R12: 0000564f48ea8aa4 R13: 0000000000000001 R14: 0000564f48ea93e8 R15: 00007ffec5b32d68 Eric noted a probable shinfo->nr_frags corruption, which indeed occurs. The root cause is a buggy MPTCP option len computation in some circumstances: the ADD_ADDR option should be mutually exclusive with DSS since the blamed commit. Still, mptcp_established_options_add_addr() tries to set the relevant info in mptcp_out_options, if the remaining space is large enough even when DSS is present. Since the ADD_ADDR infos and the DSS share the same union fields, adding first corrupts the latter. In the worst-case scenario, such corruption increases the DSS binary layout, exceeding the computed length and possibly overwriting the skb shared info. Address the issue by enforcing mutual exclusion in mptcp_established_options_add_addr(), too. Cc: stable@vger.kernel.org Reported-by: syzbot+38a095a81f30d82884c1@syzkaller.appspotmail.com Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/538 Fixes: 1bff1e43a30e ("mptcp: optimize out option generation") Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/025d9df8cde3c9a557befc47e9bc08fbbe3476e5.1734771049.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 1603b3702e220..a62bc874bf1e1 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -667,8 +667,15 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * &echo, &drop_other_suboptions)) return false; + /* + * Later on, mptcp_write_options() will enforce mutually exclusion with + * DSS, bail out if such option is set and we can't drop it. + */ if (drop_other_suboptions) remaining += opt_size; + else if (opts->suboptions & OPTION_MPTCP_DSS) + return false; + len = mptcp_add_addr_len(opts->addr.family, echo, !!opts->addr.port); if (remaining < len) return false; From 03c8d0af2e409e15c16130b185e12b5efba0a6b9 Mon Sep 17 00:00:00 2001 From: Pascal Hambourg Date: Mon, 23 Dec 2024 17:44:01 +0100 Subject: [PATCH 1071/1450] sky2: Add device ID 11ab:4373 for Marvell 88E8075 A Marvell 88E8075 ethernet controller has this device ID instead of 11ab:4370 and works fine with the sky2 driver. Signed-off-by: Pascal Hambourg Cc: stable@vger.kernel.org Link: https://patch.msgid.link/10165a62-99fb-4be6-8c64-84afd6234085@plouf.fr.eu.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/sky2.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c index 3914cd9210d49..988fa28cfb5ff 100644 --- a/drivers/net/ethernet/marvell/sky2.c +++ b/drivers/net/ethernet/marvell/sky2.c @@ -130,6 +130,7 @@ static const struct pci_device_id sky2_id_table[] = { { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x436C) }, /* 88E8072 */ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x436D) }, /* 88E8055 */ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4370) }, /* 88E8075 */ + { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4373) }, /* 88E8075 */ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4380) }, /* 88E8057 */ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4381) }, /* 88E8059 */ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, 0x4382) }, /* 88E8079 */ From 8ec396d05d1b737c87311fb7311f753b02c2a6b1 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 28 Nov 2024 15:06:17 +0000 Subject: [PATCH 1072/1450] mm: reinstate ability to map write-sealed memfd mappings read-only Patch series "mm: reinstate ability to map write-sealed memfd mappings read-only". In commit 158978945f31 ("mm: perform the mapping_map_writable() check after call_mmap()") (and preceding changes in the same series) it became possible to mmap() F_SEAL_WRITE sealed memfd mappings read-only. Commit 5de195060b2e ("mm: resolve faulty mmap_region() error path behaviour") unintentionally undid this logic by moving the mapping_map_writable() check before the shmem_mmap() hook is invoked, thereby regressing this change. This series reworks how we both permit write-sealed mappings being mapped read-only and disallow mprotect() from undoing the write-seal, fixing this regression. We also add a regression test to ensure that we do not accidentally regress this in future. Thanks to Julian Orth for reporting this regression. This patch (of 2): In commit 158978945f31 ("mm: perform the mapping_map_writable() check after call_mmap()") (and preceding changes in the same series) it became possible to mmap() F_SEAL_WRITE sealed memfd mappings read-only. This was previously unnecessarily disallowed, despite the man page documentation indicating that it would be, thereby limiting the usefulness of F_SEAL_WRITE logic. We fixed this by adapting logic that existed for the F_SEAL_FUTURE_WRITE seal (one which disallows future writes to the memfd) to also be used for F_SEAL_WRITE. For background - the F_SEAL_FUTURE_WRITE seal clears VM_MAYWRITE for a read-only mapping to disallow mprotect() from overriding the seal - an operation performed by seal_check_write(), invoked from shmem_mmap(), the f_op->mmap() hook used by shmem mappings. By extending this to F_SEAL_WRITE and critically - checking mapping_map_writable() to determine if we may map the memfd AFTER we invoke shmem_mmap() - the desired logic becomes possible. This is because mapping_map_writable() explicitly checks for VM_MAYWRITE, which we will have cleared. Commit 5de195060b2e ("mm: resolve faulty mmap_region() error path behaviour") unintentionally undid this logic by moving the mapping_map_writable() check before the shmem_mmap() hook is invoked, thereby regressing this change. We reinstate this functionality by moving the check out of shmem_mmap() and instead performing it in do_mmap() at the point at which VMA flags are being determined, which seems in any case to be a more appropriate place in which to make this determination. In order to achieve this we rework memfd seal logic to allow us access to this information using existing logic and eliminate the clearing of VM_MAYWRITE from seal_check_write() which we are performing in do_mmap() instead. Link: https://lkml.kernel.org/r/99fc35d2c62bd2e05571cf60d9f8b843c56069e0.1732804776.git.lorenzo.stoakes@oracle.com Fixes: 5de195060b2e ("mm: resolve faulty mmap_region() error path behaviour") Signed-off-by: Lorenzo Stoakes Reported-by: Julian Orth Closes: https://lore.kernel.org/all/CAHijbEUMhvJTN9Xw1GmbM266FXXv=U7s4L_Jem5x3AaPZxrYpQ@mail.gmail.com/ Cc: Jann Horn Cc: Liam R. Howlett Cc: Linus Torvalds Cc: Shuah Khan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- include/linux/memfd.h | 14 +++++++++++ include/linux/mm.h | 58 +++++++++++++++++++++++++++++-------------- mm/memfd.c | 2 +- mm/mmap.c | 4 +++ 4 files changed, 59 insertions(+), 19 deletions(-) diff --git a/include/linux/memfd.h b/include/linux/memfd.h index 3f2cf339ceafd..d437e30708502 100644 --- a/include/linux/memfd.h +++ b/include/linux/memfd.h @@ -7,6 +7,7 @@ #ifdef CONFIG_MEMFD_CREATE extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg); struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx); +unsigned int *memfd_file_seals_ptr(struct file *file); #else static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned int a) { @@ -16,6 +17,19 @@ static inline struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) { return ERR_PTR(-EINVAL); } + +static inline unsigned int *memfd_file_seals_ptr(struct file *file) +{ + return NULL; +} #endif +/* Retrieve memfd seals associated with the file, if any. */ +static inline unsigned int memfd_file_seals(struct file *file) +{ + unsigned int *sealsp = memfd_file_seals_ptr(file); + + return sealsp ? *sealsp : 0; +} + #endif /* __LINUX_MEMFD_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 338a76ce9083a..fb397918c43d9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4101,6 +4101,37 @@ void mem_dump_obj(void *object); static inline void mem_dump_obj(void *object) {} #endif +static inline bool is_write_sealed(int seals) +{ + return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE); +} + +/** + * is_readonly_sealed - Checks whether write-sealed but mapped read-only, + * in which case writes should be disallowing moving + * forwards. + * @seals: the seals to check + * @vm_flags: the VMA flags to check + * + * Returns whether readonly sealed, in which case writess should be disallowed + * going forward. + */ +static inline bool is_readonly_sealed(int seals, vm_flags_t vm_flags) +{ + /* + * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as + * MAP_SHARED and read-only, take care to not allow mprotect to + * revert protections on such mappings. Do this only for shared + * mappings. For private mappings, don't need to mask + * VM_MAYWRITE as we still want them to be COW-writable. + */ + if (is_write_sealed(seals) && + ((vm_flags & (VM_SHARED | VM_WRITE)) == VM_SHARED)) + return true; + + return false; +} + /** * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and * handle them. @@ -4112,24 +4143,15 @@ static inline void mem_dump_obj(void *object) {} */ static inline int seal_check_write(int seals, struct vm_area_struct *vma) { - if (seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { - /* - * New PROT_WRITE and MAP_SHARED mmaps are not allowed when - * write seals are active. - */ - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) - return -EPERM; - - /* - * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as - * MAP_SHARED and read-only, take care to not allow mprotect to - * revert protections on such mappings. Do this only for shared - * mappings. For private mappings, don't need to mask - * VM_MAYWRITE as we still want them to be COW-writable. - */ - if (vma->vm_flags & VM_SHARED) - vm_flags_clear(vma, VM_MAYWRITE); - } + if (!is_write_sealed(seals)) + return 0; + + /* + * New PROT_WRITE and MAP_SHARED mmaps are not allowed when + * write seals are active. + */ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) + return -EPERM; return 0; } diff --git a/mm/memfd.c b/mm/memfd.c index c17c3ea701a17..35a370d75c9ad 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -170,7 +170,7 @@ static int memfd_wait_for_pins(struct address_space *mapping) return error; } -static unsigned int *memfd_file_seals_ptr(struct file *file) +unsigned int *memfd_file_seals_ptr(struct file *file) { if (shmem_file(file)) return &SHMEM_I(file_inode(file))->seals; diff --git a/mm/mmap.c b/mm/mmap.c index d32b7e701058a..16f8e8be01f83 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -368,6 +369,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (file) { struct inode *inode = file_inode(file); + unsigned int seals = memfd_file_seals(file); unsigned long flags_mask; if (!file_mmap_ok(file, inode, pgoff, len)) @@ -408,6 +410,8 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags |= VM_SHARED | VM_MAYSHARE; if (!(file->f_mode & FMODE_WRITE)) vm_flags &= ~(VM_MAYWRITE | VM_SHARED); + else if (is_readonly_sealed(seals, vm_flags)) + vm_flags &= ~VM_MAYWRITE; fallthrough; case MAP_PRIVATE: if (!(file->f_mode & FMODE_READ)) From ea0916e01d0b0f2cce1369ac1494239a79827270 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 28 Nov 2024 15:06:18 +0000 Subject: [PATCH 1073/1450] selftests/memfd: add test for mapping write-sealed memfd read-only Now we have reinstated the ability to map F_SEAL_WRITE mappings read-only, assert that we are able to do this in a test to ensure that we do not regress this again. Link: https://lkml.kernel.org/r/a6377ec470b14c0539b4600cf8fa24bf2e4858ae.1732804776.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Jann Horn Cc: Julian Orth Cc: Liam R. Howlett Cc: Linus Torvalds Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/memfd/memfd_test.c | 43 ++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 0a0b555160280..c0c53451a16dc 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -282,6 +282,24 @@ static void *mfd_assert_mmap_shared(int fd) return p; } +static void *mfd_assert_mmap_read_shared(int fd) +{ + void *p; + + p = mmap(NULL, + mfd_def_size, + PROT_READ, + MAP_SHARED, + fd, + 0); + if (p == MAP_FAILED) { + printf("mmap() failed: %m\n"); + abort(); + } + + return p; +} + static void *mfd_assert_mmap_private(int fd) { void *p; @@ -980,6 +998,30 @@ static void test_seal_future_write(void) close(fd); } +static void test_seal_write_map_read_shared(void) +{ + int fd; + void *p; + + printf("%s SEAL-WRITE-MAP-READ\n", memfd_str); + + fd = mfd_assert_new("kern_memfd_seal_write_map_read", + mfd_def_size, + MFD_CLOEXEC | MFD_ALLOW_SEALING); + + mfd_assert_add_seals(fd, F_SEAL_WRITE); + mfd_assert_has_seals(fd, F_SEAL_WRITE); + + p = mfd_assert_mmap_read_shared(fd); + + mfd_assert_read(fd); + mfd_assert_read_shared(fd); + mfd_fail_write(fd); + + munmap(p, mfd_def_size); + close(fd); +} + /* * Test SEAL_SHRINK * Test whether SEAL_SHRINK actually prevents shrinking @@ -1593,6 +1635,7 @@ int main(int argc, char **argv) test_seal_write(); test_seal_future_write(); + test_seal_write_map_read_shared(); test_seal_shrink(); test_seal_grow(); test_seal_resize(); From 6aaced5abd32e2a57cd94fd64f824514d0361da8 Mon Sep 17 00:00:00 2001 From: Seiji Nishikawa Date: Sun, 1 Dec 2024 01:12:34 +0900 Subject: [PATCH 1074/1450] mm: vmscan: account for free pages to prevent infinite Loop in throttle_direct_reclaim() The task sometimes continues looping in throttle_direct_reclaim() because allow_direct_reclaim(pgdat) keeps returning false. #0 [ffff80002cb6f8d0] __switch_to at ffff8000080095ac #1 [ffff80002cb6f900] __schedule at ffff800008abbd1c #2 [ffff80002cb6f990] schedule at ffff800008abc50c #3 [ffff80002cb6f9b0] throttle_direct_reclaim at ffff800008273550 #4 [ffff80002cb6fa20] try_to_free_pages at ffff800008277b68 #5 [ffff80002cb6fae0] __alloc_pages_nodemask at ffff8000082c4660 #6 [ffff80002cb6fc50] alloc_pages_vma at ffff8000082e4a98 #7 [ffff80002cb6fca0] do_anonymous_page at ffff80000829f5a8 #8 [ffff80002cb6fce0] __handle_mm_fault at ffff8000082a5974 #9 [ffff80002cb6fd90] handle_mm_fault at ffff8000082a5bd4 At this point, the pgdat contains the following two zones: NODE: 4 ZONE: 0 ADDR: ffff00817fffe540 NAME: "DMA32" SIZE: 20480 MIN/LOW/HIGH: 11/28/45 VM_STAT: NR_FREE_PAGES: 359 NR_ZONE_INACTIVE_ANON: 18813 NR_ZONE_ACTIVE_ANON: 0 NR_ZONE_INACTIVE_FILE: 50 NR_ZONE_ACTIVE_FILE: 0 NR_ZONE_UNEVICTABLE: 0 NR_ZONE_WRITE_PENDING: 0 NR_MLOCK: 0 NR_BOUNCE: 0 NR_ZSPAGES: 0 NR_FREE_CMA_PAGES: 0 NODE: 4 ZONE: 1 ADDR: ffff00817fffec00 NAME: "Normal" SIZE: 8454144 PRESENT: 98304 MIN/LOW/HIGH: 68/166/264 VM_STAT: NR_FREE_PAGES: 146 NR_ZONE_INACTIVE_ANON: 94668 NR_ZONE_ACTIVE_ANON: 3 NR_ZONE_INACTIVE_FILE: 735 NR_ZONE_ACTIVE_FILE: 78 NR_ZONE_UNEVICTABLE: 0 NR_ZONE_WRITE_PENDING: 0 NR_MLOCK: 0 NR_BOUNCE: 0 NR_ZSPAGES: 0 NR_FREE_CMA_PAGES: 0 In allow_direct_reclaim(), while processing ZONE_DMA32, the sum of inactive/active file-backed pages calculated in zone_reclaimable_pages() based on the result of zone_page_state_snapshot() is zero. Additionally, since this system lacks swap, the calculation of inactive/ active anonymous pages is skipped. crash> p nr_swap_pages nr_swap_pages = $1937 = { counter = 0 } As a result, ZONE_DMA32 is deemed unreclaimable and skipped, moving on to the processing of the next zone, ZONE_NORMAL, despite ZONE_DMA32 having free pages significantly exceeding the high watermark. The problem is that the pgdat->kswapd_failures hasn't been incremented. crash> px ((struct pglist_data *) 0xffff00817fffe540)->kswapd_failures $1935 = 0x0 This is because the node deemed balanced. The node balancing logic in balance_pgdat() evaluates all zones collectively. If one or more zones (e.g., ZONE_DMA32) have enough free pages to meet their watermarks, the entire node is deemed balanced. This causes balance_pgdat() to exit early before incrementing the kswapd_failures, as it considers the overall memory state acceptable, even though some zones (like ZONE_NORMAL) remain under significant pressure. The patch ensures that zone_reclaimable_pages() includes free pages (NR_FREE_PAGES) in its calculation when no other reclaimable pages are available (e.g., file-backed or anonymous pages). This change prevents zones like ZONE_DMA32, which have sufficient free pages, from being mistakenly deemed unreclaimable. By doing so, the patch ensures proper node balancing, avoids masking pressure on other zones like ZONE_NORMAL, and prevents infinite loops in throttle_direct_reclaim() caused by allow_direct_reclaim(pgdat) repeatedly returning false. The kernel hangs due to a task stuck in throttle_direct_reclaim(), caused by a node being incorrectly deemed balanced despite pressure in certain zones, such as ZONE_NORMAL. This issue arises from zone_reclaimable_pages() returning 0 for zones without reclaimable file- backed or anonymous pages, causing zones like ZONE_DMA32 with sufficient free pages to be skipped. The lack of swap or reclaimable pages results in ZONE_DMA32 being ignored during reclaim, masking pressure in other zones. Consequently, pgdat->kswapd_failures remains 0 in balance_pgdat(), preventing fallback mechanisms in allow_direct_reclaim() from being triggered, leading to an infinite loop in throttle_direct_reclaim(). This patch modifies zone_reclaimable_pages() to account for free pages (NR_FREE_PAGES) when no other reclaimable pages exist. This ensures zones with sufficient free pages are not skipped, enabling proper balancing and reclaim behavior. [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20241130164346.436469-1-snishika@redhat.com Link: https://lkml.kernel.org/r/20241130161236.433747-2-snishika@redhat.com Fixes: 5a1c84b404a7 ("mm: remove reclaim and compaction retry approximations") Signed-off-by: Seiji Nishikawa Cc: Mel Gorman Cc: Signed-off-by: Andrew Morton --- mm/vmscan.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 76378bc257e38..9a859b7d18d79 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -374,7 +374,14 @@ unsigned long zone_reclaimable_pages(struct zone *zone) if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL)) nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) + zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON); - + /* + * If there are no reclaimable file-backed or anonymous pages, + * ensure zones with sufficient free pages are not skipped. + * This prevents zones like DMA32 from being ignored in reclaim + * scenarios where they can still help alleviate memory pressure. + */ + if (nr == 0) + nr = zone_page_state_snapshot(zone, NR_FREE_PAGES); return nr; } From 34d7cf637c437d5c2a8a6ef23ea45193bad8a91c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 6 Dec 2024 15:03:45 +0800 Subject: [PATCH 1075/1450] mm: don't try THP alignment for FS without get_unmapped_area Commit ed48e87c7df3 ("thp: add thp_get_unmapped_area_vmflags()") changes thp_get_unmapped_area() to thp_get_unmapped_area_vmflags() in __get_unmapped_area(), which doesn't initialize local get_area for anonymous mappings. This leads to us always trying THP alignment even for file_operations which have a NULL ->get_unmapped_area() callback. Since commit efa7df3e3bb5 ("mm: align larger anonymous mappings on THP boundaries") we only want to enable THP alignment for anonymous mappings, so add a !file check to avoid attempting THP alignment for file mappings. Found issue by code inspection. THP alignment is used for easy or more pmd mappings, from vma side. This may cause unnecessary VMA fragmentation and potentially worse performance on filesystems that do not actually support THPs and thus cannot benefit from the alignment. Link: https://lkml.kernel.org/r/20241206070345.2526501-1-wangkefeng.wang@huawei.com Fixes: ed48e87c7df3 ("thp: add thp_get_unmapped_area_vmflags()") Signed-off-by: Kefeng Wang Reviewed-by: Vlastimil Babka Reviewed-by: Yang Shi Cc: Christophe Leroy Cc: David Hildenbrand Cc: Jann Horn Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Rick Edgecombe Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mmap.c b/mm/mmap.c index 16f8e8be01f83..aec208f90337c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -892,7 +892,7 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, if (get_area) { addr = get_area(file, addr, len, pgoff, flags); - } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) + } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && !file && !addr /* no hint */ && IS_ALIGNED(len, PMD_SIZE)) { /* Ensures that larger anonymous mappings are THP aligned. */ From 158cdce87c8c172787063998ad5dd3e2f658b963 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Fri, 6 Dec 2024 16:30:25 +0800 Subject: [PATCH 1076/1450] mm/readahead: fix large folio support in async readahead When testing large folio support with XFS on our servers, we observed that only a few large folios are mapped when reading large files via mmap. After a thorough analysis, I identified it was caused by the `/sys/block/*/queue/read_ahead_kb` setting. On our test servers, this parameter is set to 128KB. After I tune it to 2MB, the large folio can work as expected. However, I believe the large folio behavior should not be dependent on the value of read_ahead_kb. It would be more robust if the kernel can automatically adopt to it. With /sys/block/*/queue/read_ahead_kb set to 128KB and performing a sequential read on a 1GB file using MADV_HUGEPAGE, the differences in /proc/meminfo are as follows: - before this patch FileHugePages: 18432 kB FilePmdMapped: 4096 kB - after this patch FileHugePages: 1067008 kB FilePmdMapped: 1048576 kB This shows that after applying the patch, the entire 1GB file is mapped to huge pages. The stable list is CCed, as without this patch, large folios don't function optimally in the readahead path. It's worth noting that if read_ahead_kb is set to a larger value that isn't aligned with huge page sizes (e.g., 4MB + 128KB), it may still fail to map to hugepages. Link: https://lkml.kernel.org/r/20241108141710.9721-1-laoar.shao@gmail.com Link: https://lkml.kernel.org/r/20241206083025.3478-1-laoar.shao@gmail.com Fixes: 4687fdbb805a ("mm/filemap: Support VM_HUGEPAGE for file mappings") Signed-off-by: Yafang Shao Tested-by: kernel test robot Cc: Matthew Wilcox Cc: David Hildenbrand Cc: Signed-off-by: Andrew Morton --- mm/readahead.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mm/readahead.c b/mm/readahead.c index ea650b8b02fb1..e151f4b13ca4b 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -646,7 +646,11 @@ void page_cache_async_ra(struct readahead_control *ractl, 1UL << order); if (index == expected) { ra->start += ra->size; - ra->size = get_next_ra_size(ra, max_pages); + /* + * In the case of MADV_HUGEPAGE, the actual size might exceed + * the readahead window. + */ + ra->size = max(ra->size, get_next_ra_size(ra, max_pages)); ra->async_size = ra->size; goto readit; } From 1fd8bc7cd889bd73d07a83cb32d674ac68f99153 Mon Sep 17 00:00:00 2001 From: Yang Erkun Date: Sat, 14 Dec 2024 17:30:05 +0800 Subject: [PATCH 1077/1450] maple_tree: reload mas before the second call for mas_empty_area Change the LONG_MAX in simple_offset_add to 1024, and do latter: [root@fedora ~]# mkdir /tmp/dir [root@fedora ~]# for i in {1..1024}; do touch /tmp/dir/$i; done touch: cannot touch '/tmp/dir/1024': Device or resource busy [root@fedora ~]# rm /tmp/dir/123 [root@fedora ~]# touch /tmp/dir/1024 [root@fedora ~]# rm /tmp/dir/100 [root@fedora ~]# touch /tmp/dir/1025 touch: cannot touch '/tmp/dir/1025': Device or resource busy After we delete file 100, actually this is a empty entry, but the latter create failed unexpected. mas_alloc_cyclic has two chance to find empty entry. First find the entry with range range_lo and range_hi, if no empty entry exist, and range_lo > min, retry find with range min and range_hi. However, the first call mas_empty_area may mark mas as EBUSY, and the second call for mas_empty_area will return false directly. Fix this by reload mas before second call for mas_empty_area. [Liam.Howlett@Oracle.com: fix mas_alloc_cyclic() second search] Link: https://lore.kernel.org/all/20241216060600.287B4C4CED0@smtp.kernel.org/ Link: https://lkml.kernel.org/r/20241216190113.1226145-2-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20241214093005.72284-1-yangerkun@huaweicloud.com Fixes: 9b6713cc7522 ("maple_tree: Add mtree_alloc_cyclic()") Signed-off-by: Yang Erkun Signed-off-by: Liam R. Howlett Cc: Christian Brauner Cc: Chuck Lever says: Cc: Liam R. Howlett Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index d0ae808f3a149..047397136f15b 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4354,6 +4354,7 @@ int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp, ret = 1; } if (ret < 0 && range_lo > min) { + mas_reset(mas); ret = mas_empty_area(mas, min, range_hi, 1); if (ret == 0) ret = 1; From 59d9094df3d79443937add8700b2ef1a866b1081 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Mon, 16 Dec 2024 15:11:47 +0800 Subject: [PATCH 1078/1450] mm: hugetlb: independent PMD page table shared count The folio refcount may be increased unexpectly through try_get_folio() by caller such as split_huge_pages. In huge_pmd_unshare(), we use refcount to check whether a pmd page table is shared. The check is incorrect if the refcount is increased by the above caller, and this can cause the page table leaked: BUG: Bad page state in process sh pfn:109324 page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x66 pfn:0x109324 flags: 0x17ffff800000000(node=0|zone=2|lastcpupid=0xfffff) page_type: f2(table) raw: 017ffff800000000 0000000000000000 0000000000000000 0000000000000000 raw: 0000000000000066 0000000000000000 00000000f2000000 0000000000000000 page dumped because: nonzero mapcount ... CPU: 31 UID: 0 PID: 7515 Comm: sh Kdump: loaded Tainted: G B 6.13.0-rc2master+ #7 Tainted: [B]=BAD_PAGE Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 Call trace: show_stack+0x20/0x38 (C) dump_stack_lvl+0x80/0xf8 dump_stack+0x18/0x28 bad_page+0x8c/0x130 free_page_is_bad_report+0xa4/0xb0 free_unref_page+0x3cc/0x620 __folio_put+0xf4/0x158 split_huge_pages_all+0x1e0/0x3e8 split_huge_pages_write+0x25c/0x2d8 full_proxy_write+0x64/0xd8 vfs_write+0xcc/0x280 ksys_write+0x70/0x110 __arm64_sys_write+0x24/0x38 invoke_syscall+0x50/0x120 el0_svc_common.constprop.0+0xc8/0xf0 do_el0_svc+0x24/0x38 el0_svc+0x34/0x128 el0t_64_sync_handler+0xc8/0xd0 el0t_64_sync+0x190/0x198 The issue may be triggered by damon, offline_page, page_idle, etc, which will increase the refcount of page table. 1. The page table itself will be discarded after reporting the "nonzero mapcount". 2. The HugeTLB page mapped by the page table miss freeing since we treat the page table as shared and a shared page table will not be unmapped. Fix it by introducing independent PMD page table shared count. As described by comment, pt_index/pt_mm/pt_frag_refcount are used for s390 gmap, x86 pgds and powerpc, pt_share_count is used for x86/arm64/riscv pmds, so we can reuse the field as pt_share_count. Link: https://lkml.kernel.org/r/20241216071147.3984217-1-liushixin2@huawei.com Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page") Signed-off-by: Liu Shixin Cc: Kefeng Wang Cc: Ken Chen Cc: Muchun Song Cc: Nanyong Sun Cc: Jane Chu Cc: Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 + include/linux/mm_types.h | 30 ++++++++++++++++++++++++++++++ mm/hugetlb.c | 16 +++++++--------- 3 files changed, 38 insertions(+), 9 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index fb397918c43d9..b1c3db9cf3551 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3125,6 +3125,7 @@ static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc) if (!pmd_ptlock_init(ptdesc)) return false; __folio_set_pgtable(folio); + ptdesc_pmd_pts_init(ptdesc); lruvec_stat_add_folio(folio, NR_PAGETABLE); return true; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7361a8f3ab68e..332cee2856620 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -445,6 +445,7 @@ FOLIO_MATCH(compound_head, _head_2a); * @pt_index: Used for s390 gmap. * @pt_mm: Used for x86 pgds. * @pt_frag_refcount: For fragmented page table tracking. Powerpc only. + * @pt_share_count: Used for HugeTLB PMD page table share count. * @_pt_pad_2: Padding to ensure proper alignment. * @ptl: Lock for the page table. * @__page_type: Same as page->page_type. Unused for page tables. @@ -471,6 +472,9 @@ struct ptdesc { pgoff_t pt_index; struct mm_struct *pt_mm; atomic_t pt_frag_refcount; +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING + atomic_t pt_share_count; +#endif }; union { @@ -516,6 +520,32 @@ static_assert(sizeof(struct ptdesc) <= sizeof(struct page)); const struct page *: (const struct ptdesc *)(p), \ struct page *: (struct ptdesc *)(p))) +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING +static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc) +{ + atomic_set(&ptdesc->pt_share_count, 0); +} + +static inline void ptdesc_pmd_pts_inc(struct ptdesc *ptdesc) +{ + atomic_inc(&ptdesc->pt_share_count); +} + +static inline void ptdesc_pmd_pts_dec(struct ptdesc *ptdesc) +{ + atomic_dec(&ptdesc->pt_share_count); +} + +static inline int ptdesc_pmd_pts_count(struct ptdesc *ptdesc) +{ + return atomic_read(&ptdesc->pt_share_count); +} +#else +static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc) +{ +} +#endif + /* * Used for sizing the vmemmap region on some architectures */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cec4b121193fc..c498874a71701 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7211,7 +7211,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, spte = hugetlb_walk(svma, saddr, vma_mmu_pagesize(svma)); if (spte) { - get_page(virt_to_page(spte)); + ptdesc_pmd_pts_inc(virt_to_ptdesc(spte)); break; } } @@ -7226,7 +7226,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, (pmd_t *)((unsigned long)spte & PAGE_MASK)); mm_inc_nr_pmds(mm); } else { - put_page(virt_to_page(spte)); + ptdesc_pmd_pts_dec(virt_to_ptdesc(spte)); } spin_unlock(&mm->page_table_lock); out: @@ -7238,10 +7238,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, /* * unmap huge page backed by shared pte. * - * Hugetlb pte page is ref counted at the time of mapping. If pte is shared - * indicated by page_count > 1, unmap is achieved by clearing pud and - * decrementing the ref count. If count == 1, the pte page is not shared. - * * Called with page table lock held. * * returns: 1 successfully unmapped a shared pte page @@ -7250,18 +7246,20 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { + unsigned long sz = huge_page_size(hstate_vma(vma)); pgd_t *pgd = pgd_offset(mm, addr); p4d_t *p4d = p4d_offset(pgd, addr); pud_t *pud = pud_offset(p4d, addr); i_mmap_assert_write_locked(vma->vm_file->f_mapping); hugetlb_vma_assert_locked(vma); - BUG_ON(page_count(virt_to_page(ptep)) == 0); - if (page_count(virt_to_page(ptep)) == 1) + if (sz != PMD_SIZE) + return 0; + if (!ptdesc_pmd_pts_count(virt_to_ptdesc(ptep))) return 0; pud_clear(pud); - put_page(virt_to_page(ptep)); + ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep)); mm_dec_nr_pmds(mm); return 1; } From cddc76b165161a02ff14c4d84d0f5266d9d32b9e Mon Sep 17 00:00:00 2001 From: Alessandro Carminati Date: Tue, 17 Dec 2024 14:20:33 +0000 Subject: [PATCH 1079/1450] mm/kmemleak: fix sleeping function called from invalid context at print message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address a bug in the kernel that triggers a "sleeping function called from invalid context" warning when /sys/kernel/debug/kmemleak is printed under specific conditions: - CONFIG_PREEMPT_RT=y - Set SELinux as the LSM for the system - Set kptr_restrict to 1 - kmemleak buffer contains at least one item BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48 in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 136, name: cat preempt_count: 1, expected: 0 RCU nest depth: 2, expected: 2 6 locks held by cat/136: #0: ffff32e64bcbf950 (&p->lock){+.+.}-{3:3}, at: seq_read_iter+0xb8/0xe30 #1: ffffafe6aaa9dea0 (scan_mutex){+.+.}-{3:3}, at: kmemleak_seq_start+0x34/0x128 #3: ffff32e6546b1cd0 (&object->lock){....}-{2:2}, at: kmemleak_seq_show+0x3c/0x1e0 #4: ffffafe6aa8d8560 (rcu_read_lock){....}-{1:2}, at: has_ns_capability_noaudit+0x8/0x1b0 #5: ffffafe6aabbc0f8 (notif_lock){+.+.}-{2:2}, at: avc_compute_av+0xc4/0x3d0 irq event stamp: 136660 hardirqs last enabled at (136659): [] _raw_spin_unlock_irqrestore+0xa8/0xd8 hardirqs last disabled at (136660): [] _raw_spin_lock_irqsave+0x8c/0xb0 softirqs last enabled at (0): [] copy_process+0x11d8/0x3df8 softirqs last disabled at (0): [<0000000000000000>] 0x0 Preemption disabled at: [] kmemleak_seq_show+0x3c/0x1e0 CPU: 1 UID: 0 PID: 136 Comm: cat Tainted: G E 6.11.0-rt7+ #34 Tainted: [E]=UNSIGNED_MODULE Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace+0xa0/0x128 show_stack+0x1c/0x30 dump_stack_lvl+0xe8/0x198 dump_stack+0x18/0x20 rt_spin_lock+0x8c/0x1a8 avc_perm_nonode+0xa0/0x150 cred_has_capability.isra.0+0x118/0x218 selinux_capable+0x50/0x80 security_capable+0x7c/0xd0 has_ns_capability_noaudit+0x94/0x1b0 has_capability_noaudit+0x20/0x30 restricted_pointer+0x21c/0x4b0 pointer+0x298/0x760 vsnprintf+0x330/0xf70 seq_printf+0x178/0x218 print_unreferenced+0x1a4/0x2d0 kmemleak_seq_show+0xd0/0x1e0 seq_read_iter+0x354/0xe30 seq_read+0x250/0x378 full_proxy_read+0xd8/0x148 vfs_read+0x190/0x918 ksys_read+0xf0/0x1e0 __arm64_sys_read+0x70/0xa8 invoke_syscall.constprop.0+0xd4/0x1d8 el0_svc+0x50/0x158 el0t_64_sync+0x17c/0x180 %pS and %pK, in the same back trace line, are redundant, and %pS can void %pK service in certain contexts. %pS alone already provides the necessary information, and if it cannot resolve the symbol, it falls back to printing the raw address voiding the original intent behind the %pK. Additionally, %pK requires a privilege check CAP_SYSLOG enforced through the LSM, which can trigger a "sleeping function called from invalid context" warning under RT_PREEMPT kernels when the check occurs in an atomic context. This issue may also affect other LSMs. This change avoids the unnecessary privilege check and resolves the sleeping function warning without any loss of information. Link: https://lkml.kernel.org/r/20241217142032.55793-1-acarmina@redhat.com Fixes: 3a6f33d86baa ("mm/kmemleak: use %pK to display kernel pointers in backtrace") Signed-off-by: Alessandro Carminati Acked-by: Sebastian Andrzej Siewior Acked-by: Catalin Marinas Cc: Clément Léger Cc: Alessandro Carminati Cc: Eric Chanudet Cc: Gabriele Paoloni Cc: Juri Lelli Cc: Steven Rostedt Cc: Thomas Weißschuh Cc: Signed-off-by: Andrew Morton --- mm/kmemleak.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 2a945c07ae995..737af23f4f4e1 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -373,7 +373,7 @@ static void print_unreferenced(struct seq_file *seq, for (i = 0; i < nr_entries; i++) { void *ptr = (void *)entries[i]; - warn_or_seq_printf(seq, " [<%pK>] %pS\n", ptr, ptr); + warn_or_seq_printf(seq, " %pS\n", ptr); } } From 4d9b90df2eb49ab9becdbfd1fd60071bb107406e Mon Sep 17 00:00:00 2001 From: Mathieu Othacehe Date: Tue, 17 Dec 2024 11:09:21 +0100 Subject: [PATCH 1080/1450] mailmap: modify the entry for Mathieu Othacehe Set my gnu address as the main one. Link: https://lkml.kernel.org/r/20241217100924.7821-1-othacehe@gnu.org Signed-off-by: Mathieu Othacehe Cc: Alex Elder Cc: David S. Miller Cc: Geliang Tang Cc: Kees Cook Cc: Matthieu Baerts (NGI0) Cc: Neeraj Upadhyay Cc: Quentin Monnet Signed-off-by: Andrew Morton --- .mailmap | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index 7efe43237ca8e..f5f97f9470209 100644 --- a/.mailmap +++ b/.mailmap @@ -435,7 +435,7 @@ Martin Kepplinger Martin Kepplinger Martin Kepplinger Martyna Szapar-Mudlaw -Mathieu Othacehe +Mathieu Othacehe Mat Martineau Mat Martineau Matthew Wilcox From 472098f23323c39cc6269d7b7bf76cba62830a4c Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 17 Dec 2024 16:55:39 +0800 Subject: [PATCH 1081/1450] docs: mm: fix the incorrect 'FileHugeMapped' field The '/proc/PID/smaps' does not have the 'FileHugeMapped' field to count the file transparent huge pages, instead, the 'FilePmdMapped' field should be used. Fix it. Link: https://lkml.kernel.org/r/d520ce3aba2b03b088be30bece732426a939049a.1734425264.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: David Hildenbrand Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/transhuge.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 5034915f4e8e8..8872203df0880 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -436,7 +436,7 @@ AnonHugePmdMapped). The number of file transparent huge pages mapped to userspace is available by reading ShmemPmdMapped and ShmemHugePages fields in ``/proc/meminfo``. To identify what applications are mapping file transparent huge pages, it -is necessary to read ``/proc/PID/smaps`` and count the FileHugeMapped fields +is necessary to read ``/proc/PID/smaps`` and count the FilePmdMapped fields for each mapping. Note that reading the smaps file is expensive and reading it From cb0ca08b326aa03f87fe94bb91872ce8d2ef1ed8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 17 Dec 2024 08:18:10 +0100 Subject: [PATCH 1082/1450] kcov: mark in_softirq_really() as __always_inline If gcc decides not to inline in_softirq_really(), objtool warns about a function call with UACCESS enabled: kernel/kcov.o: warning: objtool: __sanitizer_cov_trace_pc+0x1e: call to in_softirq_really() with UACCESS enabled kernel/kcov.o: warning: objtool: check_kcov_mode+0x11: call to in_softirq_really() with UACCESS enabled Mark this as __always_inline to avoid the problem. Link: https://lkml.kernel.org/r/20241217071814.2261620-1-arnd@kernel.org Fixes: 7d4df2dad312 ("kcov: properly check for softirq context") Signed-off-by: Arnd Bergmann Reviewed-by: Marco Elver Cc: Aleksandr Nogikh Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Signed-off-by: Andrew Morton --- kernel/kcov.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/kcov.c b/kernel/kcov.c index 28a6be6e64fdd..187ba1b80bda1 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -166,7 +166,7 @@ static void kcov_remote_area_put(struct kcov_remote_area *area, * Unlike in_serving_softirq(), this function returns false when called during * a hardirq or an NMI that happened in the softirq context. */ -static inline bool in_softirq_really(void) +static __always_inline bool in_softirq_really(void) { return in_serving_softirq() && !in_hardirq() && !in_nmi(); } From 3754137d263f52f4b507cf9ae913f8f0497d1b0e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 17 Dec 2024 20:50:00 +0100 Subject: [PATCH 1083/1450] fs/proc/task_mmu: fix pagemap flags with PMD THP entries on 32bit Entries (including flags) are u64, even on 32bit. So right now we are cutting of the flags on 32bit. This way, for example the cow selftest complains about: # ./cow ... Bail Out! read and ioctl return unmatched results for populated: 0 1 Link: https://lkml.kernel.org/r/20241217195000.1734039-1-david@redhat.com Fixes: 2c1f057e5be6 ("fs/proc/task_mmu: properly detect PM_MMAP_EXCLUSIVE per page of PMD-mapped THPs") Signed-off-by: David Hildenbrand Cc: Oscar Salvador Cc: Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 38a5a3e9cba20..f02cd362309a0 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1810,7 +1810,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, } for (; addr != end; addr += PAGE_SIZE, idx++) { - unsigned long cur_flags = flags; + u64 cur_flags = flags; pagemap_entry_t pme; if (folio && (flags & PM_PRESENT) && From 5f3fd772d152229d94602bca243fbb658068a597 Mon Sep 17 00:00:00 2001 From: Dennis Lam Date: Tue, 17 Dec 2024 21:39:25 -0500 Subject: [PATCH 1084/1450] ocfs2: fix slab-use-after-free due to dangling pointer dqi_priv When mounting ocfs2 and then remounting it as read-only, a slab-use-after-free occurs after the user uses a syscall to quota_getnextquota. Specifically, sb_dqinfo(sb, type)->dqi_priv is the dangling pointer. During the remounting process, the pointer dqi_priv is freed but is never set as null leaving it to be accessed. Additionally, the read-only option for remounting sets the DQUOT_SUSPENDED flag instead of setting the DQUOT_USAGE_ENABLED flags. Moreover, later in the process of getting the next quota, the function ocfs2_get_next_id is called and only checks the quota usage flags and not the quota suspended flags. To fix this, I set dqi_priv to null when it is freed after remounting with read-only and put a check for DQUOT_SUSPENDED in ocfs2_get_next_id. [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20241218023924.22821-2-dennis.lamerice@gmail.com Fixes: 8f9e8f5fcc05 ("ocfs2: Fix Q_GETNEXTQUOTA for filesystem without quotas") Signed-off-by: Dennis Lam Reported-by: syzbot+d173bf8a5a7faeede34c@syzkaller.appspotmail.com Tested-by: syzbot+d173bf8a5a7faeede34c@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/6731d26f.050a0220.1fb99c.014b.GAE@google.com/T/ Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Cc: Signed-off-by: Andrew Morton --- fs/ocfs2/quota_global.c | 2 +- fs/ocfs2/quota_local.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 2b0daced98ebb..3404e7a30c330 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -893,7 +893,7 @@ static int ocfs2_get_next_id(struct super_block *sb, struct kqid *qid) int status = 0; trace_ocfs2_get_next_id(from_kqid(&init_user_ns, *qid), type); - if (!sb_has_quota_loaded(sb, type)) { + if (!sb_has_quota_active(sb, type)) { status = -ESRCH; goto out; } diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 73d3367c533b8..2956d888c1314 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -867,6 +867,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type) brelse(oinfo->dqi_libh); brelse(oinfo->dqi_lqi_bh); kfree(oinfo); + info->dqi_priv = NULL; return status; } From eaebeb93922ca6ab0dd92027b73d0112701706ef Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Thu, 19 Dec 2024 21:24:37 +0000 Subject: [PATCH 1085/1450] mm: zswap: fix race between [de]compression and CPU hotunplug In zswap_compress() and zswap_decompress(), the per-CPU acomp_ctx of the current CPU at the beginning of the operation is retrieved and used throughout. However, since neither preemption nor migration are disabled, it is possible that the operation continues on a different CPU. If the original CPU is hotunplugged while the acomp_ctx is still in use, we run into a UAF bug as the resources attached to the acomp_ctx are freed during hotunplug in zswap_cpu_comp_dead(). The problem was introduced in commit 1ec3b5fe6eec ("mm/zswap: move to use crypto_acomp API for hardware acceleration") when the switch to the crypto_acomp API was made. Prior to that, the per-CPU crypto_comp was retrieved using get_cpu_ptr() which disables preemption and makes sure the CPU cannot go away from under us. Preemption cannot be disabled with the crypto_acomp API as a sleepable context is needed. Commit 8ba2f844f050 ("mm/zswap: change per-cpu mutex and buffer to per-acomp_ctx") increased the UAF surface area by making the per-CPU buffers dynamic, adding yet another resource that can be freed from under zswap compression/decompression by CPU hotunplug. There are a few ways to fix this: (a) Add a refcount for acomp_ctx. (b) Disable migration while using the per-CPU acomp_ctx. (c) Disable CPU hotunplug while using the per-CPU acomp_ctx by holding the CPUs read lock. Implement (c) since it's simpler than (a), and (b) involves using migrate_disable() which is apparently undesired (see huge comment in include/linux/preempt.h). Link: https://lkml.kernel.org/r/20241219212437.2714151-1-yosryahmed@google.com Fixes: 1ec3b5fe6eec ("mm/zswap: move to use crypto_acomp API for hardware acceleration") Signed-off-by: Yosry Ahmed Reported-by: Johannes Weiner Closes: https://lore.kernel.org/lkml/20241113213007.GB1564047@cmpxchg.org/ Reported-by: Sam Sun Closes: https://lore.kernel.org/lkml/CAEkJfYMtSdM5HceNsXUDf5haghD5+o2e7Qv4OcuruL4tPg6OaQ@mail.gmail.com/ Reviewed-by: Chengming Zhou Acked-by: Barry Song Reviewed-by: Nhat Pham Cc: Vitaly Wool Cc: Signed-off-by: Andrew Morton --- mm/zswap.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index f6316b66fb236..5a27af8d86ea9 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -880,6 +880,18 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) return 0; } +/* Prevent CPU hotplug from freeing up the per-CPU acomp_ctx resources */ +static struct crypto_acomp_ctx *acomp_ctx_get_cpu(struct crypto_acomp_ctx __percpu *acomp_ctx) +{ + cpus_read_lock(); + return raw_cpu_ptr(acomp_ctx); +} + +static void acomp_ctx_put_cpu(void) +{ + cpus_read_unlock(); +} + static bool zswap_compress(struct page *page, struct zswap_entry *entry, struct zswap_pool *pool) { @@ -893,8 +905,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, gfp_t gfp; u8 *dst; - acomp_ctx = raw_cpu_ptr(pool->acomp_ctx); - + acomp_ctx = acomp_ctx_get_cpu(pool->acomp_ctx); mutex_lock(&acomp_ctx->mutex); dst = acomp_ctx->buffer; @@ -950,6 +961,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, zswap_reject_alloc_fail++; mutex_unlock(&acomp_ctx->mutex); + acomp_ctx_put_cpu(); return comp_ret == 0 && alloc_ret == 0; } @@ -960,7 +972,7 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio) struct crypto_acomp_ctx *acomp_ctx; u8 *src; - acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); + acomp_ctx = acomp_ctx_get_cpu(entry->pool->acomp_ctx); mutex_lock(&acomp_ctx->mutex); src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); @@ -990,6 +1002,7 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio) if (src != acomp_ctx->buffer) zpool_unmap_handle(zpool, entry->handle); + acomp_ctx_put_cpu(); } /********************************* From 11673247700e2af3a6a95f7b3f1bb80b691c950e Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Thu, 19 Dec 2024 14:18:28 +0200 Subject: [PATCH 1086/1450] percpu: remove intermediate variable in PERCPU_PTR() The intermediate variable in the PERCPU_PTR() macro results in a kernel panic on boot [1] due to a compiler bug seen when compiling the kernel (+ KASAN) with gcc 11.3.1, but not when compiling with latest gcc (v14.2)/clang(v18.1). To solve it, remove the intermediate variable (which is not needed) and keep the casting that resolves the address space checks. [1] Oops: general protection fault, probably for non-canonical address 0xdffffc0000000003: 0000 [#1] SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000018-0x000000000000001f] CPU: 0 UID: 0 PID: 547 Comm: iptables Not tainted 6.13.0-rc1_external_tested-master #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:nf_ct_netns_do_get+0x139/0x540 Code: 03 00 00 48 81 c4 88 00 00 00 5b 5d 41 5c 41 5d 41 5e 41 5f c3 4d 8d 75 08 48 b8 00 00 00 00 00 fc ff df 4c 89 f2 48 c1 ea 03 <0f> b6 04 02 84 c0 74 08 3c 03 0f 8e 27 03 00 00 41 8b 45 08 83 c0 RSP: 0018:ffff888116df75e8 EFLAGS: 00010207 RAX: dffffc0000000000 RBX: 1ffff11022dbeebe RCX: ffffffff839a2382 RDX: 0000000000000003 RSI: 0000000000000008 RDI: ffff88842ec46d10 RBP: 0000000000000002 R08: 0000000000000000 R09: fffffbfff0b0860c R10: ffff888116df75e8 R11: 0000000000000001 R12: ffffffff879d6a80 R13: 0000000000000016 R14: 000000000000001e R15: ffff888116df7908 FS: 00007fba01646740(0000) GS:ffff88842ec00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055bd901800d8 CR3: 00000001205f0003 CR4: 0000000000172eb0 Call Trace: ? die_addr+0x3d/0xa0 ? exc_general_protection+0x144/0x220 ? asm_exc_general_protection+0x22/0x30 ? __mutex_lock+0x2c2/0x1d70 ? nf_ct_netns_do_get+0x139/0x540 ? nf_ct_netns_do_get+0xb5/0x540 ? net_generic+0x1f0/0x1f0 ? __create_object+0x5e/0x80 xt_check_target+0x1f0/0x930 ? textify_hooks.constprop.0+0x110/0x110 ? pcpu_alloc_noprof+0x7cd/0xcf0 ? xt_find_target+0x148/0x1e0 find_check_entry.constprop.0+0x6c0/0x920 ? get_info+0x380/0x380 ? __virt_addr_valid+0x1df/0x3b0 ? kasan_quarantine_put+0xe3/0x200 ? kfree+0x13e/0x3d0 ? translate_table+0xaf5/0x1750 translate_table+0xbd8/0x1750 ? ipt_unregister_table_exit+0x30/0x30 ? __might_fault+0xbb/0x170 do_ipt_set_ctl+0x408/0x1340 ? nf_sockopt_find.constprop.0+0x17b/0x1f0 ? lock_downgrade+0x680/0x680 ? lockdep_hardirqs_on_prepare+0x284/0x400 ? ipt_register_table+0x440/0x440 ? bit_wait_timeout+0x160/0x160 nf_setsockopt+0x6f/0xd0 raw_setsockopt+0x7e/0x200 ? raw_bind+0x590/0x590 ? do_user_addr_fault+0x812/0xd20 do_sock_setsockopt+0x1e2/0x3f0 ? move_addr_to_user+0x90/0x90 ? lock_downgrade+0x680/0x680 __sys_setsockopt+0x9e/0x100 __x64_sys_setsockopt+0xb9/0x150 ? do_syscall_64+0x33/0x140 do_syscall_64+0x6d/0x140 entry_SYSCALL_64_after_hwframe+0x4b/0x53 RIP: 0033:0x7fba015134ce Code: 0f 1f 40 00 48 8b 15 59 69 0e 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b1 0f 1f 00 f3 0f 1e fa 49 89 ca b8 36 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 0a c3 66 0f 1f 84 00 00 00 00 00 48 8b 15 21 RSP: 002b:00007ffd9de6f388 EFLAGS: 00000246 ORIG_RAX: 0000000000000036 RAX: ffffffffffffffda RBX: 000055bd9017f490 RCX: 00007fba015134ce RDX: 0000000000000040 RSI: 0000000000000000 RDI: 0000000000000004 RBP: 0000000000000500 R08: 0000000000000560 R09: 0000000000000052 R10: 000055bd901800e0 R11: 0000000000000246 R12: 000055bd90180140 R13: 000055bd901800e0 R14: 000055bd9017f498 R15: 000055bd9017ff10 Modules linked in: xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay zram zsmalloc mlx4_ib mlx4_en mlx4_core rpcrdma rdma_ucm ib_uverbs ib_iser libiscsi scsi_transport_iscsi fuse ib_umad rdma_cm ib_ipoib iw_cm ib_cm ib_core ---[ end trace 0000000000000000 ]--- [akpm@linux-foundation.org: simplification, per Uros] Link: https://lkml.kernel.org/r/20241219121828.2120780-1-gal@nvidia.com Fixes: dabddd687c9e ("percpu: cast percpu pointer in PERCPU_PTR() via unsigned long") Signed-off-by: Gal Pressman Closes: https://lore.kernel.org/all/7590f546-4021-4602-9252-0d525de35b52@nvidia.com Cc: Uros Bizjak Cc: Bill Wendling Cc: Christoph Lameter Cc: Dennis Zhou Cc: Justin Stitt Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Tejun Heo Signed-off-by: Andrew Morton --- include/linux/percpu-defs.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 35842d1e38796..5b520fe86b609 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -221,10 +221,7 @@ do { \ } while (0) #define PERCPU_PTR(__p) \ -({ \ - unsigned long __pcpu_ptr = (__force unsigned long)(__p); \ - (typeof(*(__p)) __force __kernel *)(__pcpu_ptr); \ -}) + (typeof(*(__p)) __force __kernel *)((__force unsigned long)(__p)) #ifdef CONFIG_SMP From d0e6983a6d1719738cf8d13982a68094f0a1872a Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 19 Dec 2024 15:30:08 +0800 Subject: [PATCH 1087/1450] mm: shmem: fix incorrect index alignment for within_size policy With enabling the shmem per-size within_size policy, using an incorrect 'order' size to round_up() the index can lead to incorrect i_size checks, resulting in an inappropriate large orders being returned. Changing to use '1 << order' to round_up() the index to fix this issue. Additionally, adding an 'aligned_index' variable to avoid affecting the index checks. Link: https://lkml.kernel.org/r/77d8ef76a7d3d646e9225e9af88a76549a68aab1.1734593154.git.baolin.wang@linux.alibaba.com Fixes: e7a2ab7b3bb5 ("mm: shmem: add mTHP support for anonymous shmem") Signed-off-by: Baolin Wang Acked-by: David Hildenbrand Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton --- mm/shmem.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index f6fb053ac50dc..dec659e84562e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1689,6 +1689,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, unsigned long mask = READ_ONCE(huge_shmem_orders_always); unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size); unsigned long vm_flags = vma ? vma->vm_flags : 0; + pgoff_t aligned_index; bool global_huge; loff_t i_size; int order; @@ -1723,9 +1724,9 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, /* Allow mTHP that will be fully within i_size. */ order = highest_order(within_size_orders); while (within_size_orders) { - index = round_up(index + 1, order); + aligned_index = round_up(index + 1, 1 << order); i_size = round_up(i_size_read(inode), PAGE_SIZE); - if (i_size >> PAGE_SHIFT >= index) { + if (i_size >> PAGE_SHIFT >= aligned_index) { mask |= within_size_orders; break; } From d77b90d2b2642655b5f60953c36ad887257e1802 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 19 Dec 2024 15:30:09 +0800 Subject: [PATCH 1088/1450] mm: shmem: fix the update of 'shmem_falloc->nr_unswapped' The 'shmem_falloc->nr_unswapped' is used to record how many writepage refused to swap out because fallocate() is allocating, but after shmem supports large folio swap out, the update of 'shmem_falloc->nr_unswapped' does not use the correct number of pages in the large folio, which may lead to fallocate() not exiting as soon as possible. Anyway, this is found through code inspection, and I am not sure whether it would actually cause serious issues. Link: https://lkml.kernel.org/r/f66a0119d0564c2c37c84f045835b870d1b2196f.1734593154.git.baolin.wang@linux.alibaba.com Fixes: 809bc86517cc ("mm: shmem: support large folio swap out") Signed-off-by: Baolin Wang Acked-by: David Hildenbrand Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton --- mm/shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index dec659e84562e..ac58d4fb2e6f5 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1535,7 +1535,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) !shmem_falloc->waitq && index >= shmem_falloc->start && index < shmem_falloc->next) - shmem_falloc->nr_unswapped++; + shmem_falloc->nr_unswapped += nr_pages; else shmem_falloc = NULL; spin_unlock(&inode->i_lock); From adcfb264c3ed51fbbf5068ddf10d309a63683868 Mon Sep 17 00:00:00 2001 From: Koichiro Den Date: Sat, 21 Dec 2024 12:33:20 +0900 Subject: [PATCH 1089/1450] vmstat: disable vmstat_work on vmstat_cpu_down_prep() Even after mm/vmstat:online teardown, shepherd may still queue work for the dying cpu until the cpu is removed from online mask. While it's quite rare, this means that after unbind_workers() unbinds a per-cpu kworker, it potentially runs vmstat_update for the dying CPU on an irrelevant cpu before entering atomic AP states. When CONFIG_DEBUG_PREEMPT=y, it results in the following error with the backtrace. BUG: using smp_processor_id() in preemptible [00000000] code: \ kworker/7:3/1702 caller is refresh_cpu_vm_stats+0x235/0x5f0 CPU: 0 UID: 0 PID: 1702 Comm: kworker/7:3 Tainted: G Tainted: [N]=TEST Workqueue: mm_percpu_wq vmstat_update Call Trace: dump_stack_lvl+0x8d/0xb0 check_preemption_disabled+0xce/0xe0 refresh_cpu_vm_stats+0x235/0x5f0 vmstat_update+0x17/0xa0 process_one_work+0x869/0x1aa0 worker_thread+0x5e5/0x1100 kthread+0x29e/0x380 ret_from_fork+0x2d/0x70 ret_from_fork_asm+0x1a/0x30 So, for mm/vmstat:online, disable vmstat_work reliably on teardown and symmetrically enable it on startup. Link: https://lkml.kernel.org/r/20241221033321.4154409-1-koichiro.den@canonical.com Signed-off-by: Koichiro Den Cc: Sebastian Andrzej Siewior Cc: Signed-off-by: Andrew Morton --- mm/vmstat.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 4d016314a56c9..0889b75cef149 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -2148,13 +2148,14 @@ static int vmstat_cpu_online(unsigned int cpu) if (!node_state(cpu_to_node(cpu), N_CPU)) { node_set_state(cpu_to_node(cpu), N_CPU); } + enable_delayed_work(&per_cpu(vmstat_work, cpu)); return 0; } static int vmstat_cpu_down_prep(unsigned int cpu) { - cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); + disable_delayed_work_sync(&per_cpu(vmstat_work, cpu)); return 0; } From 98a6abc6cec186bdc3d94c162227cc8e003de76c Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Mon, 23 Dec 2024 23:09:07 +0800 Subject: [PATCH 1090/1450] mm/list_lru: fix false warning of negative counter commit 2788cf0c401c ("memcg: reparent list_lrus and free kmemcg_id on css offline") removed sanity checks for the nr_items counter's value because it implemented list_lru re-parenting in a way that will redirect children's list_lru to the parent before re-parenting the items in list_lru. This will make item counter uncharging happen in the parent while the item is still being held by the child. As a result, the parent's counter value may become negative. This is acceptable because re-parenting will sum up the children's counter values, and the parent's counter will be fixed. Later commit fb56fdf8b9a2 ("mm/list_lru: split the lock to per-cgroup scope") reworked the re-parenting process, and removed the redirect. So it added the sanity check back, assuming that as long as items are still in the children's list_lru, parent's counter will not be uncharged. But that assumption is incorrect. The xas_store in memcg_reparent_list_lrus will set children's list_lru to NULL before re-parenting the items, it redirects list_lru helpers to use parent's list_lru just like before. But still, it's not a problem as re-parenting will fix the counter. Therefore, remove this sanity check, but add a new check to ensure that the counter won't go negative in a different way: the child's list_lru being re-parented should never have a negative counter, since re-parenting should occur in order and fixes counters. Link: https://lkml.kernel.org/r/20241223150907.1591-1-ryncsn@gmail.com Fixes: fb56fdf8b9a2 ("mm/list_lru: split the lock to per-cgroup scope") Signed-off-by: Kairui Song Closes: https://lore.kernel.org/lkml/Z2Bz9t92Be9l1xqj@lappy/ Cc: Chengming Zhou Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Michal Hocko Cc: Muchun Song Cc: Qi Zheng Cc: Roman Gushchin Cc: Sasha Levin Cc: Shakeel Butt Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/list_lru.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index f93ada6a207b1..7d69434c70e0a 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -77,7 +77,6 @@ lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg, spin_lock(&l->lock); nr_items = READ_ONCE(l->nr_items); if (likely(nr_items != LONG_MIN)) { - WARN_ON(nr_items < 0); rcu_read_unlock(); return l; } @@ -450,6 +449,7 @@ static void memcg_reparent_list_lru_one(struct list_lru *lru, int nid, list_splice_init(&src->list, &dst->list); if (src->nr_items) { + WARN_ON(src->nr_items < 0); dst->nr_items += src->nr_items; set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); } From 8debfc5b1aa569d3d2ac836af2553da037611c61 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 22 Dec 2024 15:12:21 -0800 Subject: [PATCH 1091/1450] mm/damon/core: fix new damon_target objects leaks on damon_commit_targets() Patch series "mm/damon/core: fix memory leaks and ignored inputs from damon_commit_ctx()". Due to two bugs in damon_commit_targets() and damon_commit_schemes(), which are called from damon_commit_ctx(), some user inputs can be ignored, and some mmeory objects can be leaked. Fix those. Note that only DAMON sysfs interface users are affected. Other DAMON core API user modules that more focused more on simple and dedicated production usages, including DAMON_RECLAIM and DAMON_LRU_SORT are not using the buggy function in the way, so not affected. This patch (of 2): When new DAMON targets are added via damon_commit_targets(), the newly created targets are not deallocated when updating the internal data (damon_commit_target()) is failed. Worse yet, even if the setup is successfully done, the new target is not linked to the context. Hence, the new targets are always leaked regardless of the internal data setup failure. Fix the leaks. Link: https://lkml.kernel.org/r/20241222231222.85060-2-sj@kernel.org Fixes: 9cb3d0b9dfce ("mm/damon/core: implement DAMON context commit function") Signed-off-by: SeongJae Park Cc: Signed-off-by: Andrew Morton --- mm/damon/core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 8b8e2933dcd4a..dc52361f18634 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -961,8 +961,11 @@ static int damon_commit_targets( return -ENOMEM; err = damon_commit_target(new_target, false, src_target, damon_target_has_pid(src)); - if (err) + if (err) { + damon_destroy_target(new_target); return err; + } + damon_add_target(dst, new_target); } return 0; } From 7d390b53067ef745e2d9bee5a9683df4c96b80a0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 22 Dec 2024 15:12:22 -0800 Subject: [PATCH 1092/1450] mm/damon/core: fix ignored quota goals and filters of newly committed schemes damon_commit_schemes() ignores quota goals and filters of the newly committed schemes. This makes users confused about the behaviors. Correctly handle those inputs. Link: https://lkml.kernel.org/r/20241222231222.85060-3-sj@kernel.org Fixes: 9cb3d0b9dfce ("mm/damon/core: implement DAMON context commit function") Signed-off-by: SeongJae Park Cc: Signed-off-by: Andrew Morton --- mm/damon/core.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index dc52361f18634..0776452a1abbb 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -868,6 +868,11 @@ static int damon_commit_schemes(struct damon_ctx *dst, struct damon_ctx *src) NUMA_NO_NODE); if (!new_scheme) return -ENOMEM; + err = damos_commit(new_scheme, src_scheme); + if (err) { + damon_destroy_scheme(new_scheme); + return err; + } damon_add_scheme(dst, new_scheme); } return 0; From 62e72d2cf702a5e2fb53d9c46ed900d9384e4a06 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Sun, 22 Dec 2024 20:29:36 +0800 Subject: [PATCH 1093/1450] mm, madvise: fix potential workingset node list_lru leaks Since commit 5abc1e37afa0 ("mm: list_lru: allocate list_lru_one only when needed"), all list_lru users need to allocate the items using the new infrastructure that provides list_lru info for slab allocation, ensuring that the corresponding memcg list_lru is allocated before use. For workingset shadow nodes (which are xa_node), users are converted to use the new infrastructure by commit 9bbdc0f32409 ("xarray: use kmem_cache_alloc_lru to allocate xa_node"). The xas->xa_lru will be set correctly for filemap users. However, there is a missing case: xa_node allocations caused by madvise(..., MADV_COLLAPSE). madvise(..., MADV_COLLAPSE) will also read in the absent parts of file map, and there will be xa_nodes allocated for the caller's memcg (assuming it's not rootcg). However, these allocations won't trigger memcg list_lru allocation because the proper xas info was not set. If nothing else has allocated other xa_nodes for that memcg to trigger list_lru creation, and memory pressure starts to evict file pages, workingset_update_node will try to add these xa_nodes to their corresponding memcg list_lru, and it does not exist (NULL). So they will be added to rootcg's list_lru instead. This shouldn't be a significant issue in practice, but it is indeed unexpected behavior, and these xa_nodes will not be reclaimed effectively. And may lead to incorrect counting of the list_lru->nr_items counter. This problem wasn't exposed until recent commit 28e98022b31ef ("mm/list_lru: simplify reparenting and initial allocation") added a sanity check: only dying memcg could have a NULL list_lru when list_lru_{add,del} is called. This problem triggered this WARNING. So make madvise(..., MADV_COLLAPSE) also call xas_set_lru() to pass the list_lru which we may want to insert xa_node into later. And move mapping_set_update to mm/internal.h, and turn into a macro to avoid including extra headers in mm/internal.h. Link: https://lkml.kernel.org/r/20241222122936.67501-1-ryncsn@gmail.com Fixes: 9bbdc0f32409 ("xarray: use kmem_cache_alloc_lru to allocate xa_node") Reported-by: syzbot+38a0cbd267eff2d286ff@syzkaller.appspotmail.com Closes: https://lore.kernel.org/lkml/675d01e9.050a0220.37aaf.00be.GAE@google.com/ Signed-off-by: Kairui Song Cc: Chengming Zhou Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Qi Zheng Cc: Roman Gushchin Cc: Sasha Levin Cc: Shakeel Butt Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/filemap.c | 9 --------- mm/internal.h | 6 ++++++ mm/khugepaged.c | 3 +++ 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index f61cf51c22389..33b60d448fca5 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -124,15 +124,6 @@ * ->private_lock (zap_pte_range->block_dirty_folio) */ -static void mapping_set_update(struct xa_state *xas, - struct address_space *mapping) -{ - if (dax_mapping(mapping) || shmem_mapping(mapping)) - return; - xas_set_update(xas, workingset_update_node); - xas_set_lru(xas, &shadow_nodes); -} - static void page_cache_delete(struct address_space *mapping, struct folio *folio, void *shadow) { diff --git a/mm/internal.h b/mm/internal.h index 3bd08bafad042..9826f7dce6072 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1504,6 +1504,12 @@ static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry, /* Only track the nodes of mappings with shadow entries */ void workingset_update_node(struct xa_node *node); extern struct list_lru shadow_nodes; +#define mapping_set_update(xas, mapping) do { \ + if (!dax_mapping(mapping) && !shmem_mapping(mapping)) { \ + xas_set_update(xas, workingset_update_node); \ + xas_set_lru(xas, &shadow_nodes); \ + } \ +} while (0) /* mremap.c */ unsigned long move_page_tables(struct vm_area_struct *vma, diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 6f8d46d107b4b..653dbb1ff05c0 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -1837,6 +1838,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, if (result != SCAN_SUCCEED) goto out; + mapping_set_update(&xas, mapping); + __folio_set_locked(new_folio); if (is_shmem) __folio_set_swapbacked(new_folio); From dd2a5b5514ab0e690f018595e34dd1fcb981d345 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sat, 21 Dec 2024 16:47:29 +0900 Subject: [PATCH 1094/1450] mm/util: make memdup_user_nul() similar to memdup_user() Since the string data to copy from userspace is likely less than PAGE_SIZE bytes, replace GFP_KERNEL with GFP_USER like commit 6c2c97a24f09 ("memdup_user(): switch to GFP_USER") does and add __GFP_NOWARN like commit 6c8fcc096be9 ("mm: don't let userspace spam allocations warnings") does. Also, use dedicated slab buckets like commit d73778e4b867 ("mm/util: Use dedicated slab buckets for memdup_user()") does. Link: https://lkml.kernel.org/r/014cd694-cc27-4a07-a34a-2ae95d744515@I-love.SAKURA.ne.jp Reported-by: syzbot+7e12e97b36154c54414b@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=7e12e97b36154c54414b Signed-off-by: Tetsuo Handa Signed-off-by: Andrew Morton --- mm/util.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/mm/util.c b/mm/util.c index c1c3b06ab4f94..60aa40f612b87 100644 --- a/mm/util.c +++ b/mm/util.c @@ -297,12 +297,7 @@ void *memdup_user_nul(const void __user *src, size_t len) { char *p; - /* - * Always use GFP_KERNEL, since copy_from_user() can sleep and - * cause pagefault, which makes it pointless to use GFP_NOFS - * or GFP_ATOMIC. - */ - p = kmalloc_track_caller(len + 1, GFP_KERNEL); + p = kmem_buckets_alloc_track_caller(user_buckets, len + 1, GFP_USER | __GFP_NOWARN); if (!p) return ERR_PTR(-ENOMEM); From 0210d251162f4033350a94a43f95b1c39ec84a90 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Thu, 26 Dec 2024 22:03:32 +0800 Subject: [PATCH 1095/1450] scripts/sorttable: fix orc_sort_cmp() to maintain symmetry and transitivity The orc_sort_cmp() function, used with qsort(), previously violated the symmetry and transitivity rules required by the C standard. Specifically, when both entries are ORC_TYPE_UNDEFINED, it could result in both a < b and b < a, which breaks the required symmetry and transitivity. This can lead to undefined behavior and incorrect sorting results, potentially causing memory corruption in glibc implementations [1]. Symmetry: If x < y, then y > x. Transitivity: If x < y and y < z, then x < z. Fix the comparison logic to return 0 when both entries are ORC_TYPE_UNDEFINED, ensuring compliance with qsort() requirements. Link: https://www.qualys.com/2024/01/30/qsort.txt [1] Link: https://lkml.kernel.org/r/20241226140332.2670689-1-visitorckw@gmail.com Fixes: 57fa18994285 ("scripts/sorttable: Implement build-time ORC unwind table sorting") Fixes: fb799447ae29 ("x86,objtool: Split UNWIND_HINT_EMPTY in two") Signed-off-by: Kuan-Wei Chiu Cc: Ching-Chun (Jim) Huang Cc: Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Shile Zhang Cc: Steven Rostedt Cc: Signed-off-by: Andrew Morton --- scripts/sorttable.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/sorttable.h b/scripts/sorttable.h index 7bd0184380d3b..a7c5445baf002 100644 --- a/scripts/sorttable.h +++ b/scripts/sorttable.h @@ -110,7 +110,7 @@ static inline unsigned long orc_ip(const int *ip) static int orc_sort_cmp(const void *_a, const void *_b) { - struct orc_entry *orc_a; + struct orc_entry *orc_a, *orc_b; const int *a = g_orc_ip_table + *(int *)_a; const int *b = g_orc_ip_table + *(int *)_b; unsigned long a_val = orc_ip(a); @@ -128,6 +128,9 @@ static int orc_sort_cmp(const void *_a, const void *_b) * whitelisted .o files which didn't get objtool generation. */ orc_a = g_orc_table + (a - g_orc_ip_table); + orc_b = g_orc_table + (b - g_orc_ip_table); + if (orc_a->type == ORC_TYPE_UNDEFINED && orc_b->type == ORC_TYPE_UNDEFINED) + return 0; return orc_a->type == ORC_TYPE_UNDEFINED ? -1 : 1; } From e7404921818d676da4d7143ce78659456b05e2af Mon Sep 17 00:00:00 2001 From: "Chester A. Unal" Date: Wed, 25 Dec 2024 15:50:41 +0300 Subject: [PATCH 1096/1450] =?UTF-8?q?MAINTAINERS:=20change=20Ar=C4=B1n?= =?UTF-8?q?=C3=A7=20=5FNAL's=20name=20and=20email=20address?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit My legal name now includes Chester. Change the name and the email address sections to reflect that. Link: https://lkml.kernel.org/r/20241225-for-unknown-upstream-v1-1-3e35e4d5e161@arinc9.com Signed-off-by: Chester A. Unal Signed-off-by: Andrew Morton --- MAINTAINERS | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 910305c11e8a8..22fa261cb60e7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14756,7 +14756,7 @@ F: drivers/memory/mtk-smi.c F: include/soc/mediatek/smi.h MEDIATEK SWITCH DRIVER -M: Arınç ÜNAL +M: Chester A. Unal M: Daniel Golle M: DENG Qingfang M: Sean Wang @@ -18460,7 +18460,7 @@ F: Documentation/devicetree/bindings/pinctrl/mediatek,mt8183-pinctrl.yaml F: drivers/pinctrl/mediatek/ PIN CONTROLLER - MEDIATEK MIPS -M: Arınç ÜNAL +M: Chester A. Unal M: Sergio Paracuellos L: linux-mediatek@lists.infradead.org (moderated for non-subscribers) L: linux-mips@vger.kernel.org @@ -19504,7 +19504,7 @@ S: Maintained F: arch/mips/ralink RALINK MT7621 MIPS ARCHITECTURE -M: Arınç ÜNAL +M: Chester A. Unal M: Sergio Paracuellos L: linux-mips@vger.kernel.org S: Maintained From 4f619d518db9cd1a933c3a095a5f95d0c1584ae8 Mon Sep 17 00:00:00 2001 From: Jinjian Song Date: Tue, 24 Dec 2024 12:15:52 +0800 Subject: [PATCH 1097/1450] net: wwan: t7xx: Fix FSM command timeout issue When driver processes the internal state change command, it use an asynchronous thread to process the command operation. If the main thread detects that the task has timed out, the asynchronous thread will panic when executing the completion notification because the main thread completion object has been released. BUG: unable to handle page fault for address: fffffffffffffff8 PGD 1f283a067 P4D 1f283a067 PUD 1f283c067 PMD 0 Oops: 0000 [#1] PREEMPT SMP NOPTI RIP: 0010:complete_all+0x3e/0xa0 [...] Call Trace: ? __die_body+0x68/0xb0 ? page_fault_oops+0x379/0x3e0 ? exc_page_fault+0x69/0xa0 ? asm_exc_page_fault+0x22/0x30 ? complete_all+0x3e/0xa0 fsm_main_thread+0xa3/0x9c0 [mtk_t7xx (HASH:1400 5)] ? __pfx_autoremove_wake_function+0x10/0x10 kthread+0xd8/0x110 ? __pfx_fsm_main_thread+0x10/0x10 [mtk_t7xx (HASH:1400 5)] ? __pfx_kthread+0x10/0x10 ret_from_fork+0x38/0x50 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1b/0x30 [...] CR2: fffffffffffffff8 ---[ end trace 0000000000000000 ]--- Use the reference counter to ensure safe release as Sergey suggests: https://lore.kernel.org/all/da90f64c-260a-4329-87bf-1f9ff20a5951@gmail.com/ Fixes: 13e920d93e37 ("net: wwan: t7xx: Add core components") Signed-off-by: Jinjian Song Acked-by: Sergey Ryazanov Link: https://patch.msgid.link/20241224041552.8711-1-jinjian.song@fibocom.com Signed-off-by: Jakub Kicinski --- drivers/net/wwan/t7xx/t7xx_state_monitor.c | 26 ++++++++++++++-------- drivers/net/wwan/t7xx/t7xx_state_monitor.h | 5 +++-- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/drivers/net/wwan/t7xx/t7xx_state_monitor.c b/drivers/net/wwan/t7xx/t7xx_state_monitor.c index 3931c7a13f5ab..cbdbb91e8381f 100644 --- a/drivers/net/wwan/t7xx/t7xx_state_monitor.c +++ b/drivers/net/wwan/t7xx/t7xx_state_monitor.c @@ -104,14 +104,21 @@ void t7xx_fsm_broadcast_state(struct t7xx_fsm_ctl *ctl, enum md_state state) fsm_state_notify(ctl->md, state); } +static void fsm_release_command(struct kref *ref) +{ + struct t7xx_fsm_command *cmd = container_of(ref, typeof(*cmd), refcnt); + + kfree(cmd); +} + static void fsm_finish_command(struct t7xx_fsm_ctl *ctl, struct t7xx_fsm_command *cmd, int result) { if (cmd->flag & FSM_CMD_FLAG_WAIT_FOR_COMPLETION) { - *cmd->ret = result; - complete_all(cmd->done); + cmd->result = result; + complete_all(&cmd->done); } - kfree(cmd); + kref_put(&cmd->refcnt, fsm_release_command); } static void fsm_del_kf_event(struct t7xx_fsm_event *event) @@ -475,7 +482,6 @@ static int fsm_main_thread(void *data) int t7xx_fsm_append_cmd(struct t7xx_fsm_ctl *ctl, enum t7xx_fsm_cmd_state cmd_id, unsigned int flag) { - DECLARE_COMPLETION_ONSTACK(done); struct t7xx_fsm_command *cmd; unsigned long flags; int ret; @@ -487,11 +493,13 @@ int t7xx_fsm_append_cmd(struct t7xx_fsm_ctl *ctl, enum t7xx_fsm_cmd_state cmd_id INIT_LIST_HEAD(&cmd->entry); cmd->cmd_id = cmd_id; cmd->flag = flag; + kref_init(&cmd->refcnt); if (flag & FSM_CMD_FLAG_WAIT_FOR_COMPLETION) { - cmd->done = &done; - cmd->ret = &ret; + init_completion(&cmd->done); + kref_get(&cmd->refcnt); } + kref_get(&cmd->refcnt); spin_lock_irqsave(&ctl->command_lock, flags); list_add_tail(&cmd->entry, &ctl->command_queue); spin_unlock_irqrestore(&ctl->command_lock, flags); @@ -501,11 +509,11 @@ int t7xx_fsm_append_cmd(struct t7xx_fsm_ctl *ctl, enum t7xx_fsm_cmd_state cmd_id if (flag & FSM_CMD_FLAG_WAIT_FOR_COMPLETION) { unsigned long wait_ret; - wait_ret = wait_for_completion_timeout(&done, + wait_ret = wait_for_completion_timeout(&cmd->done, msecs_to_jiffies(FSM_CMD_TIMEOUT_MS)); - if (!wait_ret) - return -ETIMEDOUT; + ret = wait_ret ? cmd->result : -ETIMEDOUT; + kref_put(&cmd->refcnt, fsm_release_command); return ret; } diff --git a/drivers/net/wwan/t7xx/t7xx_state_monitor.h b/drivers/net/wwan/t7xx/t7xx_state_monitor.h index 7b0a9baf488c1..6e0601bb752e5 100644 --- a/drivers/net/wwan/t7xx/t7xx_state_monitor.h +++ b/drivers/net/wwan/t7xx/t7xx_state_monitor.h @@ -110,8 +110,9 @@ struct t7xx_fsm_command { struct list_head entry; enum t7xx_fsm_cmd_state cmd_id; unsigned int flag; - struct completion *done; - int *ret; + struct completion done; + int result; + struct kref refcnt; }; struct t7xx_fsm_notifier { From afc6717628f959941d7b33728570568b4af1c4b8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 31 Dec 2024 00:06:46 -0500 Subject: [PATCH 1098/1450] tracing: Have process_string() also allow arrays In order to catch a common bug where a TRACE_EVENT() TP_fast_assign() assigns an address of an allocated string to the ring buffer and then references it in TP_printk(), which can be executed hours later when the string is free, the function test_event_printk() runs on all events as they are registered to make sure there's no unwanted dereferencing. It calls process_string() to handle cases in TP_printk() format that has "%s". It returns whether or not the string is safe. But it can have some false positives. For instance, xe_bo_move() has: TP_printk("move_lacks_source:%s, migrate object %p [size %zu] from %s to %s device_id:%s", __entry->move_lacks_source ? "yes" : "no", __entry->bo, __entry->size, xe_mem_type_to_name[__entry->old_placement], xe_mem_type_to_name[__entry->new_placement], __get_str(device_id)) Where the "%s" references into xe_mem_type_to_name[]. This is an array of pointers that should be safe for the event to access. Instead of flagging this as a bad reference, if a reference points to an array, where the record field is the index, consider it safe. Link: https://lore.kernel.org/all/9dee19b6185d325d0e6fa5f7cbba81d007d99166.camel@sapience.com/ Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241231000646.324fb5f7@gandalf.local.home Fixes: 65a25d9f7ac02 ("tracing: Add "%s" check in test_event_printk()") Reported-by: Genes Lists Tested-by: Gene C Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 1545cc8b49d02..770e7ed917161 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -364,6 +364,18 @@ static bool process_string(const char *fmt, int len, struct trace_event_call *ca s = r + 1; } while (s < e); + /* + * Check for arrays. If the argument has: foo[REC->val] + * then it is very likely that foo is an array of strings + * that are safe to use. + */ + r = strstr(s, "["); + if (r && r < e) { + r = strstr(r, "REC->"); + if (r && r < e) + return true; + } + /* * If there's any strings in the argument consider this arg OK as it * could be: REC->field ? "foo" : "bar" and we don't want to get into From fee873761bd978d077d8c55334b4966ac4cb7b59 Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Fri, 13 Dec 2024 13:08:37 +0800 Subject: [PATCH 1099/1450] exfat: fix the infinite loop in exfat_readdir() If the file system is corrupted so that a cluster is linked to itself in the cluster chain, and there is an unused directory entry in the cluster, 'dentry' will not be incremented, causing condition 'dentry < max_dentries' unable to prevent an infinite loop. This infinite loop causes s_lock not to be released, and other tasks will hang, such as exfat_sync_fs(). This commit stops traversing the cluster chain when there is unused directory entry in the cluster to avoid this infinite loop. Reported-by: syzbot+205c2644abdff9d3f9fc@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=205c2644abdff9d3f9fc Tested-by: syzbot+205c2644abdff9d3f9fc@syzkaller.appspotmail.com Fixes: ca06197382bd ("exfat: add directory operations") Signed-off-by: Yuezhang Mo Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/dir.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index fe0a9b8a0cd07..3103b932b6746 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -122,7 +122,7 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent type = exfat_get_entry_type(ep); if (type == TYPE_UNUSED) { brelse(bh); - break; + goto out; } if (type != TYPE_FILE && type != TYPE_DIR) { @@ -170,6 +170,7 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent } } +out: dir_entry->namebuf.lfn[0] = '\0'; *cpos = EXFAT_DEN_TO_B(dentry); return 0; From 98e2fb26d1a9eafe79f46d15d54e68e014d81d8c Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Thu, 12 Dec 2024 16:29:23 +0800 Subject: [PATCH 1100/1450] exfat: fix the new buffer was not zeroed before writing Before writing, if a buffer_head marked as new, its data must be zeroed, otherwise uninitialized data in the page cache will be written. So this commit uses folio_zero_new_buffers() to zero the new buffers before ->write_end(). Fixes: 6630ea49103c ("exfat: move extend valid_size into ->page_mkwrite()") Reported-by: syzbot+91ae49e1c1a2634d20c0@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=91ae49e1c1a2634d20c0 Tested-by: syzbot+91ae49e1c1a2634d20c0@syzkaller.appspotmail.com Signed-off-by: Yuezhang Mo Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/file.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/exfat/file.c b/fs/exfat/file.c index fb38769c3e39d..05b51e7217838 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -545,6 +545,7 @@ static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size) while (pos < new_valid_size) { u32 len; struct folio *folio; + unsigned long off; len = PAGE_SIZE - (pos & (PAGE_SIZE - 1)); if (pos + len > new_valid_size) @@ -554,6 +555,9 @@ static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size) if (err) goto out; + off = offset_in_folio(folio, pos); + folio_zero_new_buffers(folio, off, off + len); + err = ops->write_end(file, mapping, pos, len, len, folio, NULL); if (err < 0) goto out; @@ -563,6 +567,8 @@ static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size) cond_resched(); } + return 0; + out: return err; } From a5324b3a488d883aa2d42f72260054e87d0940a0 Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Mon, 16 Dec 2024 13:39:42 +0800 Subject: [PATCH 1101/1450] exfat: fix the infinite loop in __exfat_free_cluster() In __exfat_free_cluster(), the cluster chain is traversed until the EOF cluster. If the cluster chain includes a loop due to file system corruption, the EOF cluster cannot be traversed, resulting in an infinite loop. This commit uses the total number of clusters to prevent this infinite loop. Reported-by: syzbot+1de5a37cb85a2d536330@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=1de5a37cb85a2d536330 Tested-by: syzbot+1de5a37cb85a2d536330@syzkaller.appspotmail.com Fixes: 31023864e67a ("exfat: add fat entry operations") Signed-off-by: Yuezhang Mo Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/fatent.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c index 773c320d68f3f..9e5492ac409b0 100644 --- a/fs/exfat/fatent.c +++ b/fs/exfat/fatent.c @@ -216,6 +216,16 @@ static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain if (err) goto dec_used_clus; + + if (num_clusters >= sbi->num_clusters - EXFAT_FIRST_CLUSTER) { + /* + * The cluster chain includes a loop, scan the + * bitmap to get the number of used clusters. + */ + exfat_count_used_clusters(sb, &sbi->used_clusters); + + return 0; + } } while (clu != EXFAT_EOF_CLUSTER); } From 7b509910b3ad6d7aacead24c8744de10daf8715d Mon Sep 17 00:00:00 2001 From: Daniel Schaefer Date: Tue, 31 Dec 2024 12:59:58 +0800 Subject: [PATCH 1102/1450] ALSA hda/realtek: Add quirk for Framework F111:000C Similar to commit eb91c456f371 ("ALSA: hda/realtek: Add Framework Laptop 13 (Intel Core Ultra) to quirks") and previous quirks for Framework systems with Realtek codecs. 000C is a new platform that will also have an ALC285 codec and needs the same quirk. Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: linux@frame.work Cc: Dustin L. Howett Signed-off-by: Daniel Schaefer Cc: Link: https://patch.msgid.link/20241231045958.14545-1-dhs@frame.work Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 61ba5dc35b8b8..b74b566f675e9 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -11009,6 +11009,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0xf111, 0x0001, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE), SND_PCI_QUIRK(0xf111, 0x0006, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE), SND_PCI_QUIRK(0xf111, 0x0009, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0xf111, 0x000c, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE), #if 0 /* Below is a quirk table taken from the old code. From fb514b31395946022f13a08e06a435f53cf9e8b3 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 31 Dec 2024 09:34:16 +0800 Subject: [PATCH 1103/1450] RDMA/rtrs: Ensure 'ib_sge list' is accessible Move the declaration of the 'ib_sge list' variable outside the 'always_invalidate' block to ensure it remains accessible for use throughout the function. Previously, 'ib_sge list' was declared within the 'always_invalidate' block, limiting its accessibility, then caused a 'BUG: kernel NULL pointer dereference'[1]. ? __die_body.cold+0x19/0x27 ? page_fault_oops+0x15a/0x2d0 ? search_module_extables+0x19/0x60 ? search_bpf_extables+0x5f/0x80 ? exc_page_fault+0x7e/0x180 ? asm_exc_page_fault+0x26/0x30 ? memcpy_orig+0xd5/0x140 rxe_mr_copy+0x1c3/0x200 [rdma_rxe] ? rxe_pool_get_index+0x4b/0x80 [rdma_rxe] copy_data+0xa5/0x230 [rdma_rxe] rxe_requester+0xd9b/0xf70 [rdma_rxe] ? finish_task_switch.isra.0+0x99/0x2e0 rxe_sender+0x13/0x40 [rdma_rxe] do_task+0x68/0x1e0 [rdma_rxe] process_one_work+0x177/0x330 worker_thread+0x252/0x390 ? __pfx_worker_thread+0x10/0x10 This change ensures the variable is available for subsequent operations that require it. [1] https://lore.kernel.org/linux-rdma/6a1f3e8f-deb0-49f9-bc69-a9b03ecfcda7@fujitsu.com/ Fixes: 9cb837480424 ("RDMA/rtrs: server: main functionality") Signed-off-by: Li Zhijian Link: https://patch.msgid.link/20241231013416.1290920-1-lizhijian@fujitsu.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs-srv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index e83d956478521..ef4abdea3c2d2 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -349,6 +349,7 @@ static int send_io_resp_imm(struct rtrs_srv_con *con, struct rtrs_srv_op *id, struct rtrs_srv_mr *srv_mr; bool need_inval = false; enum ib_send_flags flags; + struct ib_sge list; u32 imm; int err; @@ -401,7 +402,6 @@ static int send_io_resp_imm(struct rtrs_srv_con *con, struct rtrs_srv_op *id, imm = rtrs_to_io_rsp_imm(id->msg_id, errno, need_inval); imm_wr.wr.next = NULL; if (always_invalidate) { - struct ib_sge list; struct rtrs_msg_rkey_rsp *msg; srv_mr = &srv_path->mrs[id->msg_id]; From e6178bf78d0378c2d397a6aafaf4882d0af643fa Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Tue, 31 Dec 2024 08:20:08 +0530 Subject: [PATCH 1104/1450] RDMA/bnxt_re: Fix error recovery sequence Fixed to return ENXIO from __send_message_basic_sanity() to indicate that device is in error state. In the case of ERR_DEVICE_DETACHED state, the driver should not post the commands to the firmware as it will time out eventually. Removed bnxt_re_modify_qp() call from bnxt_re_dev_stop() as it is a no-op. Fixes: cc5b9b48d447 ("RDMA/bnxt_re: Recover the device when FW error is detected") Signed-off-by: Kalesh AP Signed-off-by: Kashyap Desai Link: https://patch.msgid.link/20241231025008.2267162-1-kalesh-anakkur.purayil@broadcom.com Reviewed-by: Selvin Xavier Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/main.c | 8 +------- drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 5 +++-- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index b7af0d5ff3b64..c143f273b7596 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -1715,11 +1715,8 @@ static bool bnxt_re_is_qp1_or_shadow_qp(struct bnxt_re_dev *rdev, static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev) { - int mask = IB_QP_STATE; - struct ib_qp_attr qp_attr; struct bnxt_re_qp *qp; - qp_attr.qp_state = IB_QPS_ERR; mutex_lock(&rdev->qp_lock); list_for_each_entry(qp, &rdev->qp_list, list) { /* Modify the state of all QPs except QP1/Shadow QP */ @@ -1727,12 +1724,9 @@ static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev) if (qp->qplib_qp.state != CMDQ_MODIFY_QP_NEW_STATE_RESET && qp->qplib_qp.state != - CMDQ_MODIFY_QP_NEW_STATE_ERR) { + CMDQ_MODIFY_QP_NEW_STATE_ERR) bnxt_re_dispatch_event(&rdev->ibdev, &qp->ib_qp, 1, IB_EVENT_QP_FATAL); - bnxt_re_modify_qp(&qp->ib_qp, &qp_attr, mask, - NULL); - } } } mutex_unlock(&rdev->qp_lock); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 5e90ea232de80..17e62f22683b1 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -424,7 +424,8 @@ static int __send_message_basic_sanity(struct bnxt_qplib_rcfw *rcfw, /* Prevent posting if f/w is not in a state to process */ if (test_bit(ERR_DEVICE_DETACHED, &rcfw->cmdq.flags)) - return bnxt_qplib_map_rc(opcode); + return -ENXIO; + if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags)) return -ETIMEDOUT; @@ -493,7 +494,7 @@ static int __bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, rc = __send_message_basic_sanity(rcfw, msg, opcode); if (rc) - return rc; + return rc == -ENXIO ? bnxt_qplib_map_rc(opcode) : rc; rc = __send_message(rcfw, msg, opcode); if (rc) From 8765429279e7d3d68d39ace5f84af2815174bb1e Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 31 Dec 2024 15:53:58 +0100 Subject: [PATCH 1105/1450] ALSA: seq: Check UMP support for midi_version change When the kernel is built without UMP support but a user-space app requires the midi_version > 0, the kernel should return an error. Otherwise user-space assumes as if it were possible to deal, eventually hitting serious errors later. Fixes: 46397622a3fa ("ALSA: seq: Add UMP support") Cc: Link: https://patch.msgid.link/20241231145358.21946-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/seq/seq_clientmgr.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/sound/core/seq/seq_clientmgr.c b/sound/core/seq/seq_clientmgr.c index 3930e2f9082f4..77b6ac9b5c11b 100644 --- a/sound/core/seq/seq_clientmgr.c +++ b/sound/core/seq/seq_clientmgr.c @@ -1275,10 +1275,16 @@ static int snd_seq_ioctl_set_client_info(struct snd_seq_client *client, if (client->type != client_info->type) return -EINVAL; - /* check validity of midi_version field */ - if (client->user_pversion >= SNDRV_PROTOCOL_VERSION(1, 0, 3) && - client_info->midi_version > SNDRV_SEQ_CLIENT_UMP_MIDI_2_0) - return -EINVAL; + if (client->user_pversion >= SNDRV_PROTOCOL_VERSION(1, 0, 3)) { + /* check validity of midi_version field */ + if (client_info->midi_version > SNDRV_SEQ_CLIENT_UMP_MIDI_2_0) + return -EINVAL; + + /* check if UMP is supported in kernel */ + if (!IS_ENABLED(CONFIG_SND_SEQ_UMP) && + client_info->midi_version > 0) + return -EINVAL; + } /* fill the info fields */ if (client_info->name[0]) From 7bac65687510038390a0a54cbe14fba08d037e46 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Thu, 19 Dec 2024 22:20:41 +0530 Subject: [PATCH 1106/1450] scsi: ufs: qcom: Power off the PHY if it was already powered on in ufs_qcom_power_up_sequence() PHY might already be powered on during ufs_qcom_power_up_sequence() in a couple of cases: 1. During UFSHCD_QUIRK_REINIT_AFTER_MAX_GEAR_SWITCH quirk 2. Resuming from spm_lvl = 5 suspend In those cases, it is necessary to call phy_power_off() and phy_exit() in ufs_qcom_power_up_sequence() function to power off the PHY before calling phy_init() and phy_power_on(). Case (1) is doing it via ufs_qcom_reinit_notify() callback, but case (2) is not handled. So to satisfy both cases, call phy_power_off() and phy_exit() if the phy_count is non-zero. And with this change, the reinit_notify() callback is no longer needed. This fixes the below UFS resume failure with spm_lvl = 5: ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: ufshcd_host_reset_and_restore: Host init failed -5 ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: ufshcd_host_reset_and_restore: Host init failed -5 ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: ufshcd_host_reset_and_restore: Host init failed -5 ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: ufshcd_host_reset_and_restore: Host init failed -5 ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: ufshcd_host_reset_and_restore: Host init failed -5 ufs_device_wlun 0:0:0:49488: ufshcd_wl_resume failed: -5 ufs_device_wlun 0:0:0:49488: PM: dpm_run_callback(): scsi_bus_resume returns -5 ufs_device_wlun 0:0:0:49488: PM: failed to resume async: error -5 Cc: stable@vger.kernel.org # 6.3 Fixes: baf5ddac90dc ("scsi: ufs: ufs-qcom: Add support for reinitializing the UFS device") Reported-by: Ram Kumar Dwivedi Tested-by: Amit Pundir # on SM8550-HDK Reviewed-by: Bart Van Assche Tested-by: Neil Armstrong # on SM8550-QRD Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20241219-ufs-qcom-suspend-fix-v3-1-63c4b95a70b9@linaro.org Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd-priv.h | 6 ------ drivers/ufs/core/ufshcd.c | 1 - drivers/ufs/host/ufs-qcom.c | 13 +++++-------- include/ufs/ufshcd.h | 2 -- 4 files changed, 5 insertions(+), 17 deletions(-) diff --git a/drivers/ufs/core/ufshcd-priv.h b/drivers/ufs/core/ufshcd-priv.h index 9ffd94ddf8c7c..786f20ef22386 100644 --- a/drivers/ufs/core/ufshcd-priv.h +++ b/drivers/ufs/core/ufshcd-priv.h @@ -237,12 +237,6 @@ static inline void ufshcd_vops_config_scaling_param(struct ufs_hba *hba, hba->vops->config_scaling_param(hba, p, data); } -static inline void ufshcd_vops_reinit_notify(struct ufs_hba *hba) -{ - if (hba->vops && hba->vops->reinit_notify) - hba->vops->reinit_notify(hba); -} - static inline int ufshcd_vops_mcq_config_resource(struct ufs_hba *hba) { if (hba->vops && hba->vops->mcq_config_resource) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 8a01e4393159d..d1e19a2ccf49d 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -8858,7 +8858,6 @@ static int ufshcd_probe_hba(struct ufs_hba *hba, bool init_dev_params) ufshcd_device_reset(hba); ufs_put_device_desc(hba); ufshcd_hba_stop(hba); - ufshcd_vops_reinit_notify(hba); ret = ufshcd_hba_enable(hba); if (ret) { dev_err(hba->dev, "Host controller enable failed\n"); diff --git a/drivers/ufs/host/ufs-qcom.c b/drivers/ufs/host/ufs-qcom.c index 68040b2ab5f82..e770e7b9d2392 100644 --- a/drivers/ufs/host/ufs-qcom.c +++ b/drivers/ufs/host/ufs-qcom.c @@ -368,6 +368,11 @@ static int ufs_qcom_power_up_sequence(struct ufs_hba *hba) if (ret) return ret; + if (phy->power_count) { + phy_power_off(phy); + phy_exit(phy); + } + /* phy initialization - calibrate the phy */ ret = phy_init(phy); if (ret) { @@ -1579,13 +1584,6 @@ static void ufs_qcom_config_scaling_param(struct ufs_hba *hba, } #endif -static void ufs_qcom_reinit_notify(struct ufs_hba *hba) -{ - struct ufs_qcom_host *host = ufshcd_get_variant(hba); - - phy_power_off(host->generic_phy); -} - /* Resources */ static const struct ufshcd_res_info ufs_res_info[RES_MAX] = { {.name = "ufs_mem",}, @@ -1825,7 +1823,6 @@ static const struct ufs_hba_variant_ops ufs_hba_qcom_vops = { .device_reset = ufs_qcom_device_reset, .config_scaling_param = ufs_qcom_config_scaling_param, .program_key = ufs_qcom_ice_program_key, - .reinit_notify = ufs_qcom_reinit_notify, .mcq_config_resource = ufs_qcom_mcq_config_resource, .get_hba_mac = ufs_qcom_get_hba_mac, .op_runtime_config = ufs_qcom_op_runtime_config, diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index d650ae6b58d39..74e5b9960c546 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -329,7 +329,6 @@ struct ufs_pwr_mode_info { * @program_key: program or evict an inline encryption key * @fill_crypto_prdt: initialize crypto-related fields in the PRDT * @event_notify: called to notify important events - * @reinit_notify: called to notify reinit of UFSHCD during max gear switch * @mcq_config_resource: called to configure MCQ platform resources * @get_hba_mac: reports maximum number of outstanding commands supported by * the controller. Should be implemented for UFSHCI 4.0 or later @@ -381,7 +380,6 @@ struct ufs_hba_variant_ops { void *prdt, unsigned int num_segments); void (*event_notify)(struct ufs_hba *hba, enum ufs_event_type evt, void *data); - void (*reinit_notify)(struct ufs_hba *); int (*mcq_config_resource)(struct ufs_hba *hba); int (*get_hba_mac)(struct ufs_hba *hba); int (*op_runtime_config)(struct ufs_hba *hba); From bb9850704c043e48c86cc9df90ee102e8a338229 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Thu, 19 Dec 2024 22:20:42 +0530 Subject: [PATCH 1107/1450] scsi: ufs: core: Honor runtime/system PM levels if set by host controller drivers Otherwise, the default levels will override the levels set by the host controller drivers. Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20241219-ufs-qcom-suspend-fix-v3-2-63c4b95a70b9@linaro.org Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index d1e19a2ccf49d..9c26e8767515b 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -10590,14 +10590,17 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) } /* - * Set the default power management level for runtime and system PM. + * Set the default power management level for runtime and system PM if + * not set by the host controller drivers. * Default power saving mode is to keep UFS link in Hibern8 state * and UFS device in sleep state. */ - hba->rpm_lvl = ufs_get_desired_pm_lvl_for_dev_link_state( + if (!hba->rpm_lvl) + hba->rpm_lvl = ufs_get_desired_pm_lvl_for_dev_link_state( UFS_SLEEP_PWR_MODE, UIC_LINK_HIBERN8_STATE); - hba->spm_lvl = ufs_get_desired_pm_lvl_for_dev_link_state( + if (!hba->spm_lvl) + hba->spm_lvl = ufs_get_desired_pm_lvl_for_dev_link_state( UFS_SLEEP_PWR_MODE, UIC_LINK_HIBERN8_STATE); From 4f78a56af4c472834681759d4365fb93921da77d Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Thu, 19 Dec 2024 22:20:43 +0530 Subject: [PATCH 1108/1450] scsi: ufs: qcom: Allow passing platform specific OF data In order to allow platform specific flags and configurations, introduce the platform specific OF data and move the existing quirk UFSHCD_QUIRK_BROKEN_LSDBS_CAP for SM8550 and SM8650 SoCs. Reviewed-by: Avri Altman Reviewed-by: Neil Armstrong Tested-by: Amit Pundir # on SM8550-HDK Reviewed-by: Bart Van Assche Tested-by: Neil Armstrong # on SM8550-QRD Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20241219-ufs-qcom-suspend-fix-v3-3-63c4b95a70b9@linaro.org Signed-off-by: Martin K. Petersen --- drivers/ufs/host/ufs-qcom.c | 13 +++++++++---- drivers/ufs/host/ufs-qcom.h | 4 ++++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/ufs/host/ufs-qcom.c b/drivers/ufs/host/ufs-qcom.c index e770e7b9d2392..7042322d55e98 100644 --- a/drivers/ufs/host/ufs-qcom.c +++ b/drivers/ufs/host/ufs-qcom.c @@ -871,6 +871,7 @@ static u32 ufs_qcom_get_ufs_hci_version(struct ufs_hba *hba) */ static void ufs_qcom_advertise_quirks(struct ufs_hba *hba) { + const struct ufs_qcom_drvdata *drvdata = of_device_get_match_data(hba->dev); struct ufs_qcom_host *host = ufshcd_get_variant(hba); if (host->hw_ver.major == 0x2) @@ -879,9 +880,8 @@ static void ufs_qcom_advertise_quirks(struct ufs_hba *hba) if (host->hw_ver.major > 0x3) hba->quirks |= UFSHCD_QUIRK_REINIT_AFTER_MAX_GEAR_SWITCH; - if (of_device_is_compatible(hba->dev->of_node, "qcom,sm8550-ufshc") || - of_device_is_compatible(hba->dev->of_node, "qcom,sm8650-ufshc")) - hba->quirks |= UFSHCD_QUIRK_BROKEN_LSDBS_CAP; + if (drvdata && drvdata->quirks) + hba->quirks |= drvdata->quirks; } static void ufs_qcom_set_phy_gear(struct ufs_qcom_host *host) @@ -1865,9 +1865,14 @@ static void ufs_qcom_remove(struct platform_device *pdev) platform_device_msi_free_irqs_all(hba->dev); } +static const struct ufs_qcom_drvdata ufs_qcom_sm8550_drvdata = { + .quirks = UFSHCD_QUIRK_BROKEN_LSDBS_CAP, +}; + static const struct of_device_id ufs_qcom_of_match[] __maybe_unused = { { .compatible = "qcom,ufshc" }, - { .compatible = "qcom,sm8550-ufshc" }, + { .compatible = "qcom,sm8550-ufshc", .data = &ufs_qcom_sm8550_drvdata }, + { .compatible = "qcom,sm8650-ufshc", .data = &ufs_qcom_sm8550_drvdata }, {}, }; MODULE_DEVICE_TABLE(of, ufs_qcom_of_match); diff --git a/drivers/ufs/host/ufs-qcom.h b/drivers/ufs/host/ufs-qcom.h index b9de170983c9e..15f6dad8b27fb 100644 --- a/drivers/ufs/host/ufs-qcom.h +++ b/drivers/ufs/host/ufs-qcom.h @@ -217,6 +217,10 @@ struct ufs_qcom_host { bool esi_enabled; }; +struct ufs_qcom_drvdata { + enum ufshcd_quirks quirks; +}; + static inline u32 ufs_qcom_get_debug_reg_offset(struct ufs_qcom_host *host, u32 reg) { From 3b2f56860b05bf0cea86af786fd9b7faa8fe3ef3 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Thu, 19 Dec 2024 22:20:44 +0530 Subject: [PATCH 1109/1450] scsi: ufs: qcom: Power down the controller/device during system suspend for SM8550/SM8650 SoCs SM8550 and SM8650 SoCs doesn't support UFS PHY retention. So once these SoCs reaches the low power state (CX power collapse) during system suspend, all the PHY hardware state gets lost. This leads to the UFS resume failure: ufshcd-qcom 1d84000.ufs: ufshcd_uic_hibern8_exit: hibern8 exit failed. ret = 5 ufshcd-qcom 1d84000.ufs: __ufshcd_wl_resume: hibern8 exit failed 5 ufs_device_wlun 0:0:0:49488: ufshcd_wl_resume failed: 5 ufs_device_wlun 0:0:0:49488: PM: dpm_run_callback(): scsi_bus_resume+0x0/0x84 returns 5 ufs_device_wlun 0:0:0:49488: PM: failed to resume async: error 5 With the default system suspend level of UFS_PM_LVL_3, the power domain for UFS PHY needs to be kept always ON to retain the state. But this would prevent these SoCs from reaching the CX power collapse state, leading to poor power saving during system suspend. So to fix this issue without affecting the power saving, set 'ufs_qcom_drvdata::no_phy_retention' to true which sets 'hba->spm_lvl' to UFS_PM_LVL_5 to allow both the controller and device (in turn the PHY) to be powered down during system suspend for these SoCs by default. Cc: stable@vger.kernel.org # 6.3 Fixes: 35cf1aaab169 ("arm64: dts: qcom: sm8550: Add UFS host controller and phy nodes") Fixes: 10e024671295 ("arm64: dts: qcom: sm8650: add interconnect dependent device nodes") Reported-by: Neil Armstrong Tested-by: Amit Pundir # on SM8550-HDK Reviewed-by: Bart Van Assche Tested-by: Neil Armstrong # on SM8550-QRD Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20241219-ufs-qcom-suspend-fix-v3-4-63c4b95a70b9@linaro.org Signed-off-by: Martin K. Petersen --- drivers/ufs/host/ufs-qcom.c | 5 +++++ drivers/ufs/host/ufs-qcom.h | 1 + 2 files changed, 6 insertions(+) diff --git a/drivers/ufs/host/ufs-qcom.c b/drivers/ufs/host/ufs-qcom.c index 7042322d55e98..91e94fe990b4a 100644 --- a/drivers/ufs/host/ufs-qcom.c +++ b/drivers/ufs/host/ufs-qcom.c @@ -1069,6 +1069,7 @@ static int ufs_qcom_init(struct ufs_hba *hba) struct device *dev = hba->dev; struct ufs_qcom_host *host; struct ufs_clk_info *clki; + const struct ufs_qcom_drvdata *drvdata = of_device_get_match_data(hba->dev); host = devm_kzalloc(dev, sizeof(*host), GFP_KERNEL); if (!host) @@ -1148,6 +1149,9 @@ static int ufs_qcom_init(struct ufs_hba *hba) dev_warn(dev, "%s: failed to configure the testbus %d\n", __func__, err); + if (drvdata && drvdata->no_phy_retention) + hba->spm_lvl = UFS_PM_LVL_5; + return 0; out_variant_clear: @@ -1867,6 +1871,7 @@ static void ufs_qcom_remove(struct platform_device *pdev) static const struct ufs_qcom_drvdata ufs_qcom_sm8550_drvdata = { .quirks = UFSHCD_QUIRK_BROKEN_LSDBS_CAP, + .no_phy_retention = true, }; static const struct of_device_id ufs_qcom_of_match[] __maybe_unused = { diff --git a/drivers/ufs/host/ufs-qcom.h b/drivers/ufs/host/ufs-qcom.h index 15f6dad8b27fb..919f53682beba 100644 --- a/drivers/ufs/host/ufs-qcom.h +++ b/drivers/ufs/host/ufs-qcom.h @@ -219,6 +219,7 @@ struct ufs_qcom_host { struct ufs_qcom_drvdata { enum ufshcd_quirks quirks; + bool no_phy_retention; }; static inline u32 From f0ed39830e6064d62f9c5393505677a26569bb56 Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Fri, 20 Dec 2024 09:19:18 -0800 Subject: [PATCH 1110/1450] xe/oa: Fix query mode of operation for OAR/OAC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a set of squashed commits to facilitate smooth applying to stable. Each commit message is retained for reference. 1) Allow a GGTT mapped batch to be submitted to user exec queue For a OA use case, one of the HW registers needs to be modified by submitting an MI_LOAD_REGISTER_IMM command to the users exec queue, so that the register is modified in the user's hardware context. In order to do this a batch that is mapped in GGTT, needs to be submitted to the user exec queue. Since all user submissions use q->vm and hence PPGTT, add some plumbing to enable submission of batches mapped in GGTT. v2: ggtt is zero-initialized, so no need to set it false (Matt Brost) 2) xe/oa: Use MI_LOAD_REGISTER_IMMEDIATE to enable OAR/OAC To enable OAR/OAC, a bit in RING_CONTEXT_CONTROL needs to be set. Setting this bit cause the context image size to change and if not done correct, can cause undesired hangs. Current code uses a separate exec_queue to modify this bit and is error-prone. As per HW recommendation, submit MI_LOAD_REGISTER_IMM to the target hardware context to modify the relevant bit. In v2 version, an attempt to submit everything to the user-queue was made, but it failed the unprivileged-single-ctx-counters test. It appears that the OACTXCONTROL must be modified from a remote context. In v3 version, all context specific register configurations were moved to use LOAD_REGISTER_IMMEDIATE and that seems to work well. This is a cleaner way, since we can now submit all configuration to user exec_queue and the fence handling is simplified. v2: (Matt) - set job->ggtt to true if create job is successful - unlock vm on job error (Ashutosh) - don't wait on job submission - use kernel exec queue where possible v3: (Ashutosh) - Fix checkpatch issues - Remove extra spaces/new-lines - Add Fixes: and Cc: tags - Reset context control bit when OA stream is closed - Submit all config via MI_LOAD_REGISTER_IMMEDIATE (Umesh) - Update commit message for v3 experiment - Squash patches for easier port to stable v4: (Ashutosh) - No need to pass q to xe_oa_submit_bb - Do not support exec queues with width > 1 - Fix disabling of CTX_CTRL_OAC_CONTEXT_ENABLE v5: (Ashutosh) - Drop reg_lri related comments - Use XE_OA_SUBMIT_NO_DEPS in xe_oa_load_with_lri Fixes: 8135f1c09dd2 ("drm/xe/oa: Don't reset OAC_CONTEXT_ENABLE on OA stream close") Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: Matthew Brost # commit 1 Reviewed-by: Ashutosh Dixit Cc: stable@vger.kernel.org Reviewed-by: Jonathan Cavitt Signed-off-by: Ashutosh Dixit Link: https://patchwork.freedesktop.org/patch/msgid/20241220171919.571528-2-umesh.nerlige.ramappa@intel.com (cherry picked from commit 55039832f98c7e05f1cf9e0d8c12b2490abd0f16) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_oa.c | 134 ++++++++---------------- drivers/gpu/drm/xe/xe_ring_ops.c | 5 +- drivers/gpu/drm/xe/xe_sched_job_types.h | 2 + 3 files changed, 51 insertions(+), 90 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c index 8dd55798ab312..5cc0f6f9bc113 100644 --- a/drivers/gpu/drm/xe/xe_oa.c +++ b/drivers/gpu/drm/xe/xe_oa.c @@ -74,12 +74,6 @@ struct xe_oa_config { struct rcu_head rcu; }; -struct flex { - struct xe_reg reg; - u32 offset; - u32 value; -}; - struct xe_oa_open_param { struct xe_file *xef; u32 oa_unit_id; @@ -596,19 +590,38 @@ static __poll_t xe_oa_poll(struct file *file, poll_table *wait) return ret; } +static void xe_oa_lock_vma(struct xe_exec_queue *q) +{ + if (q->vm) { + down_read(&q->vm->lock); + xe_vm_lock(q->vm, false); + } +} + +static void xe_oa_unlock_vma(struct xe_exec_queue *q) +{ + if (q->vm) { + xe_vm_unlock(q->vm); + up_read(&q->vm->lock); + } +} + static struct dma_fence *xe_oa_submit_bb(struct xe_oa_stream *stream, enum xe_oa_submit_deps deps, struct xe_bb *bb) { + struct xe_exec_queue *q = stream->exec_q ?: stream->k_exec_q; struct xe_sched_job *job; struct dma_fence *fence; int err = 0; - /* Kernel configuration is issued on stream->k_exec_q, not stream->exec_q */ - job = xe_bb_create_job(stream->k_exec_q, bb); + xe_oa_lock_vma(q); + + job = xe_bb_create_job(q, bb); if (IS_ERR(job)) { err = PTR_ERR(job); goto exit; } + job->ggtt = true; if (deps == XE_OA_SUBMIT_ADD_DEPS) { for (int i = 0; i < stream->num_syncs && !err; i++) @@ -623,10 +636,13 @@ static struct dma_fence *xe_oa_submit_bb(struct xe_oa_stream *stream, enum xe_oa fence = dma_fence_get(&job->drm.s_fence->finished); xe_sched_job_push(job); + xe_oa_unlock_vma(q); + return fence; err_put_job: xe_sched_job_put(job); exit: + xe_oa_unlock_vma(q); return ERR_PTR(err); } @@ -675,63 +691,19 @@ static void xe_oa_free_configs(struct xe_oa_stream *stream) dma_fence_put(stream->last_fence); } -static void xe_oa_store_flex(struct xe_oa_stream *stream, struct xe_lrc *lrc, - struct xe_bb *bb, const struct flex *flex, u32 count) -{ - u32 offset = xe_bo_ggtt_addr(lrc->bo); - - do { - bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1); - bb->cs[bb->len++] = offset + flex->offset * sizeof(u32); - bb->cs[bb->len++] = 0; - bb->cs[bb->len++] = flex->value; - - } while (flex++, --count); -} - -static int xe_oa_modify_ctx_image(struct xe_oa_stream *stream, struct xe_lrc *lrc, - const struct flex *flex, u32 count) -{ - struct dma_fence *fence; - struct xe_bb *bb; - int err; - - bb = xe_bb_new(stream->gt, 4 * count, false); - if (IS_ERR(bb)) { - err = PTR_ERR(bb); - goto exit; - } - - xe_oa_store_flex(stream, lrc, bb, flex, count); - - fence = xe_oa_submit_bb(stream, XE_OA_SUBMIT_NO_DEPS, bb); - if (IS_ERR(fence)) { - err = PTR_ERR(fence); - goto free_bb; - } - xe_bb_free(bb, fence); - dma_fence_put(fence); - - return 0; -free_bb: - xe_bb_free(bb, NULL); -exit: - return err; -} - -static int xe_oa_load_with_lri(struct xe_oa_stream *stream, struct xe_oa_reg *reg_lri) +static int xe_oa_load_with_lri(struct xe_oa_stream *stream, struct xe_oa_reg *reg_lri, u32 count) { struct dma_fence *fence; struct xe_bb *bb; int err; - bb = xe_bb_new(stream->gt, 3, false); + bb = xe_bb_new(stream->gt, 2 * count + 1, false); if (IS_ERR(bb)) { err = PTR_ERR(bb); goto exit; } - write_cs_mi_lri(bb, reg_lri, 1); + write_cs_mi_lri(bb, reg_lri, count); fence = xe_oa_submit_bb(stream, XE_OA_SUBMIT_NO_DEPS, bb); if (IS_ERR(fence)) { @@ -751,71 +723,55 @@ static int xe_oa_load_with_lri(struct xe_oa_stream *stream, struct xe_oa_reg *re static int xe_oa_configure_oar_context(struct xe_oa_stream *stream, bool enable) { const struct xe_oa_format *format = stream->oa_buffer.format; - struct xe_lrc *lrc = stream->exec_q->lrc[0]; - u32 regs_offset = xe_lrc_regs_offset(lrc) / sizeof(u32); u32 oacontrol = __format_to_oactrl(format, OAR_OACONTROL_COUNTER_SEL_MASK) | (enable ? OAR_OACONTROL_COUNTER_ENABLE : 0); - struct flex regs_context[] = { + struct xe_oa_reg reg_lri[] = { { OACTXCONTROL(stream->hwe->mmio_base), - stream->oa->ctx_oactxctrl_offset[stream->hwe->class] + 1, enable ? OA_COUNTER_RESUME : 0, }, + { + OAR_OACONTROL, + oacontrol, + }, { RING_CONTEXT_CONTROL(stream->hwe->mmio_base), - regs_offset + CTX_CONTEXT_CONTROL, - _MASKED_BIT_ENABLE(CTX_CTRL_OAC_CONTEXT_ENABLE), + _MASKED_FIELD(CTX_CTRL_OAC_CONTEXT_ENABLE, + enable ? CTX_CTRL_OAC_CONTEXT_ENABLE : 0) }, }; - struct xe_oa_reg reg_lri = { OAR_OACONTROL, oacontrol }; - int err; - - /* Modify stream hwe context image with regs_context */ - err = xe_oa_modify_ctx_image(stream, stream->exec_q->lrc[0], - regs_context, ARRAY_SIZE(regs_context)); - if (err) - return err; - /* Apply reg_lri using LRI */ - return xe_oa_load_with_lri(stream, ®_lri); + return xe_oa_load_with_lri(stream, reg_lri, ARRAY_SIZE(reg_lri)); } static int xe_oa_configure_oac_context(struct xe_oa_stream *stream, bool enable) { const struct xe_oa_format *format = stream->oa_buffer.format; - struct xe_lrc *lrc = stream->exec_q->lrc[0]; - u32 regs_offset = xe_lrc_regs_offset(lrc) / sizeof(u32); u32 oacontrol = __format_to_oactrl(format, OAR_OACONTROL_COUNTER_SEL_MASK) | (enable ? OAR_OACONTROL_COUNTER_ENABLE : 0); - struct flex regs_context[] = { + struct xe_oa_reg reg_lri[] = { { OACTXCONTROL(stream->hwe->mmio_base), - stream->oa->ctx_oactxctrl_offset[stream->hwe->class] + 1, enable ? OA_COUNTER_RESUME : 0, }, + { + OAC_OACONTROL, + oacontrol + }, { RING_CONTEXT_CONTROL(stream->hwe->mmio_base), - regs_offset + CTX_CONTEXT_CONTROL, - _MASKED_BIT_ENABLE(CTX_CTRL_OAC_CONTEXT_ENABLE) | + _MASKED_FIELD(CTX_CTRL_OAC_CONTEXT_ENABLE, + enable ? CTX_CTRL_OAC_CONTEXT_ENABLE : 0) | _MASKED_FIELD(CTX_CTRL_RUN_ALONE, enable ? CTX_CTRL_RUN_ALONE : 0), }, }; - struct xe_oa_reg reg_lri = { OAC_OACONTROL, oacontrol }; - int err; /* Set ccs select to enable programming of OAC_OACONTROL */ xe_mmio_write32(&stream->gt->mmio, __oa_regs(stream)->oa_ctrl, __oa_ccs_select(stream)); - /* Modify stream hwe context image with regs_context */ - err = xe_oa_modify_ctx_image(stream, stream->exec_q->lrc[0], - regs_context, ARRAY_SIZE(regs_context)); - if (err) - return err; - - /* Apply reg_lri using LRI */ - return xe_oa_load_with_lri(stream, ®_lri); + return xe_oa_load_with_lri(stream, reg_lri, ARRAY_SIZE(reg_lri)); } static int xe_oa_configure_oa_context(struct xe_oa_stream *stream, bool enable) @@ -2066,8 +2022,8 @@ int xe_oa_stream_open_ioctl(struct drm_device *dev, u64 data, struct drm_file *f if (XE_IOCTL_DBG(oa->xe, !param.exec_q)) return -ENOENT; - if (param.exec_q->width > 1) - drm_dbg(&oa->xe->drm, "exec_q->width > 1, programming only exec_q->lrc[0]\n"); + if (XE_IOCTL_DBG(oa->xe, param.exec_q->width > 1)) + return -EOPNOTSUPP; } /* diff --git a/drivers/gpu/drm/xe/xe_ring_ops.c b/drivers/gpu/drm/xe/xe_ring_ops.c index 0be4f489d3e12..9f327f27c0726 100644 --- a/drivers/gpu/drm/xe/xe_ring_ops.c +++ b/drivers/gpu/drm/xe/xe_ring_ops.c @@ -221,7 +221,10 @@ static int emit_pipe_imm_ggtt(u32 addr, u32 value, bool stall_only, u32 *dw, static u32 get_ppgtt_flag(struct xe_sched_job *job) { - return job->q->vm ? BIT(8) : 0; + if (job->q->vm && !job->ggtt) + return BIT(8); + + return 0; } static int emit_copy_timestamp(struct xe_lrc *lrc, u32 *dw, int i) diff --git a/drivers/gpu/drm/xe/xe_sched_job_types.h b/drivers/gpu/drm/xe/xe_sched_job_types.h index f13f333f00beb..d942b20a9f29c 100644 --- a/drivers/gpu/drm/xe/xe_sched_job_types.h +++ b/drivers/gpu/drm/xe/xe_sched_job_types.h @@ -56,6 +56,8 @@ struct xe_sched_job { u32 migrate_flush_flags; /** @ring_ops_flush_tlb: The ring ops need to flush TLB before payload. */ bool ring_ops_flush_tlb; + /** @ggtt: mapped in ggtt. */ + bool ggtt; /** @ptrs: per instance pointers. */ struct xe_job_ptrs ptrs[]; }; From 0bc21e701a6ffacfdde7f04f87d664d82e8a13bf Mon Sep 17 00:00:00 2001 From: Olof Johansson Date: Thu, 2 Jan 2025 06:30:03 -0800 Subject: [PATCH 1111/1450] MAINTAINERS: Remove Olof from SoC maintainers I haven't been an active participant for a couple of years now, and after discussions at Linux Plumbers in 2024, Arnd is getting fresh help from a few more participants. It's time to remove myself, and spare myself from patches and pull requests in my inbox. Signed-off-by: Olof Johansson Cc: Arnd Bergmann Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 910305c11e8a8..c575de4903dbf 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1797,7 +1797,6 @@ F: include/uapi/linux/if_arcnet.h ARM AND ARM64 SoC SUB-ARCHITECTURES (COMMON PARTS) M: Arnd Bergmann -M: Olof Johansson L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) L: soc@lists.linux.dev S: Maintained From d65474033740ded0a4fe9a097fce72328655b41d Mon Sep 17 00:00:00 2001 From: Zilin Guan Date: Tue, 31 Dec 2024 11:37:31 +0000 Subject: [PATCH 1112/1450] fgraph: Add READ_ONCE() when accessing fgraph_array[] In __ftrace_return_to_handler(), a loop iterates over the fgraph_array[] elements, which are fgraph_ops. The loop checks if an element is a fgraph_stub to prevent using a fgraph_stub afterward. However, if the compiler reloads fgraph_array[] after this check, it might race with an update to fgraph_array[] that introduces a fgraph_stub. This could result in the stub being processed, but the stub contains a null "func_hash" field, leading to a NULL pointer dereference. To ensure that the gops compared against the fgraph_stub matches the gops processed later, add a READ_ONCE(). A similar patch appears in commit 63a8dfb ("function_graph: Add READ_ONCE() when accessing fgraph_array[]"). Cc: stable@vger.kernel.org Fixes: 37238abe3cb47 ("ftrace/function_graph: Pass fgraph_ops to function graph callbacks") Link: https://lore.kernel.org/20241231113731.277668-1-zilin@seu.edu.cn Signed-off-by: Zilin Guan Signed-off-by: Steven Rostedt (Google) --- kernel/trace/fgraph.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index ddedcb50917f4..30e3ddc8a8a84 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -833,7 +833,7 @@ static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs #endif { for_each_set_bit(i, &bitmap, sizeof(bitmap) * BITS_PER_BYTE) { - struct fgraph_ops *gops = fgraph_array[i]; + struct fgraph_ops *gops = READ_ONCE(fgraph_array[i]); if (gops == &fgraph_stub) continue; From 789a8cff8d2dbe4b5c617c3004b5eb63fa7a3b35 Mon Sep 17 00:00:00 2001 From: Kohei Enju Date: Thu, 2 Jan 2025 04:08:20 +0900 Subject: [PATCH 1113/1450] ftrace: Fix function profiler's filtering functionality Commit c132be2c4fcc ("function_graph: Have the instances use their own ftrace_ops for filtering"), function profiler (enabled via function_profile_enabled) has been showing statistics for all functions, ignoring set_ftrace_filter settings. While tracers are instantiated, the function profiler is not. Therefore, it should use the global set_ftrace_filter for consistency. This patch modifies the function profiler to use the global filter, fixing the filtering functionality. Before (filtering not working): ``` root@localhost:~# echo 'vfs*' > /sys/kernel/tracing/set_ftrace_filter root@localhost:~# echo 1 > /sys/kernel/tracing/function_profile_enabled root@localhost:~# sleep 1 root@localhost:~# echo 0 > /sys/kernel/tracing/function_profile_enabled root@localhost:~# head /sys/kernel/tracing/trace_stat/* Function Hit Time Avg s^2 -------- --- ---- --- --- schedule 314 22290594 us 70989.15 us 40372231 us x64_sys_call 1527 8762510 us 5738.382 us 3414354 us schedule_hrtimeout_range 176 8665356 us 49234.98 us 405618876 us __x64_sys_ppoll 324 5656635 us 17458.75 us 19203976 us do_sys_poll 324 5653747 us 17449.83 us 19214945 us schedule_timeout 67 5531396 us 82558.15 us 2136740827 us __x64_sys_pselect6 12 3029540 us 252461.7 us 63296940171 us do_pselect.constprop.0 12 3029532 us 252461.0 us 63296952931 us ``` After (filtering working): ``` root@localhost:~# echo 'vfs*' > /sys/kernel/tracing/set_ftrace_filter root@localhost:~# echo 1 > /sys/kernel/tracing/function_profile_enabled root@localhost:~# sleep 1 root@localhost:~# echo 0 > /sys/kernel/tracing/function_profile_enabled root@localhost:~# head /sys/kernel/tracing/trace_stat/* Function Hit Time Avg s^2 -------- --- ---- --- --- vfs_write 462 68476.43 us 148.217 us 25874.48 us vfs_read 641 9611.356 us 14.994 us 28868.07 us vfs_fstat 890 878.094 us 0.986 us 1.667 us vfs_fstatat 227 757.176 us 3.335 us 18.928 us vfs_statx 226 610.610 us 2.701 us 17.749 us vfs_getattr_nosec 1187 460.919 us 0.388 us 0.326 us vfs_statx_path 297 343.287 us 1.155 us 11.116 us vfs_rename 6 291.575 us 48.595 us 9889.236 us ``` Cc: stable@vger.kernel.org Link: https://lore.kernel.org/20250101190820.72534-1-enjuk@amazon.com Fixes: c132be2c4fcc ("function_graph: Have the instances use their own ftrace_ops for filtering") Signed-off-by: Kohei Enju Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9b17efb1a87dd..2e113f8b13a28 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -902,16 +902,13 @@ static void profile_graph_return(struct ftrace_graph_ret *trace, } static struct fgraph_ops fprofiler_ops = { - .ops = { - .flags = FTRACE_OPS_FL_INITIALIZED, - INIT_OPS_HASH(fprofiler_ops.ops) - }, .entryfunc = &profile_graph_entry, .retfunc = &profile_graph_return, }; static int register_ftrace_profiler(void) { + ftrace_ops_set_global_filter(&fprofiler_ops.ops); return register_ftrace_graph(&fprofiler_ops); } @@ -922,12 +919,11 @@ static void unregister_ftrace_profiler(void) #else static struct ftrace_ops ftrace_profile_ops __read_mostly = { .func = function_profile_call, - .flags = FTRACE_OPS_FL_INITIALIZED, - INIT_OPS_HASH(ftrace_profile_ops) }; static int register_ftrace_profiler(void) { + ftrace_ops_set_global_filter(&ftrace_profile_ops); return register_ftrace_function(&ftrace_profile_ops); } From c6e60a0a68b7e6b3c7e33863a16e8e88ba9eee6f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 2 Jan 2025 16:32:51 -0700 Subject: [PATCH 1114/1450] io_uring/net: always initialize kmsg->msg.msg_inq upfront syzbot reports that ->msg_inq may get used uinitialized from the following path: BUG: KMSAN: uninit-value in io_recv_buf_select io_uring/net.c:1094 [inline] BUG: KMSAN: uninit-value in io_recv+0x930/0x1f90 io_uring/net.c:1158 io_recv_buf_select io_uring/net.c:1094 [inline] io_recv+0x930/0x1f90 io_uring/net.c:1158 io_issue_sqe+0x420/0x2130 io_uring/io_uring.c:1740 io_queue_sqe io_uring/io_uring.c:1950 [inline] io_req_task_submit+0xfa/0x1d0 io_uring/io_uring.c:1374 io_handle_tw_list+0x55f/0x5c0 io_uring/io_uring.c:1057 tctx_task_work_run+0x109/0x3e0 io_uring/io_uring.c:1121 tctx_task_work+0x6d/0xc0 io_uring/io_uring.c:1139 task_work_run+0x268/0x310 kernel/task_work.c:239 io_run_task_work+0x43a/0x4a0 io_uring/io_uring.h:343 io_cqring_wait io_uring/io_uring.c:2527 [inline] __do_sys_io_uring_enter io_uring/io_uring.c:3439 [inline] __se_sys_io_uring_enter+0x204f/0x4ce0 io_uring/io_uring.c:3330 __x64_sys_io_uring_enter+0x11f/0x1a0 io_uring/io_uring.c:3330 x64_sys_call+0xce5/0x3c30 arch/x86/include/generated/asm/syscalls_64.h:427 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x1e0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f and it is correct, as it's never initialized upfront. Hence the first submission can end up using it uninitialized, if the recv wasn't successful and the networking stack didn't honor ->msg_get_inq being set and filling in the output value of ->msg_inq as requested. Set it to 0 upfront when it's allocated, just to silence this KMSAN warning. There's no side effect of using it uninitialized, it'll just potentially cause the next receive to use a recv value hint that's not accurate. Fixes: c6f32c7d9e09 ("io_uring/net: get rid of ->prep_async() for receive side") Reported-by: syzbot+068ff190354d2f74892f@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 1 + 1 file changed, 1 insertion(+) diff --git a/io_uring/net.c b/io_uring/net.c index df1f7dc6f1c8f..c6cd38cc5dc4e 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -754,6 +754,7 @@ static int io_recvmsg_prep_setup(struct io_kiocb *req) if (req->opcode == IORING_OP_RECV) { kmsg->msg.msg_name = NULL; kmsg->msg.msg_namelen = 0; + kmsg->msg.msg_inq = 0; kmsg->msg.msg_control = NULL; kmsg->msg.msg_get_inq = 1; kmsg->msg.msg_controllen = 0; From c83ca5a4df7cf0ce9ccc25e8481043e05aed6ad0 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 20 Dec 2024 23:02:06 +0100 Subject: [PATCH 1115/1450] net: phy: fix phy_disable_eee genphy_c45_write_eee_adv() becomes a no-op if phydev->supported_eee is cleared. That's not what we want because this function is still needed to clear the EEE advertisement register(s). Fill phydev->eee_broken_modes instead to ensure that userspace can't re-enable EEE advertising. Fixes: b55498ff14bd ("net: phy: add phy_disable_eee") Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/57e2ae5f-4319-413c-b5c4-ebc8d049bc23@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy_device.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 928dc3c509b66..bdc997f597790 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -3012,10 +3012,11 @@ EXPORT_SYMBOL(phy_support_eee); */ void phy_disable_eee(struct phy_device *phydev) { - linkmode_zero(phydev->supported_eee); linkmode_zero(phydev->advertising_eee); phydev->eee_cfg.tx_lpi_enabled = false; phydev->eee_cfg.eee_enabled = false; + /* don't let userspace re-enable EEE advertisement */ + linkmode_fill(phydev->eee_broken_modes); } EXPORT_SYMBOL_GPL(phy_disable_eee); From be16b46f9ebd3270a4c4da81e5d7bb34b6f43384 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sun, 22 Dec 2024 21:08:20 +0100 Subject: [PATCH 1116/1450] ptp: ocp: constify 'struct bin_attribute' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sysfs core now allows instances of 'struct bin_attribute' to be moved into read-only memory. Make use of that to protect them against accidental or malicious modifications. Signed-off-by: Thomas Weißschuh Reviewed-by: Vadim Fedorenko Link: https://patch.msgid.link/20241222-sysfs-const-bin_attr-ptp-v1-1-5c1f3ee246fb@weissschuh.net Signed-off-by: Jakub Kicinski --- drivers/ptp/ptp_ocp.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/ptp/ptp_ocp.c b/drivers/ptp/ptp_ocp.c index 5feecaadde8e0..7f08c70d81230 100644 --- a/drivers/ptp/ptp_ocp.c +++ b/drivers/ptp/ptp_ocp.c @@ -3692,7 +3692,7 @@ DEVICE_FREQ_GROUP(freq4, 3); static ssize_t disciplining_config_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, char *buf, + const struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { struct ptp_ocp *bp = dev_get_drvdata(kobj_to_dev(kobj)); @@ -3727,7 +3727,7 @@ disciplining_config_read(struct file *filp, struct kobject *kobj, static ssize_t disciplining_config_write(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, char *buf, + const struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { struct ptp_ocp *bp = dev_get_drvdata(kobj_to_dev(kobj)); @@ -3750,11 +3750,11 @@ disciplining_config_write(struct file *filp, struct kobject *kobj, return err; } -static BIN_ATTR_RW(disciplining_config, OCP_ART_CONFIG_SIZE); +static const BIN_ATTR_RW(disciplining_config, OCP_ART_CONFIG_SIZE); static ssize_t temperature_table_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, char *buf, + const struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { struct ptp_ocp *bp = dev_get_drvdata(kobj_to_dev(kobj)); @@ -3789,7 +3789,7 @@ temperature_table_read(struct file *filp, struct kobject *kobj, static ssize_t temperature_table_write(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, char *buf, + const struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { struct ptp_ocp *bp = dev_get_drvdata(kobj_to_dev(kobj)); @@ -3812,7 +3812,7 @@ temperature_table_write(struct file *filp, struct kobject *kobj, return err; } -static BIN_ATTR_RW(temperature_table, OCP_ART_TEMP_TABLE_SIZE); +static const BIN_ATTR_RW(temperature_table, OCP_ART_TEMP_TABLE_SIZE); static struct attribute *fb_timecard_attrs[] = { &dev_attr_serialnum.attr, @@ -3867,7 +3867,7 @@ static struct attribute *art_timecard_attrs[] = { NULL, }; -static struct bin_attribute *bin_art_timecard_attrs[] = { +static const struct bin_attribute *const bin_art_timecard_attrs[] = { &bin_attr_disciplining_config, &bin_attr_temperature_table, NULL, @@ -3875,7 +3875,7 @@ static struct bin_attribute *bin_art_timecard_attrs[] = { static const struct attribute_group art_timecard_group = { .attrs = art_timecard_attrs, - .bin_attrs = bin_art_timecard_attrs, + .bin_attrs_new = bin_art_timecard_attrs, }; static const struct ocp_attr_group art_timecard_groups[] = { From bb70b0d48d8eab20644ca0101fedfe23f8a26c59 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 24 Dec 2024 20:37:06 +0200 Subject: [PATCH 1117/1450] devlink: Improve the port attributes description Current PF number description is vague, sometimes interpreted as some PF index. VF number in the PCI specification starts at 1; however in kernel, it starts at 0 for representor model. Improve the description of devlink port attributes PF, VF and SF numbers with these details. Reviewed-by: Sridhar Samudrala Reviewed-by: Shay Drory Reviewed-by: Mark Bloch Reviewed-by: Jiri Pirko Signed-off-by: Parav Pandit Link: https://patch.msgid.link/20241224183706.26571-1-parav@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 11 ++++++----- net/devlink/port.c | 11 ++++++----- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/include/net/devlink.h b/include/net/devlink.h index 58e33959c8521..fc79fe2297a14 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -35,7 +35,7 @@ struct devlink_port_phys_attrs { /** * struct devlink_port_pci_pf_attrs - devlink port's PCI PF attributes * @controller: Associated controller number - * @pf: Associated PCI PF number for this port. + * @pf: associated PCI function number for the devlink port instance * @external: when set, indicates if a port is for an external controller */ struct devlink_port_pci_pf_attrs { @@ -47,8 +47,9 @@ struct devlink_port_pci_pf_attrs { /** * struct devlink_port_pci_vf_attrs - devlink port's PCI VF attributes * @controller: Associated controller number - * @pf: Associated PCI PF number for this port. - * @vf: Associated PCI VF for of the PCI PF for this port. + * @pf: associated PCI function number for the devlink port instance + * @vf: associated PCI VF number of a PF for the devlink port instance; + * VF number starts from 0 for the first PCI virtual function * @external: when set, indicates if a port is for an external controller */ struct devlink_port_pci_vf_attrs { @@ -61,8 +62,8 @@ struct devlink_port_pci_vf_attrs { /** * struct devlink_port_pci_sf_attrs - devlink port's PCI SF attributes * @controller: Associated controller number - * @sf: Associated PCI SF for of the PCI PF for this port. - * @pf: Associated PCI PF number for this port. + * @sf: associated SF number of a PF for the devlink port instance + * @pf: associated PCI function number for the devlink port instance * @external: when set, indicates if a port is for an external controller */ struct devlink_port_pci_sf_attrs { diff --git a/net/devlink/port.c b/net/devlink/port.c index be9158b4453c5..939081a0e6154 100644 --- a/net/devlink/port.c +++ b/net/devlink/port.c @@ -1376,7 +1376,7 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_set); * * @devlink_port: devlink port * @controller: associated controller number for the devlink port instance - * @pf: associated PF for the devlink port instance + * @pf: associated PCI function number for the devlink port instance * @external: indicates if the port is for an external controller */ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 controller, @@ -1402,8 +1402,9 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set); * * @devlink_port: devlink port * @controller: associated controller number for the devlink port instance - * @pf: associated PF for the devlink port instance - * @vf: associated VF of a PF for the devlink port instance + * @pf: associated PCI function number for the devlink port instance + * @vf: associated PCI VF number of a PF for the devlink port instance; + * VF number starts from 0 for the first PCI virtual function * @external: indicates if the port is for an external controller */ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller, @@ -1430,8 +1431,8 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set); * * @devlink_port: devlink port * @controller: associated controller number for the devlink port instance - * @pf: associated PF for the devlink port instance - * @sf: associated SF of a PF for the devlink port instance + * @pf: associated PCI function number for the devlink port instance + * @sf: associated SF number of a PF for the devlink port instance * @external: indicates if the port is for an external controller */ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 controller, From a8620de72e5676993ec3a3b975f7c10908f5f60f Mon Sep 17 00:00:00 2001 From: Liang Jie Date: Mon, 30 Dec 2024 17:37:09 +0800 Subject: [PATCH 1118/1450] net: sfc: Correct key_len for efx_tc_ct_zone_ht_params In efx_tc_ct_zone_ht_params, the key_len was previously set to offsetof(struct efx_tc_ct_zone, linkage). This calculation is incorrect because it includes any padding between the zone field and the linkage field due to structure alignment, which can vary between systems. This patch updates key_len to use sizeof_field(struct efx_tc_ct_zone, zone) , ensuring that the hash table correctly uses the zone as the key. This fix prevents potential hash lookup errors and improves connection tracking reliability. Fixes: c3bb5c6acd4e ("sfc: functions to register for conntrack zone offload") Signed-off-by: Liang Jie Acked-by: Edward Cree Link: https://patch.msgid.link/20241230093709.3226854-1-buaajxlj@163.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/sfc/tc_conntrack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sfc/tc_conntrack.c b/drivers/net/ethernet/sfc/tc_conntrack.c index d90206f27161e..c0603f54cec3a 100644 --- a/drivers/net/ethernet/sfc/tc_conntrack.c +++ b/drivers/net/ethernet/sfc/tc_conntrack.c @@ -16,7 +16,7 @@ static int efx_tc_flow_block(enum tc_setup_type type, void *type_data, void *cb_priv); static const struct rhashtable_params efx_tc_ct_zone_ht_params = { - .key_len = offsetof(struct efx_tc_ct_zone, linkage), + .key_len = sizeof_field(struct efx_tc_ct_zone, zone), .key_offset = 0, .head_offset = offsetof(struct efx_tc_ct_zone, linkage), }; From 68e068cabd2c6c533ef934c2e5151609cf6ecc6d Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 1 Jan 2025 11:47:40 -0500 Subject: [PATCH 1119/1450] net: reenable NETIF_F_IPV6_CSUM offload for BIG TCP packets The blamed commit disabled hardware offoad of IPv6 packets with extension headers on devices that advertise NETIF_F_IPV6_CSUM, based on the definition of that feature in skbuff.h: * * - %NETIF_F_IPV6_CSUM * - Driver (device) is only able to checksum plain * TCP or UDP packets over IPv6. These are specifically * unencapsulated packets of the form IPv6|TCP or * IPv6|UDP where the Next Header field in the IPv6 * header is either TCP or UDP. IPv6 extension headers * are not supported with this feature. This feature * cannot be set in features for a device with * NETIF_F_HW_CSUM also set. This feature is being * DEPRECATED (see below). The change causes skb_warn_bad_offload to fire for BIG TCP packets. [ 496.310233] WARNING: CPU: 13 PID: 23472 at net/core/dev.c:3129 skb_warn_bad_offload+0xc4/0xe0 [ 496.310297] ? skb_warn_bad_offload+0xc4/0xe0 [ 496.310300] skb_checksum_help+0x129/0x1f0 [ 496.310303] skb_csum_hwoffload_help+0x150/0x1b0 [ 496.310306] validate_xmit_skb+0x159/0x270 [ 496.310309] validate_xmit_skb_list+0x41/0x70 [ 496.310312] sch_direct_xmit+0x5c/0x250 [ 496.310317] __qdisc_run+0x388/0x620 BIG TCP introduced an IPV6_TLV_JUMBO IPv6 extension header to communicate packet length, as this is an IPv6 jumbogram. But, the feature is only enabled on devices that support BIG TCP TSO. The header is only present for PF_PACKET taps like tcpdump, and not transmitted by physical devices. For this specific case of extension headers that are not transmitted, return to the situation before the blamed commit and support hardware offload. ipv6_has_hopopt_jumbo() tests not only whether this header is present, but also that it is the only extension header before a terminal (L4) header. Fixes: 04c20a9356f2 ("net: skip offload for NETIF_F_IPV6_CSUM if ipv6 header contains extension") Reported-by: syzbot Reported-by: Eric Dumazet Closes: https://lore.kernel.org/netdev/CANn89iK1hdC3Nt8KPhOtTF8vCPc1AHDCtse_BTNki1pWxAByTQ@mail.gmail.com/ Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250101164909.1331680-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 45a8c3dd4a648..faa23042df384 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3642,8 +3642,10 @@ int skb_csum_hwoffload_help(struct sk_buff *skb, if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) { if (vlan_get_protocol(skb) == htons(ETH_P_IPV6) && - skb_network_header_len(skb) != sizeof(struct ipv6hdr)) + skb_network_header_len(skb) != sizeof(struct ipv6hdr) && + !ipv6_has_hopopt_jumbo(skb)) goto sw_checksum; + switch (skb->csum_offset) { case offsetof(struct tcphdr, check): case offsetof(struct udphdr, check): From 5b0af621c3f6ef9261cf6067812f2fd9943acb4b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 31 Dec 2024 16:05:27 +0000 Subject: [PATCH 1120/1450] net: restrict SO_REUSEPORT to inet sockets After blamed commit, crypto sockets could accidentally be destroyed from RCU call back, as spotted by zyzbot [1]. Trying to acquire a mutex in RCU callback is not allowed. Restrict SO_REUSEPORT socket option to inet sockets. v1 of this patch supported TCP, UDP and SCTP sockets, but fcnal-test.sh test needed RAW and ICMP support. [1] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:562 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 24, name: ksoftirqd/1 preempt_count: 100, expected: 0 RCU nest depth: 0, expected: 0 1 lock held by ksoftirqd/1/24: #0: ffffffff8e937ba0 (rcu_callback){....}-{0:0}, at: rcu_lock_acquire include/linux/rcupdate.h:337 [inline] #0: ffffffff8e937ba0 (rcu_callback){....}-{0:0}, at: rcu_do_batch kernel/rcu/tree.c:2561 [inline] #0: ffffffff8e937ba0 (rcu_callback){....}-{0:0}, at: rcu_core+0xa37/0x17a0 kernel/rcu/tree.c:2823 Preemption disabled at: [] softirq_handle_begin kernel/softirq.c:402 [inline] [] handle_softirqs+0x128/0x9b0 kernel/softirq.c:537 CPU: 1 UID: 0 PID: 24 Comm: ksoftirqd/1 Not tainted 6.13.0-rc3-syzkaller-00174-ga024e377efed #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 __might_resched+0x5d4/0x780 kernel/sched/core.c:8758 __mutex_lock_common kernel/locking/mutex.c:562 [inline] __mutex_lock+0x131/0xee0 kernel/locking/mutex.c:735 crypto_put_default_null_skcipher+0x18/0x70 crypto/crypto_null.c:179 aead_release+0x3d/0x50 crypto/algif_aead.c:489 alg_do_release crypto/af_alg.c:118 [inline] alg_sock_destruct+0x86/0xc0 crypto/af_alg.c:502 __sk_destruct+0x58/0x5f0 net/core/sock.c:2260 rcu_do_batch kernel/rcu/tree.c:2567 [inline] rcu_core+0xaaa/0x17a0 kernel/rcu/tree.c:2823 handle_softirqs+0x2d4/0x9b0 kernel/softirq.c:561 run_ksoftirqd+0xca/0x130 kernel/softirq.c:950 smpboot_thread_fn+0x544/0xa30 kernel/smpboot.c:164 kthread+0x2f0/0x390 kernel/kthread.c:389 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Fixes: 8c7138b33e5c ("net: Unpublish sk from sk_reuseport_cb before call_rcu") Reported-by: syzbot+b3e02953598f447d4d2a@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6772f2f4.050a0220.2f3838.04cb.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Martin KaFai Lau Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241231160527.3994168-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/sock.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/core/sock.c b/net/core/sock.c index 74729d20cd009..be84885f9290a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1295,7 +1295,10 @@ int sk_setsockopt(struct sock *sk, int level, int optname, sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); break; case SO_REUSEPORT: - sk->sk_reuseport = valbool; + if (valbool && !sk_is_inet(sk)) + ret = -EOPNOTSUPP; + else + sk->sk_reuseport = valbool; break; case SO_DONTROUTE: sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); From 5df7ca0b827dac9fc44b42a4b695adcaa6d07ffb Mon Sep 17 00:00:00 2001 From: Yu Tian Date: Tue, 31 Dec 2024 10:36:10 +0800 Subject: [PATCH 1121/1450] ipv4: remove useless arg The "struct sock *sk" parameter in ip_rcv_finish_core is unused, which leads the compiler to optimize it out. As a result, the "struct sk_buff *skb" parameter is passed using x1. And this make kprobe hard to use. Signed-off-by: Yu Tian Link: https://patch.msgid.link/20241231023610.1657926-1-tianyu2@kernelsoft.com Signed-off-by: Jakub Kicinski --- net/ipv4/ip_input.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index f0a4dda246abc..30a5e9460d006 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -314,7 +314,7 @@ static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph, int tcp_v4_early_demux(struct sk_buff *skb); int udp_v4_early_demux(struct sk_buff *skb); -static int ip_rcv_finish_core(struct net *net, struct sock *sk, +static int ip_rcv_finish_core(struct net *net, struct sk_buff *skb, struct net_device *dev, const struct sk_buff *hint) { @@ -442,7 +442,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) if (!skb) return NET_RX_SUCCESS; - ret = ip_rcv_finish_core(net, sk, skb, dev, NULL); + ret = ip_rcv_finish_core(net, skb, dev, NULL); if (ret != NET_RX_DROP) ret = dst_input(skb); return ret; @@ -589,8 +589,7 @@ static struct sk_buff *ip_extract_route_hint(const struct net *net, return skb; } -static void ip_list_rcv_finish(struct net *net, struct sock *sk, - struct list_head *head) +static void ip_list_rcv_finish(struct net *net, struct list_head *head) { struct sk_buff *skb, *next, *hint = NULL; struct dst_entry *curr_dst = NULL; @@ -607,7 +606,7 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk, skb = l3mdev_ip_rcv(skb); if (!skb) continue; - if (ip_rcv_finish_core(net, sk, skb, dev, hint) == NET_RX_DROP) + if (ip_rcv_finish_core(net, skb, dev, hint) == NET_RX_DROP) continue; dst = skb_dst(skb); @@ -633,7 +632,7 @@ static void ip_sublist_rcv(struct list_head *head, struct net_device *dev, { NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL, head, dev, NULL, ip_rcv_finish); - ip_list_rcv_finish(net, NULL, head); + ip_list_rcv_finish(net, head); } /* Receive a list of IP packets */ From 94c16fd4df9089931f674fb9aaec41ea20b0fd7a Mon Sep 17 00:00:00 2001 From: Mathieu Othacehe Date: Fri, 27 Dec 2024 10:59:22 +0100 Subject: [PATCH 1122/1450] net: dwmac-imx: add imx93 clock input support in RMII mode If the rmii_refclk_ext boolean is set, configure the ENET QOS TX_CLK pin direction to input. Otherwise, it defaults to output. That mirrors what is already happening for the imx8mp in the imx8mp_set_intf_mode function. Signed-off-by: Mathieu Othacehe Link: https://patch.msgid.link/20241227095923.4414-1-othacehe@gnu.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c index 43e0fbba4f77b..4ac7a78f4b14b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c @@ -36,6 +36,8 @@ #define MX93_GPR_ENET_QOS_INTF_SEL_RMII (0x4 << 1) #define MX93_GPR_ENET_QOS_INTF_SEL_RGMII (0x1 << 1) #define MX93_GPR_ENET_QOS_CLK_GEN_EN (0x1 << 0) +#define MX93_GPR_ENET_QOS_CLK_SEL_MASK BIT_MASK(0) +#define MX93_GPR_CLK_SEL_OFFSET (4) #define DMA_BUS_MODE 0x00001000 #define DMA_BUS_MODE_SFT_RESET (0x1 << 0) @@ -108,13 +110,21 @@ imx8dxl_set_intf_mode(struct plat_stmmacenet_data *plat_dat) static int imx93_set_intf_mode(struct plat_stmmacenet_data *plat_dat) { struct imx_priv_data *dwmac = plat_dat->bsp_priv; - int val; + int val, ret; switch (plat_dat->mac_interface) { case PHY_INTERFACE_MODE_MII: val = MX93_GPR_ENET_QOS_INTF_SEL_MII; break; case PHY_INTERFACE_MODE_RMII: + if (dwmac->rmii_refclk_ext) { + ret = regmap_clear_bits(dwmac->intf_regmap, + dwmac->intf_reg_off + + MX93_GPR_CLK_SEL_OFFSET, + MX93_GPR_ENET_QOS_CLK_SEL_MASK); + if (ret) + return ret; + } val = MX93_GPR_ENET_QOS_INTF_SEL_RMII; break; case PHY_INTERFACE_MODE_RGMII: From a7af435df0e04cfb4a4004136d597c42639a2ae7 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Sun, 29 Dec 2024 17:46:58 +0100 Subject: [PATCH 1123/1450] net: wwan: iosm: Properly check for valid exec stage in ipc_mmio_init() ipc_mmio_init() used the post-decrement operator in its loop continuing condition of "retries" counter being "> 0", which meant that when this condition caused loop exit "retries" counter reached -1. But the later valid exec stage failure check only tests for "retries" counter being exactly zero, so it didn't trigger in this case (but would wrongly trigger if the code reaches a valid exec stage in the very last loop iteration). Fix this by using the pre-decrement operator instead, so the loop counter is exactly zero on valid exec stage failure. Fixes: dc0514f5d828 ("net: iosm: mmio scratchpad") Signed-off-by: Maciej S. Szmigiero Link: https://patch.msgid.link/8b19125a825f9dcdd81c667c1e5c48ba28d505a6.1735490770.git.mail@maciej.szmigiero.name Signed-off-by: Jakub Kicinski --- drivers/net/wwan/iosm/iosm_ipc_mmio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wwan/iosm/iosm_ipc_mmio.c b/drivers/net/wwan/iosm/iosm_ipc_mmio.c index 63eb08c43c051..6764c13530b9b 100644 --- a/drivers/net/wwan/iosm/iosm_ipc_mmio.c +++ b/drivers/net/wwan/iosm/iosm_ipc_mmio.c @@ -104,7 +104,7 @@ struct iosm_mmio *ipc_mmio_init(void __iomem *mmio, struct device *dev) break; msleep(20); - } while (retries-- > 0); + } while (--retries > 0); if (!retries) { dev_err(ipc_mmio->dev, "invalid exec stage %X", stage); From 77ee7a6d16b6ec07b5c3ae2b6b60a24c1afbed09 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Dec 2024 16:10:03 +0000 Subject: [PATCH 1124/1450] af_packet: fix vlan_get_tci() vs MSG_PEEK Blamed commit forgot MSG_PEEK case, allowing a crash [1] as found by syzbot. Rework vlan_get_tci() to not touch skb at all, so that it can be used from many cpus on the same skb. Add a const qualifier to skb argument. [1] skbuff: skb_under_panic: text:ffffffff8a8da482 len:32 put:14 head:ffff88807a1d5800 data:ffff88807a1d5810 tail:0x14 end:0x140 dev: ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:206 ! Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 0 UID: 0 PID: 5880 Comm: syz-executor172 Not tainted 6.13.0-rc3-syzkaller-00762-g9268abe611b0 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 RIP: 0010:skb_panic net/core/skbuff.c:206 [inline] RIP: 0010:skb_under_panic+0x14b/0x150 net/core/skbuff.c:216 Code: 0b 8d 48 c7 c6 9e 6c 26 8e 48 8b 54 24 08 8b 0c 24 44 8b 44 24 04 4d 89 e9 50 41 54 41 57 41 56 e8 3a 5a 79 f7 48 83 c4 20 90 <0f> 0b 0f 1f 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 RSP: 0018:ffffc90003baf5b8 EFLAGS: 00010286 RAX: 0000000000000087 RBX: dffffc0000000000 RCX: 8565c1eec37aa000 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: ffff88802616fb50 R08: ffffffff817f0a4c R09: 1ffff92000775e50 R10: dffffc0000000000 R11: fffff52000775e51 R12: 0000000000000140 R13: ffff88807a1d5800 R14: ffff88807a1d5810 R15: 0000000000000014 FS: 00007fa03261f6c0(0000) GS:ffff8880b8600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ffd65753000 CR3: 0000000031720000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: skb_push+0xe5/0x100 net/core/skbuff.c:2636 vlan_get_tci+0x272/0x550 net/packet/af_packet.c:565 packet_recvmsg+0x13c9/0x1ef0 net/packet/af_packet.c:3616 sock_recvmsg_nosec net/socket.c:1044 [inline] sock_recvmsg+0x22f/0x280 net/socket.c:1066 ____sys_recvmsg+0x1c6/0x480 net/socket.c:2814 ___sys_recvmsg net/socket.c:2856 [inline] do_recvmmsg+0x426/0xab0 net/socket.c:2951 __sys_recvmmsg net/socket.c:3025 [inline] __do_sys_recvmmsg net/socket.c:3048 [inline] __se_sys_recvmmsg net/socket.c:3041 [inline] __x64_sys_recvmmsg+0x199/0x250 net/socket.c:3041 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 Fixes: 79eecf631c14 ("af_packet: Handle outgoing VLAN packets without hardware offloading") Reported-by: syzbot+8400677f3fd43f37d3bc@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6772c485.050a0220.2f3838.04c6.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Chengen Du Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20241230161004.2681892-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/packet/af_packet.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 886c0dd47b662..e2e34a49e98de 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -538,10 +538,8 @@ static void *packet_current_frame(struct packet_sock *po, return packet_lookup_frame(po, rb, rb->head, status); } -static u16 vlan_get_tci(struct sk_buff *skb, struct net_device *dev) +static u16 vlan_get_tci(const struct sk_buff *skb, struct net_device *dev) { - u8 *skb_orig_data = skb->data; - int skb_orig_len = skb->len; struct vlan_hdr vhdr, *vh; unsigned int header_len; @@ -562,12 +560,8 @@ static u16 vlan_get_tci(struct sk_buff *skb, struct net_device *dev) else return 0; - skb_push(skb, skb->data - skb_mac_header(skb)); - vh = skb_header_pointer(skb, header_len, sizeof(vhdr), &vhdr); - if (skb_orig_data != skb->data) { - skb->data = skb_orig_data; - skb->len = skb_orig_len; - } + vh = skb_header_pointer(skb, skb_mac_offset(skb) + header_len, + sizeof(vhdr), &vhdr); if (unlikely(!vh)) return 0; From f91a5b8089389eb408501af2762f168c3aaa7b79 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Dec 2024 16:10:04 +0000 Subject: [PATCH 1125/1450] af_packet: fix vlan_get_protocol_dgram() vs MSG_PEEK Blamed commit forgot MSG_PEEK case, allowing a crash [1] as found by syzbot. Rework vlan_get_protocol_dgram() to not touch skb at all, so that it can be used from many cpus on the same skb. Add a const qualifier to skb argument. [1] skbuff: skb_under_panic: text:ffffffff8a8ccd05 len:29 put:14 head:ffff88807fc8e400 data:ffff88807fc8e3f4 tail:0x11 end:0x140 dev: ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:206 ! Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 1 UID: 0 PID: 5892 Comm: syz-executor883 Not tainted 6.13.0-rc4-syzkaller-00054-gd6ef8b40d075 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 RIP: 0010:skb_panic net/core/skbuff.c:206 [inline] RIP: 0010:skb_under_panic+0x14b/0x150 net/core/skbuff.c:216 Code: 0b 8d 48 c7 c6 86 d5 25 8e 48 8b 54 24 08 8b 0c 24 44 8b 44 24 04 4d 89 e9 50 41 54 41 57 41 56 e8 5a 69 79 f7 48 83 c4 20 90 <0f> 0b 0f 1f 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 RSP: 0018:ffffc900038d7638 EFLAGS: 00010282 RAX: 0000000000000087 RBX: dffffc0000000000 RCX: 609ffd18ea660600 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: ffff88802483c8d0 R08: ffffffff817f0a8c R09: 1ffff9200071ae60 R10: dffffc0000000000 R11: fffff5200071ae61 R12: 0000000000000140 R13: ffff88807fc8e400 R14: ffff88807fc8e3f4 R15: 0000000000000011 FS: 00007fbac5e006c0(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fbac5e00d58 CR3: 000000001238e000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: skb_push+0xe5/0x100 net/core/skbuff.c:2636 vlan_get_protocol_dgram+0x165/0x290 net/packet/af_packet.c:585 packet_recvmsg+0x948/0x1ef0 net/packet/af_packet.c:3552 sock_recvmsg_nosec net/socket.c:1033 [inline] sock_recvmsg+0x22f/0x280 net/socket.c:1055 ____sys_recvmsg+0x1c6/0x480 net/socket.c:2803 ___sys_recvmsg net/socket.c:2845 [inline] do_recvmmsg+0x426/0xab0 net/socket.c:2940 __sys_recvmmsg net/socket.c:3014 [inline] __do_sys_recvmmsg net/socket.c:3037 [inline] __se_sys_recvmmsg net/socket.c:3030 [inline] __x64_sys_recvmmsg+0x199/0x250 net/socket.c:3030 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: 79eecf631c14 ("af_packet: Handle outgoing VLAN packets without hardware offloading") Reported-by: syzbot+74f70bb1cb968bf09e4f@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6772c485.050a0220.2f3838.04c5.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Chengen Du Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20241230161004.2681892-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/if_vlan.h | 16 +++++++++++++--- net/packet/af_packet.c | 16 ++++------------ 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index c1645c86eed96..d65b5d71b93bf 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -585,13 +585,16 @@ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) * vlan_get_protocol - get protocol EtherType. * @skb: skbuff to query * @type: first vlan protocol + * @mac_offset: MAC offset * @depth: buffer to store length of eth and vlan tags in bytes * * Returns the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. */ -static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, - int *depth) +static inline __be16 __vlan_get_protocol_offset(const struct sk_buff *skb, + __be16 type, + int mac_offset, + int *depth) { unsigned int vlan_depth = skb->mac_len, parse_depth = VLAN_MAX_DEPTH; @@ -610,7 +613,8 @@ static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, do { struct vlan_hdr vhdr, *vh; - vh = skb_header_pointer(skb, vlan_depth, sizeof(vhdr), &vhdr); + vh = skb_header_pointer(skb, mac_offset + vlan_depth, + sizeof(vhdr), &vhdr); if (unlikely(!vh || !--parse_depth)) return 0; @@ -625,6 +629,12 @@ static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, return type; } +static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, + int *depth) +{ + return __vlan_get_protocol_offset(skb, type, 0, depth); +} + /** * vlan_get_protocol - get protocol EtherType. * @skb: skbuff to query diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index e2e34a49e98de..2d73769d67f47 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -568,21 +568,13 @@ static u16 vlan_get_tci(const struct sk_buff *skb, struct net_device *dev) return ntohs(vh->h_vlan_TCI); } -static __be16 vlan_get_protocol_dgram(struct sk_buff *skb) +static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb) { __be16 proto = skb->protocol; - if (unlikely(eth_type_vlan(proto))) { - u8 *skb_orig_data = skb->data; - int skb_orig_len = skb->len; - - skb_push(skb, skb->data - skb_mac_header(skb)); - proto = __vlan_get_protocol(skb, proto, NULL); - if (skb_orig_data != skb->data) { - skb->data = skb_orig_data; - skb->len = skb_orig_len; - } - } + if (unlikely(eth_type_vlan(proto))) + proto = __vlan_get_protocol_offset(skb, proto, + skb_mac_offset(skb), NULL); return proto; } From 260466b576bca0081a7d4acecc8e93687aa22d0e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Dec 2024 16:28:49 +0000 Subject: [PATCH 1126/1450] ila: serialize calls to nf_register_net_hooks() syzbot found a race in ila_add_mapping() [1] commit 031ae72825ce ("ila: call nf_unregister_net_hooks() sooner") attempted to fix a similar issue. Looking at the syzbot repro, we have concurrent ILA_CMD_ADD commands. Add a mutex to make sure at most one thread is calling nf_register_net_hooks(). [1] BUG: KASAN: slab-use-after-free in rht_key_hashfn include/linux/rhashtable.h:159 [inline] BUG: KASAN: slab-use-after-free in __rhashtable_lookup.constprop.0+0x426/0x550 include/linux/rhashtable.h:604 Read of size 4 at addr ffff888028f40008 by task dhcpcd/5501 CPU: 1 UID: 0 PID: 5501 Comm: dhcpcd Not tainted 6.13.0-rc4-syzkaller-00054-gd6ef8b40d075 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x116/0x1f0 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0xc3/0x620 mm/kasan/report.c:489 kasan_report+0xd9/0x110 mm/kasan/report.c:602 rht_key_hashfn include/linux/rhashtable.h:159 [inline] __rhashtable_lookup.constprop.0+0x426/0x550 include/linux/rhashtable.h:604 rhashtable_lookup include/linux/rhashtable.h:646 [inline] rhashtable_lookup_fast include/linux/rhashtable.h:672 [inline] ila_lookup_wildcards net/ipv6/ila/ila_xlat.c:127 [inline] ila_xlat_addr net/ipv6/ila/ila_xlat.c:652 [inline] ila_nf_input+0x1ee/0x620 net/ipv6/ila/ila_xlat.c:185 nf_hook_entry_hookfn include/linux/netfilter.h:154 [inline] nf_hook_slow+0xbb/0x200 net/netfilter/core.c:626 nf_hook.constprop.0+0x42e/0x750 include/linux/netfilter.h:269 NF_HOOK include/linux/netfilter.h:312 [inline] ipv6_rcv+0xa4/0x680 net/ipv6/ip6_input.c:309 __netif_receive_skb_one_core+0x12e/0x1e0 net/core/dev.c:5672 __netif_receive_skb+0x1d/0x160 net/core/dev.c:5785 process_backlog+0x443/0x15f0 net/core/dev.c:6117 __napi_poll.constprop.0+0xb7/0x550 net/core/dev.c:6883 napi_poll net/core/dev.c:6952 [inline] net_rx_action+0xa94/0x1010 net/core/dev.c:7074 handle_softirqs+0x213/0x8f0 kernel/softirq.c:561 __do_softirq kernel/softirq.c:595 [inline] invoke_softirq kernel/softirq.c:435 [inline] __irq_exit_rcu+0x109/0x170 kernel/softirq.c:662 irq_exit_rcu+0x9/0x30 kernel/softirq.c:678 instr_sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1049 [inline] sysvec_apic_timer_interrupt+0xa4/0xc0 arch/x86/kernel/apic/apic.c:1049 Fixes: 7f00feaf1076 ("ila: Add generic ILA translation facility") Reported-by: syzbot+47e761d22ecf745f72b9@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6772c9ae.050a0220.2f3838.04c7.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Florian Westphal Cc: Tom Herbert Link: https://patch.msgid.link/20241230162849.2795486-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/ila/ila_xlat.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 7646e401c6304..1d41b2ab48846 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -195,6 +195,8 @@ static const struct nf_hook_ops ila_nf_hook_ops[] = { }, }; +static DEFINE_MUTEX(ila_mutex); + static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp) { struct ila_net *ilan = net_generic(net, ila_net_id); @@ -202,16 +204,20 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp) spinlock_t *lock = ila_get_lock(ilan, xp->ip.locator_match); int err = 0, order; - if (!ilan->xlat.hooks_registered) { + if (!READ_ONCE(ilan->xlat.hooks_registered)) { /* We defer registering net hooks in the namespace until the * first mapping is added. */ - err = nf_register_net_hooks(net, ila_nf_hook_ops, - ARRAY_SIZE(ila_nf_hook_ops)); + mutex_lock(&ila_mutex); + if (!ilan->xlat.hooks_registered) { + err = nf_register_net_hooks(net, ila_nf_hook_ops, + ARRAY_SIZE(ila_nf_hook_ops)); + if (!err) + WRITE_ONCE(ilan->xlat.hooks_registered, true); + } + mutex_unlock(&ila_mutex); if (err) return err; - - ilan->xlat.hooks_registered = true; } ila = kzalloc(sizeof(*ila), GFP_KERNEL); From 449e6912a2522af672e99992e1201a454910864e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 30 Dec 2024 19:12:30 +0100 Subject: [PATCH 1127/1450] mptcp: fix recvbuffer adjust on sleeping rcvmsg If the recvmsg() blocks after receiving some data - i.e. due to SO_RCVLOWAT - the MPTCP code will attempt multiple times to adjust the receive buffer size, wrongly accounting every time the cumulative of received data - instead of accounting only for the delta. Address the issue moving mptcp_rcv_space_adjust just after the data reception and passing it only the just received bytes. This also removes an unneeded difference between the TCP and MPTCP RX code path implementation. Fixes: 581302298524 ("mptcp: error out earlier on disconnect") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241230-net-mptcp-rbuf-fixes-v1-1-8608af434ceb@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 08a72242428c0..27afdb7e2071b 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1939,6 +1939,8 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) goto out; } +static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied); + static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, struct msghdr *msg, size_t len, int flags, @@ -1992,6 +1994,7 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, break; } + mptcp_rcv_space_adjust(msk, copied); return copied; } @@ -2268,7 +2271,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } pr_debug("block timeout %ld\n", timeo); - mptcp_rcv_space_adjust(msk, copied); err = sk_wait_data(sk, &timeo, NULL); if (err < 0) { err = copied ? : err; @@ -2276,8 +2278,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } } - mptcp_rcv_space_adjust(msk, copied); - out_err: if (cmsg_flags && copied >= 0) { if (cmsg_flags & MPTCP_CMSG_TS) From 551844f26da2a9f76c0a698baaffa631d1178645 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 30 Dec 2024 19:12:31 +0100 Subject: [PATCH 1128/1450] mptcp: don't always assume copied data in mptcp_cleanup_rbuf() Under some corner cases the MPTCP protocol can end-up invoking mptcp_cleanup_rbuf() when no data has been copied, but such helper assumes the opposite condition. Explicitly drop such assumption and performs the costly call only when strictly needed - before releasing the msk socket lock. Fixes: fd8976790a6c ("mptcp: be careful on MPTCP-level ack.") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241230-net-mptcp-rbuf-fixes-v1-2-8608af434ceb@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 27afdb7e2071b..5307fff9d9953 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -528,13 +528,13 @@ static void mptcp_send_ack(struct mptcp_sock *msk) mptcp_subflow_send_ack(mptcp_subflow_tcp_sock(subflow)); } -static void mptcp_subflow_cleanup_rbuf(struct sock *ssk) +static void mptcp_subflow_cleanup_rbuf(struct sock *ssk, int copied) { bool slow; slow = lock_sock_fast(ssk); if (tcp_can_send_ack(ssk)) - tcp_cleanup_rbuf(ssk, 1); + tcp_cleanup_rbuf(ssk, copied); unlock_sock_fast(ssk, slow); } @@ -551,7 +551,7 @@ static bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty) (ICSK_ACK_PUSHED2 | ICSK_ACK_PUSHED))); } -static void mptcp_cleanup_rbuf(struct mptcp_sock *msk) +static void mptcp_cleanup_rbuf(struct mptcp_sock *msk, int copied) { int old_space = READ_ONCE(msk->old_wspace); struct mptcp_subflow_context *subflow; @@ -559,14 +559,14 @@ static void mptcp_cleanup_rbuf(struct mptcp_sock *msk) int space = __mptcp_space(sk); bool cleanup, rx_empty; - cleanup = (space > 0) && (space >= (old_space << 1)); - rx_empty = !__mptcp_rmem(sk); + cleanup = (space > 0) && (space >= (old_space << 1)) && copied; + rx_empty = !__mptcp_rmem(sk) && copied; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (cleanup || mptcp_subflow_could_cleanup(ssk, rx_empty)) - mptcp_subflow_cleanup_rbuf(ssk); + mptcp_subflow_cleanup_rbuf(ssk, copied); } } @@ -2220,9 +2220,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, copied += bytes_read; - /* be sure to advertise window change */ - mptcp_cleanup_rbuf(msk); - if (skb_queue_empty(&msk->receive_queue) && __mptcp_move_skbs(msk)) continue; @@ -2271,6 +2268,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } pr_debug("block timeout %ld\n", timeo); + mptcp_cleanup_rbuf(msk, copied); err = sk_wait_data(sk, &timeo, NULL); if (err < 0) { err = copied ? : err; @@ -2278,6 +2276,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } } + mptcp_cleanup_rbuf(msk, copied); + out_err: if (cmsg_flags && copied >= 0) { if (cmsg_flags & MPTCP_CMSG_TS) From 56b824eb49d6258aa0bad09a406ceac3f643cdae Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 30 Dec 2024 19:12:32 +0100 Subject: [PATCH 1129/1450] mptcp: prevent excessive coalescing on receive Currently the skb size after coalescing is only limited by the skb layout (the skb must not carry frag_list). A single coalesced skb covering several MSS can potentially fill completely the receive buffer. In such a case, the snd win will zero until the receive buffer will be empty again, affecting tput badly. Fixes: 8268ed4c9d19 ("mptcp: introduce and use mptcp_try_coalesce()") Cc: stable@vger.kernel.org # please delay 2 weeks after 6.13-final release Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241230-net-mptcp-rbuf-fixes-v1-3-8608af434ceb@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 5307fff9d9953..1b2e7cbb577fc 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -136,6 +136,7 @@ static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, int delta; if (MPTCP_SKB_CB(from)->offset || + ((to->len + from->len) > (sk->sk_rcvbuf >> 3)) || !skb_try_coalesce(to, from, &fragstolen, &delta)) return false; From 9facce84f4062f782ebde18daa7006a23d40b607 Mon Sep 17 00:00:00 2001 From: MD Danish Anwar Date: Mon, 23 Dec 2024 20:45:49 +0530 Subject: [PATCH 1130/1450] net: ti: icssg-prueth: Fix firmware load sequence. Timesync related operations are ran in PRU0 cores for both ICSSG SLICE0 and SLICE1. Currently whenever any ICSSG interface comes up we load the respective firmwares to PRU cores and whenever interface goes down, we stop the resective cores. Due to this, when SLICE0 goes down while SLICE1 is still active, PRU0 firmwares are unloaded and PRU0 core is stopped. This results in clock jump for SLICE1 interface as the timesync related operations are no longer running. As there are interdependencies between SLICE0 and SLICE1 firmwares, fix this by running both PRU0 and PRU1 firmwares as long as at least 1 ICSSG interface is up. Add new flag in prueth struct to check if all firmwares are running and remove the old flag (fw_running). Use emacs_initialized as reference count to load the firmwares for the first and last interface up/down. Moving init_emac_mode and fw_offload_mode API outside of icssg_config to icssg_common_start API as they need to be called only once per firmware boot. Change prueth_emac_restart() to return error code and add error prints inside the caller of this functions in case of any failures. Move prueth_emac_stop() from common to sr1 driver. sr1 and sr2 drivers have different logic handling for stopping the firmwares. While sr1 driver is dependent on emac structure to stop the corresponding pru cores for that slice, for sr2 all the pru cores of both the slices are stopped and is not dependent on emac. So the prueth_emac_stop() function is no longer common and can be moved to sr1 driver. Fixes: c1e0230eeaab ("net: ti: icss-iep: Add IEP driver") Signed-off-by: MD Danish Anwar Signed-off-by: Meghana Malladi Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/icssg/icssg_common.c | 25 -- drivers/net/ethernet/ti/icssg/icssg_config.c | 41 ++- drivers/net/ethernet/ti/icssg/icssg_config.h | 1 + drivers/net/ethernet/ti/icssg/icssg_prueth.c | 261 ++++++++++++------ drivers/net/ethernet/ti/icssg/icssg_prueth.h | 5 +- .../net/ethernet/ti/icssg/icssg_prueth_sr1.c | 24 +- 6 files changed, 236 insertions(+), 121 deletions(-) diff --git a/drivers/net/ethernet/ti/icssg/icssg_common.c b/drivers/net/ethernet/ti/icssg/icssg_common.c index fdebeb2f84e00..74f0f200a89d4 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_common.c +++ b/drivers/net/ethernet/ti/icssg/icssg_common.c @@ -855,31 +855,6 @@ irqreturn_t prueth_rx_irq(int irq, void *dev_id) } EXPORT_SYMBOL_GPL(prueth_rx_irq); -void prueth_emac_stop(struct prueth_emac *emac) -{ - struct prueth *prueth = emac->prueth; - int slice; - - switch (emac->port_id) { - case PRUETH_PORT_MII0: - slice = ICSS_SLICE0; - break; - case PRUETH_PORT_MII1: - slice = ICSS_SLICE1; - break; - default: - netdev_err(emac->ndev, "invalid port\n"); - return; - } - - emac->fw_running = 0; - if (!emac->is_sr1) - rproc_shutdown(prueth->txpru[slice]); - rproc_shutdown(prueth->rtu[slice]); - rproc_shutdown(prueth->pru[slice]); -} -EXPORT_SYMBOL_GPL(prueth_emac_stop); - void prueth_cleanup_tx_ts(struct prueth_emac *emac) { int i; diff --git a/drivers/net/ethernet/ti/icssg/icssg_config.c b/drivers/net/ethernet/ti/icssg/icssg_config.c index 5d2491c2943a8..ddfd1c02a8854 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_config.c +++ b/drivers/net/ethernet/ti/icssg/icssg_config.c @@ -397,7 +397,7 @@ static int prueth_emac_buffer_setup(struct prueth_emac *emac) return 0; } -static void icssg_init_emac_mode(struct prueth *prueth) +void icssg_init_emac_mode(struct prueth *prueth) { /* When the device is configured as a bridge and it is being brought * back to the emac mode, the host mac address has to be set as 0. @@ -406,9 +406,6 @@ static void icssg_init_emac_mode(struct prueth *prueth) int i; u8 mac[ETH_ALEN] = { 0 }; - if (prueth->emacs_initialized) - return; - /* Set VLAN TABLE address base */ regmap_update_bits(prueth->miig_rt, FDB_GEN_CFG1, SMEM_VLAN_OFFSET_MASK, addr << SMEM_VLAN_OFFSET); @@ -423,15 +420,13 @@ static void icssg_init_emac_mode(struct prueth *prueth) /* Clear host MAC address */ icssg_class_set_host_mac_addr(prueth->miig_rt, mac); } +EXPORT_SYMBOL_GPL(icssg_init_emac_mode); -static void icssg_init_fw_offload_mode(struct prueth *prueth) +void icssg_init_fw_offload_mode(struct prueth *prueth) { u32 addr = prueth->shram.pa + EMAC_ICSSG_SWITCH_DEFAULT_VLAN_TABLE_OFFSET; int i; - if (prueth->emacs_initialized) - return; - /* Set VLAN TABLE address base */ regmap_update_bits(prueth->miig_rt, FDB_GEN_CFG1, SMEM_VLAN_OFFSET_MASK, addr << SMEM_VLAN_OFFSET); @@ -448,6 +443,7 @@ static void icssg_init_fw_offload_mode(struct prueth *prueth) icssg_class_set_host_mac_addr(prueth->miig_rt, prueth->hw_bridge_dev->dev_addr); icssg_set_pvid(prueth, prueth->default_vlan, PRUETH_PORT_HOST); } +EXPORT_SYMBOL_GPL(icssg_init_fw_offload_mode); int icssg_config(struct prueth *prueth, struct prueth_emac *emac, int slice) { @@ -455,11 +451,6 @@ int icssg_config(struct prueth *prueth, struct prueth_emac *emac, int slice) struct icssg_flow_cfg __iomem *flow_cfg; int ret; - if (prueth->is_switch_mode || prueth->is_hsr_offload_mode) - icssg_init_fw_offload_mode(prueth); - else - icssg_init_emac_mode(prueth); - memset_io(config, 0, TAS_GATE_MASK_LIST0); icssg_miig_queues_init(prueth, slice); @@ -786,3 +777,27 @@ void icssg_set_pvid(struct prueth *prueth, u8 vid, u8 port) writel(pvid, prueth->shram.va + EMAC_ICSSG_SWITCH_PORT0_DEFAULT_VLAN_OFFSET); } EXPORT_SYMBOL_GPL(icssg_set_pvid); + +int emac_fdb_flow_id_updated(struct prueth_emac *emac) +{ + struct mgmt_cmd_rsp fdb_cmd_rsp = { 0 }; + int slice = prueth_emac_slice(emac); + struct mgmt_cmd fdb_cmd = { 0 }; + int ret; + + fdb_cmd.header = ICSSG_FW_MGMT_CMD_HEADER; + fdb_cmd.type = ICSSG_FW_MGMT_FDB_CMD_TYPE_RX_FLOW; + fdb_cmd.seqnum = ++(emac->prueth->icssg_hwcmdseq); + fdb_cmd.param = 0; + + fdb_cmd.param |= (slice << 4); + fdb_cmd.cmd_args[0] = 0; + + ret = icssg_send_fdb_msg(emac, &fdb_cmd, &fdb_cmd_rsp); + if (ret) + return ret; + + WARN_ON(fdb_cmd.seqnum != fdb_cmd_rsp.seqnum); + return fdb_cmd_rsp.status == 1 ? 0 : -EINVAL; +} +EXPORT_SYMBOL_GPL(emac_fdb_flow_id_updated); diff --git a/drivers/net/ethernet/ti/icssg/icssg_config.h b/drivers/net/ethernet/ti/icssg/icssg_config.h index 92c2deaa30683..c884e9fa099e6 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_config.h +++ b/drivers/net/ethernet/ti/icssg/icssg_config.h @@ -55,6 +55,7 @@ struct icssg_rxq_ctx { #define ICSSG_FW_MGMT_FDB_CMD_TYPE 0x03 #define ICSSG_FW_MGMT_CMD_TYPE 0x04 #define ICSSG_FW_MGMT_PKT 0x80000000 +#define ICSSG_FW_MGMT_FDB_CMD_TYPE_RX_FLOW 0x05 struct icssg_r30_cmd { u32 cmd[4]; diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c index c568c84a032b6..d76fe6d05e10a 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c @@ -164,11 +164,26 @@ static struct icssg_firmwares icssg_emac_firmwares[] = { } }; -static int prueth_emac_start(struct prueth *prueth, struct prueth_emac *emac) +static int prueth_start(struct rproc *rproc, const char *fw_name) +{ + int ret; + + ret = rproc_set_firmware(rproc, fw_name); + if (ret) + return ret; + return rproc_boot(rproc); +} + +static void prueth_shutdown(struct rproc *rproc) +{ + rproc_shutdown(rproc); +} + +static int prueth_emac_start(struct prueth *prueth) { struct icssg_firmwares *firmwares; struct device *dev = prueth->dev; - int slice, ret; + int ret, slice; if (prueth->is_switch_mode) firmwares = icssg_switch_firmwares; @@ -177,49 +192,126 @@ static int prueth_emac_start(struct prueth *prueth, struct prueth_emac *emac) else firmwares = icssg_emac_firmwares; - slice = prueth_emac_slice(emac); - if (slice < 0) { - netdev_err(emac->ndev, "invalid port\n"); - return -EINVAL; + for (slice = 0; slice < PRUETH_NUM_MACS; slice++) { + ret = prueth_start(prueth->pru[slice], firmwares[slice].pru); + if (ret) { + dev_err(dev, "failed to boot PRU%d: %d\n", slice, ret); + goto unwind_slices; + } + + ret = prueth_start(prueth->rtu[slice], firmwares[slice].rtu); + if (ret) { + dev_err(dev, "failed to boot RTU%d: %d\n", slice, ret); + rproc_shutdown(prueth->pru[slice]); + goto unwind_slices; + } + + ret = prueth_start(prueth->txpru[slice], firmwares[slice].txpru); + if (ret) { + dev_err(dev, "failed to boot TX_PRU%d: %d\n", slice, ret); + rproc_shutdown(prueth->rtu[slice]); + rproc_shutdown(prueth->pru[slice]); + goto unwind_slices; + } } - ret = icssg_config(prueth, emac, slice); - if (ret) - return ret; + return 0; - ret = rproc_set_firmware(prueth->pru[slice], firmwares[slice].pru); - ret = rproc_boot(prueth->pru[slice]); - if (ret) { - dev_err(dev, "failed to boot PRU%d: %d\n", slice, ret); - return -EINVAL; +unwind_slices: + while (--slice >= 0) { + prueth_shutdown(prueth->txpru[slice]); + prueth_shutdown(prueth->rtu[slice]); + prueth_shutdown(prueth->pru[slice]); } - ret = rproc_set_firmware(prueth->rtu[slice], firmwares[slice].rtu); - ret = rproc_boot(prueth->rtu[slice]); - if (ret) { - dev_err(dev, "failed to boot RTU%d: %d\n", slice, ret); - goto halt_pru; + return ret; +} + +static void prueth_emac_stop(struct prueth *prueth) +{ + int slice; + + for (slice = 0; slice < PRUETH_NUM_MACS; slice++) { + prueth_shutdown(prueth->txpru[slice]); + prueth_shutdown(prueth->rtu[slice]); + prueth_shutdown(prueth->pru[slice]); } +} + +static int prueth_emac_common_start(struct prueth *prueth) +{ + struct prueth_emac *emac; + int ret = 0; + int slice; + + if (!prueth->emac[ICSS_SLICE0] && !prueth->emac[ICSS_SLICE1]) + return -EINVAL; + + /* clear SMEM and MSMC settings for all slices */ + memset_io(prueth->msmcram.va, 0, prueth->msmcram.size); + memset_io(prueth->shram.va, 0, ICSSG_CONFIG_OFFSET_SLICE1 * PRUETH_NUM_MACS); + + icssg_class_default(prueth->miig_rt, ICSS_SLICE0, 0, false); + icssg_class_default(prueth->miig_rt, ICSS_SLICE1, 0, false); + + if (prueth->is_switch_mode || prueth->is_hsr_offload_mode) + icssg_init_fw_offload_mode(prueth); + else + icssg_init_emac_mode(prueth); + + for (slice = 0; slice < PRUETH_NUM_MACS; slice++) { + emac = prueth->emac[slice]; + if (!emac) + continue; + ret = icssg_config(prueth, emac, slice); + if (ret) + goto disable_class; + } + + ret = prueth_emac_start(prueth); + if (ret) + goto disable_class; - ret = rproc_set_firmware(prueth->txpru[slice], firmwares[slice].txpru); - ret = rproc_boot(prueth->txpru[slice]); + emac = prueth->emac[ICSS_SLICE0] ? prueth->emac[ICSS_SLICE0] : + prueth->emac[ICSS_SLICE1]; + ret = icss_iep_init(emac->iep, &prueth_iep_clockops, + emac, IEP_DEFAULT_CYCLE_TIME_NS); if (ret) { - dev_err(dev, "failed to boot TX_PRU%d: %d\n", slice, ret); - goto halt_rtu; + dev_err(prueth->dev, "Failed to initialize IEP module\n"); + goto stop_pruss; } - emac->fw_running = 1; return 0; -halt_rtu: - rproc_shutdown(prueth->rtu[slice]); +stop_pruss: + prueth_emac_stop(prueth); -halt_pru: - rproc_shutdown(prueth->pru[slice]); +disable_class: + icssg_class_disable(prueth->miig_rt, ICSS_SLICE0); + icssg_class_disable(prueth->miig_rt, ICSS_SLICE1); return ret; } +static int prueth_emac_common_stop(struct prueth *prueth) +{ + struct prueth_emac *emac; + + if (!prueth->emac[ICSS_SLICE0] && !prueth->emac[ICSS_SLICE1]) + return -EINVAL; + + icssg_class_disable(prueth->miig_rt, ICSS_SLICE0); + icssg_class_disable(prueth->miig_rt, ICSS_SLICE1); + + prueth_emac_stop(prueth); + + emac = prueth->emac[ICSS_SLICE0] ? prueth->emac[ICSS_SLICE0] : + prueth->emac[ICSS_SLICE1]; + icss_iep_exit(emac->iep); + + return 0; +} + /* called back by PHY layer if there is change in link state of hw port*/ static void emac_adjust_link(struct net_device *ndev) { @@ -374,9 +466,6 @@ static void prueth_iep_settime(void *clockops_data, u64 ns) u32 cycletime; int timeout; - if (!emac->fw_running) - return; - sc_descp = emac->prueth->shram.va + TIMESYNC_FW_WC_SETCLOCK_DESC_OFFSET; cycletime = IEP_DEFAULT_CYCLE_TIME_NS; @@ -543,23 +632,17 @@ static int emac_ndo_open(struct net_device *ndev) { struct prueth_emac *emac = netdev_priv(ndev); int ret, i, num_data_chn = emac->tx_ch_num; + struct icssg_flow_cfg __iomem *flow_cfg; struct prueth *prueth = emac->prueth; int slice = prueth_emac_slice(emac); struct device *dev = prueth->dev; int max_rx_flows; int rx_flow; - /* clear SMEM and MSMC settings for all slices */ - if (!prueth->emacs_initialized) { - memset_io(prueth->msmcram.va, 0, prueth->msmcram.size); - memset_io(prueth->shram.va, 0, ICSSG_CONFIG_OFFSET_SLICE1 * PRUETH_NUM_MACS); - } - /* set h/w MAC as user might have re-configured */ ether_addr_copy(emac->mac_addr, ndev->dev_addr); icssg_class_set_mac_addr(prueth->miig_rt, slice, emac->mac_addr); - icssg_class_default(prueth->miig_rt, slice, 0, false); icssg_ft1_set_mac_addr(prueth->miig_rt, slice, emac->mac_addr); /* Notify the stack of the actual queue counts. */ @@ -597,18 +680,23 @@ static int emac_ndo_open(struct net_device *ndev) goto cleanup_napi; } - /* reset and start PRU firmware */ - ret = prueth_emac_start(prueth, emac); - if (ret) - goto free_rx_irq; + if (!prueth->emacs_initialized) { + ret = prueth_emac_common_start(prueth); + if (ret) + goto free_rx_irq; + } - icssg_mii_update_mtu(prueth->mii_rt, slice, ndev->max_mtu); + flow_cfg = emac->dram.va + ICSSG_CONFIG_OFFSET + PSI_L_REGULAR_FLOW_ID_BASE_OFFSET; + writew(emac->rx_flow_id_base, &flow_cfg->rx_base_flow); + ret = emac_fdb_flow_id_updated(emac); - if (!prueth->emacs_initialized) { - ret = icss_iep_init(emac->iep, &prueth_iep_clockops, - emac, IEP_DEFAULT_CYCLE_TIME_NS); + if (ret) { + netdev_err(ndev, "Failed to update Rx Flow ID %d", ret); + goto stop; } + icssg_mii_update_mtu(prueth->mii_rt, slice, ndev->max_mtu); + ret = request_threaded_irq(emac->tx_ts_irq, NULL, prueth_tx_ts_irq, IRQF_ONESHOT, dev_name(dev), emac); if (ret) @@ -653,7 +741,8 @@ static int emac_ndo_open(struct net_device *ndev) free_tx_ts_irq: free_irq(emac->tx_ts_irq, emac); stop: - prueth_emac_stop(emac); + if (!prueth->emacs_initialized) + prueth_emac_common_stop(prueth); free_rx_irq: free_irq(emac->rx_chns.irq[rx_flow], emac); cleanup_napi: @@ -689,8 +778,6 @@ static int emac_ndo_stop(struct net_device *ndev) if (ndev->phydev) phy_stop(ndev->phydev); - icssg_class_disable(prueth->miig_rt, prueth_emac_slice(emac)); - if (emac->prueth->is_hsr_offload_mode) __dev_mc_unsync(ndev, icssg_prueth_hsr_del_mcast); else @@ -728,11 +815,9 @@ static int emac_ndo_stop(struct net_device *ndev) /* Destroying the queued work in ndo_stop() */ cancel_delayed_work_sync(&emac->stats_work); - if (prueth->emacs_initialized == 1) - icss_iep_exit(emac->iep); - /* stop PRUs */ - prueth_emac_stop(emac); + if (prueth->emacs_initialized == 1) + prueth_emac_common_stop(prueth); free_irq(emac->tx_ts_irq, emac); @@ -1053,10 +1138,11 @@ static void prueth_offload_fwd_mark_update(struct prueth *prueth) } } -static void prueth_emac_restart(struct prueth *prueth) +static int prueth_emac_restart(struct prueth *prueth) { struct prueth_emac *emac0 = prueth->emac[PRUETH_MAC0]; struct prueth_emac *emac1 = prueth->emac[PRUETH_MAC1]; + int ret; /* Detach the net_device for both PRUeth ports*/ if (netif_running(emac0->ndev)) @@ -1065,36 +1151,46 @@ static void prueth_emac_restart(struct prueth *prueth) netif_device_detach(emac1->ndev); /* Disable both PRUeth ports */ - icssg_set_port_state(emac0, ICSSG_EMAC_PORT_DISABLE); - icssg_set_port_state(emac1, ICSSG_EMAC_PORT_DISABLE); + ret = icssg_set_port_state(emac0, ICSSG_EMAC_PORT_DISABLE); + ret |= icssg_set_port_state(emac1, ICSSG_EMAC_PORT_DISABLE); + if (ret) + return ret; /* Stop both pru cores for both PRUeth ports*/ - prueth_emac_stop(emac0); - prueth->emacs_initialized--; - prueth_emac_stop(emac1); - prueth->emacs_initialized--; + ret = prueth_emac_common_stop(prueth); + if (ret) { + dev_err(prueth->dev, "Failed to stop the firmwares"); + return ret; + } /* Start both pru cores for both PRUeth ports */ - prueth_emac_start(prueth, emac0); - prueth->emacs_initialized++; - prueth_emac_start(prueth, emac1); - prueth->emacs_initialized++; + ret = prueth_emac_common_start(prueth); + if (ret) { + dev_err(prueth->dev, "Failed to start the firmwares"); + return ret; + } /* Enable forwarding for both PRUeth ports */ - icssg_set_port_state(emac0, ICSSG_EMAC_PORT_FORWARD); - icssg_set_port_state(emac1, ICSSG_EMAC_PORT_FORWARD); + ret = icssg_set_port_state(emac0, ICSSG_EMAC_PORT_FORWARD); + ret |= icssg_set_port_state(emac1, ICSSG_EMAC_PORT_FORWARD); /* Attache net_device for both PRUeth ports */ netif_device_attach(emac0->ndev); netif_device_attach(emac1->ndev); + + return ret; } static void icssg_change_mode(struct prueth *prueth) { struct prueth_emac *emac; - int mac; + int mac, ret; - prueth_emac_restart(prueth); + ret = prueth_emac_restart(prueth); + if (ret) { + dev_err(prueth->dev, "Failed to restart the firmwares, aborting the process"); + return; + } for (mac = PRUETH_MAC0; mac < PRUETH_NUM_MACS; mac++) { emac = prueth->emac[mac]; @@ -1173,13 +1269,18 @@ static void prueth_netdevice_port_unlink(struct net_device *ndev) { struct prueth_emac *emac = netdev_priv(ndev); struct prueth *prueth = emac->prueth; + int ret; prueth->br_members &= ~BIT(emac->port_id); if (prueth->is_switch_mode) { prueth->is_switch_mode = false; emac->port_vlan = 0; - prueth_emac_restart(prueth); + ret = prueth_emac_restart(prueth); + if (ret) { + dev_err(prueth->dev, "Failed to restart the firmwares, aborting the process"); + return; + } } prueth_offload_fwd_mark_update(prueth); @@ -1228,6 +1329,7 @@ static void prueth_hsr_port_unlink(struct net_device *ndev) struct prueth *prueth = emac->prueth; struct prueth_emac *emac0; struct prueth_emac *emac1; + int ret; emac0 = prueth->emac[PRUETH_MAC0]; emac1 = prueth->emac[PRUETH_MAC1]; @@ -1238,7 +1340,11 @@ static void prueth_hsr_port_unlink(struct net_device *ndev) emac0->port_vlan = 0; emac1->port_vlan = 0; prueth->hsr_dev = NULL; - prueth_emac_restart(prueth); + ret = prueth_emac_restart(prueth); + if (ret) { + dev_err(prueth->dev, "Failed to restart the firmwares, aborting the process"); + return; + } netdev_dbg(ndev, "Disabling HSR Offload mode\n"); } } @@ -1413,13 +1519,10 @@ static int prueth_probe(struct platform_device *pdev) prueth->pa_stats = NULL; } - if (eth0_node) { + if (eth0_node || eth1_node) { ret = prueth_get_cores(prueth, ICSS_SLICE0, false); if (ret) goto put_cores; - } - - if (eth1_node) { ret = prueth_get_cores(prueth, ICSS_SLICE1, false); if (ret) goto put_cores; @@ -1618,14 +1721,12 @@ static int prueth_probe(struct platform_device *pdev) pruss_put(prueth->pruss); put_cores: - if (eth1_node) { - prueth_put_cores(prueth, ICSS_SLICE1); - of_node_put(eth1_node); - } - - if (eth0_node) { + if (eth0_node || eth1_node) { prueth_put_cores(prueth, ICSS_SLICE0); of_node_put(eth0_node); + + prueth_put_cores(prueth, ICSS_SLICE1); + of_node_put(eth1_node); } return ret; diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.h b/drivers/net/ethernet/ti/icssg/icssg_prueth.h index f5c1d473e9f99..5473315ea2040 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.h +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.h @@ -140,7 +140,6 @@ struct prueth_rx_chn { /* data for each emac port */ struct prueth_emac { bool is_sr1; - bool fw_running; struct prueth *prueth; struct net_device *ndev; u8 mac_addr[6]; @@ -361,6 +360,8 @@ int icssg_set_port_state(struct prueth_emac *emac, enum icssg_port_state_cmd state); void icssg_config_set_speed(struct prueth_emac *emac); void icssg_config_half_duplex(struct prueth_emac *emac); +void icssg_init_emac_mode(struct prueth *prueth); +void icssg_init_fw_offload_mode(struct prueth *prueth); /* Buffer queue helpers */ int icssg_queue_pop(struct prueth *prueth, u8 queue); @@ -377,6 +378,7 @@ void icssg_vtbl_modify(struct prueth_emac *emac, u8 vid, u8 port_mask, u8 untag_mask, bool add); u16 icssg_get_pvid(struct prueth_emac *emac); void icssg_set_pvid(struct prueth *prueth, u8 vid, u8 port); +int emac_fdb_flow_id_updated(struct prueth_emac *emac); #define prueth_napi_to_tx_chn(pnapi) \ container_of(pnapi, struct prueth_tx_chn, napi_tx) @@ -407,7 +409,6 @@ void emac_rx_timestamp(struct prueth_emac *emac, struct sk_buff *skb, u32 *psdata); enum netdev_tx icssg_ndo_start_xmit(struct sk_buff *skb, struct net_device *ndev); irqreturn_t prueth_rx_irq(int irq, void *dev_id); -void prueth_emac_stop(struct prueth_emac *emac); void prueth_cleanup_tx_ts(struct prueth_emac *emac); int icssg_napi_rx_poll(struct napi_struct *napi_rx, int budget); int prueth_prepare_rx_chan(struct prueth_emac *emac, diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth_sr1.c b/drivers/net/ethernet/ti/icssg/icssg_prueth_sr1.c index 5024f0647a0d3..3dc86397c367d 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth_sr1.c +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth_sr1.c @@ -440,7 +440,6 @@ static int prueth_emac_start(struct prueth *prueth, struct prueth_emac *emac) goto halt_pru; } - emac->fw_running = 1; return 0; halt_pru: @@ -449,6 +448,29 @@ static int prueth_emac_start(struct prueth *prueth, struct prueth_emac *emac) return ret; } +static void prueth_emac_stop(struct prueth_emac *emac) +{ + struct prueth *prueth = emac->prueth; + int slice; + + switch (emac->port_id) { + case PRUETH_PORT_MII0: + slice = ICSS_SLICE0; + break; + case PRUETH_PORT_MII1: + slice = ICSS_SLICE1; + break; + default: + netdev_err(emac->ndev, "invalid port\n"); + return; + } + + if (!emac->is_sr1) + rproc_shutdown(prueth->txpru[slice]); + rproc_shutdown(prueth->rtu[slice]); + rproc_shutdown(prueth->pru[slice]); +} + /** * emac_ndo_open - EMAC device open * @ndev: network adapter device From 9b115361248dc6cce182a2dc030c1c70b0a9639e Mon Sep 17 00:00:00 2001 From: Meghana Malladi Date: Mon, 23 Dec 2024 20:45:50 +0530 Subject: [PATCH 1131/1450] net: ti: icssg-prueth: Fix clearing of IEP_CMP_CFG registers during iep_init When ICSSG interfaces are brought down and brought up again, the pru cores are shut down and booted again, flushing out all the memories and start again in a clean state. Hence it is expected that the IEP_CMP_CFG register needs to be flushed during iep_init() to ensure that the existing residual configuration doesn't cause any unusual behavior. If the register is not cleared, existing IEP_CMP_CFG set for CMP1 will result in SYNC0_OUT signal based on the SYNC_OUT register values. After bringing the interface up, calling PPS enable doesn't work as the driver believes PPS is already enabled, (iep->pps_enabled is not cleared during interface bring down) and driver will just return true even though there is no signal. Fix this by disabling pps and perout. Fixes: c1e0230eeaab ("net: ti: icss-iep: Add IEP driver") Signed-off-by: Meghana Malladi Reviewed-by: Roger Quadros Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/icssg/icss_iep.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/ethernet/ti/icssg/icss_iep.c b/drivers/net/ethernet/ti/icssg/icss_iep.c index 5d6d1cf78e93f..768578c0d9587 100644 --- a/drivers/net/ethernet/ti/icssg/icss_iep.c +++ b/drivers/net/ethernet/ti/icssg/icss_iep.c @@ -215,6 +215,9 @@ static void icss_iep_enable_shadow_mode(struct icss_iep *iep) for (cmp = IEP_MIN_CMP; cmp < IEP_MAX_CMP; cmp++) { regmap_update_bits(iep->map, ICSS_IEP_CMP_STAT_REG, IEP_CMP_STATUS(cmp), IEP_CMP_STATUS(cmp)); + + regmap_update_bits(iep->map, ICSS_IEP_CMP_CFG_REG, + IEP_CMP_CFG_CMP_EN(cmp), 0); } /* enable reset counter on CMP0 event */ @@ -780,6 +783,11 @@ int icss_iep_exit(struct icss_iep *iep) } icss_iep_disable(iep); + if (iep->pps_enabled) + icss_iep_pps_enable(iep, false); + else if (iep->perout_enabled) + icss_iep_perout_enable(iep, NULL, false); + return 0; } EXPORT_SYMBOL_GPL(icss_iep_exit); From 3fff5da4ca2164bb4d0f1e6cd33f6eb8a0e73e50 Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Mon, 30 Dec 2024 12:56:47 -0800 Subject: [PATCH 1132/1450] team: prevent adding a device which is already a team device lower Prevent adding a device which is already a team device lower, e.g. adding veth0 if vlan1 was already added and veth0 is a lower of vlan1. This is not useful in practice and can lead to recursive locking: $ ip link add veth0 type veth peer name veth1 $ ip link set veth0 up $ ip link set veth1 up $ ip link add link veth0 name veth0.1 type vlan protocol 802.1Q id 1 $ ip link add team0 type team $ ip link set veth0.1 down $ ip link set veth0.1 master team0 team0: Port device veth0.1 added $ ip link set veth0 down $ ip link set veth0 master team0 ============================================ WARNING: possible recursive locking detected 6.13.0-rc2-virtme-00441-ga14a429069bb #46 Not tainted -------------------------------------------- ip/7684 is trying to acquire lock: ffff888016848e00 (team->team_lock_key){+.+.}-{4:4}, at: team_device_event (drivers/net/team/team_core.c:2928 drivers/net/team/team_core.c:2951 drivers/net/team/team_core.c:2973) but task is already holding lock: ffff888016848e00 (team->team_lock_key){+.+.}-{4:4}, at: team_add_slave (drivers/net/team/team_core.c:1147 drivers/net/team/team_core.c:1977) other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(team->team_lock_key); lock(team->team_lock_key); *** DEADLOCK *** May be due to missing lock nesting notation 2 locks held by ip/7684: stack backtrace: CPU: 3 UID: 0 PID: 7684 Comm: ip Not tainted 6.13.0-rc2-virtme-00441-ga14a429069bb #46 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 Call Trace: dump_stack_lvl (lib/dump_stack.c:122) print_deadlock_bug.cold (kernel/locking/lockdep.c:3040) __lock_acquire (kernel/locking/lockdep.c:3893 kernel/locking/lockdep.c:5226) ? netlink_broadcast_filtered (net/netlink/af_netlink.c:1548) lock_acquire.part.0 (kernel/locking/lockdep.c:467 kernel/locking/lockdep.c:5851) ? team_device_event (drivers/net/team/team_core.c:2928 drivers/net/team/team_core.c:2951 drivers/net/team/team_core.c:2973) ? trace_lock_acquire (./include/trace/events/lock.h:24 (discriminator 2)) ? team_device_event (drivers/net/team/team_core.c:2928 drivers/net/team/team_core.c:2951 drivers/net/team/team_core.c:2973) ? lock_acquire (kernel/locking/lockdep.c:5822) ? team_device_event (drivers/net/team/team_core.c:2928 drivers/net/team/team_core.c:2951 drivers/net/team/team_core.c:2973) __mutex_lock (kernel/locking/mutex.c:587 kernel/locking/mutex.c:735) ? team_device_event (drivers/net/team/team_core.c:2928 drivers/net/team/team_core.c:2951 drivers/net/team/team_core.c:2973) ? team_device_event (drivers/net/team/team_core.c:2928 drivers/net/team/team_core.c:2951 drivers/net/team/team_core.c:2973) ? fib_sync_up (net/ipv4/fib_semantics.c:2167) ? team_device_event (drivers/net/team/team_core.c:2928 drivers/net/team/team_core.c:2951 drivers/net/team/team_core.c:2973) team_device_event (drivers/net/team/team_core.c:2928 drivers/net/team/team_core.c:2951 drivers/net/team/team_core.c:2973) notifier_call_chain (kernel/notifier.c:85) call_netdevice_notifiers_info (net/core/dev.c:1996) __dev_notify_flags (net/core/dev.c:8993) ? __dev_change_flags (net/core/dev.c:8975) dev_change_flags (net/core/dev.c:9027) vlan_device_event (net/8021q/vlan.c:85 net/8021q/vlan.c:470) ? br_device_event (net/bridge/br.c:143) notifier_call_chain (kernel/notifier.c:85) call_netdevice_notifiers_info (net/core/dev.c:1996) dev_open (net/core/dev.c:1519 net/core/dev.c:1505) team_add_slave (drivers/net/team/team_core.c:1219 drivers/net/team/team_core.c:1977) ? __pfx_team_add_slave (drivers/net/team/team_core.c:1972) do_set_master (net/core/rtnetlink.c:2917) do_setlink.isra.0 (net/core/rtnetlink.c:3117) Reported-by: syzbot+3c47b5843403a45aef57@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=3c47b5843403a45aef57 Fixes: 3d249d4ca7d0 ("net: introduce ethernet teaming device") Signed-off-by: Octavian Purdila Reviewed-by: Hangbin Liu Signed-off-by: David S. Miller --- drivers/net/team/team_core.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/team/team_core.c b/drivers/net/team/team_core.c index c7690adec8db7..dc7cbd6a9798a 100644 --- a/drivers/net/team/team_core.c +++ b/drivers/net/team/team_core.c @@ -1175,6 +1175,13 @@ static int team_port_add(struct team *team, struct net_device *port_dev, return -EBUSY; } + if (netdev_has_upper_dev(port_dev, dev)) { + NL_SET_ERR_MSG(extack, "Device is already a lower device of the team interface"); + netdev_err(dev, "Device %s is already a lower device of the team interface\n", + portname); + return -EBUSY; + } + if (port_dev->features & NETIF_F_VLAN_CHALLENGED && vlan_uses_dev(dev)) { NL_SET_ERR_MSG(extack, "Device is VLAN challenged and team device has VLAN set up"); From 45d339fefaa3dcd237038769e0d34584fb867390 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Thu, 19 Dec 2024 14:23:36 +0200 Subject: [PATCH 1133/1450] RDMA/mlx5: Enable multiplane mode only when it is supported Driver queries vport_cxt.num_plane and enables multiplane when it is greater then 0, but some old FWs (versions from x.40.1000 till x.42.1000), report vport_cxt.num_plane = 1 unexpectedly. Fix it by querying num_plane only when HCA_CAP2.multiplane bit is set. Fixes: 2a5db20fa532 ("RDMA/mlx5: Add support to multi-plane device and port") Link: https://patch.msgid.link/r/1ef901acdf564716fcf550453cf5e94f343777ec.1734610916.git.leon@kernel.org Cc: stable@vger.kernel.org Reported-by: Francesco Poli Closes: https://lore.kernel.org/all/nvs4i2v7o6vn6zhmtq4sgazy2hu5kiulukxcntdelggmznnl7h@so3oul6uwgbl/ Signed-off-by: Mark Zhang Signed-off-by: Leon Romanovsky Reviewed-by: Michal Swiatkowski Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 2 +- include/linux/mlx5/mlx5_ifc.h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index c2314797afc9b..f5b59d02f4d3a 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2839,7 +2839,7 @@ static int mlx5_ib_get_plane_num(struct mlx5_core_dev *mdev, u8 *num_plane) int err; *num_plane = 0; - if (!MLX5_CAP_GEN(mdev, ib_virt)) + if (!MLX5_CAP_GEN(mdev, ib_virt) || !MLX5_CAP_GEN_2(mdev, multiplane)) return 0; err = mlx5_query_hca_vport_context(mdev, 0, 1, 0, &vport_ctx); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 4fbbcf35498bb..48d47181c7cd1 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2119,7 +2119,9 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 migration_in_chunks[0x1]; u8 reserved_at_d1[0x1]; u8 sf_eq_usage[0x1]; - u8 reserved_at_d3[0xd]; + u8 reserved_at_d3[0x5]; + u8 multiplane[0x1]; + u8 reserved_at_d9[0x7]; u8 cross_vhca_object_to_object_supported[0x20]; From 09dfc8a5f2ce897005a94bf66cca4f91e4e03700 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 2 Jan 2025 11:32:54 -0700 Subject: [PATCH 1134/1450] vfio/pci: Fallback huge faults for unaligned pfn The PFN must also be aligned to the fault order to insert a huge pfnmap. Test the alignment and fallback when unaligned. Fixes: f9e54c3a2f5b ("vfio/pci: implement huge_fault support") Link: https://bugzilla.kernel.org/show_bug.cgi?id=219619 Reported-by: Athul Krishna Reported-by: Precific Reviewed-by: Peter Xu Tested-by: Precific Link: https://lore.kernel.org/r/20250102183416.1841878-1-alex.williamson@redhat.com Cc: stable@vger.kernel.org Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 1ab58da9f38a6..1a4ed5a357d36 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1661,14 +1661,15 @@ static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf, unsigned long pfn, pgoff = vmf->pgoff - vma->vm_pgoff; vm_fault_t ret = VM_FAULT_SIGBUS; - if (order && (vmf->address & ((PAGE_SIZE << order) - 1) || + pfn = vma_to_pfn(vma) + pgoff; + + if (order && (pfn & ((1 << order) - 1) || + vmf->address & ((PAGE_SIZE << order) - 1) || vmf->address + (PAGE_SIZE << order) > vma->vm_end)) { ret = VM_FAULT_FALLBACK; goto out; } - pfn = vma_to_pfn(vma); - down_read(&vdev->memory_lock); if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) @@ -1676,18 +1677,18 @@ static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf, switch (order) { case 0: - ret = vmf_insert_pfn(vma, vmf->address, pfn + pgoff); + ret = vmf_insert_pfn(vma, vmf->address, pfn); break; #ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP case PMD_ORDER: - ret = vmf_insert_pfn_pmd(vmf, __pfn_to_pfn_t(pfn + pgoff, - PFN_DEV), false); + ret = vmf_insert_pfn_pmd(vmf, + __pfn_to_pfn_t(pfn, PFN_DEV), false); break; #endif #ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP case PUD_ORDER: - ret = vmf_insert_pfn_pud(vmf, __pfn_to_pfn_t(pfn + pgoff, - PFN_DEV), false); + ret = vmf_insert_pfn_pud(vmf, + __pfn_to_pfn_t(pfn, PFN_DEV), false); break; #endif default: From 6df90c02bae468a3a6110bafbc659884d0c4966c Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Wed, 18 Dec 2024 13:56:58 +0100 Subject: [PATCH 1135/1450] dm-verity FEC: Fix RS FEC repair for roots unaligned to block size (take 2) This patch fixes an issue that was fixed in the commit df7b59ba9245 ("dm verity: fix FEC for RS roots unaligned to block size") but later broken again in the commit 8ca7cab82bda ("dm verity fec: fix misaligned RS roots IO") If the Reed-Solomon roots setting spans multiple blocks, the code does not use proper parity bytes and randomly fails to repair even trivial errors. This bug cannot happen if the sector size is multiple of RS roots setting (Android case with roots 2). The previous solution was to find a dm-bufio block size that is multiple of the device sector size and roots size. Unfortunately, the optimization in commit 8ca7cab82bda ("dm verity fec: fix misaligned RS roots IO") is incorrect and uses data block size for some roots (for example, it uses 4096 block size for roots = 20). This patch uses a different approach: - It always uses a configured data block size for dm-bufio to avoid possible misaligned IOs. - and it caches the processed parity bytes, so it can join it if it spans two blocks. As the RS calculation is called only if an error is detected and the process is computationally intensive, copying a few more bytes should not introduce performance issues. The issue was reported to cryptsetup with trivial reproducer https://gitlab.com/cryptsetup/cryptsetup/-/issues/923 Reproducer (with roots=20): # create verity device with RS FEC dd if=/dev/urandom of=data.img bs=4096 count=8 status=none veritysetup format data.img hash.img --fec-device=fec.img --fec-roots=20 | \ awk '/^Root hash/{ print $3 }' >roothash # create an erasure that should always be repairable with this roots setting dd if=/dev/zero of=data.img conv=notrunc bs=1 count=4 seek=4 status=none # try to read it through dm-verity veritysetup open data.img test hash.img --fec-device=fec.img --fec-roots=20 $(cat roothash) dd if=/dev/mapper/test of=/dev/null bs=4096 status=noxfer Even now the log says it cannot repair it: : verity-fec: 7:1: FEC 0: failed to correct: -74 : device-mapper: verity: 7:1: data block 0 is corrupted ... With this fix, errors are properly repaired. : verity-fec: 7:1: FEC 0: corrected 4 errors Signed-off-by: Milan Broz Fixes: 8ca7cab82bda ("dm verity fec: fix misaligned RS roots IO") Cc: stable@vger.kernel.org Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-fec.c | 40 +++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 62b1a44b8dd2e..6bd9848518d47 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -60,15 +60,19 @@ static int fec_decode_rs8(struct dm_verity *v, struct dm_verity_fec_io *fio, * to the data block. Caller is responsible for releasing buf. */ static u8 *fec_read_parity(struct dm_verity *v, u64 rsb, int index, - unsigned int *offset, struct dm_buffer **buf, - unsigned short ioprio) + unsigned int *offset, unsigned int par_buf_offset, + struct dm_buffer **buf, unsigned short ioprio) { u64 position, block, rem; u8 *res; + /* We have already part of parity bytes read, skip to the next block */ + if (par_buf_offset) + index++; + position = (index + rsb) * v->fec->roots; block = div64_u64_rem(position, v->fec->io_size, &rem); - *offset = (unsigned int)rem; + *offset = par_buf_offset ? 0 : (unsigned int)rem; res = dm_bufio_read_with_ioprio(v->fec->bufio, block, buf, ioprio); if (IS_ERR(res)) { @@ -128,11 +132,12 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, { int r, corrected = 0, res; struct dm_buffer *buf; - unsigned int n, i, offset; - u8 *par, *block; + unsigned int n, i, offset, par_buf_offset = 0; + u8 *par, *block, par_buf[DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN]; struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); - par = fec_read_parity(v, rsb, block_offset, &offset, &buf, bio_prio(bio)); + par = fec_read_parity(v, rsb, block_offset, &offset, + par_buf_offset, &buf, bio_prio(bio)); if (IS_ERR(par)) return PTR_ERR(par); @@ -142,7 +147,8 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, */ fec_for_each_buffer_rs_block(fio, n, i) { block = fec_buffer_rs_block(v, fio, n, i); - res = fec_decode_rs8(v, fio, block, &par[offset], neras); + memcpy(&par_buf[par_buf_offset], &par[offset], v->fec->roots - par_buf_offset); + res = fec_decode_rs8(v, fio, block, par_buf, neras); if (res < 0) { r = res; goto error; @@ -155,12 +161,21 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, if (block_offset >= 1 << v->data_dev_block_bits) goto done; - /* read the next block when we run out of parity bytes */ - offset += v->fec->roots; + /* Read the next block when we run out of parity bytes */ + offset += (v->fec->roots - par_buf_offset); + /* Check if parity bytes are split between blocks */ + if (offset < v->fec->io_size && (offset + v->fec->roots) > v->fec->io_size) { + par_buf_offset = v->fec->io_size - offset; + memcpy(par_buf, &par[offset], par_buf_offset); + offset += par_buf_offset; + } else + par_buf_offset = 0; + if (offset >= v->fec->io_size) { dm_bufio_release(buf); - par = fec_read_parity(v, rsb, block_offset, &offset, &buf, bio_prio(bio)); + par = fec_read_parity(v, rsb, block_offset, &offset, + par_buf_offset, &buf, bio_prio(bio)); if (IS_ERR(par)) return PTR_ERR(par); } @@ -724,10 +739,7 @@ int verity_fec_ctr(struct dm_verity *v) return -E2BIG; } - if ((f->roots << SECTOR_SHIFT) & ((1 << v->data_dev_block_bits) - 1)) - f->io_size = 1 << v->data_dev_block_bits; - else - f->io_size = v->fec->roots << SECTOR_SHIFT; + f->io_size = 1 << v->data_dev_block_bits; f->bufio = dm_bufio_client_create(f->dev->bdev, f->io_size, From 548c6edbed92031baa4aa32cae55628c810c3ebb Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Wed, 18 Dec 2024 13:56:59 +0100 Subject: [PATCH 1136/1450] dm-verity FEC: Avoid copying RS parity bytes twice. Caching RS parity bytes is already done in fec_decode_bufs() now, no need to use yet another buffer for conversion to uint16_t. This patch removes that double copy of RS parity bytes. Signed-off-by: Milan Broz Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-fec.c | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 6bd9848518d47..e61855da6461a 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -39,22 +39,6 @@ static inline u64 fec_interleave(struct dm_verity *v, u64 offset) return offset + mod * (v->fec->rounds << v->data_dev_block_bits); } -/* - * Decode an RS block using Reed-Solomon. - */ -static int fec_decode_rs8(struct dm_verity *v, struct dm_verity_fec_io *fio, - u8 *data, u8 *fec, int neras) -{ - int i; - uint16_t par[DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN]; - - for (i = 0; i < v->fec->roots; i++) - par[i] = fec[i]; - - return decode_rs8(fio->rs, data, par, v->fec->rsn, NULL, neras, - fio->erasures, 0, NULL); -} - /* * Read error-correcting codes for the requested RS block. Returns a pointer * to the data block. Caller is responsible for releasing buf. @@ -132,8 +116,9 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, { int r, corrected = 0, res; struct dm_buffer *buf; - unsigned int n, i, offset, par_buf_offset = 0; - u8 *par, *block, par_buf[DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN]; + unsigned int n, i, j, offset, par_buf_offset = 0; + uint16_t par_buf[DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN]; + u8 *par, *block; struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); par = fec_read_parity(v, rsb, block_offset, &offset, @@ -147,8 +132,11 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, */ fec_for_each_buffer_rs_block(fio, n, i) { block = fec_buffer_rs_block(v, fio, n, i); - memcpy(&par_buf[par_buf_offset], &par[offset], v->fec->roots - par_buf_offset); - res = fec_decode_rs8(v, fio, block, par_buf, neras); + for (j = 0; j < v->fec->roots - par_buf_offset; j++) + par_buf[par_buf_offset + j] = par[offset + j]; + /* Decode an RS block using Reed-Solomon */ + res = decode_rs8(fio->rs, block, par_buf, v->fec->rsn, + NULL, neras, fio->erasures, 0, NULL); if (res < 0) { r = res; goto error; @@ -166,7 +154,8 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, /* Check if parity bytes are split between blocks */ if (offset < v->fec->io_size && (offset + v->fec->roots) > v->fec->io_size) { par_buf_offset = v->fec->io_size - offset; - memcpy(par_buf, &par[offset], par_buf_offset); + for (j = 0; j < par_buf_offset; j++) + par_buf[j] = par[offset + j]; offset += par_buf_offset; } else par_buf_offset = 0; From ed123c948d06688d10f3b10a7bce1d6fbfd1ed07 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 3 Jan 2025 09:29:09 -0700 Subject: [PATCH 1137/1450] io_uring/kbuf: use pre-committed buffer address for non-pollable file For non-pollable files, buffer ring consumption will commit upfront. This is fine, but io_ring_buffer_select() will return the address of the buffer after having committed it. For incrementally consumed buffers, this is incorrect as it will modify the buffer address. Store the pre-committed value and return that. If that isn't done, then the initial part of the buffer is not used and the application will correctly assume the content arrived at the start of the userspace buffer, but the kernel will have put it later in the buffer. Or it can cause a spurious -EFAULT returned in the CQE, depending on the buffer size. As bounds are suitably checked for doing the actual IO, no adverse side effects are possible - it's just a data misplacement within the existing buffer. Reported-by: Gwendal Fernet Cc: stable@vger.kernel.org Fixes: ae98dbf43d75 ("io_uring/kbuf: add support for incremental buffer consumption") Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index d407576ddfb78..eec5eb7de8430 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -139,6 +139,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, struct io_uring_buf_ring *br = bl->buf_ring; __u16 tail, head = bl->head; struct io_uring_buf *buf; + void __user *ret; tail = smp_load_acquire(&br->tail); if (unlikely(tail == head)) @@ -153,6 +154,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT; req->buf_list = bl; req->buf_index = buf->bid; + ret = u64_to_user_ptr(buf->addr); if (issue_flags & IO_URING_F_UNLOCKED || !io_file_can_poll(req)) { /* @@ -168,7 +170,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, io_kbuf_commit(req, bl, *len, 1); req->buf_list = NULL; } - return u64_to_user_ptr(buf->addr); + return ret; } void __user *io_buffer_select(struct io_kiocb *req, size_t *len, From 03f275adb8fbd7b4ebe96a1ad5044d8e602692dc Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 1 Jan 2025 14:00:37 +0100 Subject: [PATCH 1138/1450] fuse: respect FOPEN_KEEP_CACHE on opendir The re-factoring of fuse_dir_open() missed the need to invalidate directory inode page cache with open flag FOPEN_KEEP_CACHE. Fixes: 7de64d521bf92 ("fuse: break up fuse_open_common()") Reported-by: Prince Kumar Closes: https://lore.kernel.org/linux-fsdevel/CAEW=TRr7CYb4LtsvQPLj-zx5Y+EYBmGfM24SuzwyDoGVNoKm7w@mail.gmail.com/ Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20250101130037.96680-1-amir73il@gmail.com Reviewed-by: Bernd Schubert Signed-off-by: Christian Brauner --- fs/fuse/dir.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 494ac372ace07..e540d05549fff 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1681,6 +1681,8 @@ static int fuse_dir_open(struct inode *inode, struct file *file) */ if (ff->open_flags & (FOPEN_STREAM | FOPEN_NONSEEKABLE)) nonseekable_open(inode, file); + if (!(ff->open_flags & FOPEN_KEEP_CACHE)) + invalidate_inode_pages2(inode->i_mapping); } return err; From 1e9b0e1c550c42c13c111d1a31e822057232abc4 Mon Sep 17 00:00:00 2001 From: Antonio Pastor Date: Thu, 2 Jan 2025 20:23:00 -0500 Subject: [PATCH 1139/1450] net: 802: LLC+SNAP OID:PID lookup on start of skb data 802.2+LLC+SNAP frames received by napi_complete_done() with GRO and DSA have skb->transport_header set two bytes short, or pointing 2 bytes before network_header & skb->data. This was an issue as snap_rcv() expected offset to point to SNAP header (OID:PID), causing packet to be dropped. A fix at llc_fixup_skb() (a024e377efed) resets transport_header for any LLC consumers that may care about it, and stops SNAP packets from being dropped, but doesn't fix the problem which is that LLC and SNAP should not use transport_header offset. Ths patch eliminates the use of transport_header offset for SNAP lookup of OID:PID so that SNAP does not rely on the offset at all. The offset is reset after pull for any SNAP packet consumers that may (but shouldn't) use it. Fixes: fda55eca5a33 ("net: introduce skb_transport_header_was_set()") Signed-off-by: Antonio Pastor Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250103012303.746521-1-antonio.pastor@gmail.com Signed-off-by: Jakub Kicinski --- net/802/psnap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/802/psnap.c b/net/802/psnap.c index fca9d454905fe..389df460c8c4b 100644 --- a/net/802/psnap.c +++ b/net/802/psnap.c @@ -55,11 +55,11 @@ static int snap_rcv(struct sk_buff *skb, struct net_device *dev, goto drop; rcu_read_lock(); - proto = find_snap_client(skb_transport_header(skb)); + proto = find_snap_client(skb->data); if (proto) { /* Pass the frame on. */ - skb->transport_header += 5; skb_pull_rcsum(skb, 5); + skb_reset_transport_header(skb); rc = proto->rcvfunc(skb, dev, &snap_packet_type, orig_dev); } rcu_read_unlock(); From 3479c7549fb1dfa7a1db4efb7347c7b8ef50de4b Mon Sep 17 00:00:00 2001 From: Zhongqiu Duan Date: Thu, 2 Jan 2025 17:14:26 +0000 Subject: [PATCH 1140/1450] tcp/dccp: allow a connection when sk_max_ack_backlog is zero If the backlog of listen() is set to zero, sk_acceptq_is_full() allows one connection to be made, but inet_csk_reqsk_queue_is_full() does not. When the net.ipv4.tcp_syncookies is zero, inet_csk_reqsk_queue_is_full() will cause an immediate drop before the sk_acceptq_is_full() check in tcp_conn_request(), resulting in no connection can be made. This patch tries to keep consistent with 64a146513f8f ("[NET]: Revert incorrect accept queue backlog changes."). Link: https://lore.kernel.org/netdev/20250102080258.53858-1-kuniyu@amazon.com/ Fixes: ef547f2ac16b ("tcp: remove max_qlen_log") Signed-off-by: Zhongqiu Duan Reviewed-by: Kuniyuki Iwashima Reviewed-by: Jason Xing Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250102171426.915276-1-dzq.aishenghu0@gmail.com Signed-off-by: Jakub Kicinski --- include/net/inet_connection_sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 3c82fad904d4c..c7f42844c79a9 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -282,7 +282,7 @@ static inline int inet_csk_reqsk_queue_len(const struct sock *sk) static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk) { - return inet_csk_reqsk_queue_len(sk) >= READ_ONCE(sk->sk_max_ack_backlog); + return inet_csk_reqsk_queue_len(sk) > READ_ONCE(sk->sk_max_ack_backlog); } bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req); From a003c38d9bbbacd26b2354795bddb8d25631b0b5 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Thu, 2 Jan 2025 12:41:21 +0000 Subject: [PATCH 1141/1450] net: pcs: pcs-mtk-lynxi: correctly report in-band status capabilities Neither does the LynxI PCS support QSGMII, nor is in-band-status supported in 2500Base-X mode. Fix the pcs_inband_caps() method accordingly. Fixes: 520d29bdda86 ("net: pcs: pcs-mtk-lynxi: implement pcs_inband_caps() method") Signed-off-by: Daniel Golle Link: https://patch.msgid.link/Z3aJccb1vW14aukg@pidgin.makrotopia.org Signed-off-by: Jakub Kicinski --- drivers/net/pcs/pcs-mtk-lynxi.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/pcs/pcs-mtk-lynxi.c b/drivers/net/pcs/pcs-mtk-lynxi.c index 7de8045352298..ed91cd7a406ae 100644 --- a/drivers/net/pcs/pcs-mtk-lynxi.c +++ b/drivers/net/pcs/pcs-mtk-lynxi.c @@ -93,11 +93,12 @@ static unsigned int mtk_pcs_lynxi_inband_caps(struct phylink_pcs *pcs, { switch (interface) { case PHY_INTERFACE_MODE_1000BASEX: - case PHY_INTERFACE_MODE_2500BASEX: case PHY_INTERFACE_MODE_SGMII: - case PHY_INTERFACE_MODE_QSGMII: return LINK_INBAND_DISABLE | LINK_INBAND_ENABLE; + case PHY_INTERFACE_MODE_2500BASEX: + return LINK_INBAND_DISABLE; + default: return 0; } From 3569399994384f7e409a560910613edc2ad4a779 Mon Sep 17 00:00:00 2001 From: Furong Xu <0x1207@gmail.com> Date: Fri, 20 Dec 2024 16:07:26 +0800 Subject: [PATCH 1142/1450] net: stmmac: TSO: Simplify the code flow of DMA descriptor allocations The TCP Segmentation Offload (TSO) engine is an optional function in DWMAC cores, it is implemented for dwmac4 and dwxgmac2 only, ancient dwmac100 and dwmac1000 are not supported by hardware. Current driver code checks priv->dma_cap.tsoen which is read from MAC_HW_Feature1 register to determine if TSO is enabled in hardware configurations, if (!priv->dma_cap.tsoen) driver never sets NETIF_F_TSO for net_device. This patch never affects dwmac100/dwmac1000 and their stmmac_desc_ops: ndesc_ops/enh_desc_ops, since TSO is never supported by them two. The DMA AXI address width of DWMAC cores can be configured to 32-bit/40-bit/48-bit, then the format of DMA transmit descriptors get a little different between 32-bit and 40-bit/48-bit. Current driver code checks priv->dma_cap.addr64 to use certain format with certain configuration. This patch converts the format of DMA transmit descriptors on dwmac4 and dwxgmac2 that the DMA AXI address width is configured to 32-bit (as described by function comments of stmmac_tso_xmit() in current code) to a more generic format (see updated function comments after this patch) which is actually already used on 40-bit/48-bit platforms to provide better compatibility and make code flow cleaner in TSO TX routine. Another interesting finding, struct stmmac_desc_ops is a common abstract interface to maintain descriptors, we should avoid the direct assignment of descriptor members (e.g. desc->des0), stmmac_set_desc_addr() is the proper method yet. This patch tries to improve this by the way. Tested and verified on: DWMAC CORE 5.00a with 32-bit DMA AXI address width DWMAC CORE 5.10a with 32-bit DMA AXI address width DWXGMAC CORE 3.20a with 40-bit DMA AXI address width Signed-off-by: Furong Xu <0x1207@gmail.com> Link: https://patch.msgid.link/20241220080726.1733837-1-0x1207@gmail.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/stmicro/stmmac/stmmac_main.c | 60 ++++++++----------- 1 file changed, 24 insertions(+), 36 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 6bc10ffe7a2bc..99eaec8bac4a9 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -4116,11 +4116,7 @@ static void stmmac_tso_allocator(struct stmmac_priv *priv, dma_addr_t des, desc = &tx_q->dma_tx[tx_q->cur_tx]; curr_addr = des + (total_len - tmp_len); - if (priv->dma_cap.addr64 <= 32) - desc->des0 = cpu_to_le32(curr_addr); - else - stmmac_set_desc_addr(priv, desc, curr_addr); - + stmmac_set_desc_addr(priv, desc, curr_addr); buff_size = tmp_len >= TSO_MAX_BUFF_SIZE ? TSO_MAX_BUFF_SIZE : tmp_len; @@ -4166,17 +4162,27 @@ static void stmmac_flush_tx_descriptors(struct stmmac_priv *priv, int queue) * First Descriptor * -------- * | DES0 |---> buffer1 = L2/L3/L4 header - * | DES1 |---> TCP Payload (can continue on next descr...) - * | DES2 |---> buffer 1 and 2 len + * | DES1 |---> can be used as buffer2 for TCP Payload if the DMA AXI address + * | | width is 32-bit, but we never use it. + * | | Also can be used as the most-significant 8-bits or 16-bits of + * | | buffer1 address pointer if the DMA AXI address width is 40-bit + * | | or 48-bit, and we always use it. + * | DES2 |---> buffer1 len * | DES3 |---> must set TSE, TCP hdr len-> [22:19]. TCP payload len [17:0] * -------- + * -------- + * | DES0 |---> buffer1 = TCP Payload (can continue on next descr...) + * | DES1 |---> same as the First Descriptor + * | DES2 |---> buffer1 len + * | DES3 | + * -------- * | * ... * | * -------- - * | DES0 | --| Split TCP Payload on Buffers 1 and 2 - * | DES1 | --| - * | DES2 | --> buffer 1 and 2 len + * | DES0 |---> buffer1 = Split TCP Payload + * | DES1 |---> same as the First Descriptor + * | DES2 |---> buffer1 len * | DES3 | * -------- * @@ -4186,15 +4192,14 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev) { struct dma_desc *desc, *first, *mss_desc = NULL; struct stmmac_priv *priv = netdev_priv(dev); - int tmp_pay_len = 0, first_tx, nfrags; unsigned int first_entry, tx_packets; struct stmmac_txq_stats *txq_stats; struct stmmac_tx_queue *tx_q; u32 pay_len, mss, queue; - dma_addr_t tso_des, des; + int i, first_tx, nfrags; u8 proto_hdr_len, hdr; + dma_addr_t des; bool set_ic; - int i; /* Always insert VLAN tag to SKB payload for TSO frames. * @@ -4279,24 +4284,9 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev) if (dma_mapping_error(priv->device, des)) goto dma_map_err; - if (priv->dma_cap.addr64 <= 32) { - first->des0 = cpu_to_le32(des); - - /* Fill start of payload in buff2 of first descriptor */ - if (pay_len) - first->des1 = cpu_to_le32(des + proto_hdr_len); - - /* If needed take extra descriptors to fill the remaining payload */ - tmp_pay_len = pay_len - TSO_MAX_BUFF_SIZE; - tso_des = des; - } else { - stmmac_set_desc_addr(priv, first, des); - tmp_pay_len = pay_len; - tso_des = des + proto_hdr_len; - pay_len = 0; - } - - stmmac_tso_allocator(priv, tso_des, tmp_pay_len, (nfrags == 0), queue); + stmmac_set_desc_addr(priv, first, des); + stmmac_tso_allocator(priv, des + proto_hdr_len, pay_len, + (nfrags == 0), queue); /* In case two or more DMA transmit descriptors are allocated for this * non-paged SKB data, the DMA buffer address should be saved to @@ -4400,11 +4390,9 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev) } /* Complete the first descriptor before granting the DMA */ - stmmac_prepare_tso_tx_desc(priv, first, 1, - proto_hdr_len, - pay_len, - 1, tx_q->tx_skbuff_dma[first_entry].last_segment, - hdr / 4, (skb->len - proto_hdr_len)); + stmmac_prepare_tso_tx_desc(priv, first, 1, proto_hdr_len, 0, 1, + tx_q->tx_skbuff_dma[first_entry].last_segment, + hdr / 4, (skb->len - proto_hdr_len)); /* If context desc is used to change MSS */ if (mss_desc) { From a039e54397c6a75b713b9ce7894a62e06956aa92 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 Jan 2025 10:45:46 +0000 Subject: [PATCH 1143/1450] net_sched: cls_flow: validate TCA_FLOW_RSHIFT attribute syzbot found that TCA_FLOW_RSHIFT attribute was not validated. Right shitfing a 32bit integer is undefined for large shift values. UBSAN: shift-out-of-bounds in net/sched/cls_flow.c:329:23 shift exponent 9445 is too large for 32-bit type 'u32' (aka 'unsigned int') CPU: 1 UID: 0 PID: 54 Comm: kworker/u8:3 Not tainted 6.13.0-rc3-syzkaller-00180-g4f619d518db9 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Workqueue: ipv6_addrconf addrconf_dad_work Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 ubsan_epilogue lib/ubsan.c:231 [inline] __ubsan_handle_shift_out_of_bounds+0x3c8/0x420 lib/ubsan.c:468 flow_classify+0x24d5/0x25b0 net/sched/cls_flow.c:329 tc_classify include/net/tc_wrapper.h:197 [inline] __tcf_classify net/sched/cls_api.c:1771 [inline] tcf_classify+0x420/0x1160 net/sched/cls_api.c:1867 sfb_classify net/sched/sch_sfb.c:260 [inline] sfb_enqueue+0x3ad/0x18b0 net/sched/sch_sfb.c:318 dev_qdisc_enqueue+0x4b/0x290 net/core/dev.c:3793 __dev_xmit_skb net/core/dev.c:3889 [inline] __dev_queue_xmit+0xf0e/0x3f50 net/core/dev.c:4400 dev_queue_xmit include/linux/netdevice.h:3168 [inline] neigh_hh_output include/net/neighbour.h:523 [inline] neigh_output include/net/neighbour.h:537 [inline] ip_finish_output2+0xd41/0x1390 net/ipv4/ip_output.c:236 iptunnel_xmit+0x55d/0x9b0 net/ipv4/ip_tunnel_core.c:82 udp_tunnel_xmit_skb+0x262/0x3b0 net/ipv4/udp_tunnel_core.c:173 geneve_xmit_skb drivers/net/geneve.c:916 [inline] geneve_xmit+0x21dc/0x2d00 drivers/net/geneve.c:1039 __netdev_start_xmit include/linux/netdevice.h:5002 [inline] netdev_start_xmit include/linux/netdevice.h:5011 [inline] xmit_one net/core/dev.c:3590 [inline] dev_hard_start_xmit+0x27a/0x7d0 net/core/dev.c:3606 __dev_queue_xmit+0x1b73/0x3f50 net/core/dev.c:4434 Fixes: e5dfb815181f ("[NET_SCHED]: Add flow classifier") Reported-by: syzbot+1dbb57d994e54aaa04d2@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6777bf49.050a0220.178762.0040.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250103104546.3714168-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sched/cls_flow.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 5502998aace74..5c2580a07530e 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -356,7 +356,8 @@ static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = { [TCA_FLOW_KEYS] = { .type = NLA_U32 }, [TCA_FLOW_MODE] = { .type = NLA_U32 }, [TCA_FLOW_BASECLASS] = { .type = NLA_U32 }, - [TCA_FLOW_RSHIFT] = { .type = NLA_U32 }, + [TCA_FLOW_RSHIFT] = NLA_POLICY_MAX(NLA_U32, + 31 /* BITS_PER_U32 - 1 */), [TCA_FLOW_ADDEND] = { .type = NLA_U32 }, [TCA_FLOW_MASK] = { .type = NLA_U32 }, [TCA_FLOW_XOR] = { .type = NLA_U32 }, From e95274dfe86490ec2a5633035c24b2de6722841f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Jan 2025 10:24:58 -0800 Subject: [PATCH 1144/1450] selftests: tc-testing: reduce rshift value After previous change rshift >= 32 is no longer allowed. Modify the test to use 31, the test doesn't seem to send any traffic so the exact value shouldn't matter. Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250103182458.1213486-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/tc-testing/tc-tests/filters/flow.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/tc-testing/tc-tests/filters/flow.json b/tools/testing/selftests/tc-testing/tc-tests/filters/flow.json index 996448afe31b3..91d120548bf50 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/filters/flow.json +++ b/tools/testing/selftests/tc-testing/tc-tests/filters/flow.json @@ -78,10 +78,10 @@ "setup": [ "$TC qdisc add dev $DEV1 ingress" ], - "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 prio 1 protocol ip flow map key dst rshift 0xff", + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 prio 1 protocol ip flow map key dst rshift 0x1f", "expExitCode": "0", "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 protocol ip prio 1 flow", - "matchPattern": "filter parent ffff: protocol ip pref 1 flow chain [0-9]+ handle 0x1 map keys dst rshift 255 baseclass", + "matchPattern": "filter parent ffff: protocol ip pref 1 flow chain [0-9]+ handle 0x1 map keys dst rshift 31 baseclass", "matchCount": "1", "teardown": [ "$TC qdisc del dev $DEV1 ingress" From 8ce4f287524c74a118b0af1eebd4b24a8efca57a Mon Sep 17 00:00:00 2001 From: Jiawen Wu Date: Fri, 3 Jan 2025 16:10:13 +0800 Subject: [PATCH 1145/1450] net: libwx: fix firmware mailbox abnormal return The existing SW-FW interaction flow on the driver is wrong. Follow this wrong flow, driver would never return error if there is a unknown command. Since firmware writes back 'firmware ready' and 'unknown command' in the mailbox message if there is an unknown command sent by driver. So reading 'firmware ready' does not timeout. Then driver would mistakenly believe that the interaction has completed successfully. It tends to happen with the use of custom firmware. Move the check for 'unknown command' out of the poll timeout for 'firmware ready'. And adjust the debug log so that mailbox messages are always printed when commands timeout. Fixes: 1efa9bfe58c5 ("net: libwx: Implement interaction with firmware") Signed-off-by: Jiawen Wu Link: https://patch.msgid.link/20250103081013.1995939-1-jiawenwu@trustnetic.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/wangxun/libwx/wx_hw.c | 24 ++++++++++------------ 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/wangxun/libwx/wx_hw.c b/drivers/net/ethernet/wangxun/libwx/wx_hw.c index 1bf9c38e41256..deaf670c160eb 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_hw.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_hw.c @@ -334,27 +334,25 @@ int wx_host_interface_command(struct wx *wx, u32 *buffer, status = read_poll_timeout(rd32, hicr, hicr & WX_MNG_MBOX_CTL_FWRDY, 1000, timeout * 1000, false, wx, WX_MNG_MBOX_CTL); + buf[0] = rd32(wx, WX_MNG_MBOX); + if ((buf[0] & 0xff0000) >> 16 == 0x80) { + wx_err(wx, "Unknown FW command: 0x%x\n", buffer[0] & 0xff); + status = -EINVAL; + goto rel_out; + } + /* Check command completion */ if (status) { - wx_dbg(wx, "Command has failed with no status valid.\n"); - - buf[0] = rd32(wx, WX_MNG_MBOX); - if ((buffer[0] & 0xff) != (~buf[0] >> 24)) { - status = -EINVAL; - goto rel_out; - } - if ((buf[0] & 0xff0000) >> 16 == 0x80) { - wx_dbg(wx, "It's unknown cmd.\n"); - status = -EINVAL; - goto rel_out; - } - + wx_err(wx, "Command has failed with no status valid.\n"); wx_dbg(wx, "write value:\n"); for (i = 0; i < dword_len; i++) wx_dbg(wx, "%x ", buffer[i]); wx_dbg(wx, "read value:\n"); for (i = 0; i < dword_len; i++) wx_dbg(wx, "%x ", buf[i]); + wx_dbg(wx, "\ncheck: %x %x\n", buffer[0] & 0xff, ~buf[0] >> 24); + + goto rel_out; } if (!return_data) From 5e7f0efd23238039bcd4fc72ff28d94f364ec26b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thi=C3=A9baud=20Weksteen?= Date: Fri, 20 Dec 2024 15:25:58 +1100 Subject: [PATCH 1146/1450] selinux: match extended permissions to their base permissions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In commit d1d991efaf34 ("selinux: Add netlink xperm support") a new extended permission was added ("nlmsg"). This was the second extended permission implemented in selinux ("ioctl" being the first one). Extended permissions are associated with a base permission. It was found that, in the access vector cache (avc), the extended permission did not keep track of its base permission. This is an issue for a domain that is using both extended permissions (i.e., a domain calling ioctl() on a netlink socket). In this case, the extended permissions were overlapping. Keep track of the base permission in the cache. A new field "base_perm" is added to struct extended_perms_decision to make sure that the extended permission refers to the correct policy permission. A new field "base_perms" is added to struct extended_perms to quickly decide if extended permissions apply. While it is in theory possible to retrieve the base permission from the access vector, the same base permission may not be mapped to the same bit for each class (e.g., "nlmsg" is mapped to a different bit for "netlink_route_socket" and "netlink_audit_socket"). Instead, use a constant (AVC_EXT_IOCTL or AVC_EXT_NLMSG) provided by the caller. Fixes: d1d991efaf34 ("selinux: Add netlink xperm support") Signed-off-by: Thiébaud Weksteen Signed-off-by: Paul Moore --- security/selinux/avc.c | 61 ++++++++++++++++------------- security/selinux/hooks.c | 6 +-- security/selinux/include/avc.h | 5 ++- security/selinux/include/security.h | 3 ++ security/selinux/ss/services.c | 28 +++++++++---- 5 files changed, 65 insertions(+), 38 deletions(-) diff --git a/security/selinux/avc.c b/security/selinux/avc.c index cc0b0af202961..1f2680bcc43a4 100644 --- a/security/selinux/avc.c +++ b/security/selinux/avc.c @@ -174,13 +174,15 @@ int avc_get_hash_stats(char *page) * using a linked list for extended_perms_decision lookup because the list is * always small. i.e. less than 5, typically 1 */ -static struct extended_perms_decision *avc_xperms_decision_lookup(u8 driver, - struct avc_xperms_node *xp_node) +static struct extended_perms_decision * +avc_xperms_decision_lookup(u8 driver, u8 base_perm, + struct avc_xperms_node *xp_node) { struct avc_xperms_decision_node *xpd_node; list_for_each_entry(xpd_node, &xp_node->xpd_head, xpd_list) { - if (xpd_node->xpd.driver == driver) + if (xpd_node->xpd.driver == driver && + xpd_node->xpd.base_perm == base_perm) return &xpd_node->xpd; } return NULL; @@ -205,11 +207,12 @@ avc_xperms_has_perm(struct extended_perms_decision *xpd, } static void avc_xperms_allow_perm(struct avc_xperms_node *xp_node, - u8 driver, u8 perm) + u8 driver, u8 base_perm, u8 perm) { struct extended_perms_decision *xpd; security_xperm_set(xp_node->xp.drivers.p, driver); - xpd = avc_xperms_decision_lookup(driver, xp_node); + xp_node->xp.base_perms |= base_perm; + xpd = avc_xperms_decision_lookup(driver, base_perm, xp_node); if (xpd && xpd->allowed) security_xperm_set(xpd->allowed->p, perm); } @@ -245,6 +248,7 @@ static void avc_xperms_free(struct avc_xperms_node *xp_node) static void avc_copy_xperms_decision(struct extended_perms_decision *dest, struct extended_perms_decision *src) { + dest->base_perm = src->base_perm; dest->driver = src->driver; dest->used = src->used; if (dest->used & XPERMS_ALLOWED) @@ -272,6 +276,7 @@ static inline void avc_quick_copy_xperms_decision(u8 perm, */ u8 i = perm >> 5; + dest->base_perm = src->base_perm; dest->used = src->used; if (dest->used & XPERMS_ALLOWED) dest->allowed->p[i] = src->allowed->p[i]; @@ -357,6 +362,7 @@ static int avc_xperms_populate(struct avc_node *node, memcpy(dest->xp.drivers.p, src->xp.drivers.p, sizeof(dest->xp.drivers.p)); dest->xp.len = src->xp.len; + dest->xp.base_perms = src->xp.base_perms; /* for each source xpd allocate a destination xpd and copy */ list_for_each_entry(src_xpd, &src->xpd_head, xpd_list) { @@ -807,6 +813,7 @@ int __init avc_add_callback(int (*callback)(u32 event), u32 events) * @event : Updating event * @perms : Permission mask bits * @driver: xperm driver information + * @base_perm: the base permission associated with the extended permission * @xperm: xperm permissions * @ssid: AVC entry source sid * @tsid: AVC entry target sid @@ -820,10 +827,9 @@ int __init avc_add_callback(int (*callback)(u32 event), u32 events) * otherwise, this function updates the AVC entry. The original AVC-entry object * will release later by RCU. */ -static int avc_update_node(u32 event, u32 perms, u8 driver, u8 xperm, u32 ssid, - u32 tsid, u16 tclass, u32 seqno, - struct extended_perms_decision *xpd, - u32 flags) +static int avc_update_node(u32 event, u32 perms, u8 driver, u8 base_perm, + u8 xperm, u32 ssid, u32 tsid, u16 tclass, u32 seqno, + struct extended_perms_decision *xpd, u32 flags) { u32 hvalue; int rc = 0; @@ -880,7 +886,7 @@ static int avc_update_node(u32 event, u32 perms, u8 driver, u8 xperm, u32 ssid, case AVC_CALLBACK_GRANT: node->ae.avd.allowed |= perms; if (node->ae.xp_node && (flags & AVC_EXTENDED_PERMS)) - avc_xperms_allow_perm(node->ae.xp_node, driver, xperm); + avc_xperms_allow_perm(node->ae.xp_node, driver, base_perm, xperm); break; case AVC_CALLBACK_TRY_REVOKE: case AVC_CALLBACK_REVOKE: @@ -987,10 +993,9 @@ static noinline void avc_compute_av(u32 ssid, u32 tsid, u16 tclass, avc_insert(ssid, tsid, tclass, avd, xp_node); } -static noinline int avc_denied(u32 ssid, u32 tsid, - u16 tclass, u32 requested, - u8 driver, u8 xperm, unsigned int flags, - struct av_decision *avd) +static noinline int avc_denied(u32 ssid, u32 tsid, u16 tclass, u32 requested, + u8 driver, u8 base_perm, u8 xperm, + unsigned int flags, struct av_decision *avd) { if (flags & AVC_STRICT) return -EACCES; @@ -999,7 +1004,7 @@ static noinline int avc_denied(u32 ssid, u32 tsid, !(avd->flags & AVD_FLAGS_PERMISSIVE)) return -EACCES; - avc_update_node(AVC_CALLBACK_GRANT, requested, driver, + avc_update_node(AVC_CALLBACK_GRANT, requested, driver, base_perm, xperm, ssid, tsid, tclass, avd->seqno, NULL, flags); return 0; } @@ -1012,7 +1017,8 @@ static noinline int avc_denied(u32 ssid, u32 tsid, * driver field is used to specify which set contains the permission. */ int avc_has_extended_perms(u32 ssid, u32 tsid, u16 tclass, u32 requested, - u8 driver, u8 xperm, struct common_audit_data *ad) + u8 driver, u8 base_perm, u8 xperm, + struct common_audit_data *ad) { struct avc_node *node; struct av_decision avd; @@ -1047,22 +1053,23 @@ int avc_has_extended_perms(u32 ssid, u32 tsid, u16 tclass, u32 requested, local_xpd.auditallow = &auditallow; local_xpd.dontaudit = &dontaudit; - xpd = avc_xperms_decision_lookup(driver, xp_node); + xpd = avc_xperms_decision_lookup(driver, base_perm, xp_node); if (unlikely(!xpd)) { /* * Compute the extended_perms_decision only if the driver - * is flagged + * is flagged and the base permission is known. */ - if (!security_xperm_test(xp_node->xp.drivers.p, driver)) { + if (!security_xperm_test(xp_node->xp.drivers.p, driver) || + !(xp_node->xp.base_perms & base_perm)) { avd.allowed &= ~requested; goto decision; } rcu_read_unlock(); - security_compute_xperms_decision(ssid, tsid, tclass, - driver, &local_xpd); + security_compute_xperms_decision(ssid, tsid, tclass, driver, + base_perm, &local_xpd); rcu_read_lock(); - avc_update_node(AVC_CALLBACK_ADD_XPERMS, requested, - driver, xperm, ssid, tsid, tclass, avd.seqno, + avc_update_node(AVC_CALLBACK_ADD_XPERMS, requested, driver, + base_perm, xperm, ssid, tsid, tclass, avd.seqno, &local_xpd, 0); } else { avc_quick_copy_xperms_decision(xperm, &local_xpd, xpd); @@ -1075,8 +1082,8 @@ int avc_has_extended_perms(u32 ssid, u32 tsid, u16 tclass, u32 requested, decision: denied = requested & ~(avd.allowed); if (unlikely(denied)) - rc = avc_denied(ssid, tsid, tclass, requested, - driver, xperm, AVC_EXTENDED_PERMS, &avd); + rc = avc_denied(ssid, tsid, tclass, requested, driver, + base_perm, xperm, AVC_EXTENDED_PERMS, &avd); rcu_read_unlock(); @@ -1110,7 +1117,7 @@ static noinline int avc_perm_nonode(u32 ssid, u32 tsid, u16 tclass, avc_compute_av(ssid, tsid, tclass, avd, &xp_node); denied = requested & ~(avd->allowed); if (unlikely(denied)) - return avc_denied(ssid, tsid, tclass, requested, 0, 0, + return avc_denied(ssid, tsid, tclass, requested, 0, 0, 0, flags, avd); return 0; } @@ -1158,7 +1165,7 @@ inline int avc_has_perm_noaudit(u32 ssid, u32 tsid, rcu_read_unlock(); if (unlikely(denied)) - return avc_denied(ssid, tsid, tclass, requested, 0, 0, + return avc_denied(ssid, tsid, tclass, requested, 0, 0, 0, flags, avd); return 0; } diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index f5a08f94e0940..011d9121b3ab5 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3688,8 +3688,8 @@ static int ioctl_has_perm(const struct cred *cred, struct file *file, return 0; isec = inode_security(inode); - rc = avc_has_extended_perms(ssid, isec->sid, isec->sclass, - requested, driver, xperm, &ad); + rc = avc_has_extended_perms(ssid, isec->sid, isec->sclass, requested, + driver, AVC_EXT_IOCTL, xperm, &ad); out: return rc; } @@ -5952,7 +5952,7 @@ static int nlmsg_sock_has_extended_perms(struct sock *sk, u32 perms, u16 nlmsg_t xperm = nlmsg_type & 0xff; return avc_has_extended_perms(current_sid(), sksec->sid, sksec->sclass, - perms, driver, xperm, &ad); + perms, driver, AVC_EXT_NLMSG, xperm, &ad); } static int selinux_netlink_send(struct sock *sk, struct sk_buff *skb) diff --git a/security/selinux/include/avc.h b/security/selinux/include/avc.h index 96a614d47df8d..281f401036633 100644 --- a/security/selinux/include/avc.h +++ b/security/selinux/include/avc.h @@ -136,8 +136,11 @@ int avc_has_perm_noaudit(u32 ssid, u32 tsid, u16 tclass, u32 requested, int avc_has_perm(u32 ssid, u32 tsid, u16 tclass, u32 requested, struct common_audit_data *auditdata); +#define AVC_EXT_IOCTL (1 << 0) /* Cache entry for an ioctl extended permission */ +#define AVC_EXT_NLMSG (1 << 1) /* Cache entry for an nlmsg extended permission */ int avc_has_extended_perms(u32 ssid, u32 tsid, u16 tclass, u32 requested, - u8 driver, u8 perm, struct common_audit_data *ad); + u8 driver, u8 base_perm, u8 perm, + struct common_audit_data *ad); u32 avc_policy_seqno(void); diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h index c7f2731abd03e..700bd6c8bb386 100644 --- a/security/selinux/include/security.h +++ b/security/selinux/include/security.h @@ -239,6 +239,7 @@ struct extended_perms_data { struct extended_perms_decision { u8 used; u8 driver; + u8 base_perm; struct extended_perms_data *allowed; struct extended_perms_data *auditallow; struct extended_perms_data *dontaudit; @@ -246,6 +247,7 @@ struct extended_perms_decision { struct extended_perms { u16 len; /* length associated decision chain */ + u8 base_perms; /* which base permissions are covered */ struct extended_perms_data drivers; /* flag drivers that are used */ }; @@ -257,6 +259,7 @@ void security_compute_av(u32 ssid, u32 tsid, u16 tclass, struct extended_perms *xperms); void security_compute_xperms_decision(u32 ssid, u32 tsid, u16 tclass, u8 driver, + u8 base_perm, struct extended_perms_decision *xpermd); void security_compute_av_user(u32 ssid, u32 tsid, u16 tclass, diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index 3d5c563cfc4c8..d9f58b5d0f49f 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -582,7 +582,7 @@ static void type_attribute_bounds_av(struct policydb *policydb, } /* - * Flag which drivers have permissions. + * Flag which drivers have permissions and which base permissions are covered. */ void services_compute_xperms_drivers( struct extended_perms *xperms, @@ -592,12 +592,19 @@ void services_compute_xperms_drivers( switch (node->datum.u.xperms->specified) { case AVTAB_XPERMS_IOCTLDRIVER: + xperms->base_perms |= AVC_EXT_IOCTL; /* if one or more driver has all permissions allowed */ for (i = 0; i < ARRAY_SIZE(xperms->drivers.p); i++) xperms->drivers.p[i] |= node->datum.u.xperms->perms.p[i]; break; case AVTAB_XPERMS_IOCTLFUNCTION: + xperms->base_perms |= AVC_EXT_IOCTL; + /* if allowing permissions within a driver */ + security_xperm_set(xperms->drivers.p, + node->datum.u.xperms->driver); + break; case AVTAB_XPERMS_NLMSG: + xperms->base_perms |= AVC_EXT_NLMSG; /* if allowing permissions within a driver */ security_xperm_set(xperms->drivers.p, node->datum.u.xperms->driver); @@ -631,8 +638,7 @@ static void context_struct_compute_av(struct policydb *policydb, avd->auditallow = 0; avd->auditdeny = 0xffffffff; if (xperms) { - memset(&xperms->drivers, 0, sizeof(xperms->drivers)); - xperms->len = 0; + memset(xperms, 0, sizeof(*xperms)); } if (unlikely(!tclass || tclass > policydb->p_classes.nprim)) { @@ -969,13 +975,19 @@ void services_compute_xperms_decision(struct extended_perms_decision *xpermd, { switch (node->datum.u.xperms->specified) { case AVTAB_XPERMS_IOCTLFUNCTION: - case AVTAB_XPERMS_NLMSG: - if (xpermd->driver != node->datum.u.xperms->driver) + if (xpermd->base_perm != AVC_EXT_IOCTL || + xpermd->driver != node->datum.u.xperms->driver) return; break; case AVTAB_XPERMS_IOCTLDRIVER: - if (!security_xperm_test(node->datum.u.xperms->perms.p, - xpermd->driver)) + if (xpermd->base_perm != AVC_EXT_IOCTL || + !security_xperm_test(node->datum.u.xperms->perms.p, + xpermd->driver)) + return; + break; + case AVTAB_XPERMS_NLMSG: + if (xpermd->base_perm != AVC_EXT_NLMSG || + xpermd->driver != node->datum.u.xperms->driver) return; break; default: @@ -1010,6 +1022,7 @@ void security_compute_xperms_decision(u32 ssid, u32 tsid, u16 orig_tclass, u8 driver, + u8 base_perm, struct extended_perms_decision *xpermd) { struct selinux_policy *policy; @@ -1023,6 +1036,7 @@ void security_compute_xperms_decision(u32 ssid, struct ebitmap_node *snode, *tnode; unsigned int i, j; + xpermd->base_perm = base_perm; xpermd->driver = driver; xpermd->used = 0; memset(xpermd->allowed->p, 0, sizeof(xpermd->allowed->p)); From 385443057f475e775fe1c66e77d4be9727f40973 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Fri, 3 Jan 2025 19:20:23 +0100 Subject: [PATCH 1147/1450] kbuild: pacman-pkg: provide versioned linux-api-headers package MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Arch Linux glibc package contains a versioned dependency on "linux-api-headers". If the linux-api-headers package provided by pacman-pkg does not specify an explicit version this dependency is not satisfied. Fix the dependency by providing an explicit version. Fixes: c8578539deba ("kbuild: add script and target to generate pacman package") Signed-off-by: Thomas Weißschuh Reviewed-by: Nathan Chancellor Signed-off-by: Masahiro Yamada --- scripts/package/PKGBUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/package/PKGBUILD b/scripts/package/PKGBUILD index f83493838cf96..dca706617adc7 100644 --- a/scripts/package/PKGBUILD +++ b/scripts/package/PKGBUILD @@ -103,7 +103,7 @@ _package-headers() { _package-api-headers() { pkgdesc="Kernel headers sanitized for use in userspace" - provides=(linux-api-headers) + provides=(linux-api-headers="${pkgver}") conflicts=(linux-api-headers) _prologue From 9d89551994a430b50c4fffcb1e617a057fa76e20 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 5 Jan 2025 14:13:40 -0800 Subject: [PATCH 1148/1450] Linux 6.13-rc6 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 48e89108aa58c..7904d5d88088d 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 13 SUBLEVEL = 0 -EXTRAVERSION = -rc5 +EXTRAVERSION = -rc6 NAME = Baby Opossum Posse # *DOCUMENTATION* From b9ed315d3c4c0c294a4348edb6874d489bac47fa Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 21 Dec 2024 00:46:56 +0100 Subject: [PATCH 1149/1450] netkit: Allow for configuring needed_{head,tail}room Allow the user to configure needed_{head,tail}room for both netkit devices. The idea is similar to 163e529200af ("veth: implement ndo_set_rx_headroom") with the difference that the two parameters can be specified upon device creation. By default the current behavior stays as is which is needed_{head,tail}room is 0. In case of Cilium, for example, the netkit devices are not enslaved into a bridge or openvswitch device (rather, BPF-based redirection is used out of tcx), and as such these parameters are not propagated into the Pod's netns via peer device. Given Cilium can run in vxlan/geneve tunneling mode (needed_headroom) and/or be used in combination with WireGuard (needed_{head,tail}room), allow the Cilium CNI plugin to specify these two upon netkit device creation. Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Kicinski Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/bpf/20241220234658.490686-1-daniel@iogearbox.net --- drivers/net/netkit.c | 66 +++++++++++++++++++----------- include/uapi/linux/if_link.h | 2 + tools/include/uapi/linux/if_link.h | 2 + 3 files changed, 47 insertions(+), 23 deletions(-) diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c index c1d881dc6409a..1e1b00756be7d 100644 --- a/drivers/net/netkit.c +++ b/drivers/net/netkit.c @@ -338,6 +338,7 @@ static int netkit_new_link(struct net *peer_net, struct net_device *dev, enum netkit_scrub scrub_peer = NETKIT_SCRUB_DEFAULT; enum netkit_mode mode = NETKIT_L3; unsigned char ifname_assign_type; + u16 headroom = 0, tailroom = 0; struct ifinfomsg *ifmp = NULL; struct net_device *peer; char ifname[IFNAMSIZ]; @@ -371,6 +372,10 @@ static int netkit_new_link(struct net *peer_net, struct net_device *dev, if (err < 0) return err; } + if (data[IFLA_NETKIT_HEADROOM]) + headroom = nla_get_u16(data[IFLA_NETKIT_HEADROOM]); + if (data[IFLA_NETKIT_TAILROOM]) + tailroom = nla_get_u16(data[IFLA_NETKIT_TAILROOM]); } if (ifmp && tbp[IFLA_IFNAME]) { @@ -390,6 +395,14 @@ static int netkit_new_link(struct net *peer_net, struct net_device *dev, return PTR_ERR(peer); netif_inherit_tso_max(peer, dev); + if (headroom) { + peer->needed_headroom = headroom; + dev->needed_headroom = headroom; + } + if (tailroom) { + peer->needed_tailroom = tailroom; + dev->needed_tailroom = tailroom; + } if (mode == NETKIT_L2 && !(ifmp && tbp[IFLA_ADDRESS])) eth_hw_addr_random(peer); @@ -401,6 +414,7 @@ static int netkit_new_link(struct net *peer_net, struct net_device *dev, nk->policy = policy_peer; nk->scrub = scrub_peer; nk->mode = mode; + nk->headroom = headroom; bpf_mprog_bundle_init(&nk->bundle); err = register_netdevice(peer); @@ -426,6 +440,7 @@ static int netkit_new_link(struct net *peer_net, struct net_device *dev, nk->policy = policy_prim; nk->scrub = scrub_prim; nk->mode = mode; + nk->headroom = headroom; bpf_mprog_bundle_init(&nk->bundle); err = register_netdevice(dev); @@ -850,7 +865,18 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[], struct net_device *peer = rtnl_dereference(nk->peer); enum netkit_action policy; struct nlattr *attr; - int err; + int err, i; + static const struct { + u32 attr; + char *name; + } fixed_params[] = { + { IFLA_NETKIT_MODE, "operating mode" }, + { IFLA_NETKIT_SCRUB, "scrubbing" }, + { IFLA_NETKIT_PEER_SCRUB, "peer scrubbing" }, + { IFLA_NETKIT_PEER_INFO, "peer info" }, + { IFLA_NETKIT_HEADROOM, "headroom" }, + { IFLA_NETKIT_TAILROOM, "tailroom" }, + }; if (!nk->primary) { NL_SET_ERR_MSG(extack, @@ -858,28 +884,14 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[], return -EACCES; } - if (data[IFLA_NETKIT_MODE]) { - NL_SET_ERR_MSG_ATTR(extack, data[IFLA_NETKIT_MODE], - "netkit link operating mode cannot be changed after device creation"); - return -EACCES; - } - - if (data[IFLA_NETKIT_SCRUB]) { - NL_SET_ERR_MSG_ATTR(extack, data[IFLA_NETKIT_SCRUB], - "netkit scrubbing cannot be changed after device creation"); - return -EACCES; - } - - if (data[IFLA_NETKIT_PEER_SCRUB]) { - NL_SET_ERR_MSG_ATTR(extack, data[IFLA_NETKIT_PEER_SCRUB], - "netkit scrubbing cannot be changed after device creation"); - return -EACCES; - } - - if (data[IFLA_NETKIT_PEER_INFO]) { - NL_SET_ERR_MSG_ATTR(extack, data[IFLA_NETKIT_PEER_INFO], - "netkit peer info cannot be changed after device creation"); - return -EINVAL; + for (i = 0; i < ARRAY_SIZE(fixed_params); i++) { + attr = data[fixed_params[i].attr]; + if (attr) { + NL_SET_ERR_MSG_ATTR_FMT(extack, attr, + "netkit link %s cannot be changed after device creation", + fixed_params[i].name); + return -EACCES; + } } if (data[IFLA_NETKIT_POLICY]) { @@ -914,6 +926,8 @@ static size_t netkit_get_size(const struct net_device *dev) nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PEER_SCRUB */ nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_MODE */ nla_total_size(sizeof(u8)) + /* IFLA_NETKIT_PRIMARY */ + nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_HEADROOM */ + nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_TAILROOM */ 0; } @@ -930,6 +944,10 @@ static int netkit_fill_info(struct sk_buff *skb, const struct net_device *dev) return -EMSGSIZE; if (nla_put_u32(skb, IFLA_NETKIT_SCRUB, nk->scrub)) return -EMSGSIZE; + if (nla_put_u16(skb, IFLA_NETKIT_HEADROOM, dev->needed_headroom)) + return -EMSGSIZE; + if (nla_put_u16(skb, IFLA_NETKIT_TAILROOM, dev->needed_tailroom)) + return -EMSGSIZE; if (peer) { nk = netkit_priv(peer); @@ -947,6 +965,8 @@ static const struct nla_policy netkit_policy[IFLA_NETKIT_MAX + 1] = { [IFLA_NETKIT_MODE] = NLA_POLICY_MAX(NLA_U32, NETKIT_L3), [IFLA_NETKIT_POLICY] = { .type = NLA_U32 }, [IFLA_NETKIT_PEER_POLICY] = { .type = NLA_U32 }, + [IFLA_NETKIT_HEADROOM] = { .type = NLA_U16 }, + [IFLA_NETKIT_TAILROOM] = { .type = NLA_U16 }, [IFLA_NETKIT_SCRUB] = NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT), [IFLA_NETKIT_PEER_SCRUB] = NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT), [IFLA_NETKIT_PRIMARY] = { .type = NLA_REJECT, diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 2575e0cd9b482..2fa2c265dcbaf 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1315,6 +1315,8 @@ enum { IFLA_NETKIT_MODE, IFLA_NETKIT_SCRUB, IFLA_NETKIT_PEER_SCRUB, + IFLA_NETKIT_HEADROOM, + IFLA_NETKIT_TAILROOM, __IFLA_NETKIT_MAX, }; #define IFLA_NETKIT_MAX (__IFLA_NETKIT_MAX - 1) diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index 8516c1ccd57a7..7e46ca4cd31bb 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -1315,6 +1315,8 @@ enum { IFLA_NETKIT_MODE, IFLA_NETKIT_SCRUB, IFLA_NETKIT_PEER_SCRUB, + IFLA_NETKIT_HEADROOM, + IFLA_NETKIT_TAILROOM, __IFLA_NETKIT_MAX, }; #define IFLA_NETKIT_MAX (__IFLA_NETKIT_MAX - 1) From cc529a33d559cc75eb7250a4f4e2b9e431761312 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 21 Dec 2024 00:46:57 +0100 Subject: [PATCH 1150/1450] netkit: Add add netkit {head,tail}room to rt_link.yaml Add netkit {head,tail}room attribute support to the rt_link.yaml spec file. Example: # ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/rt_link.yaml \ --do getlink --json '{"ifname": "nk0"}' --output-json | jq [...] "linkinfo": { "kind": "netkit", "data": { "primary": 0, "policy": "forward", "mode": "l3", "scrub": "default", "headroom": 0, "tailroom": 0, "peer-policy": "forward", "peer-scrub": "default" } }, [...] Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Kicinski Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/bpf/20241220234658.490686-2-daniel@iogearbox.net --- Documentation/netlink/specs/rt_link.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/netlink/specs/rt_link.yaml b/Documentation/netlink/specs/rt_link.yaml index 9ffa13b77dcf1..dbeae6b1c548f 100644 --- a/Documentation/netlink/specs/rt_link.yaml +++ b/Documentation/netlink/specs/rt_link.yaml @@ -2166,6 +2166,12 @@ attribute-sets: name: peer-scrub type: u32 enum: netkit-scrub + - + name: headroom + type: u16 + - + name: tailroom + type: u16 sub-messages: - From 058268e23fcadc2bdb9297c6dff3a010c70f9762 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 21 Dec 2024 00:46:58 +0100 Subject: [PATCH 1151/1450] selftests/bpf: Extend netkit tests to validate set {head,tail}room Extend the netkit selftests to specify and validate the {head,tail}room on the netdevice: # ./vmtest.sh -- ./test_progs -t netkit [...] ./test_progs -t netkit [ 1.174147] bpf_testmod: loading out-of-tree module taints kernel. [ 1.174585] bpf_testmod: module verification failed: signature and/or required key missing - tainting kernel [ 1.422307] tsc: Refined TSC clocksource calibration: 3407.983 MHz [ 1.424511] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x311fc3e5084, max_idle_ns: 440795359833 ns [ 1.428092] clocksource: Switched to clocksource tsc #363 tc_netkit_basic:OK #364 tc_netkit_device:OK #365 tc_netkit_multi_links:OK #366 tc_netkit_multi_opts:OK #367 tc_netkit_neigh_links:OK #368 tc_netkit_pkt_type:OK #369 tc_netkit_scrub:OK Summary: 7/0 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Daniel Borkmann Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/bpf/20241220234658.490686-3-daniel@iogearbox.net --- .../selftests/bpf/prog_tests/tc_netkit.c | 49 ++++++++++++------- .../selftests/bpf/progs/test_tc_link.c | 15 ++++++ 2 files changed, 46 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/tc_netkit.c b/tools/testing/selftests/bpf/prog_tests/tc_netkit.c index 151a4210028f8..2461d183dee58 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_netkit.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_netkit.c @@ -14,10 +14,16 @@ #include "netlink_helpers.h" #include "tc_helpers.h" +#define NETKIT_HEADROOM 32 +#define NETKIT_TAILROOM 8 + #define MARK 42 #define PRIO 0xeb9f #define ICMP_ECHO 8 +#define FLAG_ADJUST_ROOM (1 << 0) +#define FLAG_SAME_NETNS (1 << 1) + struct icmphdr { __u8 type; __u8 code; @@ -35,7 +41,7 @@ struct iplink_req { }; static int create_netkit(int mode, int policy, int peer_policy, int *ifindex, - bool same_netns, int scrub, int peer_scrub) + int scrub, int peer_scrub, __u32 flags) { struct rtnl_handle rth = { .fd = -1 }; struct iplink_req req = {}; @@ -63,6 +69,10 @@ static int create_netkit(int mode, int policy, int peer_policy, int *ifindex, addattr32(&req.n, sizeof(req), IFLA_NETKIT_SCRUB, scrub); addattr32(&req.n, sizeof(req), IFLA_NETKIT_PEER_SCRUB, peer_scrub); addattr32(&req.n, sizeof(req), IFLA_NETKIT_MODE, mode); + if (flags & FLAG_ADJUST_ROOM) { + addattr16(&req.n, sizeof(req), IFLA_NETKIT_HEADROOM, NETKIT_HEADROOM); + addattr16(&req.n, sizeof(req), IFLA_NETKIT_TAILROOM, NETKIT_TAILROOM); + } addattr_nest_end(&req.n, data); addattr_nest_end(&req.n, linkinfo); @@ -87,7 +97,7 @@ static int create_netkit(int mode, int policy, int peer_policy, int *ifindex, " addr ee:ff:bb:cc:aa:dd"), "set hwaddress"); } - if (same_netns) { + if (flags & FLAG_SAME_NETNS) { ASSERT_OK(system("ip link set dev " netkit_peer " up"), "up peer"); ASSERT_OK(system("ip addr add dev " netkit_peer " 10.0.0.2/24"), @@ -184,8 +194,8 @@ void serial_test_tc_netkit_basic(void) int err, ifindex; err = create_netkit(NETKIT_L2, NETKIT_PASS, NETKIT_PASS, - &ifindex, false, NETKIT_SCRUB_DEFAULT, - NETKIT_SCRUB_DEFAULT); + &ifindex, NETKIT_SCRUB_DEFAULT, + NETKIT_SCRUB_DEFAULT, 0); if (err) return; @@ -299,8 +309,8 @@ static void serial_test_tc_netkit_multi_links_target(int mode, int target) int err, ifindex; err = create_netkit(mode, NETKIT_PASS, NETKIT_PASS, - &ifindex, false, NETKIT_SCRUB_DEFAULT, - NETKIT_SCRUB_DEFAULT); + &ifindex, NETKIT_SCRUB_DEFAULT, + NETKIT_SCRUB_DEFAULT, 0); if (err) return; @@ -428,8 +438,8 @@ static void serial_test_tc_netkit_multi_opts_target(int mode, int target) int err, ifindex; err = create_netkit(mode, NETKIT_PASS, NETKIT_PASS, - &ifindex, false, NETKIT_SCRUB_DEFAULT, - NETKIT_SCRUB_DEFAULT); + &ifindex, NETKIT_SCRUB_DEFAULT, + NETKIT_SCRUB_DEFAULT, 0); if (err) return; @@ -543,8 +553,8 @@ void serial_test_tc_netkit_device(void) int err, ifindex, ifindex2; err = create_netkit(NETKIT_L3, NETKIT_PASS, NETKIT_PASS, - &ifindex, true, NETKIT_SCRUB_DEFAULT, - NETKIT_SCRUB_DEFAULT); + &ifindex, NETKIT_SCRUB_DEFAULT, + NETKIT_SCRUB_DEFAULT, FLAG_SAME_NETNS); if (err) return; @@ -655,8 +665,8 @@ static void serial_test_tc_netkit_neigh_links_target(int mode, int target) int err, ifindex; err = create_netkit(mode, NETKIT_PASS, NETKIT_PASS, - &ifindex, false, NETKIT_SCRUB_DEFAULT, - NETKIT_SCRUB_DEFAULT); + &ifindex, NETKIT_SCRUB_DEFAULT, + NETKIT_SCRUB_DEFAULT, 0); if (err) return; @@ -733,8 +743,8 @@ static void serial_test_tc_netkit_pkt_type_mode(int mode) struct bpf_link *link; err = create_netkit(mode, NETKIT_PASS, NETKIT_PASS, - &ifindex, true, NETKIT_SCRUB_DEFAULT, - NETKIT_SCRUB_DEFAULT); + &ifindex, NETKIT_SCRUB_DEFAULT, + NETKIT_SCRUB_DEFAULT, FLAG_SAME_NETNS); if (err) return; @@ -799,7 +809,7 @@ void serial_test_tc_netkit_pkt_type(void) serial_test_tc_netkit_pkt_type_mode(NETKIT_L3); } -static void serial_test_tc_netkit_scrub_type(int scrub) +static void serial_test_tc_netkit_scrub_type(int scrub, bool room) { LIBBPF_OPTS(bpf_netkit_opts, optl); struct test_tc_link *skel; @@ -807,7 +817,8 @@ static void serial_test_tc_netkit_scrub_type(int scrub) int err, ifindex; err = create_netkit(NETKIT_L2, NETKIT_PASS, NETKIT_PASS, - &ifindex, false, scrub, scrub); + &ifindex, scrub, scrub, + room ? FLAG_ADJUST_ROOM : 0); if (err) return; @@ -842,6 +853,8 @@ static void serial_test_tc_netkit_scrub_type(int scrub) ASSERT_EQ(skel->bss->seen_tc8, true, "seen_tc8"); ASSERT_EQ(skel->bss->mark, scrub == NETKIT_SCRUB_NONE ? MARK : 0, "mark"); ASSERT_EQ(skel->bss->prio, scrub == NETKIT_SCRUB_NONE ? PRIO : 0, "prio"); + ASSERT_EQ(skel->bss->headroom, room ? NETKIT_HEADROOM : 0, "headroom"); + ASSERT_EQ(skel->bss->tailroom, room ? NETKIT_TAILROOM : 0, "tailroom"); cleanup: test_tc_link__destroy(skel); @@ -852,6 +865,6 @@ static void serial_test_tc_netkit_scrub_type(int scrub) void serial_test_tc_netkit_scrub(void) { - serial_test_tc_netkit_scrub_type(NETKIT_SCRUB_DEFAULT); - serial_test_tc_netkit_scrub_type(NETKIT_SCRUB_NONE); + serial_test_tc_netkit_scrub_type(NETKIT_SCRUB_DEFAULT, false); + serial_test_tc_netkit_scrub_type(NETKIT_SCRUB_NONE, true); } diff --git a/tools/testing/selftests/bpf/progs/test_tc_link.c b/tools/testing/selftests/bpf/progs/test_tc_link.c index 10d8259284994..630f12e51b075 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_link.c +++ b/tools/testing/selftests/bpf/progs/test_tc_link.c @@ -8,6 +8,7 @@ #include #include #include +#include char LICENSE[] SEC("license") = "GPL"; @@ -27,6 +28,7 @@ bool seen_host; bool seen_mcast; int mark, prio; +unsigned short headroom, tailroom; SEC("tc/ingress") int tc1(struct __sk_buff *skb) @@ -104,11 +106,24 @@ int tc7(struct __sk_buff *skb) return TCX_PASS; } +struct sk_buff { + struct net_device *dev; +}; + +struct net_device { + unsigned short needed_headroom; + unsigned short needed_tailroom; +}; + SEC("tc/egress") int tc8(struct __sk_buff *skb) { + struct net_device *dev = BPF_CORE_READ((struct sk_buff *)skb, dev); + seen_tc8 = true; mark = skb->mark; prio = skb->priority; + headroom = BPF_CORE_READ(dev, needed_headroom); + tailroom = BPF_CORE_READ(dev, needed_tailroom); return TCX_PASS; } From dadf03cfd4eaa09f1d0e8b2521de1e11d3e3bec1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 3 Jan 2025 15:02:23 +0000 Subject: [PATCH 1152/1450] io_uring/cmd: rename struct uring_cache to io_uring_cmd_data In preparation for making this more generically available for ->uring_cmd() usage that needs stable command data, rename it and move it to io_uring/cmd.h instead. Signed-off-by: Jens Axboe Signed-off-by: David Sterba --- include/linux/io_uring/cmd.h | 4 ++++ io_uring/io_uring.c | 2 +- io_uring/opdef.c | 3 ++- io_uring/uring_cmd.c | 10 +++++----- io_uring/uring_cmd.h | 4 ---- 5 files changed, 12 insertions(+), 11 deletions(-) diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index c189d36ad55ea..24cff2b9b9d4d 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -18,6 +18,10 @@ struct io_uring_cmd { u8 pdu[32]; /* available inline for free use */ }; +struct io_uring_cmd_data { + struct io_uring_sqe sqes[2]; +}; + static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) { return sqe->cmd; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index b2736e3491b86..8ae6bf746fcc5 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -315,7 +315,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) ret |= io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_async_rw)); ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX, - sizeof(struct uring_cache)); + sizeof(struct io_uring_cmd_data)); spin_lock_init(&ctx->msg_lock); ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_kiocb)); diff --git a/io_uring/opdef.c b/io_uring/opdef.c index a2be3bbca5ffa..c7746f67cc65f 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "io_uring.h" #include "opdef.h" @@ -414,7 +415,7 @@ const struct io_issue_def io_issue_defs[] = { .plug = 1, .iopoll = 1, .iopoll_queue = 1, - .async_size = 2 * sizeof(struct io_uring_sqe), + .async_size = sizeof(struct io_uring_cmd_data), .prep = io_uring_cmd_prep, .issue = io_uring_cmd, }, diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index e2e8485932d64..eefc203a12144 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -16,10 +16,10 @@ #include "rsrc.h" #include "uring_cmd.h" -static struct uring_cache *io_uring_async_get(struct io_kiocb *req) +static struct io_uring_cmd_data *io_uring_async_get(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct uring_cache *cache; + struct io_uring_cmd_data *cache; cache = io_alloc_cache_get(&ctx->uring_cache); if (cache) { @@ -35,7 +35,7 @@ static struct uring_cache *io_uring_async_get(struct io_kiocb *req) static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - struct uring_cache *cache = req->async_data; + struct io_uring_cmd_data *cache = req->async_data; if (issue_flags & IO_URING_F_UNLOCKED) return; @@ -183,7 +183,7 @@ static int io_uring_cmd_prep_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - struct uring_cache *cache; + struct io_uring_cmd_data *cache; cache = io_uring_async_get(req); if (unlikely(!cache)) @@ -256,7 +256,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) ret = file->f_op->uring_cmd(ioucmd, issue_flags); if (ret == -EAGAIN) { - struct uring_cache *cache = req->async_data; + struct io_uring_cmd_data *cache = req->async_data; if (ioucmd->sqe != (void *) cache) memcpy(cache, ioucmd->sqe, uring_sqe_size(req->ctx)); diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h index a361f98664d2d..515823ca68b85 100644 --- a/io_uring/uring_cmd.h +++ b/io_uring/uring_cmd.h @@ -1,9 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -struct uring_cache { - struct io_uring_sqe sqes[2]; -}; - int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags); int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); From 3347fa658a1baecd61b007787d031b729cd86537 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 3 Jan 2025 15:02:24 +0000 Subject: [PATCH 1153/1450] io_uring/cmd: add per-op data to struct io_uring_cmd_data In case an op handler for ->uring_cmd() needs stable storage for user data, it can allocate io_uring_cmd_data->op_data and use it for the duration of the request. When the request gets cleaned up, uring_cmd will free it automatically. Signed-off-by: Jens Axboe Signed-off-by: David Sterba --- include/linux/io_uring/cmd.h | 1 + io_uring/uring_cmd.c | 13 +++++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 24cff2b9b9d4d..3df6636ec3a32 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -20,6 +20,7 @@ struct io_uring_cmd { struct io_uring_cmd_data { struct io_uring_sqe sqes[2]; + void *op_data; }; static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index eefc203a12144..019d6f49ff20f 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -23,12 +23,16 @@ static struct io_uring_cmd_data *io_uring_async_get(struct io_kiocb *req) cache = io_alloc_cache_get(&ctx->uring_cache); if (cache) { + cache->op_data = NULL; req->flags |= REQ_F_ASYNC_DATA; req->async_data = cache; return cache; } - if (!io_alloc_async_data(req)) - return req->async_data; + if (!io_alloc_async_data(req)) { + cache = req->async_data; + cache->op_data = NULL; + return cache; + } return NULL; } @@ -37,6 +41,11 @@ static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags) struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); struct io_uring_cmd_data *cache = req->async_data; + if (cache->op_data) { + kfree(cache->op_data); + cache->op_data = NULL; + } + if (issue_flags & IO_URING_F_UNLOCKED) return; if (io_alloc_cache_put(&req->ctx->uring_cache, cache)) { From b0af20d33f63c74985a6dd98344326e5111b2fea Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Fri, 3 Jan 2025 15:02:25 +0000 Subject: [PATCH 1154/1450] io_uring: add io_uring_cmd_get_async_data helper Add a helper function in include/linux/io_uring/cmd.h to read the async_data pointer from a struct io_uring_cmd. Signed-off-by: Mark Harmstone Signed-off-by: David Sterba --- include/linux/io_uring/cmd.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 3df6636ec3a32..b0aeec834c1db 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -118,4 +118,9 @@ static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd return cmd_to_io_kiocb(cmd)->task; } +static inline struct io_uring_cmd_data *io_uring_cmd_get_async_data(struct io_uring_cmd *cmd) +{ + return cmd_to_io_kiocb(cmd)->async_data; +} + #endif /* _LINUX_IO_URING_CMD_H */ From c21b89d495bab6ae7ce0a1592bb955e5e80127fd Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Fri, 3 Jan 2025 15:02:26 +0000 Subject: [PATCH 1155/1450] btrfs: don't read from userspace twice in btrfs_uring_encoded_read() If we return -EAGAIN the first time because we need to block, btrfs_uring_encoded_read() will get called twice. Take a copy of args, the iovs, and the iter the first time, as by the time we are called the second time these may have gone out of scope. Reported-by: Jens Axboe Fixes: 34310c442e17 ("btrfs: add io_uring command for encoded reads (ENCODED_READ ioctl)") Signed-off-by: Mark Harmstone Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 122 +++++++++++++++++++++++++---------------------- 1 file changed, 65 insertions(+), 57 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f8680e7cc974f..149259180faa0 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4879,25 +4879,29 @@ static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter, return ret; } +struct btrfs_uring_encoded_data { + struct btrfs_ioctl_encoded_io_args args; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov; + struct iov_iter iter; +}; + static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags) { size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags); size_t copy_end; - struct btrfs_ioctl_encoded_io_args args = { 0 }; int ret; u64 disk_bytenr, disk_io_size; struct file *file; struct btrfs_inode *inode; struct btrfs_fs_info *fs_info; struct extent_io_tree *io_tree; - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov = iovstack; - struct iov_iter iter; loff_t pos; struct kiocb kiocb; struct extent_state *cached_state = NULL; u64 start, lockend; void __user *sqe_addr; + struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data; if (!capable(CAP_SYS_ADMIN)) { ret = -EPERM; @@ -4911,43 +4915,64 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue if (issue_flags & IO_URING_F_COMPAT) { #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) - struct btrfs_ioctl_encoded_io_args_32 args32; - copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, flags); - if (copy_from_user(&args32, sqe_addr, copy_end)) { - ret = -EFAULT; - goto out_acct; - } - args.iov = compat_ptr(args32.iov); - args.iovcnt = args32.iovcnt; - args.offset = args32.offset; - args.flags = args32.flags; #else return -ENOTTY; #endif } else { copy_end = copy_end_kernel; - if (copy_from_user(&args, sqe_addr, copy_end)) { - ret = -EFAULT; + } + + if (!data) { + data = kzalloc(sizeof(*data), GFP_NOFS); + if (!data) { + ret = -ENOMEM; goto out_acct; } - } - if (args.flags != 0) - return -EINVAL; + io_uring_cmd_get_async_data(cmd)->op_data = data; - ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), - &iov, &iter); - if (ret < 0) - goto out_acct; + if (issue_flags & IO_URING_F_COMPAT) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_encoded_io_args_32 args32; - if (iov_iter_count(&iter) == 0) { - ret = 0; - goto out_free; + if (copy_from_user(&args32, sqe_addr, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + + data->args.iov = compat_ptr(args32.iov); + data->args.iovcnt = args32.iovcnt; + data->args.offset = args32.offset; + data->args.flags = args32.flags; +#endif + } else { + if (copy_from_user(&data->args, sqe_addr, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + } + + if (data->args.flags != 0) { + ret = -EINVAL; + goto out_acct; + } + + data->iov = data->iovstack; + ret = import_iovec(ITER_DEST, data->args.iov, data->args.iovcnt, + ARRAY_SIZE(data->iovstack), &data->iov, + &data->iter); + if (ret < 0) + goto out_acct; + + if (iov_iter_count(&data->iter) == 0) { + ret = 0; + goto out_free; + } } - pos = args.offset; - ret = rw_verify_area(READ, file, &pos, args.len); + pos = data->args.offset; + ret = rw_verify_area(READ, file, &pos, data->args.len); if (ret < 0) goto out_free; @@ -4960,15 +4985,16 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue start = ALIGN_DOWN(pos, fs_info->sectorsize); lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; - ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state, + ret = btrfs_encoded_read(&kiocb, &data->iter, &data->args, &cached_state, &disk_bytenr, &disk_io_size); if (ret < 0 && ret != -EIOCBQUEUED) goto out_free; file_accessed(file); - if (copy_to_user(sqe_addr + copy_end, (const char *)&args + copy_end_kernel, - sizeof(args) - copy_end_kernel)) { + if (copy_to_user(sqe_addr + copy_end, + (const char *)&data->args + copy_end_kernel, + sizeof(data->args) - copy_end_kernel)) { if (ret == -EIOCBQUEUED) { unlock_extent(io_tree, start, lockend, &cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); @@ -4978,40 +5004,22 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue } if (ret == -EIOCBQUEUED) { - u64 count; - - /* - * If we've optimized things by storing the iovecs on the stack, - * undo this. - */ - if (!iov) { - iov = kmalloc(sizeof(struct iovec) * args.iovcnt, GFP_NOFS); - if (!iov) { - unlock_extent(io_tree, start, lockend, &cached_state); - btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); - ret = -ENOMEM; - goto out_acct; - } - - memcpy(iov, iovstack, sizeof(struct iovec) * args.iovcnt); - } - - count = min_t(u64, iov_iter_count(&iter), disk_io_size); + u64 count = min_t(u64, iov_iter_count(&data->iter), disk_io_size); /* Match ioctl by not returning past EOF if uncompressed. */ - if (!args.compression) - count = min_t(u64, count, args.len); + if (!data->args.compression) + count = min_t(u64, count, data->args.len); - ret = btrfs_uring_read_extent(&kiocb, &iter, start, lockend, - cached_state, disk_bytenr, - disk_io_size, count, - args.compression, iov, cmd); + ret = btrfs_uring_read_extent(&kiocb, &data->iter, start, lockend, + cached_state, disk_bytenr, disk_io_size, + count, data->args.compression, + data->iov, cmd); goto out_acct; } out_free: - kfree(iov); + kfree(data->iov); out_acct: if (ret > 0) From cd6313beaeaea0b2e6d428afef7a86a986b50abe Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 6 Jan 2025 06:10:24 -0800 Subject: [PATCH 1156/1450] Revert "vmstat: disable vmstat_work on vmstat_cpu_down_prep()" This reverts commit adcfb264c3ed51fbbf5068ddf10d309a63683868. It turns out this just causes a different warning splat instead that seems to be much easier to trigger, so let's revert ASAP. Reported-and-bisected-by: Borislav Petkov Tested-by: Breno Leitao Reported-by: Alexander Gordeev Link: https://lore.kernel.org/all/20250106131817.GAZ3vYGVr3-hWFFPLj@fat_crate.local/ Cc: Koichiro Den Cc: Sebastian Andrzej Siewior Cc: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 0889b75cef149..4d016314a56c9 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -2148,14 +2148,13 @@ static int vmstat_cpu_online(unsigned int cpu) if (!node_state(cpu_to_node(cpu), N_CPU)) { node_set_state(cpu_to_node(cpu), N_CPU); } - enable_delayed_work(&per_cpu(vmstat_work, cpu)); return 0; } static int vmstat_cpu_down_prep(unsigned int cpu) { - disable_delayed_work_sync(&per_cpu(vmstat_work, cpu)); + cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); return 0; } From 07aeefae7ff44d80524375253980b1bdee2396b0 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 5 Jan 2025 17:24:03 +0100 Subject: [PATCH 1157/1450] ovl: pass realinode to ovl_encode_real_fh() instead of realdentry We want to be able to encode an fid from an inode with no alias. Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20250105162404.357058-2-amir73il@gmail.com Signed-off-by: Christian Brauner --- fs/overlayfs/copy_up.c | 11 ++++++----- fs/overlayfs/export.c | 5 +++-- fs/overlayfs/namei.c | 4 ++-- fs/overlayfs/overlayfs.h | 2 +- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 56eee9f23ea9a..0c28e5fa34077 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -415,13 +415,13 @@ int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upperdentry, return err; } -struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real, +struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct inode *realinode, bool is_upper) { struct ovl_fh *fh; int fh_type, dwords; int buflen = MAX_HANDLE_SZ; - uuid_t *uuid = &real->d_sb->s_uuid; + uuid_t *uuid = &realinode->i_sb->s_uuid; int err; /* Make sure the real fid stays 32bit aligned */ @@ -438,7 +438,8 @@ struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real, * the price or reconnecting the dentry. */ dwords = buflen >> 2; - fh_type = exportfs_encode_fh(real, (void *)fh->fb.fid, &dwords, 0); + fh_type = exportfs_encode_inode_fh(realinode, (void *)fh->fb.fid, + &dwords, NULL, 0); buflen = (dwords << 2); err = -EIO; @@ -479,7 +480,7 @@ struct ovl_fh *ovl_get_origin_fh(struct ovl_fs *ofs, struct dentry *origin) if (!ovl_can_decode_fh(origin->d_sb)) return NULL; - return ovl_encode_real_fh(ofs, origin, false); + return ovl_encode_real_fh(ofs, d_inode(origin), false); } int ovl_set_origin_fh(struct ovl_fs *ofs, const struct ovl_fh *fh, @@ -504,7 +505,7 @@ static int ovl_set_upper_fh(struct ovl_fs *ofs, struct dentry *upper, const struct ovl_fh *fh; int err; - fh = ovl_encode_real_fh(ofs, upper, true); + fh = ovl_encode_real_fh(ofs, d_inode(upper), true); if (IS_ERR(fh)) return PTR_ERR(fh); diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 5868cb2229552..036c9f39a14d7 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -223,6 +223,7 @@ static int ovl_check_encode_origin(struct dentry *dentry) static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct dentry *dentry, u32 *fid, int buflen) { + struct inode *inode = d_inode(dentry); struct ovl_fh *fh = NULL; int err, enc_lower; int len; @@ -236,8 +237,8 @@ static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct dentry *dentry, goto fail; /* Encode an upper or lower file handle */ - fh = ovl_encode_real_fh(ofs, enc_lower ? ovl_dentry_lower(dentry) : - ovl_dentry_upper(dentry), !enc_lower); + fh = ovl_encode_real_fh(ofs, enc_lower ? ovl_inode_lower(inode) : + ovl_inode_upper(inode), !enc_lower); if (IS_ERR(fh)) return PTR_ERR(fh); diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 7e27b7d4adee8..cea820cb3b55b 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -542,7 +542,7 @@ int ovl_verify_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry, struct ovl_fh *fh; int err; - fh = ovl_encode_real_fh(ofs, real, is_upper); + fh = ovl_encode_real_fh(ofs, d_inode(real), is_upper); err = PTR_ERR(fh); if (IS_ERR(fh)) { fh = NULL; @@ -738,7 +738,7 @@ int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin, struct ovl_fh *fh; int err; - fh = ovl_encode_real_fh(ofs, origin, false); + fh = ovl_encode_real_fh(ofs, d_inode(origin), false); if (IS_ERR(fh)) return PTR_ERR(fh); diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index b361f35762be0..0021e20250202 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -865,7 +865,7 @@ int ovl_copy_up_with_data(struct dentry *dentry); int ovl_maybe_copy_up(struct dentry *dentry, int flags); int ovl_copy_xattr(struct super_block *sb, const struct path *path, struct dentry *new); int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upper, struct kstat *stat); -struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real, +struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct inode *realinode, bool is_upper); struct ovl_fh *ovl_get_origin_fh(struct ovl_fs *ofs, struct dentry *origin); int ovl_set_origin_fh(struct ovl_fs *ofs, const struct ovl_fh *fh, From c45beebfde34aa71afbc48b2c54cdda623515037 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 5 Jan 2025 17:24:04 +0100 Subject: [PATCH 1158/1450] ovl: support encoding fid from inode with no alias Dmitry Safonov reported that a WARN_ON() assertion can be trigered by userspace when calling inotify_show_fdinfo() for an overlayfs watched inode, whose dentry aliases were discarded with drop_caches. The WARN_ON() assertion in inotify_show_fdinfo() was removed, because it is possible for encoding file handle to fail for other reason, but the impact of failing to encode an overlayfs file handle goes beyond this assertion. As shown in the LTP test case mentioned in the link below, failure to encode an overlayfs file handle from a non-aliased inode also leads to failure to report an fid with FAN_DELETE_SELF fanotify events. As Dmitry notes in his analyzis of the problem, ovl_encode_fh() fails if it cannot find an alias for the inode, but this failure can be fixed. ovl_encode_fh() seldom uses the alias and in the case of non-decodable file handles, as is often the case with fanotify fid info, ovl_encode_fh() never needs to use the alias to encode a file handle. Defer finding an alias until it is actually needed so ovl_encode_fh() will not fail in the common case of FAN_DELETE_SELF fanotify events. Fixes: 16aac5ad1fa9 ("ovl: support encoding non-decodable file handles") Reported-by: Dmitry Safonov Closes: https://lore.kernel.org/linux-fsdevel/CAOQ4uxiie81voLZZi2zXS1BziXZCM24nXqPAxbu8kxXCUWdwOg@mail.gmail.com/ Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20250105162404.357058-3-amir73il@gmail.com Signed-off-by: Christian Brauner --- fs/overlayfs/export.c | 46 +++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 036c9f39a14d7..444aeeccb6daf 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -176,35 +176,37 @@ static int ovl_connect_layer(struct dentry *dentry) * * Return 0 for upper file handle, > 0 for lower file handle or < 0 on error. */ -static int ovl_check_encode_origin(struct dentry *dentry) +static int ovl_check_encode_origin(struct inode *inode) { - struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + struct ovl_fs *ofs = OVL_FS(inode->i_sb); bool decodable = ofs->config.nfs_export; + struct dentry *dentry; + int err; /* No upper layer? */ if (!ovl_upper_mnt(ofs)) return 1; /* Lower file handle for non-upper non-decodable */ - if (!ovl_dentry_upper(dentry) && !decodable) + if (!ovl_inode_upper(inode) && !decodable) return 1; /* Upper file handle for pure upper */ - if (!ovl_dentry_lower(dentry)) + if (!ovl_inode_lower(inode)) return 0; /* * Root is never indexed, so if there's an upper layer, encode upper for * root. */ - if (dentry == dentry->d_sb->s_root) + if (inode == d_inode(inode->i_sb->s_root)) return 0; /* * Upper decodable file handle for non-indexed upper. */ - if (ovl_dentry_upper(dentry) && decodable && - !ovl_test_flag(OVL_INDEX, d_inode(dentry))) + if (ovl_inode_upper(inode) && decodable && + !ovl_test_flag(OVL_INDEX, inode)) return 0; /* @@ -213,17 +215,25 @@ static int ovl_check_encode_origin(struct dentry *dentry) * ovl_connect_layer() will try to make origin's layer "connected" by * copying up a "connectable" ancestor. */ - if (d_is_dir(dentry) && decodable) - return ovl_connect_layer(dentry); + if (!decodable || !S_ISDIR(inode->i_mode)) + return 1; + + dentry = d_find_any_alias(inode); + if (!dentry) + return -ENOENT; + + err = ovl_connect_layer(dentry); + dput(dentry); + if (err < 0) + return err; /* Lower file handle for indexed and non-upper dir/non-dir */ return 1; } -static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct dentry *dentry, +static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct inode *inode, u32 *fid, int buflen) { - struct inode *inode = d_inode(dentry); struct ovl_fh *fh = NULL; int err, enc_lower; int len; @@ -232,7 +242,7 @@ static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct dentry *dentry, * Check if we should encode a lower or upper file handle and maybe * copy up an ancestor to make lower file handle connectable. */ - err = enc_lower = ovl_check_encode_origin(dentry); + err = enc_lower = ovl_check_encode_origin(inode); if (enc_lower < 0) goto fail; @@ -252,8 +262,8 @@ static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct dentry *dentry, return err; fail: - pr_warn_ratelimited("failed to encode file handle (%pd2, err=%i)\n", - dentry, err); + pr_warn_ratelimited("failed to encode file handle (ino=%lu, err=%i)\n", + inode->i_ino, err); goto out; } @@ -261,19 +271,13 @@ static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len, struct inode *parent) { struct ovl_fs *ofs = OVL_FS(inode->i_sb); - struct dentry *dentry; int bytes, buflen = *max_len << 2; /* TODO: encode connectable file handles */ if (parent) return FILEID_INVALID; - dentry = d_find_any_alias(inode); - if (!dentry) - return FILEID_INVALID; - - bytes = ovl_dentry_to_fid(ofs, dentry, fid, buflen); - dput(dentry); + bytes = ovl_dentry_to_fid(ofs, inode, fid, buflen); if (bytes <= 0) return FILEID_INVALID; From 6aecd91a5c5b68939cf4169e32bc49f3cd2dd329 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 2 Jan 2025 14:44:16 +1030 Subject: [PATCH 1159/1450] btrfs: avoid NULL pointer dereference if no valid extent tree [BUG] Syzbot reported a crash with the following call trace: BTRFS info (device loop0): scrub: started on devid 1 BUG: kernel NULL pointer dereference, address: 0000000000000208 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 106e70067 P4D 106e70067 PUD 107143067 PMD 0 Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 1 UID: 0 PID: 689 Comm: repro Kdump: loaded Tainted: G O 6.13.0-rc4-custom+ #206 Tainted: [O]=OOT_MODULE Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS unknown 02/02/2022 RIP: 0010:find_first_extent_item+0x26/0x1f0 [btrfs] Call Trace: scrub_find_fill_first_stripe+0x13d/0x3b0 [btrfs] scrub_simple_mirror+0x175/0x260 [btrfs] scrub_stripe+0x5d4/0x6c0 [btrfs] scrub_chunk+0xbb/0x170 [btrfs] scrub_enumerate_chunks+0x2f4/0x5f0 [btrfs] btrfs_scrub_dev+0x240/0x600 [btrfs] btrfs_ioctl+0x1dc8/0x2fa0 [btrfs] ? do_sys_openat2+0xa5/0xf0 __x64_sys_ioctl+0x97/0xc0 do_syscall_64+0x4f/0x120 entry_SYSCALL_64_after_hwframe+0x76/0x7e [CAUSE] The reproducer is using a corrupted image where extent tree root is corrupted, thus forcing to use "rescue=all,ro" mount option to mount the image. Then it triggered a scrub, but since scrub relies on extent tree to find where the data/metadata extents are, scrub_find_fill_first_stripe() relies on an non-empty extent root. But unfortunately scrub_find_fill_first_stripe() doesn't really expect an NULL pointer for extent root, it use extent_root to grab fs_info and triggered a NULL pointer dereference. [FIX] Add an extra check for a valid extent root at the beginning of scrub_find_fill_first_stripe(). The new error path is introduced by 42437a6386ff ("btrfs: introduce mount option rescue=ignorebadroots"), but that's pretty old, and later commit b979547513ff ("btrfs: scrub: introduce helper to find and fill sector info for a scrub_stripe") changed how we do scrub. So for kernels older than 6.6, the fix will need manual backport. Reported-by: syzbot+339e9dbe3a2ca419b85d@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/67756935.050a0220.25abdd.0a12.GAE@google.com/ Fixes: 42437a6386ff ("btrfs: introduce mount option rescue=ignorebadroots") Reviewed-by: Anand Jain Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 204c928beaf9c..531312efee8df 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1541,6 +1541,10 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, u64 extent_gen; int ret; + if (unlikely(!extent_root)) { + btrfs_err(fs_info, "no valid extent root for scrub"); + return -EUCLEAN; + } memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) * stripe->nr_sectors); scrub_stripe_reset_bitmaps(stripe); From 7467bc5959bf02ef5210ea7e7948e548565c799c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 13 Dec 2024 07:43:43 +0100 Subject: [PATCH 1160/1450] btrfs: zoned: calculate max_extent_size properly on non-zoned setup Since commit 559218d43ec9 ("block: pre-calculate max_zone_append_sectors"), queue_limits's max_zone_append_sectors is default to be 0 and it is only updated when there is a zoned device. So, we have lim->max_zone_append_sectors = 0 when there is no zoned device in the filesystem. That leads to fs_info->max_zone_append_size and thus fs_info->max_extent_size to be 0, which is wrong and can for example lead to a divide by zero in count_max_extents(). Fix this by only capping fs_info->max_extent_size to fs_info->max_zone_append_size when it is non-zero. Based on a patch from Naohiro Aota , from which much of this commit message is stolen as well. Reported-by: Shinichiro Kawasaki Fixes: 559218d43ec9 ("block: pre-calculate max_zone_append_sectors") Tested-by: Shinichiro Kawasaki Reviewed-by: Johannes Thumshirn Reviewed-by: Naohiro Aota Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index cb32966380f57..5f9d3be1234ab 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -745,8 +745,9 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) (u64)lim->max_segments << PAGE_SHIFT), fs_info->sectorsize); fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; - if (fs_info->max_zone_append_size < fs_info->max_extent_size) - fs_info->max_extent_size = fs_info->max_zone_append_size; + + fs_info->max_extent_size = min_not_zero(fs_info->max_extent_size, + fs_info->max_zone_append_size); /* * Check mount options here, because we might change fs_info->zoned From 0ee4736c003daded513de0ff112d4a1e9c85bbab Mon Sep 17 00:00:00 2001 From: Mikhail Zaslonko Date: Wed, 18 Dec 2024 11:32:51 +0100 Subject: [PATCH 1161/1450] btrfs: zlib: fix avail_in bytes for s390 zlib HW compression path Since the input data length passed to zlib_compress_folios() can be arbitrary, always setting strm.avail_in to a multiple of PAGE_SIZE may cause read-in bytes to exceed the input range. Currently this triggers an assert in btrfs_compress_folios() on the debug kernel (see below). Fix strm.avail_in calculation for S390 hardware acceleration path. assertion failed: *total_in <= orig_len, in fs/btrfs/compression.c:1041 ------------[ cut here ]------------ kernel BUG at fs/btrfs/compression.c:1041! monitor event: 0040 ilc:2 [#1] PREEMPT SMP CPU: 16 UID: 0 PID: 325 Comm: kworker/u273:3 Not tainted 6.13.0-20241204.rc1.git6.fae3b21430ca.300.fc41.s390x+debug #1 Hardware name: IBM 3931 A01 703 (z/VM 7.4.0) Workqueue: btrfs-delalloc btrfs_work_helper Krnl PSW : 0704d00180000000 0000021761df6538 (btrfs_compress_folios+0x198/0x1a0) R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:1 PM:0 RI:0 EA:3 Krnl GPRS: 0000000080000000 0000000000000001 0000000000000047 0000000000000000 0000000000000006 ffffff01757bb000 000001976232fcc0 000000000000130c 000001976232fcd0 000001976232fcc8 00000118ff4a0e30 0000000000000001 00000111821ab400 0000011100000000 0000021761df6534 000001976232fb58 Krnl Code: 0000021761df6528: c020006f5ef4 larl %r2,0000021762be2310 0000021761df652e: c0e5ffbd09d5 brasl %r14,00000217615978d8 #0000021761df6534: af000000 mc 0,0 >0000021761df6538: 0707 bcr 0,%r7 0000021761df653a: 0707 bcr 0,%r7 0000021761df653c: 0707 bcr 0,%r7 0000021761df653e: 0707 bcr 0,%r7 0000021761df6540: c004004bb7ec brcl 0,000002176276d518 Call Trace: [<0000021761df6538>] btrfs_compress_folios+0x198/0x1a0 ([<0000021761df6534>] btrfs_compress_folios+0x194/0x1a0) [<0000021761d97788>] compress_file_range+0x3b8/0x6d0 [<0000021761dcee7c>] btrfs_work_helper+0x10c/0x160 [<0000021761645760>] process_one_work+0x2b0/0x5d0 [<000002176164637e>] worker_thread+0x20e/0x3e0 [<000002176165221a>] kthread+0x15a/0x170 [<00000217615b859c>] __ret_from_fork+0x3c/0x60 [<00000217626e72d2>] ret_from_fork+0xa/0x38 INFO: lockdep is turned off. Last Breaking-Event-Address: [<0000021761597924>] _printk+0x4c/0x58 Kernel panic - not syncing: Fatal exception: panic_on_oops Fixes: fd1e75d0105d ("btrfs: make compression path to be subpage compatible") CC: stable@vger.kernel.org # 6.12+ Acked-by: Ilya Leoshkevich Reviewed-by: Qu Wenruo Signed-off-by: Mikhail Zaslonko Signed-off-by: David Sterba --- fs/btrfs/zlib.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index ddf0d5a448a74..c9e92c6941ec4 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -174,10 +174,10 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, copy_page(workspace->buf + i * PAGE_SIZE, data_in); start += PAGE_SIZE; - workspace->strm.avail_in = - (in_buf_folios << PAGE_SHIFT); } workspace->strm.next_in = workspace->buf; + workspace->strm.avail_in = min(bytes_left, + in_buf_folios << PAGE_SHIFT); } else { unsigned int pg_off; unsigned int cur_len; From 59ec698d01ebb5bae4865f6083bb9f398e39d63b Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 2 Jan 2025 17:37:09 +0000 Subject: [PATCH 1162/1450] i40e: Deadcode i40e_aq_* i40e_aq_add_mirrorrule(), i40e_aq_delete_mirrorrule() and i40e_aq_set_vsi_vlan_promisc() were added in 2016 by commit 7bd6875bef70 ("i40e: APIs to Add/remove port mirroring rules") but haven't been used. They were the last user of i40e_mirrorrule_op(). i40e_aq_rearrange_nvm() was added in 2018 by commit f05798b4ff82 ("i40e: Add AQ command for rearrange NVM structure") but hasn't been used. i40e_aq_restore_lldp() was added in 2019 by commit c65e78f87f81 ("i40e: Further implementation of LLDP") but hasn't been used. Remove them. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Tony Nguyen Link: https://patch.msgid.link/20250102173717.200359-2-linux@treblig.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e_common.c | 234 ------------------ .../net/ethernet/intel/i40e/i40e_prototype.h | 17 -- 2 files changed, 251 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c index e8031f1a9b4fc..47e71f72d87ba 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_common.c +++ b/drivers/net/ethernet/intel/i40e/i40e_common.c @@ -1804,37 +1804,6 @@ int i40e_aq_set_vsi_broadcast(struct i40e_hw *hw, return status; } -/** - * i40e_aq_set_vsi_vlan_promisc - control the VLAN promiscuous setting - * @hw: pointer to the hw struct - * @seid: vsi number - * @enable: set MAC L2 layer unicast promiscuous enable/disable for a given VLAN - * @cmd_details: pointer to command details structure or NULL - **/ -int i40e_aq_set_vsi_vlan_promisc(struct i40e_hw *hw, - u16 seid, bool enable, - struct i40e_asq_cmd_details *cmd_details) -{ - struct i40e_aq_desc desc; - struct i40e_aqc_set_vsi_promiscuous_modes *cmd = - (struct i40e_aqc_set_vsi_promiscuous_modes *)&desc.params.raw; - u16 flags = 0; - int status; - - i40e_fill_default_direct_cmd_desc(&desc, - i40e_aqc_opc_set_vsi_promiscuous_modes); - if (enable) - flags |= I40E_AQC_SET_VSI_PROMISC_VLAN; - - cmd->promiscuous_flags = cpu_to_le16(flags); - cmd->valid_flags = cpu_to_le16(I40E_AQC_SET_VSI_PROMISC_VLAN); - cmd->seid = cpu_to_le16(seid); - - status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); - - return status; -} - /** * i40e_aq_get_vsi_params - get VSI configuration info * @hw: pointer to the hw struct @@ -2435,136 +2404,6 @@ i40e_aq_remove_macvlan_v2(struct i40e_hw *hw, u16 seid, cmd_details, true, aq_status); } -/** - * i40e_mirrorrule_op - Internal helper function to add/delete mirror rule - * @hw: pointer to the hw struct - * @opcode: AQ opcode for add or delete mirror rule - * @sw_seid: Switch SEID (to which rule refers) - * @rule_type: Rule Type (ingress/egress/VLAN) - * @id: Destination VSI SEID or Rule ID - * @count: length of the list - * @mr_list: list of mirrored VSI SEIDs or VLAN IDs - * @cmd_details: pointer to command details structure or NULL - * @rule_id: Rule ID returned from FW - * @rules_used: Number of rules used in internal switch - * @rules_free: Number of rules free in internal switch - * - * Add/Delete a mirror rule to a specific switch. Mirror rules are supported for - * VEBs/VEPA elements only - **/ -static int i40e_mirrorrule_op(struct i40e_hw *hw, - u16 opcode, u16 sw_seid, u16 rule_type, u16 id, - u16 count, __le16 *mr_list, - struct i40e_asq_cmd_details *cmd_details, - u16 *rule_id, u16 *rules_used, u16 *rules_free) -{ - struct i40e_aq_desc desc; - struct i40e_aqc_add_delete_mirror_rule *cmd = - (struct i40e_aqc_add_delete_mirror_rule *)&desc.params.raw; - struct i40e_aqc_add_delete_mirror_rule_completion *resp = - (struct i40e_aqc_add_delete_mirror_rule_completion *)&desc.params.raw; - u16 buf_size; - int status; - - buf_size = count * sizeof(*mr_list); - - /* prep the rest of the request */ - i40e_fill_default_direct_cmd_desc(&desc, opcode); - cmd->seid = cpu_to_le16(sw_seid); - cmd->rule_type = cpu_to_le16(rule_type & - I40E_AQC_MIRROR_RULE_TYPE_MASK); - cmd->num_entries = cpu_to_le16(count); - /* Dest VSI for add, rule_id for delete */ - cmd->destination = cpu_to_le16(id); - if (mr_list) { - desc.flags |= cpu_to_le16((u16)(I40E_AQ_FLAG_BUF | - I40E_AQ_FLAG_RD)); - if (buf_size > I40E_AQ_LARGE_BUF) - desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB); - } - - status = i40e_asq_send_command(hw, &desc, mr_list, buf_size, - cmd_details); - if (!status || - hw->aq.asq_last_status == I40E_AQ_RC_ENOSPC) { - if (rule_id) - *rule_id = le16_to_cpu(resp->rule_id); - if (rules_used) - *rules_used = le16_to_cpu(resp->mirror_rules_used); - if (rules_free) - *rules_free = le16_to_cpu(resp->mirror_rules_free); - } - return status; -} - -/** - * i40e_aq_add_mirrorrule - add a mirror rule - * @hw: pointer to the hw struct - * @sw_seid: Switch SEID (to which rule refers) - * @rule_type: Rule Type (ingress/egress/VLAN) - * @dest_vsi: SEID of VSI to which packets will be mirrored - * @count: length of the list - * @mr_list: list of mirrored VSI SEIDs or VLAN IDs - * @cmd_details: pointer to command details structure or NULL - * @rule_id: Rule ID returned from FW - * @rules_used: Number of rules used in internal switch - * @rules_free: Number of rules free in internal switch - * - * Add mirror rule. Mirror rules are supported for VEBs or VEPA elements only - **/ -int i40e_aq_add_mirrorrule(struct i40e_hw *hw, u16 sw_seid, - u16 rule_type, u16 dest_vsi, u16 count, - __le16 *mr_list, - struct i40e_asq_cmd_details *cmd_details, - u16 *rule_id, u16 *rules_used, u16 *rules_free) -{ - if (!(rule_type == I40E_AQC_MIRROR_RULE_TYPE_ALL_INGRESS || - rule_type == I40E_AQC_MIRROR_RULE_TYPE_ALL_EGRESS)) { - if (count == 0 || !mr_list) - return -EINVAL; - } - - return i40e_mirrorrule_op(hw, i40e_aqc_opc_add_mirror_rule, sw_seid, - rule_type, dest_vsi, count, mr_list, - cmd_details, rule_id, rules_used, rules_free); -} - -/** - * i40e_aq_delete_mirrorrule - delete a mirror rule - * @hw: pointer to the hw struct - * @sw_seid: Switch SEID (to which rule refers) - * @rule_type: Rule Type (ingress/egress/VLAN) - * @count: length of the list - * @rule_id: Rule ID that is returned in the receive desc as part of - * add_mirrorrule. - * @mr_list: list of mirrored VLAN IDs to be removed - * @cmd_details: pointer to command details structure or NULL - * @rules_used: Number of rules used in internal switch - * @rules_free: Number of rules free in internal switch - * - * Delete a mirror rule. Mirror rules are supported for VEBs/VEPA elements only - **/ -int i40e_aq_delete_mirrorrule(struct i40e_hw *hw, u16 sw_seid, - u16 rule_type, u16 rule_id, u16 count, - __le16 *mr_list, - struct i40e_asq_cmd_details *cmd_details, - u16 *rules_used, u16 *rules_free) -{ - /* Rule ID has to be valid except rule_type: INGRESS VLAN mirroring */ - if (rule_type == I40E_AQC_MIRROR_RULE_TYPE_VLAN) { - /* count and mr_list shall be valid for rule_type INGRESS VLAN - * mirroring. For other rule_type, count and rule_type should - * not matter. - */ - if (count == 0 || !mr_list) - return -EINVAL; - } - - return i40e_mirrorrule_op(hw, i40e_aqc_opc_delete_mirror_rule, sw_seid, - rule_type, rule_id, count, mr_list, - cmd_details, NULL, rules_used, rules_free); -} - /** * i40e_aq_send_msg_to_vf * @hw: pointer to the hardware structure @@ -3179,41 +3018,6 @@ int i40e_aq_update_nvm(struct i40e_hw *hw, u8 module_pointer, return status; } -/** - * i40e_aq_rearrange_nvm - * @hw: pointer to the hw struct - * @rearrange_nvm: defines direction of rearrangement - * @cmd_details: pointer to command details structure or NULL - * - * Rearrange NVM structure, available only for transition FW - **/ -int i40e_aq_rearrange_nvm(struct i40e_hw *hw, - u8 rearrange_nvm, - struct i40e_asq_cmd_details *cmd_details) -{ - struct i40e_aqc_nvm_update *cmd; - struct i40e_aq_desc desc; - int status; - - cmd = (struct i40e_aqc_nvm_update *)&desc.params.raw; - - i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_nvm_update); - - rearrange_nvm &= (I40E_AQ_NVM_REARRANGE_TO_FLAT | - I40E_AQ_NVM_REARRANGE_TO_STRUCT); - - if (!rearrange_nvm) { - status = -EINVAL; - goto i40e_aq_rearrange_nvm_exit; - } - - cmd->command_flags |= rearrange_nvm; - status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); - -i40e_aq_rearrange_nvm_exit: - return status; -} - /** * i40e_aq_get_lldp_mib * @hw: pointer to the hw struct @@ -3334,44 +3138,6 @@ int i40e_aq_cfg_lldp_mib_change_event(struct i40e_hw *hw, return status; } -/** - * i40e_aq_restore_lldp - * @hw: pointer to the hw struct - * @setting: pointer to factory setting variable or NULL - * @restore: True if factory settings should be restored - * @cmd_details: pointer to command details structure or NULL - * - * Restore LLDP Agent factory settings if @restore set to True. In other case - * only returns factory setting in AQ response. - **/ -int -i40e_aq_restore_lldp(struct i40e_hw *hw, u8 *setting, bool restore, - struct i40e_asq_cmd_details *cmd_details) -{ - struct i40e_aq_desc desc; - struct i40e_aqc_lldp_restore *cmd = - (struct i40e_aqc_lldp_restore *)&desc.params.raw; - int status; - - if (!test_bit(I40E_HW_CAP_FW_LLDP_PERSISTENT, hw->caps)) { - i40e_debug(hw, I40E_DEBUG_ALL, - "Restore LLDP not supported by current FW version.\n"); - return -ENODEV; - } - - i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_lldp_restore); - - if (restore) - cmd->command |= I40E_AQ_LLDP_AGENT_RESTORE; - - status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); - - if (setting) - *setting = cmd->command & 1; - - return status; -} - /** * i40e_aq_stop_lldp * @hw: pointer to the hw struct diff --git a/drivers/net/ethernet/intel/i40e/i40e_prototype.h b/drivers/net/ethernet/intel/i40e/i40e_prototype.h index 5a0699ca7ce50..29f6a903a30ce 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_prototype.h +++ b/drivers/net/ethernet/intel/i40e/i40e_prototype.h @@ -141,9 +141,6 @@ int i40e_aq_set_vsi_uc_promisc_on_vlan(struct i40e_hw *hw, int i40e_aq_set_vsi_bc_promisc_on_vlan(struct i40e_hw *hw, u16 seid, bool enable, u16 vid, struct i40e_asq_cmd_details *cmd_details); -int i40e_aq_set_vsi_vlan_promisc(struct i40e_hw *hw, - u16 seid, bool enable, - struct i40e_asq_cmd_details *cmd_details); int i40e_aq_get_vsi_params(struct i40e_hw *hw, struct i40e_vsi_context *vsi_ctx, struct i40e_asq_cmd_details *cmd_details); @@ -176,14 +173,6 @@ i40e_aq_remove_macvlan_v2(struct i40e_hw *hw, u16 seid, struct i40e_aqc_remove_macvlan_element_data *mv_list, u16 count, struct i40e_asq_cmd_details *cmd_details, enum i40e_admin_queue_err *aq_status); -int i40e_aq_add_mirrorrule(struct i40e_hw *hw, u16 sw_seid, - u16 rule_type, u16 dest_vsi, u16 count, __le16 *mr_list, - struct i40e_asq_cmd_details *cmd_details, - u16 *rule_id, u16 *rules_used, u16 *rules_free); -int i40e_aq_delete_mirrorrule(struct i40e_hw *hw, u16 sw_seid, - u16 rule_type, u16 rule_id, u16 count, __le16 *mr_list, - struct i40e_asq_cmd_details *cmd_details, - u16 *rules_used, u16 *rules_free); int i40e_aq_send_msg_to_vf(struct i40e_hw *hw, u16 vfid, u32 v_opcode, u32 v_retval, u8 *msg, u16 msglen, @@ -220,9 +209,6 @@ int i40e_aq_update_nvm(struct i40e_hw *hw, u8 module_pointer, u32 offset, u16 length, void *data, bool last_command, u8 preservation_flags, struct i40e_asq_cmd_details *cmd_details); -int i40e_aq_rearrange_nvm(struct i40e_hw *hw, - u8 rearrange_nvm, - struct i40e_asq_cmd_details *cmd_details); int i40e_aq_get_lldp_mib(struct i40e_hw *hw, u8 bridge_type, u8 mib_type, void *buff, u16 buff_size, u16 *local_len, u16 *remote_len, @@ -234,9 +220,6 @@ i40e_aq_set_lldp_mib(struct i40e_hw *hw, int i40e_aq_cfg_lldp_mib_change_event(struct i40e_hw *hw, bool enable_update, struct i40e_asq_cmd_details *cmd_details); -int -i40e_aq_restore_lldp(struct i40e_hw *hw, u8 *setting, bool restore, - struct i40e_asq_cmd_details *cmd_details); int i40e_aq_stop_lldp(struct i40e_hw *hw, bool shutdown_agent, bool persist, struct i40e_asq_cmd_details *cmd_details); From 39cabb01d26d2d27bd4794c62e67349d86f8b1df Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 2 Jan 2025 17:37:10 +0000 Subject: [PATCH 1163/1450] i40e: Remove unused i40e_blink_phy_link_led i40e_blink_phy_link_led() was added in 2016 by commit fd077cd3399b ("i40e: Add functions to blink led on 10GBaseT PHY") but hasn't been used. Remove it. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Kalesh AP Reviewed-by: Tony Nguyen Link: https://patch.msgid.link/20250102173717.200359-3-linux@treblig.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e_common.c | 74 ------------------- .../net/ethernet/intel/i40e/i40e_prototype.h | 4 - 2 files changed, 78 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c index 47e71f72d87ba..ba780a949a477 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_common.c +++ b/drivers/net/ethernet/intel/i40e/i40e_common.c @@ -4428,80 +4428,6 @@ u8 i40e_get_phy_address(struct i40e_hw *hw, u8 dev_num) return (u8)(reg_val >> ((dev_num + 1) * 5)) & 0x1f; } -/** - * i40e_blink_phy_link_led - * @hw: pointer to the HW structure - * @time: time how long led will blinks in secs - * @interval: gap between LED on and off in msecs - * - * Blinks PHY link LED - **/ -int i40e_blink_phy_link_led(struct i40e_hw *hw, - u32 time, u32 interval) -{ - u16 led_addr = I40E_PHY_LED_PROV_REG_1; - u16 gpio_led_port; - u8 phy_addr = 0; - int status = 0; - u16 led_ctl; - u8 port_num; - u16 led_reg; - u32 i; - - i = rd32(hw, I40E_PFGEN_PORTNUM); - port_num = (u8)(i & I40E_PFGEN_PORTNUM_PORT_NUM_MASK); - phy_addr = i40e_get_phy_address(hw, port_num); - - for (gpio_led_port = 0; gpio_led_port < 3; gpio_led_port++, - led_addr++) { - status = i40e_read_phy_register_clause45(hw, - I40E_PHY_COM_REG_PAGE, - led_addr, phy_addr, - &led_reg); - if (status) - goto phy_blinking_end; - led_ctl = led_reg; - if (led_reg & I40E_PHY_LED_LINK_MODE_MASK) { - led_reg = 0; - status = i40e_write_phy_register_clause45(hw, - I40E_PHY_COM_REG_PAGE, - led_addr, phy_addr, - led_reg); - if (status) - goto phy_blinking_end; - break; - } - } - - if (time > 0 && interval > 0) { - for (i = 0; i < time * 1000; i += interval) { - status = i40e_read_phy_register_clause45(hw, - I40E_PHY_COM_REG_PAGE, - led_addr, phy_addr, &led_reg); - if (status) - goto restore_config; - if (led_reg & I40E_PHY_LED_MANUAL_ON) - led_reg = 0; - else - led_reg = I40E_PHY_LED_MANUAL_ON; - status = i40e_write_phy_register_clause45(hw, - I40E_PHY_COM_REG_PAGE, - led_addr, phy_addr, led_reg); - if (status) - goto restore_config; - msleep(interval); - } - } - -restore_config: - status = i40e_write_phy_register_clause45(hw, - I40E_PHY_COM_REG_PAGE, - led_addr, phy_addr, led_ctl); - -phy_blinking_end: - return status; -} - /** * i40e_led_get_reg - read LED register * @hw: pointer to the HW structure diff --git a/drivers/net/ethernet/intel/i40e/i40e_prototype.h b/drivers/net/ethernet/intel/i40e/i40e_prototype.h index 29f6a903a30ce..c0a4bd53501c8 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_prototype.h +++ b/drivers/net/ethernet/intel/i40e/i40e_prototype.h @@ -72,8 +72,6 @@ int i40e_led_set_phy(struct i40e_hw *hw, bool on, u16 led_addr, u32 mode); int i40e_led_get_phy(struct i40e_hw *hw, u16 *led_addr, u16 *val); -int i40e_blink_phy_link_led(struct i40e_hw *hw, - u32 time, u32 interval); /* admin send queue commands */ @@ -446,8 +444,6 @@ int i40e_read_phy_register(struct i40e_hw *hw, u8 page, u16 reg, int i40e_write_phy_register(struct i40e_hw *hw, u8 page, u16 reg, u8 phy_addr, u16 value); u8 i40e_get_phy_address(struct i40e_hw *hw, u8 dev_num); -int i40e_blink_phy_link_led(struct i40e_hw *hw, - u32 time, u32 interval); int i40e_aq_write_ddp(struct i40e_hw *hw, void *buff, u16 buff_size, u32 track_id, u32 *error_offset, u32 *error_info, From 8cc51e28ecce4c8b3a96d7802b543553d11f682c Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 2 Jan 2025 17:37:11 +0000 Subject: [PATCH 1164/1450] i40e: Remove unused i40e_(read|write)_phy_register i40e_read_phy_register() and i40e_write_phy_register() were added in 2016 by commit f62ba91458b5 ("i40e: Add functions which apply correct PHY access method for read and write operation") but haven't been used. Remove them. (There are more specific _clause* variants of these functions that are still used.) Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Tony Nguyen Link: https://patch.msgid.link/20250102173717.200359-4-linux@treblig.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e_common.c | 78 ------------------- .../net/ethernet/intel/i40e/i40e_prototype.h | 4 - 2 files changed, 82 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c index ba780a949a477..6779e281a6486 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_common.c +++ b/drivers/net/ethernet/intel/i40e/i40e_common.c @@ -4335,84 +4335,6 @@ int i40e_write_phy_register_clause45(struct i40e_hw *hw, return status; } -/** - * i40e_write_phy_register - * @hw: pointer to the HW structure - * @page: registers page number - * @reg: register address in the page - * @phy_addr: PHY address on MDIO interface - * @value: PHY register value - * - * Writes value to specified PHY register - **/ -int i40e_write_phy_register(struct i40e_hw *hw, - u8 page, u16 reg, u8 phy_addr, u16 value) -{ - int status; - - switch (hw->device_id) { - case I40E_DEV_ID_1G_BASE_T_X722: - status = i40e_write_phy_register_clause22(hw, reg, phy_addr, - value); - break; - case I40E_DEV_ID_1G_BASE_T_BC: - case I40E_DEV_ID_5G_BASE_T_BC: - case I40E_DEV_ID_10G_BASE_T: - case I40E_DEV_ID_10G_BASE_T4: - case I40E_DEV_ID_10G_BASE_T_BC: - case I40E_DEV_ID_10G_BASE_T_X722: - case I40E_DEV_ID_25G_B: - case I40E_DEV_ID_25G_SFP28: - status = i40e_write_phy_register_clause45(hw, page, reg, - phy_addr, value); - break; - default: - status = -EIO; - break; - } - - return status; -} - -/** - * i40e_read_phy_register - * @hw: pointer to the HW structure - * @page: registers page number - * @reg: register address in the page - * @phy_addr: PHY address on MDIO interface - * @value: PHY register value - * - * Reads specified PHY register value - **/ -int i40e_read_phy_register(struct i40e_hw *hw, - u8 page, u16 reg, u8 phy_addr, u16 *value) -{ - int status; - - switch (hw->device_id) { - case I40E_DEV_ID_1G_BASE_T_X722: - status = i40e_read_phy_register_clause22(hw, reg, phy_addr, - value); - break; - case I40E_DEV_ID_1G_BASE_T_BC: - case I40E_DEV_ID_5G_BASE_T_BC: - case I40E_DEV_ID_10G_BASE_T: - case I40E_DEV_ID_10G_BASE_T4: - case I40E_DEV_ID_10G_BASE_T_BC: - case I40E_DEV_ID_10G_BASE_T_X722: - case I40E_DEV_ID_25G_B: - case I40E_DEV_ID_25G_SFP28: - status = i40e_read_phy_register_clause45(hw, page, reg, - phy_addr, value); - break; - default: - status = -EIO; - break; - } - - return status; -} - /** * i40e_get_phy_address * @hw: pointer to the HW structure diff --git a/drivers/net/ethernet/intel/i40e/i40e_prototype.h b/drivers/net/ethernet/intel/i40e/i40e_prototype.h index c0a4bd53501c8..bfebe18c0041b 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_prototype.h +++ b/drivers/net/ethernet/intel/i40e/i40e_prototype.h @@ -439,10 +439,6 @@ int i40e_read_phy_register_clause45(struct i40e_hw *hw, u8 page, u16 reg, u8 phy_addr, u16 *value); int i40e_write_phy_register_clause45(struct i40e_hw *hw, u8 page, u16 reg, u8 phy_addr, u16 value); -int i40e_read_phy_register(struct i40e_hw *hw, u8 page, u16 reg, - u8 phy_addr, u16 *value); -int i40e_write_phy_register(struct i40e_hw *hw, u8 page, u16 reg, - u8 phy_addr, u16 value); u8 i40e_get_phy_address(struct i40e_hw *hw, u8 dev_num); int i40e_aq_write_ddp(struct i40e_hw *hw, void *buff, u16 buff_size, u32 track_id, From 81d6bb2012e1d6410bc88dcb331113126a13a6ee Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 2 Jan 2025 17:37:12 +0000 Subject: [PATCH 1165/1450] i40e: Deadcode profile code i40e_add_pinfo_to_list() was added in 2017 by commit 1d5c960c5ef5 ("i40e: new AQ commands") i40e_find_section_in_profile() was added in 2019 by commit cdc594e00370 ("i40e: Implement DDP support in i40e driver") Neither have been used. Remove them. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Tony Nguyen Link: https://patch.msgid.link/20250102173717.200359-5-linux@treblig.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e_common.c | 72 ------------------- .../net/ethernet/intel/i40e/i40e_prototype.h | 8 --- 2 files changed, 80 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c index 6779e281a6486..370b4bddee441 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_common.c +++ b/drivers/net/ethernet/intel/i40e/i40e_common.c @@ -4882,39 +4882,6 @@ i40e_find_segment_in_package(u32 segment_type, #define I40E_SECTION_HEADER(profile, offset) \ (struct i40e_profile_section_header *)((u8 *)(profile) + (offset)) -/** - * i40e_find_section_in_profile - * @section_type: the section type to search for (i.e., SECTION_TYPE_NOTE) - * @profile: pointer to the i40e segment header to be searched - * - * This function searches i40e segment for a particular section type. On - * success it returns a pointer to the section header, otherwise it will - * return NULL. - **/ -struct i40e_profile_section_header * -i40e_find_section_in_profile(u32 section_type, - struct i40e_profile_segment *profile) -{ - struct i40e_profile_section_header *sec; - struct i40e_section_table *sec_tbl; - u32 sec_off; - u32 i; - - if (profile->header.type != SEGMENT_TYPE_I40E) - return NULL; - - I40E_SECTION_TABLE(profile, sec_tbl); - - for (i = 0; i < sec_tbl->section_count; i++) { - sec_off = sec_tbl->section_offset[i]; - sec = I40E_SECTION_HEADER(profile, sec_off); - if (sec->section.type == section_type) - return sec; - } - - return NULL; -} - /** * i40e_ddp_exec_aq_section - Execute generic AQ for DDP * @hw: pointer to the hw struct @@ -5137,45 +5104,6 @@ i40e_rollback_profile(struct i40e_hw *hw, struct i40e_profile_segment *profile, return status; } -/** - * i40e_add_pinfo_to_list - * @hw: pointer to the hardware structure - * @profile: pointer to the profile segment of the package - * @profile_info_sec: buffer for information section - * @track_id: package tracking id - * - * Register a profile to the list of loaded profiles. - */ -int -i40e_add_pinfo_to_list(struct i40e_hw *hw, - struct i40e_profile_segment *profile, - u8 *profile_info_sec, u32 track_id) -{ - struct i40e_profile_section_header *sec = NULL; - struct i40e_profile_info *pinfo; - u32 offset = 0, info = 0; - int status = 0; - - sec = (struct i40e_profile_section_header *)profile_info_sec; - sec->tbl_size = 1; - sec->data_end = sizeof(struct i40e_profile_section_header) + - sizeof(struct i40e_profile_info); - sec->section.type = SECTION_TYPE_INFO; - sec->section.offset = sizeof(struct i40e_profile_section_header); - sec->section.size = sizeof(struct i40e_profile_info); - pinfo = (struct i40e_profile_info *)(profile_info_sec + - sec->section.offset); - pinfo->track_id = track_id; - pinfo->version = profile->version; - pinfo->op = I40E_DDP_ADD_TRACKID; - memcpy(pinfo->name, profile->name, I40E_DDP_NAME_SIZE); - - status = i40e_aq_write_ddp(hw, (void *)sec, sec->data_end, - track_id, &offset, &info, NULL); - - return status; -} - /** * i40e_aq_add_cloud_filters * @hw: pointer to the hardware structure diff --git a/drivers/net/ethernet/intel/i40e/i40e_prototype.h b/drivers/net/ethernet/intel/i40e/i40e_prototype.h index bfebe18c0041b..ccb8af472cd7d 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_prototype.h +++ b/drivers/net/ethernet/intel/i40e/i40e_prototype.h @@ -452,20 +452,12 @@ int i40e_aq_get_ddp_list(struct i40e_hw *hw, void *buff, struct i40e_generic_seg_header * i40e_find_segment_in_package(u32 segment_type, struct i40e_package_header *pkg_header); -struct i40e_profile_section_header * -i40e_find_section_in_profile(u32 section_type, - struct i40e_profile_segment *profile); int i40e_write_profile(struct i40e_hw *hw, struct i40e_profile_segment *i40e_seg, u32 track_id); int i40e_rollback_profile(struct i40e_hw *hw, struct i40e_profile_segment *i40e_seg, u32 track_id); -int -i40e_add_pinfo_to_list(struct i40e_hw *hw, - struct i40e_profile_segment *profile, - u8 *profile_info_sec, u32 track_id); - /* i40e_ddp */ int i40e_ddp_flash(struct net_device *netdev, struct ethtool_flash *flash); From 3eb24a9e0af3a336da8af0bf37140203f742b493 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 2 Jan 2025 17:37:13 +0000 Subject: [PATCH 1166/1450] i40e: Remove unused i40e_get_cur_guaranteed_fd_count The last use of i40e_get_cur_guaranteed_fd_count() was removed in 2015 by commit 04294e38a451 ("i40e: FD filters flush policy changes") Remove it. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Kalesh AP Reviewed-by: Tony Nguyen Link: https://patch.msgid.link/20250102173717.200359-6-linux@treblig.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e.h | 1 - drivers/net/ethernet/intel/i40e/i40e_main.c | 13 ------------- 2 files changed, 14 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index d4255c2706fa3..5d9738b746f4f 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -1189,7 +1189,6 @@ int i40e_add_del_fdir(struct i40e_vsi *vsi, struct i40e_fdir_filter *input, bool add); void i40e_fdir_check_and_reenable(struct i40e_pf *pf); u32 i40e_get_current_fd_count(struct i40e_pf *pf); -u32 i40e_get_cur_guaranteed_fd_count(struct i40e_pf *pf); u32 i40e_get_current_atr_cnt(struct i40e_pf *pf); u32 i40e_get_global_fd_count(struct i40e_pf *pf); bool i40e_set_ntuple(struct i40e_pf *pf, netdev_features_t features); diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 0e1d9e2fbf38c..83ba1effe8ba5 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -9628,19 +9628,6 @@ static void i40e_handle_lan_overflow_event(struct i40e_pf *pf, i40e_reset_vf(vf, false); } -/** - * i40e_get_cur_guaranteed_fd_count - Get the consumed guaranteed FD filters - * @pf: board private structure - **/ -u32 i40e_get_cur_guaranteed_fd_count(struct i40e_pf *pf) -{ - u32 val, fcnt_prog; - - val = rd32(&pf->hw, I40E_PFQF_FDSTAT); - fcnt_prog = (val & I40E_PFQF_FDSTAT_GUARANT_CNT_MASK); - return fcnt_prog; -} - /** * i40e_get_current_fd_count - Get total FD filters programmed for this PF * @pf: board private structure From 38dfb07d9a65dd408bc50f0cc8e49a5381bc40f5 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 2 Jan 2025 17:37:14 +0000 Subject: [PATCH 1167/1450] i40e: Remove unused i40e_del_filter The last use of i40e_del_filter() was removed in 2016 by commit 9569a9a4547d ("i40e: when adding or removing MAC filters, correctly handle VLANs") Remove it. Fix up a comment that referenced it. Note: The __ version of this function is still used. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Tony Nguyen Link: https://patch.msgid.link/20250102173717.200359-7-linux@treblig.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e.h | 1 - drivers/net/ethernet/intel/i40e/i40e_main.c | 28 ++------------------- 2 files changed, 2 insertions(+), 27 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index 5d9738b746f4f..399a5dbf35064 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -1196,7 +1196,6 @@ void i40e_set_ethtool_ops(struct net_device *netdev); struct i40e_mac_filter *i40e_add_filter(struct i40e_vsi *vsi, const u8 *macaddr, s16 vlan); void __i40e_del_filter(struct i40e_vsi *vsi, struct i40e_mac_filter *f); -void i40e_del_filter(struct i40e_vsi *vsi, const u8 *macaddr, s16 vlan); int i40e_sync_vsi_filters(struct i40e_vsi *vsi); struct i40e_vsi *i40e_vsi_setup(struct i40e_pf *pf, u8 type, u16 uplink, u32 param1); diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 83ba1effe8ba5..276dde0bc1d44 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -1666,9 +1666,8 @@ struct i40e_mac_filter *i40e_add_filter(struct i40e_vsi *vsi, * @vsi: VSI to remove from * @f: the filter to remove from the list * - * This function should be called instead of i40e_del_filter only if you know - * the exact filter you will remove already, such as via i40e_find_filter or - * i40e_find_mac. + * This function requires you've found * the exact filter you will remove + * already, such as via i40e_find_filter or i40e_find_mac. * * NOTE: This function is expected to be called with mac_filter_hash_lock * being held. @@ -1697,29 +1696,6 @@ void __i40e_del_filter(struct i40e_vsi *vsi, struct i40e_mac_filter *f) set_bit(__I40E_MACVLAN_SYNC_PENDING, vsi->back->state); } -/** - * i40e_del_filter - Remove a MAC/VLAN filter from the VSI - * @vsi: the VSI to be searched - * @macaddr: the MAC address - * @vlan: the VLAN - * - * NOTE: This function is expected to be called with mac_filter_hash_lock - * being held. - * ANOTHER NOTE: This function MUST be called from within the context of - * the "safe" variants of any list iterators, e.g. list_for_each_entry_safe() - * instead of list_for_each_entry(). - **/ -void i40e_del_filter(struct i40e_vsi *vsi, const u8 *macaddr, s16 vlan) -{ - struct i40e_mac_filter *f; - - if (!vsi || !macaddr) - return; - - f = i40e_find_filter(vsi, macaddr, vlan); - __i40e_del_filter(vsi, f); -} - /** * i40e_add_mac_filter - Add a MAC filter for all active VLANs * @vsi: the VSI to be searched From a324484ac855a6770c6e7220b2ce09810a625f75 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 2 Jan 2025 17:37:15 +0000 Subject: [PATCH 1168/1450] i40e: Remove unused i40e_commit_partition_bw_setting i40e_commit_partition_bw_setting() was added in 2017 by commit 4fc8c6763957 ("i40e: genericize the partition bandwidth control") but hasn't been used. Remove it. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Kalesh AP Reviewed-by: Tony Nguyen Link: https://patch.msgid.link/20250102173717.200359-8-linux@treblig.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e.h | 1 - drivers/net/ethernet/intel/i40e/i40e_main.c | 83 --------------------- 2 files changed, 84 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index 399a5dbf35064..ce63a7cfe9558 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -1311,7 +1311,6 @@ int i40e_update_adq_vsi_queues(struct i40e_vsi *vsi, int vsi_offset); int i40e_is_vsi_uplink_mode_veb(struct i40e_vsi *vsi); int i40e_get_partition_bw_setting(struct i40e_pf *pf); int i40e_set_partition_bw_setting(struct i40e_pf *pf); -int i40e_commit_partition_bw_setting(struct i40e_pf *pf); void i40e_print_link_message(struct i40e_vsi *vsi, bool isup); void i40e_set_fec_in_flags(u8 fec_cfg, unsigned long *flags); diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 276dde0bc1d44..8a333d0e22187 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -12576,89 +12576,6 @@ int i40e_set_partition_bw_setting(struct i40e_pf *pf) return status; } -/** - * i40e_commit_partition_bw_setting - Commit BW settings for this PF partition - * @pf: board private structure - **/ -int i40e_commit_partition_bw_setting(struct i40e_pf *pf) -{ - /* Commit temporary BW setting to permanent NVM image */ - enum i40e_admin_queue_err last_aq_status; - u16 nvm_word; - int ret; - - if (pf->hw.partition_id != 1) { - dev_info(&pf->pdev->dev, - "Commit BW only works on partition 1! This is partition %d", - pf->hw.partition_id); - ret = -EOPNOTSUPP; - goto bw_commit_out; - } - - /* Acquire NVM for read access */ - ret = i40e_acquire_nvm(&pf->hw, I40E_RESOURCE_READ); - last_aq_status = pf->hw.aq.asq_last_status; - if (ret) { - dev_info(&pf->pdev->dev, - "Cannot acquire NVM for read access, err %pe aq_err %s\n", - ERR_PTR(ret), - i40e_aq_str(&pf->hw, last_aq_status)); - goto bw_commit_out; - } - - /* Read word 0x10 of NVM - SW compatibility word 1 */ - ret = i40e_aq_read_nvm(&pf->hw, - I40E_SR_NVM_CONTROL_WORD, - 0x10, sizeof(nvm_word), &nvm_word, - false, NULL); - /* Save off last admin queue command status before releasing - * the NVM - */ - last_aq_status = pf->hw.aq.asq_last_status; - i40e_release_nvm(&pf->hw); - if (ret) { - dev_info(&pf->pdev->dev, "NVM read error, err %pe aq_err %s\n", - ERR_PTR(ret), - i40e_aq_str(&pf->hw, last_aq_status)); - goto bw_commit_out; - } - - /* Wait a bit for NVM release to complete */ - msleep(50); - - /* Acquire NVM for write access */ - ret = i40e_acquire_nvm(&pf->hw, I40E_RESOURCE_WRITE); - last_aq_status = pf->hw.aq.asq_last_status; - if (ret) { - dev_info(&pf->pdev->dev, - "Cannot acquire NVM for write access, err %pe aq_err %s\n", - ERR_PTR(ret), - i40e_aq_str(&pf->hw, last_aq_status)); - goto bw_commit_out; - } - /* Write it back out unchanged to initiate update NVM, - * which will force a write of the shadow (alt) RAM to - * the NVM - thus storing the bandwidth values permanently. - */ - ret = i40e_aq_update_nvm(&pf->hw, - I40E_SR_NVM_CONTROL_WORD, - 0x10, sizeof(nvm_word), - &nvm_word, true, 0, NULL); - /* Save off last admin queue command status before releasing - * the NVM - */ - last_aq_status = pf->hw.aq.asq_last_status; - i40e_release_nvm(&pf->hw); - if (ret) - dev_info(&pf->pdev->dev, - "BW settings NOT SAVED, err %pe aq_err %s\n", - ERR_PTR(ret), - i40e_aq_str(&pf->hw, last_aq_status)); -bw_commit_out: - - return ret; -} - /** * i40e_is_total_port_shutdown_enabled - read NVM and return value * if total port shutdown feature is enabled for this PF From d424b93f35a61dc1147ef816cb3ae151395af656 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 2 Jan 2025 17:37:16 +0000 Subject: [PATCH 1169/1450] i40e: Remove unused i40e_asq_send_command_v2 i40e_asq_send_command_v2() was added in 2022 by commit 74073848b0d7 ("i40e: Add new versions of send ASQ command functions") but hasn't been used. Remove it. (The _atomic_v2 version of the function is used, so leave it). Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Tony Nguyen Link: https://patch.msgid.link/20250102173717.200359-9-linux@treblig.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e_adminq.c | 10 ---------- drivers/net/ethernet/intel/i40e/i40e_prototype.h | 7 ------- 2 files changed, 17 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq.c b/drivers/net/ethernet/intel/i40e/i40e_adminq.c index f73f5930fc58d..175c1320c1431 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_adminq.c +++ b/drivers/net/ethernet/intel/i40e/i40e_adminq.c @@ -1016,16 +1016,6 @@ i40e_asq_send_command_atomic_v2(struct i40e_hw *hw, return status; } -int -i40e_asq_send_command_v2(struct i40e_hw *hw, struct i40e_aq_desc *desc, - void *buff, /* can be NULL */ u16 buff_size, - struct i40e_asq_cmd_details *cmd_details, - enum i40e_admin_queue_err *aq_status) -{ - return i40e_asq_send_command_atomic_v2(hw, desc, buff, buff_size, - cmd_details, true, aq_status); -} - /** * i40e_fill_default_direct_cmd_desc - AQ descriptor helper function * @desc: pointer to the temp descriptor (non DMA mem) diff --git a/drivers/net/ethernet/intel/i40e/i40e_prototype.h b/drivers/net/ethernet/intel/i40e/i40e_prototype.h index ccb8af472cd7d..099bb8ab7d708 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_prototype.h +++ b/drivers/net/ethernet/intel/i40e/i40e_prototype.h @@ -27,13 +27,6 @@ i40e_asq_send_command(struct i40e_hw *hw, struct i40e_aq_desc *desc, void *buff, /* can be NULL */ u16 buff_size, struct i40e_asq_cmd_details *cmd_details); int -i40e_asq_send_command_v2(struct i40e_hw *hw, - struct i40e_aq_desc *desc, - void *buff, /* can be NULL */ - u16 buff_size, - struct i40e_asq_cmd_details *cmd_details, - enum i40e_admin_queue_err *aq_status); -int i40e_asq_send_command_atomic(struct i40e_hw *hw, struct i40e_aq_desc *desc, void *buff, /* can be NULL */ u16 buff_size, struct i40e_asq_cmd_details *cmd_details, From 47ea5d4e6f40446eddaf308eed942a3d3a9397e9 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 2 Jan 2025 17:37:17 +0000 Subject: [PATCH 1170/1450] i40e: Remove unused i40e_dcb_hw_get_num_tc The last useof i40e_dcb_hw_get_num_tc() was removed in 2022 by commit fe20371578ef ("Revert "i40e: Fix reset bw limit when DCB enabled with 1 TC"") Remove it. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Kalesh AP Reviewed-by: Tony Nguyen Link: https://patch.msgid.link/20250102173717.200359-10-linux@treblig.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e_dcb.c | 13 ------------- drivers/net/ethernet/intel/i40e/i40e_dcb.h | 1 - 2 files changed, 14 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_dcb.c b/drivers/net/ethernet/intel/i40e/i40e_dcb.c index 8db1eb0c1768c..352e957443fdf 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_dcb.c +++ b/drivers/net/ethernet/intel/i40e/i40e_dcb.c @@ -1490,19 +1490,6 @@ void i40e_dcb_hw_set_num_tc(struct i40e_hw *hw, u8 num_tc) wr32(hw, I40E_PRTDCB_GENC, reg); } -/** - * i40e_dcb_hw_get_num_tc - * @hw: pointer to the hw struct - * - * Returns number of traffic classes configured in HW - **/ -u8 i40e_dcb_hw_get_num_tc(struct i40e_hw *hw) -{ - u32 reg = rd32(hw, I40E_PRTDCB_GENC); - - return FIELD_GET(I40E_PRTDCB_GENC_NUMTC_MASK, reg); -} - /** * i40e_dcb_hw_rx_ets_bw_config * @hw: pointer to the hw struct diff --git a/drivers/net/ethernet/intel/i40e/i40e_dcb.h b/drivers/net/ethernet/intel/i40e/i40e_dcb.h index d76497566e40e..d5662c639c41d 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_dcb.h +++ b/drivers/net/ethernet/intel/i40e/i40e_dcb.h @@ -253,7 +253,6 @@ void i40e_dcb_hw_rx_cmd_monitor_config(struct i40e_hw *hw, void i40e_dcb_hw_pfc_config(struct i40e_hw *hw, u8 pfc_en, u8 *prio_tc); void i40e_dcb_hw_set_num_tc(struct i40e_hw *hw, u8 num_tc); -u8 i40e_dcb_hw_get_num_tc(struct i40e_hw *hw); void i40e_dcb_hw_rx_ets_bw_config(struct i40e_hw *hw, u8 *bw_share, u8 *mode, u8 *prio_type); void i40e_dcb_hw_rx_up2tc_config(struct i40e_hw *hw, u8 *prio_tc); From b37dba891b17703bd249b4b7a8c4eb04482e2692 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 2 Jan 2025 17:41:40 +0000 Subject: [PATCH 1171/1450] igc: Remove unused igc_acquire/release_nvm igc_acquire_nvm() and igc_release_nvm() were added in 2018 as part of commit ab4056126813 ("igc: Add NVM support") but never used. Remove them. The igc_1225.c has it's own specific implementations. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Tony Nguyen Link: https://patch.msgid.link/20250102174142.200700-2-linux@treblig.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/igc/igc_nvm.c | 50 ------------------------ drivers/net/ethernet/intel/igc/igc_nvm.h | 2 - 2 files changed, 52 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_nvm.c b/drivers/net/ethernet/intel/igc/igc_nvm.c index 58f81aba01446..efd121c039670 100644 --- a/drivers/net/ethernet/intel/igc/igc_nvm.c +++ b/drivers/net/ethernet/intel/igc/igc_nvm.c @@ -35,56 +35,6 @@ static s32 igc_poll_eerd_eewr_done(struct igc_hw *hw, int ee_reg) return ret_val; } -/** - * igc_acquire_nvm - Generic request for access to EEPROM - * @hw: pointer to the HW structure - * - * Set the EEPROM access request bit and wait for EEPROM access grant bit. - * Return successful if access grant bit set, else clear the request for - * EEPROM access and return -IGC_ERR_NVM (-1). - */ -s32 igc_acquire_nvm(struct igc_hw *hw) -{ - s32 timeout = IGC_NVM_GRANT_ATTEMPTS; - u32 eecd = rd32(IGC_EECD); - s32 ret_val = 0; - - wr32(IGC_EECD, eecd | IGC_EECD_REQ); - eecd = rd32(IGC_EECD); - - while (timeout) { - if (eecd & IGC_EECD_GNT) - break; - udelay(5); - eecd = rd32(IGC_EECD); - timeout--; - } - - if (!timeout) { - eecd &= ~IGC_EECD_REQ; - wr32(IGC_EECD, eecd); - hw_dbg("Could not acquire NVM grant\n"); - ret_val = -IGC_ERR_NVM; - } - - return ret_val; -} - -/** - * igc_release_nvm - Release exclusive access to EEPROM - * @hw: pointer to the HW structure - * - * Stop any current commands to the EEPROM and clear the EEPROM request bit. - */ -void igc_release_nvm(struct igc_hw *hw) -{ - u32 eecd; - - eecd = rd32(IGC_EECD); - eecd &= ~IGC_EECD_REQ; - wr32(IGC_EECD, eecd); -} - /** * igc_read_nvm_eerd - Reads EEPROM using EERD register * @hw: pointer to the HW structure diff --git a/drivers/net/ethernet/intel/igc/igc_nvm.h b/drivers/net/ethernet/intel/igc/igc_nvm.h index f9fc2e9cfb030..ab78d0c645479 100644 --- a/drivers/net/ethernet/intel/igc/igc_nvm.h +++ b/drivers/net/ethernet/intel/igc/igc_nvm.h @@ -4,8 +4,6 @@ #ifndef _IGC_NVM_H_ #define _IGC_NVM_H_ -s32 igc_acquire_nvm(struct igc_hw *hw); -void igc_release_nvm(struct igc_hw *hw); s32 igc_read_mac_addr(struct igc_hw *hw); s32 igc_read_nvm_eerd(struct igc_hw *hw, u16 offset, u16 words, u16 *data); s32 igc_validate_nvm_checksum(struct igc_hw *hw); From 121c3c6bc661039104119a18ad0730540b353eaf Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 2 Jan 2025 17:41:41 +0000 Subject: [PATCH 1172/1450] igc: Remove unused igc_read/write_pci_cfg wrappers igc_read_pci_cfg() and igc_write_pci_cfg were added in 2018 as part of commit 146740f9abc4 ("igc: Add support for PF") but have remained unused. Remove them. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Tony Nguyen Link: https://patch.msgid.link/20250102174142.200700-3-linux@treblig.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/igc/igc_hw.h | 2 -- drivers/net/ethernet/intel/igc/igc_main.c | 14 -------------- 2 files changed, 16 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_hw.h b/drivers/net/ethernet/intel/igc/igc_hw.h index d9d1a1a11daf8..7ec7e395020bd 100644 --- a/drivers/net/ethernet/intel/igc/igc_hw.h +++ b/drivers/net/ethernet/intel/igc/igc_hw.h @@ -281,7 +281,5 @@ struct net_device *igc_get_hw_dev(struct igc_hw *hw); s32 igc_read_pcie_cap_reg(struct igc_hw *hw, u32 reg, u16 *value); s32 igc_write_pcie_cap_reg(struct igc_hw *hw, u32 reg, u16 *value); -void igc_read_pci_cfg(struct igc_hw *hw, u32 reg, u16 *value); -void igc_write_pci_cfg(struct igc_hw *hw, u32 reg, u16 *value); #endif /* _IGC_HW_H_ */ diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 27872bdea9bd1..9c92673a72402 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -6780,20 +6780,6 @@ static const struct net_device_ops igc_netdev_ops = { }; /* PCIe configuration access */ -void igc_read_pci_cfg(struct igc_hw *hw, u32 reg, u16 *value) -{ - struct igc_adapter *adapter = hw->back; - - pci_read_config_word(adapter->pdev, reg, value); -} - -void igc_write_pci_cfg(struct igc_hw *hw, u32 reg, u16 *value) -{ - struct igc_adapter *adapter = hw->back; - - pci_write_config_word(adapter->pdev, reg, *value); -} - s32 igc_read_pcie_cap_reg(struct igc_hw *hw, u32 reg, u16 *value) { struct igc_adapter *adapter = hw->back; From c758890813665edca9b66e020c52646c84b7b694 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 2 Jan 2025 17:41:42 +0000 Subject: [PATCH 1173/1450] igc: Remove unused igc_read/write_pcie_cap_reg The last uses of igc_read_pcie_cap_reg() and igc_write_pcie_cap_reg() were removed in 2019 by commit 16ecd8d9af26 ("igc: Remove the obsolete workaround") Remove them. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Tony Nguyen Link: https://patch.msgid.link/20250102174142.200700-4-linux@treblig.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/igc/igc_hw.h | 3 --- drivers/net/ethernet/intel/igc/igc_main.c | 25 ----------------------- 2 files changed, 28 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_hw.h b/drivers/net/ethernet/intel/igc/igc_hw.h index 7ec7e395020bd..be8a49a86d09f 100644 --- a/drivers/net/ethernet/intel/igc/igc_hw.h +++ b/drivers/net/ethernet/intel/igc/igc_hw.h @@ -279,7 +279,4 @@ struct net_device *igc_get_hw_dev(struct igc_hw *hw); #define hw_dbg(format, arg...) \ netdev_dbg(igc_get_hw_dev(hw), format, ##arg) -s32 igc_read_pcie_cap_reg(struct igc_hw *hw, u32 reg, u16 *value); -s32 igc_write_pcie_cap_reg(struct igc_hw *hw, u32 reg, u16 *value); - #endif /* _IGC_HW_H_ */ diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 9c92673a72402..f58cd6940434a 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -6779,31 +6779,6 @@ static const struct net_device_ops igc_netdev_ops = { .ndo_get_tstamp = igc_get_tstamp, }; -/* PCIe configuration access */ -s32 igc_read_pcie_cap_reg(struct igc_hw *hw, u32 reg, u16 *value) -{ - struct igc_adapter *adapter = hw->back; - - if (!pci_is_pcie(adapter->pdev)) - return -IGC_ERR_CONFIG; - - pcie_capability_read_word(adapter->pdev, reg, value); - - return IGC_SUCCESS; -} - -s32 igc_write_pcie_cap_reg(struct igc_hw *hw, u32 reg, u16 *value) -{ - struct igc_adapter *adapter = hw->back; - - if (!pci_is_pcie(adapter->pdev)) - return -IGC_ERR_CONFIG; - - pcie_capability_write_word(adapter->pdev, reg, *value); - - return IGC_SUCCESS; -} - u32 igc_rd32(struct igc_hw *hw, u32 reg) { struct igc_adapter *igc = container_of(hw, struct igc_adapter, hw); From 3f9f5cd005f5b5243eaa2647d40b9857fa1a901d Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Thu, 2 Jan 2025 17:34:18 +0100 Subject: [PATCH 1174/1450] sctp: Prepare sctp_v4_get_dst() to dscp_t conversion. Define inet_sk_dscp() to get a dscp_t value from struct inet_sock, so that sctp_v4_get_dst() can easily set ->flowi4_tos from a dscp_t variable. For the SCTP_DSCP_SET_MASK case, we can just use inet_dsfield_to_dscp() to get a dscp_t value. Then, when converting ->flowi4_tos from __u8 to dscp_t, we'll just have to drop the inet_dscp_to_dsfield() conversion function. Signed-off-by: Guillaume Nault Acked-by: Xin Long Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/1a645f4a0bc60ad18e7c0916642883ce8a43c013.1735835456.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- include/net/inet_sock.h | 6 ++++++ net/sctp/protocol.c | 10 +++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 3ccbad881d74f..1086256549faa 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -302,6 +303,11 @@ static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet) return READ_ONCE(inet->inet_flags) & IP_CMSG_ALL; } +static inline dscp_t inet_sk_dscp(const struct inet_sock *inet) +{ + return inet_dsfield_to_dscp(READ_ONCE(inet->tos)); +} + #define inet_test_bit(nr, sk) \ test_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags) #define inet_set_bit(nr, sk) \ diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 8b9a1b96695e0..29727ed1008ed 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -427,16 +428,19 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, struct dst_entry *dst = NULL; union sctp_addr *daddr = &t->ipaddr; union sctp_addr dst_saddr; - u8 tos = READ_ONCE(inet_sk(sk)->tos); + dscp_t dscp; if (t->dscp & SCTP_DSCP_SET_MASK) - tos = t->dscp & SCTP_DSCP_VAL_MASK; + dscp = inet_dsfield_to_dscp(t->dscp); + else + dscp = inet_sk_dscp(inet_sk(sk)); + memset(&_fl, 0x0, sizeof(_fl)); fl4->daddr = daddr->v4.sin_addr.s_addr; fl4->fl4_dport = daddr->v4.sin_port; fl4->flowi4_proto = IPPROTO_SCTP; if (asoc) { - fl4->flowi4_tos = tos & INET_DSCP_MASK; + fl4->flowi4_tos = inet_dscp_to_dsfield(dscp); fl4->flowi4_scope = ip_sock_rt_scope(asoc->base.sk); fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if; fl4->fl4_sport = htons(asoc->base.bind_addr.port); From 95fc45d1dea8e1253f8ec58abc5befb71553d666 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 Jan 2025 21:05:14 +0000 Subject: [PATCH 1175/1450] ax25: rcu protect dev->ax25_ptr syzbot found a lockdep issue [1]. We should remove ax25 RTNL dependency in ax25_setsockopt() This should also fix a variety of possible UAF in ax25. [1] WARNING: possible circular locking dependency detected 6.13.0-rc3-syzkaller-00762-g9268abe611b0 #0 Not tainted ------------------------------------------------------ syz.5.1818/12806 is trying to acquire lock: ffffffff8fcb3988 (rtnl_mutex){+.+.}-{4:4}, at: ax25_setsockopt+0xa55/0xe90 net/ax25/af_ax25.c:680 but task is already holding lock: ffff8880617ac258 (sk_lock-AF_AX25){+.+.}-{0:0}, at: lock_sock include/net/sock.h:1618 [inline] ffff8880617ac258 (sk_lock-AF_AX25){+.+.}-{0:0}, at: ax25_setsockopt+0x209/0xe90 net/ax25/af_ax25.c:574 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (sk_lock-AF_AX25){+.+.}-{0:0}: lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849 lock_sock_nested+0x48/0x100 net/core/sock.c:3642 lock_sock include/net/sock.h:1618 [inline] ax25_kill_by_device net/ax25/af_ax25.c:101 [inline] ax25_device_event+0x24d/0x580 net/ax25/af_ax25.c:146 notifier_call_chain+0x1a5/0x3f0 kernel/notifier.c:85 __dev_notify_flags+0x207/0x400 dev_change_flags+0xf0/0x1a0 net/core/dev.c:9026 dev_ifsioc+0x7c8/0xe70 net/core/dev_ioctl.c:563 dev_ioctl+0x719/0x1340 net/core/dev_ioctl.c:820 sock_do_ioctl+0x240/0x460 net/socket.c:1234 sock_ioctl+0x626/0x8e0 net/socket.c:1339 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:906 [inline] __se_sys_ioctl+0xf5/0x170 fs/ioctl.c:892 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f -> #0 (rtnl_mutex){+.+.}-{4:4}: check_prev_add kernel/locking/lockdep.c:3161 [inline] check_prevs_add kernel/locking/lockdep.c:3280 [inline] validate_chain+0x18ef/0x5920 kernel/locking/lockdep.c:3904 __lock_acquire+0x1397/0x2100 kernel/locking/lockdep.c:5226 lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849 __mutex_lock_common kernel/locking/mutex.c:585 [inline] __mutex_lock+0x1ac/0xee0 kernel/locking/mutex.c:735 ax25_setsockopt+0xa55/0xe90 net/ax25/af_ax25.c:680 do_sock_setsockopt+0x3af/0x720 net/socket.c:2324 __sys_setsockopt net/socket.c:2349 [inline] __do_sys_setsockopt net/socket.c:2355 [inline] __se_sys_setsockopt net/socket.c:2352 [inline] __x64_sys_setsockopt+0x1ee/0x280 net/socket.c:2352 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(sk_lock-AF_AX25); lock(rtnl_mutex); lock(sk_lock-AF_AX25); lock(rtnl_mutex); *** DEADLOCK *** 1 lock held by syz.5.1818/12806: #0: ffff8880617ac258 (sk_lock-AF_AX25){+.+.}-{0:0}, at: lock_sock include/net/sock.h:1618 [inline] #0: ffff8880617ac258 (sk_lock-AF_AX25){+.+.}-{0:0}, at: ax25_setsockopt+0x209/0xe90 net/ax25/af_ax25.c:574 stack backtrace: CPU: 1 UID: 0 PID: 12806 Comm: syz.5.1818 Not tainted 6.13.0-rc3-syzkaller-00762-g9268abe611b0 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 print_circular_bug+0x13a/0x1b0 kernel/locking/lockdep.c:2074 check_noncircular+0x36a/0x4a0 kernel/locking/lockdep.c:2206 check_prev_add kernel/locking/lockdep.c:3161 [inline] check_prevs_add kernel/locking/lockdep.c:3280 [inline] validate_chain+0x18ef/0x5920 kernel/locking/lockdep.c:3904 __lock_acquire+0x1397/0x2100 kernel/locking/lockdep.c:5226 lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849 __mutex_lock_common kernel/locking/mutex.c:585 [inline] __mutex_lock+0x1ac/0xee0 kernel/locking/mutex.c:735 ax25_setsockopt+0xa55/0xe90 net/ax25/af_ax25.c:680 do_sock_setsockopt+0x3af/0x720 net/socket.c:2324 __sys_setsockopt net/socket.c:2349 [inline] __do_sys_setsockopt net/socket.c:2355 [inline] __se_sys_setsockopt net/socket.c:2352 [inline] __x64_sys_setsockopt+0x1ee/0x280 net/socket.c:2352 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f7b62385d29 Fixes: c433570458e4 ("ax25: fix a use-after-free in ax25_fillin_cb()") Reported-by: syzbot Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250103210514.87290-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- include/net/ax25.h | 10 +++++----- net/ax25/af_ax25.c | 12 ++++++------ net/ax25/ax25_dev.c | 4 ++-- net/ax25/ax25_ip.c | 3 ++- net/ax25/ax25_out.c | 22 +++++++++++++++++----- net/ax25/ax25_route.c | 2 ++ 7 files changed, 35 insertions(+), 20 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2593019ad5b16..e84602e0226c1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2261,7 +2261,7 @@ struct net_device { void *atalk_ptr; #endif #if IS_ENABLED(CONFIG_AX25) - void *ax25_ptr; + struct ax25_dev __rcu *ax25_ptr; #endif #if IS_ENABLED(CONFIG_CFG80211) struct wireless_dev *ieee80211_ptr; diff --git a/include/net/ax25.h b/include/net/ax25.h index cb622d84cd0cc..4ee141aae0a29 100644 --- a/include/net/ax25.h +++ b/include/net/ax25.h @@ -231,6 +231,7 @@ typedef struct ax25_dev { #endif refcount_t refcount; bool device_up; + struct rcu_head rcu; } ax25_dev; typedef struct ax25_cb { @@ -290,9 +291,8 @@ static inline void ax25_dev_hold(ax25_dev *ax25_dev) static inline void ax25_dev_put(ax25_dev *ax25_dev) { - if (refcount_dec_and_test(&ax25_dev->refcount)) { - kfree(ax25_dev); - } + if (refcount_dec_and_test(&ax25_dev->refcount)) + kfree_rcu(ax25_dev, rcu); } static inline __be16 ax25_type_trans(struct sk_buff *skb, struct net_device *dev) { @@ -335,9 +335,9 @@ void ax25_digi_invert(const ax25_digi *, ax25_digi *); extern spinlock_t ax25_dev_lock; #if IS_ENABLED(CONFIG_AX25) -static inline ax25_dev *ax25_dev_ax25dev(struct net_device *dev) +static inline ax25_dev *ax25_dev_ax25dev(const struct net_device *dev) { - return dev->ax25_ptr; + return rcu_dereference_rtnl(dev->ax25_ptr); } #endif diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index d6f9fae06a9d8..aa6c714892ec9 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -467,7 +467,7 @@ static int ax25_ctl_ioctl(const unsigned int cmd, void __user *arg) goto out_put; } -static void ax25_fillin_cb_from_dev(ax25_cb *ax25, ax25_dev *ax25_dev) +static void ax25_fillin_cb_from_dev(ax25_cb *ax25, const ax25_dev *ax25_dev) { ax25->rtt = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_T1]) / 2; ax25->t1 = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_T1]); @@ -677,22 +677,22 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname, break; } - rtnl_lock(); - dev = __dev_get_by_name(&init_net, devname); + rcu_read_lock(); + dev = dev_get_by_name_rcu(&init_net, devname); if (!dev) { - rtnl_unlock(); + rcu_read_unlock(); res = -ENODEV; break; } ax25->ax25_dev = ax25_dev_ax25dev(dev); if (!ax25->ax25_dev) { - rtnl_unlock(); + rcu_read_unlock(); res = -ENODEV; break; } ax25_fillin_cb(ax25, ax25->ax25_dev); - rtnl_unlock(); + rcu_read_unlock(); break; default: diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c index 9efd6690b3443..3733c0254a508 100644 --- a/net/ax25/ax25_dev.c +++ b/net/ax25/ax25_dev.c @@ -90,7 +90,7 @@ void ax25_dev_device_up(struct net_device *dev) spin_lock_bh(&ax25_dev_lock); list_add(&ax25_dev->list, &ax25_dev_list); - dev->ax25_ptr = ax25_dev; + rcu_assign_pointer(dev->ax25_ptr, ax25_dev); spin_unlock_bh(&ax25_dev_lock); ax25_register_dev_sysctl(ax25_dev); @@ -125,7 +125,7 @@ void ax25_dev_device_down(struct net_device *dev) } } - dev->ax25_ptr = NULL; + RCU_INIT_POINTER(dev->ax25_ptr, NULL); spin_unlock_bh(&ax25_dev_lock); netdev_put(dev, &ax25_dev->dev_tracker); ax25_dev_put(ax25_dev); diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c index 36249776c021e..215d4ccf12b91 100644 --- a/net/ax25/ax25_ip.c +++ b/net/ax25/ax25_ip.c @@ -122,6 +122,7 @@ netdev_tx_t ax25_ip_xmit(struct sk_buff *skb) if (dev == NULL) dev = skb->dev; + rcu_read_lock(); if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) { kfree_skb(skb); goto put; @@ -202,7 +203,7 @@ netdev_tx_t ax25_ip_xmit(struct sk_buff *skb) ax25_queue_xmit(skb, dev); put: - + rcu_read_unlock(); ax25_route_lock_unuse(); return NETDEV_TX_OK; } diff --git a/net/ax25/ax25_out.c b/net/ax25/ax25_out.c index 3db76d2470e95..8bca2ace98e51 100644 --- a/net/ax25/ax25_out.c +++ b/net/ax25/ax25_out.c @@ -39,10 +39,14 @@ ax25_cb *ax25_send_frame(struct sk_buff *skb, int paclen, const ax25_address *sr * specified. */ if (paclen == 0) { - if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) + rcu_read_lock(); + ax25_dev = ax25_dev_ax25dev(dev); + if (!ax25_dev) { + rcu_read_unlock(); return NULL; - + } paclen = ax25_dev->values[AX25_VALUES_PACLEN]; + rcu_read_unlock(); } /* @@ -53,13 +57,19 @@ ax25_cb *ax25_send_frame(struct sk_buff *skb, int paclen, const ax25_address *sr return ax25; /* It already existed */ } - if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) + rcu_read_lock(); + ax25_dev = ax25_dev_ax25dev(dev); + if (!ax25_dev) { + rcu_read_unlock(); return NULL; + } - if ((ax25 = ax25_create_cb()) == NULL) + if ((ax25 = ax25_create_cb()) == NULL) { + rcu_read_unlock(); return NULL; - + } ax25_fillin_cb(ax25, ax25_dev); + rcu_read_unlock(); ax25->source_addr = *src; ax25->dest_addr = *dest; @@ -358,7 +368,9 @@ void ax25_queue_xmit(struct sk_buff *skb, struct net_device *dev) { unsigned char *ptr; + rcu_read_lock(); skb->protocol = ax25_type_trans(skb, ax25_fwd_dev(dev)); + rcu_read_unlock(); ptr = skb_push(skb, 1); *ptr = 0x00; /* KISS */ diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c index b7c4d656a94b7..69de75db0c9c2 100644 --- a/net/ax25/ax25_route.c +++ b/net/ax25/ax25_route.c @@ -406,6 +406,7 @@ int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr) ax25_route_lock_unuse(); return -EHOSTUNREACH; } + rcu_read_lock(); if ((ax25->ax25_dev = ax25_dev_ax25dev(ax25_rt->dev)) == NULL) { err = -EHOSTUNREACH; goto put; @@ -442,6 +443,7 @@ int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr) } put: + rcu_read_unlock(); ax25_route_lock_unuse(); return err; } From 8c817eb26230dc0ae553cee16ff43a4a895f6756 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Fri, 3 Jan 2025 11:51:47 -0800 Subject: [PATCH 1176/1450] pds_core: limit loop over fw name list Add an array size limit to the for-loop to be sure we don't try to reference a fw_version string off the end of the fw info names array. We know that our firmware only has a limited number of firmware slot names, but we shouldn't leave this unchecked. Fixes: 45d76f492938 ("pds_core: set up device and adminq") Signed-off-by: Shannon Nelson Reviewed-by: Simon Horman Reviewed-by: Brett Creeley Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20250103195147.7408-1-shannon.nelson@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amd/pds_core/devlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amd/pds_core/devlink.c b/drivers/net/ethernet/amd/pds_core/devlink.c index 2681889162a25..44971e71991ff 100644 --- a/drivers/net/ethernet/amd/pds_core/devlink.c +++ b/drivers/net/ethernet/amd/pds_core/devlink.c @@ -118,7 +118,7 @@ int pdsc_dl_info_get(struct devlink *dl, struct devlink_info_req *req, if (err && err != -EIO) return err; - listlen = fw_list.num_fw_slots; + listlen = min(fw_list.num_fw_slots, ARRAY_SIZE(fw_list.fw_names)); for (i = 0; i < listlen; i++) { if (i < ARRAY_SIZE(fw_slotnames)) strscpy(buf, fw_slotnames[i], sizeof(buf)); From 4475d56145f368d065b05da3a5599d5620ca9408 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 Jan 2025 10:11:48 +0000 Subject: [PATCH 1177/1450] net: hsr: remove one synchronize_rcu() from hsr_del_port() Use kfree_rcu() instead of synchronize_rcu()+kfree(). This might allow syzbot to fuzz HSR a bit faster... Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250103101148.3594545-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/hsr/hsr_main.h | 1 + net/hsr/hsr_slave.c | 4 +--- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index fcfeb79bb0401..7d7551e6f0b02 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -163,6 +163,7 @@ struct hsr_port { struct net_device *dev; struct hsr_priv *hsr; enum hsr_port_type type; + struct rcu_head rcu; }; struct hsr_frame_info; diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index 464f683e016db..006d6ef97e53f 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -235,7 +235,5 @@ void hsr_del_port(struct hsr_port *port) netdev_upper_dev_unlink(port->dev, master->dev); } - synchronize_rcu(); - - kfree(port); + kfree_rcu(port, rcu); } From fbb9a9d263a68f60a16c8ba5a51d6198d67171cd Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 3 Jan 2025 11:16:31 +0000 Subject: [PATCH 1178/1450] net: phylink: add support for PCS supported_interfaces bitmap Add support for the PCS to specify which interfaces it supports, which can be used by MAC drivers to build the main supported_interfaces bitmap. Phylink also validates that the PCS returned by the MAC driver supports the interface that the MAC was asked for. An empty supported_interfaces bitmap from the PCS indicates that it does not provide this information, and we handle that appropriately. Reviewed-by: Maxime Chevallier Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1tTffL-007RoD-1Y@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/phy/phylink.c | 11 +++++++++++ include/linux/phylink.h | 3 +++ 2 files changed, 14 insertions(+) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 6d50c2fdb190e..31754d5fd6590 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -691,6 +691,17 @@ static int phylink_validate_mac_and_pcs(struct phylink *pl, return -EINVAL; } + /* Ensure that this PCS supports the interface which the MAC + * returned it for. It is an error for the MAC to return a PCS + * that does not support the interface mode. + */ + if (!phy_interface_empty(pcs->supported_interfaces) && + !test_bit(state->interface, pcs->supported_interfaces)) { + phylink_err(pl, "MAC returned PCS which does not support %s\n", + phy_modes(state->interface)); + return -EINVAL; + } + /* Validate the link parameters with the PCS */ if (pcs->ops->pcs_validate) { ret = pcs->ops->pcs_validate(pcs, supported, state); diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 5462cc6a37dc2..4b7a20620b496 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -393,6 +393,8 @@ struct phylink_pcs_ops; /** * struct phylink_pcs - PHYLINK PCS instance + * @supported_interfaces: describing which PHY_INTERFACE_MODE_xxx + * are supported by this PCS. * @ops: a pointer to the &struct phylink_pcs_ops structure * @phylink: pointer to &struct phylink_config * @neg_mode: provide PCS neg mode via "mode" argument @@ -409,6 +411,7 @@ struct phylink_pcs_ops; * the PCS driver. */ struct phylink_pcs { + DECLARE_PHY_INTERFACE_MASK(supported_interfaces); const struct phylink_pcs_ops *ops; struct phylink *phylink; bool neg_mode; From 906909fabb81dedf93a786c2d7247cab12e0a232 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 3 Jan 2025 11:16:36 +0000 Subject: [PATCH 1179/1450] net: pcs: xpcs: fill in PCS supported_interfaces Fill in the new PCS supported_interfaces member with the interfaces that XPCS supports. Reviewed-by: Maxime Chevallier Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1tTffQ-007RoJ-4u@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/pcs/pcs-xpcs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c index f70ca39f0905b..cf41dc5e74e81 100644 --- a/drivers/net/pcs/pcs-xpcs.c +++ b/drivers/net/pcs/pcs-xpcs.c @@ -1446,6 +1446,8 @@ static struct dw_xpcs *xpcs_create(struct mdio_device *mdiodev) if (ret) goto out_clear_clks; + xpcs_get_interfaces(xpcs, xpcs->pcs.supported_interfaces); + if (xpcs->info.pma == WX_TXGBE_XPCS_PMA_10G_ID) xpcs->pcs.poll = false; else From b87d4ee16bb4f6335032839a1173d8bb177939a9 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 3 Jan 2025 11:16:41 +0000 Subject: [PATCH 1180/1450] net: pcs: mtk-lynxi: fill in PCS supported_interfaces Fill in the new PCS supported_interfaces member with the interfaces that the Mediatek LynxI supports. Signed-off-by: Russell King (Oracle) Acked-by: Daniel Golle Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/E1tTffV-007RoP-8D@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/pcs/pcs-mtk-lynxi.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/pcs/pcs-mtk-lynxi.c b/drivers/net/pcs/pcs-mtk-lynxi.c index ed91cd7a406ae..4fe0fb6d12a40 100644 --- a/drivers/net/pcs/pcs-mtk-lynxi.c +++ b/drivers/net/pcs/pcs-mtk-lynxi.c @@ -307,6 +307,10 @@ struct phylink_pcs *mtk_pcs_lynxi_create(struct device *dev, mpcs->pcs.poll = true; mpcs->interface = PHY_INTERFACE_MODE_NA; + __set_bit(PHY_INTERFACE_MODE_SGMII, mpcs->pcs.supported_interfaces); + __set_bit(PHY_INTERFACE_MODE_1000BASEX, mpcs->pcs.supported_interfaces); + __set_bit(PHY_INTERFACE_MODE_2500BASEX, mpcs->pcs.supported_interfaces); + return &mpcs->pcs; } EXPORT_SYMBOL(mtk_pcs_lynxi_create); From b0f88c1b9a539dc91b83ac90345999273ee7dfd0 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 3 Jan 2025 11:16:46 +0000 Subject: [PATCH 1181/1450] net: pcs: lynx: fill in PCS supported_interfaces Fill in the new PCS supported_interfaces member with the interfaces that Lynx supports. Reviewed-by: Maxime Chevallier Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1tTffa-007RoV-Bo@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/pcs/pcs-lynx.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/net/pcs/pcs-lynx.c b/drivers/net/pcs/pcs-lynx.c index 767a8c0714acc..6457190ec6e71 100644 --- a/drivers/net/pcs/pcs-lynx.c +++ b/drivers/net/pcs/pcs-lynx.c @@ -334,9 +334,19 @@ static const struct phylink_pcs_ops lynx_pcs_phylink_ops = { .pcs_link_up = lynx_pcs_link_up, }; +static const phy_interface_t lynx_interfaces[] = { + PHY_INTERFACE_MODE_SGMII, + PHY_INTERFACE_MODE_QSGMII, + PHY_INTERFACE_MODE_1000BASEX, + PHY_INTERFACE_MODE_2500BASEX, + PHY_INTERFACE_MODE_10GBASER, + PHY_INTERFACE_MODE_USXGMII, +}; + static struct phylink_pcs *lynx_pcs_create(struct mdio_device *mdio) { struct lynx_pcs *lynx; + int i; lynx = kzalloc(sizeof(*lynx), GFP_KERNEL); if (!lynx) @@ -348,6 +358,9 @@ static struct phylink_pcs *lynx_pcs_create(struct mdio_device *mdio) lynx->pcs.neg_mode = true; lynx->pcs.poll = true; + for (i = 0; i < ARRAY_SIZE(lynx_interfaces); i++) + __set_bit(lynx_interfaces[i], lynx->pcs.supported_interfaces); + return lynx_to_phylink_pcs(lynx); } From d13cefbb108e2e3362587b93ab5adc31c6a8589e Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 3 Jan 2025 11:16:51 +0000 Subject: [PATCH 1182/1450] net: stmmac: use PCS supported_interfaces Use the PCS' supported_interfaces member to build the MAC level supported_interfaces bitmap. Reviewed-by: Maxime Chevallier Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1tTfff-007Roc-Ff@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 99eaec8bac4a9..2f518ec845ecf 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1203,6 +1203,7 @@ static int stmmac_phy_setup(struct stmmac_priv *priv) struct stmmac_mdio_bus_data *mdio_bus_data; int mode = priv->plat->phy_interface; struct fwnode_handle *fwnode; + struct phylink_pcs *pcs; struct phylink *phylink; priv->phylink_config.dev = &priv->dev->dev; @@ -1224,8 +1225,14 @@ static int stmmac_phy_setup(struct stmmac_priv *priv) /* If we have an xpcs, it defines which PHY interfaces are supported. */ if (priv->hw->xpcs) - xpcs_get_interfaces(priv->hw->xpcs, - priv->phylink_config.supported_interfaces); + pcs = xpcs_to_phylink_pcs(priv->hw->xpcs); + else + pcs = priv->hw->phylink_pcs; + + if (pcs) + phy_interface_or(priv->phylink_config.supported_interfaces, + priv->phylink_config.supported_interfaces, + pcs->supported_interfaces); fwnode = priv->plat->port_node; if (!fwnode) From 2410719cdd49d9b062e87dddaf5ec990edafc6e3 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 3 Jan 2025 11:16:56 +0000 Subject: [PATCH 1183/1450] net: pcs: xpcs: make xpcs_get_interfaces() static xpcs_get_interfaces() should no longer be used outside of the XPCS code, so make it static. Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/E1tTffk-007Roi-JM@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/pcs/pcs-xpcs.c | 3 +-- include/linux/pcs/pcs-xpcs.h | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c index cf41dc5e74e81..c06b66f40022f 100644 --- a/drivers/net/pcs/pcs-xpcs.c +++ b/drivers/net/pcs/pcs-xpcs.c @@ -594,14 +594,13 @@ static unsigned int xpcs_inband_caps(struct phylink_pcs *pcs, } } -void xpcs_get_interfaces(struct dw_xpcs *xpcs, unsigned long *interfaces) +static void xpcs_get_interfaces(struct dw_xpcs *xpcs, unsigned long *interfaces) { const struct dw_xpcs_compat *compat; for (compat = xpcs->desc->compat; compat->supported; compat++) __set_bit(compat->interface, interfaces); } -EXPORT_SYMBOL_GPL(xpcs_get_interfaces); int xpcs_config_eee(struct dw_xpcs *xpcs, int mult_fact_100ns, int enable) { diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h index b5b5d17998b82..733f4ddd2ef15 100644 --- a/include/linux/pcs/pcs-xpcs.h +++ b/include/linux/pcs/pcs-xpcs.h @@ -50,7 +50,6 @@ struct dw_xpcs; struct phylink_pcs *xpcs_to_phylink_pcs(struct dw_xpcs *xpcs); int xpcs_get_an_mode(struct dw_xpcs *xpcs, phy_interface_t interface); -void xpcs_get_interfaces(struct dw_xpcs *xpcs, unsigned long *interfaces); int xpcs_config_eee(struct dw_xpcs *xpcs, int mult_fact_100ns, int enable); struct dw_xpcs *xpcs_create_mdiodev(struct mii_bus *bus, int addr); From 020ca0abae4c1f69e71507981844fe99ae154424 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Thu, 2 Jan 2025 20:14:00 +0200 Subject: [PATCH 1184/1450] net/mlx5: HWS, remove the use of duplicated structs Remove definition in HWS of structs that are already defined in mlx5_ifc.h, and fix the usage of these structs. Signed-off-by: Yevgeny Kliteynik Reviewed-by: Itamar Gozlan Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20250102181415.1477316-2-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlx5/core/steering/hws/cmd.c | 20 ++++----- .../mellanox/mlx5/core/steering/hws/prm.h | 42 ------------------- 2 files changed, 10 insertions(+), 52 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c index c00c138c3366b..13689c0c1a440 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c @@ -622,12 +622,12 @@ int mlx5hws_cmd_arg_create(struct mlx5_core_dev *mdev, u32 pd, u32 *arg_id) { + u32 in[MLX5_ST_SZ_DW(create_modify_header_arg_in)] = {0}; u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {0}; - u32 in[MLX5_ST_SZ_DW(create_arg_in)] = {0}; void *attr; int ret; - attr = MLX5_ADDR_OF(create_arg_in, in, hdr); + attr = MLX5_ADDR_OF(create_modify_header_arg_in, in, hdr); MLX5_SET(general_obj_in_cmd_hdr, attr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT); MLX5_SET(general_obj_in_cmd_hdr, @@ -635,8 +635,8 @@ int mlx5hws_cmd_arg_create(struct mlx5_core_dev *mdev, MLX5_SET(general_obj_in_cmd_hdr, attr, op_param.create.log_obj_range, log_obj_range); - attr = MLX5_ADDR_OF(create_arg_in, in, arg); - MLX5_SET(arg, attr, access_pd, pd); + attr = MLX5_ADDR_OF(create_modify_header_arg_in, in, arg); + MLX5_SET(modify_header_arg, attr, access_pd, pd); ret = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); if (ret) { @@ -812,7 +812,7 @@ int mlx5hws_cmd_packet_reformat_create(struct mlx5_core_dev *mdev, struct mlx5hws_cmd_packet_reformat_create_attr *attr, u32 *reformat_id) { - u32 out[MLX5_ST_SZ_DW(alloc_packet_reformat_out)] = {0}; + u32 out[MLX5_ST_SZ_DW(alloc_packet_reformat_context_out)] = {0}; size_t insz, cmd_data_sz, cmd_total_sz; void *prctx; void *pdata; @@ -845,7 +845,7 @@ int mlx5hws_cmd_packet_reformat_create(struct mlx5_core_dev *mdev, goto out; } - *reformat_id = MLX5_GET(alloc_packet_reformat_out, out, packet_reformat_id); + *reformat_id = MLX5_GET(alloc_packet_reformat_context_out, out, packet_reformat_id); out: kfree(in); return ret; @@ -854,13 +854,13 @@ int mlx5hws_cmd_packet_reformat_create(struct mlx5_core_dev *mdev, int mlx5hws_cmd_packet_reformat_destroy(struct mlx5_core_dev *mdev, u32 reformat_id) { - u32 out[MLX5_ST_SZ_DW(dealloc_packet_reformat_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(dealloc_packet_reformat_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(dealloc_packet_reformat_context_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(dealloc_packet_reformat_context_in)] = {0}; int ret; - MLX5_SET(dealloc_packet_reformat_in, in, opcode, + MLX5_SET(dealloc_packet_reformat_context_in, in, opcode, MLX5_CMD_OP_DEALLOC_PACKET_REFORMAT_CONTEXT); - MLX5_SET(dealloc_packet_reformat_in, in, + MLX5_SET(dealloc_packet_reformat_context_in, in, packet_reformat_id, reformat_id); ret = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/prm.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/prm.h index de92cecbeb928..271490a51b965 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/prm.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/prm.h @@ -390,11 +390,6 @@ struct mlx5_ifc_definer_bits { u8 match_mask[0x160]; }; -struct mlx5_ifc_arg_bits { - u8 rsvd0[0x88]; - u8 access_pd[0x18]; -}; - struct mlx5_ifc_header_modify_pattern_in_bits { u8 modify_field_select[0x40]; @@ -428,11 +423,6 @@ struct mlx5_ifc_create_definer_in_bits { struct mlx5_ifc_definer_bits definer; }; -struct mlx5_ifc_create_arg_in_bits { - struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; - struct mlx5_ifc_arg_bits arg; -}; - struct mlx5_ifc_create_header_modify_pattern_in_bits { struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; struct mlx5_ifc_header_modify_pattern_in_bits pattern; @@ -479,36 +469,4 @@ enum { MLX5_IFC_MODIFY_FLOW_TABLE_MISS_ACTION_GOTO_TBL = 1, }; -struct mlx5_ifc_alloc_packet_reformat_out_bits { - u8 status[0x8]; - u8 reserved_at_8[0x18]; - - u8 syndrome[0x20]; - - u8 packet_reformat_id[0x20]; - - u8 reserved_at_60[0x20]; -}; - -struct mlx5_ifc_dealloc_packet_reformat_in_bits { - u8 opcode[0x10]; - u8 reserved_at_10[0x10]; - - u8 reserved_at_20[0x10]; - u8 op_mod[0x10]; - - u8 packet_reformat_id[0x20]; - - u8 reserved_at_60[0x20]; -}; - -struct mlx5_ifc_dealloc_packet_reformat_out_bits { - u8 status[0x8]; - u8 reserved_at_8[0x18]; - - u8 syndrome[0x20]; - - u8 reserved_at_40[0x40]; -}; - #endif /* MLX5_PRM_H_ */ From 0647f27a5facedf6842c67b9909a23f577bd3d08 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Thu, 2 Jan 2025 20:14:01 +0200 Subject: [PATCH 1185/1450] net/mlx5: HWS, remove implementation of unused FW commands Remove functions that manage alias objects - they are not used. Signed-off-by: Yevgeny Kliteynik Reviewed-by: Itamar Gozlan Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20250102181415.1477316-3-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlx5/core/steering/hws/cmd.c | 67 ------------------- .../mellanox/mlx5/core/steering/hws/cmd.h | 11 --- 2 files changed, 78 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c index 13689c0c1a440..6fd7747f08ec8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c @@ -889,73 +889,6 @@ int mlx5hws_cmd_sq_modify_rdy(struct mlx5_core_dev *mdev, u32 sqn) return ret; } -int mlx5hws_cmd_allow_other_vhca_access(struct mlx5_core_dev *mdev, - struct mlx5hws_cmd_allow_other_vhca_access_attr *attr) -{ - u32 out[MLX5_ST_SZ_DW(allow_other_vhca_access_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(allow_other_vhca_access_in)] = {0}; - void *key; - int ret; - - MLX5_SET(allow_other_vhca_access_in, - in, opcode, MLX5_CMD_OP_ALLOW_OTHER_VHCA_ACCESS); - MLX5_SET(allow_other_vhca_access_in, - in, object_type_to_be_accessed, attr->obj_type); - MLX5_SET(allow_other_vhca_access_in, - in, object_id_to_be_accessed, attr->obj_id); - - key = MLX5_ADDR_OF(allow_other_vhca_access_in, in, access_key); - memcpy(key, attr->access_key, sizeof(attr->access_key)); - - ret = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); - if (ret) - mlx5_core_err(mdev, "Failed to execute ALLOW_OTHER_VHCA_ACCESS command\n"); - - return ret; -} - -int mlx5hws_cmd_alias_obj_create(struct mlx5_core_dev *mdev, - struct mlx5hws_cmd_alias_obj_create_attr *alias_attr, - u32 *obj_id) -{ - u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {0}; - u32 in[MLX5_ST_SZ_DW(create_alias_obj_in)] = {0}; - void *attr; - void *key; - int ret; - - attr = MLX5_ADDR_OF(create_alias_obj_in, in, hdr); - MLX5_SET(general_obj_in_cmd_hdr, - attr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT); - MLX5_SET(general_obj_in_cmd_hdr, - attr, obj_type, alias_attr->obj_type); - MLX5_SET(general_obj_in_cmd_hdr, attr, op_param.create.alias_object, 1); - - attr = MLX5_ADDR_OF(create_alias_obj_in, in, alias_ctx); - MLX5_SET(alias_context, attr, vhca_id_to_be_accessed, alias_attr->vhca_id); - MLX5_SET(alias_context, attr, object_id_to_be_accessed, alias_attr->obj_id); - - key = MLX5_ADDR_OF(alias_context, attr, access_key); - memcpy(key, alias_attr->access_key, sizeof(alias_attr->access_key)); - - ret = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); - if (ret) { - mlx5_core_err(mdev, "Failed to create ALIAS OBJ\n"); - goto out; - } - - *obj_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); -out: - return ret; -} - -int mlx5hws_cmd_alias_obj_destroy(struct mlx5_core_dev *mdev, - u16 obj_type, - u32 obj_id) -{ - return hws_cmd_general_obj_destroy(mdev, obj_type, obj_id); -} - int mlx5hws_cmd_generate_wqe(struct mlx5_core_dev *mdev, struct mlx5hws_cmd_generate_wqe_attr *attr, struct mlx5_cqe64 *ret_cqe) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h index 434f62b0904ec..038f588907855 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h @@ -334,14 +334,6 @@ mlx5hws_cmd_forward_tbl_create(struct mlx5_core_dev *mdev, void mlx5hws_cmd_forward_tbl_destroy(struct mlx5_core_dev *mdev, struct mlx5hws_cmd_forward_tbl *tbl); -int mlx5hws_cmd_alias_obj_create(struct mlx5_core_dev *mdev, - struct mlx5hws_cmd_alias_obj_create_attr *alias_attr, - u32 *obj_id); - -int mlx5hws_cmd_alias_obj_destroy(struct mlx5_core_dev *mdev, - u16 obj_type, - u32 obj_id); - int mlx5hws_cmd_sq_modify_rdy(struct mlx5_core_dev *mdev, u32 sqn); int mlx5hws_cmd_query_caps(struct mlx5_core_dev *mdev, @@ -352,9 +344,6 @@ void mlx5hws_cmd_set_attr_connect_miss_tbl(struct mlx5hws_context *ctx, enum mlx5hws_table_type type, struct mlx5hws_cmd_ft_modify_attr *ft_attr); -int mlx5hws_cmd_allow_other_vhca_access(struct mlx5_core_dev *mdev, - struct mlx5hws_cmd_allow_other_vhca_access_attr *attr); - int mlx5hws_cmd_query_gvmi(struct mlx5_core_dev *mdev, bool other_function, u16 vport_number, u16 *gvmi); From 0a1ef807a403b2f386a571133eb35e25c6511808 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Thu, 2 Jan 2025 20:14:02 +0200 Subject: [PATCH 1186/1450] net/mlx5: HWS, denote how refcounts are protected Some HWS structs have refcounts that are just u32. Comment how they are protected and add '__must_hold()' annotation where applicable. Signed-off-by: Yevgeny Kliteynik Reviewed-by: Erez Shitrit Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20250102181415.1477316-4-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h | 2 +- .../net/ethernet/mellanox/mlx5/core/steering/hws/definer.h | 2 +- .../net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.h | 2 +- drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c | 3 ++- 5 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h index e8f562c318267..4669c9fbcfb25 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h @@ -70,12 +70,12 @@ struct mlx5hws_action_default_stc { struct mlx5hws_pool_chunk nop_dw6; struct mlx5hws_pool_chunk nop_dw7; struct mlx5hws_pool_chunk default_hit; - u32 refcount; + u32 refcount; /* protected by context ctrl lock */ }; struct mlx5hws_action_shared_stc { struct mlx5hws_pool_chunk stc_chunk; - u32 refcount; + u32 refcount; /* protected by context ctrl lock */ }; struct mlx5hws_actions_apply_data { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h index 038f588907855..610c63d81ad95 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h @@ -63,7 +63,7 @@ struct mlx5hws_cmd_forward_tbl { u8 type; u32 ft_id; u32 fg_id; - u32 refcount; + u32 refcount; /* protected by context ctrl lock */ }; struct mlx5hws_cmd_rtc_create_attr { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.h index 9432d5084def3..5c1a2086efbac 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.h @@ -785,7 +785,7 @@ struct mlx5hws_definer_cache { struct mlx5hws_definer_cache_item { struct mlx5hws_definer definer; - u32 refcount; + u32 refcount; /* protected by context ctrl lock */ struct list_head list_node; }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.h index 27ca93385b084..8ddb519800445 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.h @@ -31,7 +31,7 @@ struct mlx5hws_pattern_cache_item { u8 *data; u16 num_of_actions; } mh_data; - u32 refcount; + u32 refcount; /* protected by pattern_cache lock */ struct list_head ptrn_list_node; }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c index 9576e02d00c3d..5b183739d5fd0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c @@ -37,6 +37,7 @@ static void hws_table_set_cap_attr(struct mlx5hws_table *tbl, } static int hws_table_up_default_fdb_miss_tbl(struct mlx5hws_table *tbl) +__must_hold(&tbl->ctx->ctrl_lock) { struct mlx5hws_cmd_ft_create_attr ft_attr = {0}; struct mlx5hws_cmd_set_fte_attr fte_attr = {0}; @@ -70,7 +71,6 @@ static int hws_table_up_default_fdb_miss_tbl(struct mlx5hws_table *tbl) return -EINVAL; } - /* ctx->ctrl_lock must be held here */ ctx->common_res[tbl_type].default_miss = default_miss; ctx->common_res[tbl_type].default_miss->refcount++; @@ -79,6 +79,7 @@ static int hws_table_up_default_fdb_miss_tbl(struct mlx5hws_table *tbl) /* Called under ctx->ctrl_lock */ static void hws_table_down_default_fdb_miss_tbl(struct mlx5hws_table *tbl) +__must_hold(&tbl->ctx->ctrl_lock) { struct mlx5hws_cmd_forward_tbl *default_miss; struct mlx5hws_context *ctx = tbl->ctx; From c86963aae5b83a865a552408b40e743c3610bd9f Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Thu, 2 Jan 2025 20:14:03 +0200 Subject: [PATCH 1187/1450] net/mlx5: HWS, simplify allocations as we support only FDB In pools, STCs and actions: no need to allocate array for various table types, as HWS is used to manage only FDB flow tables. Signed-off-by: Yevgeny Kliteynik Reviewed-by: Erez Shitrit Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20250102181415.1477316-5-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlx5/core/steering/hws/action.c | 107 +++++++++--------- .../mellanox/mlx5/core/steering/hws/action.h | 2 +- .../mellanox/mlx5/core/steering/hws/cmd.c | 2 +- .../mellanox/mlx5/core/steering/hws/context.c | 29 ++--- .../mellanox/mlx5/core/steering/hws/context.h | 4 +- .../mellanox/mlx5/core/steering/hws/debug.c | 36 +++--- .../mellanox/mlx5/core/steering/hws/matcher.c | 4 +- .../mellanox/mlx5/core/steering/hws/rule.c | 2 +- .../mellanox/mlx5/core/steering/hws/table.c | 13 +-- 9 files changed, 87 insertions(+), 112 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c index a897cdc60fdbd..67d4f40cbd836 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c @@ -11,31 +11,29 @@ /* This is the longest supported action sequence for FDB table: * DECAP, POP_VLAN, MODIFY, CTR, ASO, PUSH_VLAN, MODIFY, ENCAP, Term. */ -static const u32 action_order_arr[MLX5HWS_TABLE_TYPE_MAX][MLX5HWS_ACTION_TYP_MAX] = { - [MLX5HWS_TABLE_TYPE_FDB] = { - BIT(MLX5HWS_ACTION_TYP_REMOVE_HEADER) | - BIT(MLX5HWS_ACTION_TYP_REFORMAT_TNL_L2_TO_L2) | - BIT(MLX5HWS_ACTION_TYP_REFORMAT_TNL_L3_TO_L2), - BIT(MLX5HWS_ACTION_TYP_POP_VLAN), - BIT(MLX5HWS_ACTION_TYP_POP_VLAN), - BIT(MLX5HWS_ACTION_TYP_MODIFY_HDR), - BIT(MLX5HWS_ACTION_TYP_PUSH_VLAN), - BIT(MLX5HWS_ACTION_TYP_PUSH_VLAN), - BIT(MLX5HWS_ACTION_TYP_INSERT_HEADER) | - BIT(MLX5HWS_ACTION_TYP_REFORMAT_L2_TO_TNL_L2) | - BIT(MLX5HWS_ACTION_TYP_REFORMAT_L2_TO_TNL_L3), - BIT(MLX5HWS_ACTION_TYP_CTR), - BIT(MLX5HWS_ACTION_TYP_TAG), - BIT(MLX5HWS_ACTION_TYP_ASO_METER), - BIT(MLX5HWS_ACTION_TYP_MODIFY_HDR), - BIT(MLX5HWS_ACTION_TYP_TBL) | - BIT(MLX5HWS_ACTION_TYP_VPORT) | - BIT(MLX5HWS_ACTION_TYP_DROP) | - BIT(MLX5HWS_ACTION_TYP_SAMPLER) | - BIT(MLX5HWS_ACTION_TYP_RANGE) | - BIT(MLX5HWS_ACTION_TYP_DEST_ARRAY), - BIT(MLX5HWS_ACTION_TYP_LAST), - }, +static const u32 action_order_arr[MLX5HWS_ACTION_TYP_MAX] = { + BIT(MLX5HWS_ACTION_TYP_REMOVE_HEADER) | + BIT(MLX5HWS_ACTION_TYP_REFORMAT_TNL_L2_TO_L2) | + BIT(MLX5HWS_ACTION_TYP_REFORMAT_TNL_L3_TO_L2), + BIT(MLX5HWS_ACTION_TYP_POP_VLAN), + BIT(MLX5HWS_ACTION_TYP_POP_VLAN), + BIT(MLX5HWS_ACTION_TYP_MODIFY_HDR), + BIT(MLX5HWS_ACTION_TYP_PUSH_VLAN), + BIT(MLX5HWS_ACTION_TYP_PUSH_VLAN), + BIT(MLX5HWS_ACTION_TYP_INSERT_HEADER) | + BIT(MLX5HWS_ACTION_TYP_REFORMAT_L2_TO_TNL_L2) | + BIT(MLX5HWS_ACTION_TYP_REFORMAT_L2_TO_TNL_L3), + BIT(MLX5HWS_ACTION_TYP_CTR), + BIT(MLX5HWS_ACTION_TYP_TAG), + BIT(MLX5HWS_ACTION_TYP_ASO_METER), + BIT(MLX5HWS_ACTION_TYP_MODIFY_HDR), + BIT(MLX5HWS_ACTION_TYP_TBL) | + BIT(MLX5HWS_ACTION_TYP_VPORT) | + BIT(MLX5HWS_ACTION_TYP_DROP) | + BIT(MLX5HWS_ACTION_TYP_SAMPLER) | + BIT(MLX5HWS_ACTION_TYP_RANGE) | + BIT(MLX5HWS_ACTION_TYP_DEST_ARRAY), + BIT(MLX5HWS_ACTION_TYP_LAST), }; static const char * const mlx5hws_action_type_str[] = { @@ -83,8 +81,8 @@ static int hws_action_get_shared_stc_nic(struct mlx5hws_context *ctx, int ret; mutex_lock(&ctx->ctrl_lock); - if (ctx->common_res[tbl_type].shared_stc[stc_type]) { - ctx->common_res[tbl_type].shared_stc[stc_type]->refcount++; + if (ctx->common_res.shared_stc[stc_type]) { + ctx->common_res.shared_stc[stc_type]->refcount++; mutex_unlock(&ctx->ctrl_lock); return 0; } @@ -124,8 +122,8 @@ static int hws_action_get_shared_stc_nic(struct mlx5hws_context *ctx, goto free_shared_stc; } - ctx->common_res[tbl_type].shared_stc[stc_type] = shared_stc; - ctx->common_res[tbl_type].shared_stc[stc_type]->refcount = 1; + ctx->common_res.shared_stc[stc_type] = shared_stc; + ctx->common_res.shared_stc[stc_type]->refcount = 1; mutex_unlock(&ctx->ctrl_lock); @@ -178,16 +176,16 @@ static void hws_action_put_shared_stc(struct mlx5hws_action *action, } mutex_lock(&ctx->ctrl_lock); - if (--ctx->common_res[tbl_type].shared_stc[stc_type]->refcount) { + if (--ctx->common_res.shared_stc[stc_type]->refcount) { mutex_unlock(&ctx->ctrl_lock); return; } - shared_stc = ctx->common_res[tbl_type].shared_stc[stc_type]; + shared_stc = ctx->common_res.shared_stc[stc_type]; mlx5hws_action_free_single_stc(ctx, tbl_type, &shared_stc->stc_chunk); kfree(shared_stc); - ctx->common_res[tbl_type].shared_stc[stc_type] = NULL; + ctx->common_res.shared_stc[stc_type] = NULL; mutex_unlock(&ctx->ctrl_lock); } @@ -206,10 +204,10 @@ bool mlx5hws_action_check_combo(struct mlx5hws_context *ctx, enum mlx5hws_action_type *user_actions, enum mlx5hws_table_type table_type) { - const u32 *order_arr = action_order_arr[table_type]; + const u32 *order_arr = action_order_arr; + bool valid_combo; u8 order_idx = 0; u8 user_idx = 0; - bool valid_combo; if (table_type >= MLX5HWS_TABLE_TYPE_MAX) { mlx5hws_err(ctx, "Invalid table_type %d", table_type); @@ -321,8 +319,8 @@ int mlx5hws_action_alloc_single_stc(struct mlx5hws_context *ctx, __must_hold(&ctx->ctrl_lock) { struct mlx5hws_cmd_stc_modify_attr cleanup_stc_attr = {0}; - struct mlx5hws_pool *stc_pool = ctx->stc_pool[table_type]; struct mlx5hws_cmd_stc_modify_attr fixup_stc_attr = {0}; + struct mlx5hws_pool *stc_pool = ctx->stc_pool; bool use_fixup; u32 obj_0_id; int ret; @@ -387,8 +385,8 @@ void mlx5hws_action_free_single_stc(struct mlx5hws_context *ctx, struct mlx5hws_pool_chunk *stc) __must_hold(&ctx->ctrl_lock) { - struct mlx5hws_pool *stc_pool = ctx->stc_pool[table_type]; struct mlx5hws_cmd_stc_modify_attr stc_attr = {0}; + struct mlx5hws_pool *stc_pool = ctx->stc_pool; u32 obj_id; /* Modify the STC not to point to an object */ @@ -561,7 +559,7 @@ hws_action_create_stcs(struct mlx5hws_action *action, u32 obj_id) if (action->flags & MLX5HWS_ACTION_FLAG_HWS_FDB) { ret = mlx5hws_action_alloc_single_stc(ctx, &stc_attr, MLX5HWS_TABLE_TYPE_FDB, - &action->stc[MLX5HWS_TABLE_TYPE_FDB]); + &action->stc); if (ret) goto out_err; } @@ -585,7 +583,7 @@ hws_action_destroy_stcs(struct mlx5hws_action *action) if (action->flags & MLX5HWS_ACTION_FLAG_HWS_FDB) mlx5hws_action_free_single_stc(ctx, MLX5HWS_TABLE_TYPE_FDB, - &action->stc[MLX5HWS_TABLE_TYPE_FDB]); + &action->stc); mutex_unlock(&ctx->ctrl_lock); } @@ -1639,8 +1637,8 @@ hws_action_create_dest_match_range_table(struct mlx5hws_context *ctx, rtc_attr.table_type = mlx5hws_table_get_res_fw_ft_type(MLX5HWS_TABLE_TYPE_FDB, false); /* STC is a single resource (obj_id), use any STC for the ID */ - stc_pool = ctx->stc_pool[MLX5HWS_TABLE_TYPE_FDB]; - default_stc = ctx->common_res[MLX5HWS_TABLE_TYPE_FDB].default_stc; + stc_pool = ctx->stc_pool; + default_stc = ctx->common_res.default_stc; obj_id = mlx5hws_pool_chunk_get_base_id(stc_pool, &default_stc->default_hit); rtc_attr.stc_base = obj_id; @@ -1731,7 +1729,7 @@ hws_action_create_dest_match_range_fill_table(struct mlx5hws_context *ctx, ste_attr.used_id_rtc_0 = &used_rtc_0_id; ste_attr.used_id_rtc_1 = &used_rtc_1_id; - common_res = &ctx->common_res[MLX5HWS_TABLE_TYPE_FDB]; + common_res = &ctx->common_res; /* init an empty match STE which will always hit */ ste_attr.wqe_ctrl = &wqe_ctrl; @@ -1750,7 +1748,7 @@ hws_action_create_dest_match_range_fill_table(struct mlx5hws_context *ctx, wqe_ctrl.stc_ix[MLX5HWS_ACTION_STC_IDX_CTRL] |= htonl(MLX5HWS_ACTION_STC_IDX_LAST_COMBO2 << 29); wqe_ctrl.stc_ix[MLX5HWS_ACTION_STC_IDX_HIT] = - htonl(hit_ft_action->stc[MLX5HWS_TABLE_TYPE_FDB].offset); + htonl(hit_ft_action->stc.offset); wqe_data_arr = (__force __be32 *)&range_wqe_data; @@ -1843,7 +1841,7 @@ mlx5hws_action_create_dest_match_range(struct mlx5hws_context *ctx, stc_attr.ste_table.match_definer_id = ctx->caps->trivial_match_definer; ret = mlx5hws_action_alloc_single_stc(ctx, &stc_attr, MLX5HWS_TABLE_TYPE_FDB, - &action->stc[MLX5HWS_TABLE_TYPE_FDB]); + &action->stc); if (ret) goto error_unlock; @@ -1970,8 +1968,8 @@ __must_hold(&ctx->ctrl_lock) struct mlx5hws_action_default_stc *default_stc; int ret; - if (ctx->common_res[tbl_type].default_stc) { - ctx->common_res[tbl_type].default_stc->refcount++; + if (ctx->common_res.default_stc) { + ctx->common_res.default_stc->refcount++; return 0; } @@ -2023,8 +2021,8 @@ __must_hold(&ctx->ctrl_lock) goto free_nop_dw7; } - ctx->common_res[tbl_type].default_stc = default_stc; - ctx->common_res[tbl_type].default_stc->refcount++; + ctx->common_res.default_stc = default_stc; + ctx->common_res.default_stc->refcount++; return 0; @@ -2046,9 +2044,7 @@ __must_hold(&ctx->ctrl_lock) { struct mlx5hws_action_default_stc *default_stc; - default_stc = ctx->common_res[tbl_type].default_stc; - - default_stc = ctx->common_res[tbl_type].default_stc; + default_stc = ctx->common_res.default_stc; if (--default_stc->refcount) return; @@ -2058,7 +2054,7 @@ __must_hold(&ctx->ctrl_lock) mlx5hws_action_free_single_stc(ctx, tbl_type, &default_stc->nop_dw5); mlx5hws_action_free_single_stc(ctx, tbl_type, &default_stc->nop_ctr); kfree(default_stc); - ctx->common_res[tbl_type].default_stc = NULL; + ctx->common_res.default_stc = NULL; } static void hws_action_modify_write(struct mlx5hws_send_engine *queue, @@ -2150,8 +2146,7 @@ hws_action_apply_stc(struct mlx5hws_actions_apply_data *apply, { struct mlx5hws_action *action = apply->rule_action[action_idx].action; - apply->wqe_ctrl->stc_ix[stc_idx] = - htonl(action->stc[apply->tbl_type].offset); + apply->wqe_ctrl->stc_ix[stc_idx] = htonl(action->stc.offset); } static void @@ -2181,7 +2176,7 @@ hws_action_setter_modify_header(struct mlx5hws_actions_apply_data *apply, rule_action = &apply->rule_action[setter->idx_double]; action = rule_action->action; - stc_idx = htonl(action->stc[apply->tbl_type].offset); + stc_idx = htonl(action->stc.offset); apply->wqe_ctrl->stc_ix[MLX5HWS_ACTION_STC_IDX_DW6] = stc_idx; apply->wqe_ctrl->stc_ix[MLX5HWS_ACTION_STC_IDX_DW7] = 0; @@ -2240,7 +2235,7 @@ hws_action_setter_insert_ptr(struct mlx5hws_actions_apply_data *apply, apply->wqe_data[MLX5HWS_ACTION_OFFSET_DW6] = 0; apply->wqe_data[MLX5HWS_ACTION_OFFSET_DW7] = htonl(arg_idx); - stc_idx = htonl(action->stc[apply->tbl_type].offset); + stc_idx = htonl(action->stc.offset); apply->wqe_ctrl->stc_ix[MLX5HWS_ACTION_STC_IDX_DW6] = stc_idx; apply->wqe_ctrl->stc_ix[MLX5HWS_ACTION_STC_IDX_DW7] = 0; @@ -2272,7 +2267,7 @@ hws_action_setter_tnl_l3_to_l2(struct mlx5hws_actions_apply_data *apply, apply->wqe_data[MLX5HWS_ACTION_OFFSET_DW6] = 0; apply->wqe_data[MLX5HWS_ACTION_OFFSET_DW7] = htonl(arg_idx); - stc_idx = htonl(action->stc[apply->tbl_type].offset); + stc_idx = htonl(action->stc.offset); apply->wqe_ctrl->stc_ix[MLX5HWS_ACTION_STC_IDX_DW6] = stc_idx; apply->wqe_ctrl->stc_ix[MLX5HWS_ACTION_STC_IDX_DW7] = 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h index 4669c9fbcfb25..6d1592c49e0c2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h @@ -124,7 +124,7 @@ struct mlx5hws_action { struct mlx5hws_context *ctx; union { struct { - struct mlx5hws_pool_chunk stc[MLX5HWS_TABLE_TYPE_MAX]; + struct mlx5hws_pool_chunk stc; union { struct { u32 pat_id; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c index 6fd7747f08ec8..9b71ff80831d7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c @@ -359,7 +359,7 @@ void mlx5hws_cmd_set_attr_connect_miss_tbl(struct mlx5hws_context *ctx, ft_attr->type = fw_ft_type; ft_attr->table_miss_action = MLX5_IFC_MODIFY_FLOW_TABLE_MISS_ACTION_GOTO_TBL; - default_miss_tbl = ctx->common_res[type].default_miss->ft_id; + default_miss_tbl = ctx->common_res.default_miss->ft_id; if (!default_miss_tbl) { pr_warn("HWS: no flow table ID for default miss\n"); return; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c index 4a8928f33bb99..9cda2774fd64a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c @@ -23,7 +23,6 @@ static int hws_context_pools_init(struct mlx5hws_context *ctx) struct mlx5hws_pool_attr pool_attr = {0}; u8 max_log_sz; int ret; - int i; ret = mlx5hws_pat_init_pattern_cache(&ctx->pattern_cache); if (ret) @@ -39,23 +38,17 @@ static int hws_context_pools_init(struct mlx5hws_context *ctx) max_log_sz = min(MLX5HWS_POOL_STC_LOG_SZ, ctx->caps->stc_alloc_log_max); pool_attr.alloc_log_sz = max(max_log_sz, ctx->caps->stc_alloc_log_gran); - for (i = 0; i < MLX5HWS_TABLE_TYPE_MAX; i++) { - pool_attr.table_type = i; - ctx->stc_pool[i] = mlx5hws_pool_create(ctx, &pool_attr); - if (!ctx->stc_pool[i]) { - mlx5hws_err(ctx, "Failed to allocate STC pool [%d]", i); - ret = -ENOMEM; - goto free_stc_pools; - } + pool_attr.table_type = MLX5HWS_TABLE_TYPE_FDB; + ctx->stc_pool = mlx5hws_pool_create(ctx, &pool_attr); + if (!ctx->stc_pool) { + mlx5hws_err(ctx, "Failed to allocate STC pool\n"); + ret = -ENOMEM; + goto uninit_cache; } return 0; -free_stc_pools: - for (i = 0; i < MLX5HWS_TABLE_TYPE_MAX; i++) - if (ctx->stc_pool[i]) - mlx5hws_pool_destroy(ctx->stc_pool[i]); - +uninit_cache: mlx5hws_definer_uninit_cache(ctx->definer_cache); uninit_pat_cache: mlx5hws_pat_uninit_pattern_cache(ctx->pattern_cache); @@ -64,12 +57,8 @@ static int hws_context_pools_init(struct mlx5hws_context *ctx) static void hws_context_pools_uninit(struct mlx5hws_context *ctx) { - int i; - - for (i = 0; i < MLX5HWS_TABLE_TYPE_MAX; i++) { - if (ctx->stc_pool[i]) - mlx5hws_pool_destroy(ctx->stc_pool[i]); - } + if (ctx->stc_pool) + mlx5hws_pool_destroy(ctx->stc_pool); mlx5hws_definer_uninit_cache(ctx->definer_cache); mlx5hws_pat_uninit_pattern_cache(ctx->pattern_cache); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h index 1c9cc4fba083d..38c3647444adc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h @@ -38,8 +38,8 @@ struct mlx5hws_context { struct mlx5_core_dev *mdev; struct mlx5hws_cmd_query_caps *caps; u32 pd_num; - struct mlx5hws_pool *stc_pool[MLX5HWS_TABLE_TYPE_MAX]; - struct mlx5hws_context_common_res common_res[MLX5HWS_TABLE_TYPE_MAX]; + struct mlx5hws_pool *stc_pool; + struct mlx5hws_context_common_res common_res; struct mlx5hws_pattern_cache *pattern_cache; struct mlx5hws_definer_cache *definer_cache; struct mutex ctrl_lock; /* control lock to protect the whole context */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c index 5b200b4bc1a87..60ada3143d60d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c @@ -368,9 +368,10 @@ static int hws_debug_dump_context_info(struct seq_file *f, struct mlx5hws_contex static int hws_debug_dump_context_stc_resource(struct seq_file *f, struct mlx5hws_context *ctx, - u32 tbl_type, struct mlx5hws_pool_resource *resource) { + u32 tbl_type = MLX5HWS_TABLE_TYPE_BASE + MLX5HWS_TABLE_TYPE_FDB; + seq_printf(f, "%d,0x%llx,%u,%u\n", MLX5HWS_DEBUG_RES_TYPE_CONTEXT_STC, HWS_PTR_TO_ID(ctx), @@ -382,31 +383,22 @@ static int hws_debug_dump_context_stc_resource(struct seq_file *f, static int hws_debug_dump_context_stc(struct seq_file *f, struct mlx5hws_context *ctx) { - struct mlx5hws_pool *stc_pool; - u32 table_type; + struct mlx5hws_pool *stc_pool = ctx->stc_pool; int ret; - int i; - - for (i = 0; i < MLX5HWS_TABLE_TYPE_MAX; i++) { - stc_pool = ctx->stc_pool[i]; - table_type = MLX5HWS_TABLE_TYPE_BASE + i; - if (!stc_pool) - continue; + if (!stc_pool) + return 0; - if (stc_pool->resource[0]) { - ret = hws_debug_dump_context_stc_resource(f, ctx, table_type, - stc_pool->resource[0]); - if (ret) - return ret; - } + if (stc_pool->resource[0]) { + ret = hws_debug_dump_context_stc_resource(f, ctx, stc_pool->resource[0]); + if (ret) + return ret; + } - if (i == MLX5HWS_TABLE_TYPE_FDB && stc_pool->mirror_resource[0]) { - ret = hws_debug_dump_context_stc_resource(f, ctx, table_type, - stc_pool->mirror_resource[0]); - if (ret) - return ret; - } + if (stc_pool->mirror_resource[0]) { + ret = hws_debug_dump_context_stc_resource(f, ctx, stc_pool->mirror_resource[0]); + if (ret) + return ret; } return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c index 1bb3a6f8c3cda..e40193f30c547 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c @@ -318,8 +318,8 @@ static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher, hws_matcher_set_rtc_attr_sz(matcher, &rtc_attr, rtc_type, false); /* STC is a single resource (obj_id), use any STC for the ID */ - stc_pool = ctx->stc_pool[tbl->type]; - default_stc = ctx->common_res[tbl->type].default_stc; + stc_pool = ctx->stc_pool; + default_stc = ctx->common_res.default_stc; obj_id = mlx5hws_pool_chunk_get_base_id(stc_pool, &default_stc->default_hit); rtc_attr.stc_base = obj_id; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c index e20c67a04203f..14f6307a1772a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c @@ -315,7 +315,7 @@ static void hws_rule_create_init(struct mlx5hws_rule *rule, /* Init default action apply */ apply->tbl_type = tbl->type; - apply->common_res = &ctx->common_res[tbl->type]; + apply->common_res = &ctx->common_res; apply->jump_to_action_stc = matcher->action_ste[0].stc.offset; apply->require_dep = 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c index 5b183739d5fd0..967d67ec10e37 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c @@ -49,8 +49,8 @@ __must_hold(&tbl->ctx->ctrl_lock) if (tbl->type != MLX5HWS_TABLE_TYPE_FDB) return 0; - if (ctx->common_res[tbl_type].default_miss) { - ctx->common_res[tbl_type].default_miss->refcount++; + if (ctx->common_res.default_miss) { + ctx->common_res.default_miss->refcount++; return 0; } @@ -71,8 +71,8 @@ __must_hold(&tbl->ctx->ctrl_lock) return -EINVAL; } - ctx->common_res[tbl_type].default_miss = default_miss; - ctx->common_res[tbl_type].default_miss->refcount++; + ctx->common_res.default_miss = default_miss; + ctx->common_res.default_miss->refcount++; return 0; } @@ -83,17 +83,16 @@ __must_hold(&tbl->ctx->ctrl_lock) { struct mlx5hws_cmd_forward_tbl *default_miss; struct mlx5hws_context *ctx = tbl->ctx; - u8 tbl_type = tbl->type; if (tbl->type != MLX5HWS_TABLE_TYPE_FDB) return; - default_miss = ctx->common_res[tbl_type].default_miss; + default_miss = ctx->common_res.default_miss; if (--default_miss->refcount) return; mlx5hws_cmd_forward_tbl_destroy(ctx->mdev, default_miss); - ctx->common_res[tbl_type].default_miss = NULL; + ctx->common_res.default_miss = NULL; } static int hws_table_connect_to_default_miss_tbl(struct mlx5hws_table *tbl, u32 ft_id) From cc611ab6c712eaa1ed3fd4321d91e66cfe3245a3 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Thu, 2 Jan 2025 20:14:04 +0200 Subject: [PATCH 1188/1450] net/mlx5: HWS, add error message on failure to move rules Add error message for failure to move rules from old matcher to new one during rehash. Signed-off-by: Yevgeny Kliteynik Reviewed-by: Itamar Gozlan Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20250102181415.1477316-6-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../ethernet/mellanox/mlx5/core/steering/hws/bwc.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c index baacf662c0ab8..af8ab8750c703 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c @@ -615,8 +615,12 @@ static int hws_bwc_matcher_move_all_simple(struct mlx5hws_bwc_matcher *bwc_match ret = hws_bwc_queue_poll(ctx, rule_attr.queue_id, &pending_rules[i], false); - if (unlikely(ret)) + if (unlikely(ret)) { + mlx5hws_err(ctx, + "Moving BWC rule failed during rehash (%d)\n", + ret); goto free_bwc_rules; + } } } } while (!all_done); @@ -629,8 +633,11 @@ static int hws_bwc_matcher_move_all_simple(struct mlx5hws_bwc_matcher *bwc_match mlx5hws_send_engine_flush_queue(&ctx->send_queue[queue_id]); ret = hws_bwc_queue_poll(ctx, queue_id, &pending_rules[i], true); - if (unlikely(ret)) + if (unlikely(ret)) { + mlx5hws_err(ctx, + "Moving BWC rule failed during rehash (%d)\n", ret); goto free_bwc_rules; + } } } From 1ce840c7a659aa53a31ef49f0271b4fd0dc10296 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Thu, 2 Jan 2025 20:14:05 +0200 Subject: [PATCH 1189/1450] net/mlx5: HWS, change error flow on matcher disconnect Currently, when firmware failure occurs during matcher disconnect flow, the error flow of the function reconnects the matcher back and returns an error, which continues running the calling function and eventually frees the matcher that is being disconnected. This leads to a case where we have a freed matcher on the matchers list, which in turn leads to use-after-free and eventual crash. This patch fixes that by not trying to reconnect the matcher back when some FW command fails during disconnect. Note that we're dealing here with FW error. We can't overcome this problem. This might lead to bad steering state (e.g. wrong connection between matchers), and will also lead to resource leakage, as it is the case with any other error handling during resource destruction. However, the goal here is to allow the driver to continue and not crash the machine with use-after-free error. Signed-off-by: Yevgeny Kliteynik Signed-off-by: Itamar Gozlan Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20250102181415.1477316-7-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlx5/core/steering/hws/matcher.c | 24 +++++++------------ 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c index e40193f30c547..fea2a945b0db4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c @@ -165,14 +165,14 @@ static int hws_matcher_disconnect(struct mlx5hws_matcher *matcher) next->match_ste.rtc_0_id, next->match_ste.rtc_1_id); if (ret) { - mlx5hws_err(tbl->ctx, "Failed to disconnect matcher\n"); - goto matcher_reconnect; + mlx5hws_err(tbl->ctx, "Fatal error, failed to disconnect matcher\n"); + return ret; } } else { ret = mlx5hws_table_connect_to_miss_table(tbl, tbl->default_miss.miss_tbl); if (ret) { - mlx5hws_err(tbl->ctx, "Failed to disconnect last matcher\n"); - goto matcher_reconnect; + mlx5hws_err(tbl->ctx, "Fatal error, failed to disconnect last matcher\n"); + return ret; } } @@ -180,27 +180,19 @@ static int hws_matcher_disconnect(struct mlx5hws_matcher *matcher) if (prev_ft_id == tbl->ft_id) { ret = mlx5hws_table_update_connected_miss_tables(tbl); if (ret) { - mlx5hws_err(tbl->ctx, "Fatal error, failed to update connected miss table\n"); - goto matcher_reconnect; + mlx5hws_err(tbl->ctx, + "Fatal error, failed to update connected miss table\n"); + return ret; } } ret = mlx5hws_table_ft_set_default_next_ft(tbl, prev_ft_id); if (ret) { mlx5hws_err(tbl->ctx, "Fatal error, failed to restore matcher ft default miss\n"); - goto matcher_reconnect; + return ret; } return 0; - -matcher_reconnect: - if (list_empty(&tbl->matchers_list) || !prev) - list_add(&matcher->list_node, &tbl->matchers_list); - else - /* insert after prev matcher */ - list_add(&matcher->list_node, &prev->list_node); - - return ret; } static void hws_matcher_set_rtc_attr_sz(struct mlx5hws_matcher *matcher, From ad4da6cc36ace35d80a292bfeaac49e63e9e26eb Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Thu, 2 Jan 2025 20:14:06 +0200 Subject: [PATCH 1190/1450] net/mlx5: HWS, remove wrong deletion of the miss table list Remove wrong cleanup of the old miss table list and simplify the error flow in the function. Signed-off-by: Yevgeny Kliteynik Reviewed-by: Mark Bloch Reviewed-by: Itamar Gozlan Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20250102181415.1477316-8-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/steering/hws/table.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c index 967d67ec10e37..ab12975312320 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c @@ -478,15 +478,9 @@ int mlx5hws_table_set_default_miss(struct mlx5hws_table *tbl, if (old_miss_tbl) list_del_init(&tbl->default_miss.next); - old_miss_tbl = tbl->default_miss.miss_tbl; - if (old_miss_tbl) - list_del_init(&old_miss_tbl->default_miss.head); - if (miss_tbl) list_add(&tbl->default_miss.next, &miss_tbl->default_miss.head); - mutex_unlock(&ctx->ctrl_lock); - return 0; out: mutex_unlock(&ctx->ctrl_lock); return ret; From 05e3c287b98795cf01d829d29841179cef3fb9ce Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Thu, 2 Jan 2025 20:14:07 +0200 Subject: [PATCH 1191/1450] net/mlx5: HWS, reduce memory consumption of a matcher struct Instead of having a large array of action templates allocated with kmalloc, have smaller array and allocate it with kvmalloc. The size of the array represents the max number of AT attach operations for the same matcher. This number is not expected to be very high. In any case, when the limit is reached, the next attempt to attach new AT will result in creation of a new matcher and moving all the rules to this matcher. Signed-off-by: Yevgeny Kliteynik Reviewed-by: Erez Shitrit Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20250102181415.1477316-9-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h | 8 +++++++- .../ethernet/mellanox/mlx5/core/steering/hws/matcher.c | 8 ++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h index 3d4965213b012..1d27638fa171c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h @@ -8,7 +8,13 @@ #define MLX5HWS_BWC_MATCHER_SIZE_LOG_STEP 1 #define MLX5HWS_BWC_MATCHER_REHASH_PERCENT_TH 70 #define MLX5HWS_BWC_MATCHER_REHASH_BURST_TH 32 -#define MLX5HWS_BWC_MATCHER_ATTACH_AT_NUM 255 + +/* Max number of AT attach operations for the same matcher. + * When the limit is reached, next attempt to attach new AT + * will result in creation of a new matcher and moving all + * the rules to this matcher. + */ +#define MLX5HWS_BWC_MATCHER_ATTACH_AT_NUM 8 #define MLX5HWS_BWC_MAX_ACTS 16 diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c index fea2a945b0db4..4419c72ad3149 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c @@ -999,9 +999,9 @@ hws_matcher_set_templates(struct mlx5hws_matcher *matcher, if (!matcher->mt) return -ENOMEM; - matcher->at = kcalloc(num_of_at + matcher->attr.max_num_of_at_attach, - sizeof(*matcher->at), - GFP_KERNEL); + matcher->at = kvcalloc(num_of_at + matcher->attr.max_num_of_at_attach, + sizeof(*matcher->at), + GFP_KERNEL); if (!matcher->at) { mlx5hws_err(ctx, "Failed to allocate action template array\n"); ret = -ENOMEM; @@ -1027,7 +1027,7 @@ hws_matcher_set_templates(struct mlx5hws_matcher *matcher, static void hws_matcher_unset_templates(struct mlx5hws_matcher *matcher) { - kfree(matcher->at); + kvfree(matcher->at); kfree(matcher->mt); } From 61fb92701b8ac9174857c417cfa988adc24e32c2 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Thu, 2 Jan 2025 20:14:08 +0200 Subject: [PATCH 1192/1450] net/mlx5: HWS, num_of_rules counter on matcher should be atomic Rule counter in matcher's struct is used in two places: 1. As heuristics to decide when the number of rules have crossed a certain percentage threshold and the matcher should be resized. We don't mind here if the number will be off by 1-2 due to concurrency. 2. When destroying matcher, the counter value is checked and the user is warned if it is not 0. Here we lock all the queues, so the counter will be correct. We don't need to always have *exact* number, but we do need this number to not be corrupted, which is what is happening when the counter isn't atomic, due to update by different threads. Signed-off-by: Yevgeny Kliteynik Reviewed-by: Erez Shitrit Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20250102181415.1477316-10-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlx5/core/steering/hws/bwc.c | 17 +++++++++++------ .../mellanox/mlx5/core/steering/hws/bwc.h | 2 +- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c index af8ab8750c703..40d688ed6153a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c @@ -152,6 +152,8 @@ mlx5hws_bwc_matcher_create(struct mlx5hws_table *table, if (!bwc_matcher) return NULL; + atomic_set(&bwc_matcher->num_of_rules, 0); + /* Check if the required match params can be all matched * in single STE, otherwise complex matcher is needed. */ @@ -199,10 +201,12 @@ int mlx5hws_bwc_matcher_destroy_simple(struct mlx5hws_bwc_matcher *bwc_matcher) int mlx5hws_bwc_matcher_destroy(struct mlx5hws_bwc_matcher *bwc_matcher) { - if (bwc_matcher->num_of_rules) + u32 num_of_rules = atomic_read(&bwc_matcher->num_of_rules); + + if (num_of_rules) mlx5hws_err(bwc_matcher->matcher->tbl->ctx, "BWC matcher destroy: matcher still has %d rules\n", - bwc_matcher->num_of_rules); + num_of_rules); mlx5hws_bwc_matcher_destroy_simple(bwc_matcher); @@ -309,7 +313,7 @@ static void hws_bwc_rule_list_add(struct mlx5hws_bwc_rule *bwc_rule, u16 idx) { struct mlx5hws_bwc_matcher *bwc_matcher = bwc_rule->bwc_matcher; - bwc_matcher->num_of_rules++; + atomic_inc(&bwc_matcher->num_of_rules); bwc_rule->bwc_queue_idx = idx; list_add(&bwc_rule->list_node, &bwc_matcher->rules[idx]); } @@ -318,7 +322,7 @@ static void hws_bwc_rule_list_remove(struct mlx5hws_bwc_rule *bwc_rule) { struct mlx5hws_bwc_matcher *bwc_matcher = bwc_rule->bwc_matcher; - bwc_matcher->num_of_rules--; + atomic_dec(&bwc_matcher->num_of_rules); list_del_init(&bwc_rule->list_node); } @@ -711,7 +715,8 @@ hws_bwc_matcher_rehash_size(struct mlx5hws_bwc_matcher *bwc_matcher) * Need to check again if we really need rehash. * If the reason for rehash was size, but not any more - skip rehash. */ - if (!hws_bwc_matcher_rehash_size_needed(bwc_matcher, bwc_matcher->num_of_rules)) + if (!hws_bwc_matcher_rehash_size_needed(bwc_matcher, + atomic_read(&bwc_matcher->num_of_rules))) return 0; /* Now we're done all the checking - do the rehash: @@ -804,7 +809,7 @@ int mlx5hws_bwc_rule_create_simple(struct mlx5hws_bwc_rule *bwc_rule, } /* check if number of rules require rehash */ - num_of_rules = bwc_matcher->num_of_rules; + num_of_rules = atomic_read(&bwc_matcher->num_of_rules); if (unlikely(hws_bwc_matcher_rehash_size_needed(bwc_matcher, num_of_rules))) { mutex_unlock(queue_lock); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h index 1d27638fa171c..06c2a30c0d4e7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h @@ -25,7 +25,7 @@ struct mlx5hws_bwc_matcher { u8 num_of_at; u16 priority; u8 size_log; - u32 num_of_rules; /* atomically accessed */ + atomic_t num_of_rules; struct list_head *rules; }; From 2f851d1702dcd1b7124aef1680a091ff3f2ef791 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Thu, 2 Jan 2025 20:14:09 +0200 Subject: [PATCH 1193/1450] net/mlx5: HWS, separate SQ that HWS uses from the usual traffic SQs Mark the HWS SQ as 'non_wire' so that 'Flow Update' flow won't mix with network traffic. Signed-off-by: Yevgeny Kliteynik Reviewed-by: Itamar Gozlan Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20250102181415.1477316-11-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c index 20fe126ffd221..c680b7f984e11 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c @@ -633,6 +633,7 @@ static int hws_send_ring_create_sq(struct mlx5_core_dev *mdev, u32 pdn, MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RST); MLX5_SET(sqc, sqc, flush_in_error_en, 1); + MLX5_SET(sqc, sqc, non_wire, 1); ts_format = mlx5_is_real_time_sq(mdev) ? MLX5_TIMESTAMP_FORMAT_REAL_TIME : MLX5_TIMESTAMP_FORMAT_FREE_RUNNING; From be482f1d10da781db9445d2753c1e3f1fd82babf Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Thu, 2 Jan 2025 20:14:10 +0200 Subject: [PATCH 1194/1450] net/mlx5: HWS, fix definer's HWS_SET32 macro for negative offset When bit offset for HWS_SET32 macro is negative, UBSAN complains about the shift-out-of-bounds: UBSAN: shift-out-of-bounds in drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c:177:2 shift exponent -8 is negative Fixes: 74a778b4a63f ("net/mlx5: HWS, added definers handling") Signed-off-by: Yevgeny Kliteynik Reviewed-by: Erez Shitrit Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20250102181415.1477316-12-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c index 8fe96eb76baff..10ece7df1cfaf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c @@ -70,7 +70,7 @@ u32 second_dw_mask = (mask) & ((1 << _bit_off) - 1); \ _HWS_SET32(p, (v) >> _bit_off, byte_off, 0, (mask) >> _bit_off); \ _HWS_SET32(p, (v) & second_dw_mask, (byte_off) + DW_SIZE, \ - (bit_off) % BITS_IN_DW, second_dw_mask); \ + (bit_off + BITS_IN_DW) % BITS_IN_DW, second_dw_mask); \ } else { \ _HWS_SET32(p, v, byte_off, (bit_off), (mask)); \ } \ From a105db854cf2e495caaa17f00ac0321b503def9b Mon Sep 17 00:00:00 2001 From: Vlad Dogaru Date: Thu, 2 Jan 2025 20:14:11 +0200 Subject: [PATCH 1195/1450] net/mlx5: HWS, handle returned error value in pool alloc Handle all negative return values as errors, not just -1. The code previously treated -ENOMEM (and potentially other negative values) as valid segment numbers, leading to incorrect behavior. This fix ensures that any negative return value is treated as an error. Signed-off-by: Vlad Dogaru Signed-off-by: Yevgeny Kliteynik Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20250102181415.1477316-13-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c index fed2d913f3b89..50a81d360bb2f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c @@ -183,7 +183,7 @@ static int hws_pool_buddy_get_mem_chunk(struct mlx5hws_pool *pool, *seg = -1; /* Find the next free place from the buddy array */ - while (*seg == -1) { + while (*seg < 0) { for (i = 0; i < MLX5HWS_POOL_RESOURCE_ARR_SZ; i++) { buddy = hws_pool_buddy_get_next_buddy(pool, i, order, @@ -194,7 +194,7 @@ static int hws_pool_buddy_get_mem_chunk(struct mlx5hws_pool *pool, } *seg = mlx5hws_buddy_alloc_mem(buddy, order); - if (*seg != -1) + if (*seg >= 0) goto found; if (pool->flags & MLX5HWS_POOL_FLAGS_ONE_RESOURCE) { From 85ab9ea32548c0fff1c8e07b4fbfc185f615f9f1 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Thu, 2 Jan 2025 20:14:12 +0200 Subject: [PATCH 1196/1450] net/mlx5: HWS, use the right size when writing arg data When writing arg data, wrong size was used - fixing this. Signed-off-by: Yevgeny Kliteynik Reviewed-by: Itamar Gozlan Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20250102181415.1477316-14-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c index 06db5e4726aee..d9dc4f2d0dc68 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c @@ -344,7 +344,7 @@ void mlx5hws_arg_write(struct mlx5hws_send_engine *queue, mlx5hws_send_engine_post_req_wqe(&ctrl, (void *)&wqe_ctrl, &wqe_len); memset(wqe_ctrl, 0, wqe_len); mlx5hws_send_engine_post_req_wqe(&ctrl, (void *)&wqe_arg, &wqe_len); - memcpy(wqe_arg, arg_data, wqe_len); + memcpy(wqe_arg, arg_data, MLX5HWS_ARG_DATA_SIZE); send_attr.id = arg_idx++; mlx5hws_send_engine_post_end(&ctrl, &send_attr); From 663e61225c4019441cd5c9d3cc35dfc293271482 Mon Sep 17 00:00:00 2001 From: Vlad Dogaru Date: Thu, 2 Jan 2025 20:14:13 +0200 Subject: [PATCH 1197/1450] net/mlx5: HWS, support flow sampler destination Since sampler isn't currently supported via HWS, use a FW island that forwards any packets to the supplied sampler. Signed-off-by: Vlad Dogaru Signed-off-by: Yevgeny Kliteynik Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20250102181415.1477316-15-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlx5/core/steering/hws/action.c | 52 ++++++++++++++++++- .../mellanox/mlx5/core/steering/hws/action.h | 3 ++ .../mellanox/mlx5/core/steering/hws/cmd.c | 6 +++ 3 files changed, 60 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c index 67d4f40cbd836..b5332c54d4fb0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c @@ -471,6 +471,7 @@ static void hws_action_fill_stc_attr(struct mlx5hws_action *action, break; case MLX5HWS_ACTION_TYP_TBL: case MLX5HWS_ACTION_TYP_DEST_ARRAY: + case MLX5HWS_ACTION_TYP_SAMPLER: attr->action_type = MLX5_IFC_STC_ACTION_TYPE_JUMP_TO_FT; attr->action_offset = MLX5HWS_ACTION_OFFSET_HIT; attr->dest_table_id = obj_id; @@ -1873,7 +1874,50 @@ struct mlx5hws_action * mlx5hws_action_create_flow_sampler(struct mlx5hws_context *ctx, u32 sampler_id, u32 flags) { - mlx5hws_err(ctx, "Flow sampler action - unsupported\n"); + struct mlx5hws_cmd_ft_create_attr ft_attr = {0}; + struct mlx5hws_cmd_set_fte_attr fte_attr = {0}; + struct mlx5hws_cmd_forward_tbl *fw_island; + struct mlx5hws_cmd_set_fte_dest dest; + struct mlx5hws_action *action; + int ret; + + if (flags != (MLX5HWS_ACTION_FLAG_HWS_FDB | MLX5HWS_ACTION_FLAG_SHARED)) { + mlx5hws_err(ctx, "Unsupported flags for flow sampler\n"); + return NULL; + } + + ft_attr.type = FS_FT_FDB; + ft_attr.level = ctx->caps->fdb_ft.max_level - 1; + + dest.destination_type = MLX5_FLOW_DESTINATION_TYPE_FLOW_SAMPLER; + dest.destination_id = sampler_id; + + fte_attr.dests_num = 1; + fte_attr.dests = &dest; + fte_attr.action_flags = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + fte_attr.ignore_flow_level = 1; + + fw_island = mlx5hws_cmd_forward_tbl_create(ctx->mdev, &ft_attr, &fte_attr); + if (!fw_island) + return NULL; + + action = hws_action_create_generic(ctx, flags, + MLX5HWS_ACTION_TYP_SAMPLER); + if (!action) + goto destroy_fw_island; + + ret = hws_action_create_stcs(action, fw_island->ft_id); + if (ret) + goto free_action; + + action->flow_sampler.fw_island = fw_island; + + return action; + +free_action: + kfree(action); +destroy_fw_island: + mlx5hws_cmd_forward_tbl_destroy(ctx->mdev, fw_island); return NULL; } @@ -1912,6 +1956,11 @@ static void hws_action_destroy_hws(struct mlx5hws_action *action) } kfree(action->dest_array.dest_list); break; + case MLX5HWS_ACTION_TYP_SAMPLER: + hws_action_destroy_stcs(action); + mlx5hws_cmd_forward_tbl_destroy(action->ctx->mdev, + action->flow_sampler.fw_island); + break; case MLX5HWS_ACTION_TYP_REFORMAT_TNL_L3_TO_L2: case MLX5HWS_ACTION_TYP_MODIFY_HDR: shared_arg = false; @@ -2429,6 +2478,7 @@ int mlx5hws_action_template_process(struct mlx5hws_action_template *at) case MLX5HWS_ACTION_TYP_DROP: case MLX5HWS_ACTION_TYP_TBL: case MLX5HWS_ACTION_TYP_DEST_ARRAY: + case MLX5HWS_ACTION_TYP_SAMPLER: case MLX5HWS_ACTION_TYP_VPORT: case MLX5HWS_ACTION_TYP_MISS: /* Hit action */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h index 6d1592c49e0c2..64b76075f7f8f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h @@ -165,6 +165,9 @@ struct mlx5hws_action { size_t num_dest; struct mlx5hws_cmd_set_fte_dest *dest_list; } dest_array; + struct { + struct mlx5hws_cmd_forward_tbl *fw_island; + } flow_sampler; struct { u8 type; u8 start_anchor; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c index 9b71ff80831d7..487e75476b0ab 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c @@ -257,6 +257,12 @@ int mlx5hws_cmd_set_fte(struct mlx5_core_dev *mdev, dest->ext_reformat_id); } break; + case MLX5_FLOW_DESTINATION_TYPE_FLOW_SAMPLER: + MLX5_SET(dest_format, in_dests, + destination_type, ifc_dest_type); + MLX5_SET(dest_format, in_dests, destination_id, + dest->destination_id); + break; default: ret = -EOPNOTSUPP; goto out; From d74ee6e197a2c2c5b1697d737ccdcaf8cc6c199e Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Thu, 2 Jan 2025 20:14:14 +0200 Subject: [PATCH 1198/1450] net/mlx5: HWS, set timeout on polling for completion Consolidate BWC polling for completion into one function and set a time limit on the loop that polls for completion. This can happen only if there is some issue with FW/PCI/HW, such as FW being stuck, PCI issue, etc. Signed-off-by: Yevgeny Kliteynik Reviewed-by: Itamar Gozlan Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20250102181415.1477316-16-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlx5/core/steering/hws/bwc.c | 26 ++++++++++++------- .../mellanox/mlx5/core/steering/hws/bwc.h | 2 ++ 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c index 40d688ed6153a..a8d886e921448 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c @@ -219,6 +219,8 @@ static int hws_bwc_queue_poll(struct mlx5hws_context *ctx, u32 *pending_rules, bool drain) { + unsigned long timeout = jiffies + + msecs_to_jiffies(MLX5HWS_BWC_POLLING_TIMEOUT * MSEC_PER_SEC); struct mlx5hws_flow_op_result comp[MLX5HWS_BWC_MATCHER_REHASH_BURST_TH]; u16 burst_th = hws_bwc_get_burst_th(ctx, queue_id); bool got_comp = *pending_rules >= burst_th; @@ -254,6 +256,11 @@ static int hws_bwc_queue_poll(struct mlx5hws_context *ctx, } got_comp = !!ret; + + if (unlikely(!got_comp && time_after(jiffies, timeout))) { + mlx5hws_err(ctx, "BWC poll error: polling queue %d - TIMEOUT\n", queue_id); + return -ETIMEDOUT; + } } return err; @@ -338,22 +345,21 @@ hws_bwc_rule_destroy_hws_sync(struct mlx5hws_bwc_rule *bwc_rule, struct mlx5hws_rule_attr *rule_attr) { struct mlx5hws_context *ctx = bwc_rule->bwc_matcher->matcher->tbl->ctx; - struct mlx5hws_flow_op_result completion; + u32 expected_completions = 1; int ret; ret = hws_bwc_rule_destroy_hws_async(bwc_rule, rule_attr); if (unlikely(ret)) return ret; - do { - ret = mlx5hws_send_queue_poll(ctx, rule_attr->queue_id, &completion, 1); - } while (ret != 1); - - if (unlikely(completion.status != MLX5HWS_FLOW_OP_SUCCESS || - (bwc_rule->rule->status != MLX5HWS_RULE_STATUS_DELETED && - bwc_rule->rule->status != MLX5HWS_RULE_STATUS_DELETING))) { - mlx5hws_err(ctx, "Failed destroying BWC rule: completion %d, rule status %d\n", - completion.status, bwc_rule->rule->status); + ret = hws_bwc_queue_poll(ctx, rule_attr->queue_id, &expected_completions, true); + if (unlikely(ret)) + return ret; + + if (unlikely(bwc_rule->rule->status != MLX5HWS_RULE_STATUS_DELETED && + bwc_rule->rule->status != MLX5HWS_RULE_STATUS_DELETING)) { + mlx5hws_err(ctx, "Failed destroying BWC rule: rule status %d\n", + bwc_rule->rule->status); return -EINVAL; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h index 06c2a30c0d4e7..f9f569131ddeb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h @@ -18,6 +18,8 @@ #define MLX5HWS_BWC_MAX_ACTS 16 +#define MLX5HWS_BWC_POLLING_TIMEOUT 60 + struct mlx5hws_bwc_matcher { struct mlx5hws_matcher *matcher; struct mlx5hws_match_template *mt; From c8dafb0e4398dacc362832098a04b97da3b0395b Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Fri, 3 Jan 2025 20:38:47 -0800 Subject: [PATCH 1199/1450] bnxt_en: Fix possible memory leak when hwrm_req_replace fails When hwrm_req_replace() fails, the driver is not invoking bnxt_req_drop() which could cause a memory leak. Fixes: bbf33d1d9805 ("bnxt_en: update all firmware calls to use the new APIs") Reviewed-by: Pavan Chebbi Signed-off-by: Kalesh AP Signed-off-by: Michael Chan Link: https://patch.msgid.link/20250104043849.3482067-2-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c index b771c84cdd895..0ed26e3a28f45 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c @@ -208,7 +208,7 @@ int bnxt_send_msg(struct bnxt_en_dev *edev, rc = hwrm_req_replace(bp, req, fw_msg->msg, fw_msg->msg_len); if (rc) - return rc; + goto drop_req; hwrm_req_timeout(bp, req, fw_msg->timeout); resp = hwrm_req_hold(bp, req); @@ -220,6 +220,7 @@ int bnxt_send_msg(struct bnxt_en_dev *edev, memcpy(fw_msg->resp, resp, resp_len); } +drop_req: hwrm_req_drop(bp, req); return rc; } From 40452969a50652e3cbf89dac83d54eebf2206d27 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Fri, 3 Jan 2025 20:38:48 -0800 Subject: [PATCH 1200/1450] bnxt_en: Fix DIM shutdown DIM work will call the firmware to adjust the coalescing parameters on the RX rings. We should cancel DIM work before we call the firmware to free the RX rings. Otherwise, FW will reject the call from DIM work if the RX ring has been freed. This will generate an error message like this: bnxt_en 0000:21:00.1 ens2f1np1: hwrm req_type 0x53 seq id 0x6fca error 0x2 and cause unnecessary concern for the user. It is also possible to modify the coalescing parameters of the wrong ring if the ring has been re-allocated. To prevent this, cancel DIM work right before freeing the RX rings. We also have to add a check in NAPI poll to not schedule DIM if the RX rings are shutting down. Check that the VNIC is active before we schedule DIM. The VNIC is always disabled before we free the RX rings. Fixes: 0bc0b97fca73 ("bnxt_en: cleanup DIM work on device shutdown") Reviewed-by: Hongguang Gao Reviewed-by: Kalesh AP Reviewed-by: Somnath Kotur Signed-off-by: Michael Chan Link: https://patch.msgid.link/20250104043849.3482067-3-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 38 ++++++++++++++++++++--- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index b86f980fa7ea6..aeaa74f03046c 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -2897,6 +2897,13 @@ static int bnxt_hwrm_handler(struct bnxt *bp, struct tx_cmp *txcmp) return 0; } +static bool bnxt_vnic_is_active(struct bnxt *bp) +{ + struct bnxt_vnic_info *vnic = &bp->vnic_info[0]; + + return vnic->fw_vnic_id != INVALID_HW_RING_ID && vnic->mru > 0; +} + static irqreturn_t bnxt_msix(int irq, void *dev_instance) { struct bnxt_napi *bnapi = dev_instance; @@ -3164,7 +3171,7 @@ static int bnxt_poll(struct napi_struct *napi, int budget) break; } } - if (bp->flags & BNXT_FLAG_DIM) { + if ((bp->flags & BNXT_FLAG_DIM) && bnxt_vnic_is_active(bp)) { struct dim_sample dim_sample = {}; dim_update_sample(cpr->event_ctr, @@ -3295,7 +3302,7 @@ static int bnxt_poll_p5(struct napi_struct *napi, int budget) poll_done: cpr_rx = &cpr->cp_ring_arr[0]; if (cpr_rx->cp_ring_type == BNXT_NQ_HDL_TYPE_RX && - (bp->flags & BNXT_FLAG_DIM)) { + (bp->flags & BNXT_FLAG_DIM) && bnxt_vnic_is_active(bp)) { struct dim_sample dim_sample = {}; dim_update_sample(cpr->event_ctr, @@ -7266,6 +7273,26 @@ static int bnxt_hwrm_ring_alloc(struct bnxt *bp) return rc; } +static void bnxt_cancel_dim(struct bnxt *bp) +{ + int i; + + /* DIM work is initialized in bnxt_enable_napi(). Proceed only + * if NAPI is enabled. + */ + if (!bp->bnapi || test_bit(BNXT_STATE_NAPI_DISABLED, &bp->state)) + return; + + /* Make sure NAPI sees that the VNIC is disabled */ + synchronize_net(); + for (i = 0; i < bp->rx_nr_rings; i++) { + struct bnxt_rx_ring_info *rxr = &bp->rx_ring[i]; + struct bnxt_napi *bnapi = rxr->bnapi; + + cancel_work_sync(&bnapi->cp_ring.dim.work); + } +} + static int hwrm_ring_free_send_msg(struct bnxt *bp, struct bnxt_ring_struct *ring, u32 ring_type, int cmpl_ring_id) @@ -7366,6 +7393,7 @@ static void bnxt_hwrm_ring_free(struct bnxt *bp, bool close_path) } } + bnxt_cancel_dim(bp); for (i = 0; i < bp->rx_nr_rings; i++) { bnxt_hwrm_rx_ring_free(bp, &bp->rx_ring[i], close_path); bnxt_hwrm_rx_agg_ring_free(bp, &bp->rx_ring[i], close_path); @@ -11309,8 +11337,6 @@ static void bnxt_disable_napi(struct bnxt *bp) if (bnapi->in_reset) cpr->sw_stats->rx.rx_resets++; napi_disable(&bnapi->napi); - if (bnapi->rx_ring) - cancel_work_sync(&cpr->dim.work); } } @@ -15572,8 +15598,10 @@ static int bnxt_queue_stop(struct net_device *dev, void *qmem, int idx) bnxt_hwrm_vnic_update(bp, vnic, VNIC_UPDATE_REQ_ENABLES_MRU_VALID); } - + /* Make sure NAPI sees that the VNIC is disabled */ + synchronize_net(); rxr = &bp->rx_ring[idx]; + cancel_work_sync(&rxr->bnapi->cp_ring.dim.work); bnxt_hwrm_rx_ring_free(bp, rxr, false); bnxt_hwrm_rx_agg_ring_free(bp, rxr, false); rxr->rx_next_cons = 0; From 4c1224501e9d6c5fd12d83752f1c1b444e0e3418 Mon Sep 17 00:00:00 2001 From: Anumula Murali Mohan Reddy Date: Fri, 3 Jan 2025 14:53:27 +0530 Subject: [PATCH 1201/1450] cxgb4: Avoid removal of uninserted tid During ARP failure, tid is not inserted but _c4iw_free_ep() attempts to remove tid which results in error. This patch fixes the issue by avoiding removal of uninserted tid. Fixes: 59437d78f088 ("cxgb4/chtls: fix ULD connection failures due to wrong TID base") Signed-off-by: Anumula Murali Mohan Reddy Signed-off-by: Potnuri Bharat Teja Link: https://patch.msgid.link/20250103092327.1011925-1-anumula@chelsio.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index bc3af00544064..604dcfd49aa4c 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -1799,7 +1799,10 @@ void cxgb4_remove_tid(struct tid_info *t, unsigned int chan, unsigned int tid, struct adapter *adap = container_of(t, struct adapter, tids); struct sk_buff *skb; - WARN_ON(tid_out_of_range(&adap->tids, tid)); + if (tid_out_of_range(&adap->tids, tid)) { + dev_err(adap->pdev_dev, "tid %d out of range\n", tid); + return; + } if (t->tid_tab[tid - adap->tids.tid_base]) { t->tid_tab[tid - adap->tids.tid_base] = NULL; From 21a8a77abb4c5b472072a2f695f89a98c8af1654 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sat, 4 Jan 2025 15:20:43 +0100 Subject: [PATCH 1202/1450] nfc: st21nfca: Drop unneeded null check in st21nfca_tx_work() Variable 'info' is obtained via container_of() of struct work_struct, so it cannot be NULL. Simplify the code and solve Smatch warning: drivers/nfc/st21nfca/dep.c:119 st21nfca_tx_work() warn: can 'info' even be NULL? Signed-off-by: Krzysztof Kozlowski Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250104142043.116045-1-krzysztof.kozlowski@linaro.org Signed-off-by: Jakub Kicinski --- drivers/nfc/st21nfca/dep.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/drivers/nfc/st21nfca/dep.c b/drivers/nfc/st21nfca/dep.c index 1ec651e31064b..3425b68f0ddcf 100644 --- a/drivers/nfc/st21nfca/dep.c +++ b/drivers/nfc/st21nfca/dep.c @@ -116,18 +116,16 @@ static void st21nfca_tx_work(struct work_struct *work) struct nfc_dev *dev; struct sk_buff *skb; - if (info) { - dev = info->hdev->ndev; - skb = info->dep_info.tx_pending; + dev = info->hdev->ndev; + skb = info->dep_info.tx_pending; - device_lock(&dev->dev); + device_lock(&dev->dev); - nfc_hci_send_cmd_async(info->hdev, ST21NFCA_RF_READER_F_GATE, - ST21NFCA_WR_XCHG_DATA, skb->data, skb->len, - info->async_cb, info); - device_unlock(&dev->dev); - kfree_skb(skb); - } + nfc_hci_send_cmd_async(info->hdev, ST21NFCA_RF_READER_F_GATE, + ST21NFCA_WR_XCHG_DATA, skb->data, skb->len, + info->async_cb, info); + device_unlock(&dev->dev); + kfree_skb(skb); } static void st21nfca_im_send_pdu(struct st21nfca_hci_info *info, From 49afc040f4d707a4149a05180edc42bc590641a4 Mon Sep 17 00:00:00 2001 From: Nihar Chaithanya Date: Sat, 4 Jan 2025 22:49:15 +0530 Subject: [PATCH 1203/1450] octeontx2-pf: mcs: Remove dead code and semi-colon from rsrc_name() Every case in the switch-block ends with return statement, and the default: branch handles the cases where rsrc_type is invalid and returns "Unknown", this makes the return statement at the end of the function unreachable and redundant. The semi-colon is not required after the switch-block's curly braces. Remove the semi-colon after the switch-block's curly braces and the return statement at the end of the function. This issue was reported by Coverity Scan. Signed-off-by: Nihar Chaithanya Link: https://patch.msgid.link/20250104171905.13293-1-niharchaithanya@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c index 6cc7a78968fc1..f3b9daffaec3c 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c @@ -133,9 +133,7 @@ static const char *rsrc_name(enum mcs_rsrc_type rsrc_type) return "SA"; default: return "Unknown"; - }; - - return "Unknown"; + } } static int cn10k_mcs_alloc_rsrc(struct otx2_nic *pfvf, enum mcs_direction dir, From 51cfbed198ca4aa140babde816387db0e71f09e7 Mon Sep 17 00:00:00 2001 From: Furong Xu <0x1207@gmail.com> Date: Fri, 3 Jan 2025 17:37:33 +0800 Subject: [PATCH 1204/1450] net: stmmac: Set dma_sync_size to zero for discarded frames If a frame is going to be discarded by driver, this frame is never touched by driver and the cache lines never become dirty obviously, page_pool_recycle_direct() wastes CPU cycles on unnecessary calling of page_pool_dma_sync_for_device() to sync entire frame. page_pool_put_page() with sync_size setting to 0 is the proper method. Signed-off-by: Furong Xu <0x1207@gmail.com> Link: https://patch.msgid.link/20250103093733.3872939-1-0x1207@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 2f518ec845ecf..70c8f60d34f2c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -5467,7 +5467,7 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue) if (priv->extend_desc) stmmac_rx_extended_status(priv, &priv->xstats, rx_q->dma_erx + entry); if (unlikely(status == discard_frame)) { - page_pool_recycle_direct(rx_q->page_pool, buf->page); + page_pool_put_page(rx_q->page_pool, buf->page, 0, true); buf->page = NULL; error = 1; if (!priv->hwts_rx_en) From 912d6f6697251b0024e56ed24b7873b4800822e7 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 3 Jan 2025 06:31:14 -0500 Subject: [PATCH 1205/1450] selftests/net: packetdrill: report benign debug flakes as xfail A few recently added packetdrill tests that are known time sensitive (e.g., because testing timestamping) occasionally fail in debug mode: https://netdev.bots.linux.dev/contest.html?executor=vmksft-packetdrill-dbg These failures are well understood. Correctness of the tests is verified in non-debug mode. Continue running in debug mode also, to keep coverage with debug instrumentation. But, only in debug mode, mark these tests with well understood timing issues as XFAIL (known failing) rather than FAIL when failing. Introduce an allow list xfail_list with known cases. Expand the ktap infrastructure with XFAIL support. Fixes: eab35989cc37 ("selftests/net: packetdrill: import tcp/fast_recovery, tcp/nagle, tcp/timestamping") Reported-by: Jakub Kicinski Closes: https://lore.kernel.org/netdev/20241218100013.0c698629@kernel.org/ Signed-off-by: Willem de Bruijn Link: https://patch.msgid.link/20250103113142.129251-1-willemdebruijn.kernel@gmail.com Signed-off-by: Paolo Abeni --- .../selftests/kselftest/ktap_helpers.sh | 15 ++++++++++-- .../selftests/net/packetdrill/ksft_runner.sh | 23 +++++++++++++++---- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/kselftest/ktap_helpers.sh b/tools/testing/selftests/kselftest/ktap_helpers.sh index 79a125eb24c2e..05a4618906719 100644 --- a/tools/testing/selftests/kselftest/ktap_helpers.sh +++ b/tools/testing/selftests/kselftest/ktap_helpers.sh @@ -7,6 +7,7 @@ KTAP_TESTNO=1 KTAP_CNT_PASS=0 KTAP_CNT_FAIL=0 +KTAP_CNT_XFAIL=0 KTAP_CNT_SKIP=0 KSFT_PASS=0 @@ -69,6 +70,16 @@ ktap_test_skip() { KTAP_CNT_SKIP=$((KTAP_CNT_SKIP+1)) } +ktap_test_xfail() { + description="$1" + + result="ok" + directive="XFAIL" + __ktap_test "$result" "$description" "$directive" + + KTAP_CNT_XFAIL=$((KTAP_CNT_XFAIL+1)) +} + ktap_test_fail() { description="$1" @@ -99,7 +110,7 @@ ktap_exit_fail_msg() { ktap_finished() { ktap_print_totals - if [ $((KTAP_CNT_PASS + KTAP_CNT_SKIP)) -eq "$KSFT_NUM_TESTS" ]; then + if [ $((KTAP_CNT_PASS + KTAP_CNT_SKIP + KTAP_CNT_XFAIL)) -eq "$KSFT_NUM_TESTS" ]; then exit "$KSFT_PASS" else exit "$KSFT_FAIL" @@ -107,5 +118,5 @@ ktap_finished() { } ktap_print_totals() { - echo "# Totals: pass:$KTAP_CNT_PASS fail:$KTAP_CNT_FAIL xfail:0 xpass:0 skip:$KTAP_CNT_SKIP error:0" + echo "# Totals: pass:$KTAP_CNT_PASS fail:$KTAP_CNT_FAIL xfail:$KTAP_CNT_XFAIL xpass:0 skip:$KTAP_CNT_SKIP error:0" } diff --git a/tools/testing/selftests/net/packetdrill/ksft_runner.sh b/tools/testing/selftests/net/packetdrill/ksft_runner.sh index 4071c133f29e8..ff989c325eefc 100755 --- a/tools/testing/selftests/net/packetdrill/ksft_runner.sh +++ b/tools/testing/selftests/net/packetdrill/ksft_runner.sh @@ -23,7 +23,7 @@ if [ $# -ne 1 ]; then ktap_exit_fail_msg "usage: $0