Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[nvidia][hsflowd] Fix Dropmon co-operation issues related to HW stop #73

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/sflow/hsflowd/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ $(addprefix $(DEST)/, $(MAIN_TARGET)): $(DEST)/% :
if [[ $(ENABLE_SFLOW_DROPMON) == y ]]; then
stg repair
stg import -s ../patch/dropmon/series

if [[ $(CONFIGURED_PLATFORM) == mellanox ]]; then
stg import ../patch/dropmon/0001-only-start-stop-sw-drops-for-nvidia.patch
fi
fi

mkdir -p debian
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
From 891ed872a95bc2a77c14bfe859b89ad3db21b7ad Mon Sep 17 00:00:00 2001
From: Vivek Reddy <vkarri@nvidia.com>
Date: Sat, 19 Aug 2023 00:01:28 +0000
Subject: [PATCH] only start/stop sw drops for nvidia

1) Don't start/stop hw drops for nvidia
2) Improve logging
3) Only start/stop sw drops if sw=on

Signed-off-by: vkarri <vkarri@nvidia.com>
---
src/Linux/mod_dropmon.c | 39 +++++++++++++++++++++------------------
1 file changed, 21 insertions(+), 18 deletions(-)

diff --git a/src/Linux/mod_dropmon.c b/src/Linux/mod_dropmon.c
index e8f26e5..869ea0c 100644
--- a/src/Linux/mod_dropmon.c
+++ b/src/Linux/mod_dropmon.c
@@ -143,7 +143,7 @@ extern "C" {
static void setState(EVMod *mod, EnumDropmonState newState) {
HSP_mod_DROPMON *mdata = (HSP_mod_DROPMON *)mod->data;
if(newState != mdata->state) {
- myDebug(1, "dropmon state %s -> %s",
+ myLog(LOG_INFO, "dropmon state %s -> %s",
HSPDropmonStateNames[mdata->state],
HSPDropmonStateNames[newState]);
mdata->state = newState;
@@ -405,42 +405,43 @@ That would allow everything to stay on the stack as it does here, which has nice

static int start_DROPMON(EVMod *mod, bool startIt)
{
+ HSP *sp = (HSP *)EVROOTDATA(mod);
HSP_mod_DROPMON *mdata = (HSP_mod_DROPMON *)mod->data;
setState(mod,
startIt
? HSP_DROPMON_STATE_START
: HSP_DROPMON_STATE_STOP);
-
+
+ if (!sp->dropmon.sw)
+ return 0;
+
struct nlmsghdr nlh = { };
struct genlmsghdr ge = { };
- struct nlattr attr1 = { };
- struct nlattr attr2 = { };
+ struct nlattr attr = { };

- attr1.nla_len = sizeof(attr1);
- attr1.nla_type = NET_DM_ATTR_SW_DROPS;
- attr2.nla_len = sizeof(attr2);
- attr2.nla_type = NET_DM_ATTR_HW_DROPS;
+ /* sflow should only control start/stop SW_DROPS, HW_DROPS is controlled by a different daemon for NVIDIA */
+ attr.nla_len = sizeof(attr);
+ attr.nla_type = NET_DM_ATTR_SW_DROPS;

ge.cmd = startIt
? NET_DM_CMD_START
: NET_DM_CMD_STOP;
ge.version = 1;

- nlh.nlmsg_len = NLMSG_LENGTH(sizeof(ge) + sizeof(attr1) + sizeof(attr2));
+ nlh.nlmsg_len = NLMSG_LENGTH(sizeof(ge) + sizeof(attr));
nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
nlh.nlmsg_type = mdata->family_id;
nlh.nlmsg_seq = ++mdata->nl_seq;
nlh.nlmsg_pid = UTNLGeneric_pid(mod->id);

- struct iovec iov[4] = {
+ struct iovec iov[3] = {
{ .iov_base = &nlh, .iov_len = sizeof(nlh) },
{ .iov_base = &ge, .iov_len = sizeof(ge) },
- { .iov_base = &attr1, .iov_len = sizeof(attr1) },
- { .iov_base = &attr2, .iov_len = sizeof(attr2) },
+ { .iov_base = &attr, .iov_len = sizeof(attr) },
};

struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
- struct msghdr msg = { .msg_name = &sa, .msg_namelen = sizeof(sa), .msg_iov = iov, .msg_iovlen = 4 };
+ struct msghdr msg = { .msg_name = &sa, .msg_namelen = sizeof(sa), .msg_iov = iov, .msg_iovlen = 3 };
return sendmsg(mdata->nl_sock, &msg, 0);
}

@@ -871,9 +872,10 @@ That would allow everything to stay on the stack as it does here, which has nice
mdata->state,
err_msg->error,
strerror(-err_msg->error));
- if(mdata->state == HSP_DROPMON_STATE_CONFIGURE
- || mdata->state == HSP_DROPMON_STATE_START)
+ if(mdata->state == HSP_DROPMON_STATE_START)
mdata->feedControlErrors++;
+ else if (mdata->state == HSP_DROPMON_STATE_CONFIGURE && err_msg->error == -EBUSY)
+ myLog(LOG_INFO, "Configuring DropMon Failed, Module is already in Monitoring State, Continue...");
}
break;
}
@@ -944,11 +946,11 @@ That would allow everything to stay on the stack as it does here, which has nice
// TODO: may want to confirm that none of the parameters were
// changed under our feet too?
if(mdata->feedControlErrors > 0) {
- myDebug(1, "dropmon: detected feed-control errors: %u", mdata->feedControlErrors);
- myDebug(1, "dropmon: assume external control - not stopping feed");
+ myLog(LOG_INFO, "dropmon: detected feed-control errors: %u", mdata->feedControlErrors);
+ myLog(LOG_INFO, "dropmon: assume external control - not stopping feed");
}
else {
- myDebug(1, "dropmon: graceful shutdown: turning off feed");
+ myLog(LOG_INFO, "dropmon: graceful shutdown: turning off feed");
start_DROPMON(mod, NO);
}
}
@@ -1022,6 +1024,7 @@ That would allow everything to stay on the stack as it does here, which has nice
// failure if the channel was already configured externally.
// TODO: should probably wait for answer before ploughing
// ahead with this start_DROPMON call.
+ // Configure SW_DROPS only if sp->dropmon.sw == YES
start_DROPMON(mod, YES);
break;
case HSP_DROPMON_STATE_START:
--
2.17.1