Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

redfishpower: adapt status polling interval #167

Merged
merged 1 commit into from
Apr 8, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 49 additions & 6 deletions src/redfishpower/redfishpower.c
Original file line number Diff line number Diff line change
Expand Up @@ -72,11 +72,7 @@ static zhashx_t *test_power_status;
/* Per documentation, wait incremental time then proceed if timeout < 0 */
#define INCREMENTAL_WAIT 500

/* in usec
*
* status polling interval of 1 second may seem long, but testing
* shows wait ranges from a few seconds to 20 seconds
*/
/* in usec */
#define STATUS_POLLING_INTERVAL_DEFAULT 1000000

#define MS_IN_SEC 1000
Expand Down Expand Up @@ -120,10 +116,13 @@ struct powermsg {
* timeout - when the overall power command times out
*
Comment on lines 116 to 117
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Commit message should say what hardware the observations were based on and perhaps have a bit more detail about the observations themselves? Without that context, when some hardware that behaves differently comes a long, we can't make an informed decision about how to move forward.

* delaystart - if message should be sent after a wait
*
* poll_count - number of poll attempts
*/
struct timeval start;
struct timeval timeout;
struct timeval delaystart;
int poll_count;

/* zlistx handle */
void *handle;
Expand Down Expand Up @@ -318,6 +317,7 @@ static struct powermsg *powermsg_create(CURLM *mh,
const char *postdata,
struct timeval *start,
long int delay_usec,
int poll_count,
int output_result,
int state)
{
Expand Down Expand Up @@ -363,6 +363,9 @@ static struct powermsg *powermsg_create(CURLM *mh,
waitdelay.tv_usec = delay_usec;
timeradd(&now, &waitdelay, &pm->delaystart);
}

pm->poll_count = poll_count;

return pm;
}

Expand Down Expand Up @@ -417,6 +420,7 @@ static struct powermsg *stat_cmd_plug(CURLM * mh,
NULL,
NULL,
0,
0,
output_result,
STATE_SEND_POWERCMD);
if (verbose > 1)
Expand Down Expand Up @@ -768,6 +772,7 @@ struct powermsg *power_cmd_plug(CURLM * mh,
postdata,
NULL,
0,
0,
OUTPUT_RESULT,
STATE_SEND_POWERCMD);
if (verbose > 1)
Expand Down Expand Up @@ -939,13 +944,50 @@ static void send_status_poll(struct powermsg *pm)
{
struct powermsg *nextpm;
char *path = NULL;
long int poll_delay;

get_path(CMD_STAT, pm->plugname, &path, NULL);
if (!path) {
printf("%s: %s path not set\n", pm->plugname, CMD_STAT);
return;
}

/* testing a range of hardware shows that the amount of time it
* takes to complete an on/off falls into two bands. Either it
* completes in the 2-8 second range OR it takes 20-60 seconds.
*
* Some example timings from a HPE Cray Supercomputing EX Chassis
*
* - Turn switch off - 1.18 seconds
* - Turn switch on - 4.5 seconds
* - Turn blade off - 1.18 seconds
* - Turn blade on - 3.76 seconds
* - Turn node off - 6.86 seconds
* - Turn node on - 54.53 seconds
*
* (achu: Going off memory, the Supermicro H12DSG-O-CPU took
* around 20 seconds for on/off.)
*
* To get the best turn around time for the quick end of that range
* and avoid excessive polling on the other end, we will do a slightly
* altered 'exponential backoff' delay.
*
* We delay 1 second each of the first 4 polls.
* We delay 2 seconds for the 5th and 6th poll.
* We delay 4 seconds afterwards.
*
* Special note, testing shows that powering on a "on" node can
* also lead to a temporary entrance into the "PoweringOn" state.
* So we also want a quick turnaround for that case, which is
* typically only 1-2 seconds.
*/
if (pm->poll_count < 4)
poll_delay = status_polling_interval;
else if (pm->poll_count < 6)
poll_delay = status_polling_interval * 2;
else
poll_delay = status_polling_interval * 4;

/* issue a follow on stat to wait until the on/off is complete.
* note that we set the initial start time of this new command to
* the original on/off, so we can timeout correctly
Expand All @@ -961,7 +1003,8 @@ static void send_status_poll(struct powermsg *pm)
path,
NULL,
&pm->start,
status_polling_interval,
poll_delay,
pm->poll_count + 1,
OUTPUT_RESULT,
STATE_WAIT_UNTIL_ON_OFF);
if (!(nextpm->handle = zlistx_add_end(delayedcmds, nextpm)))
Expand Down
Loading