Skip to content

Commit

Permalink
redfishpower: adapt status polling interval
Browse files Browse the repository at this point in the history
Problem: The status polling interval is hard coded to 1 second.
This can result in an excessive number of polling messages being sent,
since some hardware is known to take 20-60 seconds to complete
a power operation.

As an example, on an HPE Cray Supercomputing EX chassis, powering on
a node takes around 50 seconds, while powering it off takes around 6 seconds.

Solution: Support a modified "exponential backoff" of the status polling
interval.  The modified algorithm is based on observations of how long it
typically takes to complete power operations on hardware.  The status
polling interval begins at one second, but it gets capped at 4 seconds.
  • Loading branch information
chu11 committed Apr 8, 2024
1 parent 8a8a848 commit ebb1c36
Showing 1 changed file with 49 additions and 6 deletions.
55 changes: 49 additions & 6 deletions src/redfishpower/redfishpower.c
Original file line number Diff line number Diff line change
Expand Up @@ -72,11 +72,7 @@ static zhashx_t *test_power_status;
/* Per documentation, wait incremental time then proceed if timeout < 0 */
#define INCREMENTAL_WAIT 500

/* in usec
*
* status polling interval of 1 second may seem long, but testing
* shows wait ranges from a few seconds to 20 seconds
*/
/* in usec */
#define STATUS_POLLING_INTERVAL_DEFAULT 1000000

#define MS_IN_SEC 1000
Expand Down Expand Up @@ -120,10 +116,13 @@ struct powermsg {
* timeout - when the overall power command times out
*
* delaystart - if message should be sent after a wait
*
* poll_count - number of poll attempts
*/
struct timeval start;
struct timeval timeout;
struct timeval delaystart;
int poll_count;

/* zlistx handle */
void *handle;
Expand Down Expand Up @@ -318,6 +317,7 @@ static struct powermsg *powermsg_create(CURLM *mh,
const char *postdata,
struct timeval *start,
long int delay_usec,
int poll_count,
int output_result,
int state)
{
Expand Down Expand Up @@ -363,6 +363,9 @@ static struct powermsg *powermsg_create(CURLM *mh,
waitdelay.tv_usec = delay_usec;
timeradd(&now, &waitdelay, &pm->delaystart);
}

pm->poll_count = poll_count;

return pm;
}

Expand Down Expand Up @@ -417,6 +420,7 @@ static struct powermsg *stat_cmd_plug(CURLM * mh,
NULL,
NULL,
0,
0,
output_result,
STATE_SEND_POWERCMD);
if (verbose > 1)
Expand Down Expand Up @@ -768,6 +772,7 @@ struct powermsg *power_cmd_plug(CURLM * mh,
postdata,
NULL,
0,
0,
OUTPUT_RESULT,
STATE_SEND_POWERCMD);
if (verbose > 1)
Expand Down Expand Up @@ -939,13 +944,50 @@ static void send_status_poll(struct powermsg *pm)
{
struct powermsg *nextpm;
char *path = NULL;
long int poll_delay;

get_path(CMD_STAT, pm->plugname, &path, NULL);
if (!path) {
printf("%s: %s path not set\n", pm->plugname, CMD_STAT);
return;
}

/* testing a range of hardware shows that the amount of time it
* takes to complete an on/off falls into two bands. Either it
* completes in the 2-5 second range OR it takes 20-60 seconds.
*
* Some example timings from an HPE Cray Supercomputing EX Chassis
*
* - Turn switch off - 1.18 seconds
* - Turn switch on - 4.5 seconds
* - Turn blade off - 1.18 seconds
* - Turn blade on - 3.76 seconds
* - Turn node off - 6.86 seconds
* - Turn node on - 54.53 seconds
*
* (achu: Going off memory, the Supermicro H12DSG-O-CPU took
* around 20 seconds for on/off.)
*
* To get the best turn around time for the quick end of that range
* and avoid excessive polling on the other end, we will do a slightly
* altered 'exponential backoff' delay.
*
* We delay 1 second before each of the first 4 polls.
* We delay 2 seconds before each of the 5th and 6th polls.
* We delay 4 seconds before every poll afterwards.
*
* Special note: testing shows that powering on an already "on" node can
* also lead to a temporary entrance into the "PoweringOn" state.
* So we also want a quick turnaround for that case, which is
* typically only 1-2 seconds.
*/
if (pm->poll_count < 4)
poll_delay = status_polling_interval;
else if (pm->poll_count < 6)
poll_delay = status_polling_interval * 2;
else
poll_delay = status_polling_interval * 4;

/* issue a follow on stat to wait until the on/off is complete.
* note that we set the initial start time of this new command to
* the original on/off, so we can timeout correctly
Expand All @@ -961,7 +1003,8 @@ static void send_status_poll(struct powermsg *pm)
path,
NULL,
&pm->start,
status_polling_interval,
poll_delay,
pm->poll_count + 1,
OUTPUT_RESULT,
STATE_WAIT_UNTIL_ON_OFF);
if (!(nextpm->handle = zlistx_add_end(delayedcmds, nextpm)))
Expand Down

0 comments on commit ebb1c36

Please sign in to comment.