Skip to content

Commit c9c442a

Browse files
authored
Bootloop detection & recovery (#4793)
* added boot loop detection and config backup * automatic OTA rollback if loading backup does not fix it * added new file handling functions * adding verification of json files, added config restore at bootup if broken * added function to compare contents of two files for future use (currently not used)
1 parent b8b59b2 commit c9c442a

File tree

6 files changed

+327
-2
lines changed

6 files changed

+327
-2
lines changed

wled00/cfg.cpp

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -772,9 +772,30 @@ bool deserializeConfig(JsonObject doc, bool fromFS) {
772772
return (doc["sv"] | true);
773773
}
774774

775-
776775
static const char s_cfg_json[] PROGMEM = "/cfg.json";
777776

777+
bool backupConfig() {
778+
return backupFile(s_cfg_json);
779+
}
780+
781+
bool restoreConfig() {
782+
return restoreFile(s_cfg_json);
783+
}
784+
785+
bool verifyConfig() {
786+
return validateJsonFile(s_cfg_json);
787+
}
788+
789+
// rename config file and reboot
790+
void resetConfig() {
791+
DEBUG_PRINTLN(F("Reset config"));
792+
char backupname[32];
793+
strcpy(backupname, s_cfg_json);
794+
strcat(backupname, ".rst.json");
795+
WLED_FS.rename(s_cfg_json, backupname);
796+
doReboot = true;
797+
}
798+
778799
bool deserializeConfigFromFS() {
779800
[[maybe_unused]] bool success = deserializeConfigSec();
780801
#ifdef WLED_ADD_EEPROM_SUPPORT
@@ -800,6 +821,7 @@ bool deserializeConfigFromFS() {
800821

801822
void serializeConfigToFS() {
802823
serializeConfigSec();
824+
backupConfig(); // backup before writing new config
803825

804826
DEBUG_PRINTLN(F("Writing settings to /cfg.json..."));
805827

wled00/fcn_declare.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@ void handleIO();
2424
void IRAM_ATTR touchButtonISR();
2525

2626
//cfg.cpp
27+
bool backupConfig(); // copy /cfg.json to its backup file (only if it is valid JSON); true on success
28+
bool restoreConfig(); // copy the backup back to /cfg.json (only if the backup exists and is valid JSON); true on success
29+
bool verifyConfig(); // true if /cfg.json exists and parses as JSON
30+
void resetConfig(); // rename /cfg.json out of the way and set the reboot flag
2731
bool deserializeConfig(JsonObject doc, bool fromFS = false);
2832
bool deserializeConfigFromFS();
2933
bool deserializeConfigSec();
@@ -223,6 +227,11 @@ inline bool writeObjectToFileUsingId(const String &file, uint16_t id, const Json
223227
inline bool writeObjectToFile(const String &file, const char* key, const JsonDocument* content) { return writeObjectToFile(file.c_str(), key, content); }; // String convenience overload, forwards to the const char* version
224228
inline bool readObjectFromFileUsingId(const String &file, uint16_t id, JsonDocument* dest, const JsonDocument* filter = nullptr) { return readObjectFromFileUsingId(file.c_str(), id, dest); }; // NOTE(review): 'filter' is accepted but not forwarded to the const char* version - confirm this is intended
225229
inline bool readObjectFromFile(const String &file, const char* key, JsonDocument* dest, const JsonDocument* filter = nullptr) { return readObjectFromFile(file.c_str(), key, dest); }; // NOTE(review): 'filter' is accepted but not forwarded to the const char* version - confirm this is intended
230+
bool copyFile(const char* src_path, const char* dst_path); // copy src to dst in small blocks; dst is removed again if the copy fails
231+
bool backupFile(const char* filename); // copy a valid JSON file "/name" to "/bkp.name"; true on success
232+
bool restoreFile(const char* filename); // copy "/bkp.name" back to "/name" if the backup exists and is valid JSON; true on success
233+
bool validateJsonFile(const char* filename); // true if the file exists, can be opened and parses as JSON
234+
void dumpFilesToSerial(); // print the .json files of the root directory to Serial, skipping names starting with "wsec"
226235

227236
//hue.cpp
228237
void handleHue();
@@ -580,6 +589,10 @@ extern "C" {
580589
#define d_free free
581590
#endif
582591

592+
void handleBootLoop(); // detect bootloops; if one is detected, apply the next recovery action and restart
593+
#ifndef ESP8266
594+
void bootloopCheckOTA(); // set the next bootloop action to "swap boot image" so a detected bootloop swaps the image instead of restoring config
595+
#endif
583596
// RAII guard class for the JSON Buffer lock
584597
// Modeled after std::lock_guard
585598
class JSONBufferGuard {

wled00/file.cpp

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -439,3 +439,156 @@ bool handleFileRead(AsyncWebServerRequest* request, String path){
439439
}
440440
return false;
441441
}
442+
443+
// copy a file, delete destination file if incomplete to prevent corrupted files
444+
bool copyFile(const char* src_path, const char* dst_path) {
445+
DEBUG_PRINTF("copyFile from %s to %s\n", src_path, dst_path);
446+
if(!WLED_FS.exists(src_path)) {
447+
DEBUG_PRINTLN(F("file not found"));
448+
return false;
449+
}
450+
451+
bool success = true; // is set to false on error
452+
File src = WLED_FS.open(src_path, "r");
453+
File dst = WLED_FS.open(dst_path, "w");
454+
455+
if (src && dst) {
456+
uint8_t buf[128]; // copy file in 128-byte blocks
457+
while (src.available() > 0) {
458+
size_t bytesRead = src.read(buf, sizeof(buf));
459+
if (bytesRead == 0) {
460+
success = false;
461+
break; // error, no data read
462+
}
463+
size_t bytesWritten = dst.write(buf, bytesRead);
464+
if (bytesWritten != bytesRead) {
465+
success = false;
466+
break; // error, not all data written
467+
}
468+
}
469+
} else {
470+
success = false; // error, could not open files
471+
}
472+
if(src) src.close();
473+
if(dst) dst.close();
474+
if (!success) {
475+
DEBUG_PRINTLN(F("copy failed"));
476+
WLED_FS.remove(dst_path); // delete incomplete file
477+
}
478+
return success;
479+
}
480+
481+
// compare two files, return true if identical
482+
bool compareFiles(const char* path1, const char* path2) {
483+
DEBUG_PRINTF("compareFile %s and %s\n", path1, path2);
484+
if (!WLED_FS.exists(path1) || !WLED_FS.exists(path2)) {
485+
DEBUG_PRINTLN(F("file not found"));
486+
return false;
487+
}
488+
489+
bool identical = true; // set to false on mismatch
490+
File f1 = WLED_FS.open(path1, "r");
491+
File f2 = WLED_FS.open(path2, "r");
492+
493+
if (f1 && f2) {
494+
uint8_t buf1[128], buf2[128];
495+
while (f1.available() > 0 || f2.available() > 0) {
496+
size_t len1 = f1.read(buf1, sizeof(buf1));
497+
size_t len2 = f2.read(buf2, sizeof(buf2));
498+
499+
if (len1 != len2) {
500+
identical = false;
501+
break; // files differ in size or read failed
502+
}
503+
504+
if (memcmp(buf1, buf2, len1) != 0) {
505+
identical = false;
506+
break; // files differ in content
507+
}
508+
}
509+
} else {
510+
identical = false; // error opening files
511+
}
512+
513+
if (f1) f1.close();
514+
if (f2) f2.close();
515+
return identical;
516+
}
517+
518+
static const char s_backup_json[] PROGMEM = "/bkp.";
519+
520+
bool backupFile(const char* filename) {
521+
DEBUG_PRINTF("backup %s \n", filename);
522+
if (!validateJsonFile(filename)) {
523+
DEBUG_PRINTLN(F("broken file"));
524+
return false;
525+
}
526+
char backupname[32];
527+
snprintf(backupname, sizeof(backupname), "%s%s", s_backup_json, filename + 1); // skip leading '/' in filename
528+
529+
if (copyFile(filename, backupname)) {
530+
DEBUG_PRINTLN(F("backup ok"));
531+
return true;
532+
}
533+
DEBUG_PRINTLN(F("backup failed"));
534+
return false;
535+
}
536+
537+
bool restoreFile(const char* filename) {
538+
DEBUG_PRINTF("restore %s \n", filename);
539+
char backupname[32];
540+
snprintf(backupname, sizeof(backupname), "%s%s", s_backup_json, filename + 1); // skip leading '/' in filename
541+
542+
if (!WLED_FS.exists(backupname)) {
543+
DEBUG_PRINTLN(F("no backup found"));
544+
return false;
545+
}
546+
547+
if (!validateJsonFile(backupname)) {
548+
DEBUG_PRINTLN(F("broken backup"));
549+
return false;
550+
}
551+
552+
if (copyFile(backupname, filename)) {
553+
DEBUG_PRINTLN(F("restore ok"));
554+
return true;
555+
}
556+
DEBUG_PRINTLN(F("restore failed"));
557+
return false;
558+
}
559+
560+
bool validateJsonFile(const char* filename) {
561+
if (!WLED_FS.exists(filename)) return false;
562+
File file = WLED_FS.open(filename, "r");
563+
if (!file) return false;
564+
StaticJsonDocument<0> doc, filter; // https://arduinojson.org/v6/how-to/validate-json/
565+
bool result = deserializeJson(doc, file, DeserializationOption::Filter(filter)) == DeserializationError::Ok;
566+
file.close();
567+
if (!result) {
568+
DEBUG_PRINTF("Invalid JSON file %s\n", filename);
569+
} else {
570+
DEBUG_PRINTF("Valid JSON file %s\n", filename);
571+
}
572+
return result;
573+
}
574+
575+
// print contents of all files in root dir to Serial except wsec files
576+
void dumpFilesToSerial() {
577+
File rootdir = WLED_FS.open("/", "r");
578+
File rootfile = rootdir.openNextFile();
579+
while (rootfile) {
580+
size_t len = strlen(rootfile.name());
581+
// skip files starting with "wsec" and dont end in .json
582+
if (strncmp(rootfile.name(), "wsec", 4) != 0 && len >= 6 && strcmp(rootfile.name() + len - 5, ".json") == 0) {
583+
Serial.println(rootfile.name());
584+
while (rootfile.available()) {
585+
Serial.write(rootfile.read());
586+
}
587+
Serial.println();
588+
Serial.println();
589+
}
590+
rootfile.close();
591+
rootfile = rootdir.openNextFile();
592+
}
593+
}
594+

wled00/util.cpp

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
11
#include "wled.h"
22
#include "fcn_declare.h"
33
#include "const.h"
4+
#ifdef ESP8266
5+
#include "user_interface.h" // for bootloop detection
6+
#elif ESP_IDF_VERSION >= ESP_IDF_VERSION_VAL(4, 4, 0)
7+
#include "esp32/rtc.h" // for bootloop detection
8+
#include <Update.h>
9+
#endif
410

511

612
//helper to get int value at a position in string
@@ -706,6 +712,125 @@ void *realloc_malloc(void *ptr, size_t size) {
706712
}
707713
#endif
708714

715+
// bootloop detection and handling
716+
// checks if the ESP reboots multiple times in quick succession due to a crash or watchdog timeout
717+
// each detected bootloop escalates to the next action: restore config from backup, then reset config, then swap boot image (not on ESP8266), then dump files to serial
718+
719+
#define BOOTLOOP_THRESHOLD 5 // number of consecutive crashes to trigger bootloop detection
720+
#define BOOTLOOP_ACTION_RESTORE 0 // default action: restore config from its backup (/bkp.cfg.json)
721+
#define BOOTLOOP_ACTION_RESET 1 // if restore does not work, reset config (rename /cfg.json to /cfg.json.rst.json)
722+
#define BOOTLOOP_ACTION_OTA 2 // swap the boot partition
723+
#define BOOTLOOP_ACTION_DUMP 3 // nothing seems to help, dump files to serial and reboot (until hardware reset)
724+
#ifdef ESP8266
725+
#define BOOTLOOP_INTERVAL_TICKS (5 * 160000) // time limit between crashes: ~5 seconds in RTC ticks
726+
#define BOOT_TIME_IDX 0 // index in RTC memory for boot time
727+
#define CRASH_COUNTER_IDX 1 // index in RTC memory for crash counter
728+
#define ACTIONT_TRACKER_IDX 2 // index in RTC memory for boot action
729+
#else
730+
#define BOOTLOOP_INTERVAL_TICKS 5000 // time limit between crashes: ~5 seconds in milliseconds
731+
// variables in RTC_NOINIT memory persist between reboots (but not on hardware reset)
732+
RTC_NOINIT_ATTR static uint32_t bl_last_boottime; // RTC time (ms) recorded at the previous boot
733+
RTC_NOINIT_ATTR static uint32_t bl_crashcounter; // number of crashes counted so far
734+
RTC_NOINIT_ATTR static uint32_t bl_actiontracker; // next BOOTLOOP_ACTION_* to apply when a bootloop is detected
735+
void bootloopCheckOTA() { bl_actiontracker = BOOTLOOP_ACTION_OTA; } // swap boot image if bootloop is detected instead of restoring config
736+
#endif
737+
738+
// Detect a bootloop by checking the reset reason and the time since the last boot.
// A boot counts as a crash if the reset reason is a panic/exception or a watchdog reset.
// Crashes that follow the previous boot within BOOTLOOP_INTERVAL_TICKS increment the crash counter;
// once it reaches BOOTLOOP_THRESHOLD the counter is cleared and true is returned.
// Any non-crash boot clears the counter; unless it was a software restart it also
// resets the action tracker to BOOTLOOP_ACTION_RESTORE.
// NOTE(review): a crash that arrives later than the interval neither increments nor clears the counter - confirm this is intended.
static bool detectBootLoop() {
#if !defined(ESP8266)
  #if ESP_IDF_VERSION >= ESP_IDF_VERSION_VAL(4, 4, 0)
  uint32_t rtctime = esp_rtc_get_time_us() / 1000; // convert to milliseconds
  esp_reset_reason_t reason = esp_reset_reason();
  const bool crashed = (reason == ESP_RST_PANIC || reason == ESP_RST_WDT || reason == ESP_RST_INT_WDT || reason == ESP_RST_TASK_WDT);

  if (!crashed) {
    // no crash detected, init variables
    bl_crashcounter = 0;
    bl_last_boottime = rtctime;
    if (reason != ESP_RST_SW) {
      bl_actiontracker = BOOTLOOP_ACTION_RESTORE; // init action tracker if not an intentional reboot (e.g. from OTA or bootloop handler)
    }
    if (reason == ESP_RST_BROWNOUT) {
      // a brownout is not one of the crash reasons, so it has to be reported here
      // (as a separate else-if after the non-crash check it could never be reached)
      // crash due to brownout can't be detected unless using flash memory to store bootloop variables
      DEBUG_PRINTLN(F("brownout detected"));
      //restoreConfig(); // TODO: blindly restoring config if brownout detected is a bad idea, need a better way (if at all)
    }
  } else {
    // system has crashed
    uint32_t rebootinterval = rtctime - bl_last_boottime;
    bl_last_boottime = rtctime; // store current runtime for next reboot
    if (rebootinterval < BOOTLOOP_INTERVAL_TICKS) {
      bl_crashcounter++;
      if (bl_crashcounter >= BOOTLOOP_THRESHOLD) {
        DEBUG_PRINTLN(F("!BOOTLOOP DETECTED!"));
        bl_crashcounter = 0;
        return true;
      }
    }
  }
  #endif
#else // ESP8266
  // on ESP8266 the bootloop state is kept in RTC user memory instead of RTC_NOINIT variables
  rst_info* resetreason = system_get_rst_info();
  uint32_t bl_last_boottime;
  uint32_t bl_crashcounter;
  uint32_t bl_actiontracker;
  uint32_t rtctime = system_get_rtc_time();

  if (!(resetreason->reason == REASON_EXCEPTION_RST || resetreason->reason == REASON_WDT_RST)) {
    // no crash detected, init variables
    bl_crashcounter = 0;
    ESP.rtcUserMemoryWrite(BOOT_TIME_IDX, &rtctime, sizeof(uint32_t));
    ESP.rtcUserMemoryWrite(CRASH_COUNTER_IDX, &bl_crashcounter, sizeof(uint32_t));
    if (resetreason->reason != REASON_SOFT_RESTART) {
      bl_actiontracker = BOOTLOOP_ACTION_RESTORE; // init action tracker if not an intentional reboot (e.g. from OTA or bootloop handler)
      ESP.rtcUserMemoryWrite(ACTIONT_TRACKER_IDX, &bl_actiontracker, sizeof(uint32_t));
    }
  } else {
    // system has crashed
    ESP.rtcUserMemoryRead(BOOT_TIME_IDX, &bl_last_boottime, sizeof(uint32_t));
    ESP.rtcUserMemoryRead(CRASH_COUNTER_IDX, &bl_crashcounter, sizeof(uint32_t));
    uint32_t rebootinterval = rtctime - bl_last_boottime;
    ESP.rtcUserMemoryWrite(BOOT_TIME_IDX, &rtctime, sizeof(uint32_t)); // store current ticks for next reboot
    if (rebootinterval < BOOTLOOP_INTERVAL_TICKS) {
      bl_crashcounter++;
      ESP.rtcUserMemoryWrite(CRASH_COUNTER_IDX, &bl_crashcounter, sizeof(uint32_t));
      if (bl_crashcounter >= BOOTLOOP_THRESHOLD) {
        DEBUG_PRINTLN(F("BOOTLOOP DETECTED"));
        bl_crashcounter = 0;
        ESP.rtcUserMemoryWrite(CRASH_COUNTER_IDX, &bl_crashcounter, sizeof(uint32_t));
        return true;
      }
    }
  }
#endif
  return false; // no bootloop detected
}
805+
806+
void handleBootLoop() {
807+
DEBUG_PRINTLN(F("checking for bootloop"));
808+
if (!detectBootLoop()) return; // no bootloop detected
809+
#ifdef ESP8266
810+
uint32_t bl_actiontracker;
811+
ESP.rtcUserMemoryRead(ACTIONT_TRACKER_IDX, &bl_actiontracker, sizeof(uint32_t));
812+
#endif
813+
if (bl_actiontracker == BOOTLOOP_ACTION_RESTORE) {
814+
restoreConfig(); // note: if this fails, could reset immediately. instead just let things play out and save a few lines of code
815+
bl_actiontracker = BOOTLOOP_ACTION_RESET; // reset config if it keeps bootlooping
816+
} else if (bl_actiontracker == BOOTLOOP_ACTION_RESET) {
817+
resetConfig();
818+
bl_actiontracker = BOOTLOOP_ACTION_OTA; // swap boot partition if it keeps bootlooping. On ESP8266 this is the same as BOOTLOOP_ACTION_NONE
819+
}
820+
#ifndef ESP8266
821+
else if (bl_actiontracker == BOOTLOOP_ACTION_OTA) {
822+
if(Update.canRollBack()) {
823+
DEBUG_PRINTLN(F("Swapping boot partition..."));
824+
Update.rollBack(); // swap boot partition
825+
}
826+
bl_actiontracker = BOOTLOOP_ACTION_DUMP; // out of options
827+
}
828+
#endif
829+
else
830+
dumpFilesToSerial();
831+
ESP.restart(); // restart cleanly and don't wait for another crash
832+
}
833+
709834
/*
710835
* Fixed point integer based Perlin noise functions by @dedehai
711836
* Note: optimized for speed and to mimic fastled inoise functions, not for accuracy or best randomness

wled00/wled.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -410,6 +410,9 @@ void WLED::setup()
410410
DEBUGFS_PRINTLN(F("FS failed!"));
411411
errorFlag = ERR_FS_BEGIN;
412412
}
413+
414+
handleBootLoop(); // check for bootloop and take action (requires WLED_FS)
415+
413416
#ifdef WLED_ADD_EEPROM_SUPPORT
414417
else deEEP();
415418
#else
@@ -425,6 +428,11 @@ void WLED::setup()
425428
WLED_SET_AP_SSID(); // otherwise it is empty on first boot until config is saved
426429
multiWiFi.push_back(WiFiConfig(CLIENT_SSID,CLIENT_PASS)); // initialise vector with default WiFi
427430

431+
if(!verifyConfig()) {
432+
if(!restoreConfig()) {
433+
resetConfig();
434+
}
435+
}
428436
DEBUG_PRINTLN(F("Reading config"));
429437
bool needsCfgSave = deserializeConfigFromFS();
430438
DEBUG_PRINTF_P(PSTR("heap %u\n"), ESP.getFreeHeap());

0 commit comments

Comments
 (0)