fix(panic_handler): Updated panic handler to use RTC WDT

This commit updates the following:
- Updates the panic handler to use only the RTC WDT to reset the system.
- Refactors some of the panic handler code.
- Updates Bluetooth files so that they now feed the WDTs instead of
  reconfiguring them.
- Removes some unnecessary configuration of WDTs from various files.
- Adds a unit test to verify that the system does not lock up when the
  panic handler is stuck.
- Updates the memprot unit tests to work with the refactored panic
  handler.

Closes https://github.com/espressif/esp-idf/issues/15166
Closes https://github.com/espressif/esp-idf/issues/15018
Closes https://github.com/espressif/esp-idf/issues/10110
This commit is contained in:
Sudeep Mohanty
2025-01-27 17:48:09 +01:00
parent b0306575a8
commit cd887ef59a
20 changed files with 578 additions and 163 deletions

View File

@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: 2015-2024 Espressif Systems (Shanghai) CO LTD
* SPDX-FileCopyrightText: 2015-2025 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
@@ -74,13 +74,19 @@
#define MWDT_DEFAULT_TICKS_PER_US 500
#define PANIC_ENTRY_COUNT_MAX 2 // We allow at least 2 panic entries to let the panic handler process Double Exceptions
bool g_panic_abort = false;
char *g_panic_abort_details = NULL;
static wdt_hal_context_t rtc_wdt_ctx = RWDT_HAL_CONTEXT_DEFAULT();
static uint32_t DRAM_ATTR g_panic_entry_count[CONFIG_FREERTOS_NUMBER_OF_CORES] = {0}; // Number of times panic handler has been entered per core since multiple cores can enter the panic handler simultaneously
#if !CONFIG_ESP_SYSTEM_PANIC_SILENT_REBOOT
/********************** Panic print functions **********************/
#if CONFIG_ESP_CONSOLE_UART
static uart_hal_context_t s_panic_uart = { .dev = CONFIG_ESP_CONSOLE_UART_NUM == 0 ? &UART0 :&UART1 };
@@ -171,68 +177,106 @@ void panic_print_dec(int d)
}
#endif // CONFIG_ESP_SYSTEM_PANIC_SILENT_REBOOT
/*
If watchdogs are enabled, the panic handler runs the risk of getting aborted pre-emptively because
an overzealous watchdog decides to reset it. On the other hand, if we disable all watchdogs, we run
the risk of somehow halting in the panic handler and not resetting. That is why this routine kills
all watchdogs except the timer group 0 watchdog, and it reconfigures that to reset the chip after
one second.
We have to do this before we do anything that might cause issues in the WDT interrupt handlers,
for example stalling the other core on ESP32 may cause the ESP32_ECO3_CACHE_LOCK_FIX
handler to get stuck.
*/
void esp_panic_handler_reconfigure_wdts(uint32_t timeout_ms)
static void print_abort_details(const void *f)
{
wdt_hal_context_t wdt0_context = {.inst = WDT_MWDT0, .mwdt_dev = &TIMERG0};
#if SOC_TIMER_GROUPS >= 2
// IDF-3825
wdt_hal_context_t wdt1_context = {.inst = WDT_MWDT1, .mwdt_dev = &TIMERG1};
#endif
//Todo: Refactor to use Interrupt or Task Watchdog API, and a system level WDT context
//Reconfigure TWDT (Timer Group 0)
wdt_hal_init(&wdt0_context, WDT_MWDT0, MWDT_LL_DEFAULT_CLK_PRESCALER, false); //Prescaler: wdt counts in ticks of TG0_WDT_TICK_US
wdt_hal_write_protect_disable(&wdt0_context);
wdt_hal_config_stage(&wdt0_context, 0, timeout_ms * 1000 / MWDT_DEFAULT_TICKS_PER_US, WDT_STAGE_ACTION_RESET_SYSTEM); //1 second before reset
wdt_hal_enable(&wdt0_context);
wdt_hal_write_protect_enable(&wdt0_context);
#if SOC_TIMER_GROUPS >= 2
//Disable IWDT (Timer Group 1)
wdt_hal_write_protect_disable(&wdt1_context);
wdt_hal_disable(&wdt1_context);
wdt_hal_write_protect_enable(&wdt1_context);
#endif
panic_print_str(g_panic_abort_details);
}
/*
This disables all the watchdogs for when we call the gdbstub.
*/
static inline void disable_all_wdts(void)
/********************** Panic handler watchdog timer functions **********************/
/* This function disables the Timer Group WDTs */
void esp_panic_handler_disable_timg_wdts(void)
{
wdt_hal_context_t wdt0_context = {.inst = WDT_MWDT0, .mwdt_dev = &TIMERG0};
#if SOC_TIMER_GROUPS >= 2
wdt_hal_context_t wdt1_context = {.inst = WDT_MWDT1, .mwdt_dev = &TIMERG1};
#endif
//Todo: Refactor to use Interrupt or Task Watchdog API, and a system level WDT context
//Task WDT is the Main Watchdog Timer of Timer Group 0
wdt_hal_write_protect_disable(&wdt0_context);
wdt_hal_disable(&wdt0_context);
wdt_hal_write_protect_enable(&wdt0_context);
#if SOC_TIMER_GROUPS >= 2
//Interrupt WDT is the Main Watchdog Timer of Timer Group 1
wdt_hal_context_t wdt1_context = {.inst = WDT_MWDT1, .mwdt_dev = &TIMERG1};
wdt_hal_write_protect_disable(&wdt1_context);
wdt_hal_disable(&wdt1_context);
wdt_hal_write_protect_enable(&wdt1_context);
#endif
#endif /* SOC_TIMER_GROUPS >= 2 */
}
static void print_abort_details(const void *f)
/* This function enables the RTC WDT with the given timeout in milliseconds */
void esp_panic_handler_enable_rtc_wdt(uint32_t timeout_ms)
{
panic_print_str(g_panic_abort_details);
wdt_hal_init(&rtc_wdt_ctx, WDT_RWDT, 0, false);
uint32_t stage_timeout_ticks = (uint32_t)(timeout_ms * rtc_clk_slow_freq_get_hz() / 1000ULL);
wdt_hal_write_protect_disable(&rtc_wdt_ctx);
wdt_hal_config_stage(&rtc_wdt_ctx, WDT_STAGE0, stage_timeout_ticks, WDT_STAGE_ACTION_RESET_RTC);
wdt_hal_enable(&rtc_wdt_ctx);
wdt_hal_write_protect_enable(&rtc_wdt_ctx);
}
/* Feed the watchdogs if they are enabled and if we are not already in the panic handler */
void esp_panic_handler_feed_wdts(void)
{
/* If we have already entered the panic handler multiple times,
* we should not feed the WDTs. This is because we need an
* alternate mechanism to reset the system if we happen to be stuck
* in a panic loop.
*/
if (g_panic_entry_count[esp_cpu_get_core_id()] > PANIC_ENTRY_COUNT_MAX) {
return;
}
// Feed Timer Group 0 WDT
wdt_hal_context_t wdt0_context = {.inst = WDT_MWDT0, .mwdt_dev = &TIMERG0};
if (wdt_hal_is_enabled(&wdt0_context)) {
wdt_hal_write_protect_disable(&wdt0_context);
wdt_hal_feed(&wdt0_context);
wdt_hal_write_protect_enable(&wdt0_context);
}
#if SOC_TIMER_GROUPS >= 2
// Feed Timer Group 1 WDT
wdt_hal_context_t wdt1_context = {.inst = WDT_MWDT1, .mwdt_dev = &TIMERG1};
if (wdt_hal_is_enabled(&wdt1_context)) {
wdt_hal_write_protect_disable(&wdt1_context);
wdt_hal_feed(&wdt1_context);
wdt_hal_write_protect_enable(&wdt1_context);
}
#endif /* SOC_TIMER_GROUPS >= 2 */
// Feed RTC WDT
if (wdt_hal_is_enabled(&rtc_wdt_ctx)) {
wdt_hal_write_protect_disable(&rtc_wdt_ctx);
wdt_hal_feed(&rtc_wdt_ctx);
wdt_hal_write_protect_enable(&rtc_wdt_ctx);
}
}
/* Disables every watchdog handled by the panic handler: first the Timer Group
 * WDTs (through esp_panic_handler_disable_timg_wdts()), then the RTC WDT.
 * Takes no arguments and returns nothing; it operates on the file-level
 * rtc_wdt_ctx context.
 */
static inline void disable_all_wdts(void)
{
// Disable the Timer Group WDTs
esp_panic_handler_disable_timg_wdts();
// Disable the RTC WDT: lift write protection, disable, then restore write protection
wdt_hal_write_protect_disable(&rtc_wdt_ctx);
wdt_hal_disable(&rtc_wdt_ctx);
wdt_hal_write_protect_enable(&rtc_wdt_ctx);
}
/********************** Panic handler functions **********************/
/* This function is called from the panic handler entry point to increment the panic entry count */
void esp_panic_handler_increment_entry_count(void)
{
int core_id = esp_cpu_get_core_id();
g_panic_entry_count[core_id]++;
if (g_panic_entry_count[core_id] > PANIC_ENTRY_COUNT_MAX) {
/* If we have already panicked multiple times, chances are
* that the panic handler itself is broken. In this case, we
* should just reset the system.
*/
panic_print_str("Panic handler entered multiple times. Abort panic handling. Rebooting ...\r\n");
panic_restart();
}
}
// Control arrives from chip-specific panic handler, environment prepared for
@@ -241,8 +285,8 @@ static void print_abort_details(const void *f)
void esp_panic_handler(panic_info_t *info)
{
// The port-level panic handler has already called this, but call it again
// to reset the TG0WDT period
esp_panic_handler_reconfigure_wdts(1000);
// to reset the RTC WDT period
esp_panic_handler_feed_wdts();
// If the exception was due to an abort, override some of the panic info
if (g_panic_abort) {
@@ -253,28 +297,28 @@ void esp_panic_handler(panic_info_t *info)
}
/*
* For any supported chip, the panic handler prints the contents of panic_info_t in the following format:
*
*
* Guru Meditation Error: Core <core> (<exception>). <description>
* <details>
*
* <state>
*
* <elf_info>
*
*
* ----------------------------------------------------------------------------------------
* core - core where exception was triggered
* exception - what kind of exception occurred
* description - a short description regarding the exception that occurred
* details - more details about the exception
* state - processor state like register contents, and backtrace
* elf_info - details about the image currently running
*
* NULL fields in panic_info_t are not printed.
*
* */
* For any supported chip, the panic handler prints the contents of panic_info_t in the following format:
*
*
* Guru Meditation Error: Core <core> (<exception>). <description>
* <details>
*
* <state>
*
* <elf_info>
*
*
* ----------------------------------------------------------------------------------------
* core - core where exception was triggered
* exception - what kind of exception occurred
* description - a short description regarding the exception that occurred
* details - more details about the exception
* state - processor state like register contents, and backtrace
* elf_info - details about the image currently running
*
* NULL fields in panic_info_t are not printed.
*
*/
if (info->reason) {
panic_print_str("Guru Meditation Error: Core ");
panic_print_dec(info->core);
@@ -312,7 +356,6 @@ void esp_panic_handler(panic_info_t *info)
panic_print_str("Setting breakpoint at 0x");
panic_print_hex((uint32_t)info->addr);
panic_print_str(" and returning...\r\n");
disable_all_wdts();
#if CONFIG_APPTRACE_ENABLE
#if CONFIG_APPTRACE_SV_ENABLE
SEGGER_RTT_ESP_FlushNoLock(CONFIG_APPTRACE_POSTMORTEM_FLUSH_THRESH, APPTRACE_ONPANIC_HOST_FLUSH_TMO);
@@ -322,24 +365,16 @@ void esp_panic_handler(panic_info_t *info)
#endif
#endif
disable_all_wdts();
esp_cpu_set_breakpoint(0, info->addr); // use breakpoint 0
return;
}
#endif //CONFIG_ESP_DEBUG_OCDAWARE
// start panic WDT to restart system if we hang in this handler
if (!wdt_hal_is_enabled(&rtc_wdt_ctx)) {
wdt_hal_init(&rtc_wdt_ctx, WDT_RWDT, 0, false);
uint32_t stage_timeout_ticks = (uint32_t)(7000ULL * rtc_clk_slow_freq_get_hz() / 1000ULL);
wdt_hal_write_protect_disable(&rtc_wdt_ctx);
wdt_hal_config_stage(&rtc_wdt_ctx, WDT_STAGE0, stage_timeout_ticks, WDT_STAGE_ACTION_RESET_SYSTEM);
// 64KB of core dump data (stacks of about 30 tasks) will produce ~85KB base64 data.
// @ 115200 UART speed it will take more than 6 sec to print them out.
wdt_hal_enable(&rtc_wdt_ctx);
wdt_hal_write_protect_enable(&rtc_wdt_ctx);
}
esp_panic_handler_reconfigure_wdts(1000); // Restart WDT again
/* Feed the WDTs here. This is done to facilitate a "slow" UART
* which might take a longer time to print the state of the processor.
*/
esp_panic_handler_feed_wdts();
PANIC_INFO_DUMP(info, state);
panic_print_str("\r\n");
@@ -361,66 +396,45 @@ void esp_panic_handler(panic_info_t *info)
panic_print_str("\r\n");
#if CONFIG_APPTRACE_ENABLE
disable_all_wdts();
esp_panic_handler_feed_wdts();
#if CONFIG_APPTRACE_SV_ENABLE
SEGGER_RTT_ESP_FlushNoLock(CONFIG_APPTRACE_POSTMORTEM_FLUSH_THRESH, APPTRACE_ONPANIC_HOST_FLUSH_TMO);
#else
esp_apptrace_flush_nolock(ESP_APPTRACE_DEST_TRAX, CONFIG_APPTRACE_POSTMORTEM_FLUSH_THRESH,
APPTRACE_ONPANIC_HOST_FLUSH_TMO);
#endif
esp_panic_handler_reconfigure_wdts(1000); // restore WDT config
#endif // CONFIG_APPTRACE_ENABLE
#if CONFIG_ESP_COREDUMP_ENABLE
static bool s_dumping_core;
esp_panic_handler_feed_wdts();
static bool s_dumping_core = false;
if (s_dumping_core) {
panic_print_str("Re-entered core dump! Exception happened during core dump!\r\n");
} else {
disable_all_wdts();
s_dumping_core = true;
esp_core_dump_write(info);
s_dumping_core = false;
esp_panic_handler_reconfigure_wdts(1000);
}
#endif /* CONFIG_ESP_COREDUMP_ENABLE */
#if CONFIG_ESP_SYSTEM_PANIC_GDBSTUB
disable_all_wdts();
wdt_hal_write_protect_disable(&rtc_wdt_ctx);
wdt_hal_disable(&rtc_wdt_ctx);
wdt_hal_write_protect_enable(&rtc_wdt_ctx);
panic_print_str("Entering gdb stub now.\r\n");
disable_all_wdts();
esp_gdbstub_panic_handler((void *)info->frame);
#else
#if CONFIG_ESP_SYSTEM_PANIC_REBOOT_DELAY_SECONDS
// start RTC WDT if it hasn't been started yet and set the timeout to more than the delay time
wdt_hal_init(&rtc_wdt_ctx, WDT_RWDT, 0, false);
uint32_t stage_timeout_ticks = (uint32_t)(((CONFIG_ESP_SYSTEM_PANIC_REBOOT_DELAY_SECONDS + 1) * 1000
* rtc_clk_slow_freq_get_hz()) / 1000ULL);
wdt_hal_write_protect_disable(&rtc_wdt_ctx);
wdt_hal_config_stage(&rtc_wdt_ctx, WDT_STAGE0, stage_timeout_ticks, WDT_STAGE_ACTION_RESET_SYSTEM);
// 64KB of core dump data (stacks of about 30 tasks) will produce ~85KB base64 data.
// @ 115200 UART speed it will take more than 6 sec to print them out.
wdt_hal_enable(&rtc_wdt_ctx);
wdt_hal_write_protect_enable(&rtc_wdt_ctx);
esp_panic_handler_reconfigure_wdts((CONFIG_ESP_SYSTEM_PANIC_REBOOT_DELAY_SECONDS + 1) * 1000);
esp_panic_handler_feed_wdts();
panic_print_str("Rebooting in ");
panic_print_dec(CONFIG_ESP_SYSTEM_PANIC_REBOOT_DELAY_SECONDS);
panic_print_str(" seconds...\r\n");
esp_rom_delay_us(CONFIG_ESP_SYSTEM_PANIC_REBOOT_DELAY_SECONDS * 1000000);
esp_panic_handler_reconfigure_wdts(1000);
#endif /* CONFIG_ESP_SYSTEM_PANIC_REBOOT_DELAY_SECONDS */
wdt_hal_write_protect_disable(&rtc_wdt_ctx);
wdt_hal_disable(&rtc_wdt_ctx);
wdt_hal_write_protect_enable(&rtc_wdt_ctx);
#if CONFIG_ESP_SYSTEM_PANIC_PRINT_REBOOT || CONFIG_ESP_SYSTEM_PANIC_SILENT_REBOOT
esp_panic_handler_feed_wdts();
if (esp_reset_reason_get_hint() == ESP_RST_UNKNOWN) {
switch (info->exception) {
case PANIC_EXCEPTION_IWDT:
@@ -440,9 +454,9 @@ void esp_panic_handler(panic_info_t *info)
panic_print_str("Rebooting...\r\n");
panic_restart();
#else /* CONFIG_ESP_SYSTEM_PANIC_PRINT_REBOOT || CONFIG_ESP_SYSTEM_PANIC_SILENT_REBOOT */
disable_all_wdts();
panic_print_str("CPU halted.\r\n");
esp_system_reset_modules_on_exit();
disable_all_wdts();
ESP_INFINITE_LOOP();
#endif /* CONFIG_ESP_SYSTEM_PANIC_PRINT_REBOOT || CONFIG_ESP_SYSTEM_PANIC_SILENT_REBOOT */
#endif /* CONFIG_ESP_SYSTEM_PANIC_GDBSTUB */