From f69a3800b616c4db828a6d0dcdb041ad7a7e3655 Mon Sep 17 00:00:00 2001 From: MandyMCHung Date: Fri, 26 Jun 2026 11:01:13 +0800 Subject: [PATCH] yv4-sd: Fix CPU thermal trip SEL handling [Issue Description] CPU thermal trip may be reported as an incorrect SEL event on yv4-sd. When FM_CPU_BIC_THERMTRIP_N is asserted, the BIC correctly enters ISR_SOC_THMALTRIP. However, CPU thermal trip uses event type 0x00, and the current SEL handling flow treats this value as an invalid event. As a result, the CPU thermal trip SEL may not be sent correctly to the BMC. [Root Cause] CPU thermal trip uses event type 0x00, which is a valid SEL event. addsel_work_handler() decides whether a work item belongs to the normal GPIO SEL path or the wrapper SEL path by checking that both gpio_num and event_type are non-zero, so an event type of 0x00 is treated as an invalid or uninitialized event. As a result, CPU thermal trip falls through to the wrapper path, where the work item is interpreted as a sel_work_wrapper and is either reported incorrectly or skipped. [Solution] Keep addsel_work_handler() for normal GPIO SEL events only, so CPU thermal trip is sent correctly with event type 0x00. Add addsel_wrapper_work_handler() for FAST_PROCHOT and SYS_THROTTLE wrapper SEL events. The wrapper handler skips incomplete wrapper SEL data (event type 0x00) to avoid reporting it as CPU thermal trip. Update the FAST_PROCHOT and SYS_THROTTLE work initialization (in ISR_MB_THROTTLE() and ISR_SYS_THROTTLE()) to use the wrapper SEL handler. 
[Test Log] "580": { "additional_data": { "DEVICE": "/xyz/openbmc_project/State/Thermal/host6/cpu0", "FAILURE_DATA": "CPU Thermal Trip", "_CODE_FILE": "/usr/src/debug/pldm/1.0+git/oem/meta/libpldmresponder/file_io_type_event_log.cpp", "_CODE_FUNC": "void pldm::responder::oem_meta::record::commit(const std::string&, pldm::responder::oem_meta::EventAssert, const std::string&) [with const char* TypeLabel = (& label); AssertType = sdbusplus::error::xyz::openbmc_project::state::Thermal::DeviceOverOperatingTemperatureFault; DeassertType = sdbusplus::event::xyz::openbmc_project::state::Thermal::DeviceOperatingNormalTemperature; std::string = std::__cxx11::basic_string]", "_CODE_LINE": "201", "_PID": "575" }, "event_id": "", "message": "xyz.openbmc_project.State.Thermal.DeviceOverOperatingTemperatureFault", "redfish": { "args": [ "/xyz/openbmc_project/State/Thermal/host6/cpu0" ], "id": "OpenBMC_StateThermal.DeviceOverOperatingTemperatureFault", "message": "Device /xyz/openbmc_project/State/Thermal/host6/cpu0 is significantly over safe operating temperature and may have been powered off." 
}, "resolution": "", "resolved": false, "severity": "xyz.openbmc_project.Logging.Entry.Level.Critical", "timestamp": "2026-06-26T09:18:39.875000000Z", "updated_timestamp": "2026-06-26T09:18:39.875000000Z" } --- meta-facebook/yv4-sd/src/platform/plat_isr.c | 40 ++++++++++---------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/meta-facebook/yv4-sd/src/platform/plat_isr.c b/meta-facebook/yv4-sd/src/platform/plat_isr.c index e3a0dac99f..168d65cfe1 100644 --- a/meta-facebook/yv4-sd/src/platform/plat_isr.c +++ b/meta-facebook/yv4-sd/src/platform/plat_isr.c @@ -130,28 +130,30 @@ void init_event_work() void addsel_work_handler(struct k_work *work_item) { - struct pldm_addsel_data msg = { 0 }; struct k_work_delayable *dwork = k_work_delayable_from_work(work_item); + add_sel_info *work_info = CONTAINER_OF(dwork, add_sel_info, add_sel_work); + struct pldm_addsel_data msg = { 0 }; + msg.event_type = work_info->event_type; + msg.assert_type = work_info->assert_type; - const add_sel_info *work_info = CONTAINER_OF(dwork, add_sel_info, add_sel_work); + if (send_event_log_to_bmc(msg) != PLDM_SUCCESS) { + LOG_ERR("Failed to send event log, event type: 0x%x, assert type: 0x%x", + work_info->event_type, work_info->assert_type); + }; +} - if ((work_info->gpio_num != 0) && (work_info->event_type != 0)) { - msg.event_type = work_info->event_type; - msg.assert_type = work_info->assert_type; - } else { - // for fastprochot and sys_throttle - const sel_work_wrapper *wrap = CONTAINER_OF(work_item, sel_work_wrapper, work); - if (wrap->sel_data.event_type != 0) { - msg = wrap->sel_data; - } else { - LOG_ERR("Invalid work item received, skip sending SEL."); - return; - } +void addsel_wrapper_work_handler(struct k_work *work_item) +{ + sel_work_wrapper *wrap = CONTAINER_OF(work_item, sel_work_wrapper, work); + + if (wrap->sel_data.event_type == 0) { + LOG_ERR("Invalid wrapper SEL, skip sending SEL."); + return; } - if (send_event_log_to_bmc(msg) != PLDM_SUCCESS) { - 
LOG_ERR("Failed to send SEL: event_type=0x%x, assert_type=0x%x", msg.event_type, - msg.assert_type); + if (send_event_log_to_bmc(wrap->sel_data) != PLDM_SUCCESS) { + LOG_ERR("Failed to send SEL: event_type=0x%x, assert_type=0x%x", + wrap->sel_data.event_type, wrap->sel_data.assert_type); } } @@ -452,7 +454,7 @@ void ISR_MB_THROTTLE() hw_event_register[2]++; } int ret = -1; - k_work_init_delayable(&wrap->work, addsel_work_handler); + k_work_init_delayable(&wrap->work, addsel_wrapper_work_handler); ret = k_work_schedule_for_queue(&mb_throttle_work_q, &wrap->work, K_NO_WAIT); if (ret != 1) { LOG_ERR("Fail MB_THROTTLE Kwork failed, %d", ret); @@ -500,7 +502,7 @@ void ISR_SYS_THROTTLE() hw_event_register[4]++; } int ret = -1; - k_work_init_delayable(&wrap->work, addsel_work_handler); + k_work_init_delayable(&wrap->work, addsel_wrapper_work_handler); ret = k_work_schedule_for_queue(&sys_throttle_work_q, &wrap->work, K_NO_WAIT); if (ret != 1) { LOG_ERR("Fail SYS_THROTTLE Kwork failed, %d", ret);