Initial commit
This commit is contained in:
Executable
+205
@@ -0,0 +1,205 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script monitors S0/S1 fault GPIO and detects errors or warnings from CPUs
|
||||
#
|
||||
# According to OpenBMC_Software_Funcional_Specification, section 3.16,
|
||||
#
|
||||
# When the BMC detects the GPIO_FAULT signal indicating an SCP booting failure:
|
||||
# • If a non-critical error/warning from the SCP occurs, the BMC blinks the Fault LED once.
|
||||
# • If a critical error from the SCP occurs, the BMC turns on the Fault LED.
|
||||
# The BMC monitors the GPIO_FAULT signal from the SCP during SCP booting to determine whether
|
||||
# the error is non-critical or critical. A fatal error is indicated when the signal is On and then Off
|
||||
# continuously, followed by a “quiet” period of about three seconds, and this pattern repeats. If the “quiet”
|
||||
# period is longer than three seconds, the error is non-fatal. The BMC must set up appropriate debounce
|
||||
# times to detect such errors. The BMC is expected to turn on the Fault LED forever for fatal errors, or to
|
||||
# turn on the Fault LED and turn it off when the fault clears for non-fatal errors.
|
||||
#
|
||||
# Usage: <app_name> <socket 0/1>
|
||||
|
||||
# shellcheck source=/dev/null
|
||||
source /usr/sbin/gpio-lib.sh

# global variables

# Flag files created by this script when a fault is classified.
error_flag='/tmp/fault_err'
warning_flag='/tmp/fault_warning'

# Timing values, passed to usleep (microseconds).
duty_cycle=250000   # idle polling interval of the main loop
scan_pulse=100000   # sampling interval while a pattern is being decoded
blank_num=8         # consecutive low samples that mark the end of a pattern

# Number of falling edges in the pattern being decoded / decoded last time.
curr_pattern=0
prev_pattern=0

# Last sampled value of the fault GPIO, and whether the pattern started again.
gpio_status=0
repeat=0

# Socket to monitor, from the command line (0 or 1).
socket=$1

# Line offset given to "gpioget 0" for the socket 1 presence check;
# a reading of 1 is treated as "socket 1 not present".
socket1_present=15
socket1_status=1

# Fault GPIO identifiers, translated to sysfs numbers by gpio_number.
S0_fault_gpio=73
S1_fault_gpio=201
|
||||
|
||||
# Translate a fault pattern number into its RAS event name.
# Arguments: $1 - optional pattern number; defaults to $curr_pattern
# Globals:   curr_pattern (read), event_name (written)
# Unknown or out-of-range numbers map to "NOT_SUPPORT".
map_event_name() {
    local pattern=${1:-$curr_pattern}

    case "$pattern" in
        1)  event_name="RAS_GPIO_INVALID_LCS" ;;
        2)  event_name="RAS_GPIO_FILE_HDR_INVALID" ;;
        3)  event_name="RAS_GPIO_FILE_INTEGRITY_INVALID" ;;
        4)  event_name="RAS_GPIO_KEY_CERT_AUTH_ERR" ;;
        5)  event_name="RAS_GPIO_CNT_CERT_AUTH_ERR" ;;
        6)  event_name="RAS_GPIO_I2C_HARDWARE_ERR" ;;
        7)  event_name="RAS_GPIO_CRYPTO_ENGINE_ERR" ;;
        8)  event_name="RAS_GPIO_ROTPK_EFUSE_INVALID" ;;
        9)  event_name="RAS_GPIO_SEED_EFUSE_INVALID" ;;
        10) event_name="RAS_GPIO_LCS_FROM_EFUSE_INVALID" ;;
        11) event_name="RAS_GPIO_PRIM_ROLLBACK_EFUSE_INVALID" ;;
        12) event_name="RAS_GPIO_SEC_ROLLBACK_EFUSE_INVALID" ;;
        13) event_name="RAS_GPIO_HUK_EFUSE_INVALID" ;;
        14) event_name="RAS_GPIO_CERT_DATA_INVALID" ;;
        15) event_name="RAS_GPIO_INTERNAL_HW_ERR" ;;
        *)  event_name="NOT_SUPPORT" ;;
    esac
}
|
||||
|
||||
# After a pattern has ended, watch the fault GPIO to see whether the pattern
# starts again.
# Globals: scan_pulse, gpio_Id, fault_gpio, socket, warning_flag (read);
#          gpio_status, repeat (written); event_name (via map_event_name)
# Result:  repeat=1 when a rising edge is sampled before the sample budget
#          runs out; otherwise repeat=0, a warning is logged and
#          $warning_flag is created.
detect_patern_repeat() {
    local prev=0
    local curr=0
    local cnt=13

    # Start from a clean state: without this a repeat=1 left over from an
    # earlier detection would survive the warning path below and could make
    # the caller report an error for a pattern that did not repeat.
    repeat=0

    while true
    do
        usleep "$scan_pulse"
        gpio_status=$(cat /sys/class/gpio/gpio"$gpio_Id"/value)
        prev=$curr
        curr=$gpio_status
        if [ "$prev" == 0 ] && [ "$curr" == 1 ]; then
            # pattern starts repeating; the caller checks whether the
            # previous and current patterns are the same
            repeat=1
            break
        fi
        if [ "$cnt" == 0 ]; then
            map_event_name
            echo "detected a warning from fault GPIO #$fault_gpio $socket, event $event_name"
            # pattern does not repeat, this is a warning: raise the warning flag
            touch "$warning_flag"
            break
        fi
        cnt=$(( cnt - 1 ))
    done
}
|
||||
|
||||
# Decode one fault pattern by counting falling edges on the fault GPIO until
# $blank_num consecutive low samples have been seen, then hand over to
# detect_patern_repeat.
# Globals: gpio_status (read/written), scan_pulse, blank_num, gpio_Id (read),
#          curr_pattern (written: number of falling edges counted)
detect_pattern() {
    local cnt_falling_edge=0
    local cnt_blank=0

    local prev=0
    local curr=0

    while true
    do
        prev=$curr
        curr=$gpio_status
        if [ "$prev" == 1 ] && [ "$curr" == 0 ]; then
            # falling edge: count it and restart the quiet-gap counter.
            # The "continue" re-evaluates the same low sample on the next
            # pass (as low/low) before a new sample is taken.
            cnt_falling_edge=$(( cnt_falling_edge + 1 ))
            cnt_blank=0
            continue
        elif [ "$prev" == 0 ] && [ "$curr" == 0 ]; then
            # check if we are in the quiet gap
            cnt_blank=$(( cnt_blank + 1 ))
            if [ "$cnt_blank" == "$blank_num" ]; then
                # echo "pattern number falling_edge=$cnt_falling_edge blank=$cnt_blank"
                curr_pattern=$cnt_falling_edge
                # all falling edges counted, now check whether the pattern repeats
                detect_patern_repeat
                break
            fi
        fi
        usleep "$scan_pulse"
        gpio_status=$(cat /sys/class/gpio/gpio"$gpio_Id"/value)
    done
}
|
||||
|
||||
# Export the GPIO numbered $gpio_Id through sysfs and configure it as input.
# Globals: gpio_Id (read)
# Note:    the argument passed by the caller is not used; the sysfs number
#          in $gpio_Id is what gets exported.
# Returns: non-zero if either sysfs write fails.
gpio_config_input() {
    # Without a successful export the direction file does not exist.
    echo "$gpio_Id" > /sys/class/gpio/export || return
    echo "in" > /sys/class/gpio/gpio"${gpio_Id}"/direction
}
|
||||
|
||||
# Resolve the sysfs number of the selected fault GPIO and make sure it is
# exported as an input.
# Globals: fault_gpio (read), gpio_Id (written)
# Returns: 1 if gpio_number produced no number; otherwise 0 when the GPIO is
#          already exported, or the status of gpio_config_input.
init_sysfs_fault_gpio() {
    gpio_Id=$(gpio_number "$fault_gpio")
    if [ -z "$gpio_Id" ]; then
        # An empty id would be written to the export file and used to build
        # /sys/class/gpio/gpio/value paths; report it instead.
        echo "cannot resolve sysfs number for fault GPIO #$fault_gpio" >&2
        return 1
    fi
    if [ -d /sys/class/gpio/gpio"$gpio_Id" ]; then
        # already exported
        return
    fi
    gpio_config_input "$fault_gpio"
}
|
||||
|
||||
# init
|
||||
# Select the fault GPIO for the requested socket. Anything other than 0 or 1
# is rejected instead of being silently treated as socket 1.
case "$socket" in
    0)
        fault_gpio=$S0_fault_gpio
        ;;
    1)
        socket1_status=$(gpioget 0 "$socket1_present")
        if [ "$socket1_status" == 1 ]; then
            echo "socket 1 not present"
            exit 1
        fi
        fault_gpio=$S1_fault_gpio
        ;;
    *)
        echo "Usage: $0 <socket 0/1>" >&2
        exit 1
        ;;
esac

init_sysfs_fault_gpio

# daemon start
while true
do
    # detect when pattern starts
    if [ "$gpio_status" == 1 ]; then
        # now, there is something on gpio, check if that is a pattern
        detect_pattern
        # an error is a pattern that started again and matches the
        # previously decoded one
        if [ "$repeat" == 1 ] && [ "$prev_pattern" == "$curr_pattern" ]; then
            map_event_name
            echo "detected an error from fault GPIO #$fault_gpio $socket, event#$curr_pattern $event_name"
            touch "$error_flag"
            repeat=0
        fi
        prev_pattern=$curr_pattern
        curr_pattern=0
        # gpio_status was refreshed while decoding; re-check it right away
        continue
    fi
    usleep "$duty_cycle"
    gpio_status=$(cat /sys/class/gpio/gpio"$gpio_Id"/value)

done

# only reached if the loop above is left unexpectedly
exit 1
|
||||
+180
@@ -0,0 +1,180 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script monitors fan, over-temperature, PSU, and CPU/SCP failures and updates the fault LED status
|
||||
|
||||
# shellcheck disable=SC2004
|
||||
# shellcheck disable=SC2046
|
||||
# shellcheck source=/dev/null
|
||||
|
||||
# common variables
|
||||
# Flag files whose presence signals a fault condition.
warning_fault_flag='/tmp/fault_warning'
error_fault_flag='/tmp/fault_err'
overtemp_fault_flag='/tmp/fault_overtemp'
fault_RAS_UE_flag='/tmp/fault_RAS_UE'

# Delay between the two LED writes of a blink, passed to usleep (microseconds).
blink_rate=100000

# Aggregated fault state, recomputed on every pass of the main loop.
fault="false"

# Values written to the LED group "Asserted" property.
on="true"
off="false"

gpio_fault="false"

# fan variables
fan_failed="false"
fan_failed_flag='/tmp/fan_failed'

# PSU variables
psu_failed="false"
psu_bus=6
psu0_addr=0x58
psu1_addr=0x59
status_word_cmd=0x79
# Following the PMBus Specification
# Bit[1]: CML faults
# Bit[2]: Over temperature faults
# Bit[3]: Under voltage faults
# Bit[4]: Over current faults
# Bit[5]: Over voltage fault
# Bit[10]: Fan faults
psu_fault_bitmask=0x43e

# led variables
led_service='xyz.openbmc_project.LED.GroupManager'
led_fault_path='/xyz/openbmc_project/led/groups/system_fault'
led_fault_interface='xyz.openbmc_project.Led.Group'
# LED state this script last requested.
fault_led_status=$off
|
||||
|
||||
# functions declaration
|
||||
# Set fan_failed to "true" when the fan failure flag file exists,
# "false" otherwise.
# Globals: fan_failed_flag (read), fan_failed (written)
check_fan_failed() {
    fan_failed="false"
    if [[ -f "$fan_failed_flag" ]]; then
        fan_failed="true"
    fi
}
|
||||
|
||||
# Set the "Asserted" property of the system fault LED group over D-Bus.
# Arguments: $1 - boolean value to write ("true" or "false")
# Globals:   led_service, led_fault_path, led_fault_interface (read)
# Returns:   exit status of busctl; its stdout is discarded.
turn_on_off_fault_led() {
    busctl set-property "$led_service" "$led_fault_path" \
        "$led_fault_interface" Asserted b "$1" >> /dev/null
}
|
||||
|
||||
# Helper: succeed only when the STATUS_WORD of the PSU at I2C address $1 can
# be read and none of the bits selected by $psu_fault_bitmask are set.
# Globals: psu_bus, status_word_cmd, psu_fault_bitmask (read)
_psu_status_ok() {
    local status_word

    status_word=$(i2cget -f -y "$psu_bus" "$1" "$status_word_cmd" w) || return 1
    # An empty or malformed reading must not reach the arithmetic below:
    # it would either be a syntax error or silently evaluate to 0.
    [[ "$status_word" =~ ^0[xX][0-9a-fA-F]+$ ]] || return 1
    (( (status_word & psu_fault_bitmask) == 0 ))
}

# Update psu_failed from the presence GPIOs and PMBus STATUS_WORD of both PSUs.
# A PSU counts as failed when its presence line does not read 0, when its
# status word cannot be read, or when any monitored fault bit is set.
# Globals: psu0_addr, psu1_addr (read), psu_failed (written)
check_psu_failed() {
    local psu0_presence
    local psu1_presence
    local psu0_failed="true"
    local psu1_failed="true"

    # gpiofind output is split on purpose into the arguments of gpioget
    psu0_presence=$(gpioget $(gpiofind PSU1_PRESENT))
    if [ "$psu0_presence" == "0" ] && _psu_status_ok "$psu0_addr"; then
        psu0_failed="false"
    fi

    psu1_presence=$(gpioget $(gpiofind PSU2_PRESENT))
    if [ "$psu1_presence" == "0" ] && _psu_status_ok "$psu1_addr"; then
        psu1_failed="false"
    fi

    if [ "$psu0_failed" == "true" ] || [ "$psu1_failed" == "true" ]; then
        psu_failed="true"
    else
        psu_failed="false"
    fi
}
|
||||
|
||||
# Combine the individual fault indicators into the single "fault" state.
# Globals: fan_failed, psu_failed, gpio_fault, RAS_UE_occured,
#          overtemp_occured (read); fault (written)
check_fault() {
    fault="false"
    if [[ "$fan_failed" == "true" || "$psu_failed" == "true" \
        || "$gpio_fault" == "true" \
        || "$RAS_UE_occured" == "true" \
        || "$overtemp_occured" == "true" ]]; then
        fault="true"
    fi
}
|
||||
|
||||
# Drive the fault LED so that it follows the aggregated "fault" state.
# The LED is only written when the requested state differs from the cached
# one, and the cache is only updated when the write succeeded, so a failed
# write is retried on the next call.
# Globals: fault, on, off (read); fault_led_status (read/written)
control_fault_led() {
    if [ "$fault" == "true" ]; then
        if [ "$fault_led_status" == "$off" ]; then
            if turn_on_off_fault_led "$on"; then
                fault_led_status=$on
            fi
        fi
    else
        if [ "$fault_led_status" == "$on" ]; then
            if turn_on_off_fault_led "$off"; then
                fault_led_status=$off
            fi
        fi
    fi
}
|
||||
|
||||
# Blink the fault LED once: switch it to the opposite of the cached state,
# wait $blink_rate microseconds, then switch it back.
# Globals: fault_led_status, on, off, blink_rate (read)
blink_fault_led() {
    local pulse=$off
    local rest=$on

    if [ "$fault_led_status" == "$off" ]; then
        pulse=$on
        rest=$off
    fi

    turn_on_off_fault_led "$pulse"
    usleep "$blink_rate"
    turn_on_off_fault_led "$rest"
}
|
||||
|
||||
# Evaluate the flag files for the fault GPIO.
# An existing error flag sets gpio_fault to "true". Otherwise gpio_fault is
# "false", and a pending warning flag is consumed by blinking the LED once
# and deleting the flag.
# Globals: error_fault_flag, warning_fault_flag (read); gpio_fault (written)
check_gpio_fault() {
    if [[ -f "$error_fault_flag" ]]; then
        gpio_fault="true"
        return
    fi

    if [[ -f "$warning_fault_flag" ]]; then
        blink_fault_led
        rm -f -- "$warning_fault_flag"
    fi
    gpio_fault="false"
}
|
||||
|
||||
# Set RAS_UE_occured to "true" (and log a message) when the RAS UE flag file
# exists, "false" otherwise.
# Globals: fault_RAS_UE_flag (read), RAS_UE_occured (written)
check_RAS_UE_occured() {
    if [[ ! -f "$fault_RAS_UE_flag" ]]; then
        RAS_UE_occured="false"
        return
    fi

    echo "RAS UE error occured, turn on fault LED"
    RAS_UE_occured="true"
}
|
||||
|
||||
# Set overtemp_occured to "true" (and log a message) when the
# over-temperature flag file exists, "false" otherwise.
# Globals: overtemp_fault_flag (read), overtemp_occured (written)
check_overtemp_occured() {
    if [[ ! -f "$overtemp_fault_flag" ]]; then
        overtemp_occured="false"
        return
    fi

    echo "Over temperature occured, turn on fault LED"
    overtemp_occured="true"
}
|
||||
|
||||
# daemon start
|
||||
# Poll every fault source, fold the results into "fault" and update the LED,
# once every two seconds.
while true
do
    check_gpio_fault
    check_fan_failed
    check_overtemp_occured
    check_RAS_UE_occured

    # Monitors PSU presence
    check_psu_failed

    check_fault
    control_fault_led
    sleep 2
done

# only reached if the loop above is left unexpectedly
exit 1
|
||||
Reference in New Issue
Block a user