ARM cortex-M上面的fault想必大家都不陌生,我相信还没有谁从来没有出现过fault。
但出现fault后如何排查,相信很多人都是一筹莫展。
在我的项目中加了一些代码,Fault 后可以打印出更多的信息。
/*
 * Read-only views of the Cortex-M System Control Block fault registers.
 * Every macro dereferences a volatile pointer to the register's fixed
 * address, so each use performs a fresh read of the hardware register.
 */
#define SCB_CFSR      (*(volatile const unsigned *)0xE000ED28UL)       /* Configurable Fault Status Register */
#define SCB_HFSR      (*(volatile const unsigned *)0xE000ED2CUL)       /* HardFault Status Register */
#define SCB_MMAR      (*(volatile const unsigned *)0xE000ED34UL)       /* MemManage Fault Address register */
#define SCB_BFAR      (*(volatile const unsigned *)0xE000ED38UL)       /* Bus Fault Address Register */
/* Byte / halfword sub-fields located inside SCB_CFSR. */
#define SCB_CFSR_MFSR (*(volatile const unsigned char *)0xE000ED28UL)  /* Memory-management Fault Status Register */
#define SCB_CFSR_BFSR (*(volatile const unsigned char *)0xE000ED29UL)  /* Bus Fault Status Register */
#define SCB_CFSR_UFSR (*(volatile const unsigned short *)0xE000ED2AUL) /* Usage Fault Status Register */
/**
 * Print the Usage Fault Status Register and the name of every flag set in it.
 *
 * Output via rt_kprintf: a "usage fault:" header line, the raw register
 * value, the space-separated names of the set flags, then a newline.
 */
static void usage_fault_track(void)
{
    /* Bit position of each decoded flag, paired with the text printed for it. */
    static const struct {
        unsigned char bit;
        const char   *label;
    } flags[] = {
        { 0, "UNDEFINSTR " },
        { 1, "INVSTATE " },
        { 2, "INVPC " },
        { 3, "NOCP " },
        { 8, "UNALIGNED " },
        { 9, "DIVBYZERO " },
    };
    unsigned int idx;

    rt_kprintf("usage fault:\n");
    rt_kprintf("SCB_CFSR_UFSR:0x%02X ", SCB_CFSR_UFSR);

    /* The register is read again for every flag test, in ascending bit order. */
    for (idx = 0; idx < sizeof(flags) / sizeof(flags[0]); idx++) {
        if (SCB_CFSR_UFSR & (1 << flags[idx].bit)) {
            rt_kprintf("%s", flags[idx].label);
        }
    }

    rt_kprintf("\n");
}
/**
 * Print the Bus Fault Status Register, the name of every flag set in it and,
 * when bit 7 is set, the Bus Fault Address Register.
 *
 * Output via rt_kprintf: a "bus fault:" header line, the raw register value,
 * the space-separated flag names, and finally either the fault address or a
 * bare newline.
 */
static void bus_fault_track(void)
{
    /* Bit position of each decoded flag, paired with the text printed for it. */
    static const struct {
        unsigned char bit;
        const char   *label;
    } flags[] = {
        { 0, "IBUSERR " },
        { 1, "PRECISERR " },
        { 2, "IMPRECISERR " },
        { 3, "UNSTKERR " },
        { 4, "STKERR " },
    };
    unsigned int idx;

    rt_kprintf("bus fault:\n");
    rt_kprintf("SCB_CFSR_BFSR:0x%02X ", SCB_CFSR_BFSR);

    /* The register is read again for every flag test, in ascending bit order. */
    for (idx = 0; idx < sizeof(flags) / sizeof(flags[0]); idx++) {
        if (SCB_CFSR_BFSR & (1 << flags[idx].bit)) {
            rt_kprintf("%s", flags[idx].label);
        }
    }

    /* Bit 7 (BFARVALID in the ARMv7-M layout) gates printing of SCB_BFAR. */
    if (SCB_CFSR_BFSR & (1 << 7)) {
        rt_kprintf("SCB->BFAR:%08X\n", SCB_BFAR);
    } else {
        rt_kprintf("\n");
    }
}
/**
 * Print the Memory-management Fault Status Register, the name of every flag
 * set in it and, when bit 7 is set, the MemManage Fault Address register.
 *
 * Output via rt_kprintf: a "mem manage fault:" header line, the raw register
 * value, the space-separated flag names, and finally either the fault address
 * or a bare newline.
 */
static void mem_manage_fault_track(void)
{
    /* Bit position of each decoded flag, paired with the text printed for it. */
    static const struct {
        unsigned char bit;
        const char   *label;
    } flags[] = {
        { 0, "IACCVIOL " },
        { 1, "DACCVIOL " },
        { 3, "MUNSTKERR " },
        { 4, "MSTKERR " },
    };
    unsigned int idx;

    rt_kprintf("mem manage fault:\n");
    rt_kprintf("SCB_CFSR_MFSR:0x%02X ", SCB_CFSR_MFSR);

    /* The register is read again for every flag test, in ascending bit order. */
    for (idx = 0; idx < sizeof(flags) / sizeof(flags[0]); idx++) {
        if (SCB_CFSR_MFSR & (1 << flags[idx].bit)) {
            rt_kprintf("%s", flags[idx].label);
        }
    }

    /* Bit 7 (MMARVALID in the ARMv7-M layout) gates printing of SCB_MMAR. */
    if (SCB_CFSR_MFSR & (1 << 7)) {
        rt_kprintf("SCB->MMAR:%08X\n", SCB_MMAR);
    } else {
        rt_kprintf("\n");
    }
}
/**
 * Decode the HardFault Status Register.
 *
 * Reports a failed vector fetch (bit 1), dispatches to the bus, memory
 * management and usage fault decoders when bit 30 is set and the matching
 * status field is non-zero, and reports a debug event (bit 31).
 */
static void hard_fault_track(void)
{
    /* Bit positions tested in SCB_HFSR (names follow the ARMv7-M layout). */
    enum {
        HFSR_BIT_VECTTBL  = 1,
        HFSR_BIT_FORCED   = 30,
        HFSR_BIT_DEBUGEVT = 31
    };

    if ((SCB_HFSR & (1UL << HFSR_BIT_VECTTBL)) != 0) {
        rt_kprintf("failed vector fetch\n");
    }

    if ((SCB_HFSR & (1UL << HFSR_BIT_FORCED)) != 0) {
        /* Each status field is decoded only when it reports at least one flag. */
        if (SCB_CFSR_BFSR != 0) {
            bus_fault_track();
        }

        if (SCB_CFSR_MFSR != 0) {
            mem_manage_fault_track();
        }

        if (SCB_CFSR_UFSR != 0) {
            usage_fault_track();
        }
    }

    if ((SCB_HFSR & (1UL << HFSR_BIT_DEBUGEVT)) != 0) {
        rt_kprintf("debug event\n");
    }
}
/**
 * Hard fault report: dump the saved registers, decode the fault status
 * registers, name the current thread, then halt.
 *
 * @param contex  saved register context; its psr, pc, lr, r12 and r3..r0
 *                members are printed in that order.
 *
 * This function never returns: it ends in an endless loop.
 */
void rt_hw_hard_fault_exception(struct stack_context* contex)
{
    /* Saved registers. */
    rt_kprintf("psr: 0x%08x\n", contex->psr);
    rt_kprintf(" pc: 0x%08x\n", contex->pc);
    rt_kprintf(" lr: 0x%08x\n", contex->lr);
    rt_kprintf("r12: 0x%08x\n", contex->r12);
    rt_kprintf("r03: 0x%08x\n", contex->r3);
    rt_kprintf("r02: 0x%08x\n", contex->r2);
    rt_kprintf("r01: 0x%08x\n", contex->r1);
    rt_kprintf("r00: 0x%08x\n", contex->r0);

    /* Fault status registers. */
    hard_fault_track();

    rt_kprintf("hard fault on thread: %s\n", rt_current_thread->name);

#ifdef RT_USING_FINSH
    /* With the shell built in, also print the thread list. */
    list_thread();
#endif

    /* Stay here so the state can be inspected with a debugger. */
    for (;;) {
    }
}
再写了两个测试代码,以手动触发fault。
/**
 * Deliberately trigger a divide-by-zero fault.
 *
 * Sets bit 4 of the register at 0xE000ED14 (SCB->CCR; DIV_0_TRP in the
 * ARMv7-M layout) and then divides 10 by 0 at run time. If the trap does not
 * fire, the quotient is printed.
 */
void div0_test(void)
{
    volatile int *ccr = (volatile int *)0xE000ED14; /* SCB->CCR */
    /*
     * The operands are volatile so the compiler has to load them and carry
     * out the division at run time. With plain locals it may fold or drop
     * the constant 10 / 0, and the hardware trap would never be exercised.
     */
    volatile int x;
    volatile int y;
    int z;

    *ccr |= (1 << 4); /* bit4: trap on divide by zero */

    x = 10;
    y = 0;
    z = x / y;
    rt_kprintf("z:%d\n", z);
}
/**
 * Deliberately trigger an unaligned-access fault.
 *
 * Sets bit 3 (UNALIGN_TRP) of SCB->CCR, then performs three int-sized reads
 * at addresses 0x00, 0x04 and 0x03, printing the address and value after
 * each read. Only the last address is not a multiple of 4, so that read is
 * the one expected to trap.
 */
void unalign_test(void)
{
volatile int * SCB_CCR = (volatile int *)0xE000ED14; // SCB->CCR
volatile int * p;
volatile int value;
/* Enable trapping of unaligned accesses before doing any of the reads. */
*SCB_CCR |= (1 << 3); /* bit3: UNALIGN_TRP. */
/* Aligned read at address 0x00. */
p = (int *)0x00;
value = *p;
rt_kprintf("addr:0x%02X value:0x%08X\n", (int)p, value);
/* Aligned read at address 0x04. */
p = (int *)0x04;
value = *p;
rt_kprintf("addr:0x%02X value:0x%08X\n", (int)p, value);
/* Unaligned read at address 0x03: this is the access meant to fault. */
p = (int *)0x03;
value = *p;
/* Reached only if the unaligned read above did not trap. */
rt_kprintf("addr:0x%02X value:0x%08X\n", (int)p, value);
}
/*
 * When the finsh shell is enabled, export both fault tests so they can be
 * invoked from the shell prompt, e.g. `div0_test()`.
 * NOTE(review): the second macro argument is presumably the description
 * text shown by the shell -- confirm against finsh.h.
 */
#ifdef RT_USING_FINSH
#include <finsh.h>
FINSH_FUNCTION_EXPORT(div0_test, div0_test)
FINSH_FUNCTION_EXPORT(unalign_test, unalign_test)
#endif /* RT_USING_FINSH */
测试访问未授权区域
finsh>>int * p //声明一个指针变量
0, 0x00000000
finsh>>p = 0xDFFFFFF0 // 指向外部设备区(0xA0000000–0xDFFFFFFF)的末尾,该处通常没有映射任何设备,所以一般不可访问。
-536870928, 0xdffffff0
finsh>>*p // 读取指针处数据
psr: 0x01000000
pc: 0x00000e3e
lr: 0x0000451d
r12: 0x00000000
r03: 0x00000000
r02: 0x1fff0180
r01: 0x1fff0814
r00: 0xdffffff0
bus fault:
SCB_CFSR_BFSR:0x82 PRECISERR SCB->BFAR:DFFFFFF0
hard fault on thread: tshell
thread pri status sp stack size max used left tick error
tidle 0x1f ready 0x00000040 0x00000100 0x00000060 0x00000015 000
tshell 0x14 ready 0x00000088 0x00000400 0x00000218 0x00000009 000
非对齐访问测试
finsh>>unalign_test()
addr:0x00 value:0x20001B80
addr:0x04 value:0x0800DE81
psr: 0x21000000
r00: 0x00000000
r01: 0x40013800
r02: 0x20000690
r03: 0x00000000
r04: 0x00000003
r05: 0xe000ed14
r06: 0xdeadbeef
r07: 0x20002678
r08: 0xdeadbeef
r09: 0xdeadbeef
r10: 0xdeadbeef
r11: 0xdeadbeef
r12: 0x08000a95
lr: 0x08002eb5
pc: 0x08000386
usage fault:
SCB_CFSR_UFSR:0x100 UNALIGNED
hard fault on thread: tshell
thread pri status sp stack size max used left tick error
tidle 0x1f ready 0x00000040 0x00000100 0x0000005c 0x00000009 000
tshell 0x14 ready 0x00000088 0x00000800 0x000001b0 0x0000000a 000
led 0x14 suspend 0x00000078 0x00000200 0x00000078 0x00000005 000
除零异常测试
finsh>>div0_test()
psr: 0x41000000
r00: 0x00000010
r01: 0x08000337
r02: 0x20000bb7
r03: 0x20000130
r04: 0xe000ed14
r05: 0x00000000
r06: 0xdeadbeef
r07: 0x0000000a
r08: 0xdeadbeef
r09: 0xdeadbeef
r10: 0xdeadbeef
r11: 0xdeadbeef
r12: 0x00000000
lr: 0x08008d91
pc: 0x08000348
usage fault:
SCB_CFSR_UFSR:0x200 DIVBYZERO
hard fault on thread: tshell
thread pri status sp stack size max used left tick error
tidle 0x1f ready 0x00000058 0x00000100 0x0000005c 0x0000000f 000
tshell 0x14 ready 0x00000088 0x00000800 0x000001b0 0x0000000a 000
led 0x14 suspend 0x00000078 0x00000200 0x00000078 0x00000005 000
问题追踪
上面测试中出现的问题,是我们人为设置的故障,但在平时调试中出了问题该如何追踪呢?
以非对齐访问为例,开发环境使用MDK。
进入JTAG仿真状态,并触发非对齐异常。
此时串口会打印出异常时的寄存器值,此时停止仿真器发现程序停在rt_hw_hard_fault_exception中。
根据上面打印出来的寄存器,提取出关键值是 pc: 0x08000386
我们在MDK的command窗口中输入 pc = 0x08000386
可以把PC指针临时设回问题发生时的场景,我们看到出现问题的指令是
239: p = (int *)0x03;
0x08000384 2403 MOVS r4,#0x03
240: value = *p;
0x08000386 6820 LDR r0,[r4,#0x00]
0x08000388 9000 STR r0,[sp,#0x00]
分析 0x08000386 这条指令:它从 R4+0 的地址读取 4 字节到 R0 中,
先前打印出的R4的值为 r04: 0x00000003
因此可以确定为这是因为地址不对齐造成的。
当然,具体情况要具体分析,有时候某个步骤出现问题并不会马上崩溃,
而是过一段时间以后才出问题,因此要结合上下文综合分析。
比如上面这个案例中,触发异常的是 0x08000386 处的 LDR 指令,但真正有问题的是 0x08000384 处把非对齐地址 0x03 写入 R4 的那条指令。
原作者:aozima