1. 前言
限于作者能力水平,本文可能存在谬误,因此而给读者带来的损失,作者不做任何承诺。
2. 分析背景
本文分析基于 linux-4.14.132
内核代码分析,运行环境 Ubuntu 16.04.4 LTS + QEMU + ARM vexpress-a9
,rootfs
基于 ubuntu-base-16.04-core-armhf.tar.gz
制作。
3. soft lockup 机制
3.1 什么是 soft lockup ?
当某个进程一直占住当前 CPU ,其它进程无法在当前 CPU 上得到调度。soft lockup
在不同的抢占配置下,有着不同的情形。
3.2 各种抢占配置下的 soft lockup
3.2.1 CONFIG_PREEMPT_NONE
配置下的 soft lockup
CONFIG_PREEMPT_NONE 不支持内核态抢占,致力于提高吞吐量,通常用于服务器
。在 CONFIG_PREEMPT_NONE
配置下,包含死循环的进程,可导致 soft lockup
。
3.2.2 CONFIG_PREEMPT
配置下的 soft lockup
CONFIG_PREEMPT 支持内核态抢占,用于低延迟的桌面系统
。在CONFIG_PREEMPT
配置下,较长时间禁用抢占的进程,可导致 soft lockup
。
3.2.3 CONFIG_PREEMPT_VOLUNTARY
配置下的 soft lockup
CONFIG_PREEMPT_VOLUNTARY 不支持内核态抢占,通常用于桌面系统
。相对于配置 CONFIG_PREEMPT_NONE
的情形,在一些可能导致睡眠的代码路径上,插入了一些调度点,以降低延迟。
/*
 * include/linux/kernel.h (excerpt)
 *
 * With CONFIG_PREEMPT_VOLUNTARY, might_resched() expands to
 * _cond_resched(), so every might_sleep() annotation doubles as a
 * voluntary scheduling point. Otherwise might_resched() is a no-op.
 */
/* ... */
#ifdef CONFIG_PREEMPT_VOLUNTARY
extern int _cond_resched(void);
# define might_resched() _cond_resched()
#else
# define might_resched() do { } while (0)
#endif

#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
void ___might_sleep(const char *file, int line, int preempt_offset);
void __might_sleep(const char *file, int line, int preempt_offset);
/**
 * might_sleep - annotation for functions that can sleep
 *
 * this macro will print a stack trace if it is executed in an atomic
 * context (spinlock, irq-handler, ...).
 *
 * This is a useful debugging help to be able to catch problems early and not
 * be bitten later when the calling function happens to sleep when it is not
 * supposed to.
 */
# define might_sleep() \
	do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
# define sched_annotate_sleep() (current->task_state_change = 0)
#else
static inline void ___might_sleep(const char *file, int line,
				  int preempt_offset) { }
static inline void __might_sleep(const char *file, int line,
				 int preempt_offset) { }
# define might_sleep() do { might_resched(); } while (0)
# define sched_annotate_sleep() do { } while (0)
#endif

/* Conditional variant: only annotate (and possibly reschedule) when @cond holds. */
#define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0)
/* ... */
在 CONFIG_PREEMPT_VOLUNTARY
配置下,包含死循环的进程,可导致 soft lockup
。
3.3 soft lockup 的实现
3.3.1 创建 soft lockup watchdog 每 CPU 线程
为所有的 CPU 创建每 CPU 的 watchdog 内核线程,该内核线程每次被调度运行时更新一次 watchdog时间戳
;线程初始化(watchdog_enable())时还会启动一个周期性的高精度定时器。定时器每次到期时,先唤醒 watchdog 线程,再对比 watchdog时间戳
和 当前时间戳
,如果(当前时间戳 - watchdog时间戳 > 设定的soft lockup阈值)
,表示 watchdog 线程已经较长时间没有得到调度,即当前 CPU 上的当前进程已经占住 CPU 较长时间,于是报告一个 soft lockup
问题 。
/*
 * Initialization entry point of the lockup detector.
 * Excerpt: "..." marks statements elided by the article.
 */
void __init lockup_detector_init(void)
{
	...
	/* Seed the watchdog CPU mask with every possible CPU. */
	cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
	...
	/* Registers the per-CPU watchdog threads. */
	lockup_detector_setup();
}
/*
 * Descriptor of the per-CPU watchdog threads: where each CPU's task
 * pointer is stored, the thread name pattern, and the callbacks the
 * generic smpboot thread loop invokes.
 */
static struct smp_hotplug_thread watchdog_threads = {
	.store = &softlockup_watchdog,
	.thread_should_run = watchdog_should_run,
	.thread_fn = watchdog,
	.thread_comm = "watchdog/%u",
	.setup = watchdog_enable,
	.cleanup = watchdog_cleanup,
	.park = watchdog_disable,
	.unpark = watchdog_enable,
};

/*
 * Register the per-CPU watchdog threads and mark the soft lockup
 * detector threads as initialized.
 * Excerpt: "..." marks elided statements (the declaration of `ret`
 * is among the parts not shown).
 */
static __init void lockup_detector_setup(void)
{
	...
	lockup_detector_update_enable();
	...
	/*
	 * Create the per-CPU watchdog kernel threads:
	 * . each thread updates its CPU's watchdog timestamp;
	 * . a timer armed by the .setup callback (watchdog_enable())
	 *   checks how long ago the timestamp was last updated and
	 *   reports a soft lockup when that interval exceeds the
	 *   configured threshold.
	 */
	ret = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
						     &watchdog_allowed_mask);
	if (ret) {
		pr_err("Failed to initialize soft lockup detector threads\n");
		return;
	}

	/* Flag update and reconfiguration are done under watchdog_mutex. */
	mutex_lock(&watchdog_mutex);
	softlockup_threads_initialized = true;
	lockup_detector_reconfigure();
	mutex_unlock(&watchdog_mutex);
}
/*
 * Recompute the watchdog_enabled bitmask from the user-visible
 * switches. Excerpt: "..." marks elided statements.
 */
static void lockup_detector_update_enable(void)
{
	watchdog_enabled = 0;
	...
	/* The soft lockup detector is on only if the user enabled it. */
	if (soft_watchdog_user_enabled)
		watchdog_enabled |= SOFT_WATCHDOG_ENABLED;
}
/*
 * Create the hotplug thread described by @plug_thread on every online
 * CPU and unpark it on the CPUs contained in @cpumask.
 * Excerpt: "..." marks elided statements (declarations, error
 * handling and the return path are not shown).
 */
int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
					   const struct cpumask *cpumask)
{
	...
	for_each_online_cpu(cpu) {
		ret = __smpboot_create_thread(plug_thread, cpu); /* create the kernel thread for @cpu */
		...
		if (cpumask_test_cpu(cpu, cpumask))
			smpboot_unpark_thread(plug_thread, cpu); /* start the thread for @cpu; it runs smpboot_thread_fn() */
	}
	...
}
/*
 * Create (in parked state) the per-CPU thread described by @ht for @cpu
 * and publish its task pointer in ht->store.
 * Excerpt: "..." marks elided statements.
 * NOTE(review): no failure checks for kzalloc_node() or
 * kthread_create_on_cpu() appear in this excerpt; presumably they were
 * trimmed by the article - confirm against the kernel source.
 */
static int
__smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
{
	struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
	struct smpboot_thread_data *td;

	/* Per-thread bookkeeping, allocated on the memory node of @cpu. */
	td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu));
	td->cpu = cpu;
	td->ht = ht;

	/*
	 * All SMP hotplug threads share the entry point
	 * smpboot_thread_fn(), which in turn calls the thread-specific
	 * callbacks found in @ht.
	 */
	tsk = kthread_create_on_cpu(smpboot_thread_fn, td, cpu,
				    ht->thread_comm);
	kthread_park(tsk); /* keep the thread parked for now */
	get_task_struct(tsk);
	*per_cpu_ptr(ht->store, cpu) = tsk;
	...
	return 0;
}
3.3.2 运行 soft lockup watchdog 内核线程
/*
 * Common main loop of the per-CPU hotplug threads. @data is the
 * smpboot_thread_data set up at creation time; the thread-specific
 * behaviour comes from the callbacks in td->ht.
 *
 * Each iteration handles, in this order:
 *   - stop request:  run ->cleanup (if setup ever ran), free @td, return 0;
 *   - park request:  run ->park once, then park the thread;
 *   - first run:     run ->setup and become HP_THREAD_ACTIVE;
 *   - after unpark:  run ->unpark and become HP_THREAD_ACTIVE;
 *   - otherwise:     sleep if ->thread_should_run() says there is no
 *                    work, else call ->thread_fn().
 */
static int smpboot_thread_fn(void *data)
{
	struct smpboot_thread_data *td = data;
	struct smp_hotplug_thread *ht = td->ht; /* watchdog_threads */

	while (1) {
		/* Mark ourselves sleeping before testing any condition. */
		set_current_state(TASK_INTERRUPTIBLE);
		preempt_disable();
		if (kthread_should_stop()) {
			__set_current_state(TASK_RUNNING);
			preempt_enable();
			/* cleanup must mirror setup */
			if (ht->cleanup && td->status != HP_THREAD_NONE)
				ht->cleanup(td->cpu, cpu_online(td->cpu));
			kfree(td);
			return 0;
		}

		if (kthread_should_park()) {
			__set_current_state(TASK_RUNNING);
			preempt_enable();
			/* Only an active thread needs its ->park callback. */
			if (ht->park && td->status == HP_THREAD_ACTIVE) {
				BUG_ON(td->cpu != smp_processor_id());
				ht->park(td->cpu);
				td->status = HP_THREAD_PARKED;
			}
			kthread_parkme();
			/* We might have been woken for stop */
			continue;
		}

		/* From here on the thread must be running on its own CPU. */
		BUG_ON(td->cpu != smp_processor_id());

		/* Check for state change setup */
		switch (td->status) {
		case HP_THREAD_NONE:
			__set_current_state(TASK_RUNNING);
			preempt_enable();
			if (ht->setup)
				ht->setup(td->cpu); /* watchdog_enable(): initialize the watchdog timestamp and the timer that checks the update interval */
			td->status = HP_THREAD_ACTIVE;
			continue;

		case HP_THREAD_PARKED:
			__set_current_state(TASK_RUNNING);
			preempt_enable();
			if (ht->unpark)
				ht->unpark(td->cpu);
			td->status = HP_THREAD_ACTIVE;
			continue;
		}

		if (!ht->thread_should_run(td->cpu)) {
			/* Nothing to do: sleep until woken up. */
			preempt_enable_no_resched();
			schedule();
		} else {
			__set_current_state(TASK_RUNNING);
			preempt_enable();
			/* update the watchdog timestamp of the current CPU */
			ht->thread_fn(td->cpu); /* kernel/watchdog.c: watchdog() */
		}
	}
}
/*
 * Setup/unpark callback of the watchdog thread for @cpu: arm the
 * checking timer, seed the timestamp and raise the thread's priority.
 * NOTE(review): `hrtimer` is used without a visible declaration;
 * presumably the article dropped that line - confirm against the
 * kernel source.
 */
static void watchdog_enable(unsigned int cpu)
{
	/* start the timer that checks the timestamp update interval */
	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hrtimer->function = watchdog_timer_fn;
	hrtimer_start(hrtimer, ns_to_ktime(sample_period),
		      HRTIMER_MODE_REL_PINNED);

	__touch_watchdog(); /* set the initial watchdog timestamp */
	watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1); /* switch to the RT scheduling policy SCHED_FIFO */
}
/*
 * Body of the per-CPU watchdog thread (the ->thread_fn callback).
 * Copies this CPU's hrtimer_interrupts counter into
 * soft_lockup_hrtimer_cnt and refreshes the watchdog timestamp.
 */
static void watchdog(unsigned int cpu)
{
	__this_cpu_write(soft_lockup_hrtimer_cnt,
			 __this_cpu_read(hrtimer_interrupts));
	__touch_watchdog(); /* the watchdog thread refreshes this CPU's watchdog timestamp whenever it gets to run */
}
3.3.3 触发 soft lockup 问题报告
定时器到期后,进入 watchdog_timer_fn()
,触发可能的 soft lockup 问题报告:
/* watchdog kicker functions */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts); /* 读取当前CPU最近的更新的时间戳 */.../* kick the softlockup detector */wake_up_process(__this_cpu_read(softlockup_watchdog)); /* 唤醒当前CPU时间戳更新线程 *//* .. and repeat */hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));if (touch_ts == 0) { /* 休眠重启等类似情形下,时间戳复位为0 */...__touch_watchdog(); /* 重新初始化时间戳 */return HRTIMER_RESTART; /* 重启定时器 */}duration = is_softlockup(touch_ts); /* 上次更新的时间戳和当前时间戳的差值 */if (unlikely(duration)) { /* 时间戳差值大于设定的阈值 */.../* 报告 sotf lockup 问题 */pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",smp_processor_id(), duration,current->comm, task_pid_nr(current));__this_cpu_write(softlockup_task_ptr_saved, current);print_modules();print_irqtrace_events(current);if (regs)show_regs(regs);elsedump_stack();...add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);if (softlockup_panic)panic("softlockup: hung tasks");__this_cpu_write(soft_watchdog_warn, true);} else__this_cpu_write(soft_watchdog_warn, false);return HRTIMER_RESTART;
}
/*
 * Check whether the watchdog timestamp @touch_ts is older than the
 * soft lockup threshold.
 *
 * Returns the elapsed time (now - touch_ts) when the detector is
 * enabled, watchdog_thresh is non-zero and the threshold has been
 * exceeded; returns 0 otherwise.
 */
static int is_softlockup(unsigned long touch_ts)
{
	unsigned long now = get_timestamp();

	if ((watchdog_enabled & SOFT_WATCHDOG_ENABLED) && watchdog_thresh) {
		/* Warn about unreasonable delays. */
		/* the timestamp has not been updated for longer than the threshold */
		if (time_after(now, touch_ts + get_softlockup_thresh()))
			return now - touch_ts;
	}
	return 0;
}
4. soft lockup 举例
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/kthread.h>
#include <linux/delay.h>

/* Kernel thread that hogs a CPU; NULL while no thread is running. */
static struct task_struct *softlockup_task;

/*
 * Thread body: keep the CPU busy until kthread_stop() is called.
 *
 * Without kernel preemption (CONFIG_PREEMPT_NONE / CONFIG_PREEMPT_VOLUNTARY)
 * a plain busy loop is enough to starve the watchdog thread. With
 * CONFIG_PREEMPT the loop could be preempted, so preemption is disabled
 * around a 30 second busy-wait instead.
 *
 * Always returns 0.
 */
static int softlockup_task_fn(void *ignored)
{
	while (!kthread_should_stop()) {
#if defined(CONFIG_PREEMPT_NONE) || defined(CONFIG_PREEMPT_VOLUNTARY)
		asm("nop");
#else /* CONFIG_PREEMPT */
		preempt_disable();
		mdelay(30 * 1000);
		preempt_enable();
#endif
	}

	return 0;
}

/*
 * Module init: start the CPU-hogging thread.
 *
 * Returns 0 on success or the negative errno from kthread_run().
 * On failure softlockup_task stays NULL and no "loaded" message is
 * printed.
 */
static int __init softlockup_task_demo_init(void)
{
	struct task_struct *task;

	task = kthread_run(softlockup_task_fn, NULL, "softlockup_task");
	if (IS_ERR(task)) {
		int ret = PTR_ERR(task);

		printk(KERN_ERR "%s: Failed to create kernel thread, ret = [%d]\n", __func__, ret);
		return ret;
	}

	/* Publish the pointer only once it is known to be a valid task. */
	softlockup_task = task;
	printk(KERN_INFO "soft lockup task example module loaded.\n");

	return 0;
}

/* Module exit: stop the thread (if any) and wait for it to finish. */
static void __exit softlockup_task_demo_exit(void)
{
	if (softlockup_task) {
		kthread_stop(softlockup_task);
		softlockup_task = NULL;
	}

	printk(KERN_INFO "soft lockup task example module exited.\n");
}

module_init(softlockup_task_demo_init);
module_exit(softlockup_task_demo_exit);

MODULE_LICENSE("GPL");
这里包含配置编译的完整代码。编译测试模块:
#
# 配置
## 抢占模式 3 选一
CONFIG_PREEMPT_NONE=y
#CONFIG_PREEMPT_VOLUNTARY=y
#CONFIG_PREEMPT=y
CONFIG_DEBUG_KERNEL=y
CONFIG_LOCKUP_DETECTOR=y
CONFIG_SOFTLOCKUP_DETECTOR=y
CONFIG_SAMPLE_SOFTLOCKUP=m
cd linux-4.14.132
make ARCH=arm CROSS_COMPILE=arm-linux-gnueabi- -j8 O=output
sudo mount rootfs.img temp
cd linux-4.14.132
sudo make ARCH=arm CROSS_COMPILE=arm-linux-gnueabi- O=output INSTALL_MOD_PATH=/path/to/temp modules_install
cd -
sudo umount temp
然后用QEMU
启动系统,加载测试模块:
sudo qemu-system-arm \
	-M vexpress-a9 \
	-smp 4 \
	-m 512M \
	-kernel /path/to/zImage \
	-dtb /path/to/vexpress-v2p-ca9.dtb \
	-nographic \
	-append "root=/dev/mmcblk0 rw rootfstype=ext4 console=ttyAMA0" \
	-sd rootfs.img
# 当前位于 QEMU 模拟器系统特权模式下
# modprobe softlockup_example
模块加载一段时间后,报 soft lockup BUG:
[ 157.356865] soft lockup task example module loaded.
[ 176.383362] INFO: rcu_sched self-detected stall on CPU
[ 176.384981] 1-...: (2113 ticks this GP) idle=a42/140000000000001/0 softirq=2326/2326 fqs=1019
[ 176.385513] (t=2100 jiffies g=903 c=902 q=636)
[ 176.387315] NMI backtrace for cpu 1
[ 176.387904] CPU: 1 PID: 939 Comm: softlockup_task Not tainted 4.14.132 #34
[ 176.387942] Hardware name: ARM-Versatile Express
[ 176.391912] [<8011149c>] (unwind_backtrace) from [<8010c330>] (show_stack+0x20/0x24)
[ 176.393289] [<8010c330>] (show_stack) from [<806ddfd8>] (dump_stack+0x8c/0xa0)
[ 176.393356] [<806ddfd8>] (dump_stack) from [<806e3dc4>] (nmi_cpu_backtrace+0xc0/0xc4)
[ 176.393417] [<806e3dc4>] (nmi_cpu_backtrace) from [<806e3eb0>] (nmi_trigger_cpumask_backtrace+0xe8/0x12c)
[ 176.393474] [<806e3eb0>] (nmi_trigger_cpumask_backtrace) from [<8010f490>] (arch_trigger_cpumask_backtrace+0x20/0x24)
[ 176.393534] [<8010f490>] (arch_trigger_cpumask_backtrace) from [<80182ea0>] (rcu_dump_cpu_stacks+0xac/0xd8)
[ 176.393672] [<80182ea0>] (rcu_dump_cpu_stacks) from [<80182478>] (rcu_check_callbacks+0x7f8/0x9f8)
[ 176.393725] [<80182478>] (rcu_check_callbacks) from [<80187f84>] (update_process_times+0x44/0x6c)
[ 176.393775] [<80187f84>] (update_process_times) from [<80197144>] (tick_periodic+0x4c/0xcc)
[ 176.393828] [<80197144>] (tick_periodic) from [<80197368>] (tick_handle_periodic+0x38/0x98)
[ 176.393877] [<80197368>] (tick_handle_periodic) from [<8010ffe4>] (twd_handler+0x40/0x50)
[ 176.393924] [<8010ffe4>] (twd_handler) from [<80172128>] (handle_percpu_devid_irq+0x98/0x24c)
[ 176.393979] [<80172128>] (handle_percpu_devid_irq) from [<8016c728>] (generic_handle_irq+0x34/0x44)
[ 176.394031] [<8016c728>] (generic_handle_irq) from [<8016cd3c>] (__handle_domain_irq+0x6c/0xc4)
[ 176.394125] [<8016cd3c>] (__handle_domain_irq) from [<80101508>] (gic_handle_irq+0x5c/0xa0)
[ 176.394166] [<80101508>] (gic_handle_irq) from [<8010d10c>] (__irq_svc+0x6c/0x90)
[ 176.394240] Exception stack(0x9e90df18 to 0x9e90df60)
[ 176.394533] df00: 00000000 9e983bc0
[ 176.394835] df20: 00000000 9e983bc0 9e983bc0 00000000 9f5fc940 9e90c000 00000000 9e983bdc
[ 176.394945] df40: 9e8fdd30 9e90df74 9e90df68 9e90df68 7f000020 7f000020 00000013 ffffffff
[ 176.396350] [<8010d10c>] (__irq_svc) from [<7f000020>] (softlockup_task_fn+0x20/0x30 [softlockup_example])
[ 176.396667] [<7f000020>] (softlockup_task_fn [softlockup_example]) from [<80143ae0>] (kthread+0x144/0x174)
[ 176.396718] [<80143ae0>] (kthread) from [<80107ee8>] (ret_from_fork+0x14/0x2c)
[ 204.214887] watchdog: BUG: soft lockup - CPU#1 stuck for 23s! [softlockup_task:939]
[ 204.215332] Modules linked in: softlockup_example
[ 204.215593] CPU: 1 PID: 939 Comm: softlockup_task Not tainted 4.14.132 #34
[ 204.215604] Hardware name: ARM-Versatile Express
[ 204.215648] task: 9f700c00 task.stack: 9e90c000
[ 204.215701] PC is at kthread_should_stop+0x30/0x54
[ 204.215736] LR is at softlockup_task_fn+0x20/0x30 [softlockup_example]
[ 204.215755] pc : [<80143524>] lr : [<7f000020>] psr: 00000013
[ 204.215770] sp : 9e90df50 ip : 9e90df68 fp : 9e90df64
[ 204.215785] r10: 9e8fdd30 r9 : 9e983bdc r8 : 00000000
[ 204.215804] r7 : 9e90c000 r6 : 9f5fc940 r5 : 00000000 r4 : 9f700c00
[ 204.215821] r3 : 00208040 r2 : 00000000 r1 : 9e983bc0 r0 : 00000000
[ 204.215875] Flags: nzcv IRQs on FIQs on Mode SVC_32 ISA ARM Segment none
[ 204.215899] Control: 10c5387d Table: 7e92806a DAC: 00000051
[ 204.215927] CPU: 1 PID: 939 Comm: softlockup_task Not tainted 4.14.132 #34
[ 204.215936] Hardware name: ARM-Versatile Express
[ 204.215979] [<8011149c>] (unwind_backtrace) from [<8010c330>] (show_stack+0x20/0x24)
[ 204.216011] [<8010c330>] (show_stack) from [<806ddfd8>] (dump_stack+0x8c/0xa0)
[ 204.216039] [<806ddfd8>] (dump_stack) from [<80108acc>] (show_regs+0x1c/0x20)
[ 204.216067] [<80108acc>] (show_regs) from [<801b5f08>] (watchdog_timer_fn+0x248/0x2c0)
[ 204.216095] [<801b5f08>] (watchdog_timer_fn) from [<80188f48>] (hrtimer_run_queues+0x1b8/0x370)
[ 204.216147] [<80188f48>] (hrtimer_run_queues) from [<80187f14>] (run_local_timers+0x24/0x50)
[ 204.216182] [<80187f14>] (run_local_timers) from [<80187f7c>] (update_process_times+0x3c/0x6c)
[ 204.216207] [<80187f7c>] (update_process_times) from [<80197144>] (tick_periodic+0x4c/0xcc)
[ 204.216232] [<80197144>] (tick_periodic) from [<80197368>] (tick_handle_periodic+0x38/0x98)
[ 204.216256] [<80197368>] (tick_handle_periodic) from [<8010ffe4>] (twd_handler+0x40/0x50)
[ 204.216282] [<8010ffe4>] (twd_handler) from [<80172128>] (handle_percpu_devid_irq+0x98/0x24c)
[ 204.216312] [<80172128>] (handle_percpu_devid_irq) from [<8016c728>] (generic_handle_irq+0x34/0x44)
[ 204.216338] [<8016c728>] (generic_handle_irq) from [<8016cd3c>] (__handle_domain_irq+0x6c/0xc4)
[ 204.216363] [<8016cd3c>] (__handle_domain_irq) from [<80101508>] (gic_handle_irq+0x5c/0xa0)
[ 204.216385] [<80101508>] (gic_handle_irq) from [<8010d10c>] (__irq_svc+0x6c/0x90)
[ 204.216399] Exception stack(0x9e90df00 to 0x9e90df48)
[ 204.216511] df00: 00000000 9e983bc0 00000000 00208040 9f700c00 00000000 9f5fc940 9e90c000
[ 204.216617] df20: 00000000 9e983bdc 9e8fdd30 9e90df64 9e90df68 9e90df50 7f000020 80143524
[ 204.216657] df40: 00000013 ffffffff
[ 204.216697] [<8010d10c>] (__irq_svc) from [<80143524>] (kthread_should_stop+0x30/0x54)
[ 204.216730] [<80143524>] (kthread_should_stop) from [<7f000020>] (softlockup_task_fn+0x20/0x30 [softlockup_example])
[ 204.216766] [<7f000020>] (softlockup_task_fn [softlockup_example]) from [<80143ae0>] (kthread+0x144/0x174)
[ 204.216791] [<80143ae0>] (kthread) from [<80107ee8>] (ret_from_fork+0x14/0x2c)
5. soft lockup 用户空间接口
/proc/sys/kernel/soft_watchdog # 是否启用 softlockup 功能
/proc/sys/kernel/softlockup_panic # lockup是否导致kernel panic
/proc/sys/kernel/softlockup_all_cpu_backtrace # 是否不只输出lockup CPU调用栈,还输出其它CPU调用栈
/proc/sys/kernel/watchdog_thresh # lockup 检测时间阈值(秒);soft lockup 的触发阈值为 2 * watchdog_thresh
/proc/sys/kernel/watchdog_cpumask # 启用 softlockup 检测的CPU掩码