Hibernate (kernel 5.10)

Hibernate (kernel 5.10)

state_store

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
/*
 * state_store - store handler for the "state" kobject attribute.
 * @kobj: kobject the attribute belongs to (unused here).
 * @attr: attribute being written (unused here).
 * @buf:  text written by the caller, decoded by decode_state().
 * @n:    number of bytes in @buf.
 *
 * Takes the autosleep lock, refuses the request with -EBUSY while autosleep
 * is active, and otherwise dispatches on the decoded state: anything below
 * PM_SUSPEND_MAX goes to pm_suspend(), PM_SUSPEND_MAX itself goes to
 * hibernate(), and anything else is rejected with -EINVAL.
 *
 * For reference, the state values are:
 *   PM_SUSPEND_ON      0
 *   PM_SUSPEND_TO_IDLE 1  (also PM_SUSPEND_MIN)
 *   PM_SUSPEND_STANDBY 2
 *   PM_SUSPEND_MEM     3
 *   PM_SUSPEND_MAX     4
 *
 * Returns @n on success or a negative errno.
 */
static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
			   const char *buf, size_t n)
{
	suspend_state_t target;
	int ret;

	ret = pm_autosleep_lock();
	if (ret)
		return ret;

	if (pm_autosleep_state() > PM_SUSPEND_ON) {
		/* Autosleep is in control; do not start a manual transition. */
		ret = -EBUSY;
	} else {
		target = decode_state(buf, n);

		if (target == PM_SUSPEND_MAX) {
			/* Hibernation path (called "S4" in the article). */
			ret = hibernate();
		} else if (target > PM_SUSPEND_MAX) {
			ret = -EINVAL;
		} else {
			/* "mem" is remapped to the currently selected mem_sleep variant. */
			if (target == PM_SUSPEND_MEM)
				target = mem_sleep_current;

			/* Suspend path (called "S3" in the article). */
			ret = pm_suspend(target);
		}
	}

	pm_autosleep_unlock();
	return ret ? ret : n;
}

hibernate

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
/**
 * hibernate - Carry out system hibernation, including saving the image.
 *
 * Sequence, as visible below: check availability, take the system sleep lock
 * and the snapshot device, notify PM listeners, sync, freeze user space,
 * create the basic memory bitmaps, take the snapshot, and (on the
 * image-creation path) write the image and power down. Every step that
 * fails jumps to the label that undoes exactly what has been set up so far.
 *
 * Return: -EPERM when hibernation is not available, -EBUSY when the snapshot
 * device cannot be acquired, otherwise the error code of the last step
 * executed (0 on success).
 */
int hibernate(void)
{
bool snapshot_test = false;
int error;

if (!hibernation_available()) { // hibernation is not permitted on this system right now
pm_pr_dbg("Hibernation not available.\n");
return -EPERM;
}

lock_system_sleep(); // serialise against other system-wide sleep transitions
/* The snapshot device should not be opened while we're running */
if (!hibernate_acquire()) { // already in use by someone else: give up and drop the lock
error = -EBUSY;
goto Unlock;
}

pr_info("hibernation entry\n");
pm_prepare_console(); // switch the console for the transition
// Run the PM notifier chain: PM_HIBERNATION_PREPARE now, PM_POST_HIBERNATION as the rollback event
error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION);
if (error)
goto Restore;

ksys_sync_helper(); // flush dirty data to disk before freezing anything

error = freeze_processes(); // freeze user space tasks
if (error)
goto Exit;

lock_device_hotplug(); // keep device hotplug from running during the transition
/* Allocate memory management structures */
error = create_basic_memory_bitmaps(); // bitmaps describing which pages must not be saved / are free
if (error)
goto Thaw;

error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); // quiesce devices and create the image
if (error || freezer_test_done) // freezer_test_done: this was only a freezer test, skip writing the image
goto Free_bitmaps;

// in_suspend is nonzero on the image-creation path; it is zero when control
// came back here from a restored image (see the message in the else branch).
if (in_suspend) {
unsigned int flags = 0;

if (hibernation_mode == HIBERNATION_PLATFORM)
flags |= SF_PLATFORM_MODE; // record that platform mode was used
if (nocompress)
flags |= SF_NOCOMPRESS_MODE; // write the image uncompressed
else
flags |= SF_CRC32_MODE; // compressed image, with CRC32 checking

pm_pr_dbg("Writing hibernation image.\n");
error = swsusp_write(flags); // write the image to swap
swsusp_free(); // release the image memory, no longer needed
if (!error) {
if (hibernation_mode == HIBERNATION_TEST_RESUME)
snapshot_test = true;
else
power_down(); // shut the machine down
}
// Reached on write failure or in TEST_RESUME mode: reset state
in_suspend = 0;
pm_restore_gfp_mask(); // undo the GFP restriction applied during the snapshot
} else {
pm_pr_dbg("Hibernation image restored successfully.\n");
}

Free_bitmaps:
free_basic_memory_bitmaps(); // release the bitmaps created above
Thaw:
unlock_device_hotplug(); // allow device hotplug again
if (snapshot_test) {
pm_pr_dbg("Checking hibernation image\n");
error = swsusp_check(); // verify that the image just written is valid
if (!error)
error = load_image_and_restore(); // read the image back and restore from it
}
thaw_processes(); // thaw all tasks

/* Don't bother checking whether freezer_test_done is true */
freezer_test_done = false;
Exit:
pm_notifier_call_chain(PM_POST_HIBERNATION); // tell listeners that hibernation is over
Restore:
pm_restore_console(); // switch the console back
hibernate_release(); // release the snapshot device
Unlock:
unlock_system_sleep(); // drop the system sleep lock
pr_info("hibernation exit\n");

return error;
}

freeze_processes

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
/**
 * freeze_processes - Signal user space processes to enter the refrigerator.
 * The current thread will not be frozen. The same process that calls
 * freeze_processes must later call thaw_processes.
 *
 * On success, returns 0. On failure, -errno and system is fully thawed.
 */
int freeze_processes(void)
{
int error;

error = __usermodehelper_disable(UMH_FREEZING); // put the usermode-helper machinery into the "freezing" state
if (error)
return error;

/* Make sure this task doesn't get frozen */
current->flags |= PF_SUSPEND_TASK; // current is the task running this function

if (!pm_freezing)
atomic_inc(&system_freezing_cnt);

pm_wakeup_clear(true); // clear recorded wakeup state before starting
pr_info("Freezing user space processes ... ");
pm_freezing = true; // a freeze operation is now in progress
error = try_to_freeze_tasks(true); // attempt to freeze the user space tasks
if (!error) {
__usermodehelper_set_disable_depth(UMH_DISABLED); // helpers are now fully disabled, not merely freezing
pr_cont("done.");
}
pr_cont("\n");
BUG_ON(in_atomic()); // this path must not be running in atomic context

/*
 * Now that the whole userspace is frozen we need to disable
 * the OOM killer to disallow any further interference with
 * killable tasks. There is no guarantee oom victims will
 * ever reach a point they go away we have to wait with a timeout.
 */
if (!error && !oom_killer_disable(msecs_to_jiffies(freeze_timeout_msecs))) // disabling the OOM killer timed out or failed
error = -EBUSY;

// On any failure undo everything so the caller sees a fully thawed system
if (error)
thaw_processes();
return error;
}

create_basic_memory_bitmaps

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
/**
 * create_basic_memory_bitmaps - Create bitmaps to hold basic page information.
 *
 * Create bitmaps needed for marking page frames that should not be saved and
 * free page frames. The forbidden_pages_map and free_pages_map pointers are
 * only modified if everything goes well, because we don't want the bits to be
 * touched before both bitmaps are set up.
 *
 * Return: 0 on success (including when both bitmaps already exist),
 * -ENOMEM on any failure.
 */
// Background from the article: for hibernation and resume to be correct the
// kernel has to mark the pages that must not be saved, and track which pages
// are free so they can be left out of the image.
int create_basic_memory_bitmaps(void)
{
struct memory_bitmap *bm1, *bm2;
int error = 0;

if (forbidden_pages_map && free_pages_map) // both bitmaps already exist: nothing to do
return 0;
else
BUG_ON(forbidden_pages_map || free_pages_map); // exactly one existing would be an inconsistent state

bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL);
if (!bm1)
return -ENOMEM;

error = memory_bm_create(bm1, GFP_KERNEL, PG_ANY); // allocate the bitmap data behind bm1
if (error)
goto Free_first_object;

bm2 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL);
if (!bm2)
goto Free_first_bitmap;

error = memory_bm_create(bm2, GFP_KERNEL, PG_ANY);
if (error)
goto Free_second_object;

// Publish both bitmaps only now that both were created successfully
forbidden_pages_map = bm1;
free_pages_map = bm2;
mark_nosave_pages(forbidden_pages_map); // record the pages that must not be saved

pr_debug("Basic memory bitmaps created\n");

return 0;

// Unwind in reverse order of allocation. Note that every failure path
// reports -ENOMEM, regardless of the value memory_bm_create() stored in error.
Free_second_object:
kfree(bm2);
Free_first_bitmap:
memory_bm_free(bm1, PG_UNSAFE_CLEAR);
Free_first_object:
kfree(bm1);
return -ENOMEM;
}

hibernation_snapshot

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
/**
 * hibernation_snapshot - Quiesce devices and create a hibernation image.
 * @platform_mode: If set, use platform driver to prepare for the transition.
 *
 * This routine must be called with system_transition_mutex held.
 *
 * Return: 0 on success or the error code of the first step that failed.
 * A successful freezer test also returns 0, with freezer_test_done set.
 */
int hibernation_snapshot(int platform_mode)
{
pm_message_t msg;
int error;

pm_suspend_clear_flags(); // start from a clean set of suspend flags
error = platform_begin(platform_mode);
if (error)
goto Close;

/* Preallocate image memory before shutting down devices. */
error = hibernate_preallocate_memory(); // make sure there is room for the image
if (error)
goto Close;

error = freeze_kernel_threads(); // freeze kernel threads as well
if (error)
goto Cleanup;

if (hibernation_test(TEST_FREEZER)) {

/*
 * Indicate to the caller that we are returning due to a
 * successful freezer test.
 */
freezer_test_done = true;
goto Thaw;
}

error = dpm_prepare(PMSG_FREEZE); // run the ->prepare() callbacks of all devices
if (error) {
dpm_complete(PMSG_RECOVER); // undo the prepare phase, then thaw
goto Thaw;
}

suspend_console(); // stop console output for the duration
pm_restrict_gfp_mask(); // forbid allocations that would need I/O or filesystem activity

error = dpm_suspend(PMSG_FREEZE); // run the "suspend" (freeze) callbacks of all devices

if (error || hibernation_test(TEST_DEVICES)) // device freeze failed, or we only test devices
platform_recover(platform_mode);
else
error = create_image(platform_mode); // devices are quiescent: create the image

/*
 * In the case that we call create_image() above, the control
 * returns here (1) after the image has been created or the
 * image creation has failed and (2) after a successful restore.
 */

/* We may need to release the preallocated image pages here. */
if (error || !in_suspend)
swsusp_free();

msg = in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE;
// Choice of resume message:
//  - in_suspend set, no error:  image created, devices are thawed (PMSG_THAW);
//  - in_suspend set, error:     image creation failed, recover devices (PMSG_RECOVER);
//  - in_suspend clear:          control came back after a restore, case (2) in the
//                               comment above (PMSG_RESTORE).
dpm_resume(msg); // bring all devices back with the chosen message

if (error || !in_suspend)
pm_restore_gfp_mask(); // lift the restriction set by pm_restrict_gfp_mask()

resume_console(); // console output back on
dpm_complete(msg); // run the ->complete() callbacks

Close:
platform_end(platform_mode); // platform-specific cleanup
return error;

Thaw:
thaw_kernel_threads(); // let kernel threads run again
Cleanup:
swsusp_free();
goto Close;
}
dpm_prepare
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
/**
 * dpm_prepare - Prepare all non-sysdev devices for a system PM transition.
 * @state: PM transition of the system being carried out.
 *
 * Execute the ->prepare() callback(s) for all devices.
 *
 * Return: 0 on success, or the error returned by the first device whose
 * prepare step failed with anything other than -EAGAIN.
 */
int dpm_prepare(pm_message_t state)
{
int error = 0;

trace_suspend_resume(TPS("dpm_prepare"), state.event, true); // trace: phase start
might_sleep(); // debugging annotation: this function may sleep

/*
 * Give a chance for the known devices to complete their probes, before
 * disable probing of devices. This sync point is important at least
 * at boot time + hibernation restore.
 */
wait_for_device_probe(); // let in-flight probes finish first
/*
 * It is unsafe if probing of devices will happen during suspend or
 * hibernation and system behavior will be unpredictable in this case.
 * So, let's prohibit device's probing here and defer their probes
 * instead. The normal behavior will be restored in dpm_complete().
 */
device_block_probing(); // new probes are deferred from here on

mutex_lock(&dpm_list_mtx); // protects the dpm lists while we walk them
while (!list_empty(&dpm_list)) { // dpm_list holds the devices not yet prepared
struct device *dev = to_device(dpm_list.next); // take the device at the head

get_device(dev); // hold a reference while the list lock is dropped
mutex_unlock(&dpm_list_mtx); // drop the lock around the callback, which is run unlocked

trace_device_pm_callback_start(dev, "", state.event);
error = device_prepare(dev, state); // run the device's ->prepare() callback
trace_device_pm_callback_end(dev, error);

mutex_lock(&dpm_list_mtx);
if (error) { // prepare failed
if (error == -EAGAIN) {
// -EAGAIN is not fatal: drop the reference and loop again
put_device(dev);
error = 0;
continue;
}
pr_info("Device %s not prepared for power transition: code %d\n",
dev_name(dev), error);
put_device(dev);
break;
}
dev->power.is_prepared = true; // device is now in the prepared state
if (!list_empty(&dev->power.entry))
list_move_tail(&dev->power.entry, &dpm_prepared_list); // dpm_list -> dpm_prepared_list for the later phases
put_device(dev); // balance get_device() above
}
mutex_unlock(&dpm_list_mtx);
trace_suspend_resume(TPS("dpm_prepare"), state.event, false);
return error;
}
device_prepare
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
/**
 * device_prepare - Prepare a device for system power transition.
 * @dev: Device to handle.
 * @state: PM transition of the system being carried out.
 *
 * Execute the ->prepare() callback(s) for given device. No new children of the
 * device may be registered after this function has returned.
 *
 * Return: 0 on success (a positive callback result is folded into
 * power.direct_complete and reported as 0), or the negative value returned
 * by the callback.
 */
static int device_prepare(struct device *dev, pm_message_t state)
{
int (*callback)(struct device *) = NULL;
int ret = 0;

/*
 * If a device's parent goes into runtime suspend at the wrong time,
 * it won't be possible to resume the device. To prevent this we
 * block runtime suspend here, during the prepare phase, and allow
 * it again during the complete phase.
 */
pm_runtime_get_noresume(dev); // take a runtime-PM reference without resuming the device

if (dev->power.syscore) // syscore devices are skipped by this phase
return 0;

device_lock(dev); // serialise against other users of dev

dev->power.wakeup_path = false; // reset; it is set again during suspend if the device may wake the system

if (dev->power.no_pm_callbacks) // device declared that it has no PM callbacks
goto unlock;

/*
 * Callback lookup order: PM domain, then device type, then class, then
 * bus. The driver's own ->prepare() is used only when none of those
 * supplied a callback (see the !callback check below).
 */
if (dev->pm_domain)
callback = dev->pm_domain->ops.prepare;
else if (dev->type && dev->type->pm)
callback = dev->type->pm->prepare;
else if (dev->class && dev->class->pm)
callback = dev->class->pm->prepare;
else if (dev->bus && dev->bus->pm)
callback = dev->bus->pm->prepare;

if (!callback && dev->driver && dev->driver->pm)
callback = dev->driver->pm->prepare;

if (callback)
ret = callback(dev);

unlock:
device_unlock(dev);

if (ret < 0) {
suspend_report_result(callback, ret);
pm_runtime_put(dev); // drop the reference taken by pm_runtime_get_noresume() above
return ret;
}
/*
 * A positive return value from ->prepare() means "this device appears
 * to be runtime-suspended and its state is fine, so if it really is
 * runtime-suspended, you can leave it in that state provided that you
 * will do the same thing with all of its descendants". This only
 * applies to suspend transitions, however.
 */
spin_lock_irq(&dev->power.lock);
// direct_complete lets a device that is already runtime-suspended stay that
// way, skipping its suspend/resume callbacks. It is only set for
// PM_EVENT_SUSPEND, and only if the driver did not opt out.
dev->power.direct_complete = state.event == PM_EVENT_SUSPEND &&
(ret > 0 || dev->power.no_pm_callbacks) &&
!dev_pm_test_driver_flags(dev, DPM_FLAG_NO_DIRECT_COMPLETE);
spin_unlock_irq(&dev->power.lock);
return 0;
}
dpm_complete
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
/**
 * dpm_complete - Complete a PM transition for all non-sysdev devices.
 * @state: PM transition of the system being carried out.
 *
 * Execute the ->complete() callbacks for all devices whose PM status is not
 * DPM_ON (this allows new devices to be registered).
 */
void dpm_complete(pm_message_t state)
{
struct list_head list;

trace_suspend_resume(TPS("dpm_complete"), state.event, true); // trace: phase start
might_sleep();

INIT_LIST_HEAD(&list);
mutex_lock(&dpm_list_mtx);
while (!list_empty(&dpm_prepared_list)) { // walk the prepared devices, starting from the tail
struct device *dev = to_device(dpm_prepared_list.prev);

get_device(dev);
dev->power.is_prepared = false; // device leaves the prepared state
list_move(&dev->power.entry, &list); // park it on a local list while the remaining devices are processed
mutex_unlock(&dpm_list_mtx);

trace_device_pm_callback_start(dev, "", state.event);
device_complete(dev, state); // run the device's ->complete() callback
trace_device_pm_callback_end(dev, 0);

mutex_lock(&dpm_list_mtx);
put_device(dev);
}
list_splice(&list, &dpm_list); // hand the completed devices back to dpm_list
mutex_unlock(&dpm_list_mtx);

/* Allow device probing and trigger re-probing of deferred devices */
device_unblock_probing(); // counterpart of device_block_probing() in dpm_prepare()
trace_suspend_resume(TPS("dpm_complete"), state.event, false);
}
suspend_console
1
2
3
4
5
6
7
8
9
10
11
12
13
14
/**
 * suspend_console - suspend the console subsystem
 *
 * This disables printk() while we go into suspend states. Nothing happens
 * when console suspending has been turned off (console_suspend_enabled is
 * clear).
 */
void suspend_console(void)
{
	if (console_suspend_enabled) {
		pr_info("Suspending console(s) (use no_console_suspend to debug)\n");

		/* Mark the console suspended while holding the console lock. */
		console_lock();
		console_suspended = 1;
		up_console_sem();
	}
}
pm_restrict_gfp_mask
1
2
3
4
5
6
7
/*
 * pm_restrict_gfp_mask - remember the current gfp_allowed_mask in
 * saved_gfp_mask and remove __GFP_IO and __GFP_FS from it.
 *
 * Warns if system_transition_mutex is not held or if a mask has already
 * been saved.
 */
void pm_restrict_gfp_mask(void)
{
	WARN_ON(!mutex_is_locked(&system_transition_mutex));
	WARN_ON(saved_gfp_mask);

	saved_gfp_mask = gfp_allowed_mask;
	/* No I/O and no filesystem activity from allocations from now on. */
	gfp_allowed_mask = gfp_allowed_mask & ~(__GFP_IO | __GFP_FS);
}
dpm_suspend
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
/**
 * dpm_suspend - Execute "suspend" callbacks for all non-sysdev devices.
 * @state: PM transition of the system being carried out.
 *
 * Return: 0 on success, otherwise the first synchronous error or, failing
 * that, the error recorded by an asynchronous suspend (async_error).
 */
int dpm_suspend(pm_message_t state)
{
ktime_t starttime = ktime_get();
int error = 0;

trace_suspend_resume(TPS("dpm_suspend"), state.event, true);
might_sleep(); // debugging annotation: must not be called from atomic context

devfreq_suspend(); // suspend devfreq (device frequency scaling)
cpufreq_suspend(); // suspend cpufreq (CPU frequency scaling)

mutex_lock(&dpm_list_mtx);
pm_transition = state;
async_error = 0;
while (!list_empty(&dpm_prepared_list)) { // devices that went through the prepare phase, tail first
struct device *dev = to_device(dpm_prepared_list.prev);

get_device(dev); // hold a reference while the list lock is dropped
mutex_unlock(&dpm_list_mtx);

error = device_suspend(dev); // suspend this device (possibly asynchronously)

mutex_lock(&dpm_list_mtx);
if (error) {
pm_dev_err(dev, state, "", error);
dpm_save_failed_dev(dev_name(dev)); // remember which device failed
put_device(dev); // drop our reference
break;
}
if (!list_empty(&dev->power.entry))
list_move(&dev->power.entry, &dpm_suspended_list); // suspended: move to dpm_suspended_list
put_device(dev);
if (async_error) // an asynchronous suspend has failed in the meantime
break;
}
mutex_unlock(&dpm_list_mtx);
async_synchronize_full(); // wait for all outstanding asynchronous suspends
if (!error)
error = async_error;
if (error) {
suspend_stats.failed_suspend++;
dpm_save_failed_step(SUSPEND_SUSPEND); // remember which phase failed
}
dpm_show_time(starttime, state, error, NULL); // report how long this phase took
trace_suspend_resume(TPS("dpm_suspend"), state.event, false);
return error;
}
device_suspend
1
2
3
4
5
6
7
/*
 * device_suspend - suspend one device, asynchronously when possible.
 * @dev: Device to handle.
 *
 * If dpm_async_fn() accepts the device for asynchronous handling via
 * async_suspend, report success right away; otherwise run
 * __device_suspend() synchronously for the current pm_transition.
 */
static int device_suspend(struct device *dev)
{
	return dpm_async_fn(dev, async_suspend) ?
		0 : __device_suspend(dev, pm_transition, false);
}
  • __device_suspend
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
/**
 * __device_suspend - Execute "suspend" callbacks for given device.
 * @dev: Device to handle.
 * @state: PM transition of the system being carried out.
 * @async: If true, the device is being suspended asynchronously.
 *
 * Return: 0 on success or when the device is skipped (syscore,
 * direct-complete, or an abort already recorded in async_error); otherwise
 * the error from the suspend callback, which is also stored in async_error.
 */
static int __device_suspend(struct device *dev, pm_message_t state, bool async)
{
	pm_callback_t callback = NULL;
	const char *info = NULL;
	int error = 0;
	DECLARE_DPM_WATCHDOG_ON_STACK(wd);

	TRACE_DEVICE(dev);
	TRACE_SUSPEND(0);

	/* Wait for the devices that depend on this one to finish suspending. */
	dpm_wait_for_subordinate(dev, async);

	/* Another device already failed: do not suspend this one. */
	if (async_error) {
		dev->power.direct_complete = false;
		goto Complete;
	}

	/*
	 * Wait for possible runtime PM transitions of the device in progress
	 * to complete and if there's a runtime resume request pending for it,
	 * resume it before proceeding with invoking the system-wide suspend
	 * callbacks for it.
	 *
	 * If the system-wide suspend callbacks below change the configuration
	 * of the device, they must disable runtime PM for it or otherwise
	 * ensure that its runtime-resume callbacks will not be confused by that
	 * change in case they are invoked going forward.
	 */
	pm_runtime_barrier(dev);

	/* A wakeup event is pending: abort the transition with -EBUSY. */
	if (pm_wakeup_pending()) {
		dev->power.direct_complete = false;
		async_error = -EBUSY;
		goto Complete;
	}

	/* syscore devices are not handled by the regular suspend path. */
	if (dev->power.syscore)
		goto Complete;

	/*
	 * Avoid direct_complete to let wakeup_path propagate.
	 * device_may_wakeup() tells whether the device is allowed to wake the
	 * system; power.wakeup_path means it has already been marked as lying
	 * on a wakeup path.
	 */
	if (device_may_wakeup(dev) || dev->power.wakeup_path)
		dev->power.direct_complete = false;

	/*
	 * direct_complete: if the device is still runtime-suspended it can be
	 * left alone and we go straight to Complete.
	 */
	if (dev->power.direct_complete) {
		if (pm_runtime_status_suspended(dev)) {
			/* Disable runtime PM so the status cannot change under us. */
			pm_runtime_disable(dev);
			if (pm_runtime_status_suspended(dev)) {
				pm_dev_dbg(dev, state, "direct-complete ");
				goto Complete;
			}

			/* The status changed after all: re-enable runtime PM. */
			pm_runtime_enable(dev);
		}
		dev->power.direct_complete = false;
	}

	dev->power.may_skip_resume = true;
	dev->power.must_resume = !dev_pm_test_driver_flags(dev, DPM_FLAG_MAY_SKIP_RESUME);

	dpm_watchdog_set(&wd, dev);
	device_lock(dev);

	/*
	 * Callback lookup: PM domain, device type, class, bus (including the
	 * legacy bus ->suspend hook), and finally the driver.
	 */
	if (dev->pm_domain) {
		info = "power domain ";
		callback = pm_op(&dev->pm_domain->ops, state);
		goto Run;
	}

	if (dev->type && dev->type->pm) {
		info = "type ";
		callback = pm_op(dev->type->pm, state);
		goto Run;
	}

	if (dev->class && dev->class->pm) {
		info = "class ";
		callback = pm_op(dev->class->pm, state);
		goto Run;
	}

	if (dev->bus) {
		if (dev->bus->pm) {
			info = "bus ";
			callback = pm_op(dev->bus->pm, state);
		} else if (dev->bus->suspend) {
			pm_dev_dbg(dev, state, "legacy bus ");
			error = legacy_suspend(dev, state, dev->bus->suspend,
						"legacy bus ");
			goto End;
		}
	}

 Run:
	/* Fall back to the driver's callback when nothing above supplied one. */
	if (!callback && dev->driver && dev->driver->pm) {
		info = "driver ";
		callback = pm_op(dev->driver->pm, state);
	}

	error = dpm_run_callback(callback, dev, state, info);

 End:
	if (!error) {
		/* Suspend succeeded: record the new state. */
		dev->power.is_suspended = true;
		if (device_may_wakeup(dev))
			dev->power.wakeup_path = true;

		/*
		 * Propagate this device's wakeup capability to its parent, so
		 * that the whole device tree handles wakeup events correctly:
		 * if a device can wake the system, its parent usually has to
		 * know about it too.
		 */
		dpm_propagate_wakeup_to_parent(dev);
		/*
		 * Clear direct_complete on the parent and on the suppliers of
		 * this device, so they do not skip their own transition.
		 */
		dpm_clear_superiors_direct_complete(dev);
	}

	device_unlock(dev);
	dpm_watchdog_clear(&wd);

 Complete:
	if (error)
		async_error = error;

	/* Wake up everyone waiting for this device to finish suspending. */
	complete_all(&dev->power.completion);
	TRACE_SUSPEND(error);
	return error;
}
  • pm_wakeup_pending
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/**
 * pm_wakeup_pending - Check if power transition in progress should be aborted.
 *
 * Compare the current number of registered wakeup events with its preserved
 * value from the past and return true if new wakeup events have been registered
 * since the old value was stored. Also return true if the current number of
 * wakeup events being processed is different from zero.
 *
 * Return: true if the transition should be aborted, which also covers a
 * nonzero pm_abort_suspend counter.
 */
bool pm_wakeup_pending(void)
{
unsigned long flags;
bool ret = false;

raw_spin_lock_irqsave(&events_lock, flags);
if (events_check_enabled) { // wakeup-event checking is switched on
unsigned int cnt, inpr;

split_counters(&cnt, &inpr); // cnt: registered events, inpr: events in progress
ret = (cnt != saved_count || inpr > 0); // new events since saved_count, or events still being processed
events_check_enabled = !ret; // once a wakeup has been seen, stop checking until re-enabled
}
raw_spin_unlock_irqrestore(&events_lock, flags);

if (ret) {
pm_pr_dbg("Wakeup pending, aborting suspend\n");
pm_print_active_wakeup_sources();
}

return ret || atomic_read(&pm_abort_suspend) > 0; // also abort when pm_abort_suspend is nonzero
}
create_image
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
/**
 * create_image - Create a hibernation image.
 * @platform_mode: Whether or not to use the platform driver.
 *
 * Execute device drivers' "late" and "noirq" freeze callbacks, create a
 * hibernation image and run the drivers' "noirq" and "early" thaw callbacks.
 *
 * Control reappears in this routine after the subsequent restore.
 *
 * Return: 0 on success or the error code of the step that failed.
 */
static int create_image(int platform_mode)
{
int error;

error = dpm_suspend_end(PMSG_FREEZE); // run the "late" and "noirq" freeze callbacks
if (error) {
pr_err("Some devices failed to power down, aborting\n");
return error;
}

error = platform_pre_snapshot(platform_mode); // platform-specific preparation before the snapshot
if (error || hibernation_test(TEST_PLATFORM))
goto Platform_finish;

error = suspend_disable_secondary_cpus(); // take all CPUs except the current one offline
if (error || hibernation_test(TEST_CPUS))
goto Enable_cpus;

local_irq_disable(); // no interrupts on this CPU from here on

system_state = SYSTEM_SUSPEND;

error = syscore_suspend(); // suspend the syscore (system core) devices
if (error) {
pr_err("Some system devices failed to power down, aborting\n");
goto Enable_irqs;
}

if (hibernation_test(TEST_CORE) || pm_wakeup_pending()) // core test only, or a wakeup event arrived
goto Power_up;

in_suspend = 1;
save_processor_state(); // save CPU state so it can be rebuilt on resume
trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true);
error = swsusp_arch_suspend(); // architecture-specific part that actually takes the snapshot
/* Restore control flow magically appears here */
restore_processor_state();
trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false);
if (error)
pr_err("Error %d creating image\n", error);

// in_suspend is clear when we got here from a restored image
if (!in_suspend) {
events_check_enabled = false;
clear_free_pages();
}

platform_leave(platform_mode);

// The labels below undo the steps above in reverse order
Power_up:
syscore_resume();

Enable_irqs:
system_state = SYSTEM_RUNNING;
local_irq_enable();

Enable_cpus:
suspend_enable_secondary_cpus();

/* Allow architectures to do nosmt-specific post-resume dances */
if (!in_suspend)
error = arch_resume_nosmt();

Platform_finish:
platform_finish(platform_mode);

dpm_resume_start(in_suspend ?
(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);

return error;
}
dpm_suspend_end
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
/**
 * dpm_suspend_end - Execute "late" and "noirq" device suspend callbacks.
 * @state: PM transition of the system being carried out.
 *
 * The "noirq" phase only runs when the "late" phase succeeded. If "noirq"
 * fails, the "late" phase is rolled back with dpm_resume_early(). The time
 * spent is reported in either case.
 *
 * Return: 0 on success or the error of the phase that failed.
 */
int dpm_suspend_end(pm_message_t state)
{
	ktime_t starttime = ktime_get();
	int error = dpm_suspend_late(state);

	if (!error) {
		error = dpm_suspend_noirq(state);
		if (error)
			dpm_resume_early(resume_event(state));
	}

	dpm_show_time(starttime, state, error, "end");
	return error;
}
EXPORT_SYMBOL_GPL(dpm_suspend_end);
swsusp_arch_suspend
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
/*
 * swsusp_arch_suspend - architecture-specific part of taking the snapshot.
 *
 * The function has two halves selected by __cpu_suspend_enter(): the "if"
 * branch saves the image via swsusp_save(); the "else" branch, judging by
 * its comments, runs after the memory has been restored and tells the
 * hibernation core so by clearing in_suspend.
 *
 * Return: -EBUSY if secondary CPUs cannot be taken offline, the error from
 * swsusp_mte_save_tags() or swsusp_save() on the save path, 0 otherwise.
 */
int swsusp_arch_suspend(void)
{
int ret = 0;
unsigned long flags;
struct sleep_stack_data state;

if (cpus_are_stuck_in_kernel()) { // there are CPUs that cannot be taken offline
pr_err("Can't hibernate: no mechanism to offline secondary CPUs.\n");
return -EBUSY;
}

flags = local_daif_save(); // save the current DAIF (Debug, Abort, IRQ, FIQ) mask state

if (__cpu_suspend_enter(&state)) { // nonzero: image-creation path
/* make the crash dump kernel image visible/saveable */
crash_prepare_suspend(); // relevant to kdump setups

ret = swsusp_mte_save_tags(); // save the MTE (memory tagging) tags
// NOTE(review): this early return leaves without calling
// local_daif_restore(flags) — confirm that this is intended.
if (ret)
return ret;

sleep_cpu = smp_processor_id(); // remember which CPU performed the hibernation
ret = swsusp_save(); // core step: copy the memory contents into the image
} else {
/* Clean kernel core startup/idle code to PoC*/ // clean these ranges from the data cache
dcache_clean_range(__mmuoff_data_start, __mmuoff_data_end);
dcache_clean_range(__idmap_text_start, __idmap_text_end);

/* Clean kvm setup code to PoC? */
if (el2_reset_needed()) { // EL2 (hypervisor level) has to be reset: clean the hyp code ranges too
dcache_clean_range(__hyp_idmap_text_start, __hyp_idmap_text_end);
dcache_clean_range(__hyp_text_start, __hyp_text_end);
}

swsusp_mte_restore_tags(); // counterpart of swsusp_mte_save_tags()

/* make the crash dump kernel image protected again */
crash_post_resume(); // counterpart of crash_prepare_suspend()

/*
 * Tell the hibernation core that we've just restored
 * the memory
 */
in_suspend = 0; // callers test this to tell "restored" from "image created"

sleep_cpu = -EINVAL; // no hibernating CPU is recorded any more
__cpu_suspend_exit(); // counterpart of __cpu_suspend_enter()

/*
 * Just in case the boot kernel did turn the SSBD
 * mitigation off behind our back, let's set the state
 * to what we expect it to be.
 */
spectre_v4_enable_mitigation(NULL); // re-apply the Spectre v4 (SSBD) mitigation
}

local_daif_restore(flags); // restore the mask state saved above

return ret;
}
  • swsusp_save
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
/*
 * swsusp_save - copy the pages that make up the hibernation image.
 *
 * Counts the data and highmem pages to save, checks that enough free memory
 * is available, allocates the copy bitmap and copies the pages. On success
 * nr_copy_pages and nr_meta_pages describe the image.
 *
 * Return: 0 on success, -ENOMEM if there is not enough free memory or the
 * allocation fails.
 */
asmlinkage __visible int swsusp_save(void)
{
unsigned int nr_pages, nr_highmem;

pr_info("Creating image:\n");

drain_local_pages(NULL); // drain this CPU's cached pages so the page counts below are consistent
nr_pages = count_data_pages(); // number of data pages to save
nr_highmem = count_highmem_pages(); // number of highmem pages to save
pr_info("Need to copy %u pages\n", nr_pages + nr_highmem);

if (!enough_free_mem(nr_pages, nr_highmem)) { // not enough room to hold the copies
pr_err("Not enough free memory\n");
return -ENOMEM;
}

if (swsusp_alloc(&copy_bm, nr_pages, nr_highmem)) { // allocate the pages and bitmap that hold the copy
pr_err("Memory allocation failed\n");
return -ENOMEM;
}

/*
 * During allocating of suspend pagedir, new cold pages may appear.
 * Kill them.
 */
drain_local_pages(NULL); // drain again after the allocation above
copy_data_pages(&copy_bm, &orig_bm); // the actual page copy

/*
 * End of critical section. From now on, we can write to memory,
 * but we should not touch disk. This specially means we must _not_
 * touch swap space! Except we must write out our image of course.
 */

nr_pages += nr_highmem;
nr_copy_pages = nr_pages;
nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE); // one long of metadata per copied page, rounded up to whole pages

pr_info("Image created (%d pages copied)\n", nr_pages);

return 0;
}

swsusp_write

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
/**
 * swsusp_write - Write entire image and metadata.
 * @flags: flags to pass to the "boot" kernel in the image header
 *
 * It is important _NOT_ to umount filesystems at this point. We want
 * them synced (in case something goes wrong) but we DO not want to mark
 * filesystem clean: it is not. (And it does not matter, if we resume
 * correctly, we'll mark system clean, anyway.)
 *
 * Return: the error from get_swap_writer() if the writer cannot be
 * obtained; otherwise the result of swap_writer_finish(), which is handed
 * the error (if any) collected so far.
 */

int swsusp_write(unsigned int flags)
{
struct swap_map_handle handle;
struct snapshot_handle snapshot;
struct swsusp_info *header;
unsigned long pages;
int error;

pages = snapshot_get_image_size(); // number of pages to write
error = get_swap_writer(&handle); // set up the swap writer handle
if (error) {
pr_err("Cannot get swap writer\n");
return error;
}
if (flags & SF_NOCOMPRESS_MODE) { // uncompressed image: the swap space check is done up front
if (!enough_swap(pages)) { // not enough free swap for the image
pr_err("Not enough free swap\n");
error = -ENOSPC;
goto out_finish;
}
}
memset(&snapshot, 0, sizeof(struct snapshot_handle));
error = snapshot_read_next(&snapshot); // fetch the first chunk of the snapshot (the header)
// Anything short of a full page is a failure; a short non-negative
// result is turned into -EFAULT.
if (error < (int)PAGE_SIZE) {
if (error >= 0)
error = -EFAULT;

goto out_finish;
}
header = (struct swsusp_info *)data_of(snapshot); // the chunk just read is the image header
error = swap_write_page(&handle, header, NULL); // write the header to swap
if (!error) { // header written: now write the image data
error = (flags & SF_NOCOMPRESS_MODE) ?
save_image(&handle, &snapshot, pages - 1) : // write the pages as they are
save_image_lzo(&handle, &snapshot, pages - 1); // LZO-compress the pages, then write them
}
out_finish:
error = swap_writer_finish(&handle, flags, error); // close the writer and clean up; receives the error so far
return error;
}

power_down

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
/**
 * power_down - Shut the machine down for hibernation.
 *
 * Use the platform driver, if configured, to put the system into the sleep
 * state corresponding to hibernation, or try to power it off or reboot,
 * depending on the value of hibernation_mode.
 *
 * Returns to the caller only in HIBERNATION_SUSPEND mode after a successful
 * suspend-to-RAM cycle; if nothing else stops the machine it ends in an
 * endless loop.
 */
static void power_down(void)
{
#ifdef CONFIG_SUSPEND
int error;

// HIBERNATION_SUSPEND: the image is on disk, now suspend to RAM instead of powering off
if (hibernation_mode == HIBERNATION_SUSPEND) {
error = suspend_devices_and_enter(PM_SUSPEND_MEM);
if (error) {
// Suspend failed: fall back to platform mode if hibernation_ops is set, else to shutdown
hibernation_mode = hibernation_ops ?
HIBERNATION_PLATFORM :
HIBERNATION_SHUTDOWN;
} else {
/* Restore swap signature. */
error = swsusp_unmark();
if (error)
pr_err("Swap will be unusable! Try swapon -a.\n");

return;
}
}
#endif

switch (hibernation_mode) {
case HIBERNATION_REBOOT:
kernel_restart(NULL);
break;
case HIBERNATION_PLATFORM:
hibernation_platform_enter();
fallthrough;
case HIBERNATION_SHUTDOWN:
if (pm_power_off)
kernel_power_off();
break;
}
kernel_halt(); // last resort if the calls above returned: halt the system
/*
 * Valid image is on the disk, if we continue we risk serious data
 * corruption after resume.
 */
pr_crit("Power down manually\n");
while (1)
cpu_relax();
}

Hibernate (kernel 5.10)
https://tomwithkernel.github.io/pm/Hibernate/
作者:Tom
发布于:2025年3月18日
更新于:2025年3月26日
许可协议