Accessing struct rq and reading the rq_clock_task time from a kernel module


1. Background

In the earlier post CFS及RT调度整体介绍_cfs任务和rt任务-CSDN博客 we introduced one of the most important concepts in kernel scheduling: rq, the run queue. Every CPU has a struct rq that manages that CPU's per-cpu run-queue state, but obtaining the struct rq of a given CPU from a kernel module is not straightforward. Section 2 below shows the source code that obtains it, and Section 3 explains how the method works.

Once we have the rq, we can naturally read its clock and clock_task members. clock_task is what rq_clock_task() returns, and it is the yardstick the scheduler uses to account each task's execution time. Depending on kernel config options, rq_clock_task does not necessarily advance in step with local_clock() (time spent in interrupts may be subtracted from it), so approximating rq_clock_task with local_clock() in a kernel module is imprecise. To get the exact rq_clock_task value we use the method in this post: first obtain the rq, then read its clock_task field. Note that rq_clock_task() is an inline function, so it has no entry in /proc/kallsyms and cannot be resolved by symbol lookup.
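The config dependence comes from update_rq_clock_task() in kernel/sched/core.c, which advances clock_task by the elapsed delta minus interrupt time and paravirt steal time when the corresponding options are enabled. An abridged excerpt from a recent kernel is shown below (clamping and PELT bookkeeping elided; details vary across versions):

static void update_rq_clock_task(struct rq *rq, s64 delta)
{
	s64 __maybe_unused steal = 0, irq_delta = 0;

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
	/* ... clamping of irq_delta elided ... */
	rq->prev_irq_time += irq_delta;
	delta -= irq_delta;	/* interrupt time does not count as task time */
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
		steal = paravirt_steal_clock(cpu_of(rq));
		steal -= rq->prev_steal_time_rq;
		/* ... clamping of steal elided ... */
		rq->prev_steal_time_rq += steal;
		delta -= steal;
	}
#endif

	rq->clock_task += delta;
	/* ... PELT irq accounting and rq->clock_pelt update elided ... */
}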

For details on the kernel's common time interfaces, in particular the scheduler's clock and local_clock(), see the earlier post 与调度相关的内核时间接口的分析及实现介绍-CSDN博客.

Section 2 below lists the kernel-module source that obtains rq_clock_task; Section 3 walks through the details.

2. Implementation source for obtaining rq_clock_task

#include <linux/module.h>
#include <linux/capability.h>
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/proc_fs.h>
#include <linux/ctype.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/types.h>
#include <linux/ioctl.h>
#include <linux/errno.h>
#include <linux/stddef.h>
#include <linux/lockdep.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/wait.h>
#include <linux/init.h>
#include <asm/atomic.h>
#include <trace/events/workqueue.h>
#include <linux/sched/clock.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/tracepoint.h>
#include <trace/events/osmonitor.h>
#include <trace/events/sched.h>
#include <trace/events/irq.h>
#include <trace/events/kmem.h>
#include <linux/ptrace.h>
#include <linux/uaccess.h>
#include <asm/processor.h>
#include <linux/sched/task_stack.h>
#include <linux/nmi.h>
#include <asm/apic.h>
#include <linux/version.h>
#include <linux/sched/mm.h>
#include <asm/irq_regs.h>
#include <linux/kallsyms.h>
#include <linux/kprobes.h>
#include <linux/stop_machine.h>

MODULE_LICENSE("GPL");
MODULE_AUTHOR("zhaoxin");
MODULE_DESCRIPTION("Module use rq_clock_task.");
MODULE_VERSION("1.0");#define IODELAY_TRACEPOINT_ENABLEstruct uclamp_bucket {unsigned long value : bits_per(SCHED_CAPACITY_SCALE);unsigned long tasks : BITS_PER_LONG - bits_per(SCHED_CAPACITY_SCALE);
};struct uclamp_rq {unsigned int value;struct uclamp_bucket bucket[UCLAMP_BUCKETS];
};/* CFS-related fields in a runqueue */
struct cfs_rq {struct load_weight	load;unsigned int		nr_running;unsigned int		h_nr_running;      /* SCHED_{NORMAL,BATCH,IDLE} */unsigned int		idle_nr_running;   /* SCHED_IDLE */unsigned int		idle_h_nr_running; /* SCHED_IDLE */u64			exec_clock;u64			min_vruntime;
#ifdef CONFIG_SCHED_COREunsigned int		forceidle_seq;u64			min_vruntime_fi;
#endif#ifndef CONFIG_64BITu64			min_vruntime_copy;
#endifstruct rb_root_cached	tasks_timeline;/** 'curr' points to currently running entity on this cfs_rq.* It is set to NULL otherwise (i.e when none are currently running).*/struct sched_entity	*curr;struct sched_entity	*next;struct sched_entity	*last;struct sched_entity	*skip;#ifdef	CONFIG_SCHED_DEBUGunsigned int		nr_spread_over;
#endif#ifdef CONFIG_SMP/** CFS load tracking*/struct sched_avg	avg;
#ifndef CONFIG_64BITu64			last_update_time_copy;
#endifstruct {raw_spinlock_t	lock ____cacheline_aligned;int		nr;unsigned long	load_avg;unsigned long	util_avg;unsigned long	runnable_avg;} removed;#ifdef CONFIG_FAIR_GROUP_SCHEDunsigned long		tg_load_avg_contrib;long			propagate;long			prop_runnable_sum;/**   h_load = weight * f(tg)** Where f(tg) is the recursive weight fraction assigned to* this group.*/unsigned long		h_load;u64			last_h_load_update;struct sched_entity	*h_load_next;
#endif /* CONFIG_FAIR_GROUP_SCHED */
#endif /* CONFIG_SMP */#ifdef CONFIG_FAIR_GROUP_SCHEDstruct rq		*rq;	/* CPU runqueue to which this cfs_rq is attached *//** leaf cfs_rqs are those that hold tasks (lowest schedulable entity in* a hierarchy). Non-leaf lrqs hold other higher schedulable entities* (like users, containers etc.)** leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU.* This list is used during load balance.*/int			on_list;struct list_head	leaf_cfs_rq_list;struct task_group	*tg;	/* group that "owns" this runqueue *//* Locally cached copy of our task_group's idle value */int			idle;#ifdef CONFIG_CFS_BANDWIDTHint			runtime_enabled;s64			runtime_remaining;u64			throttled_pelt_idle;
#ifndef CONFIG_64BITu64                     throttled_pelt_idle_copy;
#endifu64			throttled_clock;u64			throttled_clock_pelt;u64			throttled_clock_pelt_time;int			throttled;int			throttle_count;struct list_head	throttled_list;
#ifdef CONFIG_SMPstruct list_head	throttled_csd_list;
#endif
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};struct rt_prio_array {DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */struct list_head queue[MAX_RT_PRIO];
};/* Real-Time classes' related field in a runqueue: */
struct rt_rq {struct rt_prio_array	active;unsigned int		rt_nr_running;unsigned int		rr_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHEDstruct {int		curr; /* highest queued rt task prio */
#ifdef CONFIG_SMPint		next; /* next highest */
#endif} highest_prio;
#endif
#ifdef CONFIG_SMPunsigned int		rt_nr_migratory;unsigned int		rt_nr_total;int			overloaded;struct plist_head	pushable_tasks;#endif /* CONFIG_SMP */int			rt_queued;int			rt_throttled;u64			rt_time;u64			rt_runtime;/* Nests inside the rq lock: */raw_spinlock_t		rt_runtime_lock;#ifdef CONFIG_RT_GROUP_SCHEDunsigned int		rt_nr_boosted;struct rq		*rq;struct task_group	*tg;
#endif
};/* Deadline class' related fields in a runqueue */
struct dl_rq {/* runqueue is an rbtree, ordered by deadline */struct rb_root_cached	root;unsigned int		dl_nr_running;#ifdef CONFIG_SMP/** Deadline values of the currently executing and the* earliest ready task on this rq. Caching these facilitates* the decision whether or not a ready but not running task* should migrate somewhere else.*/struct {u64		curr;u64		next;} earliest_dl;unsigned int		dl_nr_migratory;int			overloaded;/** Tasks on this rq that can be pushed away. They are kept in* an rb-tree, ordered by tasks' deadlines, with caching* of the leftmost (earliest deadline) element.*/struct rb_root_cached	pushable_dl_tasks_root;
#elsestruct dl_bw		dl_bw;
#endif/** "Active utilization" for this runqueue: increased when a* task wakes up (becomes TASK_RUNNING) and decreased when a* task blocks*/u64			running_bw;/** Utilization of the tasks "assigned" to this runqueue (including* the tasks that are in runqueue and the tasks that executed on this* CPU and blocked). Increased when a task moves to this runqueue, and* decreased when the task moves away (migrates, changes scheduling* policy, or terminates).* This is needed to compute the "inactive utilization" for the* runqueue (inactive utilization = this_bw - running_bw).*/u64			this_bw;u64			extra_bw;/** Maximum available bandwidth for reclaiming by SCHED_FLAG_RECLAIM* tasks of this rq. Used in calculation of reclaimable bandwidth(GRUB).*/u64			max_bw;/** Inverse of the fraction of CPU utilization that can be reclaimed* by the GRUB algorithm.*/u64			bw_ratio;
};struct rq {/* runqueue lock: */raw_spinlock_t		__lock;/** nr_running and cpu_load should be in the same cacheline because* remote CPUs use both these fields when doing load calculation.*/unsigned int		nr_running;
#ifdef CONFIG_NUMA_BALANCINGunsigned int		nr_numa_running;unsigned int		nr_preferred_running;unsigned int		numa_migrate_on;
#endif
#ifdef CONFIG_NO_HZ_COMMON
#ifdef CONFIG_SMPunsigned long		last_blocked_load_update_tick;unsigned int		has_blocked_load;call_single_data_t	nohz_csd;
#endif /* CONFIG_SMP */unsigned int		nohz_tick_stopped;atomic_t		nohz_flags;
#endif /* CONFIG_NO_HZ_COMMON */#ifdef CONFIG_SMPunsigned int		ttwu_pending;
#endifu64			nr_switches;#ifdef CONFIG_UCLAMP_TASK/* Utilization clamp values based on CPU's RUNNABLE tasks */struct uclamp_rq	uclamp[UCLAMP_CNT] ____cacheline_aligned;unsigned int		uclamp_flags;
#define UCLAMP_FLAG_IDLE 0x01
#endifstruct cfs_rq		cfs;struct rt_rq		rt;struct dl_rq		dl;#ifdef CONFIG_FAIR_GROUP_SCHED/* list of leaf cfs_rq on this CPU: */struct list_head	leaf_cfs_rq_list;struct list_head	*tmp_alone_branch;
#endif /* CONFIG_FAIR_GROUP_SCHED *//** This is part of a global counter where only the total sum* over all CPUs matters. A task can increase this counter on* one CPU and if it got migrated afterwards it may decrease* it on another CPU. Always updated under the runqueue lock:*/unsigned int		nr_uninterruptible;struct task_struct __rcu	*curr;struct task_struct	*idle;struct task_struct	*stop;unsigned long		next_balance;struct mm_struct	*prev_mm;unsigned int		clock_update_flags;u64			clock;/* Ensure that all clocks are in the same cache line */u64			clock_task ____cacheline_aligned;u64			clock_pelt;unsigned long		lost_idle_time;atomic_t		nr_iowait;#ifdef CONFIG_SCHED_DEBUGu64 last_seen_need_resched_ns;int ticks_without_resched;
#endif#ifdef CONFIG_MEMBARRIERint membarrier_state;
#endif#ifdef CONFIG_SMPstruct root_domain		*rd;struct sched_domain __rcu	*sd;unsigned long		cpu_capacity;unsigned long		cpu_capacity_orig;struct callback_head	*balance_callback;unsigned char		nohz_idle_balance;unsigned char		idle_balance;unsigned long		misfit_task_load;/* For active balancing */int			active_balance;int			push_cpu;struct cpu_stop_work	active_balance_work;/* CPU of this runqueue: */int			cpu;int			online;struct list_head cfs_tasks;struct sched_avg	avg_rt;struct sched_avg	avg_dl;
#ifdef CONFIG_HAVE_SCHED_AVG_IRQstruct sched_avg	avg_irq;
#endif
#ifdef CONFIG_SCHED_THERMAL_PRESSUREstruct sched_avg	avg_thermal;
#endifu64			idle_stamp;u64			avg_idle;unsigned long		wake_stamp;u64			wake_avg_idle;/* This is used to determine avg_idle's max value */u64			max_idle_balance_cost;#ifdef CONFIG_HOTPLUG_CPUstruct rcuwait		hotplug_wait;
#endif
#endif /* CONFIG_SMP */#ifdef CONFIG_IRQ_TIME_ACCOUNTINGu64			prev_irq_time;
#endif
#ifdef CONFIG_PARAVIRTu64			prev_steal_time;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTINGu64			prev_steal_time_rq;
#endif/* calc_load related fields */unsigned long		calc_load_update;long			calc_load_active;#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMPcall_single_data_t	hrtick_csd;
#endifstruct hrtimer		hrtick_timer;ktime_t 		hrtick_time;
#endif#ifdef CONFIG_SCHEDSTATS/* latency stats */struct sched_info	rq_sched_info;unsigned long long	rq_cpu_time;/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? *//* sys_sched_yield() stats */unsigned int		yld_count;/* schedule() stats */unsigned int		sched_count;unsigned int		sched_goidle;/* try_to_wake_up() stats */unsigned int		ttwu_count;unsigned int		ttwu_local;
#endif#ifdef CONFIG_CPU_IDLE/* Must be inspected within a rcu lock section */struct cpuidle_state	*idle_state;
#endif#ifdef CONFIG_SMPunsigned int		nr_pinned;
#endifunsigned int		push_busy;struct cpu_stop_work	push_work;#ifdef CONFIG_SCHED_CORE/* per rq */struct rq		*core;struct task_struct	*core_pick;unsigned int		core_enabled;unsigned int		core_sched_seq;struct rb_root		core_tree;/* shared state -- careful with sched_core_cpu_deactivate() */unsigned int		core_task_seq;unsigned int		core_pick_seq;unsigned long		core_cookie;unsigned int		core_forceidle_count;unsigned int		core_forceidle_seq;unsigned int		core_forceidle_occupation;u64			core_forceidle_start;
#endif
};

/* runqueues is a per-cpu variable in the kernel, but it is not an exported symbol */
struct rq* _prq = NULL;

struct rq* my_cpu_rq(int i_cpu)
{
	return per_cpu_ptr(_prq, i_cpu);
}

static void cb_sched_stat_runtime(void *i_data, struct task_struct *i_curr,
				  u64 i_runtime, u64 i_vruntime)
{
	struct rq* prq = my_cpu_rq(smp_processor_id());
	printk("rq_clock_task[%llu],rq_clock[%llu],local_clock[%llu]\n",
	       prq->clock_task, prq->clock, local_clock());
}

struct kern_tracepoint {
	void *callback;
	struct tracepoint *ptr;
	bool bregister;
};

static void clear_kern_tracepoint(struct kern_tracepoint *tp)
{
	if (tp->bregister) {
		tracepoint_probe_unregister(tp->ptr, tp->callback, NULL);
	}
}

#define INIT_KERN_TRACEPOINT(tracepoint_name) \
	static struct kern_tracepoint mykern_##tracepoint_name = {.callback = NULL, .ptr = NULL, .bregister = false};

#define TRACEPOINT_CHECK_AND_SET(tracepoint_name)                                              \
	static void tracepoint_name##_tracepoint_check_and_set(struct tracepoint *tp, void *priv) \
	{                                                                                          \
		if (!strcmp(#tracepoint_name, tp->name))                                               \
		{                                                                                      \
			((struct kern_tracepoint *)priv)->ptr = tp;                                        \
			return;                                                                            \
		}                                                                                      \
	}

INIT_KERN_TRACEPOINT(sched_stat_runtime)
TRACEPOINT_CHECK_AND_SET(sched_stat_runtime)

typedef unsigned long (*kallsyms_lookup_name_func)(const char *name);
kallsyms_lookup_name_func _kallsyms_lookup_name_func;

void* get_func_by_symbol_name_kallsyms_lookup_name(void)
{
	int ret;
	void* pfunc = NULL;
	struct kprobe kp;
	memset(&kp, 0, sizeof(kp));
	kp.symbol_name = "kallsyms_lookup_name";
	kp.pre_handler = NULL;
	kp.addr = NULL;	/* left NULL on purpose: the lookup goes through symbol_name */
	ret = register_kprobe(&kp);
	if (ret < 0) {
		printk("register_kprobe fail!\n");
		return NULL;
	}
	printk("register_kprobe succeed!\n");
	pfunc = (void*)kp.addr;	/* register_kprobe resolved symbol_name into kp.addr */
	unregister_kprobe(&kp);
	return pfunc;
}

void* get_func_by_symbol_name(const char* i_symbol)
{
	if (_kallsyms_lookup_name_func == NULL) {
		return NULL;
	}
	return (void*)_kallsyms_lookup_name_func(i_symbol);
}

static int __init testrqclocktask_init(void)
{
	_kallsyms_lookup_name_func = get_func_by_symbol_name_kallsyms_lookup_name();
	_prq = get_func_by_symbol_name("runqueues");
	if (_prq == NULL) {
		printk(KERN_ERR "get_func_by_symbol_name runqueues failed!\n");
		return -1;
	}
	mykern_sched_stat_runtime.callback = cb_sched_stat_runtime;
	for_each_kernel_tracepoint(sched_stat_runtime_tracepoint_check_and_set,
				   &mykern_sched_stat_runtime);
	if (!mykern_sched_stat_runtime.ptr) {
		printk(KERN_ERR "mykern_sched_stat_runtime register failed!\n");
		return 0;
	}
	else {
		printk(KERN_INFO "mykern_sched_stat_runtime register succeeded!\n");
	}
	tracepoint_probe_register(mykern_sched_stat_runtime.ptr,
				  mykern_sched_stat_runtime.callback, NULL);
	mykern_sched_stat_runtime.bregister = 1;
	return 0;
}

static void __exit testrqclocktask_exit(void)
{
	clear_kern_tracepoint(&mykern_sched_stat_runtime);
	tracepoint_synchronize_unregister();
}

module_init(testrqclocktask_init);
module_exit(testrqclocktask_exit);

Running the module produces printk lines in dmesg that show rq_clock_task, rq_clock and local_clock side by side.

On this system rq_clock_task and rq_clock are identical, while rq_clock trails local_clock by a small amount. That is because rq_clock is updated on scheduling events rather than continuously: our printk runs inside the sched_stat_runtime tracepoint, and although the scheduler refreshes rq_clock as part of handling that event, the time that elapses after the refresh is not yet reflected in rq->clock and rq->clock_task. The prints therefore show them lagging local_clock by roughly 100 to a few hundred nanoseconds.

3. Implementation principle

Section 3.1 describes how struct rq is stored in the kernel. Section 3.2 explains how a kernel module can obtain the struct rq pointer of each CPU; with that pointer we can naturally read the rq's clock and clock_task members, and clock_task is exactly the value the kernel's rq_clock_task() function returns.

3.1 How the run queue rq is stored in the kernel

Let's look at the implementation of the cpu_rq macro that kernel code uses everywhere:
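Quoted from kernel/sched/sched.h (recent kernels; minor variations across versions):

/* kernel/sched/sched.h */
DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

#define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
#define this_rq()		this_cpu_ptr(&runqueues)
#define task_rq(p)		cpu_rq(task_cpu(p))
#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
#define raw_rq()		raw_cpu_ptr(&runqueues)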

As the definition shows, the kernel stores the run queues in the per-cpu variable runqueues.

3.2 How to obtain the run queue rq in a kernel module

Section 3.2.1 first shows how to use a kprobe to obtain the kallsyms_lookup_name function pointer and, through it, the struct rq* address. Section 3.2.2 then shows how to make the module understand the layout of struct rq so that it can read the clock_task value inside the rq.

3.2.1 Obtaining the kallsyms_lookup_name function pointer via kprobe, then the struct rq* address

The runqueues variable is not an exported symbol, so a module cannot reach it directly through the per_cpu macros. Instead, using the method from chapter 3 of the earlier post 获取任意一个进程的共享内存的fd对应的资源,增加引用,实现数据的接管——包含非export的内核函数的模块内使用_kprobe能追踪模块的未export函数吗?-CSDN博客, we first locate the kallsyms_lookup_name function pointer with a kprobe (kallsyms_lookup_name itself stopped being exported in v5.7), and then call it to resolve the address of the runqueues variable.

Here is the part of the Section 2 code that obtains the kallsyms_lookup_name function pointer:
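Reproduced from the Section 2 listing:

void* get_func_by_symbol_name_kallsyms_lookup_name(void)
{
	int ret;
	void* pfunc = NULL;
	struct kprobe kp;
	memset(&kp, 0, sizeof(kp));
	kp.symbol_name = "kallsyms_lookup_name";
	kp.pre_handler = NULL;
	kp.addr = NULL;	/* left NULL on purpose: the lookup goes through symbol_name */
	ret = register_kprobe(&kp);
	if (ret < 0) {
		printk("register_kprobe fail!\n");
		return NULL;
	}
	printk("register_kprobe succeed!\n");
	pfunc = (void*)kp.addr;	/* register_kprobe resolved symbol_name into kp.addr */
	unregister_kprobe(&kp);
	return pfunc;
}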

And the part that calls the resolved kallsyms_lookup_name to obtain the address of a symbol or function:
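Also reproduced from the Section 2 listing:

void* get_func_by_symbol_name(const char* i_symbol)
{
	if (_kallsyms_lookup_name_func == NULL) {
		return NULL;
	}
	return (void*)_kallsyms_lookup_name_func(i_symbol);
}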

At module init time we obtain the struct rq* base pointer:
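From testrqclocktask_init in the Section 2 listing:

	_kallsyms_lookup_name_func = get_func_by_symbol_name_kallsyms_lookup_name();
	_prq = get_func_by_symbol_name("runqueues");
	if (_prq == NULL) {
		printk(KERN_ERR "get_func_by_symbol_name runqueues failed!\n");
		return -1;
	}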

And from the CPU number we derive that CPU's struct rq* pointer:
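Again from the Section 2 listing; per_cpu_ptr turns the per-cpu base address into the instance for the given CPU:

struct rq* my_cpu_rq(int i_cpu)
{
	return per_cpu_ptr(_prq, i_cpu);
}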

3.2.2 Making the kernel module understand struct rq so it can read clock_task

Where is struct rq defined? In kernel/sched/sched.h.

kernel/sched/sched.h is a header internal to the scheduler code, and kernel modules cannot include it. We therefore copy the struct rq definition out of it into our test module's source file, testrqclock.c, as shown in Section 2.

struct rq in turn depends on several other structure definitions that must come along with it: struct uclamp_bucket, struct uclamp_rq, struct cfs_rq, struct rt_prio_array, struct rt_rq and struct dl_rq. That is why the Section 2 code copies those definitions as well.

Once the module knows the layout of struct rq, it can read the structure's members. rq_clock_task, the inline function the scheduler uses as its yardstick for task execution time, simply returns the rq's clock_task member:
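For reference, this is the definition in kernel/sched/sched.h of recent kernels (the assertion helpers have changed names across versions):

static inline u64 rq_clock_task(struct rq *rq)
{
	lockdep_assert_rq_held(rq);
	assert_clock_updated(rq);

	return rq->clock_task;
}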

So in our module, the registered tracepoint callback just reads that member directly:
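Reproduced from the Section 2 listing; the sched_stat_runtime callback reads clock_task (and clock) straight off the current CPU's rq:

static void cb_sched_stat_runtime(void *i_data, struct task_struct *i_curr,
				  u64 i_runtime, u64 i_vruntime)
{
	struct rq* prq = my_cpu_rq(smp_processor_id());
	printk("rq_clock_task[%llu],rq_clock[%llu],local_clock[%llu]\n",
	       prq->clock_task, prq->clock, local_clock());
}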

The details of using tracepoints from a kernel module have come up in several earlier posts, so they are not repeated here.

