上一節介紹了kprobe的基本概念,下面我們將使用幾個具體的例子,看下kprobe在實際使用中有哪些應用場景。
內核的samples/kprobes目錄下有kprobe相關的例子,我們以這些例子為基礎,簡單修改下。
我們所有的例子都是探測do_sys_open() 或者_do_fork(),以下是內核中的源碼。
struct audit_names;
/*
 * Kernel-side handle for a path name. do_sys_open() below obtains one
 * from getname() and releases it with putname(); the kprobe/jprobe
 * examples read ->name to filter on the file being opened.
 */
struct filename {
const char *name; /* pointer to actual string */
const __user char *uptr; /* original userland pointer */
struct audit_names *aname;
int refcnt;
const char iname[]; /* flexible array member; presumably inline storage for short names -- confirm */
};
/*
 * Open the file named by the user-space string @filename, relative to @dfd.
 *
 * Returns the newly installed file descriptor on success, or a negative
 * value on failure (from build_open_flags(), getname(),
 * get_unused_fd_flags() or do_filp_open()).
 */
long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
struct open_flags op;
/* A non-zero result from build_open_flags() is returned to the caller as-is. */
int fd = build_open_flags(flags, mode, &op);
struct filename *tmp;
if (fd)
return fd;
/* Turn the user pointer into a struct filename; released by putname() below. */
tmp = getname(filename);
if (IS_ERR(tmp))
return PTR_ERR(tmp);
fd = get_unused_fd_flags(flags);
if (fd >= 0) {
/* This is the call the kprobe/jprobe examples attach to. */
struct file *f = do_filp_open(dfd, tmp, &op);
if (IS_ERR(f)) {
/* Opening failed: give the reserved descriptor back and return the error. */
put_unused_fd(fd);
fd = PTR_ERR(f);
} else {
fsnotify_open(f);
fd_install(fd, f);
}
}
putname(tmp);
return fd;
}
/*
 * Create a new task via copy_process() and start it running.
 *
 * Returns the value of pid_vnr() for the new task's pid on success, or
 * the negative error encoded in copy_process()'s result on failure.
 * The local variable nr is the value the offset-kprobe example reads
 * from x0 right after the call to pid_vnr().
 */
long _do_fork(unsigned long clone_flags,
unsigned long stack_start,
unsigned long stack_size,
int __user *parent_tidptr,
int __user *child_tidptr,
unsigned long tls)
{
struct task_struct *p;
int trace = 0;
long nr;
/*
* Determine whether and which event to report to ptracer. When
* called from kernel_thread or CLONE_UNTRACED is explicitly
* requested, no event is reported; otherwise, report if the event
* for the type of forking is enabled.
*/
if (!(clone_flags & CLONE_UNTRACED)) {
if (clone_flags & CLONE_VFORK)
trace = PTRACE_EVENT_VFORK;
else if ((clone_flags & CSIGNAL) != SIGCHLD)
trace = PTRACE_EVENT_CLONE;
else
trace = PTRACE_EVENT_FORK;
/* Drop the event again if the tracer has not enabled it. */
if (likely(!ptrace_event_enabled(current, trace)))
trace = 0;
}
p = copy_process(clone_flags, stack_start, stack_size,
child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
/*
* Do this prior waking up the new thread - the thread pointer
* might get invalid after that point, if the thread exits quickly.
*/
if (!IS_ERR(p)) {
struct completion vfork;
struct pid *pid;
cpufreq_task_times_alloc(p);
trace_sched_process_fork(current, p);
/* Take a reference on the pid; dropped by put_pid() below. */
pid = get_task_pid(p, PIDTYPE_PID);
nr = pid_vnr(pid);
if (clone_flags & CLONE_PARENT_SETTID)
put_user(nr, parent_tidptr);
/* For vfork, set up the completion before the child can run. */
if (clone_flags & CLONE_VFORK) {
p->vfork_done = &vfork;
init_completion(&vfork);
get_task_struct(p);
}
wake_up_new_task(p);
/* forking complete and child started to run, tell ptracer */
if (unlikely(trace))
ptrace_event_pid(trace, pid);
if (clone_flags & CLONE_VFORK) {
if (!wait_for_vfork_done(p, &vfork))
ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
}
put_pid(pid);
} else {
nr = PTR_ERR(p);
}
return nr;
}
實際調試中經常需要調查函數使用的變量的值。要在kprobes的偵測器內顯示某個函數的局部變量的值,需要一些技巧,原因是在printk的參數中無法直接指定變量名,因此必須給偵測器函數提供一個pt_regs結構,其中保存了指定地址的指令執行時的寄存器信息。
當然,不同架構下該結構的成員變量不盡相同,但用該結構可以顯示變量等更為詳細的信息。
ARM64,ARM32,X86的寄存器及其訪問方式可以看文末的目錄
/*
* NOTE: This example works on x86 and arm64.
* Here's a sample kernel module showing the use of kprobes to print
* the dfd and file name arguments when do_filp_open() is called for a
* file named "testfile".
*
* For more information on theory of operation of kprobes, see
* Documentation/kprobes.txt
*
* You will see the trace data in /var/log/messages and on the console
* whenever do_filp_open() is invoked to open that file.
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>
/* Name of the kernel function this example attaches its probe to. */
#define TRACE_SYMBOL "do_filp_open"
/*
 * For each probe you need to allocate a kprobe structure.
 * Only the symbol name is set here; the handlers are filled in by
 * kprobe_init() before registration.
 */
static struct kprobe kp = {
.symbol_name = TRACE_SYMBOL,
};
/*
 * kprobe pre_handler: called just before the probed instruction is executed.
 *
 * Recovers the first two arguments of the probed function from the saved
 * registers and logs them when the file name is exactly "testfile".
 *   x86_64 : arguments are passed in rdi, rsi, rdx, rcx, r8, r9 (in order)
 *   aarch64: arguments are passed in x0-x7
 * On any other architecture nothing is read and nothing is printed.
 *
 * Always returns 0.
 */
static int handler_pre(struct kprobe *p, struct pt_regs *regs)
{
	struct filename *fname = NULL;
	int dfd = -1;

#if defined(CONFIG_X86)
	dfd = regs->di;
	fname = (struct filename *)regs->si;
#endif
#if defined(CONFIG_ARM64)
	dfd = regs->regs[0];
	fname = (struct filename *)regs->regs[1];
#endif

	/* Nothing to report unless we got a name and it is the one we watch. */
	if (!fname)
		return 0;
	if (strcmp(fname->name, "testfile") != 0)
		return 0;

	printk(KERN_INFO "handler_pre:%s: dfd=%d, name=%s\n",
	       p->symbol_name, dfd, fname->name);
	return 0;
}
/*
 * kprobe post_handler: called after the probed instruction is executed.
 * Deliberately does nothing in this example; the trace line below is
 * disabled to keep the log free of noise.
 */
static void handler_post(struct kprobe *p, struct pt_regs *regs,
unsigned long flags)
{
//printk(KERN_INFO "handler_post\n");
}
/*
* fault_handler: this is called if an exception is generated for any
* instruction within the pre- or post-handler, or when Kprobes
* single-steps the probed instruction.
*
* The diagnostic print is disabled in this example, so the handler only
* reports "not handled" by returning 0.
*/
static int handler_fault(struct kprobe *p, struct pt_regs *regs, int trapnr)
{
/*printk(KERN_INFO "fault_handler: p->addr = 0x%p, trap #%d\n",
p->addr, trapnr);*/
/* Return 0 because we don't handle the fault. */
return 0;
}
/*
 * Module entry point: attach the three handlers to kp and plant the probe.
 *
 * Returns 0 on success, or the negative value returned by
 * register_kprobe() on failure.
 */
static int __init kprobe_init(void)
{
	int err;

	kp.pre_handler = handler_pre;
	kp.post_handler = handler_post;
	kp.fault_handler = handler_fault;

	err = register_kprobe(&kp);
	if (err >= 0) {
		printk(KERN_INFO "Planted kprobe at %p\n", kp.addr);
		return 0;
	}

	printk(KERN_INFO "register_kprobe failed, returned %d\n", err);
	return err;
}
/* Module exit point: remove the probe, then log the address it was planted at. */
static void __exit kprobe_exit(void)
{
unregister_kprobe(&kp);
printk(KERN_INFO "kprobe at %p unregistered\n", kp.addr);
}
/* Register the entry/exit points and declare the module licence. */
module_init(kprobe_init)
module_exit(kprobe_exit)
MODULE_LICENSE("GPL");
我們以內核目錄下的例程做一個簡單修改,探測do_filp_open函數,當打開testfile文件時,自動打印出文件的路徑。
為了減少無效信息的打印,我們將handler_post,handler_fault中的打印直接注釋掉。
當探測點do_filp_open命中時,Kprobes調用handler_pre。在handler_pre中根據struct filename *pathname來獲得文件的名字。
在x86_64架構中,函數的參數從左到右分別保存在rdi、rsi、rdx、rcx、r8、r9中,因此查看rdi和rsi就能得到第1個、第2個參數的值。
同理,在ARM64架構中, 函數的參數1~參數8分別保存到 X0~X7 寄存器中 ,剩下的參數從右往左依次入棧。因此,X0和X1分別存放dfd, pathname的值。
# Out-of-tree build of the kprobe sample modules for an arm64 target.
# Override KERNELDIR / CROSS_COMPILE on the command line if your tree or
# toolchain lives elsewhere.
CROSS_COMPILE := aarch64-linux-gnu-
ARCH := arm64
CC := $(CROSS_COMPILE)gcc
LD := $(CROSS_COMPILE)ld
PWD := $(shell pwd)
obj-m := kprobe_example.o jprobe_example.o kretprobe_example.o
KERNELDIR := /home/zhongyi/code/rk3399_linux_release_v2.5.1_20210301/kernel

.PHONY: all clean

# Variables set in this file are not exported to the sub-make, so both
# ARCH and CROSS_COMPILE are passed explicitly. $(MAKE) keeps -j and
# other flags working across the recursion.
all:
	$(MAKE) -C $(KERNELDIR) M=$(PWD) ARCH=$(ARCH) CROSS_COMPILE=$(CROSS_COMPILE) modules

clean:
	rm -f *.o
	rm -f *.symvers
	rm -f *.order
	rm -f *.ko
	rm -f *.mod.c
執(zhí)行make編譯后,在開發(fā)板上將驅(qū)動加載后,手動打開testfile文件。
insmod kprobe_example.ko
vim testfile
rmmod kprobe_example.ko
dmesg
使用dmesg可以看到成功輸出文件名和dfd。
[ 307.572314] Planted kprobe at ffffff80081fdf84
[ 311.997767] handler_pre:do_filp_open: dfd=-100, name=testfile
[ 312.034774] handler_pre:do_filp_open: dfd=-100, name=testfile
[ 347.969572] kprobe at ffffff80081fdf84 unregistered
使用kprobes的另一個有效的調試方法,就是顯示棧跟蹤。
我們只需要在handler_pre中調(diào)用dump_stack();即可。
/* x86_64 passes arguments in registers in this order: rdi rsi rdx rcx r8 r9 */
/* aarch64: x0-x7 hold the arguments */
/*
 * kprobe pre_handler: called just before the probed instruction is executed.
 * This variant only prints a stack backtrace of the probe hit and
 * returns 0.
 */
static int handler_pre(struct kprobe *p, struct pt_regs *regs)
{
dump_stack();
return 0;
}
編譯加載
insmod kprobe_example.ko
rmmod kprobe_example.ko
dmesg
成功打印出棧的信息。
[ 451.620803] CPU: 4 PID: 1299 Comm: rmmod Tainted: G O 4.4.194+ #18
[ 451.620809] Hardware name: Firefly-RK3399 Board (Linux Opensource) (DT)
[ 451.620813] Call trace:
[ 451.620820] [<ffffff8008088410>] dump_backtrace+0x0/0x220
[ 451.620828] [<ffffff8008088654>] show_stack+0x24/0x30
[ 451.620834] [<ffffff80084f842c>] dump_stack+0x94/0xbc
[ 451.620842] [<ffffff8000f22048>] handler_pre+0x14/0x24 [kprobe_example]
[ 451.620848] [<ffffff8008efd824>] kprobe_breakpoint_handler+0x100/0x14c
[ 451.620855] [<ffffff8008084128>] brk_handler+0x54/0x80
[ 451.620860] [<ffffff8008080b0c>] do_debug_exception+0x58/0xc0
[ 451.620866] Exception stack(0xffffffc0f2ef7c40 to 0xffffffc0f2ef7d70)
[ 451.620879] 7c40: ffffffc0ef782000 0000008000000000 ffffffc0f2ef7e20 ffffff80081fdf84
[ 451.620886] 7c60: 0000000060000145 ffffff8008efc228 ffffffc0ceff2a50 ffffffc0ee7d2988
[ 451.620892] 7c80: ffffffc0f2ef7ca0 ffffff80081c0dc8 ffffffc0f0582e70 00e80000e95f3f53
[ 451.620898] 7ca0: ffffffc0f2ef7d70 ffffff8008efe3e8 ffffffc0f2ef7ec0 0000005583d31928
[ 451.620905] 7cc0: 0000000000000055 0000000092000047 ffffffc0ceec5100 ffffffc0dccbd500
[ 451.620911] 7ce0: 0000000000000024 ffffffc0dccbd580 00000000ffffff9c ffffffc0ef782000
[ 451.620917] 7d00: ffffffc0f2ef7e78 0000000000000000 0000000000000000 0000000000000003
[ 451.620923] 7d20: ffffffc0dcfc9a80 0000007fd94380e8 0000000000000000 fefefefefefefeff
[ 451.620929] 7d40: 0000000000000001 0000007fd9437db8 0000000000000000 0000000000000000
[ 451.620934] 7d60: 0000000000000000 000000007fffffde
[ 451.620940] [<ffffff8008082668>] el1_dbg+0x18/0x7c
[ 451.620947] [<ffffff80081ed9a4>] SyS_openat+0x3c/0x4c
[ 451.620953] [<ffffff8008082f70>] el0_svc_naked+0x24/0x28
[ 451.630032] kprobe at ffffff80081fdf84 unregistered
kprobes擁有更加強大的功能,那就是它能在內核的任意地址插入偵測器。此外,偵測器可以在任意地址的指令執行之前或之后執行,或者前后都執行。
因此,應當觀察匯編代碼,找到源代碼中想要調查的位置對應于編譯后的二進制文件中的什么地址,并調查希望顯示的變量保存在哪個寄存器、哪個內存地址。
通常,我們希望查看函數執行過程中的變量,即打印一些流程中的東西,而不只是探測函數本身被調用,此時我們不能只簡單設置 kprobe->symbol_name 為函數名字,還需要指定探測點在函數內的偏移。假設我們期望獲取 _do_fork函數變量 nr 的值:
將vmlinux進行反匯編,找出_do_fork的地址。
aarch64-linux-gnu-objdump -s -d vmlinux > vmlinux.asm
_do_fork 反匯編如下所示,地址為ffffff80080ba83c。
ffffff80080ba83c <_do_fork>:
ffffff80080ba83c: a9b97bfd stp x29, x30, [sp, #-112]!
ffffff80080ba840: 910003fd mov x29, sp
ffffff80080ba844: a90153f3 stp x19, x20, [sp, #16]
ffffff80080ba848: a9025bf5 stp x21, x22, [sp, #32]
ffffff80080ba84c: a90363f7 stp x23, x24, [sp, #48]
ffffff80080ba850: aa0003f5 mov x21, x0
ffffff80080ba854: aa0103f3 mov x19, x1
ffffff80080ba858: aa0203f6 mov x22, x2
ffffff80080ba85c: aa0303f7 mov x23, x3
ffffff80080ba860: aa0403f8 mov x24, x4
ffffff80080ba864: aa1e03e0 mov x0, x30
ffffff80080ba868: 97ff4e8a bl ffffff800808e290 <_mcount>
ffffff80080ba86c: 37b814f5 tbnz w21, #23, ffffff80080bab08 <_do_fork+0x2cc>
ffffff80080ba870: 37701495 tbnz w21, #14, ffffff80080bab00 <_do_fork+0x2c4>
ffffff80080ba874: 92401ea0 and x0, x21, #0xff
ffffff80080ba878: 52800074 mov w20, #0x3 // #3
ffffff80080ba87c: f100441f cmp x0, #0x11
ffffff80080ba880: 1a9f1694 csinc w20, w20, wzr, ne // ne = any
ffffff80080ba884: 11000e81 add w1, w20, #0x3
............................
ffffff80080ba91c: b5000fb6 cbnz x22, ffffff80080bab10 <_do_fork+0x2d4>
ffffff80080ba920: 52800001 mov w1, #0x0 // #0
ffffff80080ba924: aa1303e0 mov x0, x19
ffffff80080ba928: 94006a17 bl ffffff80080d5184 <get_task_pid>
ffffff80080ba92c: aa0003f6 mov x22, x0
ffffff80080ba930: 94006a85 bl ffffff80080d5344 <pid_vnr>
ffffff80080ba934: 93407c18 sxtw x24, w0
ffffff80080ba938: 36a00195 tbz w21, #20, ffffff80080ba968 <_do_fork+0x12c>
ffffff80080ba93c: d5384101 mrs x1, sp_el0
ffffff80080ba940: f9400422 ldr x2, [x1, #8]
ffffff80080ba944: aa1703e1 mov x1, x23
ffffff80080ba948: b1001021 adds x1, x1, #0x4
nr 變量 是 函數pid_vnr的返回值(也是子進程的pid) ,根據ARM調用規范,調用完成pid_vnr()后,寄存器x0存放的就是其函數返回值。
參考:ARM64調用標準 https://blog.51cto.com/u_15333820/3452605
通過反匯編可以知道,pid_vnr在 ffffff80080ba930地址處被調用,因此,偵測器的插入地址就是在ffffff80080ba930之后,并且x0被改變之前。只要符合這兩個條件,放在哪里都無所謂。
因此,我們將kprobe的點設置為ffffff80080ba934,然后獲取 x0,就能獲取變量nr的值。
.offset 是探測點相對于_do_fork的偏移,在注冊時指定。我們這里的 offset = 0xffffff80080ba934 - 0xffffff80080ba83c = 0xF8。
另外,反匯編能力就是多看匯編以及找到幾個關鍵點(例如常量,跳轉語句)就能定位到匯編對應的源碼了,這里不再展開了。
/*
* NOTE: This example works on x86 and arm64.
* Here's a sample kernel module showing the use of kprobes to dump
* selected registers at a chosen instruction inside _do_fork().
*
* For more information on theory of operation of kprobes, see
* Documentation/kprobes.txt
*
* You will see the trace data in /var/log/messages and on the console
* whenever _do_fork() is invoked to create a new process.
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>
/*
 * For each probe you need to allocate a kprobe structure.
 * The probe is placed 0xF8 bytes into _do_fork(); per the disassembly
 * in the article that is the instruction right after the call to
 * pid_vnr() on the author's arm64 build, so the offset is specific to
 * that kernel image.
 */
static struct kprobe kp = {
.symbol_name = "_do_fork",
.offset = 0xF8,
};
/*
 * kprobe pre_handler: called just before the probed instruction is executed.
 *
 * Logs the probe address together with the program counter, the
 * flags/pstate word and the rax (x86) or x0 (arm64) register taken from
 * the saved register set. Always returns 0.
 */
static int handler_pre(struct kprobe *p, struct pt_regs *regs)
{
#if defined(CONFIG_X86)
	printk(KERN_INFO
	       "pre_handler: p->addr = 0x%p, ip = %lx, flags = 0x%lx,rax = 0x%lx\n",
	       p->addr, regs->ip, regs->flags, regs->ax);
#endif
#if defined(CONFIG_ARM64)
	pr_info("<%s> pre_handler: p->addr = 0x%p, pc = 0x%lx, pstate = 0x%lx,x0 = 0x%lx\n",
		p->symbol_name, p->addr, (long)regs->pc, (long)regs->pstate,
		(long)regs->regs[0]);
#endif

	/* Calling dump_stack() at this point would print a stack backtrace. */
	return 0;
}
/*
 * kprobe post_handler: called after the probed instruction is executed.
 *
 * Logs the probe address and the flags (x86) or pstate (arm64) word from
 * the saved register set. The flags parameter is not used.
 */
static void handler_post(struct kprobe *p, struct pt_regs *regs,
			 unsigned long flags)
{
#if defined(CONFIG_X86)
	printk(KERN_INFO
	       "post_handler: p->addr = 0x%p, flags = 0x%lx\n",
	       p->addr, regs->flags);
#endif
#if defined(CONFIG_ARM64)
	pr_info("<%s> post_handler: p->addr = 0x%p, pstate = 0x%lx\n",
		p->symbol_name, p->addr,
		(long)regs->pstate);
#endif
}
/*
 * fault_handler: this is called if an exception is generated for any
 * instruction within the pre- or post-handler, or when Kprobes
 * single-steps the probed instruction.
 *
 * Logs the probe address and trap number, then returns 0 to signal that
 * the fault was not handled here.
 */
static int handler_fault(struct kprobe *p, struct pt_regs *regs, int trapnr)
{
	/*
	 * The message must end in "\n"; the previous "%dn" printed a stray
	 * 'n' and left the log line unterminated.
	 */
	printk(KERN_INFO "fault_handler: p->addr = 0x%p, trap #%d\n",
	       p->addr, trapnr);
	/* Return 0 because we don't handle the fault. */
	return 0;
}
/*
 * Module entry point: wire the pre/post/fault handlers into kp and
 * register the probe.
 *
 * Returns 0 on success, or the negative value returned by
 * register_kprobe() on failure.
 */
static int __init kprobe_init(void)
{
	int rc;

	kp.pre_handler = handler_pre;
	kp.post_handler = handler_post;
	kp.fault_handler = handler_fault;

	rc = register_kprobe(&kp);
	if (rc >= 0) {
		printk(KERN_INFO "Planted kprobe at %p\n", kp.addr);
		return 0;
	}

	printk(KERN_INFO "register_kprobe failed, returned %d\n", rc);
	return rc;
}
/* Module exit point: remove the probe, then log the address it was planted at. */
static void __exit kprobe_exit(void)
{
unregister_kprobe(&kp);
printk(KERN_INFO "kprobe at %p unregistered\n", kp.addr);
}
/* Register the entry/exit points and declare the module licence. */
module_init(kprobe_init)
module_exit(kprobe_exit)
MODULE_LICENSE("GPL");
insmod kprobe_example.ko
rmmod kprobe_example.ko
dmesg
編譯加載后,成功打印出rax的值。
[ 245.080636] pre_handler: p->addr = 0x0000000050a6c3dd, ip = ffffffffa5ca0009, flags = 0x246,rax = 0x2
[ 245.080640] post_handler: p->addr = 0x0000000050a6c3dd, flags = 0x246
[ 245.080936] pre_handler: p->addr = 0x0000000050a6c3dd, ip = ffffffffa5ca0009, flags = 0x246,rax = 0x2
[ 245.080938] post_handler: p->addr = 0x0000000050a6c3dd, flags = 0x246
[ 245.457340] pre_handler: p->addr = 0x0000000050a6c3dd, ip = ffffffffa5ca0009, flags = 0x246,rax = 0x2
[ 245.457345] post_handler: p->addr = 0x0000000050a6c3dd, flags = 0x246
[ 245.457643] pre_handler: p->addr = 0x0000000050a6c3dd, ip = ffffffffa5ca0009, flags = 0x246,rax = 0x2
[ 245.457645] post_handler: p->addr = 0x0000000050a6c3dd, flags = 0x246
[ 245.719208] pre_handler: p->addr = 0x0000000050a6c3dd, ip = ffffffffa5ca0009, flags = 0x246,rax = 0x2
[ 245.719213] post_handler: p->addr = 0x0000000050a6c3dd, flags = 0x246
[ 245.719505] pre_handler: p->addr = 0x0000000050a6c3dd, ip = ffffffffa5ca0009, flags = 0x246,rax = 0x2
[ 245.719507] post_handler: p->addr = 0x0000000050a6c3dd, flags = 0x246
[ 245.820761] pre_handler: p->addr = 0x0000000050a6c3dd, ip = ffffffffa5ca0009, flags = 0x246,rax = 0x2
[ 245.820765] post_handler: p->addr = 0x0000000050a6c3dd, flags = 0x246
[ 245.821061] pre_handler: p->addr = 0x0000000050a6c3dd, ip = ffffffffa5ca0009, flags = 0x246,rax = 0x2
[ 245.821063] post_handler: p->addr = 0x0000000050a6c3dd, flags = 0x246
[ 246.092572] pre_handler: p->addr = 0x0000000050a6c3dd, ip = ffffffffa5ca0009, flags = 0x246,rax = 0x2
[ 246.092577] post_handler: p->addr = 0x0000000050a6c3dd, flags = 0x246
[ 246.095863] pre_handler: p->addr = 0x0000000050a6c3dd, ip = ffffffffa5ca0009, flags = 0x246,rax = 0x2
[ 246.095867] post_handler: p->addr = 0x0000000050a6c3dd, flags = 0x246
[ 246.126196] kprobe at 0000000050a6c3dd unregistered
與kprobes相比,jprobes能更容易地獲取傳給函數的參數。有幾點需要注意:
/*
* Here's a sample kernel module showing the use of jprobes to dump
* the arguments of do_filp_open().
*
* For more information on theory of operation of jprobes, see
* Documentation/kprobes.txt
*
* Build and insert the kernel module as done in the kprobe example.
* You will see the trace data in /var/log/messages and on the
* console whenever do_filp_open() is invoked to open "testfile".
* (Some messages may be suppressed if syslogd is configured to
* eliminate duplicate messages.)
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>
/*
* Jumper probe for do_filp_open.
* Mirror principle enables access to arguments of the probed routine
* from the probe handler.
*/
/* Proxy routine having the same arguments as actual _do_fork() routine */
#define TRACE_SYMBOL "do_filp_open"
/*與do_filp_open 的參數(shù)完全相同*/
static struct file * jp_do_filp_open(int dfd, struct filename *pathname,
const struct open_flags *op)
{
if (pathname && !(strcmp(pathname->name, "testfile")))
printk(KERN_INFO "jprobe: dfd = %d, pathname = %s\n", dfd, pathname->name);
/* Always end with a call to jprobe_return(). */
jprobe_return();
return 0;
}
/*
 * The jprobe descriptor: .entry is the proxy routine that receives the
 * probed function's arguments, .kp.symbol_name selects the function to
 * attach to.
 */
static struct jprobe my_jprobe = {
.entry = jp_do_filp_open,
.kp = {
.symbol_name = TRACE_SYMBOL,
},
};
/*
 * Module entry point: register the jprobe.
 *
 * Returns 0 on success. On failure the negative value from
 * register_jprobe() is passed through unchanged, so insmod reports the
 * real cause instead of a generic -1 (which reads as -EPERM).
 */
static int __init jprobe_init(void)
{
	int ret;

	ret = register_jprobe(&my_jprobe);
	if (ret < 0) {
		printk(KERN_INFO "register_jprobe failed, returned %d\n", ret);
		return ret;
	}
	printk(KERN_INFO "Planted jprobe at %p, handler addr %p\n",
	       my_jprobe.kp.addr, my_jprobe.entry);
	return 0;
}
/* Module exit point: remove the jprobe, then log the address it was planted at. */
static void __exit jprobe_exit(void)
{
unregister_jprobe(&my_jprobe);
printk(KERN_INFO "jprobe at %p unregistered\n", my_jprobe.kp.addr);
}
/* Register the entry/exit points and declare the module licence. */
module_init(jprobe_init)
module_exit(jprobe_exit)
MODULE_LICENSE("GPL");
使用kprobes時,必須通過寄存器或棧才能計算出參數的值。此外,計算方法還依賴于架構。
如果使用jprobes,那么無須了解架構的詳細知識,也能簡單地查看參數的值。
編譯加載驅動程序
insmod jprobe_example.ko
vim testfile
rmmod jprobe_example.ko
dmesg
成功打印出函數(shù)的參數(shù)
[ 612.670453] jprobe at ffffff80081fdf84 unregistered
[ 867.293765] Planted jprobe at ffffff80081fdf84, handler addr ffffff8000f1a000
[ 871.107502] jprobe: dfd = -100, pathname = testfile
[ 871.147747] jprobe: dfd = -100, pathname = testfile
[ 875.723761] jprobe at ffffff80081fdf84 unregistered
[ 907.706066] Planted jprobe at ffffff80081fdf84, handler addr ffffff8000f22000
[ 911.661891] jprobe: dfd = -100, pathname = testfile
[ 911.694903] jprobe: dfd = -100, pathname = testfile
[ 919.272187] jprobe at ffffff80081fdf84 unregistered
[ 2296.830613] Planted jprobe at ffffff80081fdf84, handler addr ffffff8000f2a000
[ 2302.164861] jprobe: dfd = -100, pathname = testfile
[ 2302.200634] jprobe: dfd = -100, pathname = testfile
[ 2307.407014] jprobe at ffffff80081fdf84 unregistered
kretprobe 也是基于kprobe的,相比于kprobe和jprobe,實現相對復雜。下面我們以內核目錄下的例程,簡單分析下。
/*
* kretprobe_example.c
*
* Here's a sample kernel module showing the use of return probes to
* report the return value and total time taken for probed function
* to run.
*
* usage: insmod kretprobe_example.ko func=<func_name>
*
* If no func_name is specified, do_sys_open is instrumented
*
* For more information on theory of operation of kretprobes, see
* Documentation/kprobes.txt
*
* Build and insert the kernel module as done in the kprobe example.
* You will see the trace data in /var/log/messages and on the console
* whenever the probed function returns. (Some messages may be suppressed
* if syslogd is configured to eliminate duplicate messages.)
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/ktime.h>
#include <linux/limits.h>
#include <linux/sched.h>
/*
 * Name of the function to attach the return probe to. Defaults to
 * "do_sys_open" and can be overridden at load time with func=<name>.
 */
static char func_name[NAME_MAX] = "do_sys_open";
module_param_string(func, func_name, NAME_MAX, S_IRUGO);
MODULE_PARM_DESC(func, "Function to kretprobe; this module will report the"
" function's execution time");
/*
 * per-instance private data: stored in kretprobe_instance->data, written
 * by entry_handler() and read back by ret_handler().
 */
struct my_data {
ktime_t entry_stamp; /* time at which the probed function was entered */
};
/* Here we use the entry_hanlder to timestamp function entry */
static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
struct my_data *data;
if (!current->mm)
return 1; /* Skip kernel threads */
data = (struct my_data *)ri->data;
data->entry_stamp = ktime_get();
return 0;
}
/*
 * Return-probe handler: logs the probed function's return value and how
 * long the call took, measured from the timestamp that entry_handler()
 * stored in the instance's private data. The duration may turn out to be
 * zero consistently, depending upon the granularity of time accounting on
 * the platform.
 *
 * Always returns 0.
 */
static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	struct my_data *priv = (struct my_data *)ri->data;
	int retval = regs_return_value(regs);
	s64 elapsed_ns = ktime_to_ns(ktime_sub(ktime_get(), priv->entry_stamp));

	printk(KERN_INFO "%s returned %d and took %lld ns to execute\n",
	       func_name, retval, (long long)elapsed_ns);
	return 0;
}
/*
 * The return-probe descriptor. kp.symbol_name is filled in by
 * kretprobe_init() from the func module parameter.
 */
static struct kretprobe my_kretprobe = {
.handler = ret_handler, /* runs when the probed function returns */
.entry_handler = entry_handler, /* runs when the probed function is entered */
.data_size = sizeof(struct my_data), /* private bytes per instance */
/* Probe up to 20 instances concurrently. */
.maxactive = 20,
};
/*
 * Module entry point: point the return probe at func_name and register it.
 *
 * Returns 0 on success. On failure the negative value from
 * register_kretprobe() (e.g. -EINVAL for a blacklisted function, -ENOMEM
 * when instance allocation fails) is passed through unchanged rather
 * than being collapsed to -1.
 */
static int __init kretprobe_init(void)
{
	int ret;

	my_kretprobe.kp.symbol_name = func_name;
	ret = register_kretprobe(&my_kretprobe);
	if (ret < 0) {
		printk(KERN_INFO "register_kretprobe failed, returned %d\n",
		       ret);
		return ret;
	}
	printk(KERN_INFO "Planted return probe at %s: %p\n",
	       my_kretprobe.kp.symbol_name, my_kretprobe.kp.addr);
	return 0;
}
/*
 * Module exit point: remove the return probe, then log where it was
 * planted and how many hits were missed.
 */
static void __exit kretprobe_exit(void)
{
unregister_kretprobe(&my_kretprobe);
printk(KERN_INFO "kretprobe at %p unregistered\n",
my_kretprobe.kp.addr);
/* nmissed > 0 suggests that maxactive was set too low. */
printk(KERN_INFO "Missed probing %d instances of %s\n",
my_kretprobe.nmissed, my_kretprobe.kp.symbol_name);
}
/* Register the entry/exit points and declare the module licence. */
module_init(kretprobe_init)
module_exit(kretprobe_exit)
MODULE_LICENSE("GPL");
/*
* Function-return probe -
* Note:
* User needs to provide a handler function, and initialize maxactive.
* maxactive - The maximum number of instances of the probed function that
* can be active concurrently.
* nmissed - tracks the number of times the probed function's return was
* ignored, due to maxactive being too low.
*
*/
struct kretprobe {
struct kprobe kp; /* underlying kprobe; register_kretprobe() sets its pre_handler */
kretprobe_handler_t handler; /* user callback invoked when the probed function returns */
kretprobe_handler_t entry_handler; /* optional user callback invoked on function entry */
int maxactive;
int nmissed;
size_t data_size; /* private bytes allocated after each kretprobe_instance */
struct hlist_head free_instances; /* pool of currently unused instances */
raw_spinlock_t lock; /* taken around accesses to free_instances */
};
/*
 * Signature shared by the kretprobe entry and return callbacks: they
 * receive the per-call instance and the saved registers.
 */
typedef int (*kretprobe_handler_t) (struct kretprobe_instance *,
struct pt_regs *);
/*
 * One in-flight invocation of a return-probed function. Instances are
 * pre-allocated by register_kretprobe() and taken from / returned to
 * kretprobe->free_instances.
 */
struct kretprobe_instance {
struct hlist_node hlist; /* links the instance into the free list or the hash table */
struct kretprobe *rp; /* owning kretprobe */
kprobe_opcode_t *ret_addr; /* original return address saved by arch_prepare_kretprobe() */
struct task_struct *task; /* task that hit the probe */
char data[0]; /* data_size bytes of user private data follow */
};
kretprobe探測點的blackpoint,用來表示不支持kretprobe探測的函數的信息。name表示該函數名,addr表示該函數的地址。
/* Describes one function that must not be return-probed. */
struct kretprobe_blackpoint {
const char *name; /* function name; a NULL name terminates the blacklist */
void *addr; /* function address, compared against the probe address at registration */
};
blackpoint與架構相關,x86架構不支持的kretprobe探測點如下:
// arch/x86/kernel/kprobes/core.c
// Functions that cannot be return-probed, as the name "blacklist" suggests.
struct kretprobe_blackpoint kretprobe_blacklist[] = {
{"__switch_to", }, /* This function switches only current task, but
doesn't switch kernel stack.*/
{NULL, NULL} /* Terminator */
};
/* Entry count including the terminator; register_kretprobe() skips the check when it is 0. */
const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
register_kretprobe函數的開頭首先處理 kretprobe_blacklist,如果指定的被探測函數在這個blacklist中就直接返回-EINVAL,表示不支持探測,在x86架構中是__switch_to 這個函數,表示這個函數不能被kretprobe。
/*
 * Register a return probe.
 *
 * Rejects blacklisted targets, installs the kernel's own pre_handler,
 * pre-allocates rp->maxactive instances and finally registers the
 * embedded kprobe.
 *
 * Returns 0 on success; -EINVAL for a blacklisted address, -ENOMEM if an
 * instance cannot be allocated, or the error from kprobe_addr() /
 * register_kprobe().
 */
int register_kretprobe(struct kretprobe *rp)
{
int ret = 0;
struct kretprobe_instance *inst;
int i;
void *addr;
if (kretprobe_blacklist_size) {
addr = kprobe_addr(&rp->kp);
if (IS_ERR(addr))
return PTR_ERR(addr);
// Reject the probe with -EINVAL if the target is a blacklisted function
for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
if (kretprobe_blacklist[i].addr == addr)
return -EINVAL;
}
}
// The kernel installs its own pre_handler, pre_handler_kretprobe.
// Unlike a plain kprobe, a kretprobe does not let the user supply
// pre_handler, post_handler or the other kprobe callbacks.
rp->kp.pre_handler = pre_handler_kretprobe;
rp->kp.post_handler = NULL;
rp->kp.fault_handler = NULL;
rp->kp.break_handler = NULL;
/* Pre-allocate memory for max kretprobe instances */
if (rp->maxactive <= 0) {
#ifdef CONFIG_PREEMPT
rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
#else
rp->maxactive = num_possible_cpus();
#endif
}
raw_spin_lock_init(&rp->lock);
INIT_HLIST_HEAD(&rp->free_instances);
// Allocate maxactive kretprobe_instance objects, each followed by
// data_size bytes of private data, and put them on the free list
for (i = 0; i < rp->maxactive; i++) {
inst = kmalloc(sizeof(struct kretprobe_instance) +
rp->data_size, GFP_KERNEL);
if (inst == NULL) {
/* Undo the allocations made so far. */
free_rp_inst(rp);
return -ENOMEM;
}
INIT_HLIST_NODE(&inst->hlist);
hlist_add_head(&inst->hlist, &rp->free_instances);
}
rp->nmissed = 0;
/* Establish function entry probe point */
// Register the underlying kprobe
ret = register_kprobe(&rp->kp);
if (ret != 0)
free_rp_inst(rp);
return ret;
}
EXPORT_SYMBOL_GPL(register_kretprobe);
最后調用 register_kprobe(&rp->kp),注冊kprobe點,可以看出kretprobe也是基于kprobe機制實現的,kretprobe也是一種特殊形式的kprobe。
kretprobe注冊完成后就默認啟動探測。
pre_handler_kretprobe這個函數是內核自己定義的,內核已經指定該回調函數,不支持用戶自定義。這個 kprobe pre_handler 在每個 kretprobe 中注冊。 當探針命中時,它將設置返回探針。
#ifdef CONFIG_KRETPROBES
/*
* This kprobe pre_handler is registered with every kretprobe. When probe
* hits it will set up the return probe.
*
* It takes a free instance, runs the optional user entry_handler, lets
* the architecture code swap the return address, and files the instance
* in kretprobe_inst_table. If no free instance is available the hit is
* only counted in nmissed. Always returns 0.
*
* The bare "(1)" and "(2)" lines in the body are the article's callout
* markers for the steps it explains afterwards; they are not C code.
*/
static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
{
struct kretprobe *rp = container_of(p, struct kretprobe, kp);
unsigned long hash, flags = 0;
struct kretprobe_instance *ri;
/*
* To avoid deadlocks, prohibit return probing in NMI contexts,
* just skip the probe and increase the (inexact) 'nmissed'
* statistical counter, so that the user is informed that
* something happened:
*/
if (unlikely(in_nmi())) {
rp->nmissed++;
return 0;
}
/* TODO: consider to only swap the RA after the last pre_handler fired */
hash = hash_ptr(current, KPROBE_HASH_BITS);
raw_spin_lock_irqsave(&rp->lock, flags);
if (!hlist_empty(&rp->free_instances)) {
/* Take the first free instance while holding rp->lock. */
ri = hlist_entry(rp->free_instances.first,
struct kretprobe_instance, hlist);
hlist_del(&ri->hlist);
raw_spin_unlock_irqrestore(&rp->lock, flags);
ri->rp = rp;
ri->task = current;
(1)
/* A non-zero result from the user entry_handler cancels this hit: the instance goes back on the free list. */
if (rp->entry_handler && rp->entry_handler(ri, regs)) {
raw_spin_lock_irqsave(&rp->lock, flags);
hlist_add_head(&ri->hlist, &rp->free_instances);
raw_spin_unlock_irqrestore(&rp->lock, flags);
return 0;
}
(2)
/* Save the real return address in ri and redirect the return to the trampoline. */
arch_prepare_kretprobe(ri, regs);
/* XXX(hch): why is there no hlist_move_head? */
INIT_HLIST_NODE(&ri->hlist);
kretprobe_table_lock(hash, &flags);
hlist_add_head(&ri->hlist, &kretprobe_inst_table[hash]);
kretprobe_table_unlock(hash, &flags);
} else {
/* No free instance: count the miss (maxactive is too low). */
rp->nmissed++;
raw_spin_unlock_irqrestore(&rp->lock, flags);
}
return 0;
}
NOKPROBE_SYMBOL(pre_handler_kretprobe);
struct kretprobe *rp
rp->entry_handler && rp->entry_handler(ri, regs)
entry_handler這個回調(diào)函數(shù)就是用戶自己定義的回調(diào)函數(shù)(可選的用戶指定的處理程序),前面我們已經(jīng)介紹過了,在這里不再介紹。
/*
 * Here we use the entry_handler to timestamp function entry.
 * Returning 1 makes pre_handler_kretprobe() put the instance back on the
 * free list, so no return probe is set up for that call.
 */
static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
struct my_data *data;
// For kernel threads task->mm == NULL
if (!current->mm)
return 1; /* Skip kernel threads */
data = (struct my_data *)ri->data;
data->entry_stamp = ktime_get();
return 0;
}
arch_prepare_kretprobe(ri, regs)該函數架構相關,struct kretprobe_instance結構體 的 ret_addr 成員用于保存regs中原始的返回地址,隨后該返回地址被替換為kretprobe_trampoline。
// arch/x86/kernel/kprobes/core.c
/* View the saved kernel stack pointer as a pointer to the word at the top of the stack. */
#define stack_addr(regs) ((unsigned long *)kernel_stack_pointer(regs))
// x86_64
// arch/x86/include/asm/ptrace.h
/* Return the stack pointer value saved in @regs. */
static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
{
return regs->sp;
}
// arch/x86/kernel/kprobes/core.c
/*
 * x86: save the word at the top of the stack (the return address at
 * function entry) in ri->ret_addr and overwrite it with the address of
 * kretprobe_trampoline.
 */
void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
{
unsigned long *sara = stack_addr(regs);
ri->ret_addr = (kprobe_opcode_t *) *sara;
/* Replace the return addr with trampoline addr */
*sara = (unsigned long) &kretprobe_trampoline;
}
NOKPROBE_SYMBOL(arch_prepare_kretprobe);
//struct kretprobe_instance *ri;
//ri->ret_addr;
/* Abridged excerpt: only the member discussed here is shown. */
struct kretprobe_instance {
kprobe_opcode_t *ret_addr; // holds the probed function's original return address
};
// arch/arm64/kernel/probes/kprobes.c
/*
 * arm64: save the saved x30 value (the return address) in ri->ret_addr
 * and replace it with the address of kretprobe_trampoline.
 */
void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
struct pt_regs *regs)
{
ri->ret_addr = (kprobe_opcode_t *)regs->regs[30];
/* replace return addr (x30) with trampoline */
regs->regs[30] = (long)&kretprobe_trampoline;
}
ARM64架構中regs->regs[30]是LR(procedure link register)寄存器(X30 :LR)。
kretprobe是基于kprobe實現的,有一個固定的pre_handler回調函數,在內核中實現,無需用戶編寫。而在kprobe中pre_handler函數是提供給用戶的回調函數。
rp->kp.pre_handler = pre_handler_kretprobe; //內(nèi)核中已經(jīng)實現(xiàn)
rp->kp.post_handler = NULL;
rp->kp.fault_handler = NULL;
rp->kp.break_handler = NULL;
kretprobe提供給用戶的兩個回調(diào)函數(shù):
kretprobe_handler_t handler;
kretprobe_handler_t entry_handler; // (可選)
pre_handler回調函數會為kretprobe探測函數執行的返回值做準備工作,其中最主要的就是替換掉正常流程的返回地址,讓被探測函數在執行之后能夠跳轉到kretprobe設計的函數 kretprobe_trampoline中去。
pre_handler_kretprobe函數返回后,kprobe流程接著執行singlestep流程并返回到正常的執行流程,被探測函數(do_fork)繼續執行,直到它執行完畢并返回。
由于返回地址被替換為kretprobe_trampoline,所以跳轉到kretprobe_trampoline執行,該函數架構相關且由嵌入匯編實現。
該函數會獲取被探測函數的寄存器信息并調用用戶定義的回調函數輸出其中的返回值,最后函數返回正常的執行流程。
static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
unsigned long retval = regs_return_value(regs);
......
}
static struct kretprobe my_kretprobe = {
.handler = ret_handler,
};
(1)
kretprobe_trampoline
-->trampoline_handler
kretprobe_trampoline
(2) kretprobe_trampoline
// arch/x86/kernel/kprobes/core.c
/*
* When a retprobed function returns, this code saves registers and
* calls trampoline_handler() runs, which calls the kretprobe's handler.
* trampoline_handler() returns the original return address; it is
* written back into the saved frame so that the final "ret" resumes the
* real caller.
*/
asm(
".global kretprobe_trampoline\n"
".type kretprobe_trampoline, @function\n"
"kretprobe_trampoline:\n"
#ifdef CONFIG_X86_64
/* We don't bother saving the ss register */
" pushq %rsp\n"
" pushfq\n"
SAVE_REGS_STRING
/* Pass the current stack pointer as the struct pt_regs * argument. */
" movq %rsp, %rdi\n"
" call trampoline_handler\n"
/* Replace saved sp with true return address. */
" movq %rax, 152(%rsp)\n"
RESTORE_REGS_STRING
" popfq\n"
#else
" pushf\n"
SAVE_REGS_STRING
" movl %esp, %eax\n"
" call trampoline_handler\n"
/* Move flags to cs */
" movl 56(%esp), %edx\n"
" movl %edx, 52(%esp)\n"
/* Replace saved flags with true return address. */
" movl %eax, 56(%esp)\n"
RESTORE_REGS_STRING
" popf\n"
#endif
" ret\n"
".size kretprobe_trampoline, .-kretprobe_trampoline\n"
);
NOKPROBE_SYMBOL(kretprobe_trampoline);
STACK_FRAME_NON_STANDARD(kretprobe_trampoline);
(3) trampoline_handler
// arch/x86/kernel/kprobes/core.c
/*
* Called from kretprobe_trampoline
*
* Finds the current task's kretprobe instances, runs each one's user
* handler, recycles the instances, and returns the original return
* address so the trampoline can resume the real caller.
*/
__visible __used void *trampoline_handler(struct pt_regs *regs)
{
struct kretprobe_instance *ri = NULL;
struct hlist_head *head, empty_rp;
struct hlist_node *tmp;
unsigned long flags, orig_ret_address = 0;
unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
kprobe_opcode_t *correct_ret_addr = NULL;
/* empty_rp collects instances that are freed after the hash lock is dropped. */
INIT_HLIST_HEAD(&empty_rp);
kretprobe_hash_lock(current, &head, &flags);
/* fixup registers */
#ifdef CONFIG_X86_64
regs->cs = __KERNEL_CS;
#else
regs->cs = __KERNEL_CS | get_kernel_rpl();
regs->gs = 0;
#endif
regs->ip = trampoline_address;
regs->orig_ax = ~0UL;
/*
* It is possible to have multiple instances associated with a given
* task either because multiple functions in the call path have
* return probes installed on them, and/or more than one
* return probe was registered for a target function.
*
* We can handle this because:
* - instances are always pushed into the head of the list
* - when multiple return probes are registered for the same
* function, the (chronologically) first instance's ret_addr
* will be the real return address, and all the rest will
* point to kretprobe_trampoline.
*/
/* First pass: locate the real return address without running any handler. */
hlist_for_each_entry_safe(ri, tmp, head, hlist) {
if (ri->task != current)
/* another task is sharing our hash bucket */
continue;
orig_ret_address = (unsigned long)ri->ret_addr;
if (orig_ret_address != trampoline_address)
/*
* This is the real return address. Any other
* instances associated with this task are for
* other calls deeper on the call stack
*/
break;
}
kretprobe_assert(ri, orig_ret_address, trampoline_address);
correct_ret_addr = ri->ret_addr;
/* Second pass: run the user handlers and recycle the instances. */
hlist_for_each_entry_safe(ri, tmp, head, hlist) {
if (ri->task != current)
/* another task is sharing our hash bucket */
continue;
orig_ret_address = (unsigned long)ri->ret_addr;
if (ri->rp && ri->rp->handler) {
__this_cpu_write(current_kprobe, &ri->rp->kp);
get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
/* Every handler sees the real return address in ri->ret_addr. */
ri->ret_addr = correct_ret_addr;
ri->rp->handler(ri, regs);
__this_cpu_write(current_kprobe, NULL);
}
recycle_rp_inst(ri, &empty_rp);
if (orig_ret_address != trampoline_address)
/*
* This is the real return address. Any other
* instances associated with this task are for
* other calls deeper on the call stack
*/
break;
}
kretprobe_hash_unlock(current, &flags);
/* Free the instances that recycle_rp_inst() placed on empty_rp. */
hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
hlist_del(&ri->hlist);
kfree(ri);
}
return (void *)orig_ret_address;
}
NOKPROBE_SYMBOL(trampoline_handler);
(4) ri->rp->handler(ri, regs)表示執行用戶自定義的回調函數handler(用來獲取被探測函數的返回值,例如_do_fork),handler回調函數執行完畢以后,調用recycle_rp_inst函數將當前的kretprobe_instance實例從kretprobe_inst_table哈希表釋放,重新鏈入free_instances中,以備后面kretprobe觸發時使用,另外如果kretprobe已經被注銷則將它添加到銷毀表中待銷毀。
ri->rp->handler(ri, regs);
->recycle_rp_inst(ri, &empty_rp);
12
/*
 * Take a kretprobe instance whose return handler has run off the instance
 * table and either hand it back to its kretprobe for reuse or, when the
 * kretprobe is gone, queue it on @head.
 *
 * @ri:   instance to recycle
 * @head: caller-owned list that collects instances with no owning kretprobe
 *        (the trampoline handler kfree()s everything left on it)
 */
void recycle_rp_inst(struct kretprobe_instance *ri,
		     struct hlist_head *head)
{
	struct kretprobe *owner = ri->rp;

	/* Unlink from the instance table and reset the node for re-insertion. */
	hlist_del(&ri->hlist);
	INIT_HLIST_NODE(&ri->hlist);

	if (unlikely(!owner)) {
		/* Unregistering: no free list to return to, defer to the caller. */
		hlist_add_head(&ri->hlist, head);
		return;
	}

	/* Normal case: put the instance back on the owner's free list. */
	raw_spin_lock(&owner->lock);
	hlist_add_head(&ri->hlist, &owner->free_instances);
	raw_spin_unlock(&owner->lock);
}
NOKPROBE_SYMBOL(recycle_rp_inst);
(5) trampoline_handler函數執行完後,返回被探測函數的原始返回地址,執行流程再次回到kretprobe_trampoline函數中,將保存的 sp 替換為真實的返回地址:從rax寄存器中取出原始的返回地址,然後恢復原始函數調用的棧空間,最後跳轉到原始返回地址執行。至此函數調用的流程就回歸正常流程了,整個kretprobe探測結束。
/* Replace saved sp with true return address. */
" movq %rax, 152(%rsp)\n"
RESTORE_REGS_STRING
" popfq\n"
1234
(1)
kretprobe_trampoline
-->trampoline_probe_handler
kretprobe_trampoline
(2) kretprobe_trampoline
// arch/arm64/kernel/probes/kprobes_trampoline.S
/*
 * Return trampoline for kretprobes: calls trampoline_probe_handler() and then
 * returns to the address that handler hands back in x0.
 */
ENTRY(kretprobe_trampoline)
/* Reserve S_FRAME_SIZE bytes of stack (presumably one pt_regs frame - confirm). */
sub sp, sp, #S_FRAME_SIZE
/* NOTE(review): macro body not visible here; presumably stores the registers into the new frame. */
save_all_base_regs
/* x0 = sp: the frame is passed as the regs argument of the handler. */
mov x0, sp
bl trampoline_probe_handler
/*
* Replace trampoline address in lr with actual orig_ret_addr return
* address.
*/
mov lr, x0
restore_all_base_regs
/* Release the frame reserved on entry, then return through the updated lr. */
add sp, sp, #S_FRAME_SIZE
ret
ENDPROC(kretprobe_trampoline)
(3) trampoline_probe_handler
// arch/arm64/kernel/probes/kprobes.c
/*
 * Called from kretprobe_trampoline with the saved register frame.
 *
 * Walks the current task's kretprobe instances in the hash bucket, runs the
 * registered return handler of each instance belonging to this return, and
 * recycles the instances.
 *
 * @regs: register frame passed in by the trampoline
 *
 * Returns the original return address of the probed function, which the
 * trampoline then jumps back to.
 */
void __kprobes __used *trampoline_probe_handler(struct pt_regs *regs)
{
struct kretprobe_instance *ri = NULL;
struct hlist_head *head, empty_rp;
struct hlist_node *tmp;
unsigned long flags, orig_ret_address = 0;
unsigned long trampoline_address =
(unsigned long)&kretprobe_trampoline;
kprobe_opcode_t *correct_ret_addr = NULL;
/* empty_rp collects instances that recycle_rp_inst() could not return to a kretprobe. */
INIT_HLIST_HEAD(&empty_rp);
/* Fetches the bucket head for current; presumably also takes the bucket lock (released below). */
kretprobe_hash_lock(current, &head, &flags);
/*
* It is possible to have multiple instances associated with a given
* task either because multiple functions in the call path have
* return probes installed on them, and/or more than one
* return probe was registered for a target function.
*
* We can handle this because:
* - instances are always pushed into the head of the list
* - when multiple return probes are registered for the same
* function, the (chronologically) first instance's ret_addr
* will be the real return address, and all the rest will
* point to kretprobe_trampoline.
*/
/* Pass 1: scan only, to find the first instance holding a real return address. */
hlist_for_each_entry_safe(ri, tmp, head, hlist) {
if (ri->task != current)
/* another task is sharing our hash bucket */
continue;
orig_ret_address = (unsigned long)ri->ret_addr;
if (orig_ret_address != trampoline_address)
/*
* This is the real return address. Any other
* instances associated with this task are for
* other calls deeper on the call stack
*/
break;
}
kretprobe_assert(ri, orig_ret_address, trampoline_address);
/* Remember the real return address so every handler in pass 2 sees it. */
correct_ret_addr = ri->ret_addr;
/* Pass 2: run the handlers and recycle instances up to and including that one. */
hlist_for_each_entry_safe(ri, tmp, head, hlist) {
if (ri->task != current)
/* another task is sharing our hash bucket */
continue;
/* Save the stored address first: ret_addr is overwritten just below. */
orig_ret_address = (unsigned long)ri->ret_addr;
if (ri->rp && ri->rp->handler) {
__this_cpu_write(current_kprobe, &ri->rp->kp);
get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
/* Expose the real return address instead of the trampoline address. */
ri->ret_addr = correct_ret_addr;
ri->rp->handler(ri, regs);
__this_cpu_write(current_kprobe, NULL);
}
recycle_rp_inst(ri, &empty_rp);
if (orig_ret_address != trampoline_address)
/*
* This is the real return address. Any other
* instances associated with this task are for
* other calls deeper on the call stack
*/
break;
}
kretprobe_hash_unlock(current, &flags);
/* Free the instances that were parked on empty_rp, after the unlock. */
hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
hlist_del(&ri->hlist);
kfree(ri);
}
return (void *)orig_ret_address;
}
(4) 將 lr寄存器中的trampoline地址替換為實際的 orig_ret_addr 返回地址:從x0寄存器中取出原始的返回地址(即trampoline_probe_handler的返回值),然後恢復原始函數調用的棧空間,最後跳轉到原始返回地址執行。至此函數調用的流程就回歸正常流程了,整個kretprobe探測結束。
/*
* Replace trampoline address in lr with actual orig_ret_addr return
* address.
*/
mov lr, x0
restore_all_base_regs
add sp, sp, #S_FRAME_SIZE
ret
insmod kprobe_example.ko
vim testfile
rmmod kprobe_example.ko
dmesg
成功打印出函數(shù)的執(zhí)行時間
[ 1056.875938] do_sys_open returned -2 and took 10500 ns to execute
[ 1057.567400] do_sys_open returned 34 and took 59208 ns to execute
[ 1058.382932] do_sys_open returned 3 and took 31469101 ns to execute
[ 1058.567046] do_sys_open returned 34 and took 61250 ns to execute
[ 1058.975879] do_sys_open returned 3 and took 224084 ns to execute
[ 1058.975935] do_sys_open returned 3 and took 16917 ns to execute
[ 1058.976041] do_sys_open returned 3 and took 13417 ns to execute
[ 1058.976148] do_sys_open returned 3 and took 15167 ns to execute
[ 1058.976254] do_sys_open returned 3 and took 15750 ns to execute
[ 1058.976356] do_sys_open returned 3 and took 16042 ns to execute
[ 1058.978036] do_sys_open returned -2 and took 23041 ns to execute
[ 1058.978074] do_sys_open returned 3 and took 24500 ns to execute
[ 1058.978175] do_sys_open returned -2 and took 9334 ns to execute
[ 1058.978211] do_sys_open returned 3 and took 23333 ns to execute
[ 1058.978246] do_sys_open returned 3 and took 13417 ns to execute
[ 1058.978286] do_sys_open returned 3 and took 14583 ns to execute
[ 1058.989701] kretprobe at ffffff80081ed6c8 unregistered
[ 1058.989709] Missed probing 0 instances of do_sys_open
這些事件類似于基于tracepoint的事件。與Tracepoint不同,它是基于kprobes(kprobe和kretprobe)的。所以它可以探測任何kprobes可以探測的地方。與基于Tracepoint的事件不同的是,它可以動態(tài)地添加和刪除。
要啟用這個功能,在編譯內(nèi)核時CONFIG_KPROBE_EVENTS=y
與 Event Tracing類似,這不需要通過current_tracer來激活??梢酝ㄟ^/sys/kernel/debug/tracing/kprobe_events添加探測點,并通過/sys/kernel/debug/tracing/events/kprobes/<EVENT>/enable來啟用它。
你也可以使用/sys/kernel/debug/tracing/dynamic_events,而不是kprobe_events。該接口也將提供對其他動態(tài)事件的統(tǒng)一訪問。
kprobe和內核的ftrace結合使用,需要對內核進行配置,然後添加探測點、進行探測、查看結果。
CONFIG_KPROBES=y
CONFIG_OPTPROBES=y
CONFIG_KPROBES_ON_FTRACE=y
CONFIG_UPROBES=y
CONFIG_KRETPROBES=y
CONFIG_HAVE_KPROBES=y
CONFIG_HAVE_KRETPROBES=y
CONFIG_HAVE_OPTPROBES=y
CONFIG_HAVE_KPROBES_ON_FTRACE=y
CONFIG_KPROBE_EVENT=y
kprobe事件相關(guān)的節(jié)點有如下:
/sys/kernel/debug/tracing/kprobe_events-----------------------配置kprobe事件屬性,增加事件之后會在kprobes下面生成對應(yīng)目錄。
/sys/kernel/debug/tracing/kprobe_profile----------------------kprobe事件統(tǒng)計屬性文件。
/sys/kernel/debug/tracing/events/<GRP>/<EVENT>/enable--------使能kprobe事件(<GRP>缺省為kprobes)
/sys/kernel/debug/tracing/events/<GRP>/<EVENT>/filter--------過濾kprobe事件
/sys/kernel/debug/tracing/events/<GRP>/<EVENT>/format--------查詢kprobe事件顯示格式
新增一個kprobe事件,通過寫kprobe_events來設(shè)置。
p[:[GRP/]EVENT] [MOD:]SYM[+offs]|MEMADDR [FETCHARGS]-------------------設(shè)置一個probe探測點
r[:[GRP/]EVENT] [MOD:]SYM[+0] [FETCHARGS]------------------------------設(shè)置一個return probe探測點
-:[GRP/]EVENT----------------------------------------------------------刪除一個探測點
細(xì)節(jié)解釋如下:
GRP : Group name. If omitted, use "kprobes" for it.------------設置後會在events下創建<GRP>目錄(缺省為events/kprobes)。
EVENT : Event name. If omitted, the event name is generated based on SYM+offs or MEMADDR.---指定後在events/<GRP>下生成<EVENT>目錄。
MOD : Module name which has given SYM.--------------------------模塊名,一般不設
SYM[+offs] : Symbol+offset where the probe is inserted.-------------被探測函數(shù)名和偏移
MEMADDR : Address where the probe is inserted.----------------------指定被探測的內(nèi)存絕對地址
FETCHARGS : Arguments. Each probe can have up to 128 args.----------指定要獲取的參數信息。
%REG : Fetch register REG---------------------------------------獲取指定寄存器值
@ADDR : Fetch memory at ADDR (ADDR should be in kernel)--------獲取指定內存地址的值
@SYM[+|-offs] : Fetch memory at SYM +|- offs (SYM should be a data symbol)---獲取全局變量的值
$stackN : Fetch Nth entry of stack (N >= 0)----------------------------------獲取指定棧空間值,即sp寄存器+N後的位置值
$stack : Fetch stack address.-----------------------------------------------獲取sp寄存器值
$retval : Fetch return value.(*)--------------------------------------------獲取返回值,用於return probe
$comm : Fetch current task comm.----------------------------------------獲取對應進程名稱。
+|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**)-------------獲取FETCHARG +|- offs地址處的內存值
NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types (u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal types
(x8/x16/x32/x64), "string" and bitfield are supported.----------------設(shè)置參數(shù)的類型,可以支持字符串和比特類型
(*) only for return probe.
(**) this is useful for fetching a field of data structures.
執(zhí)行如下兩條命令就會生成目錄/sys/kernel/debug/tracing/events/kprobes/myprobe;第三條命令則可以刪除指定kprobe事件,如果要全部刪除則echo > /sys/kernel/debug/tracing/kprobe_events。
echo 'p:myprobe do_sys_open dfd=%x0 filename=%x1 flags=%x2 mode=+4($stack)' > /sys/kernel/debug/tracing/kprobe_events
echo 'r:myretprobe do_sys_open ret=$retval' >> /sys/kernel/debug/tracing/kprobe_events-----------------------------------------------------這里面一定要用">>",不然就會覆蓋前面的設(shè)置。
echo '-:myprobe' >> /sys/kernel/debug/tracing/kprobe_events
echo '-:myretprobe' >> /sys/kernel/debug/tracing/kprobe_events
參數後面的寄存器是跟架構相關的:在ARM64上,%x0~%x7依次表示第1~8個參數,只有超出8個的參數才需要通過$stack從棧上獲取。do_sys_open只有4個參數,因此第4個參數mode位於%x3;上面命令中的mode=+4($stack)讀到的並不是mode,嚴格來說應寫成mode=%x3。
函數(shù)返回值保存在$retval中
kprobe事件的使能:往對應事件的enable寫1開啟探測;寫0暫停探測。
echo > /sys/kernel/debug/tracing/trace
echo 'p:myprobe do_sys_open dfd=%x0 filename=%x1 flags=%x2 mode=+4($stack)' > /sys/kernel/debug/tracing/kprobe_events
echo 'r:myretprobe do_sys_open ret=$retval' >> /sys/kernel/debug/tracing/kprobe_events
echo 1 > /sys/kernel/debug/tracing/events/kprobes/myprobe/enable
echo 1 > /sys/kernel/debug/tracing/events/kprobes/myretprobe/enable
ls
echo 0 > /sys/kernel/debug/tracing/events/kprobes/myprobe/enable
echo 0 > /sys/kernel/debug/tracing/events/kprobes/myretprobe/enable
cat /sys/kernel/debug/tracing/trace
然后在/sys/kernel/debug/tracing/trace中可以看到結(jié)果。
ARM32,ARM64,X86寄存器及訪問方式
"r0", pt_regs->r0
"r1", pt_regs->r1
"r2", pt_regs->r2
"r3", pt_regs->r3
"r4", pt_regs->r4
"r5", pt_regs->r5
"r6", pt_regs->r6
"r7", pt_regs->r7
"r8", pt_regs->r8
"r9", pt_regs->r9
"r10",pt_regs->r10
"fp", pt_regs->fp
"ip", pt_regs->ip
"sp", pt_regs->sp
"lr", pt_regs->lr
"pc", pt_regs->pc
"x0", pt_regs->regs[0]
"x1", pt_regs->regs[1]
"x2", pt_regs->regs[2]
"x3", pt_regs->regs[3]
"x4", pt_regs->regs[4]
"x5", pt_regs->regs[5]
"x6", pt_regs->regs[6]
"x7", pt_regs->regs[7]
"x8", pt_regs->regs[8]
"x9", pt_regs->regs[9]
"x10", pt_regs->regs[10]
"x11", pt_regs->regs[11]
"x12", pt_regs->regs[12]
"x13", pt_regs->regs[13]
"x14", pt_regs->regs[14]
"x15", pt_regs->regs[15]
"x16", pt_regs->regs[16]
"x17", pt_regs->regs[17]
"x18", pt_regs->regs[18]
"x19", pt_regs->regs[19]
"x20", pt_regs->regs[20]
"x21", pt_regs->regs[21]
"x22", pt_regs->regs[22]
"x23", pt_regs->regs[23]
"x24", pt_regs->regs[24]
"x25", pt_regs->regs[25]
"x26", pt_regs->regs[26]
"x27", pt_regs->regs[27]
"x28", pt_regs->regs[28]
"x29", pt_regs->regs[29]
"x30", pt_regs->regs[30]
"sp", pt_regs->sp
"pc", pt_regs->pc
"pstate",pt_regs->pstate
rax pt_regs->ax
rcx pt_regs->cx
rdx pt_regs->dx
rbx pt_regs->bx
rsp pt_regs->sp
rbp pt_regs->bp
rdi pt_regs->di
rsi pt_regs->si
r8 pt_regs->r8
r9 pt_regs->r9
r10 pt_regs->r10
r11 pt_regs->r11
r12 pt_regs->r12
r13 pt_regs->r13
r14 pt_regs->r14
r15 pt_regs->r15
https://blog.csdn.net/jakelylll/article/details/123667320
https://www.cnblogs.com/LiuYanYGZ/p/12643846.html
https://blog.csdn.net/weixin_45030965/article/details/125922528
https://www.cnblogs.com/LiuYanYGZ/p/12643846.html
https://blog.csdn.net/jasonactions/article/details/121065795
https://blog.csdn.net/mrpre/article/details/106801888
https://blog.csdn.net/u011622208/article/details/115535291
kprobe https://blog.csdn.net/WANGYONGZIXUE/article/details/127525367
https://www.kernel.org/doc/html/latest/trace/kprobetrace.html#kprobe-based-event-tracing
https://www.cnblogs.com/arnoldlu/p/9752061.html
上一節介紹了kprobe的基本概念,下面我們將使用幾個具體的例子,看下kprobe在實際使用中有哪些應用場景。
內(nèi)核的samples/kprobe目錄下有kprobe相關(guān)的例子,我們以這些例子為基礎(chǔ),簡單修改下。
我們所有的例子都是探測do_sys_open() 或者_do_fork(),以下是內(nèi)核中的源碼。
struct audit_names;

/* Pathname record produced by getname() from a user-space path string. */
struct filename {
	const char		*name;	/* pointer to the actual string */
	const __user char	*uptr;	/* original userland pointer */
	struct audit_names	*aname;
	int			refcnt;
	const char		iname[];
};
/*
 * Open the file named by the user-space string @filename relative to @dfd.
 *
 * Returns the newly installed file descriptor on success, or a negative
 * error taken from build_open_flags(), getname(), get_unused_fd_flags()
 * or do_filp_open().
 */
long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
	struct open_flags op;
	struct filename *name;
	struct file *filp;
	int fd;

	fd = build_open_flags(flags, mode, &op);
	if (fd != 0)
		return fd;

	name = getname(filename);
	if (IS_ERR(name))
		return PTR_ERR(name);

	fd = get_unused_fd_flags(flags);
	if (fd < 0)
		goto out;

	filp = do_filp_open(dfd, name, &op);
	if (!IS_ERR(filp)) {
		fsnotify_open(filp);
		fd_install(fd, filp);
	} else {
		/* Opening failed: give the reserved descriptor back. */
		put_unused_fd(fd);
		fd = PTR_ERR(filp);
	}

out:
	putname(name);
	return fd;
}
/*
 * Create a new task with copy_process() and wake it up.
 *
 * Returns nr, the value of pid_vnr() for the new task's pid, or the error
 * encoded in the pointer returned by copy_process().
 *
 * parent_tidptr is not handed to copy_process(); nr is written to it with
 * put_user() here when CLONE_PARENT_SETTID is set.
 */
long _do_fork(unsigned long clone_flags,
unsigned long stack_start,
unsigned long stack_size,
int __user *parent_tidptr,
int __user *child_tidptr,
unsigned long tls)
{
struct task_struct *p;
int trace = 0;
long nr;
/*
* Determine whether and which event to report to ptracer. When
* called from kernel_thread or CLONE_UNTRACED is explicitly
* requested, no event is reported; otherwise, report if the event
* for the type of forking is enabled.
*/
if (!(clone_flags & CLONE_UNTRACED)) {
if (clone_flags & CLONE_VFORK)
trace = PTRACE_EVENT_VFORK;
else if ((clone_flags & CSIGNAL) != SIGCHLD)
trace = PTRACE_EVENT_CLONE;
else
trace = PTRACE_EVENT_FORK;
if (likely(!ptrace_event_enabled(current, trace)))
trace = 0;
}
p = copy_process(clone_flags, stack_start, stack_size,
child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
/*
* Do this prior waking up the new thread - the thread pointer
* might get invalid after that point, if the thread exits quickly.
*/
if (!IS_ERR(p)) {
/* The completion lives on this stack frame; it is waited on below for CLONE_VFORK. */
struct completion vfork;
struct pid *pid;
cpufreq_task_times_alloc(p);
trace_sched_process_fork(current, p);
/* Reference to the pid, dropped by put_pid() at the end of this branch. */
pid = get_task_pid(p, PIDTYPE_PID);
nr = pid_vnr(pid);
if (clone_flags & CLONE_PARENT_SETTID)
put_user(nr, parent_tidptr);
if (clone_flags & CLONE_VFORK) {
p->vfork_done = &vfork;
init_completion(&vfork);
get_task_struct(p);
}
wake_up_new_task(p);
/* forking complete and child started to run, tell ptracer */
if (unlikely(trace))
ptrace_event_pid(trace, pid);
if (clone_flags & CLONE_VFORK) {
if (!wait_for_vfork_done(p, &vfork))
ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
}
put_pid(pid);
} else {
/* copy_process() failed: return its encoded error. */
nr = PTR_ERR(p);
}
return nr;
}
實際調(diào)試中經(jīng)常需要調(diào)查函數(shù)使用的變量的值。要在kprobes的偵測器內(nèi)顯示某個函數(shù)的局部變量的值,需要一些技巧,原因是在printk的參數(shù)中無法直接指定變量名,因此必須給偵測器函數(shù)提供一個pt_regs結(jié)構(gòu),其中保存了指定地址的命令執(zhí)行時的寄存器信息。
當(dāng)然,不同架構(gòu)下該結(jié)構(gòu)的成員變量不盡相同,但用該結(jié)構(gòu)可以顯示變量等更為詳細(xì)的信息。
ARM64,ARM32,X86的寄存器及其訪問方式可以看文末的目錄
/*
* NOTE: This example is works on x86 and powerpc.
* Here's a sample kernel module showing the use of kprobes to dump a
* stack trace and selected registers when _do_fork() is called.
*
* For more information on theory of operation of kprobes, see
* Documentation/kprobes.txt
*
* You will see the trace data in /var/log/messages and on the console
* whenever _do_fork() is invoked to create a new process.
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>
/* Name of the kernel function this module probes. */
#define TRACE_SYMBOL "do_filp_open"

/*
 * Every probe point needs its own struct kprobe; the handlers are filled
 * in by kprobe_init() before registration.
 */
static struct kprobe kp = {
	.symbol_name	= TRACE_SYMBOL,
};
/*
 * Argument registers: x86_64 passes integer arguments in rdi, rsi, rdx,
 * rcx, r8, r9 (in that order); aarch64 passes them in x0-x7.
 */
/*
 * kprobe pre_handler: called just before the probed instruction is executed.
 *
 * Reads the first two arguments of the probed function (dfd and the
 * struct filename pointer) from the saved registers and logs them when the
 * file being opened is named "testfile". Always returns 0.
 */
static int handler_pre(struct kprobe *p, struct pt_regs *regs)
{
	struct filename *path = NULL;
	int dirfd = -1;

#if defined(CONFIG_X86)
	dirfd = regs->di;
	path = (struct filename *) regs->si;
#endif
#if defined(CONFIG_ARM64)
	dirfd = regs->regs[0];
	path = (struct filename *) regs->regs[1];
#endif

	/* Nothing to report unless we got a filename and it is the one we want. */
	if (!path || strcmp(path->name, "testfile") != 0)
		return 0;

	printk(KERN_INFO "handler_pre:%s: dfd=%d, name=%s\n", p->symbol_name, dirfd, path->name);
	return 0;
}
/*
 * kprobe post_handler: called after the probed instruction is executed.
 * Intentionally does nothing so the log only carries pre_handler output
 * (a "handler_post" printk used to live here).
 */
static void handler_post(struct kprobe *p, struct pt_regs *regs,
			 unsigned long flags)
{
}
/*
 * fault_handler: invoked when an exception is raised by an instruction in
 * the pre- or post-handler, or while Kprobes single-steps the probed
 * instruction.
 *
 * Logging is disabled in this variant to keep the output quiet.
 */
static int handler_fault(struct kprobe *p, struct pt_regs *regs, int trapnr)
{
	/* We do not handle the fault ourselves, hence 0. */
	return 0;
}
/*
 * Module entry point: attach the three handlers to kp and register the
 * probe. Returns 0 on success or the negative value register_kprobe()
 * reported.
 */
static int __init kprobe_init(void)
{
	int err;

	kp.pre_handler = handler_pre;
	kp.post_handler = handler_post;
	kp.fault_handler = handler_fault;

	err = register_kprobe(&kp);
	if (err >= 0) {
		printk(KERN_INFO "Planted kprobe at %p\n", kp.addr);
		return 0;
	}

	printk(KERN_INFO "register_kprobe failed, returned %d\n", err);
	return err;
}
/* Module exit point: remove the probe, then log the address it was planted at. */
static void __exit kprobe_exit(void)
{
	unregister_kprobe(&kp);

	printk(KERN_INFO "kprobe at %p unregistered\n", kp.addr);
}
module_init(kprobe_init)
module_exit(kprobe_exit)
MODULE_LICENSE("GPL");
我們以內(nèi)核目錄下的例程做一個簡單修改,探測do_filp_open函數(shù),當(dāng)打開testfile文件時,自動打印出文件的路徑。
為了減少無效信息的打印,我們將handler_post,handler_fault直接注釋掉。
當(dāng)探測點do_filp_open命中時,Kprobes調(diào)用handler_pre。在handler_pre根據(jù)struct filename *pathname來獲得文件的名字。
在x86_64架構(gòu)中,函數(shù)的參數(shù)從左到右分別保存在rdi、rsi、rdx、rcx、r8、r9中,因此查看rdi和rsi就能得到第1個、第2個參數(shù)的值。
同理,在ARM64架構(gòu)中, 函數(shù)的參數(shù)1~參數(shù)8分別保存到 X0~X7 寄存器中 ,剩下的參數(shù)從右往左依次入棧。因此,X0和X1分別存放dfd, pathname的值。
# Out-of-tree build of the kprobe/jprobe/kretprobe sample modules for arm64.
CROSS_COMPILE := aarch64-linux-gnu-
ARCH := arm64
CC := $(CROSS_COMPILE)gcc
LD := $(CROSS_COMPILE)ld
PWD := $(shell pwd)
obj-m := kprobe_example.o jprobe_example.o kretprobe_example.o
KERNELDIR := /home/zhongyi/code/rk3399_linux_release_v2.5.1_20210301/kernel

.PHONY: all clean

# Variables assigned in this file are not exported to the sub-make, so the
# toolchain prefix has to be passed on the command line just like ARCH.
# $(MAKE) (rather than a literal "make") keeps -j/-n and friends working.
all:
	$(MAKE) -C $(KERNELDIR) M=$(PWD) modules ARCH=$(ARCH) CROSS_COMPILE=$(CROSS_COMPILE)

clean:
	rm -f *.o
	rm -f *.symvers
	rm -f *.order
	rm -f *.ko
	rm -f *.mod.c
執(zhí)行make編譯后,在開發(fā)板上將驅(qū)動加載后,手動打開testfile文件。
insmod kprobe_example.ko
vim testfile
rmmod kprobe_example.ko
dmesg
使用dmesg可以看到成功輸出文件名和dfd。
[ 307.572314] Planted kprobe at ffffff80081fdf84
[ 311.997767] handler_pre:do_filp_open: dfd=-100, name=testfile
[ 312.034774] handler_pre:do_filp_open: dfd=-100, name=testfile
[ 347.969572] kprobe at ffffff80081fdf84 unregistered
使用kprobes的另一個有效的調(diào)試方法,就是顯示棧跟蹤。
我們只需要在handler_pre中調(diào)用dump_stack();即可。
/*
 * Argument registers: x86_64 passes integer arguments in rdi, rsi, rdx,
 * rcx, r8, r9 (in that order); aarch64 passes them in x0-x7.
 */
/*
 * kprobe pre_handler: called just before the probed instruction is executed.
 * This variant only dumps the current stack backtrace. Always returns 0.
 */
static int handler_pre(struct kprobe *p, struct pt_regs *regs)
{
	dump_stack();

	return 0;
}
編譯加載
insmod kprobe_example.ko
rmmod kprobe_example.ko
dmesg
成功打印出棧的信息。
[ 451.620803] CPU: 4 PID: 1299 Comm: rmmod Tainted: G O 4.4.194+ #18
[ 451.620809] Hardware name: Firefly-RK3399 Board (Linux Opensource) (DT)
[ 451.620813] Call trace:
[ 451.620820] [<ffffff8008088410>] dump_backtrace+0x0/0x220
[ 451.620828] [<ffffff8008088654>] show_stack+0x24/0x30
[ 451.620834] [<ffffff80084f842c>] dump_stack+0x94/0xbc
[ 451.620842] [<ffffff8000f22048>] handler_pre+0x14/0x24 [kprobe_example]
[ 451.620848] [<ffffff8008efd824>] kprobe_breakpoint_handler+0x100/0x14c
[ 451.620855] [<ffffff8008084128>] brk_handler+0x54/0x80
[ 451.620860] [<ffffff8008080b0c>] do_debug_exception+0x58/0xc0
[ 451.620866] Exception stack(0xffffffc0f2ef7c40 to 0xffffffc0f2ef7d70)
[ 451.620879] 7c40: ffffffc0ef782000 0000008000000000 ffffffc0f2ef7e20 ffffff80081fdf84
[ 451.620886] 7c60: 0000000060000145 ffffff8008efc228 ffffffc0ceff2a50 ffffffc0ee7d2988
[ 451.620892] 7c80: ffffffc0f2ef7ca0 ffffff80081c0dc8 ffffffc0f0582e70 00e80000e95f3f53
[ 451.620898] 7ca0: ffffffc0f2ef7d70 ffffff8008efe3e8 ffffffc0f2ef7ec0 0000005583d31928
[ 451.620905] 7cc0: 0000000000000055 0000000092000047 ffffffc0ceec5100 ffffffc0dccbd500
[ 451.620911] 7ce0: 0000000000000024 ffffffc0dccbd580 00000000ffffff9c ffffffc0ef782000
[ 451.620917] 7d00: ffffffc0f2ef7e78 0000000000000000 0000000000000000 0000000000000003
[ 451.620923] 7d20: ffffffc0dcfc9a80 0000007fd94380e8 0000000000000000 fefefefefefefeff
[ 451.620929] 7d40: 0000000000000001 0000007fd9437db8 0000000000000000 0000000000000000
[ 451.620934] 7d60: 0000000000000000 000000007fffffde
[ 451.620940] [<ffffff8008082668>] el1_dbg+0x18/0x7c
[ 451.620947] [<ffffff80081ed9a4>] SyS_openat+0x3c/0x4c
[ 451.620953] [<ffffff8008082f70>] el0_svc_naked+0x24/0x28
[ 451.630032] kprobe at ffffff80081fdf84 unregistered
kprobes擁有更加強(qiáng)大的功能,那就是它能在內(nèi)核的任意地址插入偵測器。此外,偵測器可以在任意地址的指令執(zhí)行之前或之后執(zhí)行,或者前后都執(zhí)行。
因此,應(yīng)當(dāng)觀察匯編代碼,找到源代碼中想要調(diào)查的位置對應(yīng)于編譯后的二進(jìn)制文件中的什么地址,并調(diào)查希望顯示的變量保存在哪個寄存器、哪個內(nèi)存地址。
通常,我們希望查看函數執行過程中的變量,即打印一些流程中的東西,而不僅僅是探測函數本身被調用。此時不能簡單地只把 kprobe->symbol_name 設置為函數名,還需要指定探測點在函數內的偏移。假設我們期望獲取 _do_fork函數變量 nr 的值:
將vmlinux進(jìn)行反匯編,找出_do_fork的地址。
aarch64-linux-gnu-objdump -s -d vmlinux > vmlinux.asm
_do_fork 反匯編如下所示,地址為ffffff80080ba83c。
ffffff80080ba83c <_do_fork>:
ffffff80080ba83c: a9b97bfd stp x29, x30, [sp, #-112]!
ffffff80080ba840: 910003fd mov x29, sp
ffffff80080ba844: a90153f3 stp x19, x20, [sp, #16]
ffffff80080ba848: a9025bf5 stp x21, x22, [sp, #32]
ffffff80080ba84c: a90363f7 stp x23, x24, [sp, #48]
ffffff80080ba850: aa0003f5 mov x21, x0
ffffff80080ba854: aa0103f3 mov x19, x1
ffffff80080ba858: aa0203f6 mov x22, x2
ffffff80080ba85c: aa0303f7 mov x23, x3
ffffff80080ba860: aa0403f8 mov x24, x4
ffffff80080ba864: aa1e03e0 mov x0, x30
ffffff80080ba868: 97ff4e8a bl ffffff800808e290 <_mcount>
ffffff80080ba86c: 37b814f5 tbnz w21, #23, ffffff80080bab08 <_do_fork+0x2cc>
ffffff80080ba870: 37701495 tbnz w21, #14, ffffff80080bab00 <_do_fork+0x2c4>
ffffff80080ba874: 92401ea0 and x0, x21, #0xff
ffffff80080ba878: 52800074 mov w20, #0x3 // #3
ffffff80080ba87c: f100441f cmp x0, #0x11
ffffff80080ba880: 1a9f1694 csinc w20, w20, wzr, ne // ne = any
ffffff80080ba884: 11000e81 add w1, w20, #0x3
............................
ffffff80080ba91c: b5000fb6 cbnz x22, ffffff80080bab10 <_do_fork+0x2d4>
ffffff80080ba920: 52800001 mov w1, #0x0 // #0
ffffff80080ba924: aa1303e0 mov x0, x19
ffffff80080ba928: 94006a17 bl ffffff80080d5184 <get_task_pid>
ffffff80080ba92c: aa0003f6 mov x22, x0
ffffff80080ba930: 94006a85 bl ffffff80080d5344 <pid_vnr>
ffffff80080ba934: 93407c18 sxtw x24, w0
ffffff80080ba938: 36a00195 tbz w21, #20, ffffff80080ba968 <_do_fork+0x12c>
ffffff80080ba93c: d5384101 mrs x1, sp_el0
ffffff80080ba940: f9400422 ldr x2, [x1, #8]
ffffff80080ba944: aa1703e1 mov x1, x23
ffffff80080ba948: b1001021 adds x1, x1, #0x4
nr 變量 是 函數(shù)pid_vnr的返回值(也是子進(jìn)程的pid) ,根據(jù)ARM調(diào)用規(guī)范,調(diào)用完成pid_vnr()后,寄存器x0存放的就是其函數(shù)返回值。
參考:ARM64調(diào)用標(biāo)準(zhǔn) https://blog.51cto.com/u_15333820/3452605
通過反匯編可以知道,pid_vnr在 ffffff80080ba930地址處被調(diào)用,因此,偵測器的插入地址就是在ffffff80080ba930之后,并且x0被改變之前。只要符合這兩個條件,放在哪里都無所謂。
因此,我們將kprobe的點設(shè)置為ffffff80080ba934,然后獲取 x0,就能獲取變量nr的值。
.offset 是探測點相對於_do_fork起始地址的偏移,在註冊時指定。我們這裡的 offset = 0xffffff80080ba934 - 0xffffff80080ba83c = 0xF8。
另外,反匯編能力就是多看匯編以及找到幾個關(guān)鍵點(例如常量,跳轉(zhuǎn)語句)就能定位到匯編對應(yīng)的源碼了,這里不再展開了。
/*
* NOTE: This example is works on x86 and powerpc.
* Here's a sample kernel module showing the use of kprobes to dump a
* stack trace and selected registers when _do_fork() is called.
*
* For more information on theory of operation of kprobes, see
* Documentation/kprobes.txt
*
* You will see the trace data in /var/log/messages and on the console
* whenever _do_fork() is invoked to create a new process.
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>
/*
 * Every probe point needs its own struct kprobe. This one sits 0xF8 bytes
 * into _do_fork; the handlers are filled in by kprobe_init().
 */
static struct kprobe kp = {
	.symbol_name	= "_do_fork",
	.offset		= 0xF8,
};
/*
 * kprobe pre_handler: called just before the probed instruction is executed.
 *
 * Logs the probe address, the program counter, the flags/pstate word and the
 * first return/argument register (rax on x86, x0 on arm64) taken from @regs.
 * Always returns 0.
 */
static int handler_pre(struct kprobe *p, struct pt_regs *regs)
{
#if defined(CONFIG_X86)
	printk(KERN_INFO "pre_handler: p->addr = 0x%p, ip = %lx,"
			" flags = 0x%lx,rax = 0x%lx\n",
		p->addr, regs->ip, regs->flags, regs->ax);
#endif
#if defined(CONFIG_ARM64)
	pr_info("<%s> pre_handler: p->addr = 0x%p, pc = 0x%lx,"
			" pstate = 0x%lx,x0 = 0x%lx\n",
		p->symbol_name, p->addr,
		(long)regs->pc, (long)regs->pstate, (long)regs->regs[0]);
#endif

	/* Calling dump_stack() at this point would print a stack backtrace. */
	return 0;
}
/*
 * kprobe post_handler: called after the probed instruction is executed.
 * Logs the probe address together with the flags (x86) or pstate (arm64).
 */
static void handler_post(struct kprobe *p, struct pt_regs *regs,
			 unsigned long flags)
{
#if defined(CONFIG_X86)
	printk(KERN_INFO "post_handler: p->addr = 0x%p, flags = 0x%lx\n",
		p->addr, regs->flags);
#endif
#if defined(CONFIG_ARM64)
	pr_info("<%s> post_handler: p->addr = 0x%p, pstate = 0x%lx\n",
		p->symbol_name, p->addr, (long)regs->pstate);
#endif
}
/*
 * fault_handler: this is called if an exception is generated for any
 * instruction within the pre- or post-handler, or when Kprobes
 * single-steps the probed instruction.
 *
 * @p:      the probe that was being processed
 * @regs:   saved registers (unused here)
 * @trapnr: trap number, logged for diagnosis
 *
 * Returns 0 because the fault is only logged, not handled.
 */
static int handler_fault(struct kprobe *p, struct pt_regs *regs, int trapnr)
{
	/* The message must end in "\n"; it used to end in a literal 'n'. */
	printk(KERN_INFO "fault_handler: p->addr = 0x%p, trap #%d\n",
		p->addr, trapnr);
	/* Return 0 because we don't handle the fault. */
	return 0;
}
/*
 * Module entry point: attach the three handlers to kp and register the
 * probe. Returns 0 on success or the negative value register_kprobe()
 * reported.
 */
static int __init kprobe_init(void)
{
	int err;

	kp.pre_handler = handler_pre;
	kp.post_handler = handler_post;
	kp.fault_handler = handler_fault;

	err = register_kprobe(&kp);
	if (err >= 0) {
		printk(KERN_INFO "Planted kprobe at %p\n", kp.addr);
		return 0;
	}

	printk(KERN_INFO "register_kprobe failed, returned %d\n", err);
	return err;
}
/* Module exit point: remove the probe, then log the address it was planted at. */
static void __exit kprobe_exit(void)
{
	unregister_kprobe(&kp);

	printk(KERN_INFO "kprobe at %p unregistered\n", kp.addr);
}
module_init(kprobe_init)
module_exit(kprobe_exit)
MODULE_LICENSE("GPL");
insmod kprobe_example.ko
rmmod kprobe_example.ko
dmesg
編譯加載后,成功打印出rax的值。
[ 245.080636] pre_handler: p->addr = 0x0000000050a6c3dd, ip = ffffffffa5ca0009, flags = 0x246,rax = 0x2
[ 245.080640] post_handler: p->addr = 0x0000000050a6c3dd, flags = 0x246
[ 245.080936] pre_handler: p->addr = 0x0000000050a6c3dd, ip = ffffffffa5ca0009, flags = 0x246,rax = 0x2
[ 245.080938] post_handler: p->addr = 0x0000000050a6c3dd, flags = 0x246
[ 245.457340] pre_handler: p->addr = 0x0000000050a6c3dd, ip = ffffffffa5ca0009, flags = 0x246,rax = 0x2
[ 245.457345] post_handler: p->addr = 0x0000000050a6c3dd, flags = 0x246
[ 245.457643] pre_handler: p->addr = 0x0000000050a6c3dd, ip = ffffffffa5ca0009, flags = 0x246,rax = 0x2
[ 245.457645] post_handler: p->addr = 0x0000000050a6c3dd, flags = 0x246
[ 245.719208] pre_handler: p->addr = 0x0000000050a6c3dd, ip = ffffffffa5ca0009, flags = 0x246,rax = 0x2
[ 245.719213] post_handler: p->addr = 0x0000000050a6c3dd, flags = 0x246
[ 245.719505] pre_handler: p->addr = 0x0000000050a6c3dd, ip = ffffffffa5ca0009, flags = 0x246,rax = 0x2
[ 245.719507] post_handler: p->addr = 0x0000000050a6c3dd, flags = 0x246
[ 245.820761] pre_handler: p->addr = 0x0000000050a6c3dd, ip = ffffffffa5ca0009, flags = 0x246,rax = 0x2
[ 245.820765] post_handler: p->addr = 0x0000000050a6c3dd, flags = 0x246
[ 245.821061] pre_handler: p->addr = 0x0000000050a6c3dd, ip = ffffffffa5ca0009, flags = 0x246,rax = 0x2
[ 245.821063] post_handler: p->addr = 0x0000000050a6c3dd, flags = 0x246
[ 246.092572] pre_handler: p->addr = 0x0000000050a6c3dd, ip = ffffffffa5ca0009, flags = 0x246,rax = 0x2
[ 246.092577] post_handler: p->addr = 0x0000000050a6c3dd, flags = 0x246
[ 246.095863] pre_handler: p->addr = 0x0000000050a6c3dd, ip = ffffffffa5ca0009, flags = 0x246,rax = 0x2
[ 246.095867] post_handler: p->addr = 0x0000000050a6c3dd, flags = 0x246
[ 246.126196] kprobe at 0000000050a6c3dd unregistered
與kprobes相比,jprobes能更容易地獲取傳給函數(shù)的參數(shù)。有幾點需要注意:
/*
* Here's a sample kernel module showing the use of jprobes to dump
* the arguments of _do_fork().
*
* For more information on theory of operation of jprobes, see
* Documentation/kprobes.txt
*
* Build and insert the kernel module as done in the kprobe example.
* You will see the trace data in /var/log/messages and on the
* console whenever _do_fork() is invoked to create a new process.
* (Some messages may be suppressed if syslogd is configured to
* eliminate duplicate messages.)
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>
/*
* Jumper probe for _do_fork.
* Mirror principle enables access to arguments of the probed routine
* from the probe handler.
*/
/* Proxy routine having the same arguments as actual _do_fork() routine */
#define TRACE_SYMBOL "do_filp_open"
/*與do_filp_open 的參數(shù)完全相同*/
static struct file * jp_do_filp_open(int dfd, struct filename *pathname,
const struct open_flags *op)
{
if (pathname && !(strcmp(pathname->name, "testfile")))
printk(KERN_INFO "jprobe: dfd = %d, pathname = %s\n", dfd, pathname->name);
/* Always end with a call to jprobe_return(). */
jprobe_return();
return 0;
}
/* The jprobe: routes calls of TRACE_SYMBOL through jp_do_filp_open(). */
static struct jprobe my_jprobe = {
	.entry	= jp_do_filp_open,
	.kp	= {
		.symbol_name	= TRACE_SYMBOL,
	},
};
/*
 * Module entry point: register the jprobe.
 *
 * Returns 0 on success. On failure the error reported by register_jprobe()
 * is passed through unchanged, so insmod reports the real cause instead of
 * a blanket -1 (which reads as -EPERM).
 */
static int __init jprobe_init(void)
{
	int ret;

	ret = register_jprobe(&my_jprobe);
	if (ret < 0) {
		printk(KERN_INFO "register_jprobe failed, returned %d\n", ret);
		return ret;
	}
	printk(KERN_INFO "Planted jprobe at %p, handler addr %p\n",
	       my_jprobe.kp.addr, my_jprobe.entry);
	return 0;
}
/* Module exit point: remove the jprobe, then log the address it was planted at. */
static void __exit jprobe_exit(void)
{
	unregister_jprobe(&my_jprobe);

	printk(KERN_INFO "jprobe at %p unregistered\n", my_jprobe.kp.addr);
}
module_init(jprobe_init)
module_exit(jprobe_exit)
MODULE_LICENSE("GPL");
使用kprobes時,必須通過寄存器或棧才能計算出參數(shù)的值。此外,計算方法還依賴于架構(gòu)。
如果使用jprobes,那么無須了解架構(gòu)的詳細(xì)知識,也能簡單地查看參數(shù)的值。
編譯加載驅(qū)動程序
insmod jprobe_example.ko
vim testfile
rmmod jprobe_example.ko
dmesg
成功打印出函數(shù)的參數(shù)
[ 612.670453] jprobe at ffffff80081fdf84 unregistered
[ 867.293765] Planted jprobe at ffffff80081fdf84, handler addr ffffff8000f1a000
[ 871.107502] jprobe: dfd = -100, pathname = testfile
[ 871.147747] jprobe: dfd = -100, pathname = testfile
[ 875.723761] jprobe at ffffff80081fdf84 unregistered
[ 907.706066] Planted jprobe at ffffff80081fdf84, handler addr ffffff8000f22000
[ 911.661891] jprobe: dfd = -100, pathname = testfile
[ 911.694903] jprobe: dfd = -100, pathname = testfile
[ 919.272187] jprobe at ffffff80081fdf84 unregistered
[ 2296.830613] Planted jprobe at ffffff80081fdf84, handler addr ffffff8000f2a000
[ 2302.164861] jprobe: dfd = -100, pathname = testfile
[ 2302.200634] jprobe: dfd = -100, pathname = testfile
[ 2307.407014] jprobe at ffffff80081fdf84 unregistered
kretprobe 也是基于kprobe的,相比于kprobe和jprobe,實現(xiàn)相對復(fù)雜。下面我們以內(nèi)核目錄下的例程,簡單分析下。
/*
* kretprobe_example.c
*
* Here's a sample kernel module showing the use of return probes to
* report the return value and total time taken for probed function
* to run.
*
* usage: insmod kretprobe_example.ko func=<func_name>
*
* If no func_name is specified, do_sys_open is instrumented
*
* For more information on theory of operation of kretprobes, see
* Documentation/kprobes.txt
*
* Build and insert the kernel module as done in the kprobe example.
* You will see the trace data in /var/log/messages and on the console
* whenever the probed function returns. (Some messages may be suppressed
* if syslogd is configured to eliminate duplicate messages.)
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/ktime.h>
#include <linux/limits.h>
#include <linux/sched.h>
/* Symbol to instrument; defaults to do_sys_open, overridable with func=<name>. */
static char func_name[NAME_MAX] = "do_sys_open";
module_param_string(func, func_name, NAME_MAX, S_IRUGO);
MODULE_PARM_DESC(func,
		 "Function to kretprobe; this module will report the"
		 " function's execution time");
/* Private data carried by each kretprobe instance between entry and return. */
struct my_data {
	ktime_t	entry_stamp;	/* time recorded by entry_handler() */
};
/* Here we use the entry_hanlder to timestamp function entry */
static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
struct my_data *data;
if (!current->mm)
return 1; /* Skip kernel threads */
data = (struct my_data *)ri->data;
data->entry_stamp = ktime_get();
return 0;
}
/*
 * Return handler: logs the probed function's return value and how long the
 * call took, measured from the timestamp saved by entry_handler().
 *
 * The duration may consistently read as zero, depending on the granularity
 * of time accounting on the platform. Always returns 0.
 */
static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	int retval = regs_return_value(regs);
	struct my_data *priv = (struct my_data *)ri->data;
	ktime_t finished = ktime_get();
	s64 elapsed_ns = ktime_to_ns(ktime_sub(finished, priv->entry_stamp));

	printk(KERN_INFO "%s returned %d and took %lld ns to execute\n",
	       func_name, retval, (long long)elapsed_ns);
	return 0;
}
/*
 * The return probe itself. kp.symbol_name is filled in by kretprobe_init()
 * from func_name before registration.
 */
static struct kretprobe my_kretprobe = {
.handler = ret_handler, /* called when the probed function returns */
.entry_handler = entry_handler, /* called when the probed function is entered */
.data_size = sizeof(struct my_data), /* size of the per-instance private data */
/* Probe up to 20 instances concurrently. */
.maxactive = 20,
};
/*
 * Module entry point: plant the return probe on the function named by
 * func_name.
 *
 * Returns 0 on success, or the negative error code reported by
 * register_kretprobe() on failure.
 */
static int __init kretprobe_init(void)
{
	int ret;

	my_kretprobe.kp.symbol_name = func_name;
	ret = register_kretprobe(&my_kretprobe);
	if (ret < 0) {
		printk(KERN_INFO "register_kretprobe failed, returned %d\n",
				ret);
		/* Propagate the real error instead of a bare -1 (== -EPERM). */
		return ret;
	}
	printk(KERN_INFO "Planted return probe at %s: %p\n",
			my_kretprobe.kp.symbol_name, my_kretprobe.kp.addr);
	return 0;
}
/*
 * Module exit point: remove the return probe and report how many calls of
 * the probed function could not be instrumented.
 */
static void __exit kretprobe_exit(void)
{
	struct kretprobe *rp = &my_kretprobe;

	unregister_kretprobe(rp);
	printk(KERN_INFO "kretprobe at %p unregistered\n", rp->kp.addr);

	/* A non-zero nmissed count suggests that maxactive was set too low. */
	printk(KERN_INFO "Missed probing %d instances of %s\n",
	       rp->nmissed, rp->kp.symbol_name);
}
module_init(kretprobe_init)
module_exit(kretprobe_exit)
MODULE_LICENSE("GPL");
/*
* Function-return probe -
* Note:
* User needs to provide a handler function, and initialize maxactive.
* maxactive - The maximum number of instances of the probed function that
* can be active concurrently.
* nmissed - tracks the number of times the probed function's return was
* ignored, due to maxactive being too low.
*
*/
struct kretprobe {
struct kprobe kp; /* underlying kprobe planted at function entry */
kretprobe_handler_t handler; /* user callback run when the function returns */
kretprobe_handler_t entry_handler; /* optional user callback run on function entry */
int maxactive; /* number of instances pre-allocated by register_kretprobe() */
int nmissed; /* calls skipped because no free instance was available (or in NMI) */
size_t data_size; /* bytes of private data appended to each kretprobe_instance */
struct hlist_head free_instances; /* idle, pre-allocated instances */
raw_spinlock_t lock; /* taken around accesses to free_instances */
};
/*
 * Signature shared by both user callbacks of a kretprobe (handler and
 * entry_handler): they receive the per-call instance and the saved registers.
 */
typedef int (*kretprobe_handler_t) (struct kretprobe_instance *,
struct pt_regs *);
/* One in-flight invocation of a return-probed function. */
struct kretprobe_instance {
struct hlist_node hlist; /* links into rp->free_instances or kretprobe_inst_table */
struct kretprobe *rp; /* kretprobe this instance belongs to */
kprobe_opcode_t *ret_addr; /* original return address saved by arch_prepare_kretprobe() */
struct task_struct *task; /* task that entered the probed function */
char data[0]; /* rp->data_size bytes of user private data */
};
kretprobe 探測點的 blackpoint,用來記錄不支持 kretprobe 探測的函數的信息:name 表示該函數名,addr 表示該函數的地址。
/* Describes one function that must not be return-probed. */
struct kretprobe_blackpoint {
const char *name; /* symbol name of the function */
void *addr; /* address of the function; compared in register_kretprobe() */
};
blackpoint 與架構相關,x86 架構不支持的 kretprobe 探測點如下:
// arch/x86/kernel/kprobes/core.c
// Functions that cannot be return-probed on this architecture; as the name
// "blacklist" suggests, register_kretprobe() rejects probes on these entries.
struct kretprobe_blackpoint kretprobe_blacklist[] = {
{"__switch_to", }, /* This function switches only current task, but
doesn't switch kernel stack.*/
{NULL, NULL} /* Terminator */
};
const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
register_kretprobe 函數的開頭首先處理 kretprobe_blacklist:如果指定的被探測函數在這個 blacklist 中,就直接返回 -EINVAL,表示不支持探測。在 x86 架構中,blacklist 裡只有 __switch_to 這個函數,即該函數不能被 kretprobe 探測。
/*
 * Register a return probe.
 *
 * Rejects blacklisted functions, installs the kernel's own
 * pre_handler_kretprobe() as the pre_handler of the embedded kprobe,
 * pre-allocates rp->maxactive kretprobe_instance objects and finally
 * registers the embedded kprobe.
 *
 * Returns 0 on success; -EINVAL if the target is blacklisted, -ENOMEM if an
 * instance cannot be allocated, the error encoded by kprobe_addr(), or the
 * non-zero result of register_kprobe().
 */
int register_kretprobe(struct kretprobe *rp)
{
int ret = 0;
struct kretprobe_instance *inst;
int i;
void *addr;
if (kretprobe_blacklist_size) {
addr = kprobe_addr(&rp->kp);
if (IS_ERR(addr))
return PTR_ERR(addr);
// Refuse to probe any function listed in kretprobe_blacklist (-EINVAL).
for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
if (kretprobe_blacklist[i].addr == addr)
return -EINVAL;
}
}
// The kernel installs its own pre_handler, pre_handler_kretprobe.
// Unlike a plain kprobe, a kretprobe does not let the user supply
// pre_handler/post_handler callbacks; they are overwritten here.
rp->kp.pre_handler = pre_handler_kretprobe;
rp->kp.post_handler = NULL;
rp->kp.fault_handler = NULL;
rp->kp.break_handler = NULL;
/* Pre-allocate memory for max kretprobe instances */
if (rp->maxactive <= 0) {
#ifdef CONFIG_PREEMPT
rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
#else
rp->maxactive = num_possible_cpus();
#endif
}
raw_spin_lock_init(&rp->lock);
INIT_HLIST_HEAD(&rp->free_instances);
// Allocate maxactive kretprobe_instance objects, each followed by
// data_size bytes of private data, and put them on the free list.
for (i = 0; i < rp->maxactive; i++) {
inst = kmalloc(sizeof(struct kretprobe_instance) +
rp->data_size, GFP_KERNEL);
if (inst == NULL) {
free_rp_inst(rp);
return -ENOMEM;
}
INIT_HLIST_NODE(&inst->hlist);
hlist_add_head(&inst->hlist, &rp->free_instances);
}
rp->nmissed = 0;
/* Establish function entry probe point */
// Register the underlying kprobe.
ret = register_kprobe(&rp->kp);
if (ret != 0)
free_rp_inst(rp);
return ret;
}
EXPORT_SYMBOL_GPL(register_kretprobe);
最後調用 register_kprobe(&rp->kp) 註冊 kprobe 探測點,可以看出 kretprobe 也是基於 kprobe 機制實現的,是一種特殊形式的 kprobe。
kretprobe 註冊完成後就默認啟動探測。
pre_handler_kretprobe 這個函數是內核自己定義的,內核已經指定該回調函數,不支持用戶自定義。這個 kprobe pre_handler 會註冊到每個 kretprobe 中;當探針命中時,它負責設置返回探針。
#ifdef CONFIG_KRETPROBES
/*
 * This kprobe pre_handler is registered with every kretprobe. When probe
 * hits it will set up the return probe.
 *
 * It takes a free kretprobe_instance, runs the optional user entry_handler
 * and, unless that handler returns non-zero, replaces the return address of
 * the probed function with kretprobe_trampoline. When no free instance is
 * available (or in NMI context) the hit is only counted in rp->nmissed.
 *
 * Always returns 0.
 */
static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
{
	struct kretprobe *rp = container_of(p, struct kretprobe, kp);
	unsigned long hash, flags = 0;
	struct kretprobe_instance *ri;

	/*
	 * To avoid deadlocks, prohibit return probing in NMI contexts,
	 * just skip the probe and increase the (inexact) 'nmissed'
	 * statistical counter, so that the user is informed that
	 * something happened:
	 */
	if (unlikely(in_nmi())) {
		rp->nmissed++;
		return 0;
	}

	/* TODO: consider to only swap the RA after the last pre_handler fired */
	hash = hash_ptr(current, KPROBE_HASH_BITS);
	raw_spin_lock_irqsave(&rp->lock, flags);
	if (!hlist_empty(&rp->free_instances)) {
		ri = hlist_entry(rp->free_instances.first,
				struct kretprobe_instance, hlist);
		hlist_del(&ri->hlist);
		raw_spin_unlock_irqrestore(&rp->lock, flags);

		ri->rp = rp;
		ri->task = current;

		/*
		 * (1) Run the optional user entry_handler. A non-zero return
		 * means this call is not to be probed: the instance goes back
		 * on the free list and no return probe is set up.
		 */
		if (rp->entry_handler && rp->entry_handler(ri, regs)) {
			raw_spin_lock_irqsave(&rp->lock, flags);
			hlist_add_head(&ri->hlist, &rp->free_instances);
			raw_spin_unlock_irqrestore(&rp->lock, flags);
			return 0;
		}

		/*
		 * (2) Save the real return address in ri->ret_addr and
		 * redirect the return to kretprobe_trampoline, then add the
		 * instance to the hash bucket selected by the current task.
		 */
		arch_prepare_kretprobe(ri, regs);

		/* XXX(hch): why is there no hlist_move_head? */
		INIT_HLIST_NODE(&ri->hlist);
		kretprobe_table_lock(hash, &flags);
		hlist_add_head(&ri->hlist, &kretprobe_inst_table[hash]);
		kretprobe_table_unlock(hash, &flags);
	} else {
		/* No free instance left: count the missed call. */
		rp->nmissed++;
		raw_spin_unlock_irqrestore(&rp->lock, flags);
	}
	return 0;
}
NOKPROBE_SYMBOL(pre_handler_kretprobe);
struct kretprobe *rp
rp->entry_handler && rp->entry_handler(ri, regs)
entry_handler 這個回調函數就是用戶自己定義的回調函數(可選),前面已經介紹過了,這裡不再贅述。需要注意的是:如果它返回非 0,本次調用不會設置返回探針,kretprobe_instance 會被直接放回 free_instances。
/*
 * Here we use the entry_handler to timestamp function entry.
 *
 * Returns 1 for kernel threads so that no return probe is set up for them,
 * and 0 after storing the entry time in the instance's private data.
 */
static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
struct my_data *data;
// For a kernel thread, task->mm == NULL.
if (!current->mm)
return 1; /* Skip kernel threads */
data = (struct my_data *)ri->data;
data->entry_stamp = ktime_get();
return 0;
}
arch_prepare_kretprobe(ri, regs) 與架構相關:它先把被探測函數的原始返回地址保存到 struct kretprobe_instance 的 ret_addr 成員中,再把該返回地址替換為 kretprobe_trampoline 的地址。
// arch/x86/kernel/kprobes/core.c
#define stack_addr(regs) ((unsigned long *)kernel_stack_pointer(regs))
// x86_64
// arch/x86/include/asm/ptrace.h
/* Return the stack pointer value saved in @regs. */
static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
{
return regs->sp;
}
// arch/x86/kernel/kprobes/core.c
/*
 * x86: hijack the return of the probed function.
 *
 * The word that the saved stack pointer refers to is stored in ri->ret_addr
 * and then overwritten with the address of kretprobe_trampoline. The save
 * must happen before the overwrite.
 */
void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
{
unsigned long *sara = stack_addr(regs);
ri->ret_addr = (kprobe_opcode_t *) *sara;
/* Replace the return addr with trampoline addr */
*sara = (unsigned long) &kretprobe_trampoline;
}
NOKPROBE_SYMBOL(arch_prepare_kretprobe);
//struct kretprobe_instance *ri;
//ri->ret_addr;
// Abbreviated view of the structure, showing only the member used above.
struct kretprobe_instance {
kprobe_opcode_t *ret_addr; // saves the original return address of the probed function
};
// arch/arm64/kernel/probes/kprobes.c
/*
 * arm64: hijack the return of the probed function.
 *
 * The saved value of register 30 is stored in ri->ret_addr and then replaced
 * with the address of kretprobe_trampoline. The save must happen before the
 * overwrite.
 */
void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
struct pt_regs *regs)
{
ri->ret_addr = (kprobe_opcode_t *)regs->regs[30];
/* replace return addr (x30) with trampoline */
regs->regs[30] = (long)&kretprobe_trampoline;
}
ARM64 架構中 regs->regs[30] 即 X30,也就是 LR(procedure link register)寄存器,保存函數的返回地址。
kretprobe 是基於 kprobe 實現的,有一個固定的 pre_handler 回調函數,在內核中實現,無需用戶編寫。而在 kprobe 中,pre_handler 函數是提供給用戶的回調函數。
rp->kp.pre_handler = pre_handler_kretprobe; //內(nèi)核中已經(jīng)實現(xiàn)
rp->kp.post_handler = NULL;
rp->kp.fault_handler = NULL;
rp->kp.break_handler = NULL;
kretprobe提供給用戶的兩個回調(diào)函數(shù):
kretprobe_handler_t handler;
kretprobe_handler_t entry_handler; // (可選)
pre_handler 回調函數會為獲取被探測函數的返回值做準備工作,其中最主要的就是替換掉正常流程的返回地址,讓被探測函數在執行完之後能夠跳轉到 kretprobe 設計的函數 kretprobe_trampoline 中去。
pre_handler_kretprobe 函數返回後,kprobe 流程接著執行 singlestep 流程並返回到正常的執行流程,被探測函數(如 _do_fork)繼續執行,直到它執行完畢並返回。
由於返回地址被替換為 kretprobe_trampoline,所以會跳轉到 kretprobe_trampoline 執行,該函數與架構相關,且由嵌入匯編實現。
該函數會獲取被探測函數返回時的寄存器信息,並調用用戶定義的回調函數(例如輸出其中的返回值),最後回到正常的執行流程。
static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
unsigned long retval = regs_return_value(regs);
......
}
static struct kretprobe my_kretprobe = {
.handler = ret_handler,
};
(1)
kretprobe_trampoline
-->trampoline_handler
kretprobe_trampoline
(2) kretprobe_trampoline
// arch/x86/kernel/kprobes/core.c
/*
 * When a retprobed function returns, this code saves registers and
 * calls trampoline_handler(), which runs the kretprobe's handler.
 * The value returned by trampoline_handler() is the original return
 * address, which is placed where the final "ret" will pick it up.
 */
asm(
".global kretprobe_trampoline\n"
".type kretprobe_trampoline, @function\n"
"kretprobe_trampoline:\n"
#ifdef CONFIG_X86_64
/* We don't bother saving the ss register */
" pushq %rsp\n"
" pushfq\n"
SAVE_REGS_STRING
/* Hand the saved register frame to trampoline_handler() */
" movq %rsp, %rdi\n"
" call trampoline_handler\n"
/* Replace saved sp with true return address. */
" movq %rax, 152(%rsp)\n"
RESTORE_REGS_STRING
" popfq\n"
#else
" pushf\n"
SAVE_REGS_STRING
/* Hand the saved register frame to trampoline_handler() */
" movl %esp, %eax\n"
" call trampoline_handler\n"
/* Move flags to cs */
" movl 56(%esp), %edx\n"
" movl %edx, 52(%esp)\n"
/* Replace saved flags with true return address. */
" movl %eax, 56(%esp)\n"
RESTORE_REGS_STRING
" popf\n"
#endif
" ret\n"
".size kretprobe_trampoline, .-kretprobe_trampoline\n"
);
NOKPROBE_SYMBOL(kretprobe_trampoline);
STACK_FRAME_NON_STANDARD(kretprobe_trampoline);
(3) trampoline_handler
// arch/x86/kernel/kprobes/core.c
/*
* Called from kretprobe_trampoline
*
* Runs the user handler of the kretprobe instances that belong to the
* returning call of the current task, recycles those instances, and returns
* the original return address so that the trampoline can resume the caller.
*/
__visible __used void *trampoline_handler(struct pt_regs *regs)
{
struct kretprobe_instance *ri = NULL;
struct hlist_head *head, empty_rp;
struct hlist_node *tmp;
unsigned long flags, orig_ret_address = 0;
unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
kprobe_opcode_t *correct_ret_addr = NULL;
INIT_HLIST_HEAD(&empty_rp);
kretprobe_hash_lock(current, &head, &flags);
/* fixup registers */
#ifdef CONFIG_X86_64
regs->cs = __KERNEL_CS;
#else
regs->cs = __KERNEL_CS | get_kernel_rpl();
regs->gs = 0;
#endif
regs->ip = trampoline_address;
regs->orig_ax = ~0UL;
/*
* It is possible to have multiple instances associated with a given
* task either because multiple functions in the call path have
* return probes installed on them, and/or more than one
* return probe was registered for a target function.
*
* We can handle this because:
* - instances are always pushed into the head of the list
* - when multiple return probes are registered for the same
* function, the (chronologically) first instance's ret_addr
* will be the real return address, and all the rest will
* point to kretprobe_trampoline.
*/
/* Pass 1: walk this task's instances until the real return address is found. */
hlist_for_each_entry_safe(ri, tmp, head, hlist) {
if (ri->task != current)
/* another task is sharing our hash bucket */
continue;
orig_ret_address = (unsigned long)ri->ret_addr;
if (orig_ret_address != trampoline_address)
/*
* This is the real return address. Any other
* instances associated with this task are for
* other calls deeper on the call stack
*/
break;
}
kretprobe_assert(ri, orig_ret_address, trampoline_address);
correct_ret_addr = ri->ret_addr;
/*
* Pass 2: run the user handler of each of those instances, with ret_addr
* set to the real return address, and recycle the instance afterwards.
*/
hlist_for_each_entry_safe(ri, tmp, head, hlist) {
if (ri->task != current)
/* another task is sharing our hash bucket */
continue;
orig_ret_address = (unsigned long)ri->ret_addr;
if (ri->rp && ri->rp->handler) {
__this_cpu_write(current_kprobe, &ri->rp->kp);
get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
ri->ret_addr = correct_ret_addr;
ri->rp->handler(ri, regs);
__this_cpu_write(current_kprobe, NULL);
}
recycle_rp_inst(ri, &empty_rp);
if (orig_ret_address != trampoline_address)
/*
* This is the real return address. Any other
* instances associated with this task are for
* other calls deeper on the call stack
*/
break;
}
kretprobe_hash_unlock(current, &flags);
/* Free the instances that recycle_rp_inst() moved onto empty_rp. */
hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
hlist_del(&ri->hlist);
kfree(ri);
}
return (void *)orig_ret_address;
}
NOKPROBE_SYMBOL(trampoline_handler);
(4) ri->rp->handler(ri, regs) 表示執行用戶自定義的回調函數 handler(它運行在內核態,用來獲取被探測函數如 _do_fork 的返回值)。handler 回調函數執行完畢以後,調用 recycle_rp_inst 函數將當前的 kretprobe_instance 實例從 kretprobe_inst_table 哈希表中摘除,重新鏈入 free_instances 中,以備後面 kretprobe 觸發時使用;另外,如果 kretprobe 已經被註銷,則將它添加到待銷毀鏈表中,稍後釋放。
ri->rp->handler(ri, regs);
->recycle_rp_inst(ri, &empty_rp);
/*
 * Take a used instance off the list it is currently on and make it reusable.
 *
 * @ri:   instance that has finished its return handling.
 * @head: list that collects instances whose kretprobe pointer is NULL;
 *        the caller frees the entries on this list.
 */
void recycle_rp_inst(struct kretprobe_instance *ri,
struct hlist_head *head)
{
struct kretprobe *rp = ri->rp;
/* remove rp inst off the rprobe_inst_table */
hlist_del(&ri->hlist);
INIT_HLIST_NODE(&ri->hlist);
if (likely(rp)) {
/* Probe still registered: return the instance to its free list. */
raw_spin_lock(&rp->lock);
hlist_add_head(&ri->hlist, &rp->free_instances);
raw_spin_unlock(&rp->lock);
} else
/* Unregistering */
hlist_add_head(&ri->hlist, head);
}
NOKPROBE_SYMBOL(recycle_rp_inst);
(5) trampoline_handler 函數執行完後,返回被探測函數的原始返回地址,執行流程再次回到 kretprobe_trampoline 函數中,將棧上保存的 sp 替換為真實的返回地址:從 rax 寄存器中取出原始的返回地址,然後恢復原始函數調用的棧空間,最後跳轉到原始返回地址執行。至此函數調用的流程就回歸正常流程了,整個 kretprobe 探測結束。
/* Replace saved sp with true return address. */
" movq %rax, 152(%rsp)\n"
RESTORE_REGS_STRING
" popfq\n"
(1)
kretprobe_trampoline
-->trampoline_probe_handler
kretprobe_trampoline
(2) kretprobe_trampoline
// arch/arm64/kernel/probes/kprobes_trampoline.S
/*
 * Entered instead of the real return address when a return-probed function
 * returns. Saves the registers, calls trampoline_probe_handler() and then
 * returns to the address that the handler hands back in x0.
 */
ENTRY(kretprobe_trampoline)
/* Make room on the stack for the register frame and fill it in */
sub sp, sp, #S_FRAME_SIZE
save_all_base_regs
/* Hand the saved frame to trampoline_probe_handler() */
mov x0, sp
bl trampoline_probe_handler
/*
* Replace trampoline address in lr with actual orig_ret_addr return
* address.
*/
mov lr, x0
restore_all_base_regs
add sp, sp, #S_FRAME_SIZE
ret
ENDPROC(kretprobe_trampoline)
(3) trampoline_probe_handler
// arch/arm64/kernel/probes/kprobes.c
/*
 * Called from kretprobe_trampoline.
 *
 * Runs the user handler of the kretprobe instances that belong to the
 * returning call of the current task, recycles those instances, and returns
 * the original return address so that the trampoline can resume the caller.
 */
void __kprobes __used *trampoline_probe_handler(struct pt_regs *regs)
{
struct kretprobe_instance *ri = NULL;
struct hlist_head *head, empty_rp;
struct hlist_node *tmp;
unsigned long flags, orig_ret_address = 0;
unsigned long trampoline_address =
(unsigned long)&kretprobe_trampoline;
kprobe_opcode_t *correct_ret_addr = NULL;
INIT_HLIST_HEAD(&empty_rp);
kretprobe_hash_lock(current, &head, &flags);
/*
* It is possible to have multiple instances associated with a given
* task either because multiple functions in the call path have
* return probes installed on them, and/or more than one
* return probe was registered for a target function.
*
* We can handle this because:
* - instances are always pushed into the head of the list
* - when multiple return probes are registered for the same
* function, the (chronologically) first instance's ret_addr
* will be the real return address, and all the rest will
* point to kretprobe_trampoline.
*/
/* Pass 1: walk this task's instances until the real return address is found. */
hlist_for_each_entry_safe(ri, tmp, head, hlist) {
if (ri->task != current)
/* another task is sharing our hash bucket */
continue;
orig_ret_address = (unsigned long)ri->ret_addr;
if (orig_ret_address != trampoline_address)
/*
* This is the real return address. Any other
* instances associated with this task are for
* other calls deeper on the call stack
*/
break;
}
kretprobe_assert(ri, orig_ret_address, trampoline_address);
correct_ret_addr = ri->ret_addr;
/*
* Pass 2: run the user handler of each of those instances, with ret_addr
* set to the real return address, and recycle the instance afterwards.
*/
hlist_for_each_entry_safe(ri, tmp, head, hlist) {
if (ri->task != current)
/* another task is sharing our hash bucket */
continue;
orig_ret_address = (unsigned long)ri->ret_addr;
if (ri->rp && ri->rp->handler) {
__this_cpu_write(current_kprobe, &ri->rp->kp);
get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
ri->ret_addr = correct_ret_addr;
ri->rp->handler(ri, regs);
__this_cpu_write(current_kprobe, NULL);
}
recycle_rp_inst(ri, &empty_rp);
if (orig_ret_address != trampoline_address)
/*
* This is the real return address. Any other
* instances associated with this task are for
* other calls deeper on the call stack
*/
break;
}
kretprobe_hash_unlock(current, &flags);
/* Free the instances that recycle_rp_inst() moved onto empty_rp. */
hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
hlist_del(&ri->hlist);
kfree(ri);
}
return (void *)orig_ret_address;
}
(4) 將 lr 寄存器中的 trampoline 地址替換為實際的 orig_ret_addr 返回地址:從 x0 寄存器中取出原始的返回地址,然後恢復原始函數調用的棧空間,最後跳轉到原始返回地址執行。至此函數調用的流程就回歸正常流程了,整個 kretprobe 探測結束。
/*
* Replace trampoline address in lr with actual orig_ret_addr return
* address.
*/
mov lr, x0
restore_all_base_regs
add sp, sp, #S_FRAME_SIZE
ret
insmod kretprobe_example.ko
vim testfile
rmmod kretprobe_example.ko
dmesg
成功打印出被探測函數的返回值和執行時間:
[ 1056.875938] do_sys_open returned -2 and took 10500 ns to execute
[ 1057.567400] do_sys_open returned 34 and took 59208 ns to execute
[ 1058.382932] do_sys_open returned 3 and took 31469101 ns to execute
[ 1058.567046] do_sys_open returned 34 and took 61250 ns to execute
[ 1058.975879] do_sys_open returned 3 and took 224084 ns to execute
[ 1058.975935] do_sys_open returned 3 and took 16917 ns to execute
[ 1058.976041] do_sys_open returned 3 and took 13417 ns to execute
[ 1058.976148] do_sys_open returned 3 and took 15167 ns to execute
[ 1058.976254] do_sys_open returned 3 and took 15750 ns to execute
[ 1058.976356] do_sys_open returned 3 and took 16042 ns to execute
[ 1058.978036] do_sys_open returned -2 and took 23041 ns to execute
[ 1058.978074] do_sys_open returned 3 and took 24500 ns to execute
[ 1058.978175] do_sys_open returned -2 and took 9334 ns to execute
[ 1058.978211] do_sys_open returned 3 and took 23333 ns to execute
[ 1058.978246] do_sys_open returned 3 and took 13417 ns to execute
[ 1058.978286] do_sys_open returned 3 and took 14583 ns to execute
[ 1058.989701] kretprobe at ffffff80081ed6c8 unregistered
[ 1058.989709] Missed probing 0 instances of do_sys_open
kprobe 事件(kprobe-based event)類似於基於 tracepoint 的事件。與 tracepoint 不同的是,它基於 kprobes(kprobe 和 kretprobe)實現,所以它可以探測任何 kprobes 可以探測的地方,並且可以動態地添加和刪除。
要啟用這個功能,需要在編譯內核時設置 CONFIG_KPROBE_EVENTS=y(在 4.11 之前的內核中,該選項名為 CONFIG_KPROBE_EVENT)。
與 Event Tracing 類似,這不需要通過 current_tracer 來激活。可以通過 /sys/kernel/debug/tracing/kprobe_events 添加探測點,並通過 /sys/kernel/debug/tracing/events/kprobes/<EVENT>/enable 來啟用它。
你也可以使用 /sys/kernel/debug/tracing/dynamic_events,而不是 kprobe_events。該接口也提供對其他動態事件的統一訪問。
kprobe 和內核的 ftrace 結合使用,需要先對內核進行配置,然後添加探測點、進行探測、查看結果。
CONFIG_KPROBES=y
CONFIG_OPTPROBES=y
CONFIG_KPROBES_ON_FTRACE=y
CONFIG_UPROBES=y
CONFIG_KRETPROBES=y
CONFIG_HAVE_KPROBES=y
CONFIG_HAVE_KRETPROBES=y
CONFIG_HAVE_OPTPROBES=y
CONFIG_HAVE_KPROBES_ON_FTRACE=y
CONFIG_KPROBE_EVENT=y
kprobe 事件相關的節點如下:
/sys/kernel/debug/tracing/kprobe_events-----------------------配置kprobe事件屬性,增加事件之後會在 events/<GRP>/(默認為 events/kprobes/)下面生成對應目錄。
/sys/kernel/debug/tracing/kprobe_profile----------------------kprobe事件統計屬性文件。
/sys/kernel/debug/tracing/events/<GRP>/<EVENT>/enable---------使能kprobe事件
/sys/kernel/debug/tracing/events/<GRP>/<EVENT>/filter---------過濾kprobe事件
/sys/kernel/debug/tracing/events/<GRP>/<EVENT>/format---------查詢kprobe事件顯示格式
新增一個 kprobe 事件,通過寫 kprobe_events 來設置,語法如下:
p[:[GRP/]EVENT] [MOD:]SYM[+offs]|MEMADDR [FETCHARGS]-------------------設置一個probe探測點
r[:[GRP/]EVENT] [MOD:]SYM[+0] [FETCHARGS]------------------------------設置一個return probe探測點
-:[GRP/]EVENT----------------------------------------------------------刪除一個探測點
細節解釋如下:
GRP : Group name. If omitted, use "kprobes" for it.------------指定後會創建 events/<GRP> 目錄;省略時使用 events/kprobes。
EVENT : Event name. If omitted, the event name is generated based on SYM+offs or MEMADDR.---指定後在 events/<GRP>/ 下生成 <EVENT> 目錄。
MOD : Module name which has given SYM.--------------------------模塊名,一般不設
SYM[+offs] : Symbol+offset where the probe is inserted.-------------被探測函數名和偏移
MEMADDR : Address where the probe is inserted.----------------------指定被探測的內存絕對地址
FETCHARGS : Arguments. Each probe can have up to 128 args.----------指定要獲取的參數信息。
%REG : Fetch register REG---------------------------------------獲取指定寄存器值
@ADDR : Fetch memory at ADDR (ADDR should be in kernel)--------獲取指定內存地址的值
@SYM[+|-offs] : Fetch memory at SYM +|- offs (SYM should be a data symbol)---獲取全局變量的值
$stackN : Fetch Nth entry of stack (N >= 0)----------------------------------獲取棧上第 N 個字的值,即 sp + N*sizeof(long) 處的值
$stack : Fetch stack address.-----------------------------------------------獲取sp寄存器值
$retval : Fetch return value.(*)--------------------------------------------獲取返回值,僅用於 return probe
$comm : Fetch current task comm.----------------------------------------獲取對應進程名稱。
+|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**)-------------讀取 FETCHARG +|- offs 地址處的內存值
NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types (u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal types
(x8/x16/x32/x64), "string" and bitfield are supported.----------------設置參數的類型,可以支持字符串和比特類型
(*) only for return probe.
(**) this is useful for fetching a field of data structures.
執行如下前兩條命令就會生成目錄 /sys/kernel/debug/tracing/events/kprobes/myprobe 和 /sys/kernel/debug/tracing/events/kprobes/myretprobe;後面的 "-:" 命令則可以刪除指定的 kprobe 事件,如果要全部刪除則 echo > /sys/kernel/debug/tracing/kprobe_events。
echo 'p:myprobe do_sys_open dfd=%x0 filename=%x1 flags=%x2 mode=%x3' > /sys/kernel/debug/tracing/kprobe_events
echo 'r:myretprobe do_sys_open ret=$retval' >> /sys/kernel/debug/tracing/kprobe_events-----------------------------------------------------這裡一定要用">>",不然就會覆蓋前面的設置。
echo '-:myprobe' >> /sys/kernel/debug/tracing/kprobe_events
echo '-:myretprobe' >> /sys/kernel/debug/tracing/kprobe_events
參數後面的寄存器與架構相關:在 ARM64 上,%x0~%x7 依次保存前 8 個整型/指針參數(例如 do_sys_open 的第 4 個參數 mode 位於 %x3),超出 8 個的參數才通過棧傳遞,需要用 $stack / $stackN 讀取。
函數返回值通過 $retval 獲取(僅用於 return probe)。
kprobe 事件的使能:往對應事件的 enable 文件寫 1 開啟探測,寫 0 暫停探測。
echo > /sys/kernel/debug/tracing/trace
echo 'p:myprobe do_sys_open dfd=%x0 filename=%x1 flags=%x2 mode=%x3' > /sys/kernel/debug/tracing/kprobe_events
echo 'r:myretprobe do_sys_open ret=$retval' >> /sys/kernel/debug/tracing/kprobe_events
echo 1 > /sys/kernel/debug/tracing/events/kprobes/myprobe/enable
echo 1 > /sys/kernel/debug/tracing/events/kprobes/myretprobe/enable
ls
echo 0 > /sys/kernel/debug/tracing/events/kprobes/myprobe/enable
echo 0 > /sys/kernel/debug/tracing/events/kprobes/myretprobe/enable
cat /sys/kernel/debug/tracing/trace
然後在 /sys/kernel/debug/tracing/trace 中可以看到結果。
ARM32,ARM64,X86寄存器及訪問方式
"r0", pt_regs->r0
"r1", pt_regs->r1
"r2", pt_regs->r2
"r3", pt_regs->r3
"r4", pt_regs->r4
"r5", pt_regs->r5
"r6", pt_regs->r6
"r7", pt_regs->r7
"r8", pt_regs->r8
"r9", pt_regs->r9
"r10",pt_regs->r10
"fp", pt_regs->fp
"ip", pt_regs->ip
"sp", pt_regs->sp
"lr", pt_regs->lr
"pc", pt_regs->pc
"x0", pt_regs->regs[0]
"x1", pt_regs->regs[1]
"x2", pt_regs->regs[2]
"x3", pt_regs->regs[3]
"x4", pt_regs->regs[4]
"x5", pt_regs->regs[5]
"x6", pt_regs->regs[6]
"x7", pt_regs->regs[7]
"x8", pt_regs->regs[8]
"x9", pt_regs->regs[9]
"x10", pt_regs->regs[10]
"x11", pt_regs->regs[11]
"x12", pt_regs->regs[12]
"x13", pt_regs->regs[13]
"x14", pt_regs->regs[14]
"x15", pt_regs->regs[15]
"x16", pt_regs->regs[16]
"x17", pt_regs->regs[17]
"x18", pt_regs->regs[18]
"x19", pt_regs->regs[19]
"x20", pt_regs->regs[20]
"x21", pt_regs->regs[21]
"x22", pt_regs->regs[22]
"x23", pt_regs->regs[23]
"x24", pt_regs->regs[24]
"x25", pt_regs->regs[25]
"x26", pt_regs->regs[26]
"x27", pt_regs->regs[27]
"x28", pt_regs->regs[28]
"x29", pt_regs->regs[29]
"x30", pt_regs->regs[30]
"sp", pt_regs->sp
"pc", pt_regs->pc
"pstate",pt_regs->pstate
rax pt_regs->ax
rcx pt_regs->cx
rdx pt_regs->dx
rbx pt_regs->bx
rsp pt_regs->sp
rbp pt_regs->bp
rdi pt_regs->di
rsi pt_regs->si
r8 pt_regs->r8
r9 pt_regs->r9
r10 pt_regs->r10
r11 pt_regs->r11
r12 pt_regs->r12
r13 pt_regs->r13
r14 pt_regs->r14
r15 pt_regs->r15
https://blog.csdn.net/jakelylll/article/details/123667320
https://www.cnblogs.com/LiuYanYGZ/p/12643846.html
https://blog.csdn.net/weixin_45030965/article/details/125922528
https://www.cnblogs.com/LiuYanYGZ/p/12643846.html
https://blog.csdn.net/jasonactions/article/details/121065795
https://blog.csdn.net/mrpre/article/details/106801888
https://blog.csdn.net/u011622208/article/details/115535291
kprobe https://blog.csdn.net/WANGYONGZIXUE/article/details/127525367
https://www.kernel.org/doc/html/latest/trace/kprobetrace.html#kprobe-based-event-tracing
https://www.cnblogs.com/arnoldlu/p/9752061.html