AIRobot

AIRobot quick note


  • 首页

  • 关于

  • 标签

  • 分类

  • 归档

  • 搜索

内核探针kprobe

发表于 2021-03-14
本文字数: 13k 阅读时长 ≈ 12 分钟

利用kprobe探测内核信息

printk是内核调试常用的调试手段,但是每次更改都需要重新编译启动内核,调试效率很低。

kprobe可以在内核运行时动态插入探测点,不需要重新编译和重启内核。根据实现原理,kprobe可以分为三种:基于动态ftrace的kprobe,基于int3的kprobe和基于jump相对跳转指令实现的kprobe。

这里主要说int3的kprobe,int3是x86架构下的trap类型的断点指令。代码执行到对应位置会发生trap,然后执行kprobe的代码,执行后恢复现场。

使用

第一步是先获取要插入点的地址。可以通过cat /proc/kallsyms | grep <symbol_name>查到地址后设置addr成员,或者只设置symbol_name成员,由kprobe自行查找地址。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>

/* Size of the buffer that holds the name of the probed symbol. */
#define MAX_SYMBOL_LEN 64
/* Name of the function to probe; defaults to "kernel_clone". */
static char symbol[MAX_SYMBOL_LEN] = "kernel_clone";
/* Expose `symbol` as a string module parameter with mode 0644. */
module_param_string(symbol, symbol, sizeof(symbol), 0644);

/*
 * The kprobe descriptor: set either the address or the symbol name, and an
 * offset into the symbol if one is needed. Only the symbol name is set here.
 */
static struct kprobe kp = {
.symbol_name = symbol,
// .addr = ,
// .offset = ,
};

/*
 * kprobe pre_handler (step 2): executed before the probe point.
 *
 * Logs the probed symbol, the probe address and, depending on the
 * architecture the kernel was configured for, the program counter and the
 * status/flags register found in @regs.
 *
 * @p:    the kprobe that fired
 * @regs: register state saved at the probe point
 *
 * Always returns 0.
 */
static int __kprobes handler_pre(struct kprobe *p, struct pt_regs *regs)
{
#ifdef CONFIG_X86
	pr_info("<%s> pre_handler: p->addr = 0x%p, ip = 0x%lx, flags = 0x%lx\n",
		p->symbol_name, p->addr, regs->ip, regs->flags);
#endif
#ifdef CONFIG_PPC
	pr_info("<%s> pre_handler: p->addr = 0x%p, nip = 0x%lx, msr = 0x%lx\n",
		p->symbol_name, p->addr, regs->nip, regs->msr);
#endif
#ifdef CONFIG_MIPS
	pr_info("<%s> pre_handler: p->addr = 0x%p, epc = 0x%lx, status = 0x%lx\n",
		p->symbol_name, p->addr, regs->cp0_epc, regs->cp0_status);
#endif
#ifdef CONFIG_ARM64
	pr_info("<%s> pre_handler: p->addr = 0x%p, pc = 0x%lx,"
		" pstate = 0x%lx\n",
		p->symbol_name, p->addr, (long)regs->pc, (long)regs->pstate);
#endif
#ifdef CONFIG_ARM
	pr_info("<%s> pre_handler: p->addr = 0x%p, pc = 0x%lx, cpsr = 0x%lx\n",
		p->symbol_name, p->addr, (long)regs->ARM_pc, (long)regs->ARM_cpsr);
#endif
#ifdef CONFIG_S390
	/* Same "p->addr = " layout as every other architecture. */
	pr_info("<%s> pre_handler: p->addr = 0x%p, ip = 0x%lx, flags = 0x%lx\n",
		p->symbol_name, p->addr, regs->psw.addr, regs->flags);
#endif

	/* A dump_stack() here will give a stack backtrace */
	return 0;
}

/*
 * kprobe post_handler (step 3): executed after the probe point.
 *
 * Logs the probed symbol, the probe address and the architecture's
 * status/flags register found in @regs.
 *
 * @p:     the kprobe that fired
 * @regs:  register state saved at the probe point
 * @flags: not used by this handler
 */
static void __kprobes handler_post(struct kprobe *p, struct pt_regs *regs,
				   unsigned long flags)
{
#ifdef CONFIG_X86
	pr_info("<%s> post_handler: p->addr = 0x%p, flags = 0x%lx\n",
		p->symbol_name, p->addr, regs->flags);
#endif
#ifdef CONFIG_PPC
	pr_info("<%s> post_handler: p->addr = 0x%p, msr = 0x%lx\n",
		p->symbol_name, p->addr, regs->msr);
#endif
#ifdef CONFIG_MIPS
	pr_info("<%s> post_handler: p->addr = 0x%p, status = 0x%lx\n",
		p->symbol_name, p->addr, regs->cp0_status);
#endif
#ifdef CONFIG_ARM64
	pr_info("<%s> post_handler: p->addr = 0x%p, pstate = 0x%lx\n",
		p->symbol_name, p->addr, (long)regs->pstate);
#endif
#ifdef CONFIG_ARM
	pr_info("<%s> post_handler: p->addr = 0x%p, cpsr = 0x%lx\n",
		p->symbol_name, p->addr, (long)regs->ARM_cpsr);
#endif
#ifdef CONFIG_S390
	/* This is the post handler, so the message must say so. */
	pr_info("<%s> post_handler: p->addr = 0x%p, flags = 0x%lx\n",
		p->symbol_name, p->addr, regs->flags);
#endif
}

/*
 * fault_handler: error handling; executed when pre_handler or post_handler
 * raises an exception.
 *
 * @p:      the kprobe whose handler faulted
 * @regs:   register state at the time of the fault (unused)
 * @trapnr: trap number of the fault
 *
 * Returns 0 because the fault is only logged, not handled.
 */
static int handler_fault(struct kprobe *p, struct pt_regs *regs, int trapnr)
{
	/* The line must end in "\n" so the log record is properly terminated. */
	pr_info("fault_handler: p->addr = 0x%p, trap #%d\n", p->addr, trapnr);
	/* Return 0 because we don't handle the fault. */
	return 0;
}
/* NOKPROBE_SYMBOL() marks a function that must not be probed by kprobes. */
NOKPROBE_SYMBOL(handler_fault);

/*
 * Module entry point (step 1): attach the pre/post/fault callbacks to the
 * kprobe descriptor and register it.
 *
 * Returns 0 on success, otherwise the negative value handed back by
 * register_kprobe().
 */
static int __init kprobe_init(void)
{
	int err;

	/* Attach the handlers. */
	kp.fault_handler = handler_fault;
	kp.post_handler = handler_post;
	kp.pre_handler = handler_pre;

	/* Register the probe point. */
	err = register_kprobe(&kp);
	if (err >= 0) {
		pr_info("Planted kprobe at %p\n", kp.addr);
		return 0;
	}

	pr_err("register_kprobe failed, returned %d\n", err);
	return err;
}

/* Module exit point: unregister the kprobe and log the address it was at. */
static void __exit kprobe_exit(void)
{
/* Unregister the probe. */
unregister_kprobe(&kp);
pr_info("kprobe at %p unregistered\n", kp.addr);
}

/* Wire kprobe_init()/kprobe_exit() up as the module's init and exit hooks. */
module_init(kprobe_init)
module_exit(kprobe_exit)
MODULE_LICENSE("GPL");

这是kprobe的示例代码, 以内核模块的方式编译生成ko文件,insmod file.ko即可生效。

举个例子,系统写数据到硬盘前是有缓存的,如果想知道系统真实提交给磁盘的io请求,就可以用kprobe来实现。

先看一下探测点放在哪里

submit_bio就是kernel提交io请求给块设备的函数了。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
/**
* submit_bio - submit a bio to the block device layer for I/O
* @bio: The &struct bio which describes the I/O
*
* submit_bio() is used to submit I/O requests to block devices. It is passed a
* fully set up &struct bio that describes the I/O that needs to be done. The
* bio will be sent to the device described by the bi_disk and bi_partno fields.
*
* The success/failure status of the request, along with notification of
* completion, is delivered asynchronously through the ->bi_end_io() callback
* in @bio. The bio must NOT be touched by the caller until ->bi_end_io() has
* been called.
*/
blk_qc_t submit_bio(struct bio *bio)
{
/* Return BLK_QC_T_NONE right away when blkcg_punt_bio_submit() returns true. */
if (blkcg_punt_bio_submit(bio))
return BLK_QC_T_NONE;

/*
* If it's a regular read/write or a barrier with data attached,
* go through the normal accounting stuff before submission.
*/
if (bio_has_data(bio)) {
unsigned int count;

/* NOTE(review): ">> 9" presumably converts bytes to 512-byte sectors - confirm. */
if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
count = queue_logical_block_size(bio->bi_disk->queue) >> 9;
else
count = bio_sectors(bio);

if (op_is_write(bio_op(bio))) {
count_vm_events(PGPGOUT, count);
} else {
task_io_account_read(bio->bi_iter.bi_size);
count_vm_events(PGPGIN, count);
}

/*
* With block_dump set, log the submitting task and pid, the direction,
* the start sector, the device name and the sector count.
*/
if (unlikely(block_dump)) {
char b[BDEVNAME_SIZE];
printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
current->comm, task_pid_nr(current),
op_is_write(bio_op(bio)) ? "WRITE" : "READ",
(unsigned long long)bio->bi_iter.bi_sector,
bio_devname(bio, b), count);
}
}

/*
* If we're reading data that is part of the userspace workingset, count
* submission time as memory stall. When the device is congested, or
* the submitting cgroup IO-throttled, submission can be a significant
* part of overall IO time.
*/
if (unlikely(bio_op(bio) == REQ_OP_READ &&
bio_flagged(bio, BIO_WORKINGSET))) {
unsigned long pflags;
blk_qc_t ret;

psi_memstall_enter(&pflags);
ret = submit_bio_noacct(bio);
psi_memstall_leave(&pflags);

return ret;
}

return submit_bio_noacct(bio);
}
EXPORT_SYMBOL(submit_bio);

这里可以看到这个函数有个block_dump的判断分支,如果开启,会打印是哪个程序(pid)在哪个设备的哪个扇区读/写了多少个扇区。

这个可以通过echo 1 > /proc/sys/vm/block_dump来开启这个功能。

这里也可以使用kprobe实现这个功能,或者对这个功能增加自己想要的处理.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>

#include <linux/bio.h>
#include <linux/fs.h>
#include <linux/version.h>
#include <linux/blkdev.h>
#include <linux/string.h>

/* The kprobe descriptor: probe the function named "submit_bio". */
static struct kprobe kp = {
.symbol_name = "submit_bio",
};


/*
 * kprobe pre_handler for submit_bio(): logs the submitting task and pid, the
 * direction, the start sector, the sector count, the size in bytes and the
 * target device of the bio being submitted.
 *
 * @p:    the kprobe that fired (unused)
 * @regs: register state at the probe point; the arguments of submit_bio()
 *        are read from it
 *
 * Always returns 0.
 */
static int __kprobes handler_pre(struct kprobe *p, struct pt_regs *regs)
{
#ifdef CONFIG_X86_64
	/*
	 * On x86_64, integer and pointer arguments are passed left to right
	 * in rdi, rsi, rdx, rcx, r8, r9; floating point arguments go in xmm0,
	 * xmm1, ...; any further arguments are passed on the stack. 32-bit x86
	 * passes arguments differently, so this decoding is limited to x86_64.
	 *
	 * NOTE: the version cut-off below was picked arbitrarily by the
	 * original author; the release in which submit_bio() went from two
	 * arguments to one has not been looked up.
	 */
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,0,0)
	/* Old form, submit_bio(int rw, struct bio *bio): rw in rdi, bio in rsi. */
	int rw = (int)regs->di;
	struct bio *bio = (struct bio *)regs->si;
#else
	/* New form, submit_bio(struct bio *bio): bio in rdi. */
	struct bio *bio = (struct bio *)regs->di;
#endif

	if (bio && bio->bi_io_vec != NULL) {
		char b[BDEVNAME_SIZE];
		int sectors;

#if LINUX_VERSION_CODE < KERNEL_VERSION(3,0,0)
		sectors = bio_sectors(bio);
		printk(KERN_DEBUG "%s(%d): %s, start: %Lu, sectors: %d, size: %u,on %s\n",
		       current->comm, task_pid_nr(current),
		       (rw & WRITE) ? "WRITE" : "READ",
		       (unsigned long long)bio->bi_sector,
		       sectors,
		       bio->bi_size,
		       bdevname(bio->bi_bdev, b));
#else
		if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
			sectors = queue_logical_block_size(bio->bi_disk->queue) >> 9;
		else
			sectors = bio_sectors(bio);

		/*
		 * A non-zero op is not necessarily a write; use op_is_write(),
		 * as submit_bio() itself does, to pick the direction label.
		 */
		printk(KERN_DEBUG "%s(%d): %s, start: %Lu, sectors: %d, size: %u,on %s\n",
		       current->comm, task_pid_nr(current),
		       op_is_write(bio_op(bio)) ? "WRITE" : "READ",
		       (unsigned long long)bio->bi_iter.bi_sector,
		       sectors,
		       bio_sectors(bio) << 9,
		       bio_devname(bio, b));
#endif
	}
#endif /* CONFIG_X86_64 */

	return 0;
}


/*
 * Module entry point: install the pre-handler on the kprobe descriptor and
 * register it. No post or fault handler is installed.
 *
 * Returns 0 on success, otherwise the negative value handed back by
 * register_kprobe().
 */
static int __init kprobe_init(void)
{
	int err;

	kp.pre_handler = handler_pre;

	err = register_kprobe(&kp);
	if (err >= 0) {
		pr_info("Planted kprobe at %p\n", kp.addr);
		return 0;
	}

	pr_err("register_kprobe failed, returned %d\n", err);
	return err;
}

/* Module exit point: unregister the kprobe and log the address it was at. */
static void __exit kprobe_exit(void)
{
unregister_kprobe(&kp);
pr_info("kprobe at %p unregistered\n", kp.addr);
}

/* Wire kprobe_init()/kprobe_exit() up as the module's init and exit hooks. */
module_init(kprobe_init)
module_exit(kprobe_exit)
MODULE_LICENSE("GPL");

other

基于kprobe,还有jprobe,kretprobe等方便使用的包装。

这里提到内核探测,还有许多工具。

systemtap 是利用Kprobe 提供的API来实现动态地监控和跟踪运行中的Linux内核的工具,相比Kprobe,systemtap更加简单,提供给用户简单的命令行接口,以及编写内核指令的脚本语言。

ebpf是一个新的流行方向。

旧的内核不支持ebpf,而在新内核上ebpf可能已经成为首选。

技巧-如何在任意地址做探测

前面提到的探测点都只是挂在函数前,函数后,那么函数中怎么办?

内核大部分用C语言写成,遗憾的是kprobe不能向源代码任意行插入侦测点,但是可以在任意地址插入侦测点。

objdump可以给出目标文件的汇编代码,可以通过工具帮助/二分法定位源代码对应的汇编代码。需要注意的是源代码和汇编不是一一对应的,还有编译器优化也可能改变些许逻辑。

我这里常用的技巧是,找callq指令,这是调用指令,找到源代码中的调用和汇编中的调用位置,可以快速确定相对位置。

对汇编不熟悉,查位置还是有些麻烦的,只能这样吗?

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
/*
* sys_execve() executes a new program. Renamed here to my_do_execveat_common.
*
* Returns the value of exec_binprm() on success, or a negative error code
* taken from whichever step failed. putname(filename) is called on every
* path except the early IS_ERR(filename) return.
*/
static int my_do_execveat_common(int fd, struct filename *filename,
struct user_arg_ptr argv,
struct user_arg_ptr envp,
int flags)
{
char *pathbuf = NULL;
struct linux_binprm *bprm;
struct file *file;
struct files_struct *displaced;
int retval;

if (IS_ERR(filename))
return PTR_ERR(filename);

/*
* We move the actual failure in case of RLIMIT_NPROC excess from
* set*uid() to execve() because too many poorly written programs
* don't check setuid() return code. Here we additionally recheck
* whether NPROC limit is still exceeded.
*/
if ((current->flags & PF_NPROC_EXCEEDED) &&
atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
retval = -EAGAIN;
goto out_ret;
}

/* We're below the limit (still or again), so we don't want to make
* further execve() calls fail. */
current->flags &= ~PF_NPROC_EXCEEDED;

retval = unshare_files(&displaced);
if (retval)
goto out_ret;

/* Preset the error code for the allocation failure path below. */
retval = -ENOMEM;
bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
if (!bprm)
goto out_files;

retval = prepare_bprm_creds(bprm);
if (retval)
goto out_free;

check_unsafe_exec(bprm);
current->in_execve = 1;

file = do_open_execat(fd, filename, flags);
retval = PTR_ERR(file);
if (IS_ERR(file))
goto out_unmark;

sched_exec();

bprm->file = file;
/*
* Use the given name as-is for AT_FDCWD or an absolute path; otherwise
* build a "/dev/fd/<fd>[/<name>]" path in pathbuf.
*/
if (fd == AT_FDCWD || filename->name[0] == '/') {
bprm->filename = filename->name;
} else {
if (filename->name[0] == '\0')
pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d", fd);
else
pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d/%s",
fd, filename->name);
if (!pathbuf) {
retval = -ENOMEM;
goto out_unmark;
}
/*
* Record that a name derived from an O_CLOEXEC fd will be
* inaccessible after exec. Relies on having exclusive access to
* current->files (due to unshare_files above).
*/
if (close_on_exec(fd, rcu_dereference_raw(current->files->fdt)))
bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
bprm->filename = pathbuf;
}
bprm->interp = bprm->filename;

retval = bprm_mm_init(bprm);
if (retval)
goto out_unmark;

/* A negative count from count() is propagated as the error code. */
bprm->argc = count(argv, MAX_ARG_STRINGS);
if ((retval = bprm->argc) < 0)
goto out;

bprm->envc = count(envp, MAX_ARG_STRINGS);
if ((retval = bprm->envc) < 0)
goto out;

retval = prepare_binprm(bprm);
if (retval < 0)
goto out;

/* Copy order: the filename first, then the environment, then the arguments. */
retval = copy_strings_kernel(1, &bprm->filename, bprm);
if (retval < 0)
goto out;

bprm->exec = bprm->p;
retval = copy_strings(bprm->envc, envp, bprm);
if (retval < 0)
goto out;

retval = copy_strings(bprm->argc, argv, bprm);
if (retval < 0)
goto out;

retval = exec_binprm(bprm);
if (retval < 0)
goto out;

/* execve succeeded */
current->fs->in_exec = 0;
current->in_execve = 0;
acct_update_integrals(current);
task_numa_free(current);
free_bprm(bprm);
kfree(pathbuf);
putname(filename);
if (displaced)
put_files_struct(displaced);
return retval;

/* Error unwinding: each label falls through to the ones below it. */
out:
if (bprm->mm) {
acct_arg_size(bprm, 0);
mmput(bprm->mm);
}

out_unmark:
current->fs->in_exec = 0;
current->in_execve = 0;

out_free:
free_bprm(bprm);
kfree(pathbuf);

out_files:
if (displaced)
reset_files_struct(displaced);
out_ret:
putname(filename);
return retval;
}
1
2
3
kernel: mymod: Unknown symbol xxx
kernel: mymod: Unknown symbol xxx
kernel: mymod: Unknown symbol xxx

这样编译生成的ko模块,安装时可能会报Unknown symbol xxx的问题,这是linux的限制,但是可以回避。

编辑一个addrs.dat,里面给出上面找不到符号的地址

1
2
3
4
5
SECTIONS
{
symbol1 = <这里写地址就可以了>;
symbol2 = <这里写地址就可以了>;
}

ld -r -o mymod.ko mymod.ko -R addrs.dat重新解析地址

再次安装就可以了

debian替换最新内核
vsyscall,vdso
  • 文章目录
  • 站点概览
AIRobot

AIRobot

AIRobot quick note
130 日志
15 分类
23 标签
GitHub E-Mail
Creative Commons
  1. 1. 利用kprobe探测内核信息
  2. 2. 使用
  3. 3. other
  4. 4. 技巧-如何在任意地址做探测
0%
© 2023 AIRobot | 716k | 10:51