内核探针kprobe

利用kprobe探测内核信息

printk是内核调试最常用的手段，但是每次修改都需要重新编译并重启内核，调试效率很低。kprobe则可以在不重新编译内核的情况下，向运行中的内核动态插入探测点。

kprobe根据实现原理也可以分为三种：基于动态ftrace的kprobe，基于int3的kprobe和基于jump相对跳转指令实现的kprobe。

这里主要说int3的kprobe，int3是x86架构下的trap类型的断点指令。代码执行到对应位置会发生trap，然后执行kprobe的代码，执行后恢复现场。

使用

第一步是先获取要插入点的地址。有两种方式：通过cat /proc/kallsyms | grep <symbol_name>查到地址后设置addr成员；或者直接设置symbol_name成员，由kprobe自行查找地址。

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>

#define MAX_SYMBOL_LEN	64
static char symbol[MAX_SYMBOL_LEN] = "kernel_clone";
module_param_string(symbol, symbol, sizeof(symbol), 0644);

/*
 * The probe descriptor. The target is selected by name through
 * .symbol_name; .addr (an explicit address) and .offset (a displacement)
 * are the alternative fields, left unset in this example.
 */
static struct kprobe kp = {
	.symbol_name = symbol,
};

/*
 * kprobe pre_handler: runs before the probed instruction (step 2).
 *
 * @p:    the kprobe that fired; its symbol_name and addr are logged.
 * @regs: register snapshot; the architecture-specific program counter
 *        and status/flags fields are logged.
 *
 * Only the architectures listed below produce output; on any other
 * architecture the handler logs nothing. Always returns 0.
 */
static int __kprobes handler_pre(struct kprobe *p, struct pt_regs *regs)   //step 2
{
#ifdef CONFIG_X86
	pr_info("<%s> pre_handler: p->addr = 0x%p, ip = 0x%lx, flags = 0x%lx\n",
		p->symbol_name, p->addr, regs->ip, regs->flags);
#endif
#ifdef CONFIG_PPC
	pr_info("<%s> pre_handler: p->addr = 0x%p, nip = 0x%lx, msr = 0x%lx\n",
		p->symbol_name, p->addr, regs->nip, regs->msr);
#endif
#ifdef CONFIG_MIPS
	pr_info("<%s> pre_handler: p->addr = 0x%p, epc = 0x%lx, status = 0x%lx\n",
		p->symbol_name, p->addr, regs->cp0_epc, regs->cp0_status);
#endif
#ifdef CONFIG_ARM64
	pr_info("<%s> pre_handler: p->addr = 0x%p, pc = 0x%lx,"
			" pstate = 0x%lx\n",
		p->symbol_name, p->addr, (long)regs->pc, (long)regs->pstate);
#endif
#ifdef CONFIG_ARM
	pr_info("<%s> pre_handler: p->addr = 0x%p, pc = 0x%lx, cpsr = 0x%lx\n",
		p->symbol_name, p->addr, (long)regs->ARM_pc, (long)regs->ARM_cpsr);
#endif
#ifdef CONFIG_S390
	/* Same "p->addr = " layout as every other architecture. */
	pr_info("<%s> pre_handler: p->addr = 0x%p, ip = 0x%lx, flags = 0x%lx\n",
		p->symbol_name, p->addr, regs->psw.addr, regs->flags);
#endif

	/* A dump_stack() here will give a stack backtrace */
	return 0;
}

/*
 * kprobe post_handler: runs after the probed instruction (step 3).
 *
 * @p:     the kprobe that fired; its symbol_name and addr are logged.
 * @regs:  register snapshot; the architecture-specific status/flags
 *         field is logged.
 * @flags: not used by this handler.
 *
 * Only the architectures listed below produce output.
 */
static void __kprobes handler_post(struct kprobe *p, struct pt_regs *regs,
				unsigned long flags)   //step 3
{
#ifdef CONFIG_X86
	pr_info("<%s> post_handler: p->addr = 0x%p, flags = 0x%lx\n",
		p->symbol_name, p->addr, regs->flags);
#endif
#ifdef CONFIG_PPC
	pr_info("<%s> post_handler: p->addr = 0x%p, msr = 0x%lx\n",
		p->symbol_name, p->addr, regs->msr);
#endif
#ifdef CONFIG_MIPS
	pr_info("<%s> post_handler: p->addr = 0x%p, status = 0x%lx\n",
		p->symbol_name, p->addr, regs->cp0_status);
#endif
#ifdef CONFIG_ARM64
	pr_info("<%s> post_handler: p->addr = 0x%p, pstate = 0x%lx\n",
		p->symbol_name, p->addr, (long)regs->pstate);
#endif
#ifdef CONFIG_ARM
	pr_info("<%s> post_handler: p->addr = 0x%p, cpsr = 0x%lx\n",
		p->symbol_name, p->addr, (long)regs->ARM_cpsr);
#endif
#ifdef CONFIG_S390
	/* This is the post handler, so it must not label itself "pre_handler". */
	pr_info("<%s> post_handler: p->addr = 0x%p, flags = 0x%lx\n",
		p->symbol_name, p->addr, regs->flags);
#endif
}

/*
 * fault_handler: error hook, run when pre_handler or post_handler raises
 * an exception.
 *
 * @p:      the kprobe being handled; its addr is logged.
 * @regs:   register snapshot (unused).
 * @trapnr: trap number, logged as-is.
 *
 * Returns 0 because the fault is only reported, not handled.
 */
static int handler_fault(struct kprobe *p, struct pt_regs *regs, int trapnr)
{
	/* The message must end in "\n"; a bare "n" leaves the line unterminated. */
	pr_info("fault_handler: p->addr = 0x%p, trap #%d\n", p->addr, trapnr);
	/* Return 0 because we don't handle the fault. */
	return 0;
}
/* NOKPROBE_SYMBOL() marks a function that must not itself be probed. */
NOKPROBE_SYMBOL(handler_fault);

/*
 * Module entry point (step 1): attach the three handlers to kp and
 * register the probe.
 *
 * Returns 0 on success, or the negative value from register_kprobe().
 */
static int __init kprobe_init(void)
{
	int err;

	/* Hook up the callbacks before the probe goes live. */
	kp.fault_handler = handler_fault;
	kp.post_handler = handler_post;
	kp.pre_handler = handler_pre;

	/* Register the probe point. */
	err = register_kprobe(&kp);
	if (err >= 0) {
		pr_info("Planted kprobe at %p\n", kp.addr);
		return 0;
	}

	pr_err("register_kprobe failed, returned %d\n", err);
	return err;
}

/* Module exit point: unregister the probe, then log the address it was at. */
static void __exit kprobe_exit(void)
{
	struct kprobe *probe = &kp;

	unregister_kprobe(probe);
	pr_info("kprobe at %p unregistered\n", probe->addr);
}

module_init(kprobe_init)
module_exit(kprobe_exit)
MODULE_LICENSE("GPL");

这是kprobe的示例代码, 以内核模块的方式编译生成ko文件，insmod file.ko即可生效。

举个例子，系统写数据到硬盘前是有缓存的，如果想知道系统真实提交给磁盘的io请求，就可以用kprobe来实现。

先看一下探测点放在哪里

submit_bio就是kernel提交io请求给块设备的函数了。

/**
 * submit_bio - submit a bio to the block device layer for I/O
 * @bio: The &struct bio which describes the I/O
 *
 * submit_bio() is used to submit I/O requests to block devices.  It is passed a
 * fully set up &struct bio that describes the I/O that needs to be done.  The
 * bio will be sent to the device described by the bi_disk and bi_partno fields.
 *
 * The success/failure status of the request, along with notification of
 * completion, is delivered asynchronously through the ->bi_end_io() callback
 * in @bio.  The bio must NOT be touched by the caller until ->bi_end_io() has
 * been called.
 *
 * Return: BLK_QC_T_NONE when blkcg_punt_bio_submit() returns true, otherwise
 * the value returned by submit_bio_noacct().
 */
blk_qc_t submit_bio(struct bio *bio)
{
	/* Stop here when blkcg_punt_bio_submit() reports true for this bio. */
	if (blkcg_punt_bio_submit(bio))
		return BLK_QC_T_NONE;

	/*
	 * If it's a regular read/write or a barrier with data attached,
	 * go through the normal accounting stuff before submission.
	 */
	if (bio_has_data(bio)) {
		unsigned int count;

		/*
		 * WRITE_SAME is counted as one logical block; the ">> 9"
		 * presumably converts a byte size to 512-byte sectors.
		 */
		if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
			count = queue_logical_block_size(bio->bi_disk->queue) >> 9;
		else
			count = bio_sectors(bio);

		/* Writes bump PGPGOUT; everything else is accounted as a read. */
		if (op_is_write(bio_op(bio))) {
			count_vm_events(PGPGOUT, count);
		} else {
			task_io_account_read(bio->bi_iter.bi_size);
			count_vm_events(PGPGIN, count);
		}

		/*
		 * With block_dump set, log task name, pid, direction, start
		 * sector, device name and sector count for this bio.
		 */
		if (unlikely(block_dump)) {
			char b[BDEVNAME_SIZE];
			printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
			current->comm, task_pid_nr(current),
				op_is_write(bio_op(bio)) ? "WRITE" : "READ",
				(unsigned long long)bio->bi_iter.bi_sector,
				bio_devname(bio, b), count);
		}
	}

	/*
	 * If we're reading data that is part of the userspace workingset, count
	 * submission time as memory stall.  When the device is congested, or
	 * the submitting cgroup IO-throttled, submission can be a significant
	 * part of overall IO time.
	 */
	if (unlikely(bio_op(bio) == REQ_OP_READ &&
	    bio_flagged(bio, BIO_WORKINGSET))) {
		unsigned long pflags;
		blk_qc_t ret;

		/* Bracket the submission with the memstall enter/leave pair. */
		psi_memstall_enter(&pflags);
		ret = submit_bio_noacct(bio);
		psi_memstall_leave(&pflags);

		return ret;
	}

	return submit_bio_noacct(bio);
}
EXPORT_SYMBOL(submit_bio);

这里可以看到这个函数有个block_dump的判断分支，如果开启，会打印是哪个程序（及其pid）在哪个设备上、从哪个扇区开始读/写了多少个扇区。

这个可以通过echo 1 > /proc/sys/vm/block_dump来开启这个功能。

这里也可以使用kprobe实现这个功能，或者对这个功能增加自己想要的处理.

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>

#include <linux/bio.h>
#include <linux/fs.h>
#include <linux/version.h>
#include <linux/blkdev.h>
#include <linux/string.h>

/* Probe descriptor: the target function is selected by name. */
static struct kprobe kp = {
	.symbol_name = "submit_bio",
};


/*
 * Pre-handler for the probe on submit_bio(): logs every bio that has an
 * I/O vector attached.
 *
 * @p:    the kprobe that fired (unused).
 * @regs: register state at the probe point; the arguments of the probed
 *        function are read from it.
 *
 * Logs task name, pid, direction, start sector, sector count, byte size
 * and device name. Always returns 0.
 */
static int __kprobes handler_pre(struct kprobe *p, struct pt_regs *regs)
{
	/*
	 * On x86_64, integer and pointer arguments are passed left to right
	 * in rdi, rsi, rdx, rcx, r8, r9; floating-point arguments go in
	 * xmm0, xmm1, ...; anything beyond that is passed on the stack.
	 * The register reads below are only valid for that convention, so
	 * the body is limited to x86_64 and logs nothing elsewhere.
	 */
#ifdef CONFIG_X86_64
	char b[BDEVNAME_SIZE];
	unsigned long long start;
	unsigned int sectors, size;
	const char *dev;
	bool is_write;
	/*
	 * The version boundary is a placeholder carried over from the
	 * original example. NOTE(review): upstream appears to have dropped
	 * the rw argument around v4.8 - confirm before relying on this gate.
	 */
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,0,0)
	/* Old form submit_bio(int rw, struct bio *bio): rw in di, bio in si. */
	int rw = (int)regs->di;
	struct bio *bio = (struct bio *)regs->si;
#else
	/* New form submit_bio(struct bio *bio): bio is the first argument. */
	struct bio *bio = (struct bio *)regs->di;
#endif

	/* Nothing to report for a missing bio or one without an I/O vector. */
	if (!bio || !bio->bi_io_vec)
		return 0;

#if LINUX_VERSION_CODE < KERNEL_VERSION(3,0,0)
	is_write = (rw & WRITE) != 0;
	start = (unsigned long long)bio->bi_sector;
	sectors = bio_sectors(bio);
	size = bio->bi_size;
	dev = bdevname(bio->bi_bdev, b);
#else
	/*
	 * bio_op() is non-zero for every operation except a read, so it
	 * cannot be used as a write test; op_is_write() is the check that
	 * submit_bio() itself uses.
	 */
	is_write = op_is_write(bio_op(bio));
	start = (unsigned long long)bio->bi_iter.bi_sector;
	sectors = unlikely(bio_op(bio) == REQ_OP_WRITE_SAME) ?
		(queue_logical_block_size(bio->bi_disk->queue) >> 9) :
		bio_sectors(bio);
	size = bio_sectors(bio) << 9;
	dev = bio_devname(bio, b);
#endif

	printk(KERN_DEBUG "%s(%d): %s, start: %Lu, sectors: %u, size: %u,on %s\n",
	       current->comm, task_pid_nr(current),
	       is_write ? "WRITE" : "READ",
	       start, sectors, size, dev);
#endif /* CONFIG_X86_64 */

	return 0;
}


/*
 * Module entry point: install the pre-handler and register the probe.
 * No post or fault handler is attached in this example.
 *
 * Returns 0 on success, or the negative value from register_kprobe().
 */
static int __init kprobe_init(void)
{
    int err;

    kp.pre_handler = handler_pre;

    err = register_kprobe(&kp);
    if (err >= 0) {
        pr_info("Planted kprobe at %p\n", kp.addr);
        return 0;
    }

    pr_err("register_kprobe failed, returned %d\n", err);
    return err;
}

/* Module exit point: unregister the probe, then log the address it was at. */
static void __exit kprobe_exit(void)
{
    struct kprobe *probe = &kp;

    unregister_kprobe(probe);
    pr_info("kprobe at %p unregistered\n", probe->addr);
}

module_init(kprobe_init)
module_exit(kprobe_exit)
MODULE_LICENSE("GPL");

other

基于kprobe，还有jprobe,kretprobe等方便使用的包装。

这里提到内核探测，还有许多工具。

systemtap 是利用Kprobe 提供的API来实现动态地监控和跟踪运行中的Linux内核的工具，相比Kprobe，systemtap更加简单，提供给用户简单的命令行接口，以及编写内核指令的脚本语言。

ebpf是一个新的流行方向。

旧的内核不支持ebpf，而在新内核上，ebpf可能已经成为首选。

技巧-如何在任意地址做探测

前面提到的探测点都只是挂在函数前，函数后，那么函数中怎么办？

内核大部分用C语言写成，遗憾的是kprobe不能向源代码任意行插入侦测点，但是可以在任意地址插入侦测点。

objdump可以给出目标文件的汇编代码，可以通过工具帮助/二分法定位源代码对应的汇编代码。需要注意的是源代码和汇编不是一一对应的，还有编译器优化也可能改变些许逻辑。

我这里常用的技巧是，找callq指令，这是调用指令，找到源代码中的调用和汇编中的调用位置，可以快速确定相对位置。

对汇编不熟悉，查位置还是有些麻烦的，只能这样吗？

/*
 * sys_execve() executes a new program. Renamed here to
 * my_do_execveat_common.
 *
 * @fd, @flags: passed to do_open_execat() together with @filename.
 * @filename:   program to execute; may be an ERR_PTR, in which case its
 *              error code is returned immediately. Otherwise it is
 *              released with putname() on every exit path.
 * @argv:       argument strings, counted and copied into the bprm.
 * @envp:       environment strings, counted and copied into the bprm.
 *
 * Returns the non-negative result of exec_binprm() on success, or a
 * negative error code after unwinding through the cleanup labels.
 */
static int my_do_execveat_common(int fd, struct filename *filename,
			      struct user_arg_ptr argv,
			      struct user_arg_ptr envp,
			      int flags)
{
	char *pathbuf = NULL;
	struct linux_binprm *bprm;
	struct file *file;
	struct files_struct *displaced;
	int retval;

	/* Nothing has been acquired yet, so no cleanup is needed here. */
	if (IS_ERR(filename))
		return PTR_ERR(filename);

	/*
	 * We move the actual failure in case of RLIMIT_NPROC excess from
	 * set*uid() to execve() because too many poorly written programs
	 * don't check setuid() return code.  Here we additionally recheck
	 * whether NPROC limit is still exceeded.
	 */
	if ((current->flags & PF_NPROC_EXCEEDED) &&
	    atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
		retval = -EAGAIN;
		goto out_ret;
	}

	/* We're below the limit (still or again), so we don't want to make
	 * further execve() calls fail. */
	current->flags &= ~PF_NPROC_EXCEEDED;

	retval = unshare_files(&displaced);
	if (retval)
		goto out_ret;

	/* Preset the error code for the allocation failure just below. */
	retval = -ENOMEM;
	bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
	if (!bprm)
		goto out_files;

	retval = prepare_bprm_creds(bprm);
	if (retval)
		goto out_free;

	check_unsafe_exec(bprm);
	/* Cleared again on both the success path and at out_unmark. */
	current->in_execve = 1;

	file = do_open_execat(fd, filename, flags);
	/* retval is only meaningful when the IS_ERR() check below fires. */
	retval = PTR_ERR(file);
	if (IS_ERR(file))
		goto out_unmark;

	sched_exec();

	bprm->file = file;
	/*
	 * Use the given name directly when fd is AT_FDCWD or the name starts
	 * with '/'; otherwise build a "/dev/fd/<fd>[/<name>]" path.
	 */
	if (fd == AT_FDCWD || filename->name[0] == '/') {
		bprm->filename = filename->name;
	} else {
		if (filename->name[0] == '\0')
			pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d", fd);
		else
			pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d/%s",
					    fd, filename->name);
		if (!pathbuf) {
			retval = -ENOMEM;
			goto out_unmark;
		}
		/*
		 * Record that a name derived from an O_CLOEXEC fd will be
		 * inaccessible after exec. Relies on having exclusive access to
		 * current->files (due to unshare_files above).
		 */
		if (close_on_exec(fd, rcu_dereference_raw(current->files->fdt)))
			bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
		bprm->filename = pathbuf;
	}
	bprm->interp = bprm->filename;

	retval = bprm_mm_init(bprm);
	if (retval)
		goto out_unmark;

	/* From here on, failures unwind through "out", which also handles bprm->mm. */
	bprm->argc = count(argv, MAX_ARG_STRINGS);
	if ((retval = bprm->argc) < 0)
		goto out;

	bprm->envc = count(envp, MAX_ARG_STRINGS);
	if ((retval = bprm->envc) < 0)
		goto out;

	retval = prepare_binprm(bprm);
	if (retval < 0)
		goto out;

	/* Copy order: filename first, then the environment, then the arguments. */
	retval = copy_strings_kernel(1, &bprm->filename, bprm);
	if (retval < 0)
		goto out;

	bprm->exec = bprm->p;
	retval = copy_strings(bprm->envc, envp, bprm);
	if (retval < 0)
		goto out;

	retval = copy_strings(bprm->argc, argv, bprm);
	if (retval < 0)
		goto out;

	retval = exec_binprm(bprm);
	if (retval < 0)
		goto out;

	/* execve succeeded */
	current->fs->in_exec = 0;
	current->in_execve = 0;
	acct_update_integrals(current);
	task_numa_free(current);
	free_bprm(bprm);
	kfree(pathbuf);
	putname(filename);
	if (displaced)
		put_files_struct(displaced);
	return retval;

	/*
	 * Error unwinding: each label falls through to the ones below it, so
	 * a jump target selects how much of the setup gets undone.
	 */
out:
	if (bprm->mm) {
		acct_arg_size(bprm, 0);
		mmput(bprm->mm);
	}

out_unmark:
	current->fs->in_exec = 0;
	current->in_execve = 0;

out_free:
	free_bprm(bprm);
	kfree(pathbuf);

out_files:
	/* On failure the displaced files are reset, not put as on success. */
	if (displaced)
		reset_files_struct(displaced);
out_ret:
	putname(filename);
	return retval;
}

加载模块时，内核日志中可能出现类似下面的报错：

kernel: mymod: Unknown symbol xxx
kernel: mymod: Unknown symbol xxx
kernel: mymod: Unknown symbol xxx

这样编译生成的ko模块，安装时可能会报Unknown symbol xxx的问题：模块只能引用内核已经导出（EXPORT_SYMBOL）的符号。这是linux的限制，但是可以回避。

编辑一个addrs.dat，里面给出上面找不到符号的地址

SECTIONS
{
  symbol1 = <这里写地址就可以了>;
  symbol2 = <这里写地址就可以了>;
}

ld -r -o mymod.ko mymod.ko -R addrs.dat重新解析地址

再次安装就可以了