简介

该技术的核心在于展示了如何利用一个理论上几乎不可能利用的、时间窗口极短的条件竞争漏洞。该方法是一种利用硬件特性（系统管理中断，SMI）来人为“拉长”这个时间窗口的创新技术。

前景提要：CVE-2021-0920

因为该技术是基于该漏洞发现了另外一个漏洞而出现的，所以为了便于理解就将该漏洞分析一次。

后续分析的内核版本为：Linux v4.14.201

SCM_RIGHTS 消息

SCM_RIGHTS控制消息：Linux 开发者可以使用 sendmsg 系统调用发送 SCM_RIGHTS 数据（参见https://man7.org/linux/man-pages/man7/unix.7.html）**将文件描述符 fd 从一个进程共享到另一个进程（该功能的本意是有权限打开文件的进程打开文件然后传递给没权限打开的进程使用，和dup2()不同的是，传递到对端的文件描述符数字和本端不一样）**。当sender进程将文件描述符传递给另一个进程receiver时，SCM_RIGHTS 将创建一个对 file 结构的引用（该 file 结构指针放在receiver端的 sock->sk_receive_queue 队列上），这样即使receiver进程尚未接收到文件描述符，sender也可以立即关闭该文件描述符。当文件描述符处于“queued” 排队状态（表示sender已发送并关闭了该 fd，但receiver尚未接收 fd 并取得所有权）时，需要进行专门的垃圾回收。为了跟踪这种“queued”状态，文章 io_uring, SCM_RIGHTS, and reference-count cycles 很好地解释了SCM_RIGHTS 引用计数和垃圾回收原理。

内存泄露问题：如果两个文件描述符 FD1 和 FD2 都把自己发送给对方，在都没有接收到文件描述符的情况下关闭 FD1 和 FD2，那么这两个 file 结构体将永远无法释放，造成内存泄露。所以要引入Linux垃圾回收机制，采用 inflight 飞行计数来统计正在被发送的文件描述符，识别内核中的不可破循环，并释放不可破循环中的 file 对象。

发送文件描述符

/*
 * Send path for stream AF_UNIX sockets (abridged: steps marked "......"
 * were elided from this excerpt).
 *
 * Parses the control messages into a local scm_cookie, then cuts the payload
 * into skbs, attaches the scm data to each one and appends it to the peer's
 * sk_receive_queue.  On the success path shown here the return value is the
 * number of bytes queued.
 */
static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	struct sock *sk = sock->sk;
	struct sock *other = NULL;
	int err, size;
	struct sk_buff *skb;
	int sent = 0;
	struct scm_cookie scm;
	bool fds_sent = false;
	int data_len;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false); // parse the SCM control messages into scm (analysed later)
	// ...... (elided) return the error if the call above failed

	err = -EOPNOTSUPP; // default error value: operation not supported
	// ...... (elided) jump to out_err if the message flags contain MSG_OOB

	if (msg->msg_namelen) { // the message carries a name (non-zero msg_namelen)
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; // pick the error code to report
		goto out_err; // leave through the error path
	} else {
		err = -ENOTCONN;
		other = unix_peer(sk); // look up the destination (peer) sock
		// ...... (elided) jump to out_err if no peer was found
	}

	// ...... (elided) jump to pipe_err if the peer socket has been shut down

	while (sent < len) { // loop until the whole payload has been queued
		size = len - sent; // bytes still to send in this round
		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); // the two min_t() calls cap size at both limits

		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
		data_len = min_t(size_t, size, PAGE_ALIGN(data_len)); // compute the data length

		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
					   msg->msg_flags & MSG_DONTWAIT, &err, // allocate the skb
					   get_order(UNIX_SKB_FRAGS_SZ));
		// ...... (elided) jump to out_err if the skb allocation failed

		/* !fds_sent: the fd list is attached to the first skb only. */
		err = unix_scm_to_skb(&scm, skb, !fds_sent); // copy the scm data into the skb
		// ...... (elided) on failure free the skb and jump to out_err
		fds_sent = true;

		skb_put(skb, size - data_len);
		skb->data_len = data_len;
		skb->len = size; // set up the skb lengths
		err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); // copy the message payload into the skb
		// ...... (elided) on error free the skb and jump to out_err

		unix_state_lock(other); // lock the destination socket
		// ...... (elided) jump to pipe_err_free if the peer is flagged dead or its connection is shut down

		maybe_add_creds(skb, sock, other);
		skb_queue_tail(&other->sk_receive_queue, skb); // append the skb to the peer's receive queue
		unix_state_unlock(other);
		other->sk_data_ready(other); // notify the peer that data is ready
		sent += size; // account for the bytes just queued
	}

	scm_destroy(&scm); // release the scm cookie
	return sent;

// ...... error paths omitted: not relevant to this analysis
}

上述是发送函数的所有逻辑，大致就是处理scm消息，将scm消息整合到skb中最终将skb添加到目标套接字的接收队列中。

/*
 * List of struct file pointers being passed in an SCM_RIGHTS message.
 */
struct scm_fp_list {
	/* Number of valid entries in fp[] (scm_fp_dup() iterates 0..count-1). */
	short			count;
	/* NOTE(review): presumably the capacity of fp[]; scm_fp_dup() sets it equal to count - confirm. */
	short			max;
	/* User the in-flight files are accounted to (see unix_notinflight()). */
	struct user_struct	*user;
	/* The passed files themselves. */
	struct file		*fp[SCM_MAX_FD];
};

/*
 * Per-call container for socket control-message state: the sender's
 * credentials and the list of files being passed.  unix_stream_sendmsg()
 * and unix_stream_read_generic() each keep one as a local variable.
 */
struct scm_cookie {
	struct pid		*pid;		/* Skb credentials */
	struct scm_fp_list	*fp;		/* Passed files		*/
	struct scm_creds	creds;		/* Skb credentials	*/
#ifdef CONFIG_SECURITY_NETWORK
	u32			secid;		/* Passed security ID 	*/
#endif
};

上述两个结构体在后续分析中会反复用到，其中 scm_fp_list->fp 数组中存储的是要发送的文件描述符所对应的 struct file 指针。

/*
 * Walk the control messages supplied with a sendmsg() call and collect
 * their contents into the scm_cookie @p (abridged: steps marked "......"
 * were elided from this excerpt).
 *
 * Returns 0 on the success path shown here; the error path is omitted.
 */
int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
{
	struct cmsghdr *cmsg;
	int err;

	for_each_cmsghdr(cmsg, msg) { // iterate over the control messages passed in from user space
		err = -EINVAL;
		// ...... (elided) jump to error if the control message is malformed
		// ...... (elided) skip the message if its level is not SOL_SOCKET

		switch (cmsg->cmsg_type)
		{
		case SCM_RIGHTS:
      // ...... (elided) jump to error if the socket has no ops or its family is not PF_UNIX
			err=scm_fp_copy(cmsg, &p->fp); // passes the address of the scm_fp_list pointer; scm_fp_copy() (not shown) stores the user-supplied file descriptors into the fp array
			// ...... (elided) jump to error on failure
			break;
		case SCM_CREDENTIALS:
		// ...... not relevant to this vulnerability
		}
	}

	/* An fd list that ended up holding no files is freed and cleared. */
	if (p->fp && !p->fp->count)
	{
		kfree(p->fp);
		p->fp = NULL;
	}
	return 0;

// ...... error path omitted: not relevant here
}

通过该函数会将用户态传入的文件描述符解析至unix_stream_sendmsg函数中的struct scm_cookie scm;变量中。

/*
 * Initialise the AF_UNIX control block of @skb from @scm.
 *
 * Copies pid/uid/gid into UNIXCB(skb), and - only when @send_fds is true and
 * @scm carries a file list - attaches the files through unix_attach_fds().
 * Also installs unix_destruct_scm() as the skb destructor.
 *
 * Returns 0, or the error reported by unix_attach_fds().
 */
static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
{
	int err = 0;

	UNIXCB(skb).pid  = get_pid(scm->pid);
	UNIXCB(skb).uid = scm->creds.uid;
	UNIXCB(skb).gid = scm->creds.gid;
	/* Start with no files attached; unix_attach_fds() fills this in. */
	UNIXCB(skb).fp = NULL;
	unix_get_secdata(scm, skb);
	if (scm->fp && send_fds)
		err = unix_attach_fds(scm, skb);

	/* The destructor is installed even if attaching the files failed. */
	skb->destructor = unix_destruct_scm;
	return err;
}

这里对skb进行初始化，会通过unix_attach_fds将struct file写到skb上。

/*
 * Attach the files listed in @scm to @skb and mark each of them in flight.
 *
 * Returns 0 on success, -ETOOMANYREFS when too_many_unix_fds() rejects the
 * current task, or -ENOMEM when the file list cannot be duplicated.
 */
int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	int i;

	if (too_many_unix_fds(current))
		return -ETOOMANYREFS;

	/* The skb gets its own copy of the list; scm_fp_dup() takes a file reference per entry. */
	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
	if (!UNIXCB(skb).fp)
		return -ENOMEM;

	/* Account every passed file as in flight. */
	for (i = scm->fp->count - 1; i >= 0; i--)
		unix_inflight(scm->fp->user, scm->fp->fp[i]);
	return 0;
}

在该函数中会通过scm_fp_dup函数增加该文件的引用计数，通过unix_inflight函数增加要传输文件的飞行计数。

接收文件描述符

/*
 * Receive path for stream AF_UNIX sockets (heavily abridged: declarations
 * of last, skb, flags, chunk, size, copied and err, and several steps marked
 * "......", were elided from this excerpt).
 *
 * Without MSG_PEEK the fd list is detached from the skb and a fully consumed
 * skb is unlinked and released; with MSG_PEEK the fd list is only duplicated
 * and the skb stays on the receive queue.
 */
static int unix_stream_read_generic(struct unix_stream_read_state *state,
				    bool freezable)
{
	struct scm_cookie scm;
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	// ......
  
	do {
		// ......
		last = skb = skb_peek(&sk->sk_receive_queue);
		// ...... (elided) handling for the case where no skb is found
		// ...... (elided) skip ahead to the next skb when the data length exceeds the skb length
		// ...... (elided) credential handling

		if (!(flags & MSG_PEEK)) { // normal (non-PEEK) receive
			UNIXCB(skb).consumed += chunk;

			sk_peek_offset_bwd(sk, chunk);

			if (UNIXCB(skb).fp) // the skb carries file descriptors
				unix_detach_fds(&scm, skb); // move the fd list from the skb into scm

			if (unix_skb_len(skb))
				break;

			skb_unlink(skb, &sk->sk_receive_queue); // remove the skb from the receive queue
			consume_skb(skb); // release the skb

			if (scm.fp) // stop looping once an fd list has been obtained
				break;
		} else { // MSG_PEEK receive
			if (UNIXCB(skb).fp) // the skb carries file descriptors
				scm.fp = scm_fp_dup(UNIXCB(skb).fp); // duplicate the fd list; the skb keeps its own copy

			sk_peek_offset_fwd(sk, chunk);

			if (UNIXCB(skb).fp) // stop here if file descriptors were present
				break;

			// ......
			break;
		}
	} while (size);

	mutex_unlock(&u->iolock);
	if (state->msg) // a msghdr was supplied
		scm_recv(sock, state->msg, &scm, flags); // hand the scm data over to the message
	else
		scm_destroy(&scm);
out:
	return copied ? : err;
}

接收消息的大致流程如上，主要分为PEEK模式和非PEEK模式的正常接收。

不带PEEK调用recvmsg

/*
 * Move the fd list from @skb into @scm and drop the in-flight accounting
 * for every file in it.  The caller must ensure UNIXCB(skb).fp is non-NULL:
 * the list is dereferenced unconditionally.
 */
void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	int i;

	/* Ownership of the list passes from the skb to scm. */
	scm->fp = UNIXCB(skb).fp;
	UNIXCB(skb).fp = NULL;

	/* One unix_notinflight() call per passed file. */
	for (i = scm->fp->count-1; i >= 0; i--)
		unix_notinflight(scm->fp->user, scm->fp->fp[i]);
}

分离文件描述符的函数内部实现如上：首先将 skb 上的 fp 赋值到 scm 中，然后清空 skb 上的 fp 指针，最后通过下面的 unix_notinflight 函数减少飞行计数。

/*
 * Drop one in-flight reference for the file @fp, under unix_gc_lock.
 *
 * If @fp is a unix socket (unix_get_socket() returns non-NULL), its inflight
 * counter and the global unix_tot_inflight are decremented, and the socket
 * is removed from its list when the counter reaches zero.  The per-user
 * counter @user->unix_inflight is decremented in every case.
 */
void unix_notinflight(struct user_struct *user, struct file *fp)
{
	struct sock *s = unix_get_socket(fp);

	/* Taking unix_gc_lock here serialises this path with unix_gc(). */
	spin_lock(&unix_gc_lock);

	if (s) {
		struct unix_sock *u = unix_sk(s);

		/* The socket must currently be counted and linked on a list. */
		BUG_ON(!atomic_long_read(&u->inflight));
		BUG_ON(list_empty(&u->link));

		if (atomic_long_dec_and_test(&u->inflight)) // [1] decrement the inflight count
			list_del_init(&u->link);
		/* Paired with READ_ONCE() in wait_for_unix_gc() */
		WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1);
	}
	user->unix_inflight--;
	spin_unlock(&unix_gc_lock);
}

在减去飞行计数的时候会加上gc锁，在[1]处会减少飞行计数。

/*
 * Release the caller's use of @skb; the buffer is only freed when
 * skb_unref() returns true.
 */
void consume_skb(struct sk_buff *skb)
{
	/* NOTE(review): presumably skb_unref() returns true only for the last reference - confirm. */
	if (!skb_unref(skb))
		return;

	trace_consume_skb(skb);
	__kfree_skb(skb);
}

进入 __kfree_skb 之后会走以下调用链：__kfree_skb => skb_release_all => skb_release_head_state => skb->destructor，这里调用的析构函数实际是 unix_destruct_scm。

/*
 * skb destructor installed by unix_scm_to_skb().
 *
 * Rebuilds a temporary scm_cookie from the skb's control block, detaches any
 * fd list still attached to the skb, and destroys the cookie.
 */
void unix_destruct_scm(struct sk_buff *skb)
{
	struct scm_cookie scm;

	memset(&scm, 0, sizeof(scm));
	scm.pid  = UNIXCB(skb).pid;
	/* Only reached with a non-NULL fp if nobody detached the files earlier. */
	if (UNIXCB(skb).fp)
		unix_detach_fds(&scm, skb);

	/* Alas, it calls VFS */
	/* So fscking what? fput() had been SMP-safe since the last Summer */
	scm_destroy(&scm);
	sock_wfree(skb);
}

在此处如果 skb 上仍然带有文件描述符，则会再一次调用 unix_detach_fds 函数。

带PEEK调用recvmsg

可以发现的是，在带PEEK模式下的recvmsg是直接通过scm_fp_dup拿到文件描述符。

/*
 * Duplicate an fd list.
 *
 * Copies the header and the first fpl->count entries of @fpl into a new
 * allocation, takes an extra reference on every file and on the user, and
 * sets the copy's max to its count.
 *
 * Returns the new list, or NULL if @fpl is NULL or the allocation fails.
 */
struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
{
	struct scm_fp_list *new_fpl;
	int i;

	if (!fpl)
		return NULL;

	/* Copy only the used part of the fp[] array. */
	new_fpl = kmemdup(fpl, offsetof(struct scm_fp_list, fp[fpl->count]),
			  GFP_KERNEL);
	if (new_fpl) {
		/* Each file gains one reference for the new list. */
		for (i = 0; i < fpl->count; i++)
			get_file(fpl->fp[i]);
		new_fpl->max = new_fpl->count;
		new_fpl->user = get_uid(fpl->user);
	}
	return new_fpl;
}

分配scm_fp_list，然后通过get_file函数增加文件的引用计数最后返回scm_fp_list。

并且从前面的recvmsg函数来看，当带PEEK模式时不会销毁skb并且会拿到一个新的文件描述符

垃圾回收

垃圾回收的主要目的是为了清除内存中无法主动删除掉的内存块，即以下情况：

Socket A ==发送A==> Socket B

Socket B ==发送B==> Socket A

close(A);

close(B);

状态如下：

Socket A Socket B

ref = 1 ref = 1

inflight = 1 inflight = 1

在上述情况中，既没有用户态对Socket A的引用，也没有对Socket B的引用，二者仅剩的引用都来自对方接收队列中的飞行消息，导致形成了一个不可破循环，面对此类情况就应当进行gc。

/*
 * Garbage collector for in-flight unix sockets (abridged: the step marked
 * "......" was elided from this excerpt; gc_candidates, gc_inflight_list
 * and gc_in_progress are defined outside it).
 *
 * Phases visible here:
 *   1. sockets whose file count equals their inflight count become candidates;
 *   2. inflight counts are decremented for candidates found on the receive
 *      queues of candidates;
 *   3. candidates whose count is still positive are moved to not_cycle_list
 *      and the counts reachable from them are restored;
 *   4. counts of the remaining candidates are restored while the skbs that
 *      reference them are collected on hitlist;
 *   5. hitlist is purged with unix_gc_lock released.
 */
void unix_gc(void)
{
	struct unix_sock *u;
	struct unix_sock *next;
	struct sk_buff_head hitlist;
	struct list_head cursor;
	LIST_HEAD(not_cycle_list);

	spin_lock(&unix_gc_lock); // take the gc lock
	// ...... (elided) bail out if a gc run is already in progress, otherwise mark gc as in progress

	list_for_each_entry_safe(u, next, &gc_inflight_list, link) { // walk every socket that is in flight
		long total_refs;
		long inflight_refs;

		total_refs = file_count(u->sk.sk_socket->file); // file reference count
		inflight_refs = atomic_long_read(&u->inflight); // inflight count

		BUG_ON(inflight_refs < 1);
		BUG_ON(total_refs < inflight_refs);
		if (total_refs == inflight_refs) { // all references come from being in flight: add the sock to the gc candidate list
			list_move_tail(&u->link, &gc_candidates);
			__set_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
			__set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
		}
	}

	list_for_each_entry(u, &gc_candidates, link) // walk the whole candidate list
		scan_children(&u->sk, dec_inflight, NULL); // decrement the inflight count of candidates queued on this candidate

	/* cursor marks the position in gc_candidates while entries are moved away. */
	list_add(&cursor, &gc_candidates);
	while (cursor.next != &gc_candidates) {
		u = list_entry(cursor.next, struct unix_sock, link);

		list_move(&cursor, &u->link);

		if (atomic_long_read(&u->inflight) > 0) { // still positive after the decrement: referenced from outside, so this sock must not be collected
			list_move_tail(&u->link, &not_cycle_list);
			__clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
			scan_children(&u->sk, inc_inflight_move_tail, NULL); // restore the inflight counts
		}
	}
	list_del(&cursor);

	skb_queue_head_init(&hitlist);
	list_for_each_entry(u, &gc_candidates, link) // what remains are the socks to be collected
		scan_children(&u->sk, inc_inflight, &hitlist); // restore their inflight counts and move the matching skbs onto hitlist

	/* Put the survivors back on the global inflight list. */
	while (!list_empty(&not_cycle_list)) {
		u = list_entry(not_cycle_list.next, struct unix_sock, link);
		__clear_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
		list_move_tail(&u->link, &gc_inflight_list);
	}

	/* The purge below runs without unix_gc_lock held. */
	spin_unlock(&unix_gc_lock);

	__skb_queue_purge(&hitlist); // free the skbs collected on hitlist

	spin_lock(&unix_gc_lock);

	BUG_ON(!list_empty(&gc_candidates));
	WRITE_ONCE(gc_in_progress, false);
	wake_up(&unix_gc_wait);

 out:
	spin_unlock(&unix_gc_lock);
}

所以这里的垃圾回收机制就是：

如果引用计数与飞行计数一致则进入gc的候选列表。
遍历候选列表中的sock，对其接收列表中的sock进行飞行计数减一
如果飞行计数大于0则移除gc的候选列表并恢复其飞行计数
恢复gc候选列表中剩余sock的飞行计数，同时将引用它们的skb转移到hitlist链表，最后释放hitlist中的skb

/*
 * Apply @func to the gc candidates referenced from @x.
 *
 * For a socket that is not in TCP_LISTEN state this is just scan_inflight()
 * on @x.  For a listening socket, the socks owning the skbs on its receive
 * queue are first gathered on a temporary list, then each of them is scanned
 * with scan_inflight().  @hitlist is passed through unchanged.
 */
static void scan_children(struct sock *x, void (*func)(struct unix_sock *),
			  struct sk_buff_head *hitlist)
{
	if (x->sk_state != TCP_LISTEN) {
		scan_inflight(x, func, hitlist);
	} else {
		struct sk_buff *skb;
		struct sk_buff *next;
		struct unix_sock *u;
		LIST_HEAD(embryos);

		/* Collect the socks under the queue lock, scan them after dropping it. */
		spin_lock(&x->sk_receive_queue.lock);
		skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
			u = unix_sk(skb->sk);

			/* The sock must not already be linked on another list. */
			BUG_ON(!list_empty(&u->link));
			list_add_tail(&u->link, &embryos);
		}
		spin_unlock(&x->sk_receive_queue.lock);

		while (!list_empty(&embryos)) {
			u = list_entry(embryos.next, struct unix_sock, link);
			scan_inflight(&u->sk, func, hitlist);
			list_del_init(&u->link);
		}
	}
}

此处如果套接字不处于监听状态，则直接调用scan_inflight函数；否则会先将接收队列中每个skb所属的sock（skb->sk）添加到embryos列表中，再对列表中的每个sock调用scan_inflight函数。

/*
 * Walk the receive queue of @x under its queue lock and call @func on every
 * unix socket that is carried in an skb's fd list and has UNIX_GC_CANDIDATE
 * set.
 *
 * When @hitlist is non-NULL, every skb that referenced at least one such
 * candidate is unlinked from the receive queue and appended to @hitlist.
 */
static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
			  struct sk_buff_head *hitlist)
{
	struct sk_buff *skb;
	struct sk_buff *next;

	spin_lock(&x->sk_receive_queue.lock);
	skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
		/* Only skbs that still have an fd list attached are examined. */
		if (UNIXCB(skb).fp) {
			bool hit = false;
			int nfd = UNIXCB(skb).fp->count;
			struct file **fp = UNIXCB(skb).fp->fp;

			while (nfd--) {
				/* NULL means this file is not a unix socket. */
				struct sock *sk = unix_get_socket(*fp++);

				if (sk) {
					struct unix_sock *u = unix_sk(sk);

					if (test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) {
						hit = true;

						func(u);
					}
				}
			}
			if (hit && hitlist != NULL) {
				__skb_unlink(skb, &x->sk_receive_queue);
				__skb_queue_tail(hitlist, skb);
			}
		}
	}
	spin_unlock(&x->sk_receive_queue.lock);
}

此处又一次遍历套接字的接收队列，如果skb有文件描述符则会进入内部调用外部传入的函数（减去飞行计数或增加飞行计数等）。

/*
 * Empty @list: dequeue every skb and free it with kfree_skb().
 */
static inline void __skb_queue_purge(struct sk_buff_head *list)
{
	struct sk_buff *skb;
	/* __skb_dequeue() returns NULL once the list is empty. */
	while ((skb = __skb_dequeue(list)) != NULL)
		kfree_skb(skb);
}

最后通过该函数删除skb。

漏洞分析

此处的漏洞出现在使用带PEEK的模式调用recvmsg时没有与gc进行同步，所以在gc的过程中可以通过调用带PEEK的recvmsg达到条件竞争的效果。

触发UAF的流程如下：

Unix_gc	线程一：Socket A	线程二：Socket B	线程三：Socket C
	将套接字A发送到B并关闭A A：ref => 1; inflight => 1	将套接字B发送到A不关闭B 将套接字B发送到C并关闭B B：ref => 2; inflight => 2
触发GC 将A和B添加进GC候选列表
			对套接字C使用带PEEK的recvmsg接收套接字B B：ref => 3; inflight => 2
对候选列表中的飞行计数减一 A：ref => 1; inflight => 0 B：ref => 3; inflight => 1
			对套接字B使用不带PEEK的recvmsg A：ref => 1; inflight => 1 UNIXCB(skb).fp = NULL; 此处会因为unix_notinflight阻塞
因为B的飞行计数大于0 所以会恢复该消息且恢复其接收列表的飞行计数但因为UNIXCB(skb).fp = NULL;会导致A的飞行计数依旧为0
			对套接字B使用带PEEK的recvmsg skb = skb_peek(&sk->sk_receive_queue);拿到skb
gc释放skb
			state->recv_actor(skb, skip, chunk, state);达到UAF

总的来说，这里的触发漏洞还是比较复杂的，然后在此处增加时间窗口只需要创建多个类似于A <==> B ==> C这样的形式即可。

patch分析

---
 net/unix/af_unix.c |   51 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 49 insertions(+), 2 deletions(-)

--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1507,6 +1507,53 @@ out:
 	return err;
 }

+static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
+{
+	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
+
+	/*
+	 * Garbage collection of unix sockets starts by selecting a set of
+	 * candidate sockets which have reference only from being in flight
+	 * (total_refs == inflight_refs).  This condition is checked once during
+	 * the candidate collection phase, and candidates are marked as such, so
+	 * that non-candidates can later be ignored.  While inflight_refs is
+	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
+	 * is an instantaneous decision.
+	 *
+	 * Once a candidate, however, the socket must not be reinstalled into a
+	 * file descriptor while the garbage collection is in progress.
+	 *
+	 * If the above conditions are met, then the directed graph of
+	 * candidates (*) does not change while unix_gc_lock is held.
+	 *
+	 * Any operations that changes the file count through file descriptors
+	 * (dup, close, sendmsg) does not change the graph since candidates are
+	 * not installed in fds.
+	 *
+	 * Dequeing a candidate via recvmsg would install it into an fd, but
+	 * that takes unix_gc_lock to decrement the inflight count, so it's
+	 * serialized with garbage collection.
+	 *
+	 * MSG_PEEK is special in that it does not change the inflight count,
+	 * yet does install the socket into an fd.  The following lock/unlock
+	 * pair is to ensure serialization with garbage collection.  It must be
+	 * done between incrementing the file count and installing the file into
+	 * an fd.
+	 *
+	 * If garbage collection starts after the barrier provided by the
+	 * lock/unlock, then it will see the elevated refcount and not mark this
+	 * as a candidate.  If a garbage collection is already in progress
+	 * before the file count was incremented, then the lock/unlock pair will
+	 * ensure that garbage collection is finished before progressing to
+	 * installing the fd.
+	 *
+	 * (*) A -> B where B is on the queue of A or B is on the queue of C
+	 * which is on the queue of listening socket A.
+	 */
+	spin_lock(&unix_gc_lock);
+	spin_unlock(&unix_gc_lock);
+}
+
 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
 {
 	int err = 0;
@@ -2140,7 +2187,7 @@ static int unix_dgram_recvmsg(struct soc
 		sk_peek_offset_fwd(sk, size);

 		if (UNIXCB(skb).fp)
-			scm.fp = scm_fp_dup(UNIXCB(skb).fp);
+			unix_peek_fds(&scm, skb);
 	}
 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

@@ -2385,7 +2432,7 @@ unlock:
 			/* It is questionable, see note in unix_dgram_recvmsg.
 			 */
 			if (UNIXCB(skb).fp)
-				scm.fp = scm_fp_dup(UNIXCB(skb).fp);
+				unix_peek_fds(&scm, skb);

 			sk_peek_offset_fwd(sk, chunk);

针对该漏洞的修复是：在带PEEK的接收路径中，通过 scm_fp_dup 将文件描述符设置到 scm 之后，紧接着获取并立即释放一次 gc 锁（unix_gc_lock），从而使 MSG_PEEK 操作与垃圾回收过程串行化。