Racing_against_the_clock--hitting_a_tiny_kernel_race_window前序
简介 该技术的核心在于展示了如何利用一个理论上几乎不可能利用的、时间窗口极短的条件竞争漏洞。该方法是一种利用硬件特性(系统管理中断,SMI)来人为“拉长”这个时间窗口的创新技术。
前情提要:CVE-2021-0920 本文介绍的技术,是在研究该漏洞的基础上发现另一个漏洞时提出的,因此为了便于理解,这里先对 CVE-2021-0920 进行一次分析。
后续分析的内核版本为:Linux v4.14.201
SCM_RIGHTS 消息 SCM_RIGHTS控制消息 :Linux 开发者可以使用 sendmsg 系统调用发送 SCM_RIGHTS 数据(参见https://man7.org/linux/man-pages/man7/unix.7.html)**将文件描述符 fd 从一个进程共享到另一个进程(该功能的本意是有权限打开文件的进程打开文件然后传递给没权限打开的进程使用,和dup2()不同的是,传递到对端的文件描述符数字和本端不一样)**。当sender进程将文件描述符传递给另一个进程receiver时,SCM_RIGHTS 将创建一个对 file 结构的引用(该 file 结构指针放在receiver端的 sock->sk_receive_queue 队列上),这样即使receiver进程尚未接收到文件描述符,sender也可以立即关闭该文件描述符。当文件描述符处于“queued” 排队状态(表示sender已发送并关闭了该 fd,但receiver尚未接收 fd 并取得所有权)时,需要进行专门的垃圾回收。关于内核如何跟踪这种“queued”状态,文章《io_uring, SCM_RIGHTS, and reference-count cycles》很好地解释了 SCM_RIGHTS 的引用计数和垃圾回收原理。
内存泄露问题 :如果两个文件描述符 FD1 和 FD2 都把自己发送给对方,在都没有接收到文件描述符的情况下关闭 FD1 和 FD2,那么这两个 file 结构都将永远无法释放,造成内存泄露。所以要引入Linux垃圾回收机制,采用 inflight 飞行计数来统计正在被发送的文件描述符,识别内核中的不可破循环,并释放不可破循环中的 file 对象 。
发送文件描述符 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 static int unix_stream_sendmsg (struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct sock *other = NULL ; int err, size; struct sk_buff *skb ; int sent = 0 ; struct scm_cookie scm ; bool fds_sent = false ; int data_len; wait_for_unix_gc(); err = scm_send(sock, msg, &scm, false ); err = -EOPNOTSUPP; if (msg->msg_namelen) { err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; goto out_err; } else { err = -ENOTCONN; other = unix_peer(sk); } while (sent < len) { size = len - sent; size = min_t (int , size, (sk->sk_sndbuf >> 1 ) - 64 ); size = min_t (int , size, SKB_MAX_HEAD(0 ) + UNIX_SKB_FRAGS_SZ); data_len = max_t (int , 0 , size - SKB_MAX_HEAD(0 )); data_len = min_t (size_t , size, PAGE_ALIGN(data_len)); skb = sock_alloc_send_pskb(sk, size - data_len, data_len, msg->msg_flags & MSG_DONTWAIT, &err, get_order(UNIX_SKB_FRAGS_SZ)); err = unix_scm_to_skb(&scm, skb, !fds_sent); fds_sent = true ; skb_put(skb, size - data_len); skb->data_len = data_len; skb->len = size; err = skb_copy_datagram_from_iter(skb, 0 , &msg->msg_iter, size); unix_state_lock(other); maybe_add_creds(skb, sock, other); skb_queue_tail(&other->sk_receive_queue, skb); unix_state_unlock(other); other->sk_data_ready(other); sent += size; } scm_destroy(&scm); return sent; }
上述是发送函数的所有逻辑,大致就是处理scm消息,将scm消息整合到skb中最终将skb添加到目标套接字的接收队列中。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 struct scm_fp_list { short count; short max; struct user_struct *user ; struct file *fp [SCM_MAX_FD ]; }; struct scm_cookie { struct pid *pid ; struct scm_fp_list *fp ; struct scm_creds creds ; #ifdef CONFIG_SECURITY_NETWORK u32 secid; #endif };
上述两个结构体在后续分析中会频繁用到,其中scm_fp_list->fp数组存储的是要发送的文件描述符所对应的struct file指针。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p){ struct cmsghdr *cmsg ; int err; for_each_cmsghdr(cmsg, msg) { err = -EINVAL; switch (cmsg->cmsg_type) { case SCM_RIGHTS: err=scm_fp_copy(cmsg, &p->fp); break ; case SCM_CREDENTIALS: } } if (p->fp && !p->fp->count) { kfree(p->fp); p->fp = NULL ; } return 0 ; }
通过该函数会将用户态传入的文件描述符解析至unix_stream_sendmsg函数中的struct scm_cookie scm;变量中。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 static int unix_scm_to_skb (struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) { int err = 0 ; UNIXCB(skb).pid = get_pid(scm->pid); UNIXCB(skb).uid = scm->creds.uid; UNIXCB(skb).gid = scm->creds.gid; UNIXCB(skb).fp = NULL ; unix_get_secdata(scm, skb); if (scm->fp && send_fds) err = unix_attach_fds(scm, skb); skb->destructor = unix_destruct_scm; return err; }
这里对skb进行初始化,会通过unix_attach_fds将struct file写到skb上。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 int unix_attach_fds (struct scm_cookie *scm, struct sk_buff *skb) { int i; if (too_many_unix_fds(current)) return -ETOOMANYREFS; UNIXCB(skb).fp = scm_fp_dup(scm->fp); if (!UNIXCB(skb).fp) return -ENOMEM; for (i = scm->fp->count - 1 ; i >= 0 ; i--) unix_inflight(scm->fp->user, scm->fp->fp[i]); return 0 ; }
在该函数中会通过scm_fp_dup函数增加该文件的引用计数,通过unix_inflight函数增加要传输文件的飞行计数。
接收文件描述符 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 static int unix_stream_read_generic (struct unix_stream_read_state *state, bool freezable) { struct scm_cookie scm ; struct socket *sock = state->socket; struct sock *sk = sock->sk; struct unix_sock *u = unix_sk(sk); do { last = skb = skb_peek(&sk->sk_receive_queue); if (!(flags & MSG_PEEK)) { UNIXCB(skb).consumed += chunk; sk_peek_offset_bwd(sk, chunk); if (UNIXCB(skb).fp) unix_detach_fds(&scm, skb); if (unix_skb_len(skb)) break ; skb_unlink(skb, &sk->sk_receive_queue); consume_skb(skb); if (scm.fp) break ; } else { if (UNIXCB(skb).fp) scm.fp = scm_fp_dup(UNIXCB(skb).fp); sk_peek_offset_fwd(sk, chunk); if (UNIXCB(skb).fp) break ; break ; } } while (size); mutex_unlock(&u->iolock); if (state->msg) scm_recv(sock, state->msg, &scm, flags); else scm_destroy(&scm); out: return copied ? : err; }
接收消息的大致流程如上,主要分为PEEK模式和非PEEK模式的正常接收。
不带PEEK调用recvmsg 1 2 3 4 5 6 7 8 9 10 void unix_detach_fds (struct scm_cookie *scm, struct sk_buff *skb) { int i; scm->fp = UNIXCB(skb).fp; UNIXCB(skb).fp = NULL ; for (i = scm->fp->count-1 ; i >= 0 ; i--) unix_notinflight(scm->fp->user, scm->fp->fp[i]); }
分离文件描述符的函数内部实现如上:首先将skb上的fp(文件列表)赋值到scm中,然后清空skb上的fp指针,最后通过unix_notinflight逐个减少飞行计数。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 void unix_notinflight (struct user_struct *user, struct file *fp) { struct sock *s = unix_get_socket(fp); spin_lock(&unix_gc_lock); if (s) { struct unix_sock *u = unix_sk(s); BUG_ON(!atomic_long_read(&u->inflight)); BUG_ON(list_empty(&u->link)); if (atomic_long_dec_and_test(&u->inflight)) list_del_init(&u->link); WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1 ); } user->unix_inflight--; spin_unlock(&unix_gc_lock); }
在减去飞行计数的时候会先加上gc锁(unix_gc_lock),然后在atomic_long_dec_and_test(&u->inflight)处减少飞行计数;若计数减到0,则将该sock从所在链表中移除。
1 2 3 4 5 6 7 8 void consume_skb (struct sk_buff *skb) { if (!skb_unref(skb)) return ; trace_consume_skb(skb); __kfree_skb(skb); }
在进入到__kfree_skb之后会进入以下调用链:__kfree_skb=>skb_release_all=>skb_release_head_state=>skb->destructor,这里调用的析构函数实际是unix_destruct_scm。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 void unix_destruct_scm (struct sk_buff *skb) { struct scm_cookie scm ; memset (&scm, 0 , sizeof (scm)); scm.pid = UNIXCB(skb).pid; if (UNIXCB(skb).fp) unix_detach_fds(&scm, skb); scm_destroy(&scm); sock_wfree(skb); }
在此处,如果skb上仍然带有文件描述符,则会再一次调用unix_detach_fds函数。
带PEEK调用recvmsg 可以发现的是,在带PEEK模式下的recvmsg是直接通过scm_fp_dup拿到文件描述符。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 struct scm_fp_list *scm_fp_dup (struct scm_fp_list *fpl) { struct scm_fp_list *new_fpl ; int i; if (!fpl) return NULL ; new_fpl = kmemdup(fpl, offsetof(struct scm_fp_list, fp[fpl->count]), GFP_KERNEL); if (new_fpl) { for (i = 0 ; i < fpl->count; i++) get_file(fpl->fp[i]); new_fpl->max = new_fpl->count; new_fpl->user = get_uid(fpl->user); } return new_fpl; }
分配scm_fp_list,然后通过get_file函数增加文件的引用计数最后返回scm_fp_list。
并且从前面的recvmsg函数来看,当带PEEK模式时不会销毁skb并且会拿到一个新的文件描述符
垃圾回收 垃圾回收的主要目的是为了清除内存中无法主动删除掉的内存块,即以下情况:
Socket A ==发送A==> Socket B
Socket B ==发送B==> Socket A
close(A);
close(B);
状态如下:
Socket A Socket B
ref = 1 ref = 1
inflight = 1 inflight = 1
在上述情况中,用户态既没有对Socket A的引用,也没有对Socket B的引用,二者仅靠对方接收队列中的飞行引用存活,形成了一个不可破循环,面对此类情况就应当进行gc。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 void unix_gc (void ) { struct unix_sock *u ; struct unix_sock *next ; struct sk_buff_head hitlist ; struct list_head cursor ; LIST_HEAD(not_cycle_list); spin_lock(&unix_gc_lock); list_for_each_entry_safe(u, next, &gc_inflight_list, link) { long total_refs; long inflight_refs; total_refs = file_count(u->sk.sk_socket->file); inflight_refs = atomic_long_read(&u->inflight); BUG_ON(inflight_refs < 1 ); BUG_ON(total_refs < inflight_refs); if (total_refs == inflight_refs) { list_move_tail(&u->link, &gc_candidates); __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags); __set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); } } list_for_each_entry(u, &gc_candidates, link) scan_children(&u->sk, dec_inflight, NULL ); list_add(&cursor, &gc_candidates); while (cursor.next != &gc_candidates) { u = list_entry(cursor.next, struct unix_sock, link); list_move(&cursor, &u->link); if (atomic_long_read(&u->inflight) > 0 ) { list_move_tail(&u->link, &not_cycle_list); __clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); scan_children(&u->sk, inc_inflight_move_tail, NULL ); } } list_del(&cursor); skb_queue_head_init(&hitlist); list_for_each_entry(u, &gc_candidates, link) scan_children(&u->sk, inc_inflight, &hitlist); while (!list_empty(&not_cycle_list)) { u = list_entry(not_cycle_list.next, struct unix_sock, link); __clear_bit(UNIX_GC_CANDIDATE, &u->gc_flags); list_move_tail(&u->link, &gc_inflight_list); } spin_unlock(&unix_gc_lock); __skb_queue_purge(&hitlist); spin_lock(&unix_gc_lock); BUG_ON(!list_empty(&gc_candidates)); WRITE_ONCE(gc_in_progress, false ); wake_up(&unix_gc_wait); out: spin_unlock(&unix_gc_lock); }
所以这里的垃圾回收机制就是:
如果sock的引用计数与飞行计数一致,则将其加入gc的候选列表。
遍历候选列表中的sock,对其接收队列中携带的、同为候选的sock进行飞行计数减一。
如果某个候选sock的飞行计数仍大于0,则将其移出gc的候选列表,并恢复其接收队列中sock的飞行计数。
恢复仍留在gc候选列表中的sock的飞行计数,同时将命中的skb移入hitlist,最后通过__skb_queue_purge释放这些skb。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 static void scan_children (struct sock *x, void (*func)(struct unix_sock *), struct sk_buff_head *hitlist) { if (x->sk_state != TCP_LISTEN) { scan_inflight(x, func, hitlist); } else { struct sk_buff *skb; struct sk_buff *next ; struct unix_sock *u ; LIST_HEAD(embryos); spin_lock(&x->sk_receive_queue.lock); skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { u = unix_sk(skb->sk); BUG_ON(!list_empty(&u->link)); list_add_tail(&u->link, &embryos); } spin_unlock(&x->sk_receive_queue.lock); while (!list_empty(&embryos)) { u = list_entry(embryos.next, struct unix_sock, link); scan_inflight(&u->sk, func, hitlist); list_del_init(&u->link); } } }
如果套接字不处于监听状态(TCP_LISTEN),则直接对其调用scan_inflight;否则先将接收队列中各skb对应的sock(skb->sk)添加到embryos列表中,再对列表中的每个sock调用scan_inflight函数。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 static void scan_inflight (struct sock *x, void (*func)(struct unix_sock *), struct sk_buff_head *hitlist) { struct sk_buff *skb ; struct sk_buff *next ; spin_lock(&x->sk_receive_queue.lock); skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { if (UNIXCB(skb).fp) { bool hit = false ; int nfd = UNIXCB(skb).fp->count; struct file **fp = UNIXCB(skb).fp->fp; while (nfd--) { struct sock *sk = unix_get_socket(*fp++); if (sk) { struct unix_sock *u = unix_sk(sk); if (test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) { hit = true ; func(u); } } } if (hit && hitlist != NULL ) { __skb_unlink(skb, &x->sk_receive_queue); __skb_queue_tail(hitlist, skb); } } } spin_unlock(&x->sk_receive_queue.lock); }
此处又一次遍历套接字的接收队列,如果skb有文件描述符则会进入内部调用外部传入的函数(减去飞行计数或增加飞行计数等)。
1 2 3 4 5 6 static inline void __skb_queue_purge(struct sk_buff_head *list ){ struct sk_buff *skb ; while ((skb = __skb_dequeue(list )) != NULL ) kfree_skb(skb); }
最后通过该函数删除skb。
漏洞分析 此处的漏洞出现在使用带PEEK的模式调用recvmsg时没有与gc进行同步,所以在gc的过程中可以通过调用带PEEK的recvmsg达到条件竞争的效果。
触发UAF的流程如下:
Unix_gc
线程一:Socket A
线程二:Socket B
线程三:Socket C
将套接字A发送到B并关闭A A:ref => 1; inflight => 1
将套接字B发送到A不关闭B 将套接字B发送到C并关闭B B:ref => 2; inflight => 2
触发GC 将A和B添加进GC候选列表
对套接字C使用带PEEK的recvmsg接收套接字B B:ref => 3; inflight => 2
对候选列表中的飞行计数减一 A:ref => 1; inflight => 0 B:ref => 3; inflight => 1
对套接字B使用不带PEEK的recvmsg A:ref => 1; inflight => 1 UNIXCB(skb).fp = NULL; 此处会因为unix_notinflight阻塞
因为B的飞行计数大于0 所以会恢复该消息 且恢复其接收列表的飞行计数 但因为UNIXCB(skb).fp = NULL;会导致A的飞行计数依旧为0
对套接字B使用带PEEK的recvmsg skb = skb_peek(&sk->sk_receive_queue);拿到skb
gc释放skb
state->recv_actor(skb, skip, chunk, state);达到UAF
总的来说,触发该漏洞的过程还是比较复杂的;若要增大此处的竞争时间窗口,只需要创建多组类似于A <==> B ==> C这样的结构即可。
patch分析 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 net/unix/af_unix.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) @@ -1507,6 +1507,53 @@ out: return err; } +static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) +{ + scm->fp = scm_fp_dup(UNIXCB(skb).fp); + + /* + * Garbage collection of unix sockets starts by selecting a set of + * candidate sockets which have reference only from being in flight + * (total_refs == inflight_refs). This condition is checked once during + * the candidate collection phase, and candidates are marked as such, so + * that non-candidates can later be ignored. While inflight_refs is + * protected by unix_gc_lock, total_refs (file count) is not, hence this + * is an instantaneous decision. + * + * Once a candidate, however, the socket must not be reinstalled into a + * file descriptor while the garbage collection is in progress. + * + * If the above conditions are met, then the directed graph of + * candidates (*) does not change while unix_gc_lock is held. + * + * Any operations that changes the file count through file descriptors + * (dup, close, sendmsg) does not change the graph since candidates are + * not installed in fds. + * + * Dequeing a candidate via recvmsg would install it into an fd, but + * that takes unix_gc_lock to decrement the inflight count, so it's + * serialized with garbage collection. + * + * MSG_PEEK is special in that it does not change the inflight count, + * yet does install the socket into an fd. The following lock/unlock + * pair is to ensure serialization with garbage collection. It must be + * done between incrementing the file count and installing the file into + * an fd. 
+ * + * If garbage collection starts after the barrier provided by the + * lock/unlock, then it will see the elevated refcount and not mark this + * as a candidate. If a garbage collection is already in progress + * before the file count was incremented, then the lock/unlock pair will + * ensure that garbage collection is finished before progressing to + * installing the fd. + * + * (*) A -> B where B is on the queue of A or B is on the queue of C + * which is on the queue of listening socket A. + */ + spin_lock(&unix_gc_lock); + spin_unlock(&unix_gc_lock); +} + static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) { int err = 0; @@ -2140,7 +2187,7 @@ static int unix_dgram_recvmsg(struct soc sk_peek_offset_fwd(sk, size); if (UNIXCB(skb).fp) - scm.fp = scm_fp_dup(UNIXCB(skb).fp); + unix_peek_fds(&scm, skb); } err = (flags & MSG_TRUNC) ? skb->len - skip : size; @@ -2385,7 +2432,7 @@ unlock: /* It is questionable, see note in unix_dgram_recvmsg. */ if (UNIXCB(skb).fp) - scm.fp = scm_fp_dup(UNIXCB(skb).fp); + unix_peek_fds(&scm, skb); sk_peek_offset_fwd(sk, chunk);
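针对该漏洞的修复是新增unix_peek_fds函数:在MSG_PEEK路径上通过scm_fp_dup将文件描述符设置到scm之后,紧接着对unix_gc_lock执行一次加锁/解锁(spin_lock/spin_unlock),从而使带PEEK的recvmsg与垃圾回收过程串行化。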