// SPDX-License-Identifier: GPL-2.0-only /* * COW (Copy On Write) tests. * * Copyright 2022, Red Hat, Inc. * * Author(s): David Hildenbrand */ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #include #include "local_config.h" #ifdef LOCAL_CONFIG_HAVE_LIBURING #include #endif /* LOCAL_CONFIG_HAVE_LIBURING */ #include "../../../../mm/gup_test.h" #include "../kselftest.h" #include "vm_util.h" #include "thp_settings.h" static size_t pagesize; static int pagemap_fd; static size_t pmdsize; static int nr_thpsizes; static size_t thpsizes[20]; static int nr_hugetlbsizes; static size_t hugetlbsizes[10]; static int gup_fd; static bool has_huge_zeropage; static int sz2ord(size_t size) { return __builtin_ctzll(size / pagesize); } static int detect_thp_sizes(size_t sizes[], int max) { int count = 0; unsigned long orders; size_t kb; int i; /* thp not supported at all. */ if (!pmdsize) return 0; orders = 1UL << sz2ord(pmdsize); orders |= thp_supported_orders(); for (i = 0; orders && count < max; i++) { if (!(orders & (1UL << i))) continue; orders &= ~(1UL << i); kb = (pagesize >> 10) << i; sizes[count++] = kb * 1024; ksft_print_msg("[INFO] detected THP size: %zu KiB\n", kb); } return count; } static void detect_huge_zeropage(void) { int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page", O_RDONLY); size_t enabled = 0; char buf[15]; int ret; if (fd < 0) return; ret = pread(fd, buf, sizeof(buf), 0); if (ret > 0 && ret < sizeof(buf)) { buf[ret] = 0; enabled = strtoul(buf, NULL, 10); if (enabled == 1) { has_huge_zeropage = true; ksft_print_msg("[INFO] huge zeropage is enabled\n"); } } close(fd); } static bool range_is_swapped(void *addr, size_t size) { for (; size; addr += pagesize, size -= pagesize) if (!pagemap_is_swapped(pagemap_fd, addr)) return false; return true; } struct comm_pipes { int child_ready[2]; int parent_ready[2]; }; static int setup_comm_pipes(struct comm_pipes *comm_pipes) { if (pipe(comm_pipes->child_ready) < 0) return -errno; if (pipe(comm_pipes->parent_ready) < 0) { close(comm_pipes->child_ready[0]); close(comm_pipes->child_ready[1]); return -errno; } return 0; } static void close_comm_pipes(struct comm_pipes *comm_pipes) { close(comm_pipes->child_ready[0]); close(comm_pipes->child_ready[1]); close(comm_pipes->parent_ready[0]); close(comm_pipes->parent_ready[1]); } static int child_memcmp_fn(char *mem, size_t size, struct comm_pipes *comm_pipes) { char *old = malloc(size); char buf; /* Backup the original content. */ memcpy(old, mem, size); /* Wait until the parent modified the page. */ write(comm_pipes->child_ready[1], "0", 1); while (read(comm_pipes->parent_ready[0], &buf, 1) != 1) ; /* See if we still read the old values. */ return memcmp(old, mem, size); } static int child_vmsplice_memcmp_fn(char *mem, size_t size, struct comm_pipes *comm_pipes) { struct iovec iov = { .iov_base = mem, .iov_len = size, }; ssize_t cur, total, transferred; char *old, *new; int fds[2]; char buf; old = malloc(size); new = malloc(size); /* Backup the original content. */ memcpy(old, mem, size); if (pipe(fds) < 0) return -errno; /* Trigger a read-only pin. */ transferred = vmsplice(fds[1], &iov, 1, 0); if (transferred < 0) return -errno; if (transferred == 0) return -EINVAL; /* Unmap it from our page tables. */ if (munmap(mem, size) < 0) return -errno; /* Wait until the parent modified it. */ write(comm_pipes->child_ready[1], "0", 1); while (read(comm_pipes->parent_ready[0], &buf, 1) != 1) ; /* See if we still read the old values via the pipe. */ for (total = 0; total < transferred; total += cur) { cur = read(fds[0], new + total, transferred - total); if (cur < 0) return -errno; } return memcmp(old, new, transferred); } typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes); static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect, child_fn fn) { struct comm_pipes comm_pipes; char buf; int ret; ret = setup_comm_pipes(&comm_pipes); if (ret) { ksft_test_result_fail("pipe() failed\n"); return; } ret = fork(); if (ret < 0) { ksft_test_result_fail("fork() failed\n"); goto close_comm_pipes; } else if (!ret) { exit(fn(mem, size, &comm_pipes)); } while (read(comm_pipes.child_ready[0], &buf, 1) != 1) ; if (do_mprotect) { /* * mprotect() optimizations might try avoiding * write-faults by directly mapping pages writable. */ ret = mprotect(mem, size, PROT_READ); ret |= mprotect(mem, size, PROT_READ|PROT_WRITE); if (ret) { ksft_test_result_fail("mprotect() failed\n"); write(comm_pipes.parent_ready[1], "0", 1); wait(&ret); goto close_comm_pipes; } } /* Modify the page. */ memset(mem, 0xff, size); write(comm_pipes.parent_ready[1], "0", 1); wait(&ret); if (WIFEXITED(ret)) ret = WEXITSTATUS(ret); else ret = -EINVAL; ksft_test_result(!ret, "No leak from parent into child\n"); close_comm_pipes: close_comm_pipes(&comm_pipes); } static void test_cow_in_parent(char *mem, size_t size) { do_test_cow_in_parent(mem, size, false, child_memcmp_fn); } static void test_cow_in_parent_mprotect(char *mem, size_t size) { do_test_cow_in_parent(mem, size, true, child_memcmp_fn); } static void test_vmsplice_in_child(char *mem, size_t size) { do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn); } static void test_vmsplice_in_child_mprotect(char *mem, size_t size) { do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn); } static void do_test_vmsplice_in_parent(char *mem, size_t size, bool before_fork) { struct iovec iov = { .iov_base = mem, .iov_len = size, }; ssize_t cur, total, transferred; struct comm_pipes comm_pipes; char *old, *new; int ret, fds[2]; char buf; old = malloc(size); new = malloc(size); memcpy(old, mem, size); ret = setup_comm_pipes(&comm_pipes); if (ret) { ksft_test_result_fail("pipe() failed\n"); goto free; } if (pipe(fds) < 0) { ksft_test_result_fail("pipe() failed\n"); goto close_comm_pipes; } if (before_fork) { transferred = vmsplice(fds[1], &iov, 1, 0); if (transferred <= 0) { ksft_test_result_fail("vmsplice() failed\n"); goto close_pipe; } } ret = fork(); if (ret < 0) { ksft_test_result_fail("fork() failed\n"); goto close_pipe; } else if (!ret) { write(comm_pipes.child_ready[1], "0", 1); while (read(comm_pipes.parent_ready[0], &buf, 1) != 1) ; /* Modify page content in the child. */ memset(mem, 0xff, size); exit(0); } if (!before_fork) { transferred = vmsplice(fds[1], &iov, 1, 0); if (transferred <= 0) { ksft_test_result_fail("vmsplice() failed\n"); wait(&ret); goto close_pipe; } } while (read(comm_pipes.child_ready[0], &buf, 1) != 1) ; if (munmap(mem, size) < 0) { ksft_test_result_fail("munmap() failed\n"); goto close_pipe; } write(comm_pipes.parent_ready[1], "0", 1); /* Wait until the child is done writing. */ wait(&ret); if (!WIFEXITED(ret)) { ksft_test_result_fail("wait() failed\n"); goto close_pipe; } /* See if we still read the old values. */ for (total = 0; total < transferred; total += cur) { cur = read(fds[0], new + total, transferred - total); if (cur < 0) { ksft_test_result_fail("read() failed\n"); goto close_pipe; } } ksft_test_result(!memcmp(old, new, transferred), "No leak from child into parent\n"); close_pipe: close(fds[0]); close(fds[1]); close_comm_pipes: close_comm_pipes(&comm_pipes); free: free(old); free(new); } static void test_vmsplice_before_fork(char *mem, size_t size) { do_test_vmsplice_in_parent(mem, size, true); } static void test_vmsplice_after_fork(char *mem, size_t size) { do_test_vmsplice_in_parent(mem, size, false); } #ifdef LOCAL_CONFIG_HAVE_LIBURING static void do_test_iouring(char *mem, size_t size, bool use_fork) { struct comm_pipes comm_pipes; struct io_uring_cqe *cqe; struct io_uring_sqe *sqe; struct io_uring ring; ssize_t cur, total; struct iovec iov; char *buf, *tmp; int ret, fd; FILE *file; ret = setup_comm_pipes(&comm_pipes); if (ret) { ksft_test_result_fail("pipe() failed\n"); return; } file = tmpfile(); if (!file) { ksft_test_result_fail("tmpfile() failed\n"); goto close_comm_pipes; } fd = fileno(file); assert(fd); tmp = malloc(size); if (!tmp) { ksft_test_result_fail("malloc() failed\n"); goto close_file; } /* Skip on errors, as we might just lack kernel support. */ ret = io_uring_queue_init(1, &ring, 0); if (ret < 0) { ksft_test_result_skip("io_uring_queue_init() failed\n"); goto free_tmp; } /* * Register the range as a fixed buffer. This will FOLL_WRITE | FOLL_PIN * | FOLL_LONGTERM the range. * * Skip on errors, as we might just lack kernel support or might not * have sufficient MEMLOCK permissions. */ iov.iov_base = mem; iov.iov_len = size; ret = io_uring_register_buffers(&ring, &iov, 1); if (ret) { ksft_test_result_skip("io_uring_register_buffers() failed\n"); goto queue_exit; } if (use_fork) { /* * fork() and keep the child alive until we're done. Note that * we expect the pinned page to not get shared with the child. */ ret = fork(); if (ret < 0) { ksft_test_result_fail("fork() failed\n"); goto unregister_buffers; } else if (!ret) { write(comm_pipes.child_ready[1], "0", 1); while (read(comm_pipes.parent_ready[0], &buf, 1) != 1) ; exit(0); } while (read(comm_pipes.child_ready[0], &buf, 1) != 1) ; } else { /* * Map the page R/O into the page table. Enable softdirty * tracking to stop the page from getting mapped R/W immediately * again by mprotect() optimizations. Note that we don't have an * easy way to test if that worked (the pagemap does not export * if the page is mapped R/O vs. R/W). */ ret = mprotect(mem, size, PROT_READ); clear_softdirty(); ret |= mprotect(mem, size, PROT_READ | PROT_WRITE); if (ret) { ksft_test_result_fail("mprotect() failed\n"); goto unregister_buffers; } } /* * Modify the page and write page content as observed by the fixed * buffer pin to the file so we can verify it. */ memset(mem, 0xff, size); sqe = io_uring_get_sqe(&ring); if (!sqe) { ksft_test_result_fail("io_uring_get_sqe() failed\n"); goto quit_child; } io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0); ret = io_uring_submit(&ring); if (ret < 0) { ksft_test_result_fail("io_uring_submit() failed\n"); goto quit_child; } ret = io_uring_wait_cqe(&ring, &cqe); if (ret < 0) { ksft_test_result_fail("io_uring_wait_cqe() failed\n"); goto quit_child; } if (cqe->res != size) { ksft_test_result_fail("write_fixed failed\n"); goto quit_child; } io_uring_cqe_seen(&ring, cqe); /* Read back the file content to the temporary buffer. */ total = 0; while (total < size) { cur = pread(fd, tmp + total, size - total, total); if (cur < 0) { ksft_test_result_fail("pread() failed\n"); goto quit_child; } total += cur; } /* Finally, check if we read what we expected. */ ksft_test_result(!memcmp(mem, tmp, size), "Longterm R/W pin is reliable\n"); quit_child: if (use_fork) { write(comm_pipes.parent_ready[1], "0", 1); wait(&ret); } unregister_buffers: io_uring_unregister_buffers(&ring); queue_exit: io_uring_queue_exit(&ring); free_tmp: free(tmp); close_file: fclose(file); close_comm_pipes: close_comm_pipes(&comm_pipes); } static void test_iouring_ro(char *mem, size_t size) { do_test_iouring(mem, size, false); } static void test_iouring_fork(char *mem, size_t size) { do_test_iouring(mem, size, true); } #endif /* LOCAL_CONFIG_HAVE_LIBURING */ enum ro_pin_test { RO_PIN_TEST, RO_PIN_TEST_SHARED, RO_PIN_TEST_PREVIOUSLY_SHARED, RO_PIN_TEST_RO_EXCLUSIVE, }; static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test, bool fast) { struct pin_longterm_test args; struct comm_pipes comm_pipes; char *tmp, buf; __u64 tmp_val; int ret; if (gup_fd < 0) { ksft_test_result_skip("gup_test not available\n"); return; } tmp = malloc(size); if (!tmp) { ksft_test_result_fail("malloc() failed\n"); return; } ret = setup_comm_pipes(&comm_pipes); if (ret) { ksft_test_result_fail("pipe() failed\n"); goto free_tmp; } switch (test) { case RO_PIN_TEST: break; case RO_PIN_TEST_SHARED: case RO_PIN_TEST_PREVIOUSLY_SHARED: /* * Share the pages with our child. As the pages are not pinned, * this should just work. */ ret = fork(); if (ret < 0) { ksft_test_result_fail("fork() failed\n"); goto close_comm_pipes; } else if (!ret) { write(comm_pipes.child_ready[1], "0", 1); while (read(comm_pipes.parent_ready[0], &buf, 1) != 1) ; exit(0); } /* Wait until our child is ready. */ while (read(comm_pipes.child_ready[0], &buf, 1) != 1) ; if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) { /* * Tell the child to quit now and wait until it quit. * The pages should now be mapped R/O into our page * tables, but they are no longer shared. */ write(comm_pipes.parent_ready[1], "0", 1); wait(&ret); if (!WIFEXITED(ret)) ksft_print_msg("[INFO] wait() failed\n"); } break; case RO_PIN_TEST_RO_EXCLUSIVE: /* * Map the page R/O into the page table. Enable softdirty * tracking to stop the page from getting mapped R/W immediately * again by mprotect() optimizations. Note that we don't have an * easy way to test if that worked (the pagemap does not export * if the page is mapped R/O vs. R/W). */ ret = mprotect(mem, size, PROT_READ); clear_softdirty(); ret |= mprotect(mem, size, PROT_READ | PROT_WRITE); if (ret) { ksft_test_result_fail("mprotect() failed\n"); goto close_comm_pipes; } break; default: assert(false); } /* Take a R/O pin. This should trigger unsharing. */ args.addr = (__u64)(uintptr_t)mem; args.size = size; args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0; ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args); if (ret) { if (errno == EINVAL) ksft_test_result_skip("PIN_LONGTERM_TEST_START failed\n"); else ksft_test_result_fail("PIN_LONGTERM_TEST_START failed\n"); goto wait; } /* Modify the page. */ memset(mem, 0xff, size); /* * Read back the content via the pin to the temporary buffer and * test if we observed the modification. */ tmp_val = (__u64)(uintptr_t)tmp; ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val); if (ret) ksft_test_result_fail("PIN_LONGTERM_TEST_READ failed\n"); else ksft_test_result(!memcmp(mem, tmp, size), "Longterm R/O pin is reliable\n"); ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP); if (ret) ksft_print_msg("[INFO] PIN_LONGTERM_TEST_STOP failed\n"); wait: switch (test) { case RO_PIN_TEST_SHARED: write(comm_pipes.parent_ready[1], "0", 1); wait(&ret); if (!WIFEXITED(ret)) ksft_print_msg("[INFO] wait() failed\n"); break; default: break; } close_comm_pipes: close_comm_pipes(&comm_pipes); free_tmp: free(tmp); } static void test_ro_pin_on_shared(char *mem, size_t size) { do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false); } static void test_ro_fast_pin_on_shared(char *mem, size_t size) { do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true); } static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size) { do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false); } static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size) { do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true); } static void test_ro_pin_on_ro_exclusive(char *mem, size_t size) { do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false); } static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size) { do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true); } typedef void (*test_fn)(char *mem, size_t size); static void do_run_with_base_page(test_fn fn, bool swapout) { char *mem; int ret; mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (mem == MAP_FAILED) { ksft_test_result_fail("mmap() failed\n"); return; } ret = madvise(mem, pagesize, MADV_NOHUGEPAGE); /* Ignore if not around on a kernel. */ if (ret && errno != EINVAL) { ksft_test_result_fail("MADV_NOHUGEPAGE failed\n"); goto munmap; } /* Populate a base page. */ memset(mem, 0, pagesize); if (swapout) { madvise(mem, pagesize, MADV_PAGEOUT); if (!pagemap_is_swapped(pagemap_fd, mem)) { ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n"); goto munmap; } } fn(mem, pagesize); munmap: munmap(mem, pagesize); } static void run_with_base_page(test_fn fn, const char *desc) { ksft_print_msg("[RUN] %s ... with base page\n", desc); do_run_with_base_page(fn, false); } static void run_with_base_page_swap(test_fn fn, const char *desc) { ksft_print_msg("[RUN] %s ... with swapped out base page\n", desc); do_run_with_base_page(fn, true); } enum thp_run { THP_RUN_PMD, THP_RUN_PMD_SWAPOUT, THP_RUN_PTE, THP_RUN_PTE_SWAPOUT, THP_RUN_SINGLE_PTE, THP_RUN_SINGLE_PTE_SWAPOUT, THP_RUN_PARTIAL_MREMAP, THP_RUN_PARTIAL_SHARED, }; static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize) { char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED; size_t size, mmap_size, mremap_size; int ret; /* For alignment purposes, we need twice the thp size. */ mmap_size = 2 * thpsize; mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (mmap_mem == MAP_FAILED) { ksft_test_result_fail("mmap() failed\n"); return; } /* We need a THP-aligned memory area. */ mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1)); ret = madvise(mem, thpsize, MADV_HUGEPAGE); if (ret) { ksft_test_result_fail("MADV_HUGEPAGE failed\n"); goto munmap; } /* * Try to populate a THP. Touch the first sub-page and test if * we get the last sub-page populated automatically. */ mem[0] = 0; if (!pagemap_is_populated(pagemap_fd, mem + thpsize - pagesize)) { ksft_test_result_skip("Did not get a THP populated\n"); goto munmap; } memset(mem, 0, thpsize); size = thpsize; switch (thp_run) { case THP_RUN_PMD: case THP_RUN_PMD_SWAPOUT: assert(thpsize == pmdsize); break; case THP_RUN_PTE: case THP_RUN_PTE_SWAPOUT: /* * Trigger PTE-mapping the THP by temporarily mapping a single * subpage R/O. This is a noop if the THP is not pmdsize (and * therefore already PTE-mapped). */ ret = mprotect(mem + pagesize, pagesize, PROT_READ); if (ret) { ksft_test_result_fail("mprotect() failed\n"); goto munmap; } ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE); if (ret) { ksft_test_result_fail("mprotect() failed\n"); goto munmap; } break; case THP_RUN_SINGLE_PTE: case THP_RUN_SINGLE_PTE_SWAPOUT: /* * Discard all but a single subpage of that PTE-mapped THP. What * remains is a single PTE mapping a single subpage. */ ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED); if (ret) { ksft_test_result_fail("MADV_DONTNEED failed\n"); goto munmap; } size = pagesize; break; case THP_RUN_PARTIAL_MREMAP: /* * Remap half of the THP. We need some new memory location * for that. */ mremap_size = thpsize / 2; mremap_mem = mmap(NULL, mremap_size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (mem == MAP_FAILED) { ksft_test_result_fail("mmap() failed\n"); goto munmap; } tmp = mremap(mem + mremap_size, mremap_size, mremap_size, MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem); if (tmp != mremap_mem) { ksft_test_result_fail("mremap() failed\n"); goto munmap; } size = mremap_size; break; case THP_RUN_PARTIAL_SHARED: /* * Share the first page of the THP with a child and quit the * child. This will result in some parts of the THP never * have been shared. */ ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK); if (ret) { ksft_test_result_fail("MADV_DONTFORK failed\n"); goto munmap; } ret = fork(); if (ret < 0) { ksft_test_result_fail("fork() failed\n"); goto munmap; } else if (!ret) { exit(0); } wait(&ret); /* Allow for sharing all pages again. */ ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK); if (ret) { ksft_test_result_fail("MADV_DOFORK failed\n"); goto munmap; } break; default: assert(false); } switch (thp_run) { case THP_RUN_PMD_SWAPOUT: case THP_RUN_PTE_SWAPOUT: case THP_RUN_SINGLE_PTE_SWAPOUT: madvise(mem, size, MADV_PAGEOUT); if (!range_is_swapped(mem, size)) { ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n"); goto munmap; } break; default: break; } fn(mem, size); munmap: munmap(mmap_mem, mmap_size); if (mremap_mem != MAP_FAILED) munmap(mremap_mem, mremap_size); } static void run_with_thp(test_fn fn, const char *desc, size_t size) { ksft_print_msg("[RUN] %s ... with THP (%zu kB)\n", desc, size / 1024); do_run_with_thp(fn, THP_RUN_PMD, size); } static void run_with_thp_swap(test_fn fn, const char *desc, size_t size) { ksft_print_msg("[RUN] %s ... with swapped-out THP (%zu kB)\n", desc, size / 1024); do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT, size); } static void run_with_pte_mapped_thp(test_fn fn, const char *desc, size_t size) { ksft_print_msg("[RUN] %s ... with PTE-mapped THP (%zu kB)\n", desc, size / 1024); do_run_with_thp(fn, THP_RUN_PTE, size); } static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc, size_t size) { ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP (%zu kB)\n", desc, size / 1024); do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT, size); } static void run_with_single_pte_of_thp(test_fn fn, const char *desc, size_t size) { ksft_print_msg("[RUN] %s ... with single PTE of THP (%zu kB)\n", desc, size / 1024); do_run_with_thp(fn, THP_RUN_SINGLE_PTE, size); } static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc, size_t size) { ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP (%zu kB)\n", desc, size / 1024); do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT, size); } static void run_with_partial_mremap_thp(test_fn fn, const char *desc, size_t size) { ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP (%zu kB)\n", desc, size / 1024); do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP, size); } static void run_with_partial_shared_thp(test_fn fn, const char *desc, size_t size) { ksft_print_msg("[RUN] %s ... with partially shared THP (%zu kB)\n", desc, size / 1024); do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED, size); } static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize) { int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB; char *mem, *dummy; ksft_print_msg("[RUN] %s ... with hugetlb (%zu kB)\n", desc, hugetlbsize / 1024); flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT; mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0); if (mem == MAP_FAILED) { ksft_test_result_skip("need more free huge pages\n"); return; } /* Populate an huge page. */ memset(mem, 0, hugetlbsize); /* * We need a total of two hugetlb pages to handle COW/unsharing * properly, otherwise we might get zapped by a SIGBUS. */ dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0); if (dummy == MAP_FAILED) { ksft_test_result_skip("need more free huge pages\n"); goto munmap; } munmap(dummy, hugetlbsize); fn(mem, hugetlbsize); munmap: munmap(mem, hugetlbsize); } struct test_case { const char *desc; test_fn fn; }; /* * Test cases that are specific to anonymous pages: pages in private mappings * that may get shared via COW during fork(). */ static const struct test_case anon_test_cases[] = { /* * Basic COW tests for fork() without any GUP. If we miss to break COW, * either the child can observe modifications by the parent or the * other way around. */ { "Basic COW after fork()", test_cow_in_parent, }, /* * Basic test, but do an additional mprotect(PROT_READ)+ * mprotect(PROT_READ|PROT_WRITE) in the parent before write access. */ { "Basic COW after fork() with mprotect() optimization", test_cow_in_parent_mprotect, }, /* * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If * we miss to break COW, the child observes modifications by the parent. * This is CVE-2020-29374 reported by Jann Horn. */ { "vmsplice() + unmap in child", test_vmsplice_in_child }, /* * vmsplice() test, but do an additional mprotect(PROT_READ)+ * mprotect(PROT_READ|PROT_WRITE) in the parent before write access. */ { "vmsplice() + unmap in child with mprotect() optimization", test_vmsplice_in_child_mprotect }, /* * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after * fork(); modify in the child. If we miss to break COW, the parent * observes modifications by the child. */ { "vmsplice() before fork(), unmap in parent after fork()", test_vmsplice_before_fork, }, /* * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the * child. If we miss to break COW, the parent observes modifications by * the child. */ { "vmsplice() + unmap in parent after fork()", test_vmsplice_after_fork, }, #ifdef LOCAL_CONFIG_HAVE_LIBURING /* * Take a R/W longterm pin and then map the page R/O into the page * table to trigger a write fault on next access. When modifying the * page, the page content must be visible via the pin. */ { "R/O-mapping a page registered as iouring fixed buffer", test_iouring_ro, }, /* * Take a R/W longterm pin and then fork() a child. When modifying the * page, the page content must be visible via the pin. We expect the * pinned page to not get shared with the child. */ { "fork() with an iouring fixed buffer", test_iouring_fork, }, #endif /* LOCAL_CONFIG_HAVE_LIBURING */ /* * Take a R/O longterm pin on a R/O-mapped shared anonymous page. * When modifying the page via the page table, the page content change * must be visible via the pin. */ { "R/O GUP pin on R/O-mapped shared page", test_ro_pin_on_shared, }, /* Same as above, but using GUP-fast. */ { "R/O GUP-fast pin on R/O-mapped shared page", test_ro_fast_pin_on_shared, }, /* * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that * was previously shared. When modifying the page via the page table, * the page content change must be visible via the pin. */ { "R/O GUP pin on R/O-mapped previously-shared page", test_ro_pin_on_ro_previously_shared, }, /* Same as above, but using GUP-fast. */ { "R/O GUP-fast pin on R/O-mapped previously-shared page", test_ro_fast_pin_on_ro_previously_shared, }, /* * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page. * When modifying the page via the page table, the page content change * must be visible via the pin. */ { "R/O GUP pin on R/O-mapped exclusive page", test_ro_pin_on_ro_exclusive, }, /* Same as above, but using GUP-fast. */ { "R/O GUP-fast pin on R/O-mapped exclusive page", test_ro_fast_pin_on_ro_exclusive, }, }; static void run_anon_test_case(struct test_case const *test_case) { int i; run_with_base_page(test_case->fn, test_case->desc); run_with_base_page_swap(test_case->fn, test_case->desc); for (i = 0; i < nr_thpsizes; i++) { size_t size = thpsizes[i]; struct thp_settings settings = *thp_current_settings(); settings.hugepages[sz2ord(pmdsize)].enabled = THP_NEVER; settings.hugepages[sz2ord(size)].enabled = THP_ALWAYS; thp_push_settings(&settings); if (size == pmdsize) { run_with_thp(test_case->fn, test_case->desc, size); run_with_thp_swap(test_case->fn, test_case->desc, size); } run_with_pte_mapped_thp(test_case->fn, test_case->desc, size); run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc, size); run_with_single_pte_of_thp(test_case->fn, test_case->desc, size); run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc, size); run_with_partial_mremap_thp(test_case->fn, test_case->desc, size); run_with_partial_shared_thp(test_case->fn, test_case->desc, size); thp_pop_settings(); } for (i = 0; i < nr_hugetlbsizes; i++) run_with_hugetlb(test_case->fn, test_case->desc, hugetlbsizes[i]); } static void run_anon_test_cases(void) { int i; ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n"); for (i = 0; i < ARRAY_SIZE(anon_test_cases); i++) run_anon_test_case(&anon_test_cases[i]); } static int tests_per_anon_test_case(void) { int tests = 2 + nr_hugetlbsizes; tests += 6 * nr_thpsizes; if (pmdsize) tests += 2; return tests; } enum anon_thp_collapse_test { ANON_THP_COLLAPSE_UNSHARED, ANON_THP_COLLAPSE_FULLY_SHARED, ANON_THP_COLLAPSE_LOWER_SHARED, ANON_THP_COLLAPSE_UPPER_SHARED, }; static void do_test_anon_thp_collapse(char *mem, size_t size, enum anon_thp_collapse_test test) { struct comm_pipes comm_pipes; char buf; int ret; ret = setup_comm_pipes(&comm_pipes); if (ret) { ksft_test_result_fail("pipe() failed\n"); return; } /* * Trigger PTE-mapping the THP by temporarily mapping a single subpage * R/O, such that we can try collapsing it later. */ ret = mprotect(mem + pagesize, pagesize, PROT_READ); if (ret) { ksft_test_result_fail("mprotect() failed\n"); goto close_comm_pipes; } ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE); if (ret) { ksft_test_result_fail("mprotect() failed\n"); goto close_comm_pipes; } switch (test) { case ANON_THP_COLLAPSE_UNSHARED: /* Collapse before actually COW-sharing the page. */ ret = madvise(mem, size, MADV_COLLAPSE); if (ret) { ksft_test_result_skip("MADV_COLLAPSE failed: %s\n", strerror(errno)); goto close_comm_pipes; } break; case ANON_THP_COLLAPSE_FULLY_SHARED: /* COW-share the full PTE-mapped THP. */ break; case ANON_THP_COLLAPSE_LOWER_SHARED: /* Don't COW-share the upper part of the THP. */ ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK); if (ret) { ksft_test_result_fail("MADV_DONTFORK failed\n"); goto close_comm_pipes; } break; case ANON_THP_COLLAPSE_UPPER_SHARED: /* Don't COW-share the lower part of the THP. */ ret = madvise(mem, size / 2, MADV_DONTFORK); if (ret) { ksft_test_result_fail("MADV_DONTFORK failed\n"); goto close_comm_pipes; } break; default: assert(false); } ret = fork(); if (ret < 0) { ksft_test_result_fail("fork() failed\n"); goto close_comm_pipes; } else if (!ret) { switch (test) { case ANON_THP_COLLAPSE_UNSHARED: case ANON_THP_COLLAPSE_FULLY_SHARED: exit(child_memcmp_fn(mem, size, &comm_pipes)); break; case ANON_THP_COLLAPSE_LOWER_SHARED: exit(child_memcmp_fn(mem, size / 2, &comm_pipes)); break; case ANON_THP_COLLAPSE_UPPER_SHARED: exit(child_memcmp_fn(mem + size / 2, size / 2, &comm_pipes)); break; default: assert(false); } } while (read(comm_pipes.child_ready[0], &buf, 1) != 1) ; switch (test) { case ANON_THP_COLLAPSE_UNSHARED: break; case ANON_THP_COLLAPSE_UPPER_SHARED: case ANON_THP_COLLAPSE_LOWER_SHARED: /* * Revert MADV_DONTFORK such that we merge the VMAs and are * able to actually collapse. */ ret = madvise(mem, size, MADV_DOFORK); if (ret) { ksft_test_result_fail("MADV_DOFORK failed\n"); write(comm_pipes.parent_ready[1], "0", 1); wait(&ret); goto close_comm_pipes; } /* FALLTHROUGH */ case ANON_THP_COLLAPSE_FULLY_SHARED: /* Collapse before anyone modified the COW-shared page. */ ret = madvise(mem, size, MADV_COLLAPSE); if (ret) { ksft_test_result_skip("MADV_COLLAPSE failed: %s\n", strerror(errno)); write(comm_pipes.parent_ready[1], "0", 1); wait(&ret); goto close_comm_pipes; } break; default: assert(false); } /* Modify the page. */ memset(mem, 0xff, size); write(comm_pipes.parent_ready[1], "0", 1); wait(&ret); if (WIFEXITED(ret)) ret = WEXITSTATUS(ret); else ret = -EINVAL; ksft_test_result(!ret, "No leak from parent into child\n"); close_comm_pipes: close_comm_pipes(&comm_pipes); } static void test_anon_thp_collapse_unshared(char *mem, size_t size) { do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED); } static void test_anon_thp_collapse_fully_shared(char *mem, size_t size) { do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED); } static void test_anon_thp_collapse_lower_shared(char *mem, size_t size) { do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED); } static void test_anon_thp_collapse_upper_shared(char *mem, size_t size) { do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED); } /* * Test cases that are specific to anonymous THP: pages in private mappings * that may get shared via COW during fork(). */ static const struct test_case anon_thp_test_cases[] = { /* * Basic COW test for fork() without any GUP when collapsing a THP * before fork(). * * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place * collapse") might easily get COW handling wrong when not collapsing * exclusivity information properly. */ { "Basic COW after fork() when collapsing before fork()", test_anon_thp_collapse_unshared, }, /* Basic COW test, but collapse after COW-sharing a full THP. */ { "Basic COW after fork() when collapsing after fork() (fully shared)", test_anon_thp_collapse_fully_shared, }, /* * Basic COW test, but collapse after COW-sharing the lower half of a * THP. */ { "Basic COW after fork() when collapsing after fork() (lower shared)", test_anon_thp_collapse_lower_shared, }, /* * Basic COW test, but collapse after COW-sharing the upper half of a * THP. */ { "Basic COW after fork() when collapsing after fork() (upper shared)", test_anon_thp_collapse_upper_shared, }, }; static void run_anon_thp_test_cases(void) { int i; if (!pmdsize) return; ksft_print_msg("[INFO] Anonymous THP tests\n"); for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) { struct test_case const *test_case = &anon_thp_test_cases[i]; ksft_print_msg("[RUN] %s\n", test_case->desc); do_run_with_thp(test_case->fn, THP_RUN_PMD, pmdsize); } } static int tests_per_anon_thp_test_case(void) { return pmdsize ? 1 : 0; } typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size); static void test_cow(char *mem, const char *smem, size_t size) { char *old = malloc(size); /* Backup the original content. */ memcpy(old, smem, size); /* Modify the page. */ memset(mem, 0xff, size); /* See if we still read the old values via the other mapping. */ ksft_test_result(!memcmp(smem, old, size), "Other mapping not modified\n"); free(old); } static void test_ro_pin(char *mem, const char *smem, size_t size) { do_test_ro_pin(mem, size, RO_PIN_TEST, false); } static void test_ro_fast_pin(char *mem, const char *smem, size_t size) { do_test_ro_pin(mem, size, RO_PIN_TEST, true); } static void run_with_zeropage(non_anon_test_fn fn, const char *desc) { char *mem, *smem, tmp; ksft_print_msg("[RUN] %s ... with shared zeropage\n", desc); mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); if (mem == MAP_FAILED) { ksft_test_result_fail("mmap() failed\n"); return; } smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0); if (mem == MAP_FAILED) { ksft_test_result_fail("mmap() failed\n"); goto munmap; } /* Read from the page to populate the shared zeropage. */ tmp = *mem + *smem; asm volatile("" : "+r" (tmp)); fn(mem, smem, pagesize); munmap: munmap(mem, pagesize); if (smem != MAP_FAILED) munmap(smem, pagesize); } static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc) { char *mem, *smem, *mmap_mem, *mmap_smem, tmp; size_t mmap_size; int ret; ksft_print_msg("[RUN] %s ... with huge zeropage\n", desc); if (!has_huge_zeropage) { ksft_test_result_skip("Huge zeropage not enabled\n"); return; } /* For alignment purposes, we need twice the thp size. */ mmap_size = 2 * pmdsize; mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (mmap_mem == MAP_FAILED) { ksft_test_result_fail("mmap() failed\n"); return; } mmap_smem = mmap(NULL, mmap_size, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (mmap_smem == MAP_FAILED) { ksft_test_result_fail("mmap() failed\n"); goto munmap; } /* We need a THP-aligned memory area. */ mem = (char *)(((uintptr_t)mmap_mem + pmdsize) & ~(pmdsize - 1)); smem = (char *)(((uintptr_t)mmap_smem + pmdsize) & ~(pmdsize - 1)); ret = madvise(mem, pmdsize, MADV_HUGEPAGE); ret |= madvise(smem, pmdsize, MADV_HUGEPAGE); if (ret) { ksft_test_result_fail("MADV_HUGEPAGE failed\n"); goto munmap; } /* * Read from the memory to populate the huge shared zeropage. Read from * the first sub-page and test if we get another sub-page populated * automatically. */ tmp = *mem + *smem; asm volatile("" : "+r" (tmp)); if (!pagemap_is_populated(pagemap_fd, mem + pagesize) || !pagemap_is_populated(pagemap_fd, smem + pagesize)) { ksft_test_result_skip("Did not get THPs populated\n"); goto munmap; } fn(mem, smem, pmdsize); munmap: munmap(mmap_mem, mmap_size); if (mmap_smem != MAP_FAILED) munmap(mmap_smem, mmap_size); } static void run_with_memfd(non_anon_test_fn fn, const char *desc) { char *mem, *smem, tmp; int fd; ksft_print_msg("[RUN] %s ... with memfd\n", desc); fd = memfd_create("test", 0); if (fd < 0) { ksft_test_result_fail("memfd_create() failed\n"); return; } /* File consists of a single page filled with zeroes. */ if (fallocate(fd, 0, 0, pagesize)) { ksft_test_result_fail("fallocate() failed\n"); goto close; } /* Create a private mapping of the memfd. */ mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); if (mem == MAP_FAILED) { ksft_test_result_fail("mmap() failed\n"); goto close; } smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0); if (mem == MAP_FAILED) { ksft_test_result_fail("mmap() failed\n"); goto munmap; } /* Fault the page in. */ tmp = *mem + *smem; asm volatile("" : "+r" (tmp)); fn(mem, smem, pagesize); munmap: munmap(mem, pagesize); if (smem != MAP_FAILED) munmap(smem, pagesize); close: close(fd); } static void run_with_tmpfile(non_anon_test_fn fn, const char *desc) { char *mem, *smem, tmp; FILE *file; int fd; ksft_print_msg("[RUN] %s ... with tmpfile\n", desc); file = tmpfile(); if (!file) { ksft_test_result_fail("tmpfile() failed\n"); return; } fd = fileno(file); if (fd < 0) { ksft_test_result_skip("fileno() failed\n"); return; } /* File consists of a single page filled with zeroes. */ if (fallocate(fd, 0, 0, pagesize)) { ksft_test_result_fail("fallocate() failed\n"); goto close; } /* Create a private mapping of the memfd. */ mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); if (mem == MAP_FAILED) { ksft_test_result_fail("mmap() failed\n"); goto close; } smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0); if (mem == MAP_FAILED) { ksft_test_result_fail("mmap() failed\n"); goto munmap; } /* Fault the page in. */ tmp = *mem + *smem; asm volatile("" : "+r" (tmp)); fn(mem, smem, pagesize); munmap: munmap(mem, pagesize); if (smem != MAP_FAILED) munmap(smem, pagesize); close: fclose(file); } static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc, size_t hugetlbsize) { int flags = MFD_HUGETLB; char *mem, *smem, tmp; int fd; ksft_print_msg("[RUN] %s ... with memfd hugetlb (%zu kB)\n", desc, hugetlbsize / 1024); flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT; fd = memfd_create("test", flags); if (fd < 0) { ksft_test_result_skip("memfd_create() failed\n"); return; } /* File consists of a single page filled with zeroes. */ if (fallocate(fd, 0, 0, hugetlbsize)) { ksft_test_result_skip("need more free huge pages\n"); goto close; } /* Create a private mapping of the memfd. */ mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); if (mem == MAP_FAILED) { ksft_test_result_skip("need more free huge pages\n"); goto close; } smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0); if (mem == MAP_FAILED) { ksft_test_result_fail("mmap() failed\n"); goto munmap; } /* Fault the page in. */ tmp = *mem + *smem; asm volatile("" : "+r" (tmp)); fn(mem, smem, hugetlbsize); munmap: munmap(mem, hugetlbsize); if (mem != MAP_FAILED) munmap(smem, hugetlbsize); close: close(fd); } struct non_anon_test_case { const char *desc; non_anon_test_fn fn; }; /* * Test cases that target any pages in private mappings that are not anonymous: * pages that may get shared via COW ndependent of fork(). This includes * the shared zeropage(s), pagecache pages, ... */ static const struct non_anon_test_case non_anon_test_cases[] = { /* * Basic COW test without any GUP. If we miss to break COW, changes are * visible via other private/shared mappings. */ { "Basic COW", test_cow, }, /* * Take a R/O longterm pin. When modifying the page via the page table, * the page content change must be visible via the pin. */ { "R/O longterm GUP pin", test_ro_pin, }, /* Same as above, but using GUP-fast. */ { "R/O longterm GUP-fast pin", test_ro_fast_pin, }, }; static void run_non_anon_test_case(struct non_anon_test_case const *test_case) { int i; run_with_zeropage(test_case->fn, test_case->desc); run_with_memfd(test_case->fn, test_case->desc); run_with_tmpfile(test_case->fn, test_case->desc); if (pmdsize) run_with_huge_zeropage(test_case->fn, test_case->desc); for (i = 0; i < nr_hugetlbsizes; i++) run_with_memfd_hugetlb(test_case->fn, test_case->desc, hugetlbsizes[i]); } static void run_non_anon_test_cases(void) { int i; ksft_print_msg("[RUN] Non-anonymous memory tests in private mappings\n"); for (i = 0; i < ARRAY_SIZE(non_anon_test_cases); i++) run_non_anon_test_case(&non_anon_test_cases[i]); } static int tests_per_non_anon_test_case(void) { int tests = 3 + nr_hugetlbsizes; if (pmdsize) tests += 1; return tests; } int main(int argc, char **argv) { int err; struct thp_settings default_settings; ksft_print_header(); pagesize = getpagesize(); pmdsize = read_pmd_pagesize(); if (pmdsize) { /* Only if THP is supported. */ thp_read_settings(&default_settings); default_settings.hugepages[sz2ord(pmdsize)].enabled = THP_INHERIT; thp_save_settings(); thp_push_settings(&default_settings); ksft_print_msg("[INFO] detected PMD size: %zu KiB\n", pmdsize / 1024); nr_thpsizes = detect_thp_sizes(thpsizes, ARRAY_SIZE(thpsizes)); } nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes, ARRAY_SIZE(hugetlbsizes)); detect_huge_zeropage(); ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() + ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() + ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case()); gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR); pagemap_fd = open("/proc/self/pagemap", O_RDONLY); if (pagemap_fd < 0) ksft_exit_fail_msg("opening pagemap failed\n"); run_anon_test_cases(); run_anon_thp_test_cases(); run_non_anon_test_cases(); if (pmdsize) { /* Only if THP is supported. */ thp_restore_settings(); } err = ksft_get_fail_cnt(); if (err) ksft_exit_fail_msg("%d out of %d tests failed\n", err, ksft_test_num()); return ksft_exit_pass(); }