1

I am trying to use the write protection feature of Linux's userfaultfd, but it does not appear to be enabled in my kernel even though I am using version 5.13 (write protection should be fully supported in 5.10+).

When I run

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <linux/userfaultfd.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Report the failing call named by msg through perror(), then terminate the
   process with EXIT_FAILURE. Wrapped in do/while(0) so it acts as a single
   statement. */
#define errExit(msg) \
    do { perror(msg); exit(EXIT_FAILURE); } while (0)

/* Return 1 when every bit set in `bit` is also set in `val`, otherwise 0. */
static int has_bit(uint64_t val, uint64_t bit) {
    const uint64_t missing = bit & ~val;
    return missing == 0;
}

/*
 * Open a userfaultfd, negotiate the API while requesting the
 * UFFD_FEATURE_PAGEFAULT_FLAG_WP feature, and print which ioctl and feature
 * bits the kernel reported back in struct uffdio_api.
 *
 * Exits with EXIT_FAILURE (via errExit) if the syscall or the ioctl fails.
 */
int main(void) {
    long uffd;     /* userfaultfd file descriptor */
    struct uffdio_api uffdio_api;

    uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (uffd == -1)
        errExit("userfaultfd");

    uffdio_api.api = UFFD_API;
    uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
    if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1)
        errExit("ioctl-UFFDIO_API");

    /* The _UFFDIO_* constants are bit numbers, not masks. Build each mask
       as a 64-bit value: `1UL` is only 32 bits wide on ILP32 targets, where
       shifting it by 32 or more is undefined. */
    printf("UFFDIO_API: %d\n", has_bit(uffdio_api.ioctls, 1ULL << _UFFDIO_API));
    printf("UFFDIO_REGISTER: %d\n", has_bit(uffdio_api.ioctls, 1ULL << _UFFDIO_REGISTER));
    printf("UFFDIO_UNREGISTER: %d\n", has_bit(uffdio_api.ioctls, 1ULL << _UFFDIO_UNREGISTER));
    printf("UFFDIO_WRITEPROTECT: %d\n", has_bit(uffdio_api.ioctls, 1ULL << _UFFDIO_WRITEPROTECT));
    printf("UFFD_FEATURE_PAGEFAULT_FLAG_WP: %d\n", has_bit(uffdio_api.features, UFFD_FEATURE_PAGEFAULT_FLAG_WP));

    return EXIT_SUCCESS;
}

The output is

UFFDIO_API: 1
UFFDIO_REGISTER: 1
UFFDIO_UNREGISTER: 1
UFFDIO_WRITEPROTECT: 0
UFFD_FEATURE_PAGEFAULT_FLAG_WP: 1

The UFFD_FEATURE_PAGEFAULT_FLAG_WP feature is enabled, but the UFFDIO_WRITEPROTECT ioctl, which is necessary to enable write protection, is marked as not supported.

What might lead to this feature being disabled, and how can I enable it?

I am using Ubuntu MATE 21.10 with Linux kernel version 5.13.0-30-generic.

EDIT:

It seems that, despite what the man page section on the UFFDIO_API ioctl says (https://man7.org/linux/man-pages/man2/ioctl_userfaultfd.2.html), this might be the intended behavior even on a system where write protection is enabled. However, when I run a full program that spawns a poller thread and writes to the protected memory, the poller thread does not receive any notification.

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <linux/userfaultfd.h>
#include <poll.h>
#include <pthread.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

/* Print msg with the current errno description via perror(), then exit with
   EXIT_FAILURE. The do/while(0) wrapper makes the macro usable as one
   statement after an unbraced `if`. */
#define errExit(msg) \
    do { perror(msg); exit(EXIT_FAILURE); } while (0)

/* System page size; main() sets it from sysconf(_SC_PAGE_SIZE). */
static int page_size;

/*
 * Thread entry point. `arg` carries the userfaultfd descriptor cast to a
 * pointer. The thread blocks in poll() on that descriptor, prints what
 * poll() reported, and then terminates the whole process.
 */
static void* fault_handler_thread(void* arg) {
    const long uffd = (long) arg; /* userfaultfd file descriptor */

    while (1) {
        /* Wait, without a timeout, for the userfaultfd to become readable. */
        struct pollfd pfd;
        pfd.fd = uffd;
        pfd.events = POLLIN;

        const int nready = poll(&pfd, 1, -1);
        if (nready == -1)
            errExit("poll");

        const int got_pollin = (pfd.revents & POLLIN) != 0;
        const int got_pollerr = (pfd.revents & POLLERR) != 0;

        printf("\nfault_handler_thread():\n");
        printf(
            "    poll() returns: nready = %d; "
            "POLLIN = %d; POLLERR = %d\n",
            nready, got_pollin, got_pollerr);

        /* An event arrived on the descriptor: stop the program. */
        exit(EXIT_FAILURE);
    }
}

/*
 * Map one anonymous page, register it with a userfaultfd in write-protect
 * mode, write-protect it, start the poller thread, and then write into the
 * page from the main thread.
 *
 * Exits with EXIT_FAILURE (via errExit) if any setup call fails, and with
 * EXIT_SUCCESS once all writes have completed.
 */
int main(void) {
    long uffd;     /* userfaultfd file descriptor */
    char* addr;    /* Start of region handled by userfaultfd */
    uint64_t len;  /* Length of region handled by userfaultfd */
    pthread_t thr; /* ID of thread that handles page faults */
    struct uffdio_api uffdio_api;
    struct uffdio_register uffdio_register;
    struct uffdio_writeprotect uffdio_wp;
    int s;

    page_size = sysconf(_SC_PAGE_SIZE);
    len = page_size;

    /* Create and enable userfaultfd object. */

    uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (uffd == -1)
        errExit("userfaultfd");

    uffdio_api.api = UFFD_API;
    uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
    if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1)
        errExit("ioctl-UFFDIO_API");

    addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (addr == MAP_FAILED)
        errExit("mmap");

    printf("Address returned by mmap() = %p\n", (void*) addr);

    /* Register the memory range of the mapping we just created for
       handling by the userfaultfd object. */

    uffdio_register.range.start = (unsigned long) addr;
    uffdio_register.range.len = len;
    uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
    if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1)
        errExit("ioctl-UFFDIO_REGISTER");

    /* _UFFDIO_WRITEPROTECT is a bit number, not a mask: shift to test the
       corresponding bit of the returned ioctls bitmask. */
    printf("uffdio_register.ioctls = 0x%llx\n", (unsigned long long) uffdio_register.ioctls);
    printf("Have _UFFDIO_WRITEPROTECT? %s\n",
           (uffdio_register.ioctls & (1ULL << _UFFDIO_WRITEPROTECT)) ? "YES" : "NO");

    uffdio_wp.range.start = (unsigned long) addr;
    uffdio_wp.range.len = len;
    uffdio_wp.mode = UFFDIO_WRITEPROTECT_MODE_WP;
    if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffdio_wp) == -1)
        errExit("ioctl-UFFDIO_WRITEPROTECT");

    /* Create a thread that will process the userfaultfd events. */

    s = pthread_create(&thr, NULL, fault_handler_thread, (void*) uffd);
    if (s != 0) {
        errno = s; /* pthread_create returns the error instead of setting errno */
        errExit("pthread_create");
    }

    /* Main thread now touches memory in the mapping, touching
       locations 1024 bytes apart. This is intended to trigger
       userfaultfd events for the pages in the region. */

    usleep(100000); /* Give the handler thread time to reach poll() */

    size_t l;
    l = 0xf; /* Ensure that faulting address is not on a page
                boundary, in order to test that we correctly
                handle that case in fault_handler_thread(). */
    char i = 0;
    while (l < len) {
        printf("Write address %p in main(): ", (void*) (addr + l));
        addr[l] = i++;
        printf("%d\n", addr[l]);
        l += 1024;
        usleep(100000); /* Slow things down a little */
    }

    exit(EXIT_SUCCESS);
}
Zach
  • 4,652
  • 18
  • 22
  • I think you need to set the `UFFD_FEATURE_PAGEFAULT_FLAG_WP` bitmask in `uffdio_api.features` before using the `UFFD_API` ioctl and check it afterwards to determine if write protection is supported. – Ian Abbott Mar 17 '22 at 14:28
  • Thanks, I tried doing this and the feature is enabled, but the write protection ioctl is still missing (and write protection doesn't appear to work). I have updated the question accordingly. – Zach Mar 17 '22 at 16:05
  • Looking at the kernel source, `1ULL << _UFFDIO_WRITEPROTECT` does not appear in the `ioctls` member of `struct uffdio_api` (for the `UFFD_API` ioctl), but it does appear in the `ioctls` member of `struct uffdio_register` (for the `UFFDIO_REGISTER` ioctl) when `UFFDIO_REGISTER_MODE_WP` is set in the `mode` member (of `struct uffdio_register`). – Ian Abbott Mar 17 '22 at 16:48
  • I've edited the title to make it clearer, hope you don't mind. – Marco Bonelli Mar 17 '22 at 17:39

2 Answers2

1

The UFFDIO_API ioctl does not seem to ever report _UFFDIO_WRITEPROTECT, as can be seen in the kernel source code (1, 2). I assume that this is because whether this operation is supported or not depends on the kind of underlying mapping.

The feature is in fact reported on a per-registered-range basis. You will have to set the API with ioctl(uffd, UFFDIO_API, ...) first, then register a range with ioctl(uffd, UFFDIO_REGISTER, ...) and then check the uffdio_register.ioctls field.

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <linux/userfaultfd.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <sys/mman.h>
#include <unistd.h>

/* Report msg together with the errno description via perror(), then exit
   with EXIT_FAILURE. Only meaningful right after a call that set errno. */
#define errExit(msg) \
    do { perror(msg); exit(EXIT_FAILURE); } while (0)

/*
 * Open a userfaultfd, map an anonymous region, register it in write-protect
 * mode, check that the kernel reports UFFDIO_WRITEPROTECT for that range,
 * and then write-protect the range.
 *
 * Exits with EXIT_FAILURE if any step fails or if the write-protect ioctl
 * is not reported for the registered range.
 */
int main(void) {
    long uffd;

    uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (uffd == -1)
        errExit("userfaultfd");

    struct uffdio_api uffdio_api = { .api = UFFD_API };

    if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1)
        errExit("ioctl(UFFDIO_API)");

    /* mmap() returns page-aligned memory, so no separate alignment step is
       needed before registering the range. */
    const size_t region_sz = 0x4000;
    void *region = mmap(NULL, region_sz, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0);
    if (region == MAP_FAILED)
        errExit("mmap");

    /* Arithmetic on void * is a GNU extension; go through char * instead. */
    printf("Region mapped at %p - %p\n", region, (void *)((char *)region + region_sz));

    struct uffdio_register uffdio_register = {
        .range = { .start = (unsigned long)region, .len = region_sz },
        .mode = UFFDIO_REGISTER_MODE_WP
    };

    if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1)
        errExit("ioctl(UFFDIO_REGISTER)");

    /* _UFFDIO_WRITEPROTECT is a bit number; shift it to obtain the mask. */
    const unsigned long long wp_mask = 1ULL << _UFFDIO_WRITEPROTECT;
    const int have_wp = (uffdio_register.ioctls & wp_mask) != 0;

    printf("uffdio_register.ioctls = 0x%llx\n", (unsigned long long)uffdio_register.ioctls);
    printf("Have _UFFDIO_WRITEPROTECT? %s\n", have_wp ? "YES" : "NO");

    /* Require only the ioctl this program actually uses. Demanding every
       bit of UFFD_API_RANGE_IOCTLS rejects ranges that work fine for
       write protection. errno is not set here, so report with a plain
       message instead of perror(). */
    if (!have_wp) {
        fputs("bad ioctl set\n", stderr);
        exit(EXIT_FAILURE);
    }

    struct uffdio_writeprotect wp = {
        .range = { .start = (unsigned long)region, .len = region_sz },
        .mode = UFFDIO_WRITEPROTECT_MODE_WP
    };

    if (ioctl(uffd, UFFDIO_WRITEPROTECT, &wp) == -1)
        errExit("ioctl(UFFDIO_WRITEPROTECT)");

    puts("ioctl(UFFDIO_WRITEPROTECT) successful.");
    return EXIT_SUCCESS;
}

Output:

Region mapped at 0x7f45c48fe000 - 0x7f45c4902000
uffdio_register.ioctls = 0x5c
Have _UFFDIO_WRITEPROTECT? YES
ioctl(UFFDIO_WRITEPROTECT) successful.
Marco Bonelli
  • 63,369
  • 21
  • 118
  • 128
  • Thanks for the details. This seems inconsistent with the man page (the UFFD_API section), so this is helpful for the actual behavior. When I run your example, an error is thrown with "bad ioctl set" (the `UFFD_API_RANGE_IOCTLS` check). I'm not sure what this is for, and the example succeeds if I remove it. Also, once the memory is marked as write protected, I don't seem to get notifications on writes. I'll update the question with the full program I am using. – Zach Mar 17 '22 at 20:12
0

I found the solution. The write-protected pages must be touched after registering but before marking them as write-protected. This is an undocumented requirement, from what I can tell.

In other words, add

/* Write one byte into every page of [addr, addr + len) so that each page
   has been touched before the range is write-protected. */
for (size_t i = 0; i < len; i += page_size) {
    addr[i] = 0;
}

between registering and write-protecting.

It works if I change the full example to

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <linux/userfaultfd.h>
#include <poll.h>
#include <pthread.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

/* Print msg and the errno description through perror(), then terminate the
   process with EXIT_FAILURE. Expands to a single statement. */
#define errExit(msg) \
    do { perror(msg); exit(EXIT_FAILURE); } while (0)

/* System page size; main() sets it from sysconf(_SC_PAGE_SIZE) and uses it
   as the stride when touching pages. */
static int page_size;

/*
 * Poller thread. Receives the userfaultfd descriptor through `arg` (cast to
 * a pointer by the creator), waits in poll() until the descriptor reports
 * an event, prints the poll() result, and exits the process.
 */
static void* fault_handler_thread(void* arg) {
    const long uffd = (long) arg; /* userfaultfd file descriptor */

    while (1) {
        /* Block with no timeout until the userfaultfd has something. */
        struct pollfd pfd;
        pfd.fd = uffd;
        pfd.events = POLLIN;

        const int nready = poll(&pfd, 1, -1);
        if (nready == -1)
            errExit("poll");

        const int saw_pollin = (pfd.revents & POLLIN) != 0;
        const int saw_pollerr = (pfd.revents & POLLERR) != 0;

        printf("\nfault_handler_thread():\n");
        printf(
            "    poll() returns: nready = %d; "
            "POLLIN = %d; POLLERR = %d\n",
            nready, saw_pollin, saw_pollerr);

        /* A fault notification was received: end the program here. */
        exit(EXIT_FAILURE);
    }
}

/*
 * Map one anonymous page, register it with a userfaultfd in write-protect
 * mode, touch it, write-protect it, start the poller thread, and then write
 * into the page from the main thread so the poller gets a notification.
 *
 * Exits with EXIT_FAILURE (via errExit) if any setup call fails. The poller
 * thread ends the process as soon as it sees an event; EXIT_SUCCESS is
 * reached only if every write completes without that happening.
 */
int main(void) {
    long uffd;     /* userfaultfd file descriptor */
    char* addr;    /* Start of region handled by userfaultfd */
    uint64_t len;  /* Length of region handled by userfaultfd */
    pthread_t thr; /* ID of thread that handles page faults */
    struct uffdio_api uffdio_api;
    struct uffdio_register uffdio_register;
    struct uffdio_writeprotect uffdio_wp;
    int s;

    page_size = sysconf(_SC_PAGE_SIZE);
    len = page_size;

    /* Create and enable userfaultfd object. */

    uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (uffd == -1)
        errExit("userfaultfd");

    uffdio_api.api = UFFD_API;
    uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
    if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1)
        errExit("ioctl-UFFDIO_API");

    addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (addr == MAP_FAILED)
        errExit("mmap");

    printf("Address returned by mmap() = %p\n", (void*) addr);

    /* Register the memory range of the mapping we just created for
       handling by the userfaultfd object. */

    uffdio_register.range.start = (unsigned long) addr;
    uffdio_register.range.len = len;
    uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
    if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1)
        errExit("ioctl-UFFDIO_REGISTER");

    /* _UFFDIO_WRITEPROTECT is a bit number, not a mask: shift to test the
       corresponding bit of the returned ioctls bitmask. */
    printf("uffdio_register.ioctls = 0x%llx\n", (unsigned long long) uffdio_register.ioctls);
    printf("Have _UFFDIO_WRITEPROTECT? %s\n",
           (uffdio_register.ioctls & (1ULL << _UFFDIO_WRITEPROTECT)) ? "YES" : "NO");

    /* Touch every page after registering but before write-protecting.
       Without this step no write-protect notification was delivered. */
    for (size_t i = 0; i < len; i += page_size) {
        addr[i] = 0;
    }

    uffdio_wp.range.start = (unsigned long) addr;
    uffdio_wp.range.len = len;
    uffdio_wp.mode = UFFDIO_WRITEPROTECT_MODE_WP;
    if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffdio_wp) == -1)
        errExit("ioctl-UFFDIO_WRITEPROTECT");

    /* Create a thread that will process the userfaultfd events. */

    s = pthread_create(&thr, NULL, fault_handler_thread, (void*) uffd);
    if (s != 0) {
        errno = s; /* pthread_create returns the error instead of setting errno */
        errExit("pthread_create");
    }

    /* Main thread now touches memory in the mapping, touching
       locations 1024 bytes apart. This will trigger userfaultfd
       events for the write-protected pages in the region. */

    usleep(100000); /* Give the handler thread time to reach poll() */

    size_t l;
    l = 0xf; /* Ensure that faulting address is not on a page
                boundary, in order to test that we correctly
                handle that case in fault_handler_thread(). */
    char i = 0;
    while (l < len) {
        printf("Write address %p in main(): ", (void*) (addr + l));
        addr[l] = i++;
        printf("%d\n", addr[l]);
        l += 1024;
        usleep(100000); /* Slow things down a little */
    }

    exit(EXIT_SUCCESS);
}
Zach
  • 4,652
  • 18
  • 22
  • I think if a page was never touched, that first access counts as "missing" not as "write protected". Instead of manually touching them in user space, you could also add UFFDIO_REGISTER_MODE_MISSING | UFFDIO_REGISTER_MODE_WP to UFFDIO_REGISTER. – Alexander Meißner Feb 25 '23 at 12:56