Projects STRLCPY criu Commits 27be93b7
🤬
  • Revert "kdat: Relax uffd checks (v2)"

    This reverts commit a840995689f7ab898ba34fd10e014704165e2f83, that
    got into master by mistake.
    
    Signed-off-by: Pavel Emelyanov <[email protected]>
  • Loading...
  • Pavel Emelyanov committed 7 years ago
    27be93b7
    1 parent f734928c
Revision indexing in progress... (symbol navigation in revisions will be accurate after indexed)
  • ■ ■ ■ ■ ■ ■
    criu/uffd.c
    1  -#include <stddef.h>
    2  -#include <stdio.h>
    3  -#include <errno.h>
    4  -#include <dirent.h>
    5  -#include <unistd.h>
    6  -#include <stdlib.h>
    7  -#include <fcntl.h>
    8  -#include <poll.h>
    9  -#include <string.h>
    10  -#include <time.h>
    11  -#include <sys/stat.h>
    12  -#include <sys/mman.h>
    13  -#include <sys/syscall.h>
    14  -#include <sys/ioctl.h>
    15  -#include <sys/un.h>
    16  -#include <sys/socket.h>
    17  -#include <sys/wait.h>
    18  - 
    19  -#include "linux/userfaultfd.h"
    20  - 
    21  -#include "int.h"
    22  -#include "page.h"
    23  -#include "criu-log.h"
    24  -#include "criu-plugin.h"
    25  -#include "pagemap.h"
    26  -#include "files-reg.h"
    27  -#include "kerndat.h"
    28  -#include "mem.h"
    29  -#include "uffd.h"
    30  -#include "util-pie.h"
    31  -#include "protobuf.h"
    32  -#include "pstree.h"
    33  -#include "crtools.h"
    34  -#include "cr_options.h"
    35  -#include "xmalloc.h"
    36  -#include <compel/plugins/std/syscall-codes.h>
    37  -#include "restorer.h"
    38  -#include "page-xfer.h"
    39  -#include "common/lock.h"
    40  -#include "rst-malloc.h"
    41  -#include "util.h"
    42  - 
    43  -#undef LOG_PREFIX
    44  -#define LOG_PREFIX "uffd: "
    45  - 
    46  -#define lp_debug(lpi, fmt, arg...) pr_debug("%d-%d: " fmt, lpi->pid, lpi->lpfd.fd, ##arg)
    47  -#define lp_info(lpi, fmt, arg...) pr_info("%d-%d: " fmt, lpi->pid, lpi->lpfd.fd, ##arg)
    48  -#define lp_warn(lpi, fmt, arg...) pr_warn("%d-%d: " fmt, lpi->pid, lpi->lpfd.fd, ##arg)
    49  -#define lp_err(lpi, fmt, arg...) pr_err("%d-%d: " fmt, lpi->pid, lpi->lpfd.fd, ##arg)
    50  -#define lp_perror(lpi, fmt, arg...) pr_perror("%d-%d: " fmt, lpi->pid, lpi->lpfd.fd, ##arg)
    51  - 
    52  -#define NEED_UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \
    53  - UFFD_FEATURE_EVENT_REMAP | \
    54  - UFFD_FEATURE_EVENT_UNMAP | \
    55  - UFFD_FEATURE_EVENT_REMOVE)
    56  - 
    57  -#define LAZY_PAGES_SOCK_NAME "lazy-pages.socket"
    58  - 
    59  -static mutex_t *lazy_sock_mutex;
    60  - 
    61  -struct lazy_iov {
    62  - struct list_head l;
    63  - unsigned long base; /* run-time start address, tracks remaps */
    64  - unsigned long img_base; /* start address at the dump time */
    65  - unsigned long len;
    66  -};
    67  - 
    68  -struct lp_req {
    69  - unsigned long addr; /* actual #PF (or background) destination */
    70  - unsigned long img_addr; /* the corresponding address at the dump time */
    71  - struct list_head l;
    72  -};
    73  - 
    74  -struct lazy_pages_info {
    75  - int pid;
    76  - 
    77  - struct list_head iovs;
    78  - struct list_head reqs;
    79  - 
    80  - struct lazy_pages_info *parent;
    81  - 
    82  - struct page_read pr;
    83  - 
    84  - unsigned long total_pages;
    85  - unsigned long copied_pages;
    86  - 
    87  - struct epoll_rfd lpfd;
    88  - 
    89  - struct list_head l;
    90  - 
    91  - void *buf;
    92  -};
    93  - 
    /*
     * Global lazy-pages daemon state.
     */

    /* every lazy_pages_info being served; exited ones stay here for the summary */
    static LIST_HEAD(lpis);
    /* NOTE(review): not referenced in the visible part of this file */
    static LIST_HEAD(exiting_lpis);
    /* children created by handle_fork(), waiting for complete_forks() */
    static LIST_HEAD(pending_lpis);
    /* epoll instance the uffd descriptors are registered with */
    static int epollfd;

    /* defined below; lpi_init() installs it as the epoll callback */
    static int handle_uffd_event(struct epoll_rfd *lpfd);
    101  - 
    102  -static struct lazy_pages_info *lpi_init(void)
    103  -{
    104  - struct lazy_pages_info *lpi = NULL;
    105  - 
    106  - lpi = xmalloc(sizeof(*lpi));
    107  - if (!lpi)
    108  - return NULL;
    109  - 
    110  - memset(lpi, 0, sizeof(*lpi));
    111  - INIT_LIST_HEAD(&lpi->iovs);
    112  - INIT_LIST_HEAD(&lpi->reqs);
    113  - INIT_LIST_HEAD(&lpi->l);
    114  - lpi->lpfd.revent = handle_uffd_event;
    115  - 
    116  - return lpi;
    117  -}
    118  - 
    119  -static void free_lazy_iovs(struct lazy_pages_info *lpi)
    120  -{
    121  - struct lazy_iov *p, *n;
    122  - 
    123  - list_for_each_entry_safe(p, n, &lpi->iovs, l) {
    124  - list_del(&p->l);
    125  - xfree(p);
    126  - }
    127  -}
    128  - 
    129  -static void lpi_fini(struct lazy_pages_info *lpi)
    130  -{
    131  - 
    132  - if (!lpi)
    133  - return;
    134  - free(lpi->buf);
    135  - free_lazy_iovs(lpi);
    136  - if (lpi->lpfd.fd > 0)
    137  - close(lpi->lpfd.fd);
    138  - if (lpi->pr.close)
    139  - lpi->pr.close(&lpi->pr);
    140  - free(lpi);
    141  -}
    142  - 
    143  -static int prepare_sock_addr(struct sockaddr_un *saddr)
    144  -{
    145  - int len;
    146  - 
    147  - memset(saddr, 0, sizeof(struct sockaddr_un));
    148  - 
    149  - saddr->sun_family = AF_UNIX;
    150  - len = snprintf(saddr->sun_path, sizeof(saddr->sun_path),
    151  - "%s", LAZY_PAGES_SOCK_NAME);
    152  - if (len >= sizeof(saddr->sun_path)) {
    153  - pr_err("Wrong UNIX socket name: %s\n", LAZY_PAGES_SOCK_NAME);
    154  - return -1;
    155  - }
    156  - 
    157  - return 0;
    158  -}
    159  - 
    160  -static int send_uffd(int sendfd, int pid)
    161  -{
    162  - int fd;
    163  - int ret = -1;
    164  - 
    165  - if (sendfd < 0)
    166  - return -1;
    167  - 
    168  - fd = get_service_fd(LAZY_PAGES_SK_OFF);
    169  - if (fd < 0) {
    170  - pr_err("%s: get_service_fd\n", __func__);
    171  - return -1;
    172  - }
    173  - 
    174  - mutex_lock(lazy_sock_mutex);
    175  - 
    176  - /* The "transfer protocol" is first the pid as int and then
    177  - * the FD for UFFD */
    178  - pr_debug("Sending PID %d\n", pid);
    179  - if (send(fd, &pid, sizeof(pid), 0) < 0) {
    180  - pr_perror("PID sending error");
    181  - goto out;
    182  - }
    183  - 
    184  - /* for a zombie process pid will be negative */
    185  - if (pid < 0) {
    186  - ret = 0;
    187  - goto out;
    188  - }
    189  - 
    190  - if (send_fd(fd, NULL, 0, sendfd) < 0) {
    191  - pr_err("send_fd error\n");
    192  - goto out;
    193  - }
    194  - 
    195  - ret = 0;
    196  -out:
    197  - mutex_unlock(lazy_sock_mutex);
    198  - close(fd);
    199  - return ret;
    200  -}
    201  - 
    202  -int lazy_pages_setup_zombie(int pid)
    203  -{
    204  - if (!opts.lazy_pages)
    205  - return 0;
    206  - 
    207  - if (send_uffd(0, -pid))
    208  - return -1;
    209  - 
    210  - return 0;
    211  -}
    212  - 
    213  -/* This function is used by 'criu restore --lazy-pages' */
    214  -int setup_uffd(int pid, struct task_restore_args *task_args)
    215  -{
    216  - struct uffdio_api uffdio_api;
    217  - 
    218  - if (!opts.lazy_pages) {
    219  - task_args->uffd = -1;
    220  - return 0;
    221  - }
    222  - 
    223  - /*
    224  - * Open userfaulfd FD which is passed to the restorer blob and
    225  - * to a second process handling the userfaultfd page faults.
    226  - */
    227  - task_args->uffd = syscall(SYS_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    228  - if (task_args->uffd < 0) {
    229  - pr_perror("Unable to open an userfaultfd descriptor");
    230  - return -1;
    231  - }
    232  - 
    233  - /*
    234  - * Check if the UFFD_API is the one which is expected
    235  - */
    236  - uffdio_api.api = UFFD_API;
    237  - uffdio_api.features = kdat.uffd_features & NEED_UFFD_API_FEATURES;
    238  - if (ioctl(task_args->uffd, UFFDIO_API, &uffdio_api)) {
    239  - pr_err("Checking for UFFDIO_API failed.\n");
    240  - goto err;
    241  - }
    242  - if (uffdio_api.api != UFFD_API) {
    243  - pr_err("Result of looking up UFFDIO_API does not match: %Lu\n", uffdio_api.api);
    244  - goto err;
    245  - }
    246  - 
    247  - if (send_uffd(task_args->uffd, pid) < 0)
    248  - goto err;
    249  - 
    250  - return 0;
    251  -err:
    252  - close(task_args->uffd);
    253  - return -1;
    254  -}
    255  - 
    256  -int prepare_lazy_pages_socket(void)
    257  -{
    258  - int fd, new_fd;
    259  - int len;
    260  - struct sockaddr_un sun;
    261  - 
    262  - if (!opts.lazy_pages)
    263  - return 0;
    264  - 
    265  - if (prepare_sock_addr(&sun))
    266  - return -1;
    267  - 
    268  - lazy_sock_mutex = shmalloc(sizeof(*lazy_sock_mutex));
    269  - if (!lazy_sock_mutex)
    270  - return -1;
    271  - 
    272  - mutex_init(lazy_sock_mutex);
    273  - 
    274  - if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0)
    275  - return -1;
    276  - 
    277  - new_fd = install_service_fd(LAZY_PAGES_SK_OFF, fd);
    278  - close(fd);
    279  - if (new_fd < 0)
    280  - return -1;
    281  - 
    282  - len = offsetof(struct sockaddr_un, sun_path) + strlen(sun.sun_path);
    283  - if (connect(new_fd, (struct sockaddr *) &sun, len) < 0) {
    284  - pr_perror("connect to %s failed", sun.sun_path);
    285  - close(new_fd);
    286  - return -1;
    287  - }
    288  - 
    289  - return 0;
    290  -}
    291  - 
    /*
     * Create a UNIX stream socket, bind it to saddr (removing a stale socket
     * file first) and start listening with a backlog of 10.
     *
     * Returns the listening descriptor, or -1 on failure.
     */
    static int server_listen(struct sockaddr_un *saddr)
    {
        int addrlen;
        int sk;

        sk = socket(AF_UNIX, SOCK_STREAM, 0);
        if (sk < 0)
            return -1;

        unlink(saddr->sun_path);

        addrlen = offsetof(struct sockaddr_un, sun_path) + strlen(saddr->sun_path);

        if (bind(sk, (struct sockaddr *) saddr, addrlen) >= 0 &&
            listen(sk, 10) >= 0)
            return sk;

        close(sk);
        return -1;
    }
    318  - 
    319  -static MmEntry *init_mm_entry(struct lazy_pages_info *lpi)
    320  -{
    321  - struct cr_img *img;
    322  - MmEntry *mm;
    323  - int ret;
    324  - 
    325  - img = open_image(CR_FD_MM, O_RSTR, lpi->pid);
    326  - if (!img)
    327  - return NULL;
    328  - 
    329  - ret = pb_read_one_eof(img, &mm, PB_MM);
    330  - close_image(img);
    331  - if (ret == -1)
    332  - return NULL;
    333  - lp_debug(lpi, "Found %zd VMAs in image\n", mm->n_vmas);
    334  - 
    335  - return mm;
    336  -}
    337  - 
    338  -static struct lazy_iov *find_lazy_iov(struct lazy_pages_info *lpi,
    339  - unsigned long addr)
    340  -{
    341  - struct lazy_iov *iov;
    342  - 
    343  - list_for_each_entry(iov, &lpi->iovs, l)
    344  - if (addr >= iov->base && addr < iov->base + iov->len)
    345  - return iov;
    346  - 
    347  - return NULL;
    348  -}
    349  - 
    350  -static int split_iov(struct lazy_iov *iov, unsigned long addr, bool new_below)
    351  -{
    352  - struct lazy_iov *new;
    353  - 
    354  - new = xzalloc(sizeof(*new));
    355  - if (!new)
    356  - return -1;
    357  - 
    358  - if (new_below) {
    359  - new->base = iov->base;
    360  - new->img_base = iov->img_base;
    361  - new->len = addr - iov->base;
    362  - iov->base = addr;
    363  - iov->img_base += new->len;
    364  - iov->len -= new->len;
    365  - list_add_tail(&new->l, &iov->l);
    366  - } else {
    367  - new->base = addr;
    368  - new->img_base = iov->img_base + addr - iov->base;
    369  - new->len = iov->len - (addr - iov->base);
    370  - iov->len -= new->len;
    371  - list_add(&new->l, &iov->l);
    372  - }
    373  - 
    374  - return 0;
    375  -}
    376  - 
    377  -static int copy_lazy_iovs(struct lazy_pages_info *src,
    378  - struct lazy_pages_info *dst)
    379  -{
    380  - struct lazy_iov *iov, *new, *n;
    381  - int max_iov_len = 0;
    382  - 
    383  - list_for_each_entry(iov, &src->iovs, l) {
    384  - new = xzalloc(sizeof(*new));
    385  - if (!new)
    386  - return -1;
    387  - 
    388  - new->base = iov->base;
    389  - new->img_base = iov->img_base;
    390  - new->len = iov->len;
    391  - 
    392  - list_add_tail(&new->l, &dst->iovs);
    393  - 
    394  - if (new->len > max_iov_len)
    395  - max_iov_len = new->len;
    396  - }
    397  - 
    398  - if (posix_memalign(&dst->buf, PAGE_SIZE, max_iov_len))
    399  - goto free_iovs;
    400  - 
    401  - return 0;
    402  - 
    403  -free_iovs:
    404  - list_for_each_entry_safe(iov, n, &dst->iovs, l)
    405  - xfree(iov);
    406  - return -1;
    407  -}
    408  - 
    409  -/*
    410  - * Purge range (addr, addr + len) from lazy_iovs. The range may
    411  - * cover several continuous IOVs.
    412  - */
    413  -static int drop_lazy_iovs(struct lazy_pages_info *lpi, unsigned long addr,
    414  - int len)
    415  -{
    416  - struct lazy_iov *iov, *n;
    417  - 
    418  - list_for_each_entry_safe(iov, n, &lpi->iovs, l) {
    419  - unsigned long start = iov->base;
    420  - unsigned long end = start + iov->len;
    421  - 
    422  - if (len <= 0 || addr + len < start)
    423  - break;
    424  - 
    425  - if (addr >= end)
    426  - continue;
    427  - 
    428  - if (addr < start) {
    429  - len -= (start - addr);
    430  - addr = start;
    431  - }
    432  - 
    433  - /*
    434  - * The range completely fits into the current IOV.
    435  - * If addr equals iov_base we just "drop" the
    436  - * beginning of the IOV. Otherwise, we make the IOV to
    437  - * end at addr, and add a new IOV start starts at
    438  - * addr + len.
    439  - */
    440  - if (addr + len < end) {
    441  - if (addr == start) {
    442  - iov->base += len;
    443  - iov->img_base += len;
    444  - iov->len -= len;
    445  - } else {
    446  - if (split_iov(iov, addr + len, false))
    447  - return -1;
    448  - iov->len -= len;
    449  - }
    450  - break;
    451  - }
    452  - 
    453  - /*
    454  - * The range spawns beyond the end of the current IOV.
    455  - * If addr equals iov_base we just "drop" the entire
    456  - * IOV. Otherwise, we cut the beginning of the IOV
    457  - * and continue to the next one with the updated range
    458  - */
    459  - if (addr == start) {
    460  - list_del(&iov->l);
    461  - xfree(iov);
    462  - } else {
    463  - iov->len -= (end - addr);
    464  - }
    465  - 
    466  - len -= (end - addr);
    467  - addr = end;
    468  - }
    469  - 
    470  - return 0;
    471  -}
    472  - 
    473  -static int remap_lazy_iovs(struct lazy_pages_info *lpi, unsigned long from,
    474  - unsigned long to, unsigned long len)
    475  -{
    476  - unsigned long off = to - from;
    477  - struct lazy_iov *iov, *n, *p;
    478  - LIST_HEAD(remaps);
    479  - 
    480  - list_for_each_entry_safe(iov, n, &lpi->iovs, l) {
    481  - unsigned long iov_end = iov->base + iov->len;
    482  - 
    483  - if (from > iov_end)
    484  - continue;
    485  - 
    486  - if (len <= 0 || from + len < iov->base)
    487  - break;
    488  - 
    489  - if (from < iov->base) {
    490  - len -= (iov->base - from);
    491  - from = iov->base;
    492  - }
    493  - 
    494  - if (from > iov->base)
    495  - if (split_iov(iov, from, true))
    496  - return -1;
    497  - if (from + len < iov_end)
    498  - if (split_iov(iov, from + len, false))
    499  - return -1;
    500  - 
    501  - list_safe_reset_next(iov, n, l);
    502  - 
    503  - /* here we have iov->base = from, iov_end <= from + len */
    504  - from = iov_end;
    505  - len -= iov->len;
    506  - iov->base += off;
    507  - list_move_tail(&iov->l, &remaps);
    508  - }
    509  - 
    510  - list_for_each_entry_safe(iov, n, &remaps, l) {
    511  - list_for_each_entry(p, &lpi->iovs, l) {
    512  - if (iov->base < p->base) {
    513  - list_move_tail(&iov->l, &p->l);
    514  - break;
    515  - }
    516  - if (list_is_last(&p->l, &lpi->iovs) &&
    517  - iov->base > p->base) {
    518  - list_move(&iov->l, &p->l);
    519  - break;
    520  - }
    521  - }
    522  - }
    523  - 
    524  - return 0;
    525  -}
    526  - 
    527  -/*
    528  - * Create a list of IOVs that can be handled using userfaultfd. The
    529  - * IOVs generally correspond to lazy pagemap entries, except the cases
    530  - * when a single pagemap entry covers several VMAs. In those cases
    531  - * IOVs are split at VMA boundaries because UFFDIO_COPY may be done
    532  - * only inside a single VMA.
    533  - * We assume here that pagemaps and VMAs are sorted.
    534  - */
    535  -static int collect_lazy_iovs(struct lazy_pages_info *lpi)
    536  -{
    537  - struct page_read *pr = &lpi->pr;
    538  - struct lazy_iov *iov, *n;
    539  - MmEntry *mm;
    540  - int nr_pages = 0, n_vma = 0, max_iov_len = 0;
    541  - int ret = -1;
    542  - unsigned long start, end, len;
    543  - 
    544  - mm = init_mm_entry(lpi);
    545  - if (!mm)
    546  - return -1;
    547  - 
    548  - while (pr->advance(pr)) {
    549  - if (!pagemap_lazy(pr->pe))
    550  - continue;
    551  - 
    552  - start = pr->pe->vaddr;
    553  - end = start + pr->pe->nr_pages * page_size();
    554  - nr_pages += pr->pe->nr_pages;
    555  - 
    556  - for (; n_vma < mm->n_vmas; n_vma++) {
    557  - VmaEntry *vma = mm->vmas[n_vma];
    558  - 
    559  - if (start >= vma->end)
    560  - continue;
    561  - 
    562  - iov = xzalloc(sizeof(*iov));
    563  - if (!iov)
    564  - goto free_iovs;
    565  - 
    566  - len = min_t(uint64_t, end, vma->end) - start;
    567  - iov->base = start;
    568  - iov->img_base = start;
    569  - iov->len = len;
    570  - list_add_tail(&iov->l, &lpi->iovs);
    571  - 
    572  - if (len > max_iov_len)
    573  - max_iov_len = len;
    574  - 
    575  - if (end <= vma->end)
    576  - break;
    577  - 
    578  - start = vma->end;
    579  - }
    580  - }
    581  - 
    582  - if (posix_memalign(&lpi->buf, PAGE_SIZE, max_iov_len))
    583  - goto free_iovs;
    584  - 
    585  - ret = nr_pages;
    586  - goto free_mm;
    587  - 
    588  -free_iovs:
    589  - list_for_each_entry_safe(iov, n, &lpi->iovs, l)
    590  - xfree(iov);
    591  -free_mm:
    592  - mm_entry__free_unpacked(mm, NULL);
    593  - 
    594  - return ret;
    595  -}
    596  - 
    597  -static int uffd_io_complete(struct page_read *pr, unsigned long vaddr, int nr);
    598  - 
    599  -static int ud_open(int client, struct lazy_pages_info **_lpi)
    600  -{
    601  - struct lazy_pages_info *lpi;
    602  - int ret = -1;
    603  - int pr_flags = PR_TASK;
    604  - 
    605  - lpi = lpi_init();
    606  - if (!lpi)
    607  - goto out;
    608  - 
    609  - /* The "transfer protocol" is first the pid as int and then
    610  - * the FD for UFFD */
    611  - ret = recv(client, &lpi->pid, sizeof(lpi->pid), 0);
    612  - if (ret != sizeof(lpi->pid)) {
    613  - if (ret < 0)
    614  - pr_perror("PID recv error");
    615  - else
    616  - pr_err("PID recv: short read\n");
    617  - goto out;
    618  - }
    619  - 
    620  - if (lpi->pid < 0) {
    621  - pr_debug("Zombie PID: %d\n", lpi->pid);
    622  - lpi_fini(lpi);
    623  - return 0;
    624  - }
    625  - 
    626  - lpi->lpfd.fd = recv_fd(client);
    627  - if (lpi->lpfd.fd < 0) {
    628  - pr_err("recv_fd error\n");
    629  - goto out;
    630  - }
    631  - pr_debug("Received PID: %d, uffd: %d\n", lpi->pid, lpi->lpfd.fd);
    632  - 
    633  - if (opts.use_page_server)
    634  - pr_flags |= PR_REMOTE;
    635  - ret = open_page_read(lpi->pid, &lpi->pr, pr_flags);
    636  - if (ret <= 0) {
    637  - ret = -1;
    638  - goto out;
    639  - }
    640  - 
    641  - lpi->pr.io_complete = uffd_io_complete;
    642  - 
    643  - /*
    644  - * Find the memory pages belonging to the restored process
    645  - * so that it is trackable when all pages have been transferred.
    646  - */
    647  - ret = collect_lazy_iovs(lpi);
    648  - if (ret < 0)
    649  - goto out;
    650  - lpi->total_pages = ret;
    651  - 
    652  - lp_debug(lpi, "Found %ld pages to be handled by UFFD\n", lpi->total_pages);
    653  - 
    654  - list_add_tail(&lpi->l, &lpis);
    655  - *_lpi = lpi;
    656  - 
    657  - return 0;
    658  - 
    659  -out:
    660  - lpi_fini(lpi);
    661  - return -1;
    662  -}
    663  - 
    664  -static int handle_exit(struct lazy_pages_info *lpi)
    665  -{
    666  - lp_debug(lpi, "EXIT\n");
    667  - if (epoll_del_rfd(epollfd, &lpi->lpfd))
    668  - return -1;
    669  - free_lazy_iovs(lpi);
    670  - close(lpi->lpfd.fd);
    671  - 
    672  - /* keep it for summary */
    673  - list_move_tail(&lpi->l, &lpis);
    674  - 
    675  - return 0;
    676  -}
    677  - 
    678  -static int uffd_copy(struct lazy_pages_info *lpi, __u64 address, int nr_pages)
    679  -{
    680  - struct uffdio_copy uffdio_copy;
    681  - unsigned long len = nr_pages * page_size();
    682  - int rc;
    683  - 
    684  - uffdio_copy.dst = address;
    685  - uffdio_copy.src = (unsigned long)lpi->buf;
    686  - uffdio_copy.len = len;
    687  - uffdio_copy.mode = 0;
    688  - uffdio_copy.copy = 0;
    689  - 
    690  - lp_debug(lpi, "uffd_copy: 0x%llx/%ld\n", uffdio_copy.dst, len);
    691  - rc = ioctl(lpi->lpfd.fd, UFFDIO_COPY, &uffdio_copy);
    692  - if (rc) {
    693  - if (errno == ENOSPC) {
    694  - handle_exit(lpi);
    695  - return 0;
    696  - }
    697  - if (uffdio_copy.copy != -EEXIST) {
    698  - lp_debug(lpi, "uffd_copy: rc:%d copy:%Ld, errno:%d\n",
    699  - rc, uffdio_copy.copy, errno);
    700  - return -1;
    701  - }
    702  - } else if (uffdio_copy.copy != len) {
    703  - lp_err(lpi, "UFFDIO_COPY unexpected size %Ld\n", uffdio_copy.copy);
    704  - return -1;
    705  - }
    706  - 
    707  - lpi->copied_pages += nr_pages;
    708  - 
    709  - return 0;
    710  -}
    711  - 
    712  -static int complete_page_fault(struct lazy_pages_info *lpi, unsigned long img_addr, int nr)
    713  -{
    714  - unsigned long addr = 0;
    715  - struct lp_req *req;
    716  - 
    717  - list_for_each_entry(req, &lpi->reqs, l) {
    718  - if (req->img_addr == img_addr) {
    719  - addr = req->addr;
    720  - list_del(&req->l);
    721  - xfree(req);
    722  - break;
    723  - }
    724  - }
    725  - 
    726  - BUG_ON(!addr);
    727  - 
    728  - if (uffd_copy(lpi, addr, nr))
    729  - return -1;
    730  - 
    731  - return drop_lazy_iovs(lpi, addr, nr * PAGE_SIZE);
    732  -}
    733  - 
    734  -static int uffd_io_complete(struct page_read *pr, unsigned long img_addr, int nr)
    735  -{
    736  - struct lazy_pages_info *lpi;
    737  - 
    738  - lpi = container_of(pr, struct lazy_pages_info, pr);
    739  - return complete_page_fault(lpi, img_addr, nr);
    740  -}
    741  - 
    742  -static int uffd_zero(struct lazy_pages_info *lpi, __u64 address, int nr_pages)
    743  -{
    744  - struct uffdio_zeropage uffdio_zeropage;
    745  - unsigned long len = page_size() * nr_pages;
    746  - int rc;
    747  - 
    748  - uffdio_zeropage.range.start = address;
    749  - uffdio_zeropage.range.len = len;
    750  - uffdio_zeropage.mode = 0;
    751  - 
    752  - lp_debug(lpi, "zero page at 0x%llx\n", address);
    753  - rc = ioctl(lpi->lpfd.fd, UFFDIO_ZEROPAGE, &uffdio_zeropage);
    754  - if (rc) {
    755  - lp_err(lpi, "UFFDIO_ZEROPAGE error %d\n", rc);
    756  - return -1;
    757  - }
    758  - 
    759  - return 0;
    760  -}
    761  - 
    762  -/*
    763  - * Seek for the requested address in the pagemap. If it is found, the
    764  - * subsequent call to pr->page_read will bring us the data. If the
    765  - * address is not found in the pagemap, but no error occured, the
    766  - * address should be mapped to zero pfn.
    767  - *
    768  - * Returns 0 for zero pages, 1 for "real" pages and negative value on
    769  - * error
    770  - */
    771  -static int uffd_seek_pages(struct lazy_pages_info *lpi, __u64 address, int nr)
    772  -{
    773  - int ret;
    774  - 
    775  - lpi->pr.reset(&lpi->pr);
    776  - 
    777  - ret = lpi->pr.seek_pagemap(&lpi->pr, address);
    778  - if (!ret) {
    779  - lp_err(lpi, "no pagemap covers %llx\n", address);
    780  - return ret;
    781  - }
    782  - 
    783  - lpi->pr.skip_pages(&lpi->pr, address - lpi->pr.pe->vaddr);
    784  - 
    785  - return 0;
    786  -}
    787  - 
    788  -static int uffd_handle_pages(struct lazy_pages_info *lpi, __u64 address, int nr, unsigned flags)
    789  -{
    790  - int ret;
    791  - 
    792  - ret = uffd_seek_pages(lpi, address, nr);
    793  - if (ret)
    794  - return ret;
    795  - 
    796  - ret = lpi->pr.read_pages(&lpi->pr, address, nr, lpi->buf, flags);
    797  - if (ret <= 0) {
    798  - lp_err(lpi, "failed reading pages at %llx\n", address);
    799  - return ret;
    800  - }
    801  - 
    802  - return 0;
    803  -}
    804  - 
    805  -static int handle_remaining_pages(struct lazy_pages_info *lpi)
    806  -{
    807  - struct lazy_iov *iov;
    808  - struct lp_req *req;
    809  - int nr_pages, err;
    810  - 
    811  - iov = list_first_entry(&lpi->iovs, struct lazy_iov, l);
    812  - nr_pages = iov->len / PAGE_SIZE;
    813  - 
    814  - req = xzalloc(sizeof(*req));
    815  - if (!req)
    816  - return -1;
    817  - 
    818  - req->addr = iov->base;
    819  - req->img_addr = iov->img_base;
    820  - list_add(&req->l, &lpi->reqs);
    821  - 
    822  - err = uffd_handle_pages(lpi, req->img_addr, nr_pages, 0);
    823  - if (err < 0) {
    824  - lp_err(lpi, "Error during UFFD copy\n");
    825  - return -1;
    826  - }
    827  - 
    828  - return 0;
    829  -}
    830  - 
    831  -static int handle_remove(struct lazy_pages_info *lpi, struct uffd_msg *msg)
    832  -{
    833  - struct uffdio_range unreg;
    834  - 
    835  - unreg.start = msg->arg.remove.start;
    836  - unreg.len = msg->arg.remove.end - msg->arg.remove.start;
    837  - 
    838  - lp_debug(lpi, "%s: %Lx(%Lx)\n",
    839  - msg->event == UFFD_EVENT_REMOVE ? "REMOVE" : "UNMAP",
    840  - unreg.start, unreg.len);
    841  - 
    842  - /*
    843  - * The REMOVE event does not change the VMA, so we need to
    844  - * make sure that we won't handle #PFs in the removed
    845  - * range. With UNMAP, there's no VMA to worry about
    846  - */
    847  - if (msg->event == UFFD_EVENT_REMOVE &&
    848  - ioctl(lpi->lpfd.fd, UFFDIO_UNREGISTER, &unreg)) {
    849  - pr_perror("Failed to unregister (%llx - %llx)", unreg.start,
    850  - unreg.start + unreg.len);
    851  - return -1;
    852  - }
    853  - 
    854  - return drop_lazy_iovs(lpi, unreg.start, unreg.len);
    855  -}
    856  - 
    857  -static int handle_remap(struct lazy_pages_info *lpi, struct uffd_msg *msg)
    858  -{
    859  - unsigned long from = msg->arg.remap.from;
    860  - unsigned long to = msg->arg.remap.to;
    861  - unsigned long len = msg->arg.remap.len;
    862  - 
    863  - lp_debug(lpi, "REMAP: %lx -> %lx (%ld)\n", from , to, len);
    864  - 
    865  - return remap_lazy_iovs(lpi, from, to, len);
    866  -}
    867  - 
    868  -static int handle_fork(struct lazy_pages_info *parent_lpi, struct uffd_msg *msg)
    869  -{
    870  - struct lazy_pages_info *lpi;
    871  - int uffd = msg->arg.fork.ufd;
    872  - 
    873  - lp_debug(parent_lpi, "FORK: child with ufd=%d\n", uffd);
    874  - 
    875  - lpi = lpi_init();
    876  - if (!lpi)
    877  - return -1;
    878  - 
    879  - if (copy_lazy_iovs(parent_lpi, lpi))
    880  - goto out;
    881  - 
    882  - lpi->pid = parent_lpi->pid;
    883  - lpi->lpfd.fd = uffd;
    884  - lpi->parent = parent_lpi->parent ? parent_lpi->parent : parent_lpi;
    885  - lpi->copied_pages = lpi->parent->copied_pages;
    886  - lpi->total_pages = lpi->parent->total_pages;
    887  - list_add_tail(&lpi->l, &pending_lpis);
    888  - 
    889  - dup_page_read(&lpi->parent->pr, &lpi->pr);
    890  - 
    891  - return 1;
    892  - 
    893  -out:
    894  - lpi_fini(lpi);
    895  - return -1;
    896  -}
    897  - 
    898  -static int complete_forks(int epollfd, struct epoll_event **events, int *nr_fds)
    899  -{
    900  - struct lazy_pages_info *lpi, *n;
    901  - 
    902  - list_for_each_entry(lpi, &pending_lpis, l)
    903  - (*nr_fds)++;
    904  - 
    905  - *events = xrealloc(*events, sizeof(struct epoll_event) * (*nr_fds));
    906  - if (!*events)
    907  - return -1;
    908  - 
    909  - list_for_each_entry_safe(lpi, n, &pending_lpis, l) {
    910  - if (epoll_add_rfd(epollfd, &lpi->lpfd))
    911  - return -1;
    912  - 
    913  - list_del_init(&lpi->l);
    914  - list_add_tail(&lpi->l, &lpis);
    915  - }
    916  - 
    917  - return 0;
    918  -}
    919  - 
    920  -static int handle_page_fault(struct lazy_pages_info *lpi, struct uffd_msg *msg)
    921  -{
    922  - struct lp_req *req;
    923  - struct lazy_iov *iov;
    924  - __u64 address;
    925  - int ret;
    926  - 
    927  - /* Align requested address to the next page boundary */
    928  - address = msg->arg.pagefault.address & ~(page_size() - 1);
    929  - lp_debug(lpi, "#PF at 0x%llx\n", address);
    930  - 
    931  - list_for_each_entry(req, &lpi->reqs, l)
    932  - if (req->addr == address)
    933  - return 0;
    934  - 
    935  - iov = find_lazy_iov(lpi, address);
    936  - if (!iov)
    937  - return uffd_zero(lpi, address, 1);
    938  - 
    939  - req = xzalloc(sizeof(*req));
    940  - if (!req)
    941  - return -1;
    942  - req->addr = address;
    943  - req->img_addr = iov->img_base + (address - iov->base);
    944  - list_add(&req->l, &lpi->reqs);
    945  - 
    946  - ret = uffd_handle_pages(lpi, req->img_addr, 1, PR_ASYNC | PR_ASAP);
    947  - if (ret < 0) {
    948  - lp_err(lpi, "Error during regular page copy\n");
    949  - return -1;
    950  - }
    951  - 
    952  - return 0;
    953  -}
    954  - 
    955  -static int handle_uffd_event(struct epoll_rfd *lpfd)
    956  -{
    957  - struct lazy_pages_info *lpi;
    958  - struct uffd_msg msg;
    959  - int ret;
    960  - 
    961  - lpi = container_of(lpfd, struct lazy_pages_info, lpfd);
    962  - 
    963  - ret = read(lpfd->fd, &msg, sizeof(msg));
    964  - if (!ret)
    965  - return 1;
    966  - 
    967  - if (ret != sizeof(msg)) {
    968  - /* we've already handled the page fault for another thread */
    969  - if (errno == EAGAIN)
    970  - return 0;
    971  - if (ret < 0)
    972  - lp_perror(lpi, "Can't read uffd message");
    973  - else
    974  - lp_err(lpi, "Can't read uffd message: short read");
    975  - return -1;
    976  - }
    977  - 
    978  - switch (msg.event) {
    979  - case UFFD_EVENT_PAGEFAULT:
    980  - return handle_page_fault(lpi, &msg);
    981  - case UFFD_EVENT_REMOVE:
    982  - case UFFD_EVENT_UNMAP:
    983  - return handle_remove(lpi, &msg);
    984  - case UFFD_EVENT_REMAP:
    985  - return handle_remap(lpi, &msg);
    986  - case UFFD_EVENT_FORK:
    987  - return handle_fork(lpi, &msg);
    988  - default:
    989  - lp_err(lpi, "unexpected uffd event %u\n", msg.event);
    990  - return -1;
    991  - }
    992  - 
    993  - return 0;
    994  -}
    995  - 
    996  -static int lazy_pages_summary(struct lazy_pages_info *lpi)
    997  -{
    998  - lp_debug(lpi, "UFFD transferred pages: (%ld/%ld)\n",
    999  - lpi->copied_pages, lpi->total_pages);
    1000  - 
    1001  -#if 0
    1002  - if ((lpi->copied_pages != lpi->total_pages) && (lpi->total_pages > 0)) {
    1003  - lp_warn(lpi, "Only %ld of %ld pages transferred via UFFD\n"
    1004  - "Something probably went wrong.\n",
    1005  - lpi->copied_pages, lpi->total_pages);
    1006  - return 1;
    1007  - }
    1008  -#endif
    1009  - 
    1010  - return 0;
    1011  -}
    1012  - 
    1013  -#define POLL_TIMEOUT 1000
    1014  - 
    1015  -static int handle_requests(int epollfd, struct epoll_event *events, int nr_fds)
    1016  -{
    1017  - struct lazy_pages_info *lpi;
    1018  - int poll_timeout = POLL_TIMEOUT;
    1019  - int ret;
    1020  - 
    1021  - for (;;) {
    1022  - bool remaining = false;
    1023  - 
    1024  - ret = epoll_run_rfds(epollfd, events, nr_fds, poll_timeout);
    1025  - if (ret < 0)
    1026  - goto out;
    1027  - if (ret > 0) {
    1028  - if (complete_forks(epollfd, &events, &nr_fds))
    1029  - return -1;
    1030  - continue;
    1031  - }
    1032  - 
    1033  - if (poll_timeout)
    1034  - pr_debug("Start handling remaining pages\n");
    1035  - 
    1036  - poll_timeout = 0;
    1037  - list_for_each_entry(lpi, &lpis, l) {
    1038  - if (!list_empty(&lpi->iovs)) {
    1039  - remaining = true;
    1040  - ret = handle_remaining_pages(lpi);
    1041  - if (ret < 0)
    1042  - goto out;
    1043  - break;
    1044  - }
    1045  - }
    1046  - 
    1047  - if (!remaining)
    1048  - break;
    1049  - }
    1050  - 
    1051  - list_for_each_entry(lpi, &lpis, l)
    1052  - ret += lazy_pages_summary(lpi);
    1053  - 
    1054  -out:
    1055  - return ret;
    1056  - 
    1057  -}
    1058  - 
    /*
     * Build the lazy-pages socket address and start listening on it.
     *
     * Returns the listening descriptor, or -1 if the address could not be
     * prepared or server_listen() failed.
     */
    static int prepare_lazy_socket(void)
    {
        struct sockaddr_un saddr;
        int sk;

        if (prepare_sock_addr(&saddr))
            return -1;

        pr_debug("Waiting for incoming connections on %s\n", saddr.sun_path);

        sk = server_listen(&saddr);
        if (sk >= 0)
            return sk;

        pr_perror("server_listen error");
        return -1;
    }
    1075  - 
    1076  -static int prepare_uffds(int listen, int epollfd)
    1077  -{
    1078  - int i;
    1079  - int client;
    1080  - socklen_t len;
    1081  - struct sockaddr_un saddr;
    1082  - 
    1083  - /* accept new client request */
    1084  - len = sizeof(struct sockaddr_un);
    1085  - if ((client = accept(listen, (struct sockaddr *) &saddr, &len)) < 0) {
    1086  - pr_perror("server_accept error");
    1087  - close(listen);
    1088  - return -1;
    1089  - }
    1090  - 
    1091  - for (i = 0; i < task_entries->nr_tasks; i++) {
    1092  - struct lazy_pages_info *lpi = NULL;
    1093  - if (ud_open(client, &lpi))
    1094  - goto close_uffd;
    1095  - if (lpi == NULL)
    1096  - continue;
    1097  - if (epoll_add_rfd(epollfd, &lpi->lpfd))
    1098  - goto close_uffd;
    1099  - }
    1100  - 
    1101  - close_safe(&client);
    1102  - close(listen);
    1103  - return 0;
    1104  - 
    1105  -close_uffd:
    1106  - close_safe(&client);
    1107  - close(listen);
    1108  - return -1;
    1109  -}
    1110  - 
    1111  -int cr_lazy_pages(bool daemon)
    1112  -{
    1113  - struct epoll_event *events;
    1114  - int nr_fds;
    1115  - int lazy_sk;
    1116  - int ret;
    1117  - 
    1118  - if (kerndat_uffd() || !kdat.has_uffd)
    1119  - return -1;
    1120  - 
    1121  - if (prepare_dummy_pstree())
    1122  - return -1;
    1123  - 
    1124  - lazy_sk = prepare_lazy_socket();
    1125  - if (lazy_sk < 0)
    1126  - return -1;
    1127  - 
    1128  - if (daemon) {
    1129  - ret = cr_daemon(1, 0, &lazy_sk, -1);
    1130  - if (ret == -1) {
    1131  - pr_err("Can't run in the background\n");
    1132  - return -1;
    1133  - }
    1134  - if (ret > 0) { /* parent task, daemon started */
    1135  - if (opts.pidfile) {
    1136  - if (write_pidfile(ret) == -1) {
    1137  - pr_perror("Can't write pidfile");
    1138  - kill(ret, SIGKILL);
    1139  - waitpid(ret, NULL, 0);
    1140  - return -1;
    1141  - }
    1142  - }
    1143  - 
    1144  - return 0;
    1145  - }
    1146  - }
    1147  - 
    1148  - if (close_status_fd())
    1149  - return -1;
    1150  - 
    1151  - nr_fds = task_entries->nr_tasks + (opts.use_page_server ? 1 : 0);
    1152  - epollfd = epoll_prepare(nr_fds, &events);
    1153  - if (epollfd < 0)
    1154  - return -1;
    1155  - 
    1156  - if (prepare_uffds(lazy_sk, epollfd))
    1157  - return -1;
    1158  - 
    1159  - if (opts.use_page_server) {
    1160  - if (connect_to_page_server_to_recv(epollfd))
    1161  - return -1;
    1162  - }
    1163  - 
    1164  - ret = handle_requests(epollfd, events, nr_fds);
    1165  - 
    1166  - return ret;
    1167  -}
    1168  - 
Please wait...
Page is in error, reload to recover