zoukankan      html  css  js  c++  java
  • vhost 控制平面 + handle_kick + VhostOps

    vhost是virtio的另一种方案,用于跳过qemu,减少qemu和内核之间上下文切换的开销,对于网络IO而言提升尤其明显。vhost目前有两种实现方案,内核态和用户态,本文重点讨论内核态的vhost
    
    vhost内核模块主要处理数据面的事情,控制面上还是交给qemu,vhost的数据结构如下
    
    /*
     * Control-plane view of one vhost device: its memory map, its virtqueues,
     * the negotiated feature bits, dirty-log state and the backend ops table.
     */
    struct vhost_dev {
        MemoryListener memory_listener;  /* MemoryListener is the set of callbacks for physical-memory operations */
        struct vhost_memory *mem;
        int n_mem_sections;
        MemoryRegionSection *mem_sections;
        struct vhost_virtqueue *vqs;  /* array of vhost_virtqueue and its length (nvqs) */
        int nvqs;
        /* the first virtqueue which would be used by this vhost dev */
        int vq_index;
        unsigned long long features;  /* features supported by the vhost device */
        unsigned long long acked_features;  /* features acked by the guest */
        unsigned long long backend_features;  /* features supported by the backend, e.g. a tap device */
        bool started;
        bool log_enabled;
        vhost_log_chunk_t *log;
        unsigned long long log_size;
        Error *migration_blocker;
        bool force;
        bool memory_changed;
        hwaddr mem_changed_start_addr;
        hwaddr mem_changed_end_addr;
        const VhostOps *vhost_ops; /* VhostOps is implemented separately for kernel and user vhost; the kernel implementation ends up issuing ioctls */
        void *opaque;
    };
     
    /* Per-virtqueue vhost state. */
    struct vhost_virtqueue {
        int kick;   /* presumably the kick eventfd (guest -> vhost); TODO confirm */
        int call;   /* presumably the call eventfd (vhost -> guest); TODO confirm */
        void *desc;
        void *avail;
        void *used;
        int num;
        unsigned long long used_phys;
        unsigned used_size;
        void *ring;
        unsigned long long ring_phys;
        unsigned ring_size;
        /* vhost_virtqueue_init() passes this notifier's fd to vhost_set_vring_call. */
        EventNotifier masked_notifier;

        /* The routine to call when the Guest pings us, or timeout. */
        vhost_work_fn_t handle_kick;
    };

    vhost的内存布局,也是由一组vhost_memory_region构成,
    struct vhost_memory_region { __u64 guest_phys_addr; __u64 memory_size; /* bytes */ __u64 userspace_addr; __u64 flags_padding; /* No flags are currently specified. */ }; /* All region addresses and sizes must be 4K aligned. */ #define VHOST_PAGE_SIZE 0x1000 struct vhost_memory { __u32 nregions; __u32 padding; struct vhost_memory_region regions[0]; };

    1、/dev/vhost-net实现ioctl操作,这样vhost_kernel_call调用/dev/vhost-net的ioctl 发送ring_kick或者ring_call

    2、怎么实现类似kvm_vm_ioctl(...,KVM_IOEVENTFD,...)一样的操作

    3、怎么实现vhost_dev的VhostOps的vhost_set_vring_kick和vhost_set_vring_call

    kernel vhost方式通过vhost_vring_ioctl 设置VHOST_SET_VRING_CALL,设置irqfd,把中断注入guest;通过 VHOST_SET_VRING_KICK,设置ioeventfd,获取guest notify

    4、非vhost 通过kvm_irqchip_assign_irqfd最终调用kvm_vm_ioctl来设置kvm模块的irqfd的

     

     

     handle_kick 

    /*
     * Initialise a vhost_poll.
     *
     * Records the event mask and owning device, clears the wait-queue head,
     * installs vhost_poll_wakeup / vhost_poll_func as the wait-queue and
     * poll-table callbacks, and makes fn the work function of poll->work.
     */
    void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
                 __poll_t mask, struct vhost_dev *dev)
    {
        /* Plain bookkeeping fields. */
        poll->dev = dev;
        poll->mask = mask;
        poll->wqh = NULL;

        /* Callback hooks. */
        init_poll_funcptr(&poll->table, vhost_poll_func);
        init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);

        /* The embedded work item runs fn. */
        vhost_work_init(&poll->work, fn);
    }
    /*
     * Initialise a vhost_work: clear the VHOST_WORK_QUEUED bit in work->flags
     * and record fn as the function this work item runs.
     */
    void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn)
    {
        clear_bit(VHOST_WORK_QUEUED, &work->flags);
        work->fn = fn;
    }

     

    static int vhost_net_open(struct inode *inode, struct file *f)
    {
        struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
        struct vhost_dev *dev;
        int r;
     
        if (!n)
            return -ENOMEM;
     
        dev = &n->dev;
        n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick; /* TX virtqueue->kick的callback函数 */
        n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick; /* RX virtqueue->kick的callback函数 */
        r = vhost_dev_init(dev, n->vqs, VHOST_NET_VQ_MAX);
     
        vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev);  /* 初始化vhost_net的TX vhost_poll */
        vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);   /* 初始化vhost_net的RX vhost_poll */
     
     
        return 0;
    void vhost_dev_init(struct vhost_dev *dev,
                struct vhost_virtqueue **vqs, int nvqs,
                int iov_limit, int weight, int byte_weight,
                bool use_worker,
                int (*msg_handler)(struct vhost_dev *dev,
                           struct vhost_iotlb_msg *msg))
    {
     
    
        for (i = 0; i < dev->nvqs; ++i) {
            vq = dev->vqs[i];
            vq->log = NULL;
            vq->indirect = NULL;
            vq->heads = NULL;
            vq->dev = dev;
            mutex_init(&vq->mutex);
            vhost_vq_reset(dev, vq);
            if (vq->handle_kick)
                vhost_poll_init(&vq->poll, vq->handle_kick,
                        EPOLLIN, dev);
        }
    }

    VhostOps

    vhost-backend.c    kernel_ops   

    vhost-user.c   user_ops 

    可以看出来一个是内核态用的,一个是给用户态用的(vhost-user,ovs+dpdk)

    static const VhostOps kernel_ops = {
        .backend_type = VHOST_BACKEND_TYPE_KERNEL,
        ....
        .vhost_set_mem_table = vhost_kernel_set_mem_table,
        .vhost_set_vring_addr = vhost_kernel_set_vring_addr,
        ....
    }
     
    const VhostOps user_ops = {
        .backend_type = VHOST_BACKEND_TYPE_USER,
        ...
        .vhost_set_mem_table = vhost_user_set_mem_table,
        .vhost_set_vring_addr = vhost_user_set_vring_addr,
        ...

    vhost_set_mem_table 和 vhost_set_vring_addr 非常重要,用来实现共享内存:vhost-user 的基础就是 vhost-user 进程和 QEMU 进程之间通过共享内存交换数据。

     virtio_net_vhost_status-->vhost_net_start--> vhost_net_start_one-->vhost_dev_start ---> vhost_set_mem_table

     hw/virtio/vhost-backend.c:294:static const VhostOps kernel_ops = {

    hw/virtio/vhost-user.c:2357:const VhostOps user_ops = {
    include/hw/virtio/vhost-backend.h:175:extern const VhostOps user_ops;

    Qemu层 -- eventfd 设置

    ioctl命令实现

    /* Set eventfd to poll for added buffers */
    #define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file)
    /* Set eventfd to signal when buffers have been used */
    #define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file)
    /* Set eventfd to signal an error */
    #define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file)
    /* Set busy loop timeout (in us) */
    #define VHOST_SET_VRING_BUSYLOOP_TIMEOUT _IOW(VHOST_VIRTIO, 0x23, \
                         struct vhost_vring_state)
    /* Get busy loop timeout (in us) */
    #define VHOST_GET_VRING_BUSYLOOP_TIMEOUT _IOW(VHOST_VIRTIO, 0x24, \
                         struct vhost_vring_state)

    vhost_net的启用是在命令行的-netdev tap,…中指定vhost=on选项,其初始化流程如下:

    1. 根据“Qemu之Network Device全虚拟方案一:前端网络流的建立”一文中,tap设备的创建会调用到net_init_tap()函数;
    2. net_init_tap()其中会检查选项是否指定vhost=on,如果指定,则会调用到vhost_net_init()进行初始化;
    3. 通过open(“/dev/vhost-net”, O_RDWR)打开了vhost driver;并通过ioctl(vhost_fd)进行了一系列的初始化;
    4. 调用ioctl VHOST_SET_VRING_KICK 设置kick fd(guest ->vhost) (VirtQueue.host_notifier.fd);
    5. 调用ioctl VHOST_SET_VRING_CALL 设置call fd(vhost ->guest) (VirtQueue.guest_notifier.fd);

    vhost_vring_ioctl----eventfd_ctx_fdget

    /*
     * Handle a per-virtqueue ioctl on device d.
     *
     * The first u32 at argp is read as the virtqueue index; an index outside
     * [0, d->nvqs) yields -ENOBUFS. VHOST_SET_VRING_NUM and
     * VHOST_SET_VRING_ADDR are handed to vhost_vring_set_num_addr(); every
     * other request is processed with vq->mutex held.
     *
     * Returns 0 or a negative errno; -ENOIOCTLCMD for an unknown request.
     */
    long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
    {
        struct file *eventfp, *filep = NULL;
        bool pollstart = false, pollstop = false;
        struct eventfd_ctx *ctx = NULL;
        u32 __user *idxp = argp;
        struct vhost_virtqueue *vq;
        struct vhost_vring_state s;
        struct vhost_vring_file f;
        u32 idx;
        long r;

        r = get_user(idx, idxp);
        if (r < 0)
            return r;
        if (idx >= d->nvqs)
            return -ENOBUFS;

        idx = array_index_nospec(idx, d->nvqs);
        vq = d->vqs[idx];

        if (ioctl == VHOST_SET_VRING_NUM ||
            ioctl == VHOST_SET_VRING_ADDR) {
            return vhost_vring_set_num_addr(d, vq, ioctl, argp);
        }

        mutex_lock(&vq->mutex);

        switch (ioctl) {
        case VHOST_SET_VRING_BASE:
            /* Moving base with an active backend?
             * You don't want to do that. */
            if (vq->private_data) {
                r = -EBUSY;
                break;
            }
            if (copy_from_user(&s, argp, sizeof s)) {
                r = -EFAULT;
                break;
            }
            if (s.num > 0xffff) {
                r = -EINVAL;
                break;
            }
            vq->last_avail_idx = s.num;
            /* Forget the cached index value. */
            vq->avail_idx = vq->last_avail_idx;
            break;
        case VHOST_GET_VRING_BASE:
            s.index = idx;
            s.num = vq->last_avail_idx;
            if (copy_to_user(argp, &s, sizeof s))
                r = -EFAULT;
            break;
        case VHOST_SET_VRING_KICK:
            if (copy_from_user(&f, argp, sizeof f)) {
                r = -EFAULT;
                break;
            }
            eventfp = f.fd == VHOST_FILE_UNBIND ? NULL : eventfd_fget(f.fd);
            if (IS_ERR(eventfp)) {
                r = PTR_ERR(eventfp);
                break;
            }
            if (eventfp != vq->kick) {
                /* Kick file changed: stop polling the old one, start on the new. */
                pollstop = (filep = vq->kick) != NULL;
                pollstart = (vq->kick = eventfp) != NULL;
            } else
                /* Same file again: hand the just-obtained file to the fput() below. */
                filep = eventfp;
            break;
        case VHOST_SET_VRING_CALL:
            if (copy_from_user(&f, argp, sizeof f)) {
                r = -EFAULT;
                break;
            }
            ctx = f.fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(f.fd);
            if (IS_ERR(ctx)) {
                r = PTR_ERR(ctx);
                break;
            }

            /* After the swap, ctx holds the previous context (put below). */
            swap(ctx, vq->call_ctx.ctx);
            break;
        case VHOST_SET_VRING_ERR:
            if (copy_from_user(&f, argp, sizeof f)) {
                r = -EFAULT;
                break;
            }
            ctx = f.fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(f.fd);
            if (IS_ERR(ctx)) {
                r = PTR_ERR(ctx);
                break;
            }
            swap(ctx, vq->error_ctx);
            break;
        case VHOST_SET_VRING_ENDIAN:
            r = vhost_set_vring_endian(vq, argp);
            break;
        case VHOST_GET_VRING_ENDIAN:
            r = vhost_get_vring_endian(vq, idx, argp);
            break;
        case VHOST_SET_VRING_BUSYLOOP_TIMEOUT:
            if (copy_from_user(&s, argp, sizeof(s))) {
                r = -EFAULT;
                break;
            }
            vq->busyloop_timeout = s.num;
            break;
        case VHOST_GET_VRING_BUSYLOOP_TIMEOUT:
            s.index = idx;
            s.num = vq->busyloop_timeout;
            if (copy_to_user(argp, &s, sizeof(s)))
                r = -EFAULT;
            break;
        default:
            r = -ENOIOCTLCMD;
        }

        if (pollstop && vq->handle_kick)
            vhost_poll_stop(&vq->poll);

        if (!IS_ERR_OR_NULL(ctx))
            eventfd_ctx_put(ctx);
        if (filep)
            fput(filep);

        if (pollstart && vq->handle_kick)
            r = vhost_poll_start(&vq->poll, vq->kick);

        mutex_unlock(&vq->mutex);

        /* The flush is issued only after vq->mutex has been released. */
        if (pollstop && vq->handle_kick)
            vhost_poll_flush(&vq->poll);
        return r;
    }
    EXPORT_SYMBOL_GPL(vhost_vring_ioctl);

     qemu实现

     hw/virtio/vhost-backend.c:23:static int vhost_kernel_call(struct vhost_dev *dev, unsigned long int request

    vhost_kernel_call调用/dev/vhost-net的ioctl

    /*
     * Issue one ioctl against the kernel vhost backend of dev.
     *
     * dev->opaque carries the file descriptor as an integer value; the request
     * and its argument are forwarded to ioctl() unchanged and its return value
     * is passed back. Asserts that dev uses the kernel backend.
     */
    static int vhost_kernel_call(struct vhost_dev *dev, unsigned long int request,
                                 void *arg)
    {
        int vhost_fd;

        assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_KERNEL);

        vhost_fd = (uintptr_t) dev->opaque;
        return ioctl(vhost_fd, request, arg);
    }
    /*
     * Forward file to the kernel backend with the VHOST_SET_VRING_KICK request;
     * returns whatever vhost_kernel_call() returns.
     */
    static int vhost_kernel_set_vring_kick(struct vhost_dev *dev,
                                           struct vhost_vring_file *file)
    {
        return vhost_kernel_call(dev, VHOST_SET_VRING_KICK, file);
    }
    
    /*
     * Forward file to the kernel backend with the VHOST_SET_VRING_CALL request;
     * returns whatever vhost_kernel_call() returns.
     */
    static int vhost_kernel_set_vring_call(struct vhost_dev *dev,
                                           struct vhost_vring_file *file)
    {
        return vhost_kernel_call(dev, VHOST_SET_VRING_CALL, file);
    }

     

     

    vhost_set_vring_call
    /*
     * Set up the masked notifier of virtqueue n and register its fd with the
     * backend through vhost_set_vring_call.
     *
     * Returns 0 on success. If event_notifier_init() fails its error is
     * returned; if the backend call fails, the notifier is cleaned up again and
     * -errno is returned.
     */
    static int vhost_virtqueue_init(struct vhost_dev *dev,
                                    struct vhost_virtqueue *vq, int n)
    {
        int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
        struct vhost_vring_file file = {
            .index = vhost_vq_index,
        };
        int r = event_notifier_init(&vq->masked_notifier, 0);
        if (r < 0) {
            return r;
        }

        file.fd = event_notifier_get_fd(&vq->masked_notifier);
        r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
        if (r) {
            /* Capture errno first: the debug macro may overwrite it. */
            r = -errno;
            VHOST_OPS_DEBUG("vhost_set_vring_call failed");
            goto fail_call;
        }

        vq->dev = dev;

        return 0;
    fail_call:
        event_notifier_cleanup(&vq->masked_notifier);
        return r;
    }

     vHOST net---/dev/vhost-net 内核模块 

    [root@localhost qemu]# ls /dev/vhost-net 
    /dev/vhost-net
    [root@localhost qemu]# 
    [root@localhost dpdk-19.11]# lsof /dev/vhost-net 
    COMMAND     PID USER   FD   TYPE DEVICE SIZE/OFF  NODE NAME
    qemu-syst 49786 root   19u   CHR 10,238      0t0 83987 /dev/vhost-net
    [root@localhost dpdk-19.11]# ps -elf | grep 49786
    7 S root      49786      1 53  80   0 - 74701 poll_s 02:26 ?        00:00:09 qemu-system-aarch64 -name vm2 -daemonize -enable-kvm -M virt -cpu host -smp 2 -m 4096 -object memory-backend-file,id=mem,size=4096M,mem-path=/mnt/huge,share=on -numa node,memdev=mem -mem-prealloc -global virtio-blk-device.scsi=off -device virtio-scsi-device,id=scsi -kernel vmlinuz-4.18 --append console=ttyAMA0  root=UUID=6a09973e-e8fd-4a6d-a8c0-1deb9556f477 iommu=pt intel_iommu=on iommu.passthrough=1 -initrd initramfs-4.18 -drive file=vhuser-test1.qcow2 -serial telnet:localhost:4322,server,nowait -monitor telnet:localhost:4321,server,nowait -net nic,model=virtio,macaddr=00:16:3e:22:22:22 -net tap,id=hostnet1,script=qemu-ifup,vnet_hdr=on,vhost=on -vnc :10
    1 S root      49806      2  0  80   0 -     0 vhost_ 02:26 ?        00:00:00 [vhost-49786]
    0 S root      49846  49322  0  80   0 -  1729 pipe_w 02:27 pts/4    00:00:00 grep --color=auto 49786
    [root@localhost dpdk-19.11]# 

    dpdk vhost不需要vhost-net

    [root@localhost ~]# lsof /dev/vhost-net
    [root@localhost ~]# ps -elf | grep qemu
    3 S root      49916      1 24  80   0 - 94022 poll_s 02:29 ?        00:00:19 qemu-system-aarch64 -name vm2 -daemonize -enable-kvm -M virt -cpu host -smp 16 -m 4096 -object memory-backend-file,id=mem,size=4096M,mem-path=/mnt/huge,share=on -numa node,memdev=mem -mem-prealloc -drive file=vhuser-test1.qcow2 -global virtio-blk-device.scsi=off -device virtio-scsi-device,id=scsi -kernel vmlinuz-4.18 --append console=ttyAMA0  root=UUID=6a09973e-e8fd-4a6d-a8c0-1deb9556f477 iommu=pt intel_iommu=on iommu.passthrough=1 -initrd initramfs-4.18 -serial telnet:localhost:4322,server,nowait -monitor telnet:localhost:4321,server,nowait -chardev socket,id=char0,path=/tmp/vhost1,server -netdev type=vhost-user,id=netdev0,chardev=char0,vhostforce -device virtio-net-pci,netdev=netdev0,mac=52:54:00:00:00:01,mrg_rxbuf=on,rx_queue_size=256,tx_queue_size=256 -vnc :10
    0 S root      49991  49249  0  80   0 -  1729 pipe_w 02:30 pts/3    00:00:00 grep --color=auto qemu
    [root@localhost ~]# 

    当给一个Qemu进程传递了参数-netdev tap,vhost=on 的时候,QEMU会通过调用几个ioctl命令对这个文件描述符进行一些初始化的工作,然后进行特性的协商,从而宿主机跟客户机的vhost-net driver建立关系。 QEMU代码调用如下:

    vhost_net_init -> vhost_dev_init 

    vhost内核模块主要是把virtio后端驱动的数据平面迁移到了内核中,而控制平面还在qemu中,因此就须要一系列的注册把相关信息记录在内核中,如虚拟机内存布局,设备关联的eventfd等。虽然KVM中有虚拟机的内存布局,可是因为vhost并不是在KVM中,而是单独的一个内核模块,因此须要qemu单独处理。且目前vhost只支持网络部分,块设备等其余部分尚不支持。内核中两个文件比较重要:vhost.c和net.c(均位于drivers/vhost/目录)。其中前者实现的是脱离具体功能的vhost核心实现,后者实现网络方面的功能。内核模块加载主要是初始化vhost-net,起始于vhost_net_init(vhost/net.c)函数

    /*
     * File operations of the vhost-net device: open and release manage the
     * instance, the ioctl entries carry the control requests, and llseek is a
     * no-op.
     */
    static const struct file_operations vhost_net_fops = {
        .owner          = THIS_MODULE,
        .open           = vhost_net_open,
        .release        = vhost_net_release,
        .llseek         = noop_llseek,
        .unlocked_ioctl = vhost_net_ioctl,
    #ifdef CONFIG_COMPAT
        .compat_ioctl   = vhost_net_compat_ioctl,
    #endif
    };

    函数表中vhost_net_open和vhost_net_ioctl两个函数须要注意,简单来说,前者负责初始化,后者负责控制,当然是由qemu通过ioctl进行控制。

     ioctl函数实现

     首先说明在2.6.36以后ioctl函数已经不再存在了,而是用unlocked_ioctl和compat_ioctl两个函数实现以前版本的ioctl函数。

    vhost_net_ioctl--->vhost_dev_ioctl
    vhost_net_ioctl--->vhost_vring_ioctl
     
    static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
                    unsigned long arg)
    {
        struct vhost_net *n = f->private_data;
        void __user *argp = (void __user *)arg;
        u64 __user *featurep = argp;
        struct vhost_vring_file backend;
        u64 features;
        int r;
    
        switch (ioctl) {
        case VHOST_NET_SET_BACKEND:
            if (copy_from_user(&backend, argp, sizeof backend))
                return -EFAULT;
            return vhost_net_set_backend(n, backend.index, backend.fd);
        case VHOST_GET_FEATURES:
            features = VHOST_NET_FEATURES;
            if (copy_to_user(featurep, &features, sizeof features))
                return -EFAULT;
            return 0;
        case VHOST_SET_FEATURES:
            if (copy_from_user(&features, featurep, sizeof features))
                return -EFAULT;
            if (features & ~VHOST_NET_FEATURES)
                return -EOPNOTSUPP;
            return vhost_net_set_features(n, features);
        case VHOST_GET_BACKEND_FEATURES:
            features = VHOST_NET_BACKEND_FEATURES;
            if (copy_to_user(featurep, &features, sizeof(features)))
                return -EFAULT;
            return 0;
        case VHOST_SET_BACKEND_FEATURES:
            if (copy_from_user(&features, featurep, sizeof(features)))
                return -EFAULT;
            if (features & ~VHOST_NET_BACKEND_FEATURES)
                return -EOPNOTSUPP;
            vhost_set_backend_features(&n->dev, features);
            return 0;
        case VHOST_RESET_OWNER:
            return vhost_net_reset_owner(n);
        case VHOST_SET_OWNER:
            return vhost_net_set_owner(n);
        default:
            mutex_lock(&n->dev.mutex);
            r = vhost_dev_ioctl(&n->dev, ioctl, argp);
            if (r == -ENOIOCTLCMD)
                r = vhost_vring_ioctl(&n->dev, ioctl, argp);
            else
                vhost_net_flush(n);
            mutex_unlock(&n->dev.mutex);
            return r;
        }
    }
    /*
     * Device-level vhost ioctls: owner, memory table, log base and log fd.
     *
     * VHOST_SET_OWNER is served unconditionally; every other request first has
     * to pass vhost_dev_check_owner(). Returns -ENOIOCTLCMD for requests this
     * function does not know.
     *
     * Caller must have device mutex.
     */
    long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
    {
        struct eventfd_ctx *ctx;
        void __user *log_base;
        u64 addr;
        long ret;
        int idx, logfd;

        /* If you are not the owner, you can become one */
        if (ioctl == VHOST_SET_OWNER)
            return vhost_dev_set_owner(d);

        /* You must be the owner to do anything else */
        ret = vhost_dev_check_owner(d);
        if (ret)
            return ret;

        switch (ioctl) {
        case VHOST_SET_MEM_TABLE:
            return vhost_set_memory(d, argp);

        case VHOST_SET_LOG_BASE:
            if (copy_from_user(&addr, argp, sizeof(addr)))
                return -EFAULT;
            /* The value must survive a round trip through unsigned long. */
            if ((u64)(unsigned long)addr != addr)
                return -EFAULT;
            log_base = (void __user *)(unsigned long)addr;
            for (idx = 0; idx < d->nvqs; ++idx) {
                struct vhost_virtqueue *vq = d->vqs[idx];

                mutex_lock(&vq->mutex);
                /* If ring is inactive, will check when it's enabled. */
                if (vq->private_data && !vq_log_access_ok(vq, log_base))
                    ret = -EFAULT;
                else
                    vq->log_base = log_base;
                mutex_unlock(&vq->mutex);
            }
            return ret;

        case VHOST_SET_LOG_FD:
            ret = get_user(logfd, (int __user *)argp);
            if (ret < 0)
                return ret;
            ctx = logfd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(logfd);
            if (IS_ERR(ctx))
                return PTR_ERR(ctx);
            /* After the swap, ctx holds the previous log context. */
            swap(ctx, d->log_ctx);
            for (idx = 0; idx < d->nvqs; ++idx) {
                mutex_lock(&d->vqs[idx]->mutex);
                d->vqs[idx]->log_ctx = d->log_ctx;
                mutex_unlock(&d->vqs[idx]->mutex);
            }
            if (ctx)
                eventfd_ctx_put(ctx);
            return ret;

        default:
            return -ENOIOCTLCMD;
        }
    }

    https://blog.csdn.net/majieyue/article/details/51262510

  • 相关阅读:
    约瑟夫环问题拓展 C/C++
    C/C++之STL简介
    详解约瑟夫环问题 C/C++
    HC-SR04超声波传感器
    TCRT5000 红外传感器
    win10的docker无法运行mysql的image,Public Key Retrieval is not allowed
    如何将docker默认的安装位置从C盘改为D盘?
    免费PDF阅读器
    A1B2B3
    动态代理
  • 原文地址:https://www.cnblogs.com/dream397/p/13936103.html
Copyright © 2011-2022 走看看