vhost is an alternative virtio backend scheme that bypasses QEMU on the data path, cutting the context-switch overhead between QEMU and the kernel; the gain is especially visible for network I/O. There are currently two vhost implementations, one in kernel space and one in user space; this article focuses on the kernel-space vhost.

The vhost kernel module only handles the data plane; the control plane stays in QEMU. The main data structures are:

struct vhost_dev {
    MemoryListener memory_listener;      /* MemoryListener is the set of callbacks for guest physical memory operations */
    struct vhost_memory *mem;
    int n_mem_sections;
    MemoryRegionSection *mem_sections;
    struct vhost_virtqueue *vqs;         /* array of vhost_virtqueues and its length */
    int nvqs;
    /* the first virtqueue which would be used by this vhost dev */
    int vq_index;
    unsigned long long features;         /* features supported by the vhost device */
    unsigned long long acked_features;   /* features acked by the guest */
    unsigned long long backend_features; /* features supported by the backend, e.g. a tap device */
    bool started;
    bool log_enabled;
    vhost_log_chunk_t *log;
    unsigned long long log_size;
    Error *migration_blocker;
    bool force;
    bool memory_changed;
    hwaddr mem_changed_start_addr;
    hwaddr mem_changed_end_addr;
    const VhostOps *vhost_ops;           /* VhostOps has a kernel and a user implementation; the kernel one ends up calling ioctl */
    void *opaque;
};

struct vhost_virtqueue {
    int kick;
    int call;
    void *desc;
    void *avail;
    void *used;
    int num;
    unsigned long long used_phys;
    unsigned used_size;
    void *ring;
    unsigned long long ring_phys;
    unsigned ring_size;
    EventNotifier masked_notifier;
    /* The routine to call when the Guest pings us, or timeout. */
    vhost_work_fn_t handle_kick;
};

The vhost memory layout is likewise described by an array of vhost_memory_region entries:

struct vhost_memory_region {
    __u64 guest_phys_addr;
    __u64 memory_size;    /* bytes */
    __u64 userspace_addr;
    __u64 flags_padding;  /* No flags are currently specified. */
};

/* All region addresses and sizes must be 4K aligned. */
#define VHOST_PAGE_SIZE 0x1000

struct vhost_memory {
    __u32 nregions;
    __u32 padding;
    struct vhost_memory_region regions[0];
};
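To make the memory-table layout concrete, here is a minimal userspace sketch (assumed example code, not taken from QEMU; the fd, addresses and sizes are placeholders) that describes a single guest RAM region to the vhost kernel module via VHOST_SET_MEM_TABLE:

#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

/* Describe one guest RAM region to vhost: guest physical address gpa is
 * backed by host virtual address hva in this process (the same mapping
 * QEMU uses for guest RAM).  Returns the ioctl result. */
static int set_one_region(int vhost_fd, uint64_t gpa, uint64_t size, void *hva)
{
    struct vhost_memory *mem;
    int r;

    mem = calloc(1, sizeof(*mem) + sizeof(struct vhost_memory_region));
    if (!mem)
        return -1;

    mem->nregions = 1;
    mem->regions[0].guest_phys_addr = gpa;                       /* GPA of the region */
    mem->regions[0].memory_size     = size;                      /* bytes, 4K aligned */
    mem->regions[0].userspace_addr  = (uint64_t)(uintptr_t)hva;  /* HVA in this process */

    r = ioctl(vhost_fd, VHOST_SET_MEM_TABLE, mem);
    free(mem);
    return r;
}

With this table installed, the vhost worker can translate the guest physical addresses it finds in the vring into host virtual addresses and move packet data without bouncing back to QEMU.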
A few questions worth keeping in mind before reading the code:

1. /dev/vhost-net implements the ioctl interface, so vhost_kernel_call issues ioctls on /dev/vhost-net to deliver ring_kick and ring_call.
2. How do we get the equivalent of kvm_vm_ioctl(..., KVM_IOEVENTFD, ...)?
3. How are vhost_set_vring_kick and vhost_set_vring_call of vhost_dev's VhostOps implemented?
   With kernel vhost, vhost_vring_ioctl handles VHOST_SET_VRING_CALL to install the irqfd that injects interrupts into the guest, and VHOST_SET_VRING_KICK to install the ioeventfd that receives guest notifications (see the sketch after this list).
4. Without vhost, the irqfd of the kvm module is set up through kvm_irqchip_assign_irqfd, which ultimately calls kvm_vm_ioctl.
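The short answer to questions 2 and 3 is that the same eventfd is shared between KVM and vhost. A hedged sketch, not QEMU source, of how the kick and call paths could be wired; vm_fd, vhost_fd, the doorbell address and the GSI are assumed to exist:

#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>
#include <linux/vhost.h>

/* Kick path: guest doorbell write -> KVM signals `kick` -> the vhost worker,
 * which polls the same fd, runs handle_kick.
 * Call path: vhost signals `call` -> the KVM irqfd injects a guest interrupt. */
static int wire_vring(int vm_fd, int vhost_fd, unsigned int vq_index,
                      uint64_t doorbell_gpa, uint32_t gsi)
{
    int kick = eventfd(0, 0);
    int call = eventfd(0, 0);

    if (kick < 0 || call < 0)
        return -1;

    /* Ask KVM to complete the doorbell write in-kernel by signalling kick
     * (simplified: MMIO doorbell, no datamatch). */
    struct kvm_ioeventfd ioev = { .addr = doorbell_gpa, .len = 2, .fd = kick };
    if (ioctl(vm_fd, KVM_IOEVENTFD, &ioev) < 0)
        return -1;

    /* Hand the same fd to vhost as this vring's kick eventfd. */
    struct vhost_vring_file kickf = { .index = vq_index, .fd = kick };
    if (ioctl(vhost_fd, VHOST_SET_VRING_KICK, &kickf) < 0)
        return -1;

    /* vhost signals this fd when it has used buffers ... */
    struct vhost_vring_file callf = { .index = vq_index, .fd = call };
    if (ioctl(vhost_fd, VHOST_SET_VRING_CALL, &callf) < 0)
        return -1;

    /* ... and KVM turns that signal into an interrupt on the given GSI. */
    struct kvm_irqfd irqfd = { .fd = (uint32_t)call, .gsi = gsi };
    return ioctl(vm_fd, KVM_IRQFD, &irqfd);
}

Once both fds are installed, notifications in either direction no longer pass through QEMU user space.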
handle_kick
/* Init poll structure */
void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
                     __poll_t mask, struct vhost_dev *dev)
{
    init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
    init_poll_funcptr(&poll->table, vhost_poll_func);
    poll->mask = mask;
    poll->dev = dev;
    poll->wqh = NULL;

    vhost_work_init(&poll->work, fn);
}
void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn)
{
    clear_bit(VHOST_WORK_QUEUED, &work->flags);
    work->fn = fn;
}
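For context, the wakeup callback registered above looks roughly like the following (a simplified sketch of vhost_poll_wakeup in drivers/vhost/vhost.c, not verbatim): when the kick eventfd is signalled, the poll callback fires and queues the work, so the handle_kick function later runs on the vhost worker kthread.

/* Simplified sketch of drivers/vhost/vhost.c:vhost_poll_wakeup(): the eventfd
 * wakeup lands here, and the registered work (e.g. handle_tx_kick) is queued
 * onto the per-device vhost worker thread. */
static int vhost_poll_wakeup(wait_queue_entry_t *wait, unsigned mode,
                             int sync, void *key)
{
    struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait);

    if (!(key_to_poll(key) & poll->mask))
        return 0;

    vhost_poll_queue(poll);  /* hand poll->work to the worker kthread */
    return 0;
}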
static int vhost_net_open(struct inode *inode, struct file *f)
{
    struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
    struct vhost_dev *dev;
    int r;

    if (!n)
        return -ENOMEM;
    dev = &n->dev;
    n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick; /* kick callback of the TX virtqueue */
    n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick; /* kick callback of the RX virtqueue */
    r = vhost_dev_init(dev, n->vqs, VHOST_NET_VQ_MAX);

    vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev); /* init vhost_net's TX vhost_poll */
    vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);  /* init vhost_net's RX vhost_poll */

    return 0;
}
vhost_dev_init (excerpted) then hooks each virtqueue's handle_kick into a vhost_poll:

void vhost_dev_init(struct vhost_dev *dev,
                    struct vhost_virtqueue **vqs, int nvqs,
                    int iov_limit, int weight, int byte_weight,
                    bool use_worker,
                    int (*msg_handler)(struct vhost_dev *dev,
                                       struct vhost_iotlb_msg *msg))
{
    struct vhost_virtqueue *vq;
    int i;

    for (i = 0; i < dev->nvqs; ++i) {
        vq = dev->vqs[i];
        vq->log = NULL;
        vq->indirect = NULL;
        vq->heads = NULL;
        vq->dev = dev;
        mutex_init(&vq->mutex);
        vhost_vq_reset(dev, vq);
        if (vq->handle_kick)
            vhost_poll_init(&vq->poll, vq->handle_kick, EPOLLIN, dev);
    }
}
VhostOps
vhost-backend.c kernel_ops
vhost-user.c user_ops
As the names suggest, one is for the kernel backend and the other for the userspace backend (vhost-user, e.g. OVS+DPDK).
static const VhostOps kernel_ops = {
    .backend_type = VHOST_BACKEND_TYPE_KERNEL,
    ....
    .vhost_set_mem_table = vhost_kernel_set_mem_table,
    .vhost_set_vring_addr = vhost_kernel_set_vring_addr,
    ....
};

const VhostOps user_ops = {
    .backend_type = VHOST_BACKEND_TYPE_USER,
    ...
    .vhost_set_mem_table = vhost_user_set_mem_table,
    .vhost_set_vring_addr = vhost_user_set_vring_addr,
    ...
};
vhost_set_mem_table and vhost_set_vring_addr are particularly important: they are what establishes the shared memory. The whole basis of vhost-user is that the vhost-user process and the QEMU process share guest memory.
virtio_net_vhost_status --> vhost_net_start --> vhost_net_start_one --> vhost_dev_start --> vhost_set_mem_table
hw/virtio/vhost-backend.c:294:static const VhostOps kernel_ops = {
hw/virtio/vhost-user.c:2357:const VhostOps user_ops = {
include/hw/virtio/vhost-backend.h:175:extern const VhostOps user_ops;
QEMU side: eventfd setup
ioctl command definitions
/* Set eventfd to poll for added buffers */
#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file)
/* Set eventfd to signal when buffers have been used */
#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file)
/* Set eventfd to signal an error */
#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file)
/* Set busy loop timeout (in us) */
#define VHOST_SET_VRING_BUSYLOOP_TIMEOUT _IOW(VHOST_VIRTIO, 0x23, struct vhost_vring_state)
/* Get busy loop timeout (in us) */
#define VHOST_GET_VRING_BUSYLOOP_TIMEOUT _IOW(VHOST_VIRTIO, 0x24, struct vhost_vring_state)
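Before any of the per-vring ioctls above can be issued, the caller has to claim the device and negotiate features. A minimal userspace sketch of that handshake (assumed example code, not lifted from QEMU):

#include <fcntl.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

/* Open /dev/vhost-net, become its owner, and agree on a feature set. */
static int vhost_net_handshake(uint64_t wanted)
{
    int fd = open("/dev/vhost-net", O_RDWR);
    uint64_t features;

    if (fd < 0)
        return -1;
    ioctl(fd, VHOST_SET_OWNER, NULL);         /* bind this process as owner */
    ioctl(fd, VHOST_GET_FEATURES, &features); /* what the kernel offers */
    features &= wanted;                       /* negotiate downwards */
    ioctl(fd, VHOST_SET_FEATURES, &features); /* ack the common subset */
    return fd;
}

QEMU drives the equivalent steps through the VhostOps callbacks during vhost_dev_init() and vhost_dev_start().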
vhost_net is enabled by adding vhost=on to the -netdev tap,... command line option. Its initialization flow is as follows:

- As described in "Qemu之Network Device全虚拟方案一:前端网络流的建立", creating the tap device ends up calling net_init_tap();
- net_init_tap() checks whether vhost=on was specified; if so, it calls vhost_net_init() to do the initialization;
- vhost_net_init() opens the vhost driver with open("/dev/vhost-net", O_RDWR) and performs a series of initialization steps via ioctl(vhost_fd);
- ioctl VHOST_SET_VRING_KICK sets the kick fd (guest -> vhost), i.e. VirtQueue.host_notifier.fd;
- ioctl VHOST_SET_VRING_CALL sets the call fd (vhost -> guest), i.e. VirtQueue.guest_notifier.fd;
- finally, ioctl VHOST_NET_SET_BACKEND attaches the tap fd as the backend of each virtqueue (see the sketch below).
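That last step is a single ioctl per queue. A hedged sketch, with vhost_fd and tap_fd assumed to come from the earlier steps:

#include <sys/ioctl.h>
#include <linux/vhost.h>

/* Attach the tap device as the backend of one vhost-net virtqueue
 * (in vhost-net, index 0 is RX and index 1 is TX). */
static int attach_tap(int vhost_fd, int tap_fd, unsigned int vq_index)
{
    struct vhost_vring_file backend = {
        .index = vq_index, /* which virtqueue */
        .fd    = tap_fd,   /* fd obtained from /dev/net/tun */
    };

    return ioctl(vhost_fd, VHOST_NET_SET_BACKEND, &backend);
}

In the kernel this lands in vhost_net_set_backend(), which starts polling the socket behind the tap fd.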
vhost_vring_ioctl ---> eventfd_ctx_fdget
long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
{
    struct file *eventfp, *filep = NULL;
    bool pollstart = false, pollstop = false;
    struct eventfd_ctx *ctx = NULL;
    u32 __user *idxp = argp;
    struct vhost_virtqueue *vq;
    struct vhost_vring_state s;
    struct vhost_vring_file f;
    u32 idx;
    long r;

    r = get_user(idx, idxp);
    if (r < 0)
        return r;
    if (idx >= d->nvqs)
        return -ENOBUFS;

    idx = array_index_nospec(idx, d->nvqs);
    vq = d->vqs[idx];

    if (ioctl == VHOST_SET_VRING_NUM ||
        ioctl == VHOST_SET_VRING_ADDR) {
        return vhost_vring_set_num_addr(d, vq, ioctl, argp);
    }

    mutex_lock(&vq->mutex);

    switch (ioctl) {
    case VHOST_SET_VRING_BASE:
        /* Moving base with an active backend?
         * You don't want to do that. */
        if (vq->private_data) {
            r = -EBUSY;
            break;
        }
        if (copy_from_user(&s, argp, sizeof s)) {
            r = -EFAULT;
            break;
        }
        if (s.num > 0xffff) {
            r = -EINVAL;
            break;
        }
        vq->last_avail_idx = s.num;
        /* Forget the cached index value. */
        vq->avail_idx = vq->last_avail_idx;
        break;
    case VHOST_GET_VRING_BASE:
        s.index = idx;
        s.num = vq->last_avail_idx;
        if (copy_to_user(argp, &s, sizeof s))
            r = -EFAULT;
        break;
    case VHOST_SET_VRING_KICK:
        if (copy_from_user(&f, argp, sizeof f)) {
            r = -EFAULT;
            break;
        }
        eventfp = f.fd == VHOST_FILE_UNBIND ? NULL : eventfd_fget(f.fd);
        if (IS_ERR(eventfp)) {
            r = PTR_ERR(eventfp);
            break;
        }
        if (eventfp != vq->kick) {
            pollstop = (filep = vq->kick) != NULL;
            pollstart = (vq->kick = eventfp) != NULL;
        } else
            filep = eventfp;
        break;
    case VHOST_SET_VRING_CALL:
        if (copy_from_user(&f, argp, sizeof f)) {
            r = -EFAULT;
            break;
        }
        ctx = f.fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(f.fd);
        if (IS_ERR(ctx)) {
            r = PTR_ERR(ctx);
            break;
        }
        swap(ctx, vq->call_ctx.ctx);
        break;
    case VHOST_SET_VRING_ERR:
        if (copy_from_user(&f, argp, sizeof f)) {
            r = -EFAULT;
            break;
        }
        ctx = f.fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(f.fd);
        if (IS_ERR(ctx)) {
            r = PTR_ERR(ctx);
            break;
        }
        swap(ctx, vq->error_ctx);
        break;
    case VHOST_SET_VRING_ENDIAN:
        r = vhost_set_vring_endian(vq, argp);
        break;
    case VHOST_GET_VRING_ENDIAN:
        r = vhost_get_vring_endian(vq, idx, argp);
        break;
    case VHOST_SET_VRING_BUSYLOOP_TIMEOUT:
        if (copy_from_user(&s, argp, sizeof(s))) {
            r = -EFAULT;
            break;
        }
        vq->busyloop_timeout = s.num;
        break;
    case VHOST_GET_VRING_BUSYLOOP_TIMEOUT:
        s.index = idx;
        s.num = vq->busyloop_timeout;
        if (copy_to_user(argp, &s, sizeof(s)))
            r = -EFAULT;
        break;
    default:
        r = -ENOIOCTLCMD;
    }

    if (pollstop && vq->handle_kick)
        vhost_poll_stop(&vq->poll);

    if (!IS_ERR_OR_NULL(ctx))
        eventfd_ctx_put(ctx);
    if (filep)
        fput(filep);

    if (pollstart && vq->handle_kick)
        r = vhost_poll_start(&vq->poll, vq->kick);

    mutex_unlock(&vq->mutex);

    if (pollstop && vq->handle_kick)
        vhost_poll_flush(&vq->poll);
    return r;
}
EXPORT_SYMBOL_GPL(vhost_vring_ioctl);
QEMU implementation
hw/virtio/vhost-backend.c:23:static int vhost_kernel_call(struct vhost_dev *dev, unsigned long int request
vhost_kernel_call issues the ioctl against /dev/vhost-net:
static int vhost_kernel_call(struct vhost_dev *dev, unsigned long int request,
                             void *arg)
{
    int fd = (uintptr_t) dev->opaque;

    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_KERNEL);

    return ioctl(fd, request, arg);
}
static int vhost_kernel_set_vring_kick(struct vhost_dev *dev,
                                       struct vhost_vring_file *file)
{
    return vhost_kernel_call(dev, VHOST_SET_VRING_KICK, file);
}

static int vhost_kernel_set_vring_call(struct vhost_dev *dev,
                                       struct vhost_vring_file *file)
{
    return vhost_kernel_call(dev, VHOST_SET_VRING_CALL, file);
}
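Where does the fd in vhost_vring_file come from? For the kick side it is the VirtQueue's host-notifier eventfd, the same eventfd KVM's ioeventfd signals on a doorbell write. A hedged sketch paraphrasing what vhost_virtqueue_start() in hw/virtio/vhost.c does (names and error handling simplified, not verbatim QEMU code; QEMU headers assumed):

/* Paraphrased sketch, not the literal QEMU function: pass the VirtQueue's
 * host notifier fd to the kernel as the kick eventfd of vring `idx`. */
static int sketch_set_kick(struct vhost_dev *dev, VirtQueue *vvq, int idx)
{
    struct vhost_vring_file file = {
        .index = dev->vhost_ops->vhost_get_vq_index(dev, idx),
    };

    file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
    return dev->vhost_ops->vhost_set_vring_kick(dev, &file);
}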
vhost_set_vring_call
static int vhost_virtqueue_init(struct vhost_dev *dev,
                                struct vhost_virtqueue *vq, int n)
{
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
    struct vhost_vring_file file = {
        .index = vhost_vq_index,
    };
    int r = event_notifier_init(&vq->masked_notifier, 0);
    if (r < 0) {
        return r;
    }

    file.fd = event_notifier_get_fd(&vq->masked_notifier);
    r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
    if (r) {
        VHOST_OPS_DEBUG("vhost_set_vring_call failed");
        r = -errno;
        goto fail_call;
    }

    vq->dev = dev;

    return 0;

fail_call:
    event_notifier_cleanup(&vq->masked_notifier);
    return r;
}
vhost-net: the /dev/vhost-net kernel module
[root@localhost qemu]# ls /dev/vhost-net
/dev/vhost-net
[root@localhost qemu]#
[root@localhost dpdk-19.11]# lsof /dev/vhost-net
COMMAND     PID USER   FD   TYPE DEVICE SIZE/OFF  NODE NAME
qemu-syst 49786 root   19u   CHR 10,238      0t0 83987 /dev/vhost-net
[root@localhost dpdk-19.11]# ps -elf | grep 49786
7 S root 49786     1 53  80   0 - 74701 poll_s 02:26 ?        00:00:09 qemu-system-aarch64 -name vm2 -daemonize -enable-kvm -M virt -cpu host -smp 2 -m 4096 -object memory-backend-file,id=mem,size=4096M,mem-path=/mnt/huge,share=on -numa node,memdev=mem -mem-prealloc -global virtio-blk-device.scsi=off -device virtio-scsi-device,id=scsi -kernel vmlinuz-4.18 --append console=ttyAMA0 root=UUID=6a09973e-e8fd-4a6d-a8c0-1deb9556f477 iommu=pt intel_iommu=on iommu.passthrough=1 -initrd initramfs-4.18 -drive file=vhuser-test1.qcow2 -serial telnet:localhost:4322,server,nowait -monitor telnet:localhost:4321,server,nowait -net nic,model=virtio,macaddr=00:16:3e:22:22:22 -net tap,id=hostnet1,script=qemu-ifup,vnet_hdr=on,vhost=on -vnc :10
1 S root 49806     2  0  80   0 -     0 vhost_ 02:26 ?        00:00:00 [vhost-49786]
0 S root 49846 49322  0  80   0 -  1729 pipe_w 02:27 pts/4    00:00:00 grep --color=auto 49786
[root@localhost dpdk-19.11]#
Note the [vhost-49786] kernel thread above: it is the vhost worker kthread created for this QEMU process when it becomes owner of the device. A DPDK vhost-user setup, by contrast, does not use /dev/vhost-net at all:
[root@localhost ~]# lsof /dev/vhost-net
[root@localhost ~]# ps -elf | grep qemu
3 S root 49916     1 24  80   0 - 94022 poll_s 02:29 ?        00:00:19 qemu-system-aarch64 -name vm2 -daemonize -enable-kvm -M virt -cpu host -smp 16 -m 4096 -object memory-backend-file,id=mem,size=4096M,mem-path=/mnt/huge,share=on -numa node,memdev=mem -mem-prealloc -drive file=vhuser-test1.qcow2 -global virtio-blk-device.scsi=off -device virtio-scsi-device,id=scsi -kernel vmlinuz-4.18 --append console=ttyAMA0 root=UUID=6a09973e-e8fd-4a6d-a8c0-1deb9556f477 iommu=pt intel_iommu=on iommu.passthrough=1 -initrd initramfs-4.18 -serial telnet:localhost:4322,server,nowait -monitor telnet:localhost:4321,server,nowait -chardev socket,id=char0,path=/tmp/vhost1,server -netdev type=vhost-user,id=netdev0,chardev=char0,vhostforce -device virtio-net-pci,netdev=netdev0,mac=52:54:00:00:00:01,mrg_rxbuf=on,rx_queue_size=256,tx_queue_size=256 -vnc :10
0 S root 49991 49249  0  80   0 -  1729 pipe_w 02:30 pts/3    00:00:00 grep --color=auto qemu
[root@localhost ~]#
When a QEMU process is started with -netdev tap,...,vhost=on, QEMU initializes the /dev/vhost-net file descriptor with a series of ioctl calls and then negotiates features, which is how the host's vhost-net driver and the guest are tied together. The QEMU call chain is:
vhost_net_init -> vhost_dev_init
The vhost kernel module essentially moves the data plane of the virtio backend into the kernel, while the control plane stays in QEMU. A series of registrations is therefore needed to record the relevant state in the kernel, such as the guest memory layout and the eventfds associated with the device. KVM does hold the guest memory layout, but vhost is not part of KVM; it is a separate kernel module, so QEMU has to pass this information to it explicitly. At present vhost only supports the network part; block devices and other device types are not yet handled this way. Two kernel files matter most: vhost.c and vhost-net.c. The former implements the device-independent vhost core, the latter the networking functionality. Loading the module boils down to initializing vhost-net, starting at the vhost_net_init() function (vhost/net.c).
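For reference, a simplified sketch of what that module init amounts to (trimmed, not the verbatim kernel source): registering a misc character device, which is what makes /dev/vhost-net appear.

#include <linux/module.h>
#include <linux/miscdevice.h>

/* Simplified sketch of the drivers/vhost/net.c module setup: expose
 * vhost_net_fops (shown below) behind the /dev/vhost-net misc device. */
static struct miscdevice vhost_net_misc = {
    .minor = VHOST_NET_MINOR,
    .name  = "vhost-net",
    .fops  = &vhost_net_fops,
};

static int __init vhost_net_init(void)
{
    return misc_register(&vhost_net_misc);
}
module_init(vhost_net_init);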
static const struct file_operations vhost_net_fops = {
    .owner          = THIS_MODULE,
    .release        = vhost_net_release,
    .unlocked_ioctl = vhost_net_ioctl,
#ifdef CONFIG_COMPAT
    .compat_ioctl   = vhost_net_compat_ioctl,
#endif
    .open           = vhost_net_open,
    .llseek         = noop_llseek,
};
In this function table, vhost_net_open and vhost_net_ioctl are the two to pay attention to: the former does the initialization and the latter the control, and it is of course QEMU that drives the control path through ioctl.
ioctl function implementation
Note that since kernel 2.6.36 the plain .ioctl entry in file_operations no longer exists; unlocked_ioctl and compat_ioctl together provide what the old ioctl entry used to.
vhost_net_ioctl--->vhost_dev_ioctl
vhost_net_ioctl--->vhost_vring_ioctl
static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
                            unsigned long arg)
{
    struct vhost_net *n = f->private_data;
    void __user *argp = (void __user *)arg;
    u64 __user *featurep = argp;
    struct vhost_vring_file backend;
    u64 features;
    int r;

    switch (ioctl) {
    case VHOST_NET_SET_BACKEND:
        if (copy_from_user(&backend, argp, sizeof backend))
            return -EFAULT;
        return vhost_net_set_backend(n, backend.index, backend.fd);
    case VHOST_GET_FEATURES:
        features = VHOST_NET_FEATURES;
        if (copy_to_user(featurep, &features, sizeof features))
            return -EFAULT;
        return 0;
    case VHOST_SET_FEATURES:
        if (copy_from_user(&features, featurep, sizeof features))
            return -EFAULT;
        if (features & ~VHOST_NET_FEATURES)
            return -EOPNOTSUPP;
        return vhost_net_set_features(n, features);
    case VHOST_GET_BACKEND_FEATURES:
        features = VHOST_NET_BACKEND_FEATURES;
        if (copy_to_user(featurep, &features, sizeof(features)))
            return -EFAULT;
        return 0;
    case VHOST_SET_BACKEND_FEATURES:
        if (copy_from_user(&features, featurep, sizeof(features)))
            return -EFAULT;
        if (features & ~VHOST_NET_BACKEND_FEATURES)
            return -EOPNOTSUPP;
        vhost_set_backend_features(&n->dev, features);
        return 0;
    case VHOST_RESET_OWNER:
        return vhost_net_reset_owner(n);
    case VHOST_SET_OWNER:
        return vhost_net_set_owner(n);
    default:
        mutex_lock(&n->dev.mutex);
        r = vhost_dev_ioctl(&n->dev, ioctl, argp);
        if (r == -ENOIOCTLCMD)
            r = vhost_vring_ioctl(&n->dev, ioctl, argp);
        else
            vhost_net_flush(n);
        mutex_unlock(&n->dev.mutex);
        return r;
    }
}
/* Caller must have device mutex */
long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
{
    struct eventfd_ctx *ctx;
    u64 p;
    long r;
    int i, fd;

    /* If you are not the owner, you can become one */
    if (ioctl == VHOST_SET_OWNER) {
        r = vhost_dev_set_owner(d);
        goto done;
    }

    /* You must be the owner to do anything else */
    r = vhost_dev_check_owner(d);
    if (r)
        goto done;

    switch (ioctl) {
    case VHOST_SET_MEM_TABLE:
        r = vhost_set_memory(d, argp);
        break;
    case VHOST_SET_LOG_BASE:
        if (copy_from_user(&p, argp, sizeof p)) {
            r = -EFAULT;
            break;
        }
        if ((u64)(unsigned long)p != p) {
            r = -EFAULT;
            break;
        }
        for (i = 0; i < d->nvqs; ++i) {
            struct vhost_virtqueue *vq;
            void __user *base = (void __user *)(unsigned long)p;
            vq = d->vqs[i];
            mutex_lock(&vq->mutex);
            /* If ring is inactive, will check when it's enabled. */
            if (vq->private_data && !vq_log_access_ok(vq, base))
                r = -EFAULT;
            else
                vq->log_base = base;
            mutex_unlock(&vq->mutex);
        }
        break;
    case VHOST_SET_LOG_FD:
        r = get_user(fd, (int __user *)argp);
        if (r < 0)
            break;
        ctx = fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(fd);
        if (IS_ERR(ctx)) {
            r = PTR_ERR(ctx);
            break;
        }
        swap(ctx, d->log_ctx);
        for (i = 0; i < d->nvqs; ++i) {
            mutex_lock(&d->vqs[i]->mutex);
            d->vqs[i]->log_ctx = d->log_ctx;
            mutex_unlock(&d->vqs[i]->mutex);
        }
        if (ctx)
            eventfd_ctx_put(ctx);
        break;
    default:
        r = -ENOIOCTLCMD;
        break;
    }
done:
    return r;
}
https://blog.csdn.net/majieyue/article/details/51262510