/* Epoll private bits inside the event mask */
#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)
The main things to look at here are the two sources of the thundering herd:
1. the wake_up on the socket's wait queue;
2. the wake_up inside epoll_wait.
Currently, when data becomes ready the kernel calls sk_data_ready to wake up waiting processes, and that path is already set up to wake only one of them.
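For context, a hedged sketch of where that single wakeup comes from: in kernels of roughly the era quoted below (the old wait_queue_t API), the socket's default sk_data_ready callback, sock_def_readable(), ends up in wake_up_interruptible_sync_poll(), a macro that passes nr_exclusive = 1 down to the wakeup path. The exact definition varies by kernel version; this is an approximation of include/linux/wait.h from that era:

    /* Approximate definition; the exact form differs between kernel versions. */
    #define wake_up_interruptible_sync_poll(x, m)                \
        __wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, (void *) (m))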

Here nr_exclusive is 1:

static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
                             int nr_exclusive, int wake_flags, void *key)
{
    wait_queue_t *curr, *next;

    list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
        unsigned flags = curr->flags;

        if (curr->func(curr, mode, wake_flags, key) &&
            (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
            break;
    }
}

The key condition is
    (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive
The nr_exclusive passed in is 1, so the first time a waiter whose flags contain WQ_FLAG_EXCLUSIVE is woken successfully, the decrement reaches zero and the loop breaks: exactly one exclusive waiter is woken.
Case 1: epoll_create() before fork()
All the child processes share a single epfd. So even though the data-ready wakeup passes nr_exclusive = 1 and wakes only one process, which process gets woken?
When a new connection arrives, we only need some process to accept() it, and any process will do. But once the connection is established, the subsequent read/write events become tied to a particular process: if process A accepted the connection, the later reads and writes should also be handled by process A.
When a read/write event fires, which process should be notified? epoll has no way to know, so the event may well be delivered to the wrong process.
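A minimal userspace sketch of this pattern (not from the original text; the port and worker count are arbitrary): the epoll instance is created before fork(), so every worker blocks in epoll_wait() on the same epfd, and which worker is woken for a given event is entirely the kernel's choice.

    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <sys/epoll.h>
    #include <sys/socket.h>
    #include <unistd.h>

    int main(void)
    {
        int lfd = socket(AF_INET, SOCK_STREAM, 0);
        struct sockaddr_in addr = { 0 };

        addr.sin_family = AF_INET;
        addr.sin_port = htons(8080);            /* arbitrary example port */
        addr.sin_addr.s_addr = htonl(INADDR_ANY);
        bind(lfd, (struct sockaddr *)&addr, sizeof(addr));
        listen(lfd, 128);

        /* One shared epoll instance, created BEFORE fork(). */
        int epfd = epoll_create1(0);
        struct epoll_event ev = { 0 };
        ev.events = EPOLLIN;
        ev.data.fd = lfd;
        epoll_ctl(epfd, EPOLL_CTL_ADD, lfd, &ev);

        for (int i = 0; i < 4; i++) {
            if (fork() == 0) {
                struct epoll_event out;
                for (;;) {
                    /* All workers sleep on the same epfd; the kernel
                     * decides which one wakes up. */
                    if (epoll_wait(epfd, &out, 1, -1) > 0)
                        accept(lfd, NULL, NULL);
                }
            }
        }
        pause();
        return 0;
    }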
Case 2: epoll_create() after fork()
Each process registers its read/write events only in its own epoll instance, so there is no contention for those.
But what about accept() on the shared listening socket?
Some kernel versions reportedly still show the thundering herd here, and some do not.
That comes down to the kernel implementation; the difference is just which flags are applied on the wait queue at wakeup time. Of course, using SO_REUSEPORT settles it once and for all.
And if only one process is woken without SO_REUSEPORT, wake_up simply wakes whichever waiter sits at the head of the wait queue, so how would the load ever be balanced across processes?
So SO_REUSEPORT is still the better choice.
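A minimal sketch of the SO_REUSEPORT variant (an assumption layered on the text: Linux 3.9+ and an arbitrary example port): every worker creates its own listening socket bound to the same address with SO_REUSEPORT and its own epoll instance, so the kernel distributes incoming connections across the sockets and there is no shared wait queue left to stampede.

    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <sys/epoll.h>
    #include <sys/socket.h>
    #include <unistd.h>

    static void worker(void)
    {
        int one = 1;
        int lfd = socket(AF_INET, SOCK_STREAM, 0);
        setsockopt(lfd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));

        struct sockaddr_in addr = { 0 };
        addr.sin_family = AF_INET;
        addr.sin_port = htons(8080);            /* same port in every worker */
        addr.sin_addr.s_addr = htonl(INADDR_ANY);
        bind(lfd, (struct sockaddr *)&addr, sizeof(addr));
        listen(lfd, 128);

        int epfd = epoll_create1(0);            /* per-process epoll */
        struct epoll_event ev = { 0 };
        ev.events = EPOLLIN;
        ev.data.fd = lfd;
        epoll_ctl(epfd, EPOLL_CTL_ADD, lfd, &ev);

        for (;;) {
            struct epoll_event out;
            /* Only this worker's socket wakes this worker. */
            if (epoll_wait(epfd, &out, 1, -1) > 0)
                accept(lfd, NULL, NULL);
        }
    }

    int main(void)
    {
        for (int i = 0; i < 4; i++)
            if (fork() == 0) {
                worker();
                _exit(0);
            }
        pause();
        return 0;
    }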

For the full picture, here is the wakeup path and ep_poll_callback, first as in an older kernel and then in a newer one that adds EPOLLEXCLUSIVE handling:

static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
                             int nr_exclusive, int wake_flags, void *key)
{
    wait_queue_t *curr, *next;

    list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
        unsigned flags = curr->flags;

        if (curr->func(curr, mode, wake_flags, key) &&
            (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
            break;
    }
}

void __wake_up(wait_queue_head_t *q, unsigned int mode,
               int nr_exclusive, void *key)
{
    unsigned long flags;

    spin_lock_irqsave(&q->lock, flags);
    __wake_up_common(q, mode, nr_exclusive, 0, key);
    spin_unlock_irqrestore(&q->lock, flags);
}

/*
 * This is the callback that is passed to the wait queue wakeup
 * mechanism. It is called by the stored file descriptors when they
 * have events to report.
 */
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
    int pwake = 0;
    unsigned long flags;
    struct epitem *epi = ep_item_from_wait(wait);
    struct eventpoll *ep = epi->ep;

    spin_lock_irqsave(&ep->lock, flags);

    /*
     * If the event mask does not contain any poll(2) event, we consider the
     * descriptor to be disabled. This condition is likely the effect of the
     * EPOLLONESHOT bit that disables the descriptor when an event is received,
     * until the next EPOLL_CTL_MOD will be issued.
     */
    if (!(epi->event.events & ~EP_PRIVATE_BITS))
        goto out_unlock;

    /*
     * Check the events coming with the callback. At this stage, not
     * every device reports the events in the "key" parameter of the
     * callback. We need to be able to handle both cases here, hence the
     * test for "key" != NULL before the event match test.
     */
    if (key && !((unsigned long) key & epi->event.events))
        goto out_unlock;

    /*
     * If we are transferring events to userspace, we can hold no locks
     * (because we're accessing user memory, and because of linux f_op->poll()
     * semantics). All the events that happen during that period of time are
     * chained in ep->ovflist and requeued later on.
     */
    if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
        if (epi->next == EP_UNACTIVE_PTR) {
            epi->next = ep->ovflist;
            ep->ovflist = epi;
        }
        goto out_unlock;
    }

    /* If this file is already in the ready list we exit soon */
    if (!ep_is_linked(&epi->rdllink))
        list_add_tail(&epi->rdllink, &ep->rdllist);

    /*
     * Wake up ( if active ) both the eventpoll wait list and the ->poll()
     * wait list.
     */
    if (waitqueue_active(&ep->wq))
        wake_up_locked(&ep->wq);
    if (waitqueue_active(&ep->poll_wait))
        pwake++;

out_unlock:
    spin_unlock_irqrestore(&ep->lock, flags);

    /* We have to call this outside the lock */
    if (pwake)
        ep_poll_safewake(&ep->poll_wait);

    return 1;
}

And the newer version, with EPOLLEXCLUSIVE support:

static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
    int pwake = 0;
    unsigned long flags;
    struct epitem *epi = ep_item_from_wait(wait);
    struct eventpoll *ep = epi->ep;
    int ewake = 0;

    if ((unsigned long)key & POLLFREE) {
        ep_pwq_from_wait(wait)->whead = NULL;
        /*
         * whead = NULL above can race with ep_remove_wait_queue()
         * which can do another remove_wait_queue() after us, so we
         * can't use __remove_wait_queue(). whead->lock is held by
         * the caller.
         */
        list_del_init(&wait->task_list);
    }

    spin_lock_irqsave(&ep->lock, flags);

    /*
     * If the event mask does not contain any poll(2) event, we consider the
     * descriptor to be disabled. This condition is likely the effect of the
     * EPOLLONESHOT bit that disables the descriptor when an event is received,
     * until the next EPOLL_CTL_MOD will be issued.
     */
    if (!(epi->event.events & ~EP_PRIVATE_BITS))
        goto out_unlock;

    /*
     * Check the events coming with the callback. At this stage, not
     * every device reports the events in the "key" parameter of the
     * callback. We need to be able to handle both cases here, hence the
     * test for "key" != NULL before the event match test.
     */
    if (key && !((unsigned long) key & epi->event.events))
        goto out_unlock;

    /*
     * If we are transferring events to userspace, we can hold no locks
     * (because we're accessing user memory, and because of linux f_op->poll()
     * semantics). All the events that happen during that period of time are
     * chained in ep->ovflist and requeued later on.
     */
    if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
        if (epi->next == EP_UNACTIVE_PTR) {
            epi->next = ep->ovflist;
            ep->ovflist = epi;
            if (epi->ws) {
                /*
                 * Activate ep->ws since epi->ws may get
                 * deactivated at any time.
                 */
                __pm_stay_awake(ep->ws);
            }
        }
        goto out_unlock;
    }

    /* If this file is already in the ready list we exit soon */
    if (!ep_is_linked(&epi->rdllink)) {
        list_add_tail(&epi->rdllink, &ep->rdllist);
        ep_pm_stay_awake_rcu(epi);
    }

    /*
     * Wake up ( if active ) both the eventpoll wait list and the ->poll()
     * wait list.
     */
    if (waitqueue_active(&ep->wq)) {
        if ((epi->event.events & EPOLLEXCLUSIVE) &&
            !((unsigned long)key & POLLFREE)) {
            switch ((unsigned long)key & EPOLLINOUT_BITS) {
            case POLLIN:
                if (epi->event.events & POLLIN)
                    ewake = 1;
                break;
            case POLLOUT:
                if (epi->event.events & POLLOUT)
                    ewake = 1;
                break;
            case 0:
                ewake = 1;
                break;
            }
        }
        wake_up_locked(&ep->wq);
    }
    if (waitqueue_active(&ep->poll_wait))
        pwake++;

out_unlock:
    spin_unlock_irqrestore(&ep->lock, flags);

    /* We have to call this outside the lock */
    if (pwake)
        ep_poll_safewake(&ep->poll_wait);

    if (epi->event.events & EPOLLEXCLUSIVE)
        return ewake;

    return 1;
}
/*
 * This is the callback that is used to add our wait queue to the
 * target file wakeup lists.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
                                 poll_table *pt)
{
    struct epitem *epi = ep_item_from_epqueue(pt);
    struct eppoll_entry *pwq;

    if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
        init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
        pwq->whead = whead;
        pwq->base = epi;
        if (epi->event.events & EPOLLEXCLUSIVE)
            add_wait_queue_exclusive(whead, &pwq->wait);
        else
            add_wait_queue(whead, &pwq->wait);
        list_add_tail(&pwq->llink, &epi->pwqlist);
        epi->nwait++;
    } else {
        /* We have to signal that an error occurred */
        epi->nwait = -1;
    }
}
Depending on whether EPOLLEXCLUSIVE is set, ep_ptable_queue_proc adds the wait entry with add_wait_queue_exclusive() or with plain add_wait_queue().
At wake_up time the count is controlled by nr_exclusive, but for the loop in __wake_up_common to actually break, ep_poll_callback must also return true (nonzero) for that waiter; that is what the ewake logic in the newer callback implements.
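From userspace, this exclusive path is selected simply by adding the fd with the EPOLLEXCLUSIVE flag. A minimal hedged sketch (assumes Linux 4.5+, where EPOLLEXCLUSIVE was introduced; listen_fd stands for an already listening socket inherited before fork()):

    #include <sys/epoll.h>

    /* Each worker calls this with the shared listening socket; the kernel
     * then wakes (roughly) one waiter per incoming connection instead of
     * all of them. */
    static int add_listener_exclusive(int listen_fd)
    {
        int epfd = epoll_create1(0);
        struct epoll_event ev = { 0 };

        /* EPOLLEXCLUSIVE is only valid with EPOLL_CTL_ADD and may only be
         * combined with EPOLLIN, EPOLLOUT, EPOLLET and EPOLLWAKEUP. */
        ev.events = EPOLLIN | EPOLLEXCLUSIVE;
        ev.data.fd = listen_fd;
        if (epoll_ctl(epfd, EPOLL_CTL_ADD, listen_fd, &ev) < 0)
            return -1;
        return epfd;
    }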
As for EPOLLONESHOT:
When epoll_wait hands the ready fds back to userspace, an entry carrying EPOLLONESHOT has its event mask stripped down to EP_PRIVATE_BITS, so it is not put back on the ready list and cannot trigger the follow-on chain of wakeups:

    if (epi->event.events & EPOLLONESHOT)
        epi->event.events &= EP_PRIVATE_BITS;
    else if (!(epi->event.events & EPOLLET)) {
        /* level-triggered: requeue so the next epoll_wait sees it again */
        list_add_tail(&epi->rdllink, &ep->rdllist);
        ep_pm_stay_awake(epi);
    }

At the same time, ep_poll_callback keeps checking for EPOLLONESHOT (via the EP_PRIVATE_BITS test below). This prevents the following situation: epoll_wait has returned and one thread is still processing the fd's data when the fd becomes readable again; without EPOLLONESHOT, another thread in the pool could be woken to handle the same fd, and the processing would go out of order.
/*
 * If the event mask does not contain any poll(2) event, we consider the
 * descriptor to be disabled. This condition is likely the effect of the
 * EPOLLONESHOT bit that disables the descriptor when an event is received,
 * until the next EPOLL_CTL_MOD will be issued.
 */
if (!(epi->event.events & ~EP_PRIVATE_BITS))
    goto out_unlock;
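To tie this back to userspace, a minimal hedged sketch of the usual multi-threaded EPOLLONESHOT pattern (not from the original text; fd handling and buffer size are illustrative): the fd is disabled after it is reported, so only the thread that received the event touches it, and it is explicitly re-armed with EPOLL_CTL_MOD once that thread is done.

    #include <sys/epoll.h>
    #include <unistd.h>

    /* Register 'fd' on 'epfd' for one-shot input events. */
    static int arm_oneshot(int epfd, int fd)
    {
        struct epoll_event ev = { 0 };

        ev.events = EPOLLIN | EPOLLONESHOT;
        ev.data.fd = fd;
        return epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev);
    }

    /* Event loop run by several threads on the same epfd. */
    static void *worker(void *arg)
    {
        int epfd = *(int *)arg;
        struct epoll_event out;
        char buf[4096];

        for (;;) {
            if (epoll_wait(epfd, &out, 1, -1) <= 0)
                continue;

            int fd = out.data.fd;

            /* The fd's mask was cleared by EPOLLONESHOT, so no other
             * thread can be woken for it while we are still reading. */
            while (read(fd, buf, sizeof(buf)) > 0)
                ;   /* process the data */

            /* Re-arm: EPOLL_CTL_MOD restores the event mask. */
            out.events = EPOLLIN | EPOLLONESHOT;
            epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &out);
        }
        return NULL;
    }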