zoukankan      html  css  js  c++  java
  • dpdk tx_pkt_burst rte_pktmbuf_free mbuf释放

    [root@localhost ixgbe]# grep tx_pkt_burst -rn *
    ixgbe_ethdev.c:1102:    eth_dev->tx_pkt_burst = &ixgbe_xmit_pkts;
    ixgbe_ethdev.c:1582:    eth_dev->tx_pkt_burst = &ixgbe_xmit_pkts;
    ixgbe_ethdev.c:2989:    dev->tx_pkt_burst = NULL;
    ixgbe_ethdev.c:5360:    dev->tx_pkt_burst = NULL;
    ixgbe_rxtx.c:2400:                      dev->tx_pkt_burst = ixgbe_xmit_pkts_vec;
    ixgbe_rxtx.c:2403:              dev->tx_pkt_burst = ixgbe_xmit_pkts_simple;
    ixgbe_rxtx.c:2413:              dev->tx_pkt_burst = ixgbe_xmit_pkts;
    ixgbe_vf_representor.c:206:     ethdev->tx_pkt_burst = ixgbe_vf_representor_tx_burst
    (gdb) bt
    #0  hinic_xmit_pkts (tx_queue=0x13e7e7000, tx_pkts=0xffffbd40ce00, nb_pkts=1)
        at /data1/dpdk-19.11/drivers/net/hinic/hinic_pmd_tx.c:1066
    #1  0x0000000000465b18 in rte_eth_tx_burst (port_id=0, queue_id=0, tx_pkts=0xffffbd40ce00, nb_pkts=1)
        at /data1/dpdk-19.11/arm64-armv8a-linuxapp-gcc/include/rte_ethdev.h:4666
    #2  0x00000000004666bc in reply_to_icmp_echo_rqsts () at /data1/dpdk-19.11/demo/dpdk-pingpong/main.c:695
    #3  0x00000000004667e8 in server_loop () at /data1/dpdk-19.11/demo/dpdk-pingpong/main.c:735
    #4  0x0000000000466820 in pong_launch_one_lcore (dummy=0x0) at /data1/dpdk-19.11/demo/dpdk-pingpong/main.c:742
    #5  0x0000000000593538 in eal_thread_loop (arg=0x0)
        at /data1/dpdk-19.11/lib/librte_eal/linux/eal/eal_thread.c:153
    #6  0x0000ffffbe617d38 in start_thread (arg=0xffffbd40d910) at pthread_create.c:309
    #7  0x0000ffffbe55f5f0 in thread_start () at ../sysdeps/unix/sysv/linux/aarch64/clone.S:91
    (gdb) s
    1071            struct hinic_txq *txq = tx_queue;
    (gdb) n
    1077            if (HINIC_GET_SQ_FREE_WQEBBS(txq) < txq->tx_free_thresh)
    (gdb) n
    1081            for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
    (gdb) n
    1082                    mbuf_pkt = *tx_pkts++;
    (gdb) n
    1083                    queue_info = 0;
    (gdb) n
    1086                    if (unlikely(!hinic_get_sge_txoff_info(mbuf_pkt,
    (gdb) n
    1093                    wqe_wqebb_cnt = HINIC_SQ_WQEBB_CNT(sqe_info.sge_cnt);
    (gdb) n
    1094                    free_wqebb_cnt = HINIC_GET_SQ_FREE_WQEBBS(txq);
    (gdb) n
    1095                    if (unlikely(wqe_wqebb_cnt > free_wqebb_cnt)) {
    (gdb) n
    1108                    sq_wqe = hinic_get_sq_wqe(txq, wqe_wqebb_cnt, &sqe_info);
    (gdb) n
    1111                    if (unlikely(!hinic_mbuf_dma_map_sge(txq, mbuf_pkt,
    (gdb) n
    1121                    task = &sq_wqe->task;
    (gdb) n
    1124                    hinic_fill_tx_offload_info(mbuf_pkt, task, &queue_info,
    (gdb) n
    1128                    tx_info = &txq->tx_info[sqe_info.pi];
    (gdb) n
    1129                    tx_info->mbuf = mbuf_pkt;
    (gdb) n
    1130                    tx_info->wqebb_cnt = wqe_wqebb_cnt;
    (gdb) n
    1133                    hinic_fill_sq_wqe_header(&sq_wqe->ctrl, queue_info,
    (gdb) n
    1134                                             sqe_info.sge_cnt, sqe_info.owner);
    (gdb) n
    1133                    hinic_fill_sq_wqe_header(&sq_wqe->ctrl, queue_info,
    (gdb) n
    1137                    hinic_sq_wqe_cpu_to_be32(sq_wqe, sqe_info.seq_wqebbs);
    (gdb) n
    1139                    tx_bytes += mbuf_pkt->pkt_len;
    (gdb) n
    1081            for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
    (gdb) n
    1143            if (nb_tx) {
    (gdb) n
    1144                    hinic_sq_write_db(txq->sq, txq->cos);
    (gdb) n
    1146                    txq->txq_stats.packets += nb_tx;
    (gdb) n
    1147                    txq->txq_stats.bytes += tx_bytes;
    (gdb) n
    1149            txq->txq_stats.burst_pkts = nb_tx;
    (gdb) n
    1151            return nb_tx;
    (gdb) n
    1152    }
    (gdb) n
    #!/bin/bash
    # Launch 254 background ping sessions, all aimed at the same target
    # (10.10.103.229) — used here to generate sustained ICMP traffic
    # toward the DPDK pingpong server so the TX path keeps running.
    # NOTE(review): $i is never used; if host scanning was intended the
    # line should read ip=10.10.103.$i — confirm intent before changing.
    for i in {1..254};do
            ip=10.10.103.229
                    ping -c 1000 $ip &> /dev/null && echo $ip is up &
    done
    # Block until every background ping finishes (each sends 1000 echoes).
    wait

    ping 10.10.103.229 持续10多分钟,才执行到断点 Breakpoint 1, hinic_xmit_mbuf_cleanup (txq=0x13e7e7000) at /data1/dpdk-19.11/drivers/net/hinic/hinic_pmd_tx.c:584

    hinic_xmit_mbuf_cleanup 会执行free buf

    参考 http://chinaunix.net/uid-28541347-id-5791061.html

    net_hinic: Disable promiscuous, nic_dev: hinic-0000:05:00.0, port_id: 0, promisc: 0
    net_hinic: Disable allmulticast succeed, nic_dev: hinic-0000:05:00.0, port_id: 0
    Initilize port 0 done.
     tip=10.10.103.229  ARP:  hrd=1 proto=0x0800 hln=6 pln=4 op=1 (ARP Request)
     tip=10.10.103.229  ARP:  hrd=1 proto=0x0800 hln=6 pln=4 op=1 (ARP Request)
     tip=10.10.103.229  ARP:  hrd=1 proto=0x0800 hln=6 pln=4 op=1 (ARP Request)
     tip=10.10.103.229  ARP:  hrd=1 proto=0x0800 hln=6 pln=4 op=1 (ARP Request)
     tip=10.10.103.229  ARP:  hrd=1 proto=0x0800 hln=6 pln=4 op=1 (ARP Request)
     tip=10.10.103.229  ARP:  hrd=1 proto=0x0800 hln=6 pln=4 op=1 (ARP Request)
     tip=10.10.103.229  ARP:  hrd=1 proto=0x0800 hln=6 pln=4 op=1 (ARP Request)
     tip=10.10.103.229  ARP:  hrd=1 proto=0x0800 hln=6 pln=4 op=1 (ARP Request)
     tip=10.10.103.229  ARP:  hrd=1 proto=0x0800 hln=6 pln=4 op=1 (ARP Request)
     tip=10.10.103.229  ARP:  hrd=1 proto=0x0800 hln=6 pln=4 op=1 (ARP Request)
     tip=10.10.103.229  ARP:  hrd=1 proto=0x0800 hln=6 pln=4 op=1 (ARP Request)
     tip=10.10.103.229  ARP:  hrd=1 proto=0x0800 hln=6 pln=4 op=1 (ARP Request)
     tip=10.10.103.229  ARP:  hrd=1 proto=0x0800 hln=6 pln=4 op=1 (ARP Request)
     tip=10.10.103.229  ARP:  hrd=1 proto=0x0800 hln=6 pln=4 op=1 (ARP Request)
     tip=10.10.103.229  ARP:  hrd=1 proto=0x0800 hln=6 pln=4 op=1 (ARP Request)
     tip=10.10.103.229  ARP:  hrd=1 proto=0x0800 hln=6 pln=4 op=1 (ARP Request)
     tip=10.10.103.229  ARP:  hrd=1 proto=0x0800 hln=6 pln=4 op=1 (ARP Request)
     tip=10.10.103.229  ARP:  hrd=1 proto=0x0800 hln=6 pln=4 op=1 (ARP Request)
     tip=10.10.103.229  ARP:  hrd=1 proto=0x0800 hln=6 pln=4 op=1 (ARP Request)
     tip=10.10.103.229  ARP:  hrd=1 proto=0x0800 hln=6 pln=4 op=1 (ARP Request)
     tip=10.10.103.229  ARP:  hrd=1 proto=0x0800 hln=6 pln=4 op=1 (ARP Request)
     tip=10.10.103.229  ARP:  hrd=1 proto=0x0800 hln=6 pln=4 op=1 (ARP Request)
     tip=10.10.103.229  ARP:  hrd=1 proto=0x0800 hln=6 pln=4 op=1 (ARP Request)
     tip=10.10.103.229  ARP:  hrd=1 proto=0x0800 hln=6 pln=4 op=1 (ARP Request)
     tip=10.10.103.229  ARP:  hrd=1 proto=0x0800 hln=6 pln=4 op=1 (ARP Request)
     tip=10.10.103.229  ARP:  hrd=1 proto=0x0800 hln=6 pln=4 op=1 (ARP Request)
     tip=10.10.103.229  ARP:  hrd=1 proto=0x0800 hln=6 pln=4 op=1 (ARP Request)
     tip=10.10.103.229  ARP:  hrd=1 proto=0x0800 hln=6 pln=4 op=1 (ARP Request)
    [Switching to Thread 0xffffbd40d910 (LWP 44270)]
    
    Breakpoint 1, hinic_xmit_mbuf_cleanup (txq=0x13e7e7000) at /data1/dpdk-19.11/drivers/net/hinic/hinic_pmd_tx.c:584
    584             int i, nb_free = 0;
    (gdb) bt
    #0  hinic_xmit_mbuf_cleanup (txq=0x13e7e7000) at /data1/dpdk-19.11/drivers/net/hinic/hinic_pmd_tx.c:584
    #1  0x000000000078ac00 in hinic_xmit_pkts (tx_queue=0x13e7e7000, tx_pkts=0xffffbd40ce80, nb_pkts=1)
        at /data1/dpdk-19.11/drivers/net/hinic/hinic_pmd_tx.c:1078
    #2  0x00000000004663c8 in reply_to_icmp_echo_rqsts ()
    #3  0x00000000004674fc in pong_launch_one_lcore ()
    #4  0x0000000000593ae8 in eal_thread_loop (arg=0x0)
        at /data1/dpdk-19.11/lib/librte_eal/linux/eal/eal_thread.c:153
    #5  0x0000ffffbe617d38 in start_thread (arg=0xffffbd40d910) at pthread_create.c:309
    #6  0x0000ffffbe55f5f0 in thread_start () at ../sysdeps/unix/sysv/linux/aarch64/clone.S:91
    (gdb) delete 
    Delete all breakpoints? (y or n) y
    (gdb) c

     txq->wq->delta txq->q_depth初始化

     

    /*
     * Set up one hinic TX (send) queue: round the requested descriptor count
     * up to a power of two and allocate the hardware SQ WQE pages.
     *
     * NOTE: this is a condensed excerpt of the driver function; the elided
     * upstream code also computes tx_free_thresh, allocates the txq software
     * structure (which is why txq/tx_free_thresh/socket_id appear unused
     * here), and performs error cleanup.
     *
     * Returns 0 on success, negative errno from hinic_create_sq() on failure.
     */
    static int hinic_tx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx,
                             uint16_t nb_desc, unsigned int socket_id,
                             __rte_unused const struct rte_eth_txconf *tx_conf)
    {
            int rc;
            struct hinic_nic_dev *nic_dev;
            struct hinic_hwdev *hwdev;
            struct hinic_txq *txq;               /* used by elided code below */
            u16 sq_depth, tx_free_thresh;        /* tx_free_thresh: elided code */

            nic_dev = HINIC_ETH_DEV_TO_PRIVATE_NIC_DEV(dev);
            hwdev = nic_dev->hwdev;

            /* queue depth must be power of 2, otherwise will be aligned up */
            sq_depth = (nb_desc & (nb_desc - 1)) ?
                            ((u16)(1U << (ilog2(nb_desc) + 1))) : nb_desc;

            /* alloc tx sq hw wqepage */
            rc = hinic_create_sq(hwdev, queue_idx, sq_depth);

            /* BUG FIX: the excerpt fell off the end of a non-void function,
             * which is undefined behavior when the caller uses the result. */
            return rc;
    }

     

    将nb_txd改小

    rte_eth_tx_queue_setup(portid, 0, nb_txd,
    rte_eth_dev_socket_id(portid),
    &txq_conf);

    Breakpoint 1, hinic_xmit_mbuf_cleanup (txq=0x13e7e7000) at /data1/dpdk-19.11/drivers/net/hinic/hinic_pmd_tx.c:584
    584             int i, nb_free = 0;
    (gdb) s
    586             int wqebb_cnt = 0;
    (gdb) list
    581     {
    582             struct hinic_tx_info *tx_info;
    583             struct rte_mbuf *mbuf, *m, *mbuf_free[HINIC_MAX_TX_FREE_BULK];
    584             int i, nb_free = 0;
    585             u16 hw_ci, sw_ci, sq_mask;
    586             int wqebb_cnt = 0;
    587
    588             hw_ci = HINIC_GET_SQ_HW_CI(txq);
    589             sw_ci = HINIC_GET_SQ_LOCAL_CI(txq);
    590             sq_mask = HINIC_GET_SQ_WQE_MASK(txq);
    (gdb) n
    588             hw_ci = HINIC_GET_SQ_HW_CI(txq);
    (gdb) n
    589             sw_ci = HINIC_GET_SQ_LOCAL_CI(txq);
    (gdb) n
    590             sq_mask = HINIC_GET_SQ_WQE_MASK(txq);
    (gdb) n
    592             for (i = 0; i < txq->tx_free_thresh; ++i) {
    (gdb) n
    593                     tx_info = &txq->tx_info[sw_ci];
    (gdb) n
    594                     if (hw_ci == sw_ci ||
    (gdb) p *tx_info
    $1 = {mbuf = 0x13e9a9400, wqebb_cnt = 1, cpy_mbuf = 0x0}
    (gdb) n
    595                             (((hw_ci - sw_ci) & sq_mask) < tx_info->wqebb_cnt))
    (gdb) n
    594                     if (hw_ci == sw_ci ||
    (gdb) n
    598                     sw_ci = (sw_ci + tx_info->wqebb_cnt) & sq_mask;
    (gdb) n
    600                     if (unlikely(tx_info->cpy_mbuf != NULL)) {
    (gdb) n
    605                     wqebb_cnt += tx_info->wqebb_cnt;
    (gdb) n
    606                     mbuf = tx_info->mbuf;
    (gdb) n
    608                     if (likely(mbuf->nb_segs == 1)) {
    (gdb) n
    609                             m = rte_pktmbuf_prefree_seg(mbuf);
    (gdb) n
    610                             tx_info->mbuf = NULL;
    (gdb) n
    612                             if (unlikely(m == NULL))
    (gdb) n
    615                             mbuf_free[nb_free++] = m;
    (gdb) n
    616                             if (unlikely(m->pool != mbuf_free[0]->pool ||
    (gdb) n
    592             for (i = 0; i < txq->tx_free_thresh; ++i) {
    (gdb) n
    593                     tx_info = &txq->tx_info[sw_ci];
    (gdb) n
    594                     if (hw_ci == sw_ci ||
    (gdb) n
    595                             (((hw_ci - sw_ci) & sq_mask) < tx_info->wqebb_cnt))
    (gdb) n
    594                     if (hw_ci == sw_ci ||
    (gdb) n
    598                     sw_ci = (sw_ci + tx_info->wqebb_cnt) & sq_mask;
    (gdb) n
    600                     if (unlikely(tx_info->cpy_mbuf != NULL)) {
    (gdb) n
    605                     wqebb_cnt += tx_info->wqebb_cnt;
    (gdb) n
    606                     mbuf = tx_info->mbuf;
    (gdb) n
    608                     if (likely(mbuf->nb_segs == 1)) {
    (gdb) n
    609                             m = rte_pktmbuf_prefree_seg(mbuf);
    (gdb) n
    610                             tx_info->mbuf = NULL;
    (gdb) n
    612                             if (unlikely(m == NULL))
    (gdb) n
    615                             mbuf_free[nb_free++] = m;
    (gdb) n
    616                             if (unlikely(m->pool != mbuf_free[0]->pool ||
    (gdb) n
    592             for (i = 0; i < txq->tx_free_thresh; ++i) {
    (gdb) c
    Continuing.
    
    Breakpoint 1, hinic_xmit_mbuf_cleanup (txq=0x13e7e7000) at /data1/dpdk-19.11/drivers/net/hinic/hinic_pmd_tx.c:584
    584             int i, nb_free = 0;
    (gdb) delete
    Delete all breakpoints? (y or n) y
    (gdb) c
    Continuing.

    bnxt_xmit_pkts

    /*
     * Burst-transmit up to nb_pkts mbufs on a bnxt TX queue.
     *
     * Three phases: (1) reclaim completed descriptors/mbufs, (2) enqueue each
     * packet's BDs onto the TX ring, ringing the doorbell periodically
     * (every ring_size/4 packets) so the NIC can start DMA early, and
     * (3) a final doorbell covering the whole burst.
     *
     * Returns the number of packets actually queued (may be < nb_pkts if the
     * ring fills up).
     */
    uint16_t bnxt_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
                 uint16_t nb_pkts)
    {
        struct bnxt_tx_queue *txq = tx_queue;
        uint16_t kick_mask = txq->tx_ring->tx_ring_struct->ring_size >> 2;
        uint16_t prev_kick = 0;
        uint16_t sent = 0;

        /* Free mbufs whose data the NIC has already DMA'd out. */
        bnxt_handle_tx_cp(txq);

        /* Enqueue packets until done or the ring rejects one. */
        while (sent < nb_pkts) {
            if (bnxt_start_xmit(tx_pkts[sent], txq))
                break;
            if ((sent & kick_mask) != prev_kick) {
                /* Intermediate doorbell: tell the NIC about BDs so far. */
                B_TX_DB(txq->tx_ring->tx_doorbell,
                        txq->tx_ring->tx_prod);
                prev_kick = sent & kick_mask;
            }
            sent++;
        }

        /* Final doorbell for everything queued in this burst. */
        if (sent)
            B_TX_DB(txq->tx_ring->tx_doorbell, txq->tx_ring->tx_prod);

        return sent;
    }

    这个函数的逻辑可以分为三个部分来看:

    首先是bnxt_handle_tx_cp,这里的cp是指complete(完成),这个函数主要负责处理之前网卡已经发送完成的mbuf,也就是网卡已经通过DMA将mbuf中的数据拷贝走,软件可以释放mbuf的逻辑;

    其次是bnxt_start_xmit,这个是真正的发送逻辑,其实这里的发送也并不是真的把数据拷贝到网卡上,而是根据每个mbuf的数据地址设置到bd ring,从而告诉网卡DMA拷贝的源地址;

    最后是B_TX_DB对tx_doorbell的写操作,作用就是前面bd的地址信息已经填充完毕,告诉网卡可以发起DMA了。

    在分析具体函数前,首先熟悉一下相关的数据结构。和tx_queue相关联的有两个ring,一个是tx_ring(发送ring),一个是cp_ring(完成ring)。其数据结构关系如下:

    下面分别介绍三个阶段的实现。

    释放已经DMA完成的mbuf

     bnxt_handle_tx_cp

    /*
     * Process the TX completion ring: count completions the hardware has
     * written back, free the corresponding mbufs via bnxt_tx_cmp(), and
     * acknowledge the consumed completion entries through the CP doorbell.
     *
     * Only runs when the number of in-use TX descriptors exceeds
     * tx_free_thresh, so cheap bursts skip the reclaim work entirely.
     *
     * Returns the number of packet completions handled.
     */
    static int bnxt_handle_tx_cp(struct bnxt_tx_queue *txq)
    {
        struct bnxt_cp_ring_info *cpr = txq->cp_ring;
        uint32_t raw_cons = cpr->cp_raw_cons; /* CP-ring consumer index from last pass */
        uint32_t cons;
        int nb_tx_pkts = 0;
        struct tx_cmpl *txcmp;

        if ((txq->tx_ring->tx_ring_struct->ring_size -
                (bnxt_tx_avail(txq->tx_ring))) >
                txq->tx_free_thresh) { /* in-use descriptors exceed tx_free_thresh */
            while (1) {
                cons = RING_CMP(cpr->cp_ring_struct, raw_cons);
                txcmp = (struct tx_cmpl *)&cpr->cp_desc_ring[cons];

                /* BUG FIX: stop at the first entry the NIC has not written
                 * back yet (valid bit check, as in upstream bnxt_txr.c).
                 * Without this break the while (1) loop never terminates. */
                if (!CMP_VALID(txcmp, raw_cons, cpr->cp_ring_struct))
                    break;

                /* type is set by hardware; TX_CMPL_TYPE_TX_L2 means the DMA
                 * finished and the mbuf's data may be released. */
                if (CMP_TYPE(txcmp) == TX_CMPL_TYPE_TX_L2)
                    nb_tx_pkts++;
                else
                    RTE_LOG_DP(DEBUG, PMD,
                            "Unhandled CMP type %02x\n",
                            CMP_TYPE(txcmp));
                raw_cons = NEXT_RAW_CMP(raw_cons); /* raw_cons = raw_cons + 1 */
            }
            if (nb_tx_pkts) /* number of mbufs that can be freed this pass */
                bnxt_tx_cmp(txq, nb_tx_pkts); /* free the mbufs */
            cpr->cp_raw_cons = raw_cons; /* persist the new consumer index */
            /* CP doorbell: tell hardware these completion BDs are consumed. */
            B_CP_DIS_DB(cpr, cpr->cp_raw_cons);
        }
        return nb_tx_pkts;
    }

    真正释放mbuf的操作是在bnxt_tx_cmp函数完成的。

       bnxt_tx_cmp

    /*
     * Free nr_pkts transmitted mbufs from the TX ring and advance the
     * software consumer index past every buffer descriptor they occupied.
     *
     * Although the NIC is the "consumer" of TX data, the driver still owns
     * mbuf release, so tx_cons is updated here in software.
     */
    static void bnxt_tx_cmp(struct bnxt_tx_queue *txq, int nr_pkts)
    {
        struct bnxt_tx_ring_info *txr = txq->tx_ring;
        uint16_t ci = txr->tx_cons;
        int pkt;

        for (pkt = 0; pkt < nr_pkts; pkt++) {
            /* tx_buf_ring holds the software view of queued mbufs. */
            struct bnxt_sw_tx_bd *slot = &txr->tx_buf_ring[ci];
            struct rte_mbuf *done = slot->mbuf;
            int bd;

            slot->mbuf = NULL;

            /* EW - no need to unmap DMA memory? */
            /* A packet occupies nr_bds descriptors; step past the first one
             * here and the remaining nr_bds - 1 in the loop below. */
            ci = RING_NEXT(txr->tx_ring_struct, ci);
            for (bd = 1; bd < slot->nr_bds; bd++)
                ci = RING_NEXT(txr->tx_ring_struct, ci); /* ci = ci + 1 */

            rte_pktmbuf_free(done);
        }

        txr->tx_cons = ci; /* publish the new consumer index */
    }

    这里注意一点,在函数的最后更新tx_ring的consumer index,虽然对于发送端来说,软件驱动是producer(产生数据),网卡是consumer(消费数据),但是真正释放数据还是由软件驱动完成,所以consumer index也是要在软件侧更新的。

    数据包发送

    数据包发送是在bnxt_start_xmit中完成的。

      bnxt_start_xmit

    /*
     * Queue a single packet (all its mbuf segments) onto the bnxt TX BD ring.
     *
     * Does not copy data: each BD is filled with the IOVA (DMA address) of an
     * mbuf segment so the NIC can fetch it once the doorbell is rung by the
     * caller (bnxt_xmit_pkts).
     *
     * Returns 0 on success; non-zero (-ENOMEM) when the ring lacks room.
     * NOTE(review): the return type is uint16_t, so -ENOMEM wraps to a large
     * positive value — callers only test for non-zero, but verify against the
     * current upstream signature.
     */
    static uint16_t bnxt_start_xmit(struct rte_mbuf *tx_pkt,
                    struct bnxt_tx_queue *txq)
    {
        struct bnxt_tx_ring_info *txr = txq->tx_ring;
        struct tx_bd_long *txbd;
        struct tx_bd_long_hi *txbd1;
        uint32_t vlan_tag_flags, cfa_action;
        bool long_bd = false;
        uint16_t last_prod = 0;
        struct rte_mbuf *m_seg;
        struct bnxt_sw_tx_bd *tx_buf;
        /* Packet-length hint flags for the NIC, indexed by data_len >> 9. */
        static const uint32_t lhint_arr[4] = {
            TX_BD_LONG_FLAGS_LHINT_LT512,
            TX_BD_LONG_FLAGS_LHINT_LT1K,
            TX_BD_LONG_FLAGS_LHINT_LT2K,
            TX_BD_LONG_FLAGS_LHINT_LT2K
        };

        /* Any requested offload (TSO, checksums, VLAN) needs the long BD format. */
        if (tx_pkt->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_TCP_CKSUM |
                    PKT_TX_UDP_CKSUM | PKT_TX_IP_CKSUM |
                    PKT_TX_VLAN_PKT | PKT_TX_OUTER_IP_CKSUM))
            long_bd = true;
        /* 1. Park the mbuf in the tx_ring's software descriptor (bnxt_sw_tx_bd). */
        tx_buf = &txr->tx_buf_ring[txr->tx_prod];
        tx_buf->mbuf = tx_pkt;
        tx_buf->nr_bds = long_bd + tx_pkt->nb_segs;
        /* One mbuf may occupy several BDs; last_prod is the ring index of the
         * last BD belonging to this mbuf. */
        last_prod = (txr->tx_prod + tx_buf->nr_bds - 1) &
                    txr->tx_ring_struct->ring_mask;

        if (unlikely(bnxt_tx_avail(txr) < tx_buf->nr_bds))
            return -ENOMEM;
        /* 2. Fill the hardware BD in tx_desc_ring from the mbuf; the key field
         * is txbd->addr (the DMA source address). */
        txbd = &txr->tx_desc_ring[txr->tx_prod];
        txbd->opaque = txr->tx_prod;
        txbd->flags_type = tx_buf->nr_bds << TX_BD_LONG_FLAGS_BD_CNT_SFT;
        txbd->len = tx_pkt->data_len;
        if (txbd->len >= 2014)
            txbd->flags_type |= TX_BD_LONG_FLAGS_LHINT_GTE2K;
        else
            txbd->flags_type |= lhint_arr[txbd->len >> 9];
        /* txbd->addr is the mbuf's DMA (IOVA) address.
         * NOTE(review): rte_cpu_to_le_32 truncates a 64-bit IOVA to 32 bits —
         * later DPDK uses rte_cpu_to_le_64 here; confirm against upstream. */
        txbd->addr = rte_cpu_to_le_32(RTE_MBUF_DATA_DMA_ADDR(tx_buf->mbuf));
    
        if (long_bd) {
            txbd->flags_type |= TX_BD_LONG_TYPE_TX_BD_LONG;
            vlan_tag_flags = 0;
            cfa_action = 0;
            if (tx_buf->mbuf->ol_flags & PKT_TX_VLAN_PKT) {
                /* shurd: Should this mask at
                 * TX_BD_LONG_CFA_META_VLAN_VID_MASK?
                 */
                vlan_tag_flags = TX_BD_LONG_CFA_META_KEY_VLAN_TAG |
                    tx_buf->mbuf->vlan_tci;
                /* Currently supports 8021Q, 8021AD vlan offloads
                 * QINQ1, QINQ2, QINQ3 vlan headers are deprecated
                 */
                /* DPDK only supports 802.1Q VLAN packets */
                vlan_tag_flags |=
                        TX_BD_LONG_CFA_META_VLAN_TPID_TPID8100;
            }
            /* Advance the tx_ring producer index to the companion high BD. */
            txr->tx_prod = RING_NEXT(txr->tx_ring_struct, txr->tx_prod);
    
            txbd1 = (struct tx_bd_long_hi *)
                        &txr->tx_desc_ring[txr->tx_prod];
            txbd1->lflags = 0;
            txbd1->cfa_meta = vlan_tag_flags;
            txbd1->cfa_action = cfa_action;
            /* Translate the mbuf's ol_flags into BD offload flags. */
            if (tx_pkt->ol_flags & PKT_TX_TCP_SEG) {
                /* TSO */
                txbd1->lflags |= TX_BD_LONG_LFLAGS_LSO;
                txbd1->hdr_size = tx_pkt->l2_len + tx_pkt->l3_len +
                        tx_pkt->l4_len + tx_pkt->outer_l2_len +
                        tx_pkt->outer_l3_len;
                txbd1->mss = tx_pkt->tso_segsz;
    
            } else if ((tx_pkt->ol_flags & PKT_TX_OIP_IIP_TCP_UDP_CKSUM) ==
                 PKT_TX_OIP_IIP_TCP_UDP_CKSUM) {
                /* Outer IP, Inner IP, Inner TCP/UDP CSO */
                txbd1->lflags |= TX_BD_FLG_TIP_IP_TCP_UDP_CHKSUM;
                txbd1->mss = 0;
            } else if ((tx_pkt->ol_flags & PKT_TX_IIP_TCP_UDP_CKSUM) ==
                 PKT_TX_IIP_TCP_UDP_CKSUM) {
                /* (Inner) IP, (Inner) TCP/UDP CSO */
                txbd1->lflags |= TX_BD_FLG_IP_TCP_UDP_CHKSUM;
                txbd1->mss = 0;
            } else if ((tx_pkt->ol_flags & PKT_TX_OIP_TCP_UDP_CKSUM) ==
                 PKT_TX_OIP_TCP_UDP_CKSUM) {
                /* Outer IP, (Inner) TCP/UDP CSO */
                txbd1->lflags |= TX_BD_FLG_TIP_TCP_UDP_CHKSUM;
                txbd1->mss = 0;
            } else if ((tx_pkt->ol_flags & PKT_TX_OIP_IIP_CKSUM) ==
                 PKT_TX_OIP_IIP_CKSUM) {
                /* Outer IP, Inner IP CSO */
                txbd1->lflags |= TX_BD_FLG_TIP_IP_CHKSUM;
                txbd1->mss = 0;
            } else if ((tx_pkt->ol_flags & PKT_TX_TCP_UDP_CKSUM) ==
                 PKT_TX_TCP_UDP_CKSUM) {
                /* TCP/UDP CSO */
                txbd1->lflags |= TX_BD_LONG_LFLAGS_TCP_UDP_CHKSUM;
                txbd1->mss = 0;
            } else if (tx_pkt->ol_flags & PKT_TX_IP_CKSUM) {
                /* IP CSO */
                txbd1->lflags |= TX_BD_LONG_LFLAGS_IP_CHKSUM;
                txbd1->mss = 0;
            } else if (tx_pkt->ol_flags & PKT_TX_OUTER_IP_CKSUM) {
                /* IP CSO */
                txbd1->lflags |= TX_BD_LONG_LFLAGS_T_IP_CHKSUM;
                txbd1->mss = 0;
            }
        } else {
            txbd->flags_type |= TX_BD_SHORT_TYPE_TX_BD_SHORT;
        }
    
        m_seg = tx_pkt->next;
        /* i is set at the end of the if(long_bd) block */
        /* Chain the remaining segments: one short BD per extra mbuf segment. */
        while (txr->tx_prod != last_prod) {
             /* Advance the tx_ring producer index. */
            txr->tx_prod = RING_NEXT(txr->tx_ring_struct, txr->tx_prod); /* txr->tx_prod = txr->tx_prod + 1 */
            tx_buf = &txr->tx_buf_ring[txr->tx_prod];
    
            txbd = &txr->tx_desc_ring[txr->tx_prod];
            txbd->addr = rte_cpu_to_le_32(RTE_MBUF_DATA_DMA_ADDR(m_seg));
            txbd->flags_type = TX_BD_SHORT_TYPE_TX_BD_SHORT;
            txbd->len = m_seg->data_len;
    
            m_seg = m_seg->next;
        }
    
        /* Mark the last BD of the packet. */
        txbd->flags_type |= TX_BD_LONG_FLAGS_PACKET_END;
         /* Advance the tx_ring producer index past this packet. */
        txr->tx_prod = RING_NEXT(txr->tx_ring_struct, txr->tx_prod); /* txr->tx_prod = txr->tx_prod + 1 */
    
        return 0;
    }

    其中值得注意的有两点,一个是mbuf向db(tx_bd_long)转换的过程,其bd地址设置为mbuf的iova地址,也就是dma地址。

    txbd->addr = rte_cpu_to_le_32(RTE_MBUF_DATA_DMA_ADDR(tx_buf->mbuf));
    #define RTE_MBUF_DATA_DMA_ADDR(mb) \
        ((uint64_t)((mb)->buf_iova + (mb)->data_off))
    另一方面是在发送过程中会更新tx_ring的producer index。
    
    启动DMA
    启动硬件DMA拷贝是通过以下语句完成:
    
    B_TX_DB(txq->tx_ring->tx_doorbell, txq->tx_ring->tx_prod)
    
    将tx_ring的producer index(tx_ring->tx_prod)写入tx_ring的tx_doorbell中。而无论是cp_ring的cp_doorbell还是tx_ring的tx_doorbell,都是在bnxt_alloc_hwrm_rings函数中初始化为设备的bar空间地址的。
    cpr->cp_doorbell = (char *)pci_dev->mem_resource[2].addr + idx * 0x80;
    txr->tx_doorbell = (char *)pci_dev->mem_resource[2].addr + idx * 0x80;

    ixgbe_xmit_pkts()

    https://blog.csdn.net/hz5034/article/details/88381486

    发送时回写的三种情况(默认为第一种,回写取决于RS):
    1、TXDCTL[n].WTHRESH = 0 and a descriptor that has RS set is ready to be written back.
    2、TXDCTL[n].WTHRESH > 0 and TXDCTL[n].WTHRESH descriptors have accumulated.
    3、TXDCTL[n].WTHRESH > 0 and the corresponding EITR counter has reached zero. The timer expiration flushes any accumulated descriptors and sets an interrupt event (TXDW).

    发送时回写:
    1、挂载每个包的最后一个分段时,若当前使用的desc数大于上限(默认为32),设置RS
    2、burst发包的最后一个包的最后一个分段,设置RS

    /*
     * ixgbe burst transmit (annotated excerpt — "..." marks elided upstream
     * code). Maps each mbuf segment to a TX descriptor, sets EOP/RS flags,
     * and finally writes the tail register (TDT) so the NIC starts DMA.
     */
    uint16_t
    ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
            uint16_t nb_pkts)
    {
        ...
        txq = tx_queue;
        sw_ring = txq->sw_ring;
        txr     = txq->tx_ring;
        tx_id   = txq->tx_tail; /* equivalent of the kernel driver's next_to_use */
        txe = &sw_ring[tx_id]; /* software ring entry that tx_tail points at */
        txp = NULL;
        ...
        /* If free descriptors dropped below tx_free_thresh (default 32),
         * reclaim descriptors the NIC has finished with. */
        if (txq->nb_tx_free < txq->tx_free_thresh)
            ixgbe_xmit_cleanup(txq);
        ...
        /* TX loop */
        for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
            ...
            tx_pkt = *tx_pkts++; /* next mbuf to transmit */
            pkt_len = tx_pkt->pkt_len; /* total length of the packet */
            ...
            nb_used = (uint16_t)(tx_pkt->nb_segs + new_ctx); /* descriptors needed */
            ...
            tx_last = (uint16_t) (tx_id + nb_used - 1); /* index of the packet's last desc */
            ...
            if (tx_last >= txq->nb_tx_desc) /* the descriptor ring wraps around */
                tx_last = (uint16_t) (tx_last - txq->nb_tx_desc);
            ...
            if (nb_used > txq->nb_tx_free) {
                ...
                if (ixgbe_xmit_cleanup(txq) != 0) {
                    /* Could not clean any descriptors */
                    if (nb_tx == 0) /* nothing sent yet: report 0 */
                        return 0;
                    goto end_of_tx; /* partial burst: stop and finalize queue state */
                }
                ...
            }
            ...
            /* A packet may consist of several segments; m_seg starts at the first. */
            m_seg = tx_pkt;
            do {
                txd = &txr[tx_id]; /* hardware descriptor */
                txn = &sw_ring[txe->next_id]; /* next software entry */
                ...
                txe->mbuf = m_seg; /* attach this segment to the sw entry */
                ...
                slen = m_seg->data_len; /* segment length */
                buf_dma_addr = rte_mbuf_data_dma_addr(m_seg); /* segment bus (DMA) address */
                txd->read.buffer_addr =
                    rte_cpu_to_le_64(buf_dma_addr); /* DMA address into the descriptor */
                txd->read.cmd_type_len =
                    rte_cpu_to_le_32(cmd_type_len | slen); /* length + command flags */
                ...
                txe->last_id = tx_last; /* remember the packet's last descriptor */
                tx_id = txe->next_id; /* advance to the next descriptor */
                txe = txn; /* advance to the next sw entry */
                m_seg = m_seg->next; /* advance to the next segment */
            } while (m_seg != NULL);
            ...
            /* Last segment of the packet: mark End Of Packet. */
            cmd_type_len |= IXGBE_TXD_CMD_EOP;
            txq->nb_tx_used = (uint16_t)(txq->nb_tx_used + nb_used); /* update nb_tx_used */
            txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_used); /* update nb_tx_free */
            ...
            if (txq->nb_tx_used >= txq->tx_rs_thresh) { /* >= tx_rs_thresh (default 32): request status write-back */
                ...
                cmd_type_len |= IXGBE_TXD_CMD_RS;
                ...
                txp = NULL; /* txp == NULL means RS already set for this stretch */
            } else
                txp = txd; /* txp != NULL means RS not yet set */
            ...
            txd->read.cmd_type_len |= rte_cpu_to_le_32(cmd_type_len);
        }
        ...
    end_of_tx:
        /* Last segment of the last packet of the burst. */
        ...
        if (txp != NULL) /* ensure RS is set so completions are reported */
            txp->read.cmd_type_len |= rte_cpu_to_le_32(IXGBE_TXD_CMD_RS);
        ...
        IXGBE_PCI_REG_WRITE_RELAXED(txq->tdt_reg_addr, tx_id); /* write tail (TDT) — NIC starts DMA */
        txq->tx_tail = tx_id; /* tx_tail now points at the next free descriptor */
        ...
        return nb_tx;
    }

    ixgbe_xmit_cleanup()

    /*
     * Reclaim up to tx_rs_thresh TX descriptors whose DD (Descriptor Done)
     * bit the NIC has written back (annotated excerpt — "..." marks elided
     * upstream code). Returns 0 on success, -1 when nothing can be cleaned.
     */
    static inline int
    ixgbe_xmit_cleanup(struct ixgbe_tx_queue *txq)
    {
        ...
        uint16_t last_desc_cleaned = txq->last_desc_cleaned;
        ...
        /* Candidate position: tx_rs_thresh descriptors past the last cleaned one. */
        desc_to_clean_to = (uint16_t)(last_desc_cleaned + txq->tx_rs_thresh);
        if (desc_to_clean_to >= nb_tx_desc) /* ring wrap-around */
            desc_to_clean_to = (uint16_t)(desc_to_clean_to - nb_tx_desc);
        ...
        /* Resolve to the last descriptor of the packet occupying that slot. */
        desc_to_clean_to = sw_ring[desc_to_clean_to].last_id;
        status = txr[desc_to_clean_to].wb.status;
        /* DD bit clear: hardware hasn't finished that descriptor — nothing to do. */
        if (!(status & rte_cpu_to_le_32(IXGBE_TXD_STAT_DD))) {
            ...
            return -(1);
        }
        ...
        /* Number of descriptors to reclaim (ring wrap-aware). */
        if (last_desc_cleaned > desc_to_clean_to) /* range wraps past the ring end */
            nb_tx_to_clean = (uint16_t)((nb_tx_desc - last_desc_cleaned) +
                                desc_to_clean_to);
        else
            nb_tx_to_clean = (uint16_t)(desc_to_clean_to -
                            last_desc_cleaned);
        ...
        txr[desc_to_clean_to].wb.status = 0; /* clear DD for reuse */
        ...
        txq->last_desc_cleaned = desc_to_clean_to; /* update last_desc_cleaned */
        txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + nb_tx_to_clean); /* update nb_tx_free */
        ...
        return 0;
    }
  • 相关阅读:
    浪潮之巅阅读笔记01
    2018年春季个人阅读计划
    问题账户需求分析
    需求工程解析图收获
    《软件需求分析》阅读笔记
    《软件需求》阅读笔记之三
    《小账本》开发日志 第六天
    《小账本》开发日志 第五天
    [POI2009]KAM-Pebbles BZOJ1115 [ 待填坑 ] 博弈
    打谷机 BZOJ 1603 模拟
  • 原文地址:https://www.cnblogs.com/dream397/p/13674512.html
Copyright © 2011-2022 走看看