zoukankan      html  css  js  c++  java
  • TCP最大报文段MSS源码分析

    概述

    本文主要对MSS相关的几个字段结合源码流程进行分析;

    字段含义

    user_mss(tcp_options_received)–用户配置的mss,优先级最高;

    mss_clamp(tcp_options_received)–对端通告的mss,即对端能接受的最大mss;实际取值为对端通告的mss与user_mss中的较小值;

    advmss(tcp_sock)–用于通告对端的mss值,本端能接受的最大mss;

    mss_cache(tcp_sock)–缓存发送方当前有效的mss值,根据pmtu变化,不会超过mss_clamp;

    rcv_mss(inet_connection_sock)–由最近接收到的段估算的对端mss,主要用来确定是否执行延迟确认;

    user_mss配置

    user_mss是用户配置的MSS,该MSS优先级最高,如果配置了该MSS,则MSS均不能超过该值;下面为调用setsockopt设置user_mss的代码,其操作字段为TCP_MAXSEG;配置范围不能小于最小MSS,不能大于最大窗口值;

     1 static int do_tcp_setsockopt(struct sock *sk, int level,
     2         int optname, char __user *optval, unsigned int optlen)
     3 {
     4     switch (optname) {
     5     case TCP_MAXSEG:
     6         /* Values greater than interface MTU won't take effect. However
     7          * at the point when this call is done we typically don't yet
     8          * know which interface is going to be used */
     9         if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) {
    10             err = -EINVAL;
    11             break;
    12         }
    13         tp->rx_opt.user_mss = val;
    14         break;
    15 }

    交互流程代码分析

    第一次握手
    客户端发送syn

    在进行connect操作的初始化中对mss的设置如下:

    (1) 如果有用户配置的user_mss,则将mss_clamp(本端最大mss)设置为user_mss;

    (2) 调用tcp_sync_mss来同步mss,其主要是根据设备mtu,最大窗口等计算出当前有效的mss,并将该mss记录到tp->mss_cache中;因该函数涉及篇幅较大,在本文最后进行分析;

    (3) 设置用于通告给对端的advmss,去路由表中查MSS,这里会用到pmtu,然后将这个值与user_mss比较,取较小的值设置为向对端通告的值;

    (4) 估算对端的mss,根据advmss,mss_cache,rcv_wnd,MSS_DEFAULT,MIN_MSS估算rcv_mss;

     1 static void tcp_connect_init(struct sock *sk)
     2 {
     3     /* If user gave his TCP_MAXSEG, record it to clamp */
     4     /* (1)如果配置了user_mss,则设置最大mss为user_mss */
     5     if (tp->rx_opt.user_mss)
     6         tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
     7     tp->max_window = 0;
     8     tcp_mtup_init(sk);
     9     /* (2)根据设备mtu同步mss */
    10     tcp_sync_mss(sk, dst_mtu(dst));
    11 
    12     tcp_ca_dst_init(sk, dst);
    13 
    14     if (!tp->window_clamp)
    15         tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
    16 
    17     /* 
    18         (3)设置向对端通告的mss
    19         dst_metric_advmss-去路由表中查询mss 
    20         tcp_mss_clamp-取user_mss和上述查询到的mss之间的较小值
    21     */
    22     tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
    23 
    24     /* (4)估算对端mss */
    25     tcp_initialize_rcv_mss(sk);
    26 }

    在发送syn流程中,会将advmss添加到tcp首部的选项中;调用关系为tcp_transmit_skb->tcp_syn_options->tcp_advertise_mss;可见这里不是直接使用前面的advmss,而是调用tcp_advertise_mss重新获取的;

     1 /* Compute TCP options for SYN packets. This is not the final
     2  * network wire format yet.
     3  */
     4 static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
     5                 struct tcp_out_options *opts,
     6                 struct tcp_md5sig_key **md5)
     7 {
     8     /* We always get an MSS option.  The option bytes which will be seen in
     9      * normal data packets should timestamps be used, must be in the MSS
    10      * advertised.  But we subtract them from tp->mss_cache so that
    11      * calculations in tcp_sendmsg are simpler etc.  So account for this
    12      * fact here if necessary.  If we don't do this correctly, as a
    13      * receiver we won't recognize data packets as being full sized when we
    14      * should, and thus we won't abide by the delayed ACK rules correctly.
    15      * SACKs don't matter, we never delay an ACK when we have any of those
    16      * going out.  */
    17     opts->mss = tcp_advertise_mss(sk);
    18     remaining -= TCPOLEN_MSS_ALIGNED;
    19 }

    tcp_advertise_mss重新去查路由表获取mss,并且与前面获取的mss取较小值;

     1 /* Calculate mss to advertise in SYN segment.
     2  * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
     3  *
     4  * 1. It is independent of path mtu.
     5  * 2. Ideally, it is maximal possible segment size i.e. 65535-40.
     6  * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
     7  *    attached devices, because some buggy hosts are confused by
     8  *    large MSS.
     9  * 4. We do not make 3, we advertise MSS, calculated from first
    10  *    hop device mtu, but allow to raise it to ip_rt_min_advmss.
    11  *    This may be overridden via information stored in routing table.
    12  * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
    13  *    probably even Jumbo".
    14  */
    15 static __u16 tcp_advertise_mss(struct sock *sk)
    16 {
    17     struct tcp_sock *tp = tcp_sk(sk);
    18     const struct dst_entry *dst = __sk_dst_get(sk);
    19     int mss = tp->advmss;
    20 
    21     if (dst) {
    22         unsigned int metric = dst_metric_advmss(dst);
    23 
    24         if (metric < mss) {
    25             mss = metric;
    26             tp->advmss = mss;
    27         }
    28     }
    29 
    30     return (__u16)mss;
    31 }
    服务器接收syn

    服务器当前处于LISTEN状态,收到客户端发来的syn包,在处理过程中,需要解析tcp首部的选项,调用关系为tcp_conn_request->tcp_parse_options,其中解析选项的MSS部分如下,解析mss选项,与user_mss进行对比取较小值,然后将mss_clamp(最大mss)设置为该值;

     1 /* Look for tcp options. Normally only called on SYN and SYNACK packets.
     2  * But, this can also be called on packets in the established flow when
     3  * the fast version below fails.
     4  */
     5 void tcp_parse_options(const struct sk_buff *skb,
     6                struct tcp_options_received *opt_rx, int estab,
     7                struct tcp_fastopen_cookie *foc)
     8 {
     9     switch (opcode) {
    10     case TCPOPT_MSS:
    11     if (opsize == TCPOLEN_MSS && th->syn && !estab) {
    12     u16 in_mss = get_unaligned_be16(ptr);
    13      if (in_mss) {
    14          if (opt_rx->user_mss && opt_rx->user_mss < in_mss)
    15              in_mss = opt_rx->user_mss;
    16              opt_rx->mss_clamp = in_mss;
    17     }
    18     }
    19     break;
    20 }

    在分配了请求控制块,对控制块进行初始化的时候,使用从选项中获取的最大mss初始化控制块的mss;

    1 static void tcp_openreq_init(struct request_sock *req,
    2                  const struct tcp_options_received *rx_opt,
    3                  struct sk_buff *skb, const struct sock *sk)
    4 {
    5     struct inet_request_sock *ireq = inet_rsk(req);
    6         /*  ... */
    7     req->mss = rx_opt->mss_clamp;
    8         /*  ... */
    9 }
    第二次握手
    服务器发送syn+ack

    在请求控制块添加到连接链表之后,需要向客户端发送syn+ack,在构造synack包时,需要在选项中指明本端的mss,调用关系如下:tcp_v4_send_synack->tcp_make_synack->tcp_synack_options;首先获取mss,方法与前面客户端的方法一致,即从路由表中获取mss,与用户配置的user_mss进行比较,取其中较小值;然后调用选项设置将该mss加入到选项中;

     1 struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
     2                 struct request_sock *req,
     3                 struct tcp_fastopen_cookie *foc,
     4                 enum tcp_synack_type synack_type)
     5 {
     6     /* mss取从路由表中查询的mss与user_mss之间的较小值 */
     7     mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
     8         /* 设置tcp选项 */ 
     9        tcp_header_size = tcp_synack_options(req, mss, skb, &opts, md5, foc) +   sizeof(*th);
    10 }
     1 /* Set up TCP options for SYN-ACKs. */
     2 static unsigned int tcp_synack_options(struct request_sock *req,
     3                        unsigned int mss, struct sk_buff *skb,
     4                        struct tcp_out_options *opts,
     5                        const struct tcp_md5sig_key *md5,
     6                        struct tcp_fastopen_cookie *foc)
     7 {
     8     struct inet_request_sock *ireq = inet_rsk(req);
     9     unsigned int remaining = MAX_TCP_OPTION_SPACE;
    10 
    11     /* We always send an MSS option. */
    12     opts->mss = mss;
    13     remaining -= TCPOLEN_MSS_ALIGNED;
    14 }
    客户端接收syn+ack

    客户端当前处于SYN_SENT状态,此时收到服务器发来的syn+ack包,客户端进行以下工作:(1)解析该包tcp选项中的mss,存入opt_rx->mss_clamp (2) 通过最新的pmtu计算mss (3) 估算对端mss (4) 如果需要进入快速ack模式,则需要通过rcv_mss计算快速ack模式额度;

     1 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
     2                      const struct tcphdr *th)
     3 {
     4     struct inet_connection_sock *icsk = inet_csk(sk);
     5     struct tcp_sock *tp = tcp_sk(sk);
     6     struct tcp_fastopen_cookie foc = { .len = -1 };
     7     int saved_clamp = tp->rx_opt.mss_clamp;
     8     bool fastopen_fail;
     9          /* ... */
    10     /* (1)解析tcp选项 */
    11     tcp_parse_options(skb, &tp->rx_opt, 0, &foc);
    12         /* ... */
    13         /* (2)计算mss */
    14         tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); 
    15         /* (3)初始化rcv_mss */ 
    16         tcp_initialize_rcv_mss(sk);
    17         /* ... */
    18        /* (4)进入快速ack模式 */
    19        tcp_enter_quickack_mode(sk);
    20 }
    已连接状态发送数据

    tcp发送数据系统调用最终会调用tcp_sendmsg函数,该函数会在发送数据之前,获取发送mss,该mss用于限制后续发送数据段大小;

    1 int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
    2 {
    3         /*...*/
    4     mss_now = tcp_send_mss(sk, &size_goal, flags);
    5         /*...*/
    6 }
    1 static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
    2 {
    3     int mss_now;
    4 
    5     mss_now = tcp_current_mss(sk);
    6     *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
    7 
    8     return mss_now;
    9 }

    tcp_current_mss函数根据当前mtu和实际头部选项长度,来更新mss值;

     1 /* Compute the current effective MSS, taking SACKs and IP options,
     2  * and even PMTU discovery events into account.
     3  */
     4 unsigned int tcp_current_mss(struct sock *sk)
     5 {
     6     const struct tcp_sock *tp = tcp_sk(sk);
     7     const struct dst_entry *dst = __sk_dst_get(sk);
     8     u32 mss_now;
     9     unsigned int header_len;
    10     struct tcp_out_options opts;
    11     struct tcp_md5sig_key *md5;
    12 
    13     /* 获取当前有效mss */
    14     mss_now = tp->mss_cache;
    15 
    16     /* 路由缓存存在 */
    17     if (dst) {
    18         /* 获取路径mtu */
    19         u32 mtu = dst_mtu(dst);
    20 
    21         /* 两个mtu不相等,以当前mtu为准更新mss */
    22         if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
    23             mss_now = tcp_sync_mss(sk, mtu);
    24     }
    25 
    26     /* 获取头部长度 */
    27     header_len = tcp_established_options(sk, NULL, &opts, &md5) +
    28              sizeof(struct tcphdr);
    29     /* The mss_cache is sized based on tp->tcp_header_len, which assumes
    30      * some common options. If this is an odd packet (because we have SACK
    31      * blocks etc) then our calculated header_len will be different, and
    32      * we have to adjust mss_now correspondingly */
    33 
    34     /*  头部长度不等,需要更新mss */
    35     if (header_len != tp->tcp_header_len) {
    36         int delta = (int) header_len - tp->tcp_header_len;
    37         mss_now -= delta;
    38     }
    39 
    40     /* 返回mss */
    41     return mss_now;
    42 }

    函数tcp_sync_mss

    这个函数上面的诸多流程都有用到,这里统一进行分析说明;

     1 /* This function synchronize snd mss to current pmtu/exthdr set.
     2 
     3    tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
     4    for TCP options, but includes only bare TCP header.
     5 
     6    tp->rx_opt.mss_clamp is mss negotiated at connection setup.
     7    It is minimum of user_mss and mss received with SYN.
     8    It also does not include TCP options.
     9 
    10    inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function.
    11 
    12    tp->mss_cache is current effective sending mss, including
    13    all tcp options except for SACKs. It is evaluated,
    14    taking into account current pmtu, but never exceeds
    15    tp->rx_opt.mss_clamp.
    16 
    17    NOTE1. rfc1122 clearly states that advertised MSS
    18    DOES NOT include either tcp or ip options.
    19 
    20    NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
    21    are READ ONLY outside this function.        --ANK (980731)
    22  */
    23 /*更新mss */
    24 unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
    25 {
    26     struct tcp_sock *tp = tcp_sk(sk);
    27     struct inet_connection_sock *icsk = inet_csk(sk);
    28     int mss_now;
    29 
    30     /* 发现mtu上限>路径mtu,则重置为路径mtu */
    31     if (icsk->icsk_mtup.search_high > pmtu)
    32         icsk->icsk_mtup.search_high = pmtu;
    33 
    34     /* 计算当前mss */
    35     mss_now = tcp_mtu_to_mss(sk, pmtu);
    36     /* 根据对端通知的最大窗口和当前mss大小调整mss */
    37     mss_now = tcp_bound_to_half_wnd(tp, mss_now);
    38 
    39     /* And store cached results */
    40     /* 记录最新的路径mtu */
    41     icsk->icsk_pmtu_cookie = pmtu;
    42     /* 启用了路径mtu发现 */
    43     if (icsk->icsk_mtup.enabled)
    44         /* mss为当前mss和mss探测下限计算所得的最小值 */
    45         mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
    46     /* 当前mss缓存 */
    47     tp->mss_cache = mss_now;
    48 
    49     return mss_now;
    50 }

    下面两个函数作用为根据mtu计算mss;

    1 /* 计算mss,未包含SACK */
    2 int tcp_mtu_to_mss(struct sock *sk, int pmtu)
    3 {
    4     /* Subtract TCP options size, not including SACKs */
    5     /* 去掉tcp选项的长度 */
    6     return __tcp_mtu_to_mss(sk, pmtu) -
    7            (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr));
    8 }
     1 /* 在不根据tcp选项的情况下计算mss */
     2 static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)
     3 {
     4     const struct tcp_sock *tp = tcp_sk(sk);
     5     const struct inet_connection_sock *icsk = inet_csk(sk);
     6     int mss_now;
     7 
     8     /* Calculate base mss without TCP options:
     9        It is MMS_S - sizeof(tcphdr) of rfc1122
    10      */
    11     /* 当前mss = 路径mtu - 网络头 - tcp头 */
    12     mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
    13 
    14     /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
    15     if (icsk->icsk_af_ops->net_frag_header_len) {
    16         const struct dst_entry *dst = __sk_dst_get(sk);
    17 
    18         if (dst && dst_allfrag(dst))
    19             mss_now -= icsk->icsk_af_ops->net_frag_header_len;
    20     }
    21 
    22     /* Clamp it (mss_clamp does not include tcp options) */
    23     /* 当前mss > mss最大值,调整成最大值 */
    24     if (mss_now > tp->rx_opt.mss_clamp)
    25         mss_now = tp->rx_opt.mss_clamp;
    26 
    27     /* Now subtract optional transport overhead */
    28     /* mss减去ip选项长度 */
    29     mss_now -= icsk->icsk_ext_hdr_len;
    30 
    31     /* Then reserve room for full set of TCP options and 8 bytes of data */
    32     /* 若不足48,则需要扩充保留40字节的tcp选项和8字节的tcp数据长度 */
    33     /* 8+20+20+18=64,最小包长 */
    34     if (mss_now < 48)
    35         mss_now = 48;
    36 
    37     /* 返回mss */
    38     return mss_now;
    39 }

    tcp_bound_to_half_wnd函数根据对端通告窗口的最大值来调整mss;如果最大窗口大于默认mss,则当前mss不能超过窗口的一半,当然也不能太小,最小为68-tcp_header_len;

     1 static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
     2 {
     3     int cutoff;
     4 
     5     /* When peer uses tiny windows, there is no use in packetizing
     6      * to sub-MSS pieces for the sake of SWS or making sure there
     7      * are enough packets in the pipe for fast recovery.
     8      *
     9      * On the other hand, for extremely large MSS devices, handling
    10      * smaller than MSS windows in this way does make sense.
    11      */
    12     /* 
    13         对端通告的最大窗口> 默认mss 
    14         cutoff记录最大窗口的一半
    15     */
    16     if (tp->max_window > TCP_MSS_DEFAULT)
    17         cutoff = (tp->max_window >> 1);
    18     /* <=默认mss,则记录最大窗口 */
    19     else
    20         cutoff = tp->max_window;
    21     
    22 
    23     /* 包大小值限制在68-header <= x <=cutoff之间 */
    24 
    25 
    26     
    27     /* 包大小> cutoff,则从cutoff和最小mtu之间取大的 */
    28     if (cutoff && pktsize > cutoff)
    29         return max_t(int, cutoff, 68U - tp->tcp_header_len);
    30 
    31     /* 包大小<= cutoff,返回包大小 */
    32     /* 窗口很大,则使用包大小 */
    33     else
    34         return pktsize;
    35 }
  • 相关阅读:
    SQL执行效率1
    php经典算法(转载)
    linux自用命令
    vim基本命令
    xampp安装
    BUU-rsa
    z3约束器学习笔记
    面试前夕oi挣扎式复习
    bss上的格式化字符串漏洞
    一、汇编
  • 原文地址:https://www.cnblogs.com/wanpengcoder/p/11751292.html
Copyright © 2011-2022 走看看