1 diff -Naur linux-2.6.26/include/linux/sysctl.h linux-2.6.26-dctcp-rev1.1.0/include/linux/sysctl.h 2 --- linux-2.6.26/include/linux/sysctl.h 2008-07-13 14:51:29.000000000 -0700 3 +++ linux-2.6.26-dctcp-rev1.1.0/include/linux/sysctl.h 2011-10-07 14:41:50.000000000 -0700 4 @@ -435,6 +435,9 @@ 5 NET_TCP_ALLOWED_CONG_CONTROL=123, 6 NET_TCP_MAX_SSTHRESH=124, 7 NET_TCP_FRTO_RESPONSE=125, 8 + NET_TCP_DELAYED_ACK=126, 9 + NET_TCP_DCTCP_ENABLE=127, 10 + NET_TCP_DCTCP_SHIFT_G=128, 11 }; 12 13 enum { 14 diff -Naur linux-2.6.26/include/linux/tcp.h linux-2.6.26-dctcp-rev1.1.0/include/linux/tcp.h 15 --- linux-2.6.26/include/linux/tcp.h 2008-07-13 14:51:29.000000000 -0700 16 +++ linux-2.6.26-dctcp-rev1.1.0/include/linux/tcp.h 2011-10-07 14:53:45.000000000 -0700 17 @@ -405,6 +405,15 @@ 18 /* TCP MD5 Signagure Option information */ 19 struct tcp_md5sig_info *md5sig_info; 20 #endif 21 + 22 +/* DCTCP Specific Parameters */ 23 + u32 acked_bytes_ecn; 24 + u32 acked_bytes_total; 25 + u32 prior_rcv_nxt; 26 + u32 dctcp_alpha; 27 + u32 next_seq; 28 + u32 ce_state; /* 0: last pkt was non-ce , 1: last pkt was ce */ 29 + u32 delayed_ack_reserved; 30 }; 31 32 static inline struct tcp_sock *tcp_sk(const struct sock *sk) 33 diff -Naur linux-2.6.26/include/net/tcp.h linux-2.6.26-dctcp-rev1.1.0/include/net/tcp.h 34 --- linux-2.6.26/include/net/tcp.h 2008-07-13 14:51:29.000000000 -0700 35 +++ linux-2.6.26-dctcp-rev1.1.0/include/net/tcp.h 2011-10-07 14:41:50.000000000 -0700 36 @@ -214,6 +214,9 @@ 37 extern int sysctl_tcp_fack; 38 extern int sysctl_tcp_reordering; 39 extern int sysctl_tcp_ecn; 40 +extern int sysctl_tcp_delayed_ack; 41 +extern int sysctl_tcp_dctcp_enable; 42 +extern int sysctl_tcp_dctcp_shift_g; 43 extern int sysctl_tcp_dsack; 44 extern int sysctl_tcp_mem[3]; 45 extern int sysctl_tcp_wmem[3]; 46 diff -Naur linux-2.6.26/kernel/sysctl_check.c linux-2.6.26-dctcp-rev1.1.0/kernel/sysctl_check.c 47 --- linux-2.6.26/kernel/sysctl_check.c 2008-07-13 14:51:29.000000000 -0700 48 +++ 
linux-2.6.26-dctcp-rev1.1.0/kernel/sysctl_check.c 2011-10-07 14:41:50.000000000 -0700 49 @@ -353,6 +353,9 @@ 50 { NET_TCP_FACK, "tcp_fack" }, 51 { NET_TCP_REORDERING, "tcp_reordering" }, 52 { NET_TCP_ECN, "tcp_ecn" }, 53 + { NET_TCP_DELAYED_ACK, "tcp_delayed_ack" }, 54 + { NET_TCP_DCTCP_ENABLE, "tcp_dctcp_enable" }, 55 + { NET_TCP_DCTCP_SHIFT_G, "tcp_dctcp_shift_g" }, 56 { NET_TCP_DSACK, "tcp_dsack" }, 57 { NET_TCP_MEM, "tcp_mem" }, 58 { NET_TCP_WMEM, "tcp_wmem" }, 59 diff -Naur linux-2.6.26/net/ipv4/sysctl_net_ipv4.c linux-2.6.26-dctcp-rev1.1.0/net/ipv4/sysctl_net_ipv4.c 60 --- linux-2.6.26/net/ipv4/sysctl_net_ipv4.c 2008-07-13 14:51:29.000000000 -0700 61 +++ linux-2.6.26-dctcp-rev1.1.0/net/ipv4/sysctl_net_ipv4.c 2011-10-07 14:41:50.000000000 -0700 62 @@ -506,6 +506,30 @@ 63 .proc_handler = &proc_dointvec 64 }, 65 { 66 + .ctl_name = NET_TCP_DELAYED_ACK, 67 + .procname = "tcp_delayed_ack", 68 + .data = &sysctl_tcp_delayed_ack, 69 + .maxlen = sizeof(int), 70 + .mode = 0644, 71 + .proc_handler = &proc_dointvec 72 + }, 73 + { 74 + .ctl_name = NET_TCP_DCTCP_ENABLE, 75 + .procname = "tcp_dctcp_enable", 76 + .data = &sysctl_tcp_dctcp_enable, 77 + .maxlen = sizeof(int), 78 + .mode = 0644, 79 + .proc_handler = &proc_dointvec 80 + }, 81 + { 82 + .ctl_name = NET_TCP_DCTCP_SHIFT_G, 83 + .procname = "tcp_dctcp_shift_g", 84 + .data = &sysctl_tcp_dctcp_shift_g, 85 + .maxlen = sizeof(int), 86 + .mode = 0644, 87 + .proc_handler = &proc_dointvec 88 + }, 89 + { 90 .ctl_name = NET_TCP_DSACK, 91 .procname = "tcp_dsack", 92 .data = &sysctl_tcp_dsack, 93 diff -Naur linux-2.6.26/net/ipv4/tcp_input.c linux-2.6.26-dctcp-rev1.1.0/net/ipv4/tcp_input.c 94 --- linux-2.6.26/net/ipv4/tcp_input.c 2008-07-13 14:51:29.000000000 -0700 95 +++ linux-2.6.26-dctcp-rev1.1.0/net/ipv4/tcp_input.c 2011-10-07 14:53:21.000000000 -0700 96 @@ -79,6 +79,9 @@ 97 int sysctl_tcp_fack __read_mostly = 1; 98 int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH; 99 int sysctl_tcp_ecn __read_mostly; 100 +int 
sysctl_tcp_delayed_ack __read_mostly = 1; 101 +int sysctl_tcp_dctcp_enable __read_mostly; 102 +int sysctl_tcp_dctcp_shift_g __read_mostly = 5; /* g=1/2^5 */ 103 int sysctl_tcp_dsack __read_mostly = 1; 104 int sysctl_tcp_app_win __read_mostly = 31; 105 int sysctl_tcp_adv_win_scale __read_mostly = 2; 106 @@ -212,16 +215,68 @@ 107 tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; 108 } 109 110 -static inline void TCP_ECN_check_ce(struct tcp_sock *tp, struct sk_buff *skb) 111 +static inline void TCP_ECN_dctcp_check_ce(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) 112 { 113 if (tp->ecn_flags & TCP_ECN_OK) { 114 - if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags)) 115 - tp->ecn_flags |= TCP_ECN_DEMAND_CWR; 116 - /* Funny extension: if ECT is not set on a segment, 117 - * it is surely retransmit. It is not in ECN RFC, 118 - * but Linux follows this rule. */ 119 - else if (INET_ECN_is_not_ect((TCP_SKB_CB(skb)->flags))) 120 - tcp_enter_quickack_mode((struct sock *)tp); 121 + u32 temp_rcv_nxt; 122 + 123 + if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags)) { 124 + 125 + /* rcv_nxt has already been updated by earlier processing (tcp_rcv_established) */ 126 + 127 + if(sysctl_tcp_dctcp_enable) { 128 + 129 + /* state changed from CE=0 to CE=1 and the delayed ack has not been sent yet */ 130 + if(tp->ce_state == 0 && tp->delayed_ack_reserved) { 131 + 132 + /* save current rcv_nxt */ 133 + temp_rcv_nxt = tp->rcv_nxt; 134 + /* ack data up to prior_rcv_nxt with the previous state (CE=0) */ 135 + tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; 136 + tp->rcv_nxt = tp->prior_rcv_nxt; 137 + tcp_send_ack(sk); 138 + /* restore current rcv_nxt */ 139 + tp->rcv_nxt = temp_rcv_nxt; 140 + } 141 + 142 + tp->ce_state = 1; 143 + } 144 + 145 + tp->ecn_flags |= TCP_ECN_DEMAND_CWR; 146 + 147 + 148 + /* Funny extension: if ECT is not set on a segment, 149 + * it is surely retransmit. It is not in ECN RFC, 150 + * but Linux follows this rule. 
*/ 151 + } else if (INET_ECN_is_not_ect((TCP_SKB_CB(skb)->flags))) { 152 + tcp_enter_quickack_mode((struct sock *)tp); 153 + }else { 154 + /* It has ECT but it doesn't have CE */ 155 + 156 + if(sysctl_tcp_dctcp_enable) { 157 + 158 + if(tp->ce_state != 0 && tp->delayed_ack_reserved) { 159 + 160 + /* save current rcv_nxt */ 161 + temp_rcv_nxt = tp->rcv_nxt; 162 + /* ack data up to prior_rcv_nxt with the previous state (CE=1) */ 163 + tp->ecn_flags |= TCP_ECN_DEMAND_CWR; 164 + tp->rcv_nxt = tp->prior_rcv_nxt; 165 + tcp_send_ack(sk); 166 + /* restore current rcv_nxt */ 167 + tp->rcv_nxt = temp_rcv_nxt; 168 + } 169 + 170 + tp->ce_state = 0; 171 + 172 + /* deassert only when DCTCP is enabled */ 173 + tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; 174 + } 175 + 176 + } 177 + 178 + /* remember current rcv_nxt as prior_rcv_nxt */ 179 + tp->prior_rcv_nxt = tp->rcv_nxt; 180 } 181 } 182 183 @@ -572,6 +627,8 @@ 184 */ 185 tcp_incr_quickack(sk); 186 icsk->icsk_ack.ato = TCP_ATO_MIN; 187 + 188 + tp->ce_state = 0; 189 } else { 190 int m = now - icsk->icsk_ack.lrcvtime; 191 192 @@ -592,7 +649,7 @@ 193 } 194 icsk->icsk_ack.lrcvtime = now; 195 196 - TCP_ECN_check_ce(tp, skb); 197 + TCP_ECN_dctcp_check_ce(sk, tp, skb); 198 199 if (skb->len >= 128) 200 tcp_grow_window(sk, skb); 201 @@ -836,19 +893,54 @@ 202 struct tcp_sock *tp = tcp_sk(sk); 203 const struct inet_connection_sock *icsk = inet_csk(sk); 204 205 + __u32 ssthresh_old; 206 + __u32 cwnd_old; 207 + __u32 cwnd_new; 208 + 209 tp->prior_ssthresh = 0; 210 tp->bytes_acked = 0; 211 if (icsk->icsk_ca_state < TCP_CA_CWR) { 212 tp->undo_marker = 0; 213 - if (set_ssthresh) 214 - tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); 215 - tp->snd_cwnd = min(tp->snd_cwnd, 216 - tcp_packets_in_flight(tp) + 1U); 217 + 218 + if(!sysctl_tcp_dctcp_enable) { 219 + 220 + if (set_ssthresh) 221 + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); 222 + 223 + tp->snd_cwnd = min(tp->snd_cwnd, 224 + tcp_packets_in_flight(tp) + 1U); 225 + 226 + }else { 227 + 228 + cwnd_new = max (tp->snd_cwnd - 
((tp->snd_cwnd * tp->dctcp_alpha)>>11) , 2U); 229 + 230 + if(set_ssthresh) { 231 + 232 + ssthresh_old = tp->snd_ssthresh; 233 + tp->snd_ssthresh = cwnd_new; 234 + 235 + /* printk("%llu alpha= %d ssth old= %d new= %d ", */ 236 + /* ktime_to_us(ktime_get_real()), */ 237 + /* tp->dctcp_alpha, */ 238 + /* ssthresh_old, */ 239 + /* tp->snd_ssthresh); */ 240 + } 241 + 242 + cwnd_old = tp->snd_cwnd; 243 + tp->snd_cwnd = cwnd_new; 244 + 245 + /* printk("%llu alpha= %d cwnd old= %d new= %d ", */ 246 + /* ktime_to_us(ktime_get_real()), */ 247 + /* tp->dctcp_alpha, */ 248 + /* cwnd_old, */ 249 + /* tp->snd_cwnd); */ 250 + } 251 + 252 tp->snd_cwnd_cnt = 0; 253 tp->high_seq = tp->snd_nxt; 254 tp->snd_cwnd_stamp = tcp_time_stamp; 255 TCP_ECN_queue_cwr(tp); 256 - 257 + 258 tcp_set_ca_state(sk, TCP_CA_CWR); 259 } 260 } 261 @@ -2513,7 +2605,8 @@ 262 tcp_try_keep_open(sk); 263 tcp_moderate_cwnd(tp); 264 } else { 265 - tcp_cwnd_down(sk, flag); 266 + if(!sysctl_tcp_dctcp_enable) 267 + tcp_cwnd_down(sk, flag); 268 } 269 } 270 271 @@ -3216,6 +3309,9 @@ 272 int prior_packets; 273 int frto_cwnd = 0; 274 275 + __u32 alpha_old; 276 + __u32 acked_bytes; 277 + 278 /* If the ack is newer than sent or older than previous acks 279 * then we can probably ignore it. 
280 */ 281 @@ -3269,6 +3365,45 @@ 282 tcp_ca_event(sk, CA_EVENT_SLOW_ACK); 283 } 284 285 + 286 + /* START: DCTCP Processing */ 287 + 288 + /* calc acked bytes */ 289 + if(after(ack,prior_snd_una)) { 290 + acked_bytes = ack - prior_snd_una; 291 + } else { 292 + acked_bytes = inet_csk(sk)->icsk_ack.rcv_mss; 293 + } 294 + 295 + if(flag & FLAG_ECE) 296 + tp->acked_bytes_ecn += acked_bytes; 297 + 298 + tp->acked_bytes_total += acked_bytes; 299 + 300 + /* snd_una has reached next_seq: the current observation window has ended */ 301 + if (!before(tp->snd_una,tp->next_seq)) { 302 + 303 + /* Avoid a zero denominator in the division below */ 304 + if(tp->acked_bytes_total == 0) tp->acked_bytes_total = 1; 305 + 306 + alpha_old = tp->dctcp_alpha; 307 + 308 + /* alpha = (1-g) * alpha + g * F, alpha scaled so that 1024 == 1.0; NOTE(review): the shift counts assume 0 <= sysctl_tcp_dctcp_shift_g <= 10, which the plain proc_dointvec handler does not enforce - confirm */ 309 + tp->dctcp_alpha = alpha_old - (alpha_old >> sysctl_tcp_dctcp_shift_g) 310 + + (tp->acked_bytes_ecn << (10 - sysctl_tcp_dctcp_shift_g)) / tp->acked_bytes_total; 311 + 312 + if(tp->dctcp_alpha > 1024) tp->dctcp_alpha = 1024; /* clamp to 0-1024 */ 313 + 314 + /* printk("bytes_ecn= %d total= %d alpha: old= %d new= %d ", */ 315 + /* tp->acked_bytes_ecn, tp->acked_bytes_total, alpha_old, tp->dctcp_alpha); */ 316 + 317 + tp->acked_bytes_ecn = 0; 318 + tp->acked_bytes_total = 0; 319 + tp->next_seq = tp->snd_nxt; 320 + } 321 + 322 + /* END: DCTCP Processing */ 323 + 324 /* We passed data and got it acked, remove any soft error 325 * log. Something worked... 326 */ 327 @@ -4014,7 +4149,7 @@ 328 goto queue_and_out; 329 } 330 331 - TCP_ECN_check_ce(tp, skb); 332 + TCP_ECN_dctcp_check_ce(sk, tp, skb); 333 334 if (tcp_try_rmem_schedule(sk, skb->truesize)) 335 goto drop; 336 @@ -4421,6 +4556,8 @@ 337 && __tcp_select_window(sk) >= tp->rcv_wnd) || 338 /* We ACK each frame or... */ 339 tcp_in_quickack_mode(sk) || 340 + /* Delayed ACK is disabled or ... */ 341 + sysctl_tcp_delayed_ack == 0 || 342 /* We have out of order data. 
*/ 343 (ofo_possible && skb_peek(&tp->out_of_order_queue))) { 344 /* Then ack it now */ 345 @@ -5419,6 +5556,9 @@ 346 } 347 348 EXPORT_SYMBOL(sysctl_tcp_ecn); 349 +EXPORT_SYMBOL(sysctl_tcp_delayed_ack); 350 +EXPORT_SYMBOL(sysctl_tcp_dctcp_enable); 351 +EXPORT_SYMBOL(sysctl_tcp_dctcp_shift_g); 352 EXPORT_SYMBOL(sysctl_tcp_reordering); 353 EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); 354 EXPORT_SYMBOL(tcp_parse_options); 355 diff -Naur linux-2.6.26/net/ipv4/tcp_minisocks.c linux-2.6.26-dctcp-rev1.1.0/net/ipv4/tcp_minisocks.c 356 --- linux-2.6.26/net/ipv4/tcp_minisocks.c 2008-07-13 14:51:29.000000000 -0700 357 +++ linux-2.6.26-dctcp-rev1.1.0/net/ipv4/tcp_minisocks.c 2011-10-07 15:03:45.000000000 -0700 358 @@ -398,6 +398,11 @@ 359 newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; 360 newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + 1; 361 362 + /* Initialize DCTCP internal parameters */ 363 + newtp->next_seq = newtp->snd_nxt; 364 + newtp->acked_bytes_ecn = 0; 365 + newtp->acked_bytes_total = 0; 366 + 367 tcp_prequeue_init(newtp); 368 369 tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn); 370 diff -Naur linux-2.6.26/net/ipv4/tcp_output.c linux-2.6.26-dctcp-rev1.1.0/net/ipv4/tcp_output.c 371 --- linux-2.6.26/net/ipv4/tcp_output.c 2008-07-13 14:51:29.000000000 -0700 372 +++ linux-2.6.26-dctcp-rev1.1.0/net/ipv4/tcp_output.c 2011-10-07 14:41:50.000000000 -0700 373 @@ -290,7 +290,7 @@ 374 struct tcp_sock *tp = tcp_sk(sk); 375 376 tp->ecn_flags = 0; 377 - if (sysctl_tcp_ecn) { 378 + if (sysctl_tcp_ecn || sysctl_tcp_dctcp_enable) { 379 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE | TCPCB_FLAG_CWR; 380 tp->ecn_flags = TCP_ECN_OK; 381 } 382 @@ -600,6 +600,10 @@ 383 TCP_ECN_send(sk, skb, tcp_header_size); 384 } 385 386 + /* In DCTCP, Assert ECT bit to all packets*/ 387 + if(sysctl_tcp_dctcp_enable) 388 + INET_ECN_xmit(sk); 389 + 390 #ifdef CONFIG_TCP_MD5SIG 391 /* Calculate the MD5 hash, as we have all we need now */ 392 if (md5) { 393 @@ -2352,6 
+2356,11 @@ 394 tcp_init_nondata_skb(buff, tp->write_seq++, TCPCB_FLAG_SYN); 395 TCP_ECN_send_syn(sk, buff); 396 397 + /* Initialize DCTCP internal parameters */ 398 + tp->next_seq = tp->snd_nxt; 399 + tp->acked_bytes_ecn = 0; 400 + tp->acked_bytes_total = 0; 401 + 402 /* Send it off. */ 403 TCP_SKB_CB(buff)->when = tcp_time_stamp; 404 tp->retrans_stamp = TCP_SKB_CB(buff)->when; 405 @@ -2385,6 +2394,10 @@ 406 int ato = icsk->icsk_ack.ato; 407 unsigned long timeout; 408 409 + /* Delayed ACK reserved flag for DCTCP */ 410 + struct tcp_sock *tp = tcp_sk(sk); 411 + tp->delayed_ack_reserved = 1; 412 + 413 if (ato > TCP_DELACK_MIN) { 414 const struct tcp_sock *tp = tcp_sk(sk); 415 int max_ato = HZ / 2; 416 @@ -2436,6 +2449,10 @@ 417 { 418 struct sk_buff *buff; 419 420 + /* Delayed ACK reserved flag for DCTCP */ 421 + struct tcp_sock *tp = tcp_sk(sk); 422 + tp->delayed_ack_reserved = 0; 423 + 424 /* If we have been reset, we may not send again. */ 425 if (sk->sk_state == TCP_CLOSE) 426 return;
https://github.com/myasuda/DCTCP-Linux/blob/master/dctcp-2.6.26-rev1.1.0.patch