概述
shutdown系统调用在tcp层会调用两个函数,对于ESTABLISHED状态需要调用tcp_shutdown关闭连接,对于LISTEN和SYN_SENT状态则需要以非阻塞模式调用tcp_disconnect断开连接;本文除了对这两个函数进行分析以外,还会分析在shutdown关闭了读或者写之后,读写系统调用sendmsg和recvmsg将如何处理对应操作;
/* Socket-layer shutdown handler (excerpt: the function prologue --
 * argument validation, lock_sock(), 'how' normalization -- is elided).
 * On entry 'how' is assumed already converted to a bitmask testable
 * against RCV_SHUTDOWN / SEND_SHUTDOWN -- TODO confirm against the
 * elided prologue.
 */
int inet_shutdown(struct socket *sock, int how)
{
	/*...*/
	switch (sk->sk_state) {
	case TCP_CLOSE:
		err = -ENOTCONN;
		/* Hack to wake up other listeners, who can poll for
		   POLLHUP, even on eg. unconnected UDP sockets -- RR */
	default:
		/* Record 'how' in sk_shutdown, then invoke the
		 * transport-layer shutdown (tcp_shutdown for TCP). */
		sk->sk_shutdown |= how;
		if (sk->sk_prot->shutdown)
			sk->sk_prot->shutdown(sk, how);
		break;

	/* Remaining two branches are temporary solution for missing
	 * close() in multithreaded environment. It is _not_ a good idea,
	 * but we have no choice until close() is repaired at VFS level.
	 */
	case TCP_LISTEN:
		/* Listening socket: nothing to do unless the receive
		 * direction is being shut down. */
		if (!(how & RCV_SHUTDOWN))
			break;
		/* Receive direction is being closed; treat like SYN_SENT. */
		/* Fall through */
	case TCP_SYN_SENT:
		/* Tear the connection down via the transport layer's
		 * disconnect, in non-blocking mode. */
		err = sk->sk_prot->disconnect(sk, O_NONBLOCK);

		/* Adjust the socket-layer state to match the outcome. */
		sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
		break;
	}

	/* Wake up anyone sleeping in poll. */
	/* State changed: wake any process waiting on this socket. */
	sk->sk_state_change(sk);
	release_sock(sk);
	return err;
}
tcp_shutdown
tcp_shutdown函数完成设置关闭之后的状态,并且发送fin;注意:当how中不含SEND_SHUTDOWN(即仅关闭接收方向)时,该函数直接返回、不发送fin,接收方向的关闭只是由recvmsg系统调用检查sk_shutdown状态后不再接收数据;
/*
 * Shutdown the sending side of a connection. Much like close except
 * that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD).
 */

void tcp_shutdown(struct sock *sk, int how)
{
	/* We need to grab some memory, and put together a FIN,
	 * and then put it into the queue to be sent.
	 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
	 */
	/* No SEND_SHUTDOWN bit: receive-only shutdown, which never
	 * emits a FIN -- nothing to do here. */
	if (!(how & SEND_SHUTDOWN))
		return;

	/* If we've already sent a FIN, or it's a closed state, skip this. */

	/* Only these four states transition and may send a FIN. */
	if ((1 << sk->sk_state) &
	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
		/* Clear out any half completed packets.  FIN if needed. */
		/* Move to the successor state per new_state[]; a non-zero
		 * return means TCP_ACTION_FIN was set, so send the FIN. */
		if (tcp_close_state(sk))
			tcp_send_fin(sk);
	}
}
tcp_close_state函数根据new_state状态表进行跳转,比如TCP_ESTABLISHED关闭时会跳转到TCP_FIN_WAIT1 | TCP_ACTION_FIN;
/* Close-path state transition table, indexed by the current TCP state.
 * The low bits (TCP_STATE_MASK) encode the next state; the
 * TCP_ACTION_FIN flag indicates that a FIN must be transmitted.
 * E.g. closing from TCP_ESTABLISHED yields TCP_FIN_WAIT1 plus a FIN.
 */
static const unsigned char new_state[16] = {
  /* current state:        new state:      action:	*/
  [0 /* (Invalid) */]	= TCP_CLOSE,
  [TCP_ESTABLISHED]	= TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  [TCP_SYN_SENT]	= TCP_CLOSE,
  [TCP_SYN_RECV]	= TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  [TCP_FIN_WAIT1]	= TCP_FIN_WAIT1,
  [TCP_FIN_WAIT2]	= TCP_FIN_WAIT2,
  [TCP_TIME_WAIT]	= TCP_CLOSE,
  [TCP_CLOSE]		= TCP_CLOSE,
  [TCP_CLOSE_WAIT]	= TCP_LAST_ACK | TCP_ACTION_FIN,
  [TCP_LAST_ACK]	= TCP_LAST_ACK,
  [TCP_LISTEN]		= TCP_CLOSE,
  [TCP_CLOSING]		= TCP_CLOSING,
  [TCP_NEW_SYN_RECV]	= TCP_CLOSE,	/* should not happen ! */
};

/* Advance sk to its close-path successor state via new_state[] and
 * report whether a FIN must be sent (non-zero = TCP_ACTION_FIN set).
 */
static int tcp_close_state(struct sock *sk)
{
	int next = (int)new_state[sk->sk_state];
	int ns = next & TCP_STATE_MASK;

	tcp_set_state(sk, ns);

	return next & TCP_ACTION_FIN;
}
tcp_send_fin完成fin的发送,如果队列中有数据段未发送,则共用最后一个数据段,在上面打fin标记,没有能重用的情况下,则新分配数据段;然后关闭nagle算法,并将队列中的数据段都发送出去;(注: 在内存压力下,即使尾部数据段已经全部发送,也直接在尾部数据段上打fin标记以避免新分配内存;此时递增snd_nxt,假装fin已随该段发出,真正的fin将在之后的超时重传路径中被发送出去)
/* Send a FIN. The caller locks the socket for us.
 * We should try to send a FIN packet really hard, but eventually give up.
 */
void tcp_send_fin(struct sock *sk)
{
	struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	/* Optimization, tack on the FIN if we have one skb in write queue and
	 * this skb was not yet sent, or we are under memory pressure.
	 * Note: in the latter case, FIN packet will be sent after a timeout,
	 * as TCP stack thinks it has already been transmitted.
	 */
	/* A tail skb exists && (unsent data pending || memory pressure):
	 * reuse the tail skb for the FIN instead of allocating a new one. */
	if (tskb && (tcp_send_head(sk) || tcp_under_memory_pressure(sk))) {
coalesce:
		/* Set the FIN flag on the tail skb. */
		TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
		/* The FIN flag consumes one sequence number. */
		TCP_SKB_CB(tskb)->end_seq++;
		tp->write_seq++;

		/* No send head: tskb was already transmitted (only reachable
		 * here via the memory-pressure condition above). */
		if (!tcp_send_head(sk)) {
			/* This means tskb was already sent.
			 * Pretend we included the FIN on previous transmit.
			 * We need to set tp->snd_nxt to the value it would have
			 * if FIN had been sent. This is because retransmit path
			 * does not change tp->snd_nxt.
			 */
			tp->snd_nxt++;
			return;
		}
	} else {
		/* No reusable tail skb: allocate a fresh skb for the FIN. */
		skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
		if (unlikely(!skb)) {
			/* Allocation failed: fall back to coalescing the FIN
			 * onto the tail skb if one exists at all. */
			if (tskb)
				goto coalesce;
			return;
		}

		/* Initialize the new skb. */
		skb_reserve(skb, MAX_TCP_HEADER);
		sk_forced_mem_schedule(sk, skb->truesize);
		/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
		tcp_init_nondata_skb(skb, tp->write_seq,
				     TCPHDR_ACK | TCPHDR_FIN);

		/* Append it to the send queue. */
		tcp_queue_skb(sk, skb);
	}

	/* Disable Nagle and push all queued segments out now. */
	__tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
}
tcp_disconnect
在连接为LISTEN或者SYN_SENT状态,会调用tcp_disconnect断开连接;函数首先对各种状态做分别的特有处理,然后再统一清理资源;
/* Abort the connection per RFC 793 and reset the socket back to an
 * unconnected, reusable state.  Called (among others) from
 * inet_shutdown() for LISTEN / SYN_SENT sockets, with O_NONBLOCK in
 * 'flags'.  Returns 0 here; 'err' is never set non-zero in this body.
 */
int tcp_disconnect(struct sock *sk, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	int err = 0;
	int old_state = sk->sk_state;

	/* Not yet closed: move to TCP_CLOSE (this also removes the
	 * control block from the hash tables). */
	if (old_state != TCP_CLOSE)
		tcp_set_state(sk, TCP_CLOSE);

	/* ABORT function of RFC793 */
	/* Listening socket: stop listening, flush the accept queue. */
	if (old_state == TCP_LISTEN) {
		inet_csk_listen_stop(sk);
	} else if (unlikely(tp->repair)) {
		/* Repair mode: just flag the abort, no packets sent. */
		sk->sk_err = ECONNABORTED;
	} else if (tcp_need_reset(old_state) ||
		   (tp->snd_nxt != tp->write_seq &&
		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
		/* The last check adjusts for discrepancy of Linux wrt. RFC
		 * states
		 */
		/* A RST is required: either the old state mandates one, or
		 * unsent data remains (snd_nxt != write_seq) while in a
		 * passive-close terminal state (CLOSING / LAST_ACK). */
		tcp_send_active_reset(sk, gfp_any());
		sk->sk_err = ECONNRESET;
	} else if (old_state == TCP_SYN_SENT)
		/* Connection attempt aborted before it completed. */
		sk->sk_err = ECONNRESET;

	/* Stop all transmit timers. */
	tcp_clear_xmit_timers(sk);

	/* Free every skb still queued for receive. */
	__skb_queue_purge(&sk->sk_receive_queue);

	/* Free every skb on the write (send) queue. */
	tcp_write_queue_purge(sk);
	tcp_fastopen_active_disable_ofo_check(sk);
	/* Free out-of-order (not yet in-sequence) segments. */
	skb_rbtree_purge(&tp->out_of_order_queue);

	/* Remaining miscellaneous cleanup / state re-initialization. */

	inet->inet_dport = 0;

	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
		inet_reset_saddr(sk);

	sk->sk_shutdown = 0;
	sock_reset_flag(sk, SOCK_DONE);
	tp->srtt_us = 0;
	tp->write_seq += tp->max_window + 2;
	if (tp->write_seq == 0)
		tp->write_seq = 1;
	icsk->icsk_backoff = 0;
	tp->snd_cwnd = 2;
	icsk->icsk_probes_out = 0;
	tp->packets_out = 0;
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	tp->snd_cwnd_cnt = 0;
	tp->window_clamp = 0;
	tcp_set_ca_state(sk, TCP_CA_Open);
	tcp_clear_retrans(tp);
	inet_csk_delack_init(sk);
	/* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
	 * issue in __tcp_select_window()
	 */
	icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
	tcp_init_send_head(sk);
	memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
	__sk_dst_reset(sk);
	dst_release(sk->sk_rx_dst);
	sk->sk_rx_dst = NULL;
	tcp_saved_syn_free(tp);

	/* Clean up fastopen related fields */
	tcp_free_fastopen_req(tp);
	inet->defer_connect = 0;

	WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);

	sk->sk_error_report(sk);
	return err;
}
tcp_sendmsg&&tcp_recvmsg
在使用shutdown关闭了发送之后,再次调用tcp_sendmsg发送数据,那么该函数会返回错误;
/* Excerpt from tcp_sendmsg(): once the send direction has been shut
 * down (SEND_SHUTDOWN set by shutdown()), or a socket error is pending,
 * any further send fails with -EPIPE. The rest of the function body is
 * elided.
 */
int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
	err = -EPIPE;
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto do_error;
}
在使用shutdown关闭了接收之后,再次调用tcp_recvmsg接收数据,那么函数不会读取数据,而是立即返回;
/* Excerpt from tcp_recvmsg(): the main receive loop.  The prologue
 * (locking, 'target' computation, seq pointer setup) is elided.
 * Relevant to shutdown(): once RCV_SHUTDOWN is set, the loop breaks
 * out immediately instead of sleeping for more data, so recvmsg
 * returns whatever was copied (possibly 0, i.e. EOF).
 */
int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
		int flags, int *addr_len)
{
	/*... */

	do {
		u32 offset;

		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
		if (tp->urg_data && tp->urg_seq == *seq) {
			if (copied)
				break;
			if (signal_pending(current)) {
				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
				break;
			}
		}

		/* Next get a buffer. */

		last = skb_peek_tail(&sk->sk_receive_queue);
		skb_queue_walk(&sk->sk_receive_queue, skb) {
			last = skb;
			/* Now that we have two receive queues this
			 * shouldn't happen.
			 */
			if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
				 "recvmsg bug: copied %X seq %X rcvnxt %X fl %X ",
				 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
				 flags))
				break;

			offset = *seq - TCP_SKB_CB(skb)->seq;
			if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
				pr_err_once("%s: found a SYN, please report ! ", __func__);
				offset--;
			}
			if (offset < skb->len)
				goto found_ok_skb;
			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
				goto found_fin_ok;
			WARN(!(flags & MSG_PEEK),
			     "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X ",
			     *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
		}

		/* Well, if we have backlog, try to process it now yet. */

		if (copied >= target && !sk->sk_backlog.tail)
			break;

		if (copied) {
			/* Some data already copied: stop (rather than sleep)
			 * on error, closed socket, receive-side shutdown,
			 * expired timeout, or pending signal. */
			if (sk->sk_err ||
			    sk->sk_state == TCP_CLOSE ||
			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
			    !timeo ||
			    signal_pending(current))
				break;
		} else {
			if (sock_flag(sk, SOCK_DONE))
				break;

			if (sk->sk_err) {
				copied = sock_error(sk);
				break;
			}

			/* Receive side shut down and nothing copied:
			 * return immediately (copied stays 0 => EOF). */
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;

			if (sk->sk_state == TCP_CLOSE) {
				if (!sock_flag(sk, SOCK_DONE)) {
					/* This occurs when user tries to read
					 * from never connected socket.
					 */
					copied = -ENOTCONN;
					break;
				}
				break;
			}

			if (!timeo) {
				copied = -EAGAIN;
				break;
			}

			if (signal_pending(current)) {
				copied = sock_intr_errno(timeo);
				break;
			}
		}
	} while (len > 0);
}