zoukankan      html  css  js  c++  java
  • Linux kernel 之 socket 创建过程分析

    • 重要结构体

    • struct socket 结构体

    	// General BSD-style socket structure.
    	// state: connection state of the socket (connected, unconnected, ...)
    	// type: socket type (%SOCK_STREAM, etc)
    	// flags: socket flags (%SOCK_NOSPACE, etc)
    	// wq: wait queue (RCU-annotated pointer)
    	// file: pointer to the struct file that goes with this socket
    	// sk: protocol-level struct sock; keeps protocol details out of this generic structure
    	// ops: protocol-specific socket operations table
    	struct socket {
     	   socket_state        state;

     	   kmemcheck_bitfield_begin(type);
     	   short           type;
     	   kmemcheck_bitfield_end(type);

     	   unsigned long       flags;

      	  struct socket_wq __rcu  *wq;

      	  struct file     *file;
      	  struct sock     *sk;
      	  const struct proto_ops  *ops;
    	};
    
    • struct socket 的创建

    	// socket() 本质上是 glibc 中的函数;在使用 socketcall 多路复用的体系结构(如 i386)上,它实际执行的是 sys_socketcall() 系统调用。
    	// 在这些体系结构上,sys_socketcall() 几乎是所有 socket 函数的入口,
    	// 也就是说 bind、connect 等函数都以 sys_socketcall() 作为入口,函数如下:
    
    	// include/linux/syscalls.h
    	asmlinkage long sys_socketcall(int call, unsigned long __user *args); 
    
    	// net/socket.c
    	/*
    	 * socketcall multiplexer (excerpt; some switch cases are elided below).
    	 *
    	 * call: selects which socket operation to run (SYS_SOCKET, SYS_BIND, ...).
    	 * args: user-space pointer to the array of unsigned long arguments.
    	 *
    	 * Returns the result of the selected handler, or -EINVAL / -EFAULT /
    	 * the audit error if validation, the user copy or auditing fails.
    	 */
    	SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
    	{
        	unsigned long a[AUDITSC_ARGS];
        	unsigned long a0, a1;
        	int err;
        	unsigned int len;

        	/* Reject call numbers outside [1, SYS_SENDMMSG]. */
        	if (call < 1 || call > SYS_SENDMMSG)
            	return -EINVAL;

        	/*
        	 * nargs[] (defined elsewhere) supplies the size in bytes of the
        	 * argument block for this call; it must fit in the local buffer.
        	 */
        	len = nargs[call];
        	if (len > sizeof(a))
            	return -EINVAL;

        	/* copy_from_user should be SMP safe. */
        	if (copy_from_user(a, args, len))
            	return -EFAULT;

        	/* Hand the argument count and the copied values to the audit hook. */
        	err = audit_socketcall(nargs[call] / sizeof(unsigned long), a);
        	if (err)
        	    return err;

        	a0 = a[0]; 
        	a1 = a[1];
        	/* Dispatch to the handler that matches the call number. */
        	switch (call) {
        	case SYS_SOCKET:  /* socket creation goes through sys_socket() */
        	    err = sys_socket(a0, a1, a[2]);
        	    break;
        	case SYS_BIND:
        	    err = sys_bind(a0, (struct sockaddr __user *)a1, a[2]);
        	    break;
        	case SYS_CONNECT:
        	    err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]);
        	    break;
        	case SYS_LISTEN:
        	    err = sys_listen(a0, a1);
        	    break;
        	/* ... remaining cases elided in this excerpt ... */
        	default:
        	    err = -EINVAL;
        	    break;
        	}
        	return err;
    	}
    
    	// include/linux/syscalls.h
    	asmlinkage long sys_socket(int, int, int);
    	// net/socket.c
    	/*
    	 * socket() system call.
    	 *
    	 * Splits the SOCK_CLOEXEC / SOCK_NONBLOCK flag bits off @type, creates
    	 * the struct socket with sock_create() and then maps it to a file
    	 * descriptor with sock_map_fd().
    	 *
    	 * Returns the value of sock_map_fd() on success, -EINVAL for unknown
    	 * flag bits, or the negative error from sock_create()/sock_map_fd().
    	 */
    	SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
    	{
    		struct socket *sock;
    		int flags;
    		int retval;

    		/* Compile-time consistency checks on the SOCK_* constants. */
    		BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
    		BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
    		BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
    		BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

    		/* Everything outside the type mask is a flag; only two are allowed. */
    		flags = type & ~SOCK_TYPE_MASK;
    		if ((flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) != 0)
    			return -EINVAL;
    		type &= SOCK_TYPE_MASK;

    		/* Translate SOCK_NONBLOCK to O_NONBLOCK where the two values differ. */
    		if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) {
    			flags &= ~SOCK_NONBLOCK;
    			flags |= O_NONBLOCK;
    		}

    		/* Create the struct socket itself. */
    		retval = sock_create(family, type, protocol, &sock);
    		if (retval < 0)
    			return retval;

    		/* Tie the socket to the file layer; drop the socket if that fails. */
    		retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
    		if (retval < 0)
    			sock_release(sock);

    		/* It may be already another descriptor 8) Not kernel problem. */
    		return retval;
    	}
    
    • sock_create() 函数

    	// include/linux/net.h
    	int sock_create(int family, int type, int proto, struct socket **res);

    	// net/socket.c
    	/*
    	 * Thin wrapper around __sock_create(): uses the network namespace of
    	 * the calling task (current->nsproxy->net_ns) and passes kern = 0.
    	 * The return value is whatever __sock_create() returns.
    	 */
    	int sock_create(int family, int type, int protocol, struct socket **res)
    	{
    		struct net *caller_ns = current->nsproxy->net_ns;

    		return __sock_create(caller_ns, family, type, protocol, res, 0);
    	}
    	EXPORT_SYMBOL(sock_create);
    
    	// include/linux/net.h 
    	int __sock_create(struct net *net, int family, int type, int proto,             
              struct socket **res, int kern);   
    	// net/socket.c
    	int __sock_create(struct net *net, int family, int type, int protocol,          
                	 struct socket **res, int kern)                                     
    	{                                                                               
    	    int err;                                                                    
    	    struct socket *sock;                                                        
    	    const struct net_proto_family *pf;                                          
    	                                                                                
    	    /*                                                                          
    	     *      Check protocol is in range                                          
    	     */    
    		// 检查 协议族是否在范围呢  
    	    if (family < 0 || family >= NPROTO)                                         
    	        return -EAFNOSUPPORT;                                                   
    	    if (type < 0 || type >= SOCK_MAX)   // 检查类型  
    	        return -EINVAL;                                                         
    	                                                                                
    	    /* Compatibility.                                                           
    	                                                                                
    	       This uglymoron is moved from INET layer to here to avoid                 
    	       deadlock in module load.                                                 
    	     */    // 检查用的是PF_INET 其实这个都是兼容的。 
    	    if (family == PF_INET && type == SOCK_PACKET) {                             
    	        static int warned;                                                      
    	        if (!warned) {                                                          
    	            warned = 1;                                                         
    	            pr_info("%s uses obsolete (PF_INET,SOCK_PACKET)
    ",                 
    	                current->comm);                                                 
    	        }                                                                       
    	        family = PF_PACKET;                                                     
    	    }                                                                           
    	    // 安全机制检查 
    	    err = security_socket_create(family, type, protocol, kern);                 
    	    if (err)                                                                    
    	        return err;                                                             
    	                                                                                
    	    /*                                                                          
    	     *  Allocate the socket and allow the family to set things up. if           
    	     *  the protocol is 0, the family is instructed to select an appropriate    
    	     *  default.                                                                
    	     */  // ----> sock_alloc  接下面  
    	    sock = sock_alloc();                                                        
    	    if (!sock) {                                                                
    	        net_warn_ratelimited("socket: no more sockets
    ");                      
    	        return -ENFILE; /* Not exactly a match, but its the                     
    	                   closest posix thing */                                       
    	    }                                                                           
    	                                                                                
    	    sock->type = type;    
             
    		// ... ...    
    	    return 0;         
           // ... ...            
    	}                                   
    	EXPORT_SYMBOL(__sock_create);  
    
    • sock_alloc() 函数解析,被上面的 __sock_create() 函数调用

    	// net/socket.c
    	/*
    	 * Allocate a socket together with its inode.
    	 *
    	 * A new inode is requested on the sockfs superblock (sock_mnt->mnt_sb);
    	 * the struct socket that belongs to it is obtained through SOCKET_I().
    	 * The inode is then initialised and the per-CPU sockets_in_use counter
    	 * is incremented by one.
    	 *
    	 * Returns the new struct socket, or NULL when the inode cannot be
    	 * allocated.
    	 */
    	static struct socket *sock_alloc(void)
    	{
    		struct inode *inode = new_inode_pseudo(sock_mnt->mnt_sb);
    		struct socket *sock;

    		if (inode == NULL)
    			return NULL;

    		sock = SOCKET_I(inode);
    		kmemcheck_annotate_bitfield(sock, type);

    		/* Fill in the inode: number, mode, owner and inode operations. */
    		inode->i_ino = get_next_ino();
    		inode->i_mode = S_IFSOCK | S_IRWXUGO;	/* socket file type, rwx for all */
    		inode->i_uid = current_fsuid();		/* fsuid of the current task */
    		inode->i_gid = current_fsgid();		/* fsgid of the current task */
    		inode->i_op = &sockfs_inode_ops;

    		this_cpu_add(sockets_in_use, 1);
    		return sock;
    	}
    
    	// 接下来我们再看到 SOCKET_I(inode);
    	// include/net/sock.h
    	static inline struct socket *SOCKET_I(struct inode *inode)                      
    	{                                                                               
        	return &container_of(inode, struct socket_alloc, vfs_inode)->socket;        
    	}    
    	// 然后我们发现,返回的是 inode 内的socket 结构体。  
    	
    	// Now look at how container_of() is defined.
    	// include/linux/kernel.h
    	/*
    	 * container_of - get a pointer to the structure that contains a member
    	 * ptr:    pointer to the member
    	 * type:   type of the containing structure
    	 * member: name of the member inside that structure
    	 *
    	 * typeof() gives __mptr the member's own pointer type, so ptr is stored
    	 * in a correctly typed temporary. Subtracting the member's byte offset
    	 * (offsetof) from that address yields the start of the containing
    	 * structure, which is returned as a (type *).
    	 *
    	 * Relies on the GNU extensions typeof and statement expressions.
    	 * Every line but the last must end in a backslash so that the whole
    	 * body belongs to the macro.
    	 */
    	#define container_of(ptr, type, member) ({			\
    		const typeof( ((type *)0)->member ) *__mptr = (ptr);	\
    		(type *)( (char *)__mptr - offsetof(type,member) );})
    	//  offsetof   include/linux/stddef.h
    	/*
    	 * Byte offset of MEMBER inside TYPE. Any earlier definition is dropped
    	 * first. If __compiler_offsetof is defined it is used; otherwise the
    	 * offset is taken as the address of MEMBER in a TYPE placed at
    	 * address 0.
    	 */
    	#undef offsetof
    	#ifndef __compiler_offsetof
    	#define offsetof(TYPE, MEMBER)  ((size_t)&((TYPE *)0)->MEMBER)
    	#else
    	#define offsetof(TYPE, MEMBER)  __compiler_offsetof(TYPE, MEMBER)
    	#endif
    	                                                                          
    	// 这里确实有点难理解,总之最后得到的结果是指向 type 这个结构体起始地址的指针。
    	
    	// So the container_of() call in SOCKET_I() yields a pointer to the enclosing
    	// struct socket_alloc, and SOCKET_I() returns the address of its socket member.
    	// Because socket is the first member, that address equals the start of the structure.
    	// include/net/sock.h
    	struct socket_alloc {
        	struct socket socket;
        	struct inode vfs_inode;
    	};
    
    • 回到 __sock_create() 继续分析

    	// net/socket.c   --> __sock_create()
    	// Fragment from inside __sock_create(); compiled only when CONFIG_MODULES is defined.
    	#ifdef CONFIG_MODULES
    	    /* Attempt to load a protocol module if the find failed.
    	     *
    	     * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
    	     * requested real, full-featured networking support upon configuration.
    	     * Otherwise module support will break!
    	     */
    	    /* No entry in net_families[] for this family yet: request the
    	     * module named "net-pf-<family>". */
    	    if (rcu_access_pointer(net_families[family]) == NULL)
    	        request_module("net-pf-%d", family);
    	#endif
    
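    	如果内核配置(make menuconfig)中启用了可加载模块支持(CONFIG_MODULES),上面这段代码才会被编译进内核。
    	这段代码先检查对应协议族的操作表是否已经注册(即 net_families[family] 是否为 NULL),如果还没有注册,就调用 request_module() 尝试加载名为 net-pf-<family> 的模块。本文只分析 TCP/IP 协议,所以 family 是 AF_INET,也就是 2,实际检查的全局变量就是 net_families[2]。这个表项是在系统初始化时由 net/ipv4/af_inet.c 中的 inet_init() 通过 sock_register(&inet_family_ops) 注册的,具体代码是: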
    
    	// net/ipv4/af_inet.c
    	static int __init inet_init(void)                                               
    	{                                                                               
    	    struct inet_protosw *q;                                                     
    	    struct list_head *r;                                                        
    	    int rc = -EINVAL;                                                           
    	                                                                                
    	    sock_skb_cb_check_size(sizeof(struct inet_skb_parm));                       
    	    // 各个协议的注册 
    	    rc = proto_register(&tcp_prot, 1);                                          
    	    if (rc)                                                                     
    	        goto out;                                                               
    	                                                                                
    	    rc = proto_register(&udp_prot, 1);                                          
    	    if (rc)                                                                     
    	        goto out_unregister_tcp_proto;                                          
    	                                                                                
    	    rc = proto_register(&raw_prot, 1);                                          
    	    if (rc)                                                                     
    	        goto out_unregister_udp_proto;                                          
    	                                                                                
    	    rc = proto_register(&ping_prot, 1);                                         
    	    if (rc)                                                                     
    	        goto out_unregister_raw_proto;                                          
    	                                                                                
    	    /*                                                                          
    	     *  Tell SOCKET that we are alive...                                        
    	     */                                                                         
    	
    		    (void)sock_register(&inet_family_ops);                                      
    	                                                                                
    	#ifdef CONFIG_SYSCTL                                                            
    	    ip_static_sysctl_init();                                                    
    	#endif                                                                          
    	                                                                                
    	    /*                                                                          
    	     *  Add all the base protocols.                                             
    	     */                                                                         
    	    // 各个协议的添加,添加不成功则报错 
    	    if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)                    
    	        pr_crit("%s: Cannot add ICMP protocol
    ", __func__);                    
    	    if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)                      
    	        pr_crit("%s: Cannot add UDP protocol
    ", __func__);                     
    	    if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)                      
    	        pr_crit("%s: Cannot add TCP protocol
    ", __func__);                     
    	#ifdef CONFIG_IP_MULTICAST                                                      
    	    if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)                    
    	        pr_crit("%s: Cannot add IGMP protocol
    ", __func__);                    
    	#endif                                                                          
    	                                                                                
    	    /* Register the socket-side information for inet_create. */                 
    	    for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)                            
    	        INIT_LIST_HEAD(r);                                                      
    	    // 把这个关键性的链接表一个个注册上去
    
    		// ******************************************************
    		// inetsw_array: array of struct inet_protosw entries. Each entry pairs a
    		// socket type and protocol number with its struct proto (e.g. tcp_prot),
    		// its socket operations table and a set of flags.
    		static struct inet_protosw inetsw_array[] =
    		{
    			{
    				.type =       SOCK_STREAM,
    				.protocol =   IPPROTO_TCP,
    				.prot =       &tcp_prot,
    				.ops =        &inet_stream_ops,
    				.flags =      INET_PROTOSW_PERMANENT |
    					      INET_PROTOSW_ICSK,
    			},

    			{
    				.type =       SOCK_DGRAM,
    				.protocol =   IPPROTO_UDP,
    				.prot =       &udp_prot,
    				.ops =        &inet_dgram_ops,
    				.flags =      INET_PROTOSW_PERMANENT,
    			},

    			{
    				.type =       SOCK_DGRAM,
    				.protocol =   IPPROTO_ICMP,
    				.prot =       &ping_prot,
    				.ops =        &inet_dgram_ops,
    				.flags =      INET_PROTOSW_REUSE,
    			},
    			// ... ... (remaining entries elided in this excerpt)
    		};
    		
    		// tcp_prot  ---> net/ipv4/tcp_ipv4.c
    		// struct proto instance for TCP: a table of operation hooks, accounting
    		// pointers and tunables. It is the .prot of the SOCK_STREAM entry in inetsw_array.
    		struct proto tcp_prot = {
        		.name           = "TCP",
        		.owner          = THIS_MODULE,
        		.close          = tcp_close,
        		.connect        = tcp_v4_connect,
        		.disconnect     = tcp_disconnect,
        		.accept         = inet_csk_accept,
        		.ioctl          = tcp_ioctl,
        		.init           = tcp_v4_init_sock,    // init hook; per the article it is invoked later on (call site not shown here)
        		.destroy        = tcp_v4_destroy_sock,
        		.shutdown       = tcp_shutdown,
        		.setsockopt     = tcp_setsockopt,
        		.getsockopt     = tcp_getsockopt,
        		.recvmsg        = tcp_recvmsg,
        		.sendmsg        = tcp_sendmsg,
        		.sendpage       = tcp_sendpage,
        		.backlog_rcv        = tcp_v4_do_rcv,
        		.release_cb     = tcp_release_cb,
        		.hash           = inet_hash,
        		.unhash         = inet_unhash,
        		.get_port       = inet_csk_get_port,
        		.enter_memory_pressure  = tcp_enter_memory_pressure,
        		.stream_memory_free = tcp_stream_memory_free,
        		.sockets_allocated  = &tcp_sockets_allocated,
        		.orphan_count       = &tcp_orphan_count,
        		.memory_allocated   = &tcp_memory_allocated,
        		.memory_pressure    = &tcp_memory_pressure,
        		.sysctl_mem     = sysctl_tcp_mem,
        		.sysctl_wmem        = sysctl_tcp_wmem,
        		.sysctl_rmem        = sysctl_tcp_rmem,
        		.max_header     = MAX_TCP_HEADER,
        		.obj_size       = sizeof(struct tcp_sock),
        		.slab_flags     = SLAB_DESTROY_BY_RCU,
        		.twsk_prot      = &tcp_timewait_sock_ops,
        		.rsk_prot       = &tcp_request_sock_ops,
        		.h.hashinfo     = &tcp_hashinfo,
        		.no_autobind        = true,
    		#ifdef CONFIG_COMPAT
        		.compat_setsockopt  = compat_tcp_setsockopt,
        		.compat_getsockopt  = compat_tcp_getsockopt,
    		#endif
    		#ifdef CONFIG_MEMCG_KMEM
        		.init_cgroup        = tcp_init_cgroup,
    		    .destroy_cgroup     = tcp_destroy_cgroup,
        		.proto_cgroup       = tcp_proto_cgroup,
    		#endif
    		};
    		EXPORT_SYMBOL(tcp_prot); 
    		// ***********************************************************
    
    	    for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)            
    	        inet_register_protosw(q);                                               
    	    
    		// 各个协议模块的初始化 
    	    /*                                                                          
    	     *  Set the ARP module up                                                   
    	     */                                                                         
    	                                                                                
    	    arp_init();                                                                 
    	                                                                                
    	    /*                                                                          
    	     *  Set the IP module up                                                    
    	     */                                                                         
    	                                                                                
    	    ip_init();                                                                  
    	                                                                                
    	    tcp_v4_init();                                                              
    	                                                                                
    	    /* Setup TCP slab cache for open requests. */                               
    	    tcp_init();                                                                 
                                                                                    
    	    /* Setup UDP memory threshold */                                            
    	    udp_init();                                                                 
    	                                                                                
    	    /* Add UDP-Lite (RFC 3828) */                                               
    	    udplite4_register();                                                        
                                                                              
    	    ping_init();                                                                
    	                                                                                
    	    /*                                                                          
    	     *  Set the ICMP layer up                                                   
    	     */                                                                         
                                                                                    
    	    if (icmp_init() < 0)                                                        
    	        panic("Failed to create the ICMP control socket.\n");
    	                                                                                
    	    /*                                                                          
    	     *  Initialise the multicast router                                         
    	     */                                                                         
    	#if defined(CONFIG_IP_MROUTE)                                                   
    	    if (ip_mr_init())                                                           
    	        pr_crit("%s: Cannot init ipv4 mroute\n", __func__);
    	#endif                                                                          
    	                                                                                
    	    if (init_inet_pernet_ops())                                                 
    	        pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__);
    	    /*                                                                          
    	     *  Initialise per-cpu ipv4 mibs                                            
    	     */                                                                         
                                                                                    
    	    if (init_ipv4_mibs())                                                       
    	        pr_crit("%s: Cannot init ipv4 mibs\n", __func__);
    	                                                                                
    	    ipv4_proc_init();                                                           
    	                                                                                
    	    ipfrag_init();                                                              
    	                                                                                
    	    dev_add_pack(&ip_packet_type);                                              
    	                                                                                
    	    ip_tunnel_core_init();                                                      
                                                                                    
    	    rc = 0;                                                                     
    	out:                                                                            
    	    return rc;                                                                  
    	out_unregister_raw_proto:                                                       
    	    proto_unregister(&raw_prot);                                                
    	out_unregister_udp_proto:                                                       
    	    proto_unregister(&udp_prot);                                                
    	out_unregister_tcp_proto:                                                       
    	    proto_unregister(&tcp_prot);                                                
    	    goto out;                                                                   
    	}                                                                               
    
    	fs_initcall(inet_init);
    
    • 很粗浅地看完协议那一部分之后,我们回到 __sock_create()

    	// net/socket.c
    	// 看到 这个回调函数的调用
    	    err = pf->create(net, sock, protocol, kern);                                
    	    if (err < 0)                                                                
     	       goto out_module_put; 
    
    	// 先看一个 inet_protosw 结构体
    	// include/net/protocol.h
    	/* This is used to register socket interfaces for IP protocols.  */             
    	/* One registered (type, protocol) entry; inet_create() walks these entries. */
    	struct inet_protosw {
        	/* Links this entry into the inetsw[] list that inet_create() iterates. */
        	struct list_head list;

        	    /* These two fields form the lookup key.  */
        	unsigned short   type;     /* This is the 2nd argument to socket(2). */
        	unsigned short   protocol; /* This is the L4 protocol number.  */

        	/* inet_create() passes prot to sk_alloc() and stores ops in sock->ops. */
        	struct proto     *prot;
        	const struct proto_ops *ops;

        	unsigned char    flags;      /* See INET_PROTOSW_* below.  */
    	};
    		
    	// 上面的 create 函数对应的是 net/ipv4/af_inet.c 里面的 inet_create 函数
    	// net/ipv4/af_inet.c
    	/*
    	 * inet_create - set up a newly created socket for the PF_INET family.
    	 * @net:      network namespace (its user_ns and ipv4 sysctl are read here)
    	 * @sock:     socket being initialised; sock->type selects the inetsw[] list
    	 * @protocol: requested protocol number; IPPROTO_IP acts as a wildcard
    	 * @kern:     non-zero skips the CAP_NET_RAW check and is passed to sk_alloc()
    	 *
    	 * Returns 0 on success or a negative errno: -EINVAL (protocol out of
    	 * range), -ESOCKTNOSUPPORT (no entries at all for this socket type),
    	 * -EPROTONOSUPPORT (no matching protocol), -EPERM (raw socket without
    	 * CAP_NET_RAW), -ENOBUFS (sk_alloc() failed), or the error returned by
    	 * the protocol's init hook.
    	 */
    	static int inet_create(struct net *net, struct socket *sock, int protocol,
                   int kern)
    	{
    	    struct sock *sk;
    	    struct inet_protosw *answer;
    	    struct inet_sock *inet;
    	    struct proto *answer_prot;
    	    unsigned char answer_flags;
    	    int try_loading_module = 0;
    	    int err;
    	    // Reject protocol numbers outside [0, IPPROTO_MAX).
    	    if (protocol < 0 || protocol >= IPPROTO_MAX)
    	        return -EINVAL;
    	    // Mark the socket as not connected.
    	    sock->state = SS_UNCONNECTED;

    	    /* Look for the requested type/protocol pair. */
    	// Re-entered via goto after each module-load attempt.
    	lookup_protocol:
    	    err = -ESOCKTNOSUPPORT;
    	    rcu_read_lock();
    		// Walk the inetsw[] list for this socket type.
    	    list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {

    	        err = 0;
    			// Compare this entry with the requested protocol.
    	        /* Check the non-wild match. */
    			// Exact match on a concrete (non-IPPROTO_IP) protocol: stop with err == 0.
    	        if (protocol == answer->protocol) {
    	            if (protocol != IPPROTO_IP)
    	                break;
    	        } else {
    	            /* Check for the two wild cases. */
    	            if (IPPROTO_IP == protocol) {
    	                protocol = answer->protocol;
    	                break;
    	            }
    	            if (IPPROTO_IP == answer->protocol)
    	                break;
    	        }
    			// This entry did not match; remember the error in case the loop ends here.
    	        err = -EPROTONOSUPPORT;
    	    }
    	    // Lookup failed: try loading a protocol module (at most twice), then give up.
    	    if (unlikely(err)) {
    	        if (try_loading_module < 2) {
    	            rcu_read_unlock();
    	            /*
    	             * Be more specific, e.g. net-pf-2-proto-132-type-1
    	             * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
    	             */
    	            if (++try_loading_module == 1)
    	                request_module("net-pf-%d-proto-%d-type-%d",
    	                           PF_INET, protocol, sock->type);
    	            /*
    	             * Fall back to generic, e.g. net-pf-2-proto-132
    	             * (net-pf-PF_INET-proto-IPPROTO_SCTP)
    	             */
    	            else
    	                request_module("net-pf-%d-proto-%d",
    	                           PF_INET, protocol);
    	            goto lookup_protocol;
    	        } else
    	            goto out_rcu_unlock;
    	    }

    	    err = -EPERM;
    		// Permission check: a non-kernel caller needs CAP_NET_RAW to create a raw socket.
    	    if (sock->type == SOCK_RAW && !kern &&
    	        !ns_capable(net->user_ns, CAP_NET_RAW))
    		        goto out_rcu_unlock;

    	    // Attach the protocol's operation table to the socket and copy out
    	    // what is needed from the entry before leaving the RCU read section.
    	    sock->ops = answer->ops;
    	    answer_prot = answer->prot;
    	    answer_flags = answer->flags;
    	    rcu_read_unlock();

    	    WARN_ON(!answer_prot->slab);

    	    err = -ENOBUFS;
    		/*
    		 * Allocate the struct sock for this socket.
    		 * NOTE(review): sk_alloc() itself is not shown here. According to the
    		 * original notes it records the proto, allocates from the proto's
    		 * slab cache (or a generic cache), initialises the sock lock and
    		 * stores the network namespace -- verify against net/core/sock.c.
    		 */
    	    sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
    	    if (!sk)
    	        goto out;

    	    err = 0;
    	    if (INET_PROTOSW_REUSE & answer_flags)
    	        sk->sk_reuse = SK_CAN_REUSE;

    		// View sk as a struct inet_sock.
    	    inet = inet_sk(sk);
    		// Record whether the entry carries the INET_PROTOSW_ICSK flag.
    	    inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

    	    inet->nodefrag = 0;

    		// Raw sockets: inet_num takes the protocol number, and IPPROTO_RAW sets hdrincl
    		// (presumably meaning the caller supplies the IP header -- confirm).
    	    if (SOCK_RAW == sock->type) {
    	        inet->inet_num = protocol;
    	        if (IPPROTO_RAW == protocol)
    	            inet->hdrincl = 1;
    	    }
    	    // Pick the path-MTU-discovery mode from the per-namespace sysctl.
    	    if (net->ipv4.sysctl_ip_no_pmtu_disc)
    	        inet->pmtudisc = IP_PMTUDISC_DONT;
    	    else
    	        inet->pmtudisc = IP_PMTUDISC_WANT;

    	    inet->inet_id = 0;

    	    // Further initialise sk (struct sock).
    		// NOTE(review): sock_init_data() is not shown here. According to the original
    		// notes it sets up the receive/send/error queues, the timer and the default
    		// callback pointers -- confirm against net/core/sock.c.
    	    sock_init_data(sock, sk);
    	    sk->sk_destruct    = inet_sock_destruct;
    	    sk->sk_protocol    = protocol;
    	    sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

    	    inet->uc_ttl    = -1;
    	    inet->mc_loop   = 1;
    	    inet->mc_ttl    = 1;
    	    inet->mc_all    = 1;
    	    inet->mc_index  = 0;
    	    inet->mc_list   = NULL;
    	    inet->rcv_tos   = 0;

    	    sk_refcnt_debug_inc(sk);

    	    if (inet->inet_num) {
    	        /* It assumes that any protocol which allows
    	         * the user to assign a number at socket
    	         * creation time automatically
    	         * shares.
    	         */
    	        inet->inet_sport = htons(inet->inet_num);
    	        /* Add to protocol hash chains. */
    	        sk->sk_prot->hash(sk);
    	    }

    		// Call the protocol's own init hook, if it has one, and release sk on failure.
    		// The surrounding article follows tcp_v4_init_sock as the example.
    	    if (sk->sk_prot->init) {
    	        err = sk->sk_prot->init(sk);
    	        if (err)
    	            sk_common_release(sk);
    	    }
    	out:
    	    return err;
    	out_rcu_unlock:
    	    rcu_read_unlock();
    	    goto out;
    	}
    
    • tcp_v4_init_sock 函数

    	/*
    	 * tcp_v4_init_sock - init hook for an IPv4 TCP socket.
    	 * @sk: socket to initialise
    	 *
    	 * Runs the common TCP initialisation and then installs the IPv4-specific
    	 * operation tables. Always returns 0.
    	 */
    	static int tcp_v4_init_sock(struct sock *sk)
    	{
    	    struct inet_connection_sock *conn;

    	    /* Common TCP setup; the IPv4-only parts follow below. */
    	    tcp_init_sock(sk);

    	    /* Install the IPv4 address-family operations. */
    	    conn = inet_csk(sk);
    	    conn->icsk_af_ops = &ipv4_specific;

    	#ifdef CONFIG_TCP_MD5SIG
    	    /* Only set when TCP MD5 signature support is configured. */
    	    tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
    	#endif

    	    return 0;
    	}
    
    • 到此, sock_create 分析完毕

    • 最后回到 SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)

    	// net/socket.c  
    	// 刚才分析完毕  
    	retval = sock_create(family, type, protocol, &sock);                        
        	if (retval < 0)                                                             
            	goto out;
    	// socket 映射到文件系统
    	retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));               
        	if (retval < 0)                                                             
        	    goto out_release;                                          
    
    	// net/socket.c
    	/*
    	 * sock_map_fd - attach a socket to a freshly reserved file descriptor.
    	 * @sock:  socket to expose through a file
    	 * @flags: passed to both get_unused_fd_flags() and sock_alloc_file()
    	 *
    	 * Returns the new descriptor on success. On failure returns a negative
    	 * value: either the result of get_unused_fd_flags(), or PTR_ERR() of the
    	 * error pointer returned by sock_alloc_file().
    	 */
    	static int sock_map_fd(struct socket *sock, int flags)
    	{
    	    struct file *filp;
    	    int fd;

    	    fd = get_unused_fd_flags(flags);
    	    if (unlikely(fd < 0))
    	        return fd;

    	    /* Create the file object that will back the socket. */
    	    filp = sock_alloc_file(sock, flags, NULL);
    	    if (unlikely(IS_ERR(filp))) {
    	        /* Nothing to install: hand the reserved descriptor back. */
    	        put_unused_fd(fd);
    	        return PTR_ERR(filp);
    	    }

    	    fd_install(fd, filp);
    	    return fd;
    	}
    	// 这里所展现的意思是,把socket当成一个文件节点进行操作,open, read,write ,ioctl 等                                                                 
    
  • 相关阅读:
    安装Django、Nginx和uWSGI
    创建Orcale数据库链接访问外部数据库
    ER图,以及转化成关系模式
    eclipse中的Java项目导出成为一个可以直接双击运行的jar文件
    电脑添加新的字体
    JDBC 的编程步骤
    转转基础服务性能压测实战
    公司起诉CTO拖延研发进度,索赔90万
    详解MQ消息队列及四大主流MQ的优缺点
    晒一波程序员的杯子,逼格超级高
  • 原文地址:https://www.cnblogs.com/chenfulin5/p/6927040.html
Copyright © 2011-2022 走看看