zoukankan      html  css  js  c++  java
  • Socket内核调用数SYSCALL_DEFINE3

    http://blog.chinaunix.net/uid-20788636-id-4408261.html

    前言:

           本Socket系列文章都是基于Linux-3.14.5版本的内核进行分析的,文中的注释和问题说明也参考了网络上的经典分析文章,在此对这些作者的贡献表示感谢!
         转载请标明:http://blog.chinaunix.net/uid-20788636-id-4408261.html

    Socket的创建是在用户空间调用socket系统函数完成的,创建一个Socket返回一个文件描述符fd,内核的系统调用接口为SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol),在net/socket.c文件中,下面我们看一下内核中的源码实现。

    /*
     * socket(2) system call entry point (net/socket.c).
     *
     * Splits the caller-supplied "type" into the socket type proper and the
     * SOCK_CLOEXEC / SOCK_NONBLOCK flag bits, creates the socket with
     * sock_create() and then maps it to a file descriptor with sock_map_fd().
     *
     * Returns the value produced by sock_map_fd() on success, or a negative
     * errno: -EINVAL for unknown flag bits, otherwise whatever sock_create()
     * or sock_map_fd() reported.
     */
    SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
    {
        struct socket *sock;
        int flags;
        int retval;

        /* Compile-time consistency checks on the SOCK_* constants. */
        BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
        BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
        BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
        BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

        /* Everything outside SOCK_TYPE_MASK is a flag; reject unknown ones. */
        flags = type & ~SOCK_TYPE_MASK;
        if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
            return -EINVAL;
        type &= SOCK_TYPE_MASK;

        /* Where the two constants differ, translate SOCK_NONBLOCK to O_NONBLOCK. */
        if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
            flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

        /* Create the socket object itself. */
        retval = sock_create(family, type, protocol, &sock);
        if (retval < 0)
            return retval;

        /* Attach the socket to a file descriptor; undo the creation on failure. */
        retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
        if (retval < 0)
            sock_release(sock);

        /* It may be already another descriptor 8) Not kernel problem. */
        return retval;
    }

    1.1  sock_create函数

    sock_create(family, type, protocol, &sock)只是一个包装函数,它实际调用的是:

    __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);

    对于__sock_create函数的定义如下:

    int __sock_create(struct net *net, int family, int type, int protocol,

                                 struct socket **res, int kern)

    {

             int err;

             struct socket *sock;

             const struct net_proto_family *pf;

             /*

              *      Check protocol is in range 检查协议的范围,现在内核定义的最大范围为41,这里的family指的是AF_INET6,AF_INET协议簇

           #define NPROTO                  AF_MAX  

    #define AF_MAX           41     /* For now.. */

              */

             if (family < 0 || family >= NPROTO)

                       return -EAFNOSUPPORT;

             if (type < 0 || type >= SOCK_MAX)//这里的type是socket的类型例如SOCK_STREAM

                       return -EINVAL;

             /* Compatibility.

                This uglymoron is moved from INET layer to here to avoid

                deadlock in module load.

              */

             if (family == PF_INET && type == SOCK_PACKET) {//如果是该类型的socket,对family进行重新的赋值

                       static int warned;//这里自动初始化为0,

                       if (!warned) {

                                warned = 1;

                                printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET) ",

                                       current->comm);

                       }

                       family = PF_PACKET;//赋值为PF_PACKET

             }

             err = security_socket_create(family, type, protocol, kern);

             if (err)

                       return err;

             /*

              *     Allocate the socket and allow the family to set things up. if

              *     the protocol is 0, the family is instructed to select an appropriate

              *     default.这里调用sock_alloc分配sock,见下面的分析

              */

             sock = sock_alloc();

             if (!sock) {

                       net_warn_ratelimited("socket: no more sockets ");

                       return -ENFILE;         /* Not exactly a match, but its the

                                            closest posix thing */

             }

             sock->type = type;

    #ifdef CONFIG_MODULES

             /* Attempt to load a protocol module if the find failed.

              *

              * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user

              * requested real, full-featured networking support upon configuration.

              * Otherwise module support will break!

              */

             if (rcu_access_pointer(net_families[family]) == NULL)

                       request_module("net-pf-%d", family);

    #endif

             rcu_read_lock();

             pf = rcu_dereference(net_families[family]);

             err = -EAFNOSUPPORT;

             if (!pf)

                       goto out_release;

             /*

              * We will call the ->create function, that possibly is in a loadable

              * module, so we have to bump that loadable module refcnt first.

              */

             if (!try_module_get(pf->owner))

                       goto out_release;

             /* Now protected by module ref count */

             rcu_read_unlock();

    /*static const struct net_proto_family inet_family_ops = {

             .family = PF_INET,

             .create = inet_create,

             .owner     = THIS_MODULE,

    };这里根据注册的family类型,调用不同的create函数,这里就是调用inet_ctreate*/

             err = pf->create(net, sock, protocol, kern);

             if (err < 0)

                       goto out_module_put;

             /*

              * Now to bump the refcnt of the [loadable] module that owns this

              * socket at sock_release time we decrement its refcnt.

              */

             if (!try_module_get(sock->ops->owner))

                       goto out_module_busy;

             /*

              * Now that we're done with the ->create function, the [loadable]

              * module can have its refcnt decremented

              */

             module_put(pf->owner);

             err = security_socket_post_create(sock, family, type, protocol, kern);

             if (err)

                       goto out_sock_release;

             *res = sock;

             return 0;

    out_module_busy:

             err = -EAFNOSUPPORT;

    out_module_put:

             sock->ops = NULL;

             module_put(pf->owner);

    out_sock_release:

             sock_release(sock);

             return err;

    out_release:

             rcu_read_unlock();

             goto out_sock_release;

    }

    1.1.1   sock_alloc函数

    sock_alloc函数用于分配一个socket结构体,这里涉及inode结构,以及分配完成后返回的地址指针。

    static struct socket *sock_alloc(void)

    {

             struct inode *inode;

             struct socket *sock;

       /*下面的new_inode_pseudo函数是分配一个新的inode结构体,但在实际分配过程中,分配了一个socket_alloc结构体,返回d的是inode地址,struct socket_alloc {

             struct socket socket;

             struct inode vfs_inode;

    };

    */

             inode = new_inode_pseudo(sock_mnt->mnt_sb);//sock_mnt哪里进行初始的,请看下面的分析-----(1)

             if (!inode)

                       return NULL;

             sock = SOCKET_I(inode);//该宏根据返回的inode获取到分配的socket_alloc指针

             kmemcheck_annotate_bitfield(sock, type);

       /*下面是对inode变量进行初始化操作,*/

             inode->i_ino = get_next_ino();

             inode->i_mode = S_IFSOCK | S_IRWXUGO;

             inode->i_uid = current_fsuid();//用户ID,在后面调用bind系统调用时会进行对比

             inode->i_gid = current_fsgid();//组ID

             inode->i_op = &sockfs_inode_ops;

             this_cpu_add(sockets_in_use, 1);

             return sock;

    }

    (1)对于sock_mnt->mnt_sb的赋值和分配过程如下:

    在sock_init函数中对socket类型的文件系统进行注册

    /*
     * File system type descriptor for the socket pseudo file system "sockfs".
     * sock_init() registers it with register_filesystem() and mounts it with
     * kern_mount(); the resulting mount is stored in sock_mnt.
     */
    static struct file_system_type sock_fs_type = {

             .name =             "sockfs",

             .mount =  sockfs_mount,

             .kill_sb =  kill_anon_super,

    };

    /*
     * Boot-time initialisation of the socket layer.
     *
     * Sets up the network sysctl infrastructure, the skbuff caches and the
     * socket inode cache, then registers and mounts the sockfs file system
     * (the mount is kept in sock_mnt). Optional netfilter and PHY
     * timestamping initialisation follow when configured.
     *
     * Returns 0 on success or the error reported by the failing step.
     */
    static int __init sock_init(void)
    {
        int err;

        /* Initialize the network sysctl infrastructure. */
        err = net_sysctl_init();
        if (err)
            return err;

        /* Initialize skbuff SLAB cache */
        skb_init();

        /*
         * Initialize the protocols module.
         *
         * NOTE(review): init_inodecache() returns an int (-ENOMEM on
         * failure) but its result is not checked here.
         */
        init_inodecache();

        /* Register the sockfs file system type... */
        err = register_filesystem(&sock_fs_type);
        if (err)
            return err;

        /* ...and mount it; on failure undo the registration. */
        sock_mnt = kern_mount(&sock_fs_type);
        if (IS_ERR(sock_mnt)) {
            err = PTR_ERR(sock_mnt);
            unregister_filesystem(&sock_fs_type);
            return err;
        }

        /* The real protocol initialization is performed in later initcalls. */

    #ifdef CONFIG_NETFILTER
        err = netfilter_init();
        if (err)
            return err;
    #endif

    #ifdef CONFIG_NETWORK_PHY_TIMESTAMPING
        skb_timestamping_init();
    #endif

        return err;
    }

    (2)new_inode_pseudo函数创建一个inode,并初始化inode的i_state变量和inode->i_sb_list链表,实际的分配函数为alloc_inode函数

    struct inode *new_inode_pseudo(struct super_block *sb)

    {

             struct inode *inode = alloc_inode(sb);

             if (inode) {

                       spin_lock(&inode->i_lock);

                       inode->i_state = 0;

                       spin_unlock(&inode->i_lock);

                       INIT_LIST_HEAD(&inode->i_sb_list);

             }

             return inode;

    }

             alloc_inode分配一个inode节点,

    static struct inode *alloc_inode(struct super_block *sb)

    {

             struct inode *inode;

             if (sb->s_op->alloc_inode)

    /*如果当前文件系统的超级块,有自己的分配inode的函数,则调用它自己的分配函数,否则从公用的高速缓存中分配一个inode.对于sokcet来说,在socket.c文件中,调用的函数为sock_alloc_inode

    static const struct super_operations sockfs_ops = {

             .alloc_inode     = sock_alloc_inode,

             .destroy_inode         = sock_destroy_inode,

             .statfs                = simple_statfs,

    };

    */

                       inode = sb->s_op->alloc_inode(sb);

             else

                       inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);

             if (!inode)

                       return NULL;

    /*对inode结构进行初始化*/

             if (unlikely(inode_init_always(sb, inode))) {

                       if (inode->i_sb->s_op->destroy_inode)

                                inode->i_sb->s_op->destroy_inode(inode);

                       else

                                kmem_cache_free(inode_cachep, inode);

                       return NULL;

             }

             return inode;

    }

             (3) 下面是sock_alloc_inode函数,在socket.c文件中

    static struct inode *sock_alloc_inode(struct super_block *sb)

    {

             struct socket_alloc *ei;

             struct socket_wq *wq;

       /*下面的函数分配struct socket_alloc结构体,这里是怎么分配的呢?参考下面的说明 */

             ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);

             if (!ei)

                       return NULL;

             wq = kmalloc(sizeof(*wq), GFP_KERNEL);

             if (!wq) {

                       kmem_cache_free(sock_inode_cachep, ei);

                       return NULL;

             }

             init_waitqueue_head(&wq->wait);

             wq->fasync_list = NULL;

             RCU_INIT_POINTER(ei->socket.wq, wq);

             ei->socket.state = SS_UNCONNECTED;

             ei->socket.flags = 0;

             ei->socket.ops = NULL;

             ei->socket.sk = NULL;

             ei->socket.file = NULL;

             return &ei->vfs_inode; //这里返回的是struct inode vfs_inode;

    }

      备注说明:在分配函数sock_alloc_inode中调用了ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);这里分配的大小为socket_alloc结构体的大小,下面分析一下这个大小是如何确定的。

    在init_inodecache函数中(net/socket.c)创建了高速缓存sock_inode_cachep,其对象大小为sizeof(struct socket_alloc);而sock_alloc_inode返回的是socket_alloc结构体中struct inode vfs_inode成员的地址。init_inodecache函数在sock_init函数中被调用。

    static int init_inodecache(void)

    {

             sock_inode_cachep = kmem_cache_create("sock_inode_cache",

                                                         sizeof(struct socket_alloc),

                                                         0,

                                                         (SLAB_HWCACHE_ALIGN |

                                                          SLAB_RECLAIM_ACCOUNT |

                                                          SLAB_MEM_SPREAD),

                                                         init_once);

             if (sock_inode_cachep == NULL)

                       return -ENOMEM;

             return 0;

    }

    1.1.2   inet_create函数

             在__sock_create函数中调用pf->create,对于PF_INET协议族,这个函数指针指向inet_create,定义在net/ipv4/af_inet.c文件中。

    static int inet_create(struct net *net, struct socket *sock, int protocol,

                              int kern)

    {

             struct sock *sk;

             struct inet_protosw *answer;

             struct inet_sock *inet;

             struct proto *answer_prot;

             unsigned char answer_flags;

             char answer_no_check;

             int try_loading_module = 0;

             int err;

             sock->state = SS_UNCONNECTED;

             /* Look for the requested type/protocol pair. */

    lookup_protocol:

             err = -ESOCKTNOSUPPORT;

             rcu_read_lock();

    /*  从inetsw中根据类型、协议查找相应的socket interface也就是 inet_protosw */

             list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {

                       err = 0;

                       /* Check the non-wild match. */

                       if (protocol == answer->protocol) {

                                if (protocol != IPPROTO_IP)

                                         break;

                       } else {

                                /* Check for the two wild cases. */

                                if (IPPROTO_IP == protocol) {

                                         protocol = answer->protocol;

                                         break;

                                }

                                if (IPPROTO_IP == answer->protocol)

                                         break;

                       }

                       err = -EPROTONOSUPPORT;

             }

    /*如果没有找到,尝试加载模块*/

             if (unlikely(err)) {

                       if (try_loading_module < 2) {

                                rcu_read_unlock();

                                /*

                                 * Be more specific, e.g. net-pf-2-proto-132-type-1

                                 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)

                                 */

                                if (++try_loading_module == 1)

                                         request_module("net-pf-%d-proto-%d-type-%d",

                                                          PF_INET, protocol, sock->type);

                                /*

                                 * Fall back to generic, e.g. net-pf-2-proto-132

                                 * (net-pf-PF_INET-proto-IPPROTO_SCTP)

                                 */

                                else

                                         request_module("net-pf-%d-proto-%d",

                                                          PF_INET, protocol);

                                goto lookup_protocol;

                       } else

                                goto out_rcu_unlock;

             }

             err = -EPERM;

             if (sock->type == SOCK_RAW && !kern &&

                 !ns_capable(net->user_ns, CAP_NET_RAW))

                       goto out_rcu_unlock;

             sock->ops = answer->ops;

             answer_prot = answer->prot;

             answer_no_check = answer->no_check;

             answer_flags = answer->flags;

             rcu_read_unlock();

             WARN_ON(answer_prot->slab == NULL);

    /* sk_alloc表面上是生成一个sock的结构体,但是实际上对于tcp来说是一个tcp_sock的大小的结构体,这样就可以使用inet_sk(sk);进行强制的类型转换,具体是怎么分配的是tcp_sock大小的,在后续进行分析*/

             err = -ENOBUFS;

             sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);

             if (sk == NULL)

                       goto out;

             err = 0;

             sk->sk_no_check = answer_no_check;

             if (INET_PROTOSW_REUSE & answer_flags)

                       sk->sk_reuse = SK_CAN_REUSE;

             inet = inet_sk(sk);

             inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

             inet->nodefrag = 0;

             if (SOCK_RAW == sock->type) {

                       inet->inet_num = protocol;

                       if (IPPROTO_RAW == protocol)

                                inet->hdrincl = 1;

             }

             if (net->ipv4.sysctl_ip_no_pmtu_disc)

                       inet->pmtudisc = IP_PMTUDISC_DONT;

             else

                       inet->pmtudisc = IP_PMTUDISC_WANT;

             inet->inet_id = 0;

        /*对sk结构体中的变量进行初始化操作,*/

             sock_init_data(sock, sk);------------------(1)

             sk->sk_destruct          = inet_sock_destruct;

             sk->sk_protocol           = protocol;

             sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

             inet->uc_ttl      = -1;

             inet->mc_loop = 1;

             inet->mc_ttl     = 1;

             inet->mc_all     = 1;

             inet->mc_index        = 0;

             inet->mc_list   = NULL;

             inet->rcv_tos   = 0;

             sk_refcnt_debug_inc(sk);

             if (inet->inet_num) {

                       /* It assumes that any protocol which allows

                        * the user to assign a number at socket

                        * creation time automatically

                        * shares.

                        */

                       inet->inet_sport = htons(inet->inet_num);

                       /* Add to protocol hash chains. */

                       sk->sk_prot->hash(sk);

             }

             if (sk->sk_prot->init) {

                       err = sk->sk_prot->init(sk);//如果是tcp的话,这里就是tcp_v4_init_sock--------(2)

                       if (err)

                                sk_common_release(sk);

             }

    out:

             return err;

    out_rcu_unlock:

             rcu_read_unlock();

             goto out;

    }

    (1)sock_init_data函数分析

    /*
     * sock_init_data - set the generic fields of a freshly allocated sock
     * @sock: owning socket, may be NULL (then sk gets no wait queue)
     * @sk:   sock to initialise
     *
     * Initialises the queues, timer, buffer sizes, default callbacks and
     * timeouts of @sk, links @sk and @sock to each other when @sock is given,
     * and finally sets the reference count to 1 after a write barrier.
     */
    void sock_init_data(struct socket *sock, struct sock *sk)

    {

             skb_queue_head_init(&sk->sk_receive_queue);

             skb_queue_head_init(&sk->sk_write_queue);

             skb_queue_head_init(&sk->sk_error_queue);

    #ifdef CONFIG_NET_DMA

             skb_queue_head_init(&sk->sk_async_wait_queue);

    #endif

             sk->sk_send_head   =       NULL;

       /* Initialise the sock's timer. */

             init_timer(&sk->sk_timer);

             sk->sk_allocation     =       GFP_KERNEL;

             sk->sk_rcvbuf            =       sysctl_rmem_default;

             sk->sk_sndbuf           =       sysctl_wmem_default;

             sk->sk_state             =       TCP_CLOSE; // initial state is TCP_CLOSE; NOTE(review): presumably checked by later system calls - confirm

             sk_set_socket(sk, sock);// presumably sk->sk_socket = sock: make sk point back at its socket - confirm against sk_set_socket()

             sock_set_flag(sk, SOCK_ZAPPED);// set the SOCK_ZAPPED flag on sk; NOTE(review): its meaning is not evident from this code - confirm against include/net/sock.h

             if (sock) {

                       sk->sk_type      =       sock->type;

                       sk->sk_wq        =       sock->wq;

                       sock->sk   =       sk; // the socket's sk member now points at this sock

             } else

                       sk->sk_wq        =       NULL;

             spin_lock_init(&sk->sk_dst_lock);

             rwlock_init(&sk->sk_callback_lock);

             lockdep_set_class_and_name(&sk->sk_callback_lock,

                                af_callback_keys + sk->sk_family,

                                af_family_clock_key_strings[sk->sk_family]);

             /* Default callbacks; inet_create() later replaces sk_destruct. */
             sk->sk_state_change       =       sock_def_wakeup;

             sk->sk_data_ready  =       sock_def_readable;

             sk->sk_write_space         =       sock_def_write_space;

             sk->sk_error_report         =       sock_def_error_report;

             sk->sk_destruct                 =       sock_def_destruct;

             sk->sk_frag.page     =       NULL;

             sk->sk_frag.offset   =       0;

             sk->sk_peek_off                =       -1;

             sk->sk_peer_pid     =       NULL;

             sk->sk_peer_cred    =       NULL;

             sk->sk_write_pending     =       0;

             sk->sk_rcvlowat                =       1;

             sk->sk_rcvtimeo                =       MAX_SCHEDULE_TIMEOUT;

             sk->sk_sndtimeo               =       MAX_SCHEDULE_TIMEOUT;

             sk->sk_stamp = ktime_set(-1L, 0);

    #ifdef CONFIG_NET_RX_BUSY_POLL

             sk->sk_napi_id                   =       0;

             sk->sk_ll_usec          =       sysctl_net_busy_read;

    #endif

             sk->sk_max_pacing_rate = ~0U;

             sk->sk_pacing_rate = ~0U;

             /*

              * Before updating sk_refcnt, we must commit prior changes to memory

              * (Documentation/RCU/rculist_nulls.txt for details)

              */

             smp_wmb();

             atomic_set(&sk->sk_refcnt, 1);// set (not increment) sk's reference count to 1

             atomic_set(&sk->sk_drops, 0);

    }

    (2)static int tcp_v4_init_sock(struct sock *sk)

    {

             struct inet_connection_sock *icsk = inet_csk(sk);//基于上面的原因分析,其实这里可以进行强制的类型转换

             tcp_init_sock(sk);//进行tcp相关变量的初始化工作

             icsk->icsk_af_ops = &ipv4_specific;

    #ifdef CONFIG_TCP_MD5SIG

             tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;

    #endif

             return 0;

    }

  • 相关阅读:
    Construction构造函数
    映射验证
    映射设置
    条件映射
    映射前和映射后的操作
    AutoMapper 5.0-升级指南
    Bootstrap Tree View
    MiniProfiler使用笔记
    关于添加数据自定义编号格式问题
    【Postgresql】数据库函数
  • 原文地址:https://www.cnblogs.com/feng9exe/p/7001288.html
Copyright © 2011-2022 走看看