zoukankan      html  css  js  c++  java
  • Linux TCP/IP 协议栈之 Socket 的实现分析(一)

    内核版本:2.6.37
    参考[作者:kendo的文章(基于内核版本2.6.12)]

    第一部份 Socket套接字的创建

    socket 并不是 TCP/IP协议的一部份。
    从广义上来讲,socket 是Unix/Linux 抽象的进程间通讯的一种方法。网络 socket 通讯仅仅是其若干协议中的一类。而tcp/ip 又是网络这类中的一种。
    从tcp/ip 的角度看 socket ,它更多地体现了用户 API 与协议栈之间的一个中间接口层。用户通过调用socket API 将报文递交给协议栈,或者从协议栈中接收报文。

    一、系统总入口
    Linux 内核为所有的与socket 有关的操作的API,提供了一个统一的系统调用入口,其代码在net/socket.c 中:

    /*
     *    sys_socketcall - single multiplexed entry point for the socket API.
     *
     *    @call selects the operation (SYS_SOCKET ... SYS_RECVMMSG) and @args
     *    points to a user-space array of unsigned long holding the arguments
     *    of that operation; nargs[call] is the number of bytes to copy in.
     *
     *    Returns the result of the selected sys_*() handler, -EINVAL for an
     *    unknown call or an oversized argument block, or -EFAULT when the
     *    arguments cannot be copied from user space.
     *
     *    This function doesn't need to set the kernel lock because
     *    it is set by the callees.
     */

    SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
    {
        unsigned long kargs[6];
        unsigned int nbytes;
        int ret;

        /* Validate the opcode before it is used as an index into nargs[]. */
        if (call < 1 || call > SYS_RECVMMSG)
            return -EINVAL;

        nbytes = nargs[call];
        if (nbytes > sizeof(kargs))
            return -EINVAL;

        /* copy_from_user should be SMP safe. */
        if (copy_from_user(kargs, args, nbytes))
            return -EFAULT;

        audit_socketcall(nargs[call] / sizeof(unsigned long), kargs);

        /* Dispatch: each case unpacks exactly the slots its handler needs. */
        switch (call) {
        case SYS_SOCKET:
            ret = sys_socket(kargs[0], kargs[1], kargs[2]);
            break;
        case SYS_BIND:
            ret = sys_bind(kargs[0],
                           (struct sockaddr __user *)kargs[1], kargs[2]);
            break;
        case SYS_CONNECT:
            ret = sys_connect(kargs[0],
                              (struct sockaddr __user *)kargs[1], kargs[2]);
            break;
        case SYS_LISTEN:
            ret = sys_listen(kargs[0], kargs[1]);
            break;
        case SYS_ACCEPT:
            /* Plain accept() is accept4() with no flags. */
            ret = sys_accept4(kargs[0],
                              (struct sockaddr __user *)kargs[1],
                              (int __user *)kargs[2], 0);
            break;
        case SYS_GETSOCKNAME:
            ret = sys_getsockname(kargs[0],
                                  (struct sockaddr __user *)kargs[1],
                                  (int __user *)kargs[2]);
            break;
        case SYS_GETPEERNAME:
            ret = sys_getpeername(kargs[0],
                                  (struct sockaddr __user *)kargs[1],
                                  (int __user *)kargs[2]);
            break;
        case SYS_SOCKETPAIR:
            ret = sys_socketpair(kargs[0], kargs[1], kargs[2],
                                 (int __user *)kargs[3]);
            break;
        case SYS_SEND:
            ret = sys_send(kargs[0], (void __user *)kargs[1],
                           kargs[2], kargs[3]);
            break;
        case SYS_SENDTO:
            ret = sys_sendto(kargs[0], (void __user *)kargs[1],
                             kargs[2], kargs[3],
                             (struct sockaddr __user *)kargs[4], kargs[5]);
            break;
        case SYS_RECV:
            ret = sys_recv(kargs[0], (void __user *)kargs[1],
                           kargs[2], kargs[3]);
            break;
        case SYS_RECVFROM:
            ret = sys_recvfrom(kargs[0], (void __user *)kargs[1],
                               kargs[2], kargs[3],
                               (struct sockaddr __user *)kargs[4],
                               (int __user *)kargs[5]);
            break;
        case SYS_SHUTDOWN:
            ret = sys_shutdown(kargs[0], kargs[1]);
            break;
        case SYS_SETSOCKOPT:
            ret = sys_setsockopt(kargs[0], kargs[1], kargs[2],
                                 (char __user *)kargs[3], kargs[4]);
            break;
        case SYS_GETSOCKOPT:
            ret = sys_getsockopt(kargs[0], kargs[1], kargs[2],
                                 (char __user *)kargs[3],
                                 (int __user *)kargs[4]);
            break;
        case SYS_SENDMSG:
            ret = sys_sendmsg(kargs[0],
                              (struct msghdr __user *)kargs[1], kargs[2]);
            break;
        case SYS_RECVMSG:
            ret = sys_recvmsg(kargs[0],
                              (struct msghdr __user *)kargs[1], kargs[2]);
            break;
        case SYS_RECVMMSG:
            ret = sys_recvmmsg(kargs[0],
                               (struct mmsghdr __user *)kargs[1],
                               kargs[2], kargs[3],
                               (struct timespec __user *)kargs[4]);
            break;
        case SYS_ACCEPT4:
            ret = sys_accept4(kargs[0],
                              (struct sockaddr __user *)kargs[1],
                              (int __user *)kargs[2], kargs[3]);
            break;
        default:
            ret = -EINVAL;
            break;
        }
        return ret;
    }

    首先调用copy_from_user 将用户态参数拷贝至数组a 。但是问题在于,每个被调用的 API 的参数不尽相同,那么每次拷贝的字节大小如何断定?
    来看其第三个参数 len,它取自 nargs[call],其中 call 是操作码,后面有个大大的 switch...case就是判断它。对应的操作码定义在include/linux/net.h :

    /*
     * Operation codes for sys_socketcall(): the value passed as its `call`
     * argument.  Each code selects the sys_*() handler named in the comment
     * on its line; the codes also serve as indices into the nargs[] table.
     */
    #define SYS_SOCKET    1        /* sys_socket(2)        */
    #define SYS_BIND    2        /* sys_bind(2)            */
    #define SYS_CONNECT    3        /* sys_connect(2)        */
    #define SYS_LISTEN    4        /* sys_listen(2)        */
    #define SYS_ACCEPT    5        /* sys_accept(2)        */
    #define SYS_GETSOCKNAME    6        /* sys_getsockname(2)        */
    #define SYS_GETPEERNAME    7        /* sys_getpeername(2)        */
    #define SYS_SOCKETPAIR    8        /* sys_socketpair(2)        */
    #define SYS_SEND    9        /* sys_send(2)            */
    #define SYS_RECV    10        /* sys_recv(2)            */
    #define SYS_SENDTO    11        /* sys_sendto(2)        */
    #define SYS_RECVFROM    12        /* sys_recvfrom(2)        */
    #define SYS_SHUTDOWN    13        /* sys_shutdown(2)        */
    #define SYS_SETSOCKOPT    14        /* sys_setsockopt(2)        */
    #define SYS_GETSOCKOPT    15        /* sys_getsockopt(2)        */
    #define SYS_SENDMSG    16        /* sys_sendmsg(2)        */
    #define SYS_RECVMSG    17        /* sys_recvmsg(2)        */
    #define SYS_ACCEPT4    18        /* sys_accept4(2)        */
    #define SYS_RECVMMSG    19        /* sys_recvmmsg(2)        */

    而数组nargs则根据操作码的不同,计算对应的参数的空间大小:

    /*
     * Argument list sizes for sys_socketcall.
     *
     * nargs[call] is the size in bytes of the user-space argument block for
     * operation `call`: the handler's argument count times
     * sizeof(unsigned long).  Slot 0 is a placeholder, since opcodes start at 1.
     */
    #define AL(x) ((x) * sizeof(unsigned long))
    static const unsigned char nargs[20] = {
        AL(0),    /*  0: unused           */
        AL(3),    /*  1: SYS_SOCKET       */
        AL(3),    /*  2: SYS_BIND         */
        AL(3),    /*  3: SYS_CONNECT      */
        AL(2),    /*  4: SYS_LISTEN       */
        AL(3),    /*  5: SYS_ACCEPT       */
        AL(3),    /*  6: SYS_GETSOCKNAME  */
        AL(3),    /*  7: SYS_GETPEERNAME  */
        AL(4),    /*  8: SYS_SOCKETPAIR   */
        AL(4),    /*  9: SYS_SEND         */
        AL(4),    /* 10: SYS_RECV         */
        AL(6),    /* 11: SYS_SENDTO       */
        AL(6),    /* 12: SYS_RECVFROM     */
        AL(2),    /* 13: SYS_SHUTDOWN     */
        AL(5),    /* 14: SYS_SETSOCKOPT   */
        AL(5),    /* 15: SYS_GETSOCKOPT   */
        AL(3),    /* 16: SYS_SENDMSG      */
        AL(3),    /* 17: SYS_RECVMSG      */
        AL(4),    /* 18: SYS_ACCEPT4      */
        AL(5),    /* 19: SYS_RECVMMSG     */
    };

    #undef AL

    当拷贝完成参数后,就进入一个switch...case... 判断操作码,跳转至对应的系统接口。

    二、 sys_socket 函数

    当用户空间要创建一个socket 接口时,会调用 API 函数:

    int socket(int domain, int type, int protocol);

    函数,其三个参数分别表示协议族、协议类型(面向连接或无连接)以及协议
    协议族:

    /*
     * Supported address families.
     *
     * These are the values accepted as the `domain`/`family` argument of
     * socket().  Values 27 and 28 are not defined in this list.  AF_MAX is
     * one more than the highest value defined here.
     */
    #define AF_UNSPEC    0
    #define AF_UNIX        1    /* Unix domain sockets         */
    #define AF_LOCAL    1    /* POSIX name for AF_UNIX    */
    #define AF_INET        2    /* Internet IP Protocol     */
    #define AF_AX25        3    /* Amateur Radio AX.25         */
    #define AF_IPX        4    /* Novell IPX             */
    #define AF_APPLETALK    5    /* AppleTalk DDP         */
    #define AF_NETROM    6    /* Amateur Radio NET/ROM     */
    #define AF_BRIDGE    7    /* Multiprotocol bridge     */
    #define AF_ATMPVC    8    /* ATM PVCs            */
    #define AF_X25        9    /* Reserved for X.25 project     */
    #define AF_INET6    10    /* IP version 6            */
    #define AF_ROSE        11    /* Amateur Radio X.25 PLP    */
    #define AF_DECnet    12    /* Reserved for DECnet project    */
    #define AF_NETBEUI    13    /* Reserved for 802.2LLC project*/
    #define AF_SECURITY    14    /* Security callback pseudo AF */
    #define AF_KEY        15      /* PF_KEY key management API */
    #define AF_NETLINK    16
    #define AF_ROUTE    AF_NETLINK /* Alias to emulate 4.4BSD */
    #define AF_PACKET    17    /* Packet family        */
    #define AF_ASH        18    /* Ash                */
    #define AF_ECONET    19    /* Acorn Econet            */
    #define AF_ATMSVC    20    /* ATM SVCs            */
    #define AF_RDS        21    /* RDS sockets             */
    #define AF_SNA        22    /* Linux SNA Project (nutters!) */
    #define AF_IRDA        23    /* IRDA sockets            */
    #define AF_PPPOX    24    /* PPPoX sockets        */
    #define AF_WANPIPE    25    /* Wanpipe API Sockets */
    #define AF_LLC        26    /* Linux LLC            */
    #define AF_CAN        29    /* Controller Area Network      */
    #define AF_TIPC        30    /* TIPC sockets            */
    #define AF_BLUETOOTH    31    /* Bluetooth sockets         */
    #define AF_IUCV        32    /* IUCV sockets            */
    #define AF_RXRPC    33    /* RxRPC sockets         */
    #define AF_ISDN        34    /* mISDN sockets         */
    #define AF_PHONET    35    /* Phonet sockets        */
    #define AF_IEEE802154    36    /* IEEE802154 sockets        */
    #define AF_CAIF        37    /* CAIF sockets            */
    #define AF_MAX        38    /* For now.. */
    
    /*
     * Protocol families, same as address families.
     *
     * Every PF_xxx is a plain alias for the AF_xxx constant of the same
     * name, so the two spellings are interchangeable.
     */
    #define PF_UNSPEC    AF_UNSPEC
    #define PF_UNIX        AF_UNIX
    #define PF_LOCAL    AF_LOCAL
    #define PF_INET        AF_INET
    #define PF_AX25        AF_AX25
    #define PF_IPX        AF_IPX
    #define PF_APPLETALK    AF_APPLETALK
    #define PF_NETROM    AF_NETROM
    #define PF_BRIDGE    AF_BRIDGE
    #define PF_ATMPVC    AF_ATMPVC
    #define PF_X25        AF_X25
    #define PF_INET6            AF_INET6
    #define PF_ROSE        AF_ROSE
    #define PF_DECnet    AF_DECnet
    #define PF_NETBEUI    AF_NETBEUI
    #define PF_SECURITY    AF_SECURITY
    #define PF_KEY        AF_KEY
    #define PF_NETLINK    AF_NETLINK
    #define PF_ROUTE    AF_ROUTE
    #define PF_PACKET    AF_PACKET
    #define PF_ASH        AF_ASH
    #define PF_ECONET    AF_ECONET
    #define PF_ATMSVC    AF_ATMSVC
    #define PF_RDS        AF_RDS
    #define PF_SNA        AF_SNA
    #define PF_IRDA        AF_IRDA
    #define PF_PPPOX            AF_PPPOX
    #define PF_WANPIPE    AF_WANPIPE
    #define PF_LLC        AF_LLC
    #define PF_CAN        AF_CAN
    #define PF_TIPC        AF_TIPC
    #define PF_BLUETOOTH    AF_BLUETOOTH
    #define PF_IUCV        AF_IUCV
    #define PF_RXRPC    AF_RXRPC
    #define PF_ISDN        AF_ISDN
    #define PF_PHONET    AF_PHONET
    #define PF_IEEE802154    AF_IEEE802154
    #define PF_CAIF        AF_CAIF
    #define PF_MAX        AF_MAX

    协议类型:

    /*
     * Socket types: the values accepted (after masking off creation flags)
     * as the `type` argument of socket().  Values 7-9 are not assigned here.
     */
    enum sock_type {
        SOCK_STREAM    = 1,
        SOCK_DGRAM    = 2,
        SOCK_RAW    = 3,
        SOCK_RDM    = 4,
        SOCK_SEQPACKET    = 5,
        SOCK_DCCP    = 6,
        SOCK_PACKET    = 10,    /* obsolete with PF_INET; __sock_create() remaps that pair to PF_PACKET */
    };

    socket创建通过操作码SYS_SOCKET是由sys_socket() 实现的:

    /*
     * socket(2): create a socket for @family/@type/@protocol and attach it
     * to a new file descriptor.
     *
     * The bits of @type outside SOCK_TYPE_MASK may carry SOCK_CLOEXEC and/or
     * SOCK_NONBLOCK; any other flag bit yields -EINVAL.  Returns the value of
     * sock_map_fd() on success, or the negative error from sock_create() /
     * sock_map_fd() on failure.
     */
    SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
    {
        struct socket *sock;
        int flags;
        int ret;

        /* Check the SOCK_* constants for consistency.  */
        BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
        BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
        BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
        BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

        /* Split @type into the creation flags and the socket type proper. */
        flags = type & ~SOCK_TYPE_MASK;
        type &= SOCK_TYPE_MASK;
        if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
            return -EINVAL;

        /* Where SOCK_NONBLOCK and O_NONBLOCK differ, translate to the latter. */
        if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
            flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

        ret = sock_create(family, type, protocol, &sock);
        if (ret < 0)
            return ret;

        /* The socket exists now; drop it again if no descriptor can be had. */
        ret = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
        if (ret < 0)
            sock_release(sock);

        /* It may be already another descriptor 8) Not kernel problem. */
        return ret;
    }

    这段代码做了两件事:

    1>  分配 sock 与sk,协议簇的协议封装;

    2>  sock 面向上层系统调用,主要是与文件系统交互。

      通过进程的current指针的files,结合创建socket时返回的文件描述符,可以找到内核中对应的struct file,再根据file的f_dentry可以找到对应的目录项,而目录项struct dentry中,有d_inode指针,指向与sock封装在一起的inode。

      sock又与sk指针互指,一一对应。

    三、 协议簇的协议封装

    int __sock_create(struct net *net, int family, int type, int protocol,
                 struct socket **res, int kern)
    {
        int err;
        struct socket *sock;
        const struct net_proto_family *pf;
    
        /*
         *      Check protocol is in range
         */
        if (family < 0 || family >= NPROTO)
            return -EAFNOSUPPORT;
        if (type < 0 || type >= SOCK_MAX)
            return -EINVAL;
    
        /* Compatibility.
    
           This uglymoron is moved from INET layer to here to avoid
           deadlock in module load.
         */
        if (family == PF_INET && type == SOCK_PACKET) {
            static int warned;
            if (!warned) {
                warned = 1;
                printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)
    ",
                       current->comm);
            }
            family = PF_PACKET;
        }
    
        err = security_socket_create(family, type, protocol, kern);
        if (err)
            return err;
    
        /*
         *    Allocate the socket and allow the family to set things up. if
         *    the protocol is 0, the family is instructed to select an appropriate
         *    default.
         */
        sock = sock_alloc();
        if (!sock) {
            if (net_ratelimit())
                printk(KERN_WARNING "socket: no more sockets
    ");
            return -ENFILE;    /* Not exactly a match, but its the
                       closest posix thing */
        }
    
        sock->type = type;
    
    #ifdef CONFIG_MODULES
        /* Attempt to load a protocol module if the find failed.
         *
         * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
         * requested real, full-featured networking support upon configuration.
         * Otherwise module support will break!
         */
        if (net_families[family] == NULL)
            request_module("net-pf-%d", family);
    #endif
    
        rcu_read_lock();
        pf = rcu_dereference(net_families[family]);
        err = -EAFNOSUPPORT;
        if (!pf)
            goto out_release;
    
        /*
         * We will call the ->create function, that possibly is in a loadable
         * module, so we have to bump that loadable module refcnt first.
         */
        if (!try_module_get(pf->owner))
            goto out_release;
    
        /* Now protected by module ref count */
        rcu_read_unlock();
    
        err = pf->create(net, sock, protocol, kern);
        if (err < 0)
            goto out_module_put;
    
        /*
         * Now to bump the refcnt of the [loadable] module that owns this
         * socket at sock_release time we decrement its refcnt.
         */
        if (!try_module_get(sock->ops->owner))
            goto out_module_busy;
    
        /*
         * Now that we're done with the ->create function, the [loadable]
         * module can have its refcnt decremented
         */
        module_put(pf->owner);
        err = security_socket_post_create(sock, family, type, protocol, kern);
        if (err)
            goto out_sock_release;
        *res = sock;
    
        return 0;
    
    out_module_busy:
        err = -EAFNOSUPPORT;
    out_module_put:
        sock->ops = NULL;
        module_put(pf->owner);
    out_sock_release:
        sock_release(sock);
        return err;
    
    out_release:
        rcu_read_unlock();
        goto out_sock_release;
    }
    EXPORT_SYMBOL(__sock_create);

    上面这个函数主要做了三件事:

    1> sock_alloc()

    在分析这个函数前,首先要了解:为了对 socket 抽象出文件的概念,内核中为socket定义了一个专门的文件系统类型sockfs。

    /* Mount of the sockfs pseudo filesystem; assigned by kern_mount() in sock_init(). */
    static struct vfsmount *sock_mnt __read_mostly;

    /*
     * Filesystem type descriptor for "sockfs".  sockfs_mount is used as the
     * mount callback and kill_anon_super as the superblock teardown callback.
     */
    static struct file_system_type sock_fs_type = {
        .name =        "sockfs",
        .mount =    sockfs_mount,
        .kill_sb =    kill_anon_super,
    };

    在模块初始化的时候,安装该文件系统:

    /*
     * Early (core_initcall) setup of the socket layer: initialises the sock
     * and skbuff caches, the socket inode cache, then registers and mounts
     * sockfs, storing the mount in sock_mnt.  Always returns 0.
     *
     * NOTE(review): the return values of register_filesystem() and
     * kern_mount() are not checked here.
     */
    static int __init sock_init(void)
    {
        /*
         *      Initialize sock SLAB cache.
         */

        sk_init();

        /*
         *      Initialize skbuff SLAB cache
         */
        skb_init();

        /*
         *      Initialize the protocols module.
         */

        /* The inode cache is set up before sockfs is registered and mounted. */
        init_inodecache();
        register_filesystem(&sock_fs_type);
        sock_mnt = kern_mount(&sock_fs_type);

        /* The real protocol initialization is performed in later initcalls.
         */

    #ifdef CONFIG_NETFILTER
        netfilter_init();
    #endif

    #ifdef CONFIG_NETWORK_PHY_TIMESTAMPING
        skb_timestamping_init();
    #endif

        return 0;
    }

    core_initcall(sock_init);    /* early initcall */

    文件系统安装中的一个重要步骤kern_mount->kern_mount_data->vfs_kern_mount:

    vfs_kern_mount函数中,先根据注册的文件系统类型进行判断:如果文件系统本身有mount成员函数则调用之,没有则调用它的get_sb成员函数指针,获取相应的超级块sb 。最后,设置文件系统的超级块成员指针,使之指向对应的值。

    其中sockfs文件系统的mount函数调用mount_pseudo()实现超级块的初始化,以及根节点inode和目录项dentry的创建,sockfs_ops也是在这里与文件系统关联起来的。

    那前面提到的new_inode()函数分配inode 时调用的: sock_mnt->mnt_sb->s_op->alloc_inode(sock_mnt->mnt_sb);

    /*
     * Superblock operations of sockfs: inodes are allocated and destroyed
     * through the socket-specific sock_alloc_inode()/sock_destroy_inode()
     * hooks, while statfs uses the generic simple_statfs.
     */
    static const struct super_operations sockfs_ops = {
        .alloc_inode    = sock_alloc_inode,
        .destroy_inode    = sock_destroy_inode,
        .statfs        = simple_statfs,
    };

    这个alloc_inode函数指针也就是sockfs_ops中的sock_alloc_inode()函数。

    static struct inode *sock_alloc_inode(struct super_block *sb)
    {
        struct socket_alloc *ei;
    
        ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
        if (!ei)
            return NULL;
        ei->socket.wq = kmalloc(sizeof(struct socket_wq), GFP_KERNEL);
        if (!ei->socket.wq) {
            kmem_cache_free(sock_inode_cachep, ei);
            return NULL;
        }
        init_waitqueue_head(&ei->socket.wq->wait);
        ei->socket.wq->fasync_list = NULL;
    
        ei->socket.state = SS_UNCONNECTED;
        ei->socket.flags = 0;
        ei->socket.ops = NULL;
        ei->socket.sk = NULL;
        ei->socket.file = NULL;
    
        return &ei->vfs_inode;
    }

    函数先分配了一个用于封装socket和inode的ei ,然后在高速缓存中为之申请了一块空间。这样,inode和socket就同时都被分配了。接下来初始化socket的各个成员。

    /*
     * Container that keeps a socket and its sockfs inode in one allocation,
     * so that either one can be reached from the other.
     */
    struct socket_alloc {
        struct socket socket;       /* the socket itself */
        struct inode vfs_inode;     /* inode returned by sock_alloc_inode() */
    };

    显而易见,该结构实现了inode和socket的封装。已经通过new_inode从sockfs文件系统分配一个inode,可以通过宏SOCKET_I来获取与之对应的socket:

    sock = SOCKET_I(inode);

    分配inode、socket 以及两者如何关联,都已一一分析了。

    2> pf = rcu_dereference(net_families[family]);

    net_families[family]的定义:

    /* Registered protocol family handlers, indexed by family number (NULL = none). */
    static const struct net_proto_family *net_families[NPROTO] __read_mostly;

    net_proto_family的定义:

    /*
     * Description of one protocol family, as registered with sock_register()
     * and stored in net_families[family].
     */
    struct net_proto_family {
        int        family;      /* family number; index into net_families[] */
        /* Called by __sock_create() to set up a newly allocated socket. */
        int        (*create)(struct net *net, struct socket *sock,
                      int protocol, int kern);
        struct module    *owner;    /* module pinned around the ->create() call */
    };

    net_families数组填充函数sock_register():

    /**
     *    sock_register - add a socket protocol handler
     *    @ops: description of protocol
     *
     *    This function is called by a protocol handler that wants to
     *    advertise its address family, and have it linked into the
     *    socket interface. The value ops->family coresponds to the
     *    socket system call protocol family.
     */
    int sock_register(const struct net_proto_family *ops)
    {
        int err;
    
        if (ops->family >= NPROTO) {
            printk(KERN_CRIT "protocol %d >= NPROTO(%d)
    ", ops->family,
                   NPROTO);
            return -ENOBUFS;
        }
    
        spin_lock(&net_family_lock);
        if (net_families[ops->family])
            err = -EEXIST;
        else {
            net_families[ops->family] = ops;
            err = 0;
        }
        spin_unlock(&net_family_lock);
    
        printk(KERN_INFO "NET: Registered protocol family %d
    ", ops->family);
        return err;
    }
    EXPORT_SYMBOL(sock_register);

    从这里我们看出每个协议族都是通过sock_register函数注册到net_families数组中,通过代码搜索发现每个协议族都会调用这个函数去注册。

    Af_ax25.c (net/ax25):    sock_register(&ax25_family_ops);
    Af_bluetooth.c (net/bluetooth):    err = sock_register(&bt_sock_family_ops);
    Af_can.c (net/can):    sock_register(&can_family_ops);
    Af_decnet.c (net/decnet):    sock_register(&dn_family_ops);
    Af_econet.c (net/econet):    sock_register(&econet_family_ops);
    Af_ieee802154.c (net/ieee802154):    rc = sock_register(&ieee802154_family_ops);
    Af_inet.c (net/ipv4):    (void)sock_register(&inet_family_ops);
    Af_inet6.c (net/ipv6):    err = sock_register(&inet6_family_ops);
    Af_ipx.c (net/ipx):    sock_register(&ipx_family_ops);
    Af_irda.c (net/irda):        rc = sock_register(&irda_family_ops);
    Af_iucv.c (net/iucv):    err = sock_register(&iucv_sock_family_ops);
    Af_key.c (net/key):    err = sock_register(&pfkey_family_ops);
    Af_llc.c (net/llc):    rc = sock_register(&llc_ui_family_ops);
    Af_netlink.c (net/netlink):    sock_register(&netlink_family_ops);
    Af_netrom.c (net/netrom):    if (sock_register(&nr_family_ops)) {
    Af_packet.c (net/packet):    sock_register(&packet_family_ops);
    Af_phonet.c (net/phonet):    err = sock_register(&phonet_proto_family);
    Af_rds.c (net/rds):    ret = sock_register(&rds_family_ops);
    Af_rose.c (net/rose):    sock_register(&rose_family_ops);
    Af_rxrpc.c (net/rxrpc):    ret = sock_register(&rxrpc_family_ops);
    Af_unix.c (net/unix):    sock_register(&unix_family_ops);
    Af_x25.c (net/x25):    rc = sock_register(&x25_family_ops);
    Caif_socket.c (net/caif):    int err = sock_register(&caif_family_ops);
    Ddp.c (net/appletalk):    (void)sock_register(&atalk_family_ops);
    Net.h (include/linux):extern int         sock_register(const struct net_proto_family *fam);
    Pppox.c (drivers/net):    return sock_register(&pppox_proto_family);
    Pvc.c (net/atm):    return sock_register(&pvc_family_ops);
    Socket.c (drivers/isdn/misdn):    err = sock_register(&mISDN_sock_family_ops);
    Socket.c (net): *    sock_register - add a socket protocol handler
    Socket.c (net):int sock_register(const struct net_proto_family *ops)
    Socket.c (net):EXPORT_SYMBOL(sock_register);
    Socket.c (net/tipc):    res = sock_register(&tipc_family_ops);
    Svc.c (net/atm):    return sock_register(&svc_family_ops);
    本文主要分析的ipv4协议族,所以我们参考的文件af_inet.c(net/ipv4)。

    3> err = pf->create(net, sock, protocol, kern);

    在af_inet.c里面inet_init函数里面调用sock_register注册到协议族数组net_families里:

    (void)sock_register(&inet_family_ops);

    接着看inet_family_ops定义:

    /*
     * Protocol family descriptor for PF_INET; sockets of this family are set
     * up by inet_create().
     */
    static const struct net_proto_family inet_family_ops = {
        .family = PF_INET,
        .create = inet_create,
        .owner    = THIS_MODULE,
    };

    这里的inet_create就是程序调用的函数:

    /*
     *    Create an inet socket.
     *
     *    Looks up the (sock->type, protocol) pair in the inetsw[] lists,
     *    installs the matching ops on @sock, allocates the struct sock with
     *    the matching proto and initialises its inet-specific fields.
     *
     *    @net:      network namespace, passed to inet_netns_ok() and sk_alloc()
     *    @sock:     socket being set up; sock->type selects the inetsw[] list
     *    @protocol: requested protocol; IPPROTO_IP acts as a wildcard
     *    @kern:     when non-zero, the CAP_NET_RAW check for SOCK_RAW is skipped
     *
     *    Returns 0 on success or a negative errno: -ESOCKTNOSUPPORT if the
     *    list for sock->type is empty, -EPROTONOSUPPORT if no entry matches,
     *    -EPERM for a raw socket without CAP_NET_RAW, -EAFNOSUPPORT if
     *    inet_netns_ok() refuses, -ENOBUFS if sk_alloc() fails, or the error
     *    returned by the protocol's ->init().
     */

    static int inet_create(struct net *net, struct socket *sock, int protocol,
                   int kern)
    {
        struct sock *sk;
        struct inet_protosw *answer;
        struct inet_sock *inet;
        struct proto *answer_prot;
        unsigned char answer_flags;
        char answer_no_check;
        int try_loading_module = 0;
        int err;

        /* Build the ehash secret on first use, except for raw/datagram sockets. */
        if (unlikely(!inet_ehash_secret))
            if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
                build_ehash_secret();

        sock->state = SS_UNCONNECTED;

        /* Look for the requested type/protocol pair. */
    lookup_protocol:
        /* err keeps this value only if the list for sock->type is empty. */
        err = -ESOCKTNOSUPPORT;
        rcu_read_lock();
        list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {

            /* Assume a match; reset below if this entry does not fit. */
            err = 0;
            /* Check the non-wild match. */
            if (protocol == answer->protocol) {
                if (protocol != IPPROTO_IP)
                    break;
            } else {
                /* Check for the two wild cases. */
                if (IPPROTO_IP == protocol) {
                    /* Caller asked for "any": adopt this entry's protocol. */
                    protocol = answer->protocol;
                    break;
                }
                if (IPPROTO_IP == answer->protocol)
                    break;
            }
            err = -EPROTONOSUPPORT;
        }

        if (unlikely(err)) {
            /* No match: try loading a module (at most twice), then retry. */
            if (try_loading_module < 2) {
                rcu_read_unlock();
                /*
                 * Be more specific, e.g. net-pf-2-proto-132-type-1
                 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
                 */
                if (++try_loading_module == 1)
                    request_module("net-pf-%d-proto-%d-type-%d",
                               PF_INET, protocol, sock->type);
                /*
                 * Fall back to generic, e.g. net-pf-2-proto-132
                 * (net-pf-PF_INET-proto-IPPROTO_SCTP)
                 */
                else
                    request_module("net-pf-%d-proto-%d",
                               PF_INET, protocol);
                goto lookup_protocol;
            } else
                goto out_rcu_unlock;
        }

        /* Raw sockets need CAP_NET_RAW unless created from within the kernel. */
        err = -EPERM;
        if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
            goto out_rcu_unlock;

        err = -EAFNOSUPPORT;
        if (!inet_netns_ok(net, protocol))
            goto out_rcu_unlock;

        /* Copy what is needed from the entry before leaving the RCU section. */
        sock->ops = answer->ops;
        answer_prot = answer->prot;
        answer_no_check = answer->no_check;
        answer_flags = answer->flags;
        rcu_read_unlock();

        WARN_ON(answer_prot->slab == NULL);

        err = -ENOBUFS;
        sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
        if (sk == NULL)
            goto out;

        err = 0;
        sk->sk_no_check = answer_no_check;
        if (INET_PROTOSW_REUSE & answer_flags)
            sk->sk_reuse = 1;

        inet = inet_sk(sk);
        inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

        inet->nodefrag = 0;

        /* For raw sockets the protocol number is stored in inet_num. */
        if (SOCK_RAW == sock->type) {
            inet->inet_num = protocol;
            if (IPPROTO_RAW == protocol)
                inet->hdrincl = 1;
        }

        if (ipv4_config.no_pmtu_disc)
            inet->pmtudisc = IP_PMTUDISC_DONT;
        else
            inet->pmtudisc = IP_PMTUDISC_WANT;

        inet->inet_id = 0;

        sock_init_data(sock, sk);

        sk->sk_destruct       = inet_sock_destruct;
        sk->sk_protocol       = protocol;
        sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

        inet->uc_ttl    = -1;
        inet->mc_loop    = 1;
        inet->mc_ttl    = 1;
        inet->mc_all    = 1;
        inet->mc_index    = 0;
        inet->mc_list    = NULL;

        sk_refcnt_debug_inc(sk);

        if (inet->inet_num) {
            /* It assumes that any protocol which allows
             * the user to assign a number at socket
             * creation time automatically
             * shares.
             */
            inet->inet_sport = htons(inet->inet_num);
            /* Add to protocol hash chains. */
            sk->sk_prot->hash(sk);
        }

        /* Protocol-specific initialisation; release the sock if it fails. */
        if (sk->sk_prot->init) {
            err = sk->sk_prot->init(sk);
            if (err)
                sk_common_release(sk);
        }
    out:
        return err;
    out_rcu_unlock:
        rcu_read_unlock();
        goto out;
    }

    在分析inet_create()函数前,就要分析inetsw[SOCK_MAX]这个数组。

    /* One list of registered inet_protosw entries per socket type, indexed by type. */
    static struct list_head inetsw[SOCK_MAX];

    这个数组是在inet_init()->inet_register_protosw()里面填充的。

        for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
            inet_register_protosw(q);

    inetsw_array定义:

    /* Upon startup we insert all the elements in inetsw_array[] into
     * the linked list inetsw.
     *
     * Each entry ties a (socket type, protocol) pair to the struct proto and
     * the proto_ops used for sockets of that kind, plus its checksum default
     * and INET_PROTOSW_* flags.
     */
    static struct inet_protosw inetsw_array[] =
    {
        /* SOCK_STREAM / TCP */
        {
            .type =       SOCK_STREAM,
            .protocol =   IPPROTO_TCP,
            .prot =       &tcp_prot,
            .ops =        &inet_stream_ops,
            .no_check =   0,
            .flags =      INET_PROTOSW_PERMANENT |
                      INET_PROTOSW_ICSK,
        },

        /* SOCK_DGRAM / UDP */
        {
            .type =       SOCK_DGRAM,
            .protocol =   IPPROTO_UDP,
            .prot =       &udp_prot,
            .ops =        &inet_dgram_ops,
            .no_check =   UDP_CSUM_DEFAULT,
            .flags =      INET_PROTOSW_PERMANENT,
           },


           /* SOCK_RAW: matches any protocol via the IPPROTO_IP wildcard */
           {
               .type =       SOCK_RAW,
               .protocol =   IPPROTO_IP,    /* wild card */
               .prot =       &raw_prot,
               .ops =        &inet_sockraw_ops,
               .no_check =   UDP_CSUM_DEFAULT,
               .flags =      INET_PROTOSW_REUSE,
           }
    };

    /* Number of entries in inetsw_array[]. */
    #define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array)

    inet_register_protosw函数分析:

    void inet_register_protosw(struct inet_protosw *p)
    {
        struct list_head *lh;
        struct inet_protosw *answer;
        int protocol = p->protocol;
        struct list_head *last_perm;
    
        spin_lock_bh(&inetsw_lock);
    
        if (p->type >= SOCK_MAX)
            goto out_illegal;
    
        /* If we are trying to override a permanent protocol, bail. */
        answer = NULL;
        last_perm = &inetsw[p->type];
        list_for_each(lh, &inetsw[p->type]) {
            answer = list_entry(lh, struct inet_protosw, list);
    
            /* Check only the non-wild match. */
            if (INET_PROTOSW_PERMANENT & answer->flags) {
                if (protocol == answer->protocol)
                    break;
                last_perm = lh;
            }
    
            answer = NULL;
        }
        if (answer)
            goto out_permanent;
    
        /* Add the new entry after the last permanent entry if any, so that
         * the new entry does not override a permanent entry when matched with
         * a wild-card protocol. But it is allowed to override any existing
         * non-permanent entry.  This means that when we remove this entry, the
         * system automatically returns to the old behavior.
         */
        list_add_rcu(&p->list, last_perm);
    out:
        spin_unlock_bh(&inetsw_lock);
    
        return;
    
    out_permanent:
        printk(KERN_ERR "Attempt to override permanent protocol %d.
    ",
               protocol);
        goto out;
    
    out_illegal:
        printk(KERN_ERR
               "Ignoring attempt to register invalid socket type %d.
    ",
               p->type);
        goto out;
    }
    EXPORT_SYMBOL(inet_register_protosw);

    这个函数完成的工作,就是把inetsw_array 数组中,相同的协议类型(type成员)下边的协议,加入到inetsw 对应的协议类型的链表中去。
    因为事实上是一对一的关系,所以这个函数要简单得多:
      因为链表中不存在其它成员,遍历循环体不会执行,answer 保持为空值,所以不存在覆盖和追加的情况,直接调用list_add_rcu(&p->list, last_perm);
      把协议类型节点(struct inet_protosw 类型的数组的某个元素)添加到链表(链表首部本身是一个数组,数组索引是协议对应的协议类型的值)的第一个成员。

    继续分析inet_create()函数:

      首先,根据 sock->type 和参数 protocol,把之前在链表中注册的协议节点找出。

      然后,将创建的socket 的ops 函数指针集,指向协议类型对应的操作集,例如创建的是SOCK_STREAM,那么就指向了inet_stream_ops;answer_prot 指针指向了当前要创建的socket 的协议类型下边的协议,如上例,它就是IPPROTO_TCP 的tcp_prot结构。

      接着,一个重要的工作,就是为socket分配一个sock,并初始化它。

      最后,初始化一个 inet 。

    虽然create 的代码就到这儿了,不过要说清楚sk(sock)的分配,还得费上大力气。
    每一个Socket 套接字,都有一个对应的 struct socket 结构来描述(内核中一般使用名称为sock),但是同时又有一个struct sock 结构(内核中一般使用名称为sk),两者之间是一一对应的关系。

    在后面的sock_init_data 函数中,可以看到:

    sk->sk_socket = sock; 
    sock->sk = sk;

    socket 结构和 sock 结构实际上是同一个事物的两个方面。不妨说,socket 结构是面向进程和系统调用界面的侧面,而 sock 结构则是面向底层驱动程序的侧面。

    设计者把socket套接字中,与文件系统关系比较密切的那一部份放在socket结构中,而把与通信关系比较密切的那一部份,则单独成为一个数据结构,那就是sock 结构。

    由于这两部份逻辑上本来就是一体的,所以要通过指针互相指向对方,形成一对一的关系。

    调用sk_alloc()分配一个sk:

      在之前proto_register()函数创建的高速缓存中申请分配一个slab缓存项,并清零。然后设置协议族、并把sk中的sk_prot与对应的协议关联起来。

    分配完成sk后,另一个重要的功能就是初始化它

      sk的成员相当复杂,其主要的初始化工作是在函数sock_init_data()中完成的:
      sock 结构中,有三个重要的双向队列,分别是 sk_receive_queue、sk_write_queue 和 sk_error_queue。从它们的名字就可以看出来其作用了。
    队列并非采用通用的list_head来维护,而是使用sk_buff_head队列:

    /*
     * Head of a doubly linked queue of sk_buff packets, with its own element
     * count and lock.
     */
    struct sk_buff_head { 
                /* These two members must be first. */ 
            struct sk_buff        *next; 
            struct sk_buff        *prev; 
     
                __u32                        qlen;  /* presumably the number of queued buffers - confirm */
            spinlock_t        lock; 
    };

    这样,队列中指向的每一个sk_buff,就是一个数据包,三个队列分别用于接收、发送和投递错误。
    inet 初始化:
    inet 是一个struct inet_sock 结构类型,来看它的定义:

    struct inet_sock { 
        /* sk and pinet6 has to be the first two members of inet_sock */ 
        struct sock sk; 
        …… 
    }

    只留意它的第一个成员就足够了。
    我们说sock 是面向用户态调用,而sk是面向内核驱动调用的,那sk是如何与协议栈交互的呢?
    对于每一个类型的协议,为了与sk联系起来,都定义了一个struct XXX_sock 结构,XXX是协议名,例如:

    struct tcp_sock { 
        /* inet_sock has to be the first member of tcp_sock */ 
        struct inet_sock inet; 
        int tcp_header_len; /* Bytes of tcp header to send */ 
        …… 
    } 

    很明显,它们的结构定构是“af_inet 一般属性+ 自己的私有属性” ,因为它们的第一个成员总是inet 。

    现在回头来看一下起初在af_inet.c中,调用proto_register()注册协议时用到的obj_size成员,对于tcp而言:

    /*
     * struct proto instance for TCP, as excerpted by the article.
     * Each designated initializer binds a protocol operation to its tcp_* or
     * inet_csk_* handler; members replaced by "..." are omitted in the excerpt.
     */
    struct proto tcp_prot = {
        .name            = "TCP",
        .owner            = THIS_MODULE,
        .close            = tcp_close,
        .connect        = tcp_v4_connect,
        .disconnect        = tcp_disconnect,
        .accept            = inet_csk_accept,
        .ioctl            = tcp_ioctl,
        .init            = tcp_v4_init_sock,
        .destroy        = tcp_v4_destroy_sock,
        .shutdown        = tcp_shutdown,
        .setsockopt        = tcp_setsockopt,
        .getsockopt        = tcp_getsockopt,
        .recvmsg        = tcp_recvmsg,
        .sendmsg        = tcp_sendmsg,
            ...
        /* Object size is that of the full struct tcp_sock, not struct sock. */
        .obj_size        = sizeof(struct tcp_sock),
            ...     
    };

    其它协议类似。

    以obj_size 来确定每个 slab 缓存项分配的大小,所以,我们就可以说,每次申请分配的,实际上是一个struct XXX_sock 结构大小的结构。因为都是定义于上层结构的第一个成员,可以使用强制类型转换来使用这块分配的内存空间。例如: 

    /*
     * struct inet_sock as excerpted by the article (trailing members elided).
     * struct sock is embedded as the first member; pinet6 is present only
     * when CONFIG_IPV6 or CONFIG_IPV6_MODULE is defined.
     */
    struct inet_sock {
        /* sk and pinet6 has to be the first two members of inet_sock */
        struct sock        sk;
    #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
        struct ipv6_pinfo    *pinet6;
    #endif
        /* Socket demultiplex comparisons on incoming packets. */
        __be32            inet_daddr;
        __be32            inet_rcv_saddr;
        __be16            inet_dport;
        __u16            inet_num;
        __be32            inet_saddr;
        __s16            uc_ttl;
        __u16            cmsg_flags;
        __be16            inet_sport;
        __u16            inet_id;
        ...  
    };
    
    inet = inet_sk(sk); 
    /*
     * Reinterpret a struct sock pointer as a struct inet_sock pointer.
     * This is a plain pointer cast with no checking; it relies on struct sock
     * being the first member of struct inet_sock. The cast also drops the
     * const qualifier of the argument.
     */
    static inline struct inet_sock *inet_sk(const struct sock *sk) 
    { 
      return (struct inet_sock *)sk; /* sk is the leading member: inet_sock->sk */
    }
    /*
     * struct tcp_sock as excerpted by the article (trailing members elided).
     * struct inet_connection_sock is embedded as the first member.
     */
    struct tcp_sock {
        /* inet_connection_sock has to be the first member of tcp_sock */
        struct inet_connection_sock    inet_conn;
        u16    tcp_header_len;    /* Bytes of tcp header to send        */
        ...
    };
    
    struct tcp_sock *tp = tcp_sk(sk); 
    /*
     * Reinterpret a struct sock pointer as a struct tcp_sock pointer.
     * This is a plain pointer cast with no checking, and it drops the const
     * qualifier of the argument.
     */
    static inline struct tcp_sock *tcp_sk(const struct sock *sk) 
    { 
      /* Embedding chain per the article: tcp_sock->inet_conn->icsk_inet->sk */
      return (struct tcp_sock *)sk;
    }

    inet_create()运行完,一个 socket 套接字基本上就创建完毕了,剩下的就是与文件系统挂钩。

    四、与文件系统交互

    回到sys_socket()函数中来,它在调用完sock_create()后,紧接着调用sock_map_fd()函数:

    /*
     * Obtain a file descriptor for a socket.
     *
     * sock_alloc_file() returns the descriptor and hands back the struct file
     * through newfile. When the descriptor is non-negative, fd_install() is
     * called with the descriptor and that file.
     *
     * Returns the value produced by sock_alloc_file() unchanged (a negative
     * value is presumably an error code - confirm against sock_alloc_file()).
     */
    int sock_map_fd(struct socket *sock, int flags)
    {
        struct file *newfile;
        int fd = sock_alloc_file(sock, &newfile, flags);
    
        /* Install only when sock_alloc_file() returned a non-negative fd. */
        if (likely(fd >= 0))
            fd_install(fd, newfile);
    
        return fd;
    }
    EXPORT_SYMBOL(sock_map_fd);

    这个函数的核心思想,在一开始,就已经分析过了。
    从进程的角度来讲,一个 socket 套接字就是一个特殊的,已打开的文件。
    前面分配好一个socket后,这里要做的就是将它与文件系统拉上亲戚关系。
    首先获取一个空闲的文件描述符号和file结构。然后在文件系统中分配一个目录项(d_alloc),使其指向已经分配的inode节点(d_add),然后把其目录项挂在sockfs文件系统的根目录之下。
    并且把目录项的指针d_op设置成指向 sockfs_dentry_operations,这个数据结构通过函数指针提供了与文件路径有关的操作:

    /*
     * Dentry operations table for sockfs; the only callback set here is
     * d_dname, bound to sockfs_dname.
     */
    static const struct dentry_operations sockfs_dentry_operations = {
        .d_dname  = sockfs_dname,
    };

    最后一步,就是将file结构中的f_op和socket对应的inode结构中的i_fop都指向socket_file_ops,它是一个函数指针集,指向了socket面向文件系统的用户态调用的一些接口函数:

    /*
     *    Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
     *    in the operation structures but are done directly via the socketcall() multiplexor.
     *
     *    The table below binds the generic file operations to sock_* handlers;
     *    llseek is bound to no_llseek and open to sock_no_open.
     */
    
    static const struct file_operations socket_file_ops = {
        .owner =    THIS_MODULE,
        .llseek =    no_llseek,
        .aio_read =    sock_aio_read,
        .aio_write =    sock_aio_write,
        .poll =        sock_poll,
        .unlocked_ioctl = sock_ioctl,
    /* compat_ioctl is only wired up when CONFIG_COMPAT is defined. */
    #ifdef CONFIG_COMPAT
        .compat_ioctl = compat_sock_ioctl,
    #endif
        .mmap =        sock_mmap,
        .open =        sock_no_open,    /* special open code to disallow open via /proc */
        .release =    sock_close,
        .fasync =    sock_fasync,
        .sendpage =    sock_sendpage,
        .splice_write = generic_splice_sendpage,
        .splice_read =    sock_splice_read,
    };

    到这里,整个socket 套接字的创建工作,就宣告完成了。

  • 相关阅读:
    Iterable,Iterator和forEach
    集合的线程安全性
    Servlet生命周期
    JavaWeb应用的生命周期
    将博客搬至CSDN
    (五)新类库的构件
    Python input和print函数
    python----调试
    Excel决定吃什么
    MATLAB—地图
  • 原文地址:https://www.cnblogs.com/cslunatic/p/3698653.html
Copyright © 2011-2022 走看看