zoukankan      html  css  js  c++  java
  • sockmap/eBPF

      This is how to use SOCKMAP: SOCKMAP or specifically "BPF_MAP_TYPE_SOCKMAP", is a type of an eBPF map. This map is an "array" - indices are integers. All this is pretty standard. The magic is in the map values - they must be TCP socket descriptors.

     copy from:https://blog.cloudflare.com/sockmap-tcp-splicing-of-the-future/

    也就是说,eBPF 程序必须 attach 到一个 map 上,而不是 attach 到某个 socket 上。那么如何使用 SOCKMAP 呢?

    sock_map = bpf_create_map(BPF_MAP_TYPE_SOCKMAP, sizeof(int), sizeof(int), 2, 0)
    
    prog_parser = bpf_load_program(BPF_PROG_TYPE_SK_SKB, ...)
    prog_verdict = bpf_load_program(BPF_PROG_TYPE_SK_SKB, ...)
    bpf_prog_attach(prog_parser, sock_map, BPF_SK_SKB_STREAM_PARSER)
    bpf_prog_attach(prog_verdict, sock_map, BPF_SK_SKB_STREAM_VERDICT)
    • 先看看 bpf_create_map的作用: 创建一个map内存块 
    • BPF map的应用场景有几种:

      • BPF程序和用户态的交互:BPF程序运行完,得到的结果存储到map中,供用户态访问;
      • BPF程序内部交互:如果BPF程序内部需要用全局变量来交互,但是由于安全原因BPF程序不允许访问全局变量,可以使用map来充当全局变量;
      • BPF Tail call:Tail call是一个BPF程序跳转到另一BPF程序,BPF程序首先通过BPF_MAP_TYPE_PROG_ARRAY类型的map来知道另一个BPF程序的指针,然后调用tail_call()的helper function来执行Tail call。
      • BPF程序和内核态的交互:和BPF程序以外的内核程序交互,也可以使用map作为中介;
      • Map 类型(map_type),就是上文提到的各种 Map 类型
      • Map 的键大小(key_size),以字节为单位
      • Map 的值大小(value_size),以字节为单位
      • Map 的元素最大容量(max_entries),个数为单位
    {
        struct { /* anonymous struct used by BPF_MAP_CREATE command */
            __u32    map_type;    /* one of enum bpf_map_type */
            __u32    key_size;    /* size of key in bytes */
            __u32    value_size;    /* size of value in bytes */
            __u32    max_entries;    /* max number of entries in a map */
            __u32    map_flags;    /* BPF_MAP_CREATE related
                         * flags defined above.
                         */
            __u32    inner_map_fd;    /* fd pointing to the inner map */
            __u32    numa_node;    /* numa node (effective only if
                         * BPF_F_NUMA_NODE is set).
                         */
            char    map_name[BPF_OBJ_NAME_LEN];
            __u32    map_ifindex;    /* ifindex of netdev to create on */
            __u32    btf_fd;        /* fd pointing to a BTF type data */
            __u32    btf_key_type_id;    /* BTF type_id of the key */
            __u32    btf_value_type_id;    /* BTF type_id of the value */
            __u32    btf_vmlinux_value_type_id;/* BTF type_id of a kernel-
                               * struct stored as the
                               * map value
                               */
        };
        ---------------------------
    }
    /*
     * Convenience wrapper around bpf_create_map_xattr().
     *
     * Packs the five basic map parameters into a bpf_create_map_attr whose
     * remaining fields are zero, and returns whatever
     * bpf_create_map_xattr() returns for it.
     */
    int bpf_create_map(enum bpf_map_type map_type, int key_size,
               int value_size, int max_entries, __u32 map_flags)
    {
        struct bpf_create_map_attr basic = {
            .map_type    = map_type,    /* BPF_MAP_TYPE_SOCKMAP, BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_ARRAY, ... */
            .map_flags   = map_flags,   /* map flag bits */
            .key_size    = key_size,    /* size of a key */
            .value_size  = value_size,  /* size of a value */
            .max_entries = max_entries, /* maximum number of key/value pairs */
        };

        return bpf_create_map_xattr(&basic);
    }
    /*
     * Create a BPF map from a fully populated attribute description.
     *
     * Copies the fields of @create_attr into a zeroed union bpf_attr and
     * issues the BPF_MAP_CREATE command through sys_bpf(); the value
     * returned by sys_bpf() is returned unchanged.
     */
    int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr)
    {
        union bpf_attr attr;
    
        /* Zero the whole union so unset fields and the tail of map_name are 0. */
        memset(&attr, 0, sizeof(attr));
        /* Populate bpf_attr from the caller-supplied description. */
        attr.map_type = create_attr->map_type;
        attr.key_size = create_attr->key_size;
        attr.value_size = create_attr->value_size;
        attr.max_entries = create_attr->max_entries;
        attr.map_flags = create_attr->map_flags;
        /* Copy at most BPF_OBJ_NAME_LEN - 1 bytes; the memset above supplies the terminator. */
        if (create_attr->name)
            memcpy(attr.map_name, create_attr->name,
                   min(strlen(create_attr->name), BPF_OBJ_NAME_LEN - 1));
        attr.numa_node = create_attr->numa_node;
        attr.btf_fd = create_attr->btf_fd;
        attr.btf_key_type_id = create_attr->btf_key_type_id;
        attr.btf_value_type_id = create_attr->btf_value_type_id;
        attr.map_ifindex = create_attr->map_ifindex;
        /* Only one of btf_vmlinux_value_type_id / inner_map_fd is set, depending on the map type. */
        if (attr.map_type == BPF_MAP_TYPE_STRUCT_OPS)
            attr.btf_vmlinux_value_type_id =
                create_attr->btf_vmlinux_value_type_id;
        else
            attr.inner_map_fd = create_attr->inner_map_fd;
        /*
         * Invoke the bpf() system call to create the map.  Its first
         * argument is the command, e.g. BPF_MAP_CREATE,
         * BPF_MAP_UPDATE_ELEM or BPF_MAP_DELETE_ELEM.
         */
        return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
    }
    View Code

    可以看到 实际上 会调用一个map_create 函数 分配内存 并初始化一个map

    /*
     * Kernel-side handler for the BPF_MAP_CREATE command (abridged excerpt).
     *
     * Validates @attr, allocates the map through find_and_alloc_map(),
     * initialises its name, reference counts and mutex, then assigns it an
     * id and a file descriptor.  On success the value returned by
     * bpf_map_new_fd() is returned; failures return a negative error.
     *
     * The dashed line marks code the article omitted; the free_map and
     * free_map_sec labels targeted by the gotos are not shown here.
     */
    static int map_create(union bpf_attr *attr)
    {
        int numa_node = bpf_map_attr_numa_node(attr);
        struct bpf_map_memory mem;
        struct bpf_map *map;
        int f_flags;
        int err;
    
        err = CHECK_ATTR(BPF_MAP_CREATE);
        if (err)
            return -EINVAL;
    
        if (attr->btf_vmlinux_value_type_id) {
            if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
                attr->btf_key_type_id || attr->btf_value_type_id)
                return -EINVAL;
        } else if (attr->btf_key_type_id && !attr->btf_value_type_id) {
            return -EINVAL;
        }
    
        f_flags = bpf_get_file_flag(attr->map_flags);
        if (f_flags < 0)
            return f_flags;
    
        if (numa_node != NUMA_NO_NODE &&
            ((unsigned int)numa_node >= nr_node_ids ||
             !node_online(numa_node)))
            return -EINVAL;
    
        /* find map type and init map: hashtable vs rbtree vs bloom vs ...
         * (author's note: this is where the map's memory gets allocated)
         */
        map = find_and_alloc_map(attr);
        if (IS_ERR(map))
            return PTR_ERR(map);
    
        err = bpf_obj_name_cpy(map->name, attr->map_name,
                       sizeof(attr->map_name));
        if (err < 0)
            goto free_map;
    
        atomic64_set(&map->refcnt, 1);
        atomic64_set(&map->usercnt, 1);
        mutex_init(&map->freeze_mutex);
    
        map->spin_lock_off = -EINVAL;
        ----------------------------------------------
    
        err = bpf_map_alloc_id(map); // associate the map with an id
        if (err)
            goto free_map_sec;
    
        err = bpf_map_new_fd(map, f_flags);// associate the map with an fd ("everything is a file")
        if (err < 0) {
            /* failed to allocate fd.
             * bpf_map_put_with_uref() is needed because the above
             * bpf_map_alloc_id() has published the map
             * to the userspace and the userspace may
             * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
             */
            bpf_map_put_with_uref(map);
            return err;
        }
    
        return err;
    }

    map_create 会调用:对应map_type的ops去分配内存等

    以map_array为例:

    /* Operations table for array maps: each hook points at its array_map_* implementation. */
    static const struct bpf_map_ops array_ops = {
        .map_alloc = array_map_alloc,
        .map_free = array_map_free,
        .map_get_next_key = array_map_get_next_key,
        .map_lookup_elem = array_map_lookup_elem,
        .map_update_elem = array_map_update_elem,
        .map_delete_elem = array_map_delete_elem,
    };
    
    /* Binds the array_ops table to the BPF_MAP_TYPE_ARRAY map type. */
    static struct bpf_map_type_list array_type __read_mostly = {
        .ops = &array_ops,
        .type = BPF_MAP_TYPE_ARRAY,
    };
    
    
    /*
     * Allocate and initialise an array map described by @attr.
     *
     * Returns a pointer to the bpf_map embedded in the new bpf_array, or an
     * ERR_PTR() carrying -EINVAL, -E2BIG or -ENOMEM when the attributes are
     * rejected or allocation fails.
     */
    static struct bpf_map *array_map_alloc(union bpf_attr *attr)
    {
        bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
        u32 elem_size, index_mask, max_entries;
        bool unpriv = !capable(CAP_SYS_ADMIN);
        struct bpf_array *array;
        u64 array_size, mask64;
    
        /* check sanity of attributes */
        if (attr->max_entries == 0 || attr->key_size != 4 ||
            attr->value_size == 0 || attr->map_flags)
            return ERR_PTR(-EINVAL);
    
        if (attr->value_size >= 1 << (KMALLOC_SHIFT_MAX - 1))
            /* if value_size is bigger, the user space won't be able to
             * access the elements.
             */
            return ERR_PTR(-E2BIG);
    
        /* (1.1.1) Compute the per-element value size.  The key size needs neither computing nor storing, because the key is simply the array index. */
        elem_size = round_up(attr->value_size, 8);
    
        max_entries = attr->max_entries;
    
        /* On 32 bit archs roundup_pow_of_two() with max_entries that has
         * upper most bit set in u32 space is undefined behavior due to
         * resulting 1U << 32, so do it manually here in u64 space.
         */
        mask64 = fls_long(max_entries - 1);
        mask64 = 1ULL << mask64;
        mask64 -= 1;
    
        index_mask = mask64;
        if (unpriv) {
            /* round up array size to nearest power of 2,
             * since cpu will speculate within index_mask limits
             */
            max_entries = index_mask + 1;
            /* Check for overflows. */
            if (max_entries < attr->max_entries)
                return ERR_PTR(-E2BIG);
        }
    
        /* (1.1.2) Compute the total size of bpf_array plus the value array; bpf_array embeds the generic bpf_map structure. */
        array_size = sizeof(*array);
        if (percpu)
            array_size += (u64) max_entries * sizeof(void *);
        else
            array_size += (u64) max_entries * elem_size;
    
        /* make sure there is no u32 overflow later in round_up() */
        if (array_size >= U32_MAX - PAGE_SIZE)
            return ERR_PTR(-ENOMEM);
    
        /* allocate all map elements and zero-initialize them */
        /* (1.1.3) Allocate the bpf_array using the total size computed above. */
        array = bpf_map_area_alloc(array_size);
        if (!array)
            return ERR_PTR(-ENOMEM);
        array->index_mask = index_mask;
        array->map.unpriv_array = unpriv;
    
        /* copy mandatory map attributes */
        /* (1.1.4) Copy the attr fields into array->map. */
        array->map.map_type = attr->map_type;
        array->map.key_size = attr->key_size;
        array->map.value_size = attr->value_size;
        array->map.max_entries = attr->max_entries;
        array->elem_size = elem_size;
    
        if (!percpu)
            goto out;
    
        /* Per-CPU arrays: account for one copy of every element per possible CPU. */
        array_size += (u64) attr->max_entries * elem_size * num_possible_cpus();
    
        if (array_size >= U32_MAX - PAGE_SIZE ||
            elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) {
            bpf_map_area_free(array);
            return ERR_PTR(-ENOMEM);
        }
    out:
        /* Record the map's size in pages, rounded up. */
        array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;
    
        return &array->map;
    }
    View Code

    bpf_load_program:用BPF_PROG_LOAD命令进行bpf系统调用加载 BPF 程序到内核中

    • 拷贝程序到内核;
    • 校验它的安全性;
    • 如果可能对它进行JIT编译;
    • 然后分配一个文件句柄fd给它

    完成这一切后,后续再把这段BPF程序挂载到需要运行的钩子上面。

    /*
     * Kernel-side handler for the BPF_PROG_LOAD command (abridged excerpt).
     *
     * Validates @attr and the caller's capabilities, allocates a bpf_prog,
     * copies the instructions from user space, runs the verifier, selects
     * the runtime, then assigns the program an id and a file descriptor.
     * On success the value returned by bpf_prog_new_fd() is returned;
     * failures return a negative error.
     *
     * The dashed line near the end marks code the article omitted: the
     * cleanup labels targeted by the gotos (free_used_maps, free_prog,
     * free_prog_sec, free_prog_nouncharge) are not shown here.
     */
    static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
    {
        enum bpf_prog_type type = attr->prog_type;
        struct bpf_prog *prog;
        int err;
        char license[128];
        bool is_gpl;
    
        if (CHECK_ATTR(BPF_PROG_LOAD))
            return -EINVAL;
    
        if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
                     BPF_F_ANY_ALIGNMENT |
                     BPF_F_TEST_STATE_FREQ |
                     BPF_F_TEST_RND_HI32))
            return -EINVAL;
    
        if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
            (attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
            !bpf_capable())
            return -EPERM;
    
        /* copy eBPF program license from user space:
         * the license string at user address attr->license is copied into
         * the kernel buffer.
         */
        if (strncpy_from_user(license, u64_to_user_ptr(attr->license),
                      sizeof(license) - 1) < 0)
            return -EFAULT;
        license[sizeof(license) - 1] = 0;
    
        /* eBPF programs must be GPL compatible to use GPL-ed functions;
         * determine whether the license is GPL compatible.
         */
        is_gpl = license_is_gpl_compatible(license);
            // reject empty programs and programs over the limit (BPF_COMPLEXITY_LIMIT_INSNS if bpf_capable(), otherwise BPF_MAXINSNS)
        if (attr->insn_cnt == 0 ||
            attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
            return -E2BIG;
        // program types other than BPF_PROG_TYPE_SOCKET_FILTER and BPF_PROG_TYPE_CGROUP_SKB require bpf_capable()
        if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
            type != BPF_PROG_TYPE_CGROUP_SKB &&
            !bpf_capable())
            return -EPERM;
    // net-admin program types (cgroup, sock, ...) need CAP_NET_ADMIN or CAP_SYS_ADMIN
        if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN))
            return -EPERM;
        if (is_perfmon_prog_type(type) && !perfmon_capable())
            return -EPERM;
    
        bpf_prog_load_fixup_attach_type(attr);
        if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
                           attr->attach_btf_id,
                           attr->attach_prog_fd))
            return -EINVAL;
    
        /* plain bpf_prog allocation: sized from the instruction count (author's note: this also allocates bpf_prog->aux) */
        prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
        if (!prog)
            return -ENOMEM;
    
        prog->expected_attach_type = attr->expected_attach_type;
        prog->aux->attach_btf_id = attr->attach_btf_id;
        if (attr->attach_prog_fd) {
            struct bpf_prog *tgt_prog;
    
            tgt_prog = bpf_prog_get(attr->attach_prog_fd);
            if (IS_ERR(tgt_prog)) {
                err = PTR_ERR(tgt_prog);
                goto free_prog_nouncharge;
            }
            prog->aux->linked_prog = tgt_prog;
        }
    
        prog->aux->offload_requested = !!attr->prog_ifindex;
    
        err = security_bpf_prog_alloc(prog->aux);
        if (err)
            goto free_prog_nouncharge;
    
        err = bpf_prog_charge_memlock(prog);
        if (err)
            goto free_prog_sec;
    
        prog->len = attr->insn_cnt;
    
        err = -EFAULT;// copy the BPF instructions from user address attr->insns into kernel memory at prog->insns
        if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns),
                   bpf_prog_insn_size(prog)) != 0)
            goto free_prog;
    
        prog->orig_prog = NULL;
        prog->jited = 0;
    
        atomic64_set(&prog->aux->refcnt, 1);
        prog->gpl_compatible = is_gpl ? 1 : 0;
    
        if (bpf_prog_is_dev_bound(prog->aux)) {
            err = bpf_prog_offload_init(prog, attr);
            if (err)
                goto free_prog;
        }
    
        /* find program type: socket_filter vs tracing_filter
         * (author's note: looks up the bpf_prog_types entry for
         * attr->prog_type and sets bpf_prog->aux->ops, a table of
         * operations, from it)
         */
        err = find_prog_type(type, prog);
        if (err < 0)
            goto free_prog;
    
        prog->aux->load_time = ktime_get_boottime_ns();
        err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name,
                       sizeof(attr->prog_name));
        if (err < 0)
            goto free_prog;
    
        /* run eBPF verifier: check that the program is valid */
        err = bpf_check(&prog, attr, uattr);
        if (err < 0)
            goto free_used_maps;
        /* author's note: try to JIT-compile the program */
        prog = bpf_prog_select_runtime(prog, &err);
        if (err < 0)
            goto free_used_maps;
    // allocate an id for the program
        err = bpf_prog_alloc_id(prog);
        if (err)
            goto free_used_maps;
    
        /* Upon success of bpf_prog_alloc_id(), the BPF prog is
         * effectively publicly exposed. However, retrieving via
         * bpf_prog_get_fd_by_id() will take another reference,
         * therefore it cannot be gone underneath us.
         *
         * Only for the time /after/ successful bpf_prog_new_fd()
         * and before returning to userspace, we might just hold
         * one reference and any parallel close on that fd could
         * rip everything out. Hence, below notifications must
         * happen before bpf_prog_new_fd().
         *
         * Also, any failure handling from this point onwards must
         * be using bpf_prog_put() given the program is exposed.
         */
        bpf_prog_kallsyms_add(prog);
        perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);
        bpf_audit_prog(prog, BPF_AUDIT_LOAD);
    // allocate a file descriptor for the program
        err = bpf_prog_new_fd(prog);
        if (err < 0)
            bpf_prog_put(prog);
        return err;
    --------------------------------
    }
    bpf_prog_attach:如何把我的bpf程序,attach到这些类型上:
    重定向程序作为BPF_SK_SKB_STREAM_VERDICT附加到sockmap; 它应返回bpf_sk_redirect_map()的结果。
    一个strparser程序通过BPF_SK_SKB_STREAM_PARSER附加,并且应返回已解析数据的长度。

    能够获取什么样的context?

    指向包含包元数据/数据的结构 __sk_buff 的指针。但是,sk_skb 程序类型可以访问更多字段。可用的额外字段集记录在 include/linux/bpf.h 中,如下所示:

    什么时候会运行?
    可以通过把 BPF_SK_SKB_STREAM_PARSER 附加到 sockmap 上,来把一个 stream parser 附加到 socket 上;此后,当该 socket 接收数据时,它会通过 kernel/bpf/sockmap.c 中的 smap_parse_func_strparser() 被执行。BPF_SK_SKB_STREAM_VERDICT 同样附加到 sockmap 上,它通过 smap_verdict_func() 来执行。
    /*  bpf_load_program
    
    bpf_prog_attach(verdict_prog, map_fd, BPF_SMAP_STREAM_VERDICT, 0);
    
    int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type,
                unsigned int flags)
    {
        DECLARE_LIBBPF_OPTS(bpf_prog_attach_opts, opts,
            .flags = flags,
        );
    
        return bpf_prog_attach_xattr(prog_fd, target_fd, type, &opts);
    }
    
    int bpf_prog_attach_xattr(int prog_fd, int target_fd,
                  enum bpf_attach_type type,
                  const struct bpf_prog_attach_opts *opts)
    {
        union bpf_attr attr;
    
        if (!OPTS_VALID(opts, bpf_prog_attach_opts))
            return -EINVAL;
    
        memset(&attr, 0, sizeof(attr));
        attr.target_fd       = target_fd;
        attr.attach_bpf_fd = prog_fd;
        attr.attach_type   = type;
        attr.attach_flags  = OPTS_GET(opts, flags, 0);
        attr.replace_bpf_fd = OPTS_GET(opts, replace_prog_fd, 0);
    
        return sys_bpf(BPF_PROG_ATTACH, &attr, sizeof(attr));
    }
    int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
                 struct bpf_prog *old, u32 which)
    {
        struct sk_psock_progs *progs = sock_map_progs(map);
        struct bpf_prog **pprog;
    
        switch (which) {
    ------------------------------------------
        case BPF_SK_SKB_STREAM_PARSER:
            pprog = &progs->skb_parser;
            break;
        case BPF_SK_SKB_STREAM_VERDICT:
            pprog = &progs->skb_verdict;
            break;
    
        }
        psock_set_prog(pprog, prog);
        return 0;
    }
    
    
    
    
    int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog)
    {
        u32 ufd = attr->target_fd;
        struct bpf_map *map;
        struct fd f;
        int ret;
    
        if (attr->attach_flags || attr->replace_bpf_fd)
            return -EINVAL;
    
        f = fdget(ufd);
        map = __bpf_map_get(f);
        if (IS_ERR(map))
            return PTR_ERR(map);
        ret = sock_map_prog_update(map, prog, NULL, attr->attach_type);---//找到对应的sk_psock_progs  并更新
        fdput(f);
        return ret;
    }
    
    */
    static int bpf_prog_attach(const union bpf_attr *attr)
    {
        enum bpf_prog_type ptype;BPF_SOCK_STREAM_VERDICT
        struct bpf_prog *prog = NULL;
        int ret;
    
        if (CHECK_ATTR(BPF_PROG_ATTACH))
            return -EINVAL;
    
        if (attr->attach_flags & ~BPF_F_ATTACH_MASK)
            return -EINVAL;
        //------BPF_SK_SKB_STREAM_VERDICT-------> transmit -----BPF_PROG_TYPE_SK_SKB  也就是attach type 转换为 prog-type
        ptype = attach_type_to_prog_type(attr->attach_type);
        if (ptype == BPF_PROG_TYPE_UNSPEC)
            return -EINVAL;
    
        prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
        if (IS_ERR(prog))
            return PTR_ERR(prog);
    
        if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) {
            bpf_prog_put(prog);
            return -EINVAL;
        }
    /*
        const struct bpf_verifier_ops sk_skb_verifier_ops = {
            .get_func_proto     = sk_skb_func_proto,--------------bpf_sk_redirect_map_proto----------bpf_msg_redirect_map
            .is_valid_access    = sk_skb_is_valid_access,
            .convert_ctx_access = sk_skb_convert_ctx_access,
            .gen_prologue       = sk_skb_prologue,
        };
    
        */
        switch (ptype) {
        case BPF_PROG_TYPE_SK_SKB:
        case BPF_PROG_TYPE_SK_MSG:
            ret = sock_map_get_from_fd(attr, prog);// 根据target_fd 找到 map  并关联对应map
            break;
        case BPF_PROG_TYPE_LIRC_MODE2:
            ret = lirc_prog_attach(attr, prog);
            break;
        case BPF_PROG_TYPE_FLOW_DISSECTOR:
            ret = netns_bpf_prog_attach(attr, prog);
            break;
        case BPF_PROG_TYPE_CGROUP_DEVICE:
        case BPF_PROG_TYPE_CGROUP_SKB:
        case BPF_PROG_TYPE_CGROUP_SOCK:
        case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
        case BPF_PROG_TYPE_CGROUP_SOCKOPT:
        case BPF_PROG_TYPE_CGROUP_SYSCTL:
        case BPF_PROG_TYPE_SOCK_OPS:
            ret = cgroup_bpf_prog_attach(attr, ptype, prog);
            break;
        default:
            ret = -EINVAL;
        }
    
        if (ret)
            bpf_prog_put(prog);
        return ret;
    }
    
    
    At this point we have an established sock_map eBPF map, with two eBPF programs attached: parser and verdict.
    The next step is to add a TCP socket descriptor to this map:

    int val = fd;
    bpf_map_update_elem(sock_map, &idx, &val, BPF_ANY);
    bpf_map_update_elem: 将fd socket 和map相关联

    会执行系统调用 bpf(BPF_MAP_UPDATE_ELEM,-----) 最后调用map_update_elem 函数处理

    /*
     * Kernel-side handler for BPF_MAP_UPDATE_ELEM (heavily abridged excerpt).
     * Dashed lines mark code the article omitted, including the declarations
     * of f, map, key, value and err and the function's return.
     */
    static int map_update_elem(union bpf_attr *attr)
    {
        void __user *ukey = u64_to_user_ptr(attr->key);// user pointer to the key (the index, for a sockmap)
        void __user *uvalue = u64_to_user_ptr(attr->value);// user pointer to the value, e.g. the fd of the socket to act on
        int ufd = attr->map_fd;
    -----------------------
        f = fdget(ufd);// map_fd -> struct fd
        map = __bpf_map_get(f);// struct fd -> the map itself (author's note: f.file->private_data)
       ------------------------------
      ----------------------------------// store the key/value pair into the map
        err = bpf_map_update_value(map, f, key, value, attr->flags);
    
    }
    /*
     * Store @key/@value into @map (abridged excerpt: only the first
     * dispatch branches are shown).
     *
     * Device-bound maps go to bpf_map_offload_update_elem(); CPUMAP,
     * SOCKHASH, SOCKMAP and STRUCT_OPS maps call the map's own
     * map_update_elem op directly, and that call's result is returned.
     */
    static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
                    void *value, __u64 flags)
    {
        int err;
    
        /* Need to create a kthread, thus must support schedule */
        if (bpf_map_is_dev_bound(map)) {
            return bpf_map_offload_update_elem(map, key, value, flags);
        } else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
               map->map_type == BPF_MAP_TYPE_SOCKHASH ||
               map->map_type == BPF_MAP_TYPE_SOCKMAP ||    /* ends up in sock_map_update_elem */
               map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
            return map->ops->map_update_elem(map, key, value, flags);
        }
        /* ... handling of the remaining map types omitted by the article;
         * presumably it assigns err -- not visible in this excerpt ...
         */
    
        return err;
    }

    以sock_map_update_elem 为例查看

    /*
     * map_update_elem op for sockmaps (abridged excerpt).
     *
     * Reads the index from @key and a socket fd from @value (as a u64 or a
     * u32, depending on map->value_size), resolves the fd to its socket and
     * hands the socket's sk to sock_map_update_common().  Dashed lines mark
     * code the article omitted, including the function's return.
     */
    static int sock_map_update_elem(struct bpf_map *map, void *key,
                    void *value, u64 flags)
    {
        u32 idx = *(u32 *)key;
        struct socket *sock;
        struct sock *sk;
        int ret;
        u64 ufd;
    
        if (map->value_size == sizeof(u64))
            ufd = *(u64 *)value;
        else
            ufd = *(u32 *)value;
    ---------------------------
        sock = sockfd_lookup(ufd, &ret);// resolve the socket fd stored as the value to its struct socket
        ----------
        sk = sock->sk;// the network-layer struct sock behind that socket
        -----------
        ret = sock_map_update_common(map, idx, sk, flags);
    
    }
    /*
     * Install @sk in slot @idx of the sockmap behind @map (abridged excerpt).
     *
     * Links the map's programs to @sk through sock_map_link(), then, under
     * stab->lock, records the link, stores @sk in stab->sks[idx] and drops
     * the socket that previously occupied the slot, if there was one.
     *
     * NOTE(review): argument validation and the error paths for a failed
     * sk_psock_init_link() / sock_map_link() are not part of this excerpt;
     * as shown, the function always returns 0.
     */
    static int sock_map_update_common(struct bpf_map *map, u32 idx,
                      struct sock *sk, u64 flags)
    {
        struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
        struct sk_psock_link *link;
        struct sk_psock *psock;
        struct sock *osk;
        int ret;
    
        link = sk_psock_init_link();    /* allocate the link object */
    
        /* Only sockets we can redirect into/from in BPF need to hold
         * refs to parser/verdict progs and have their sk_data_ready
         * and sk_write_space callbacks overridden.
         */
        ret = sock_map_link(map, &stab->progs, sk);
    
        psock = sk_psock(sk);
        WARN_ON_ONCE(!psock);
    
        raw_spin_lock_bh(&stab->lock);
        osk = stab->sks[idx];
    
        sock_map_add_link(psock, link, map, &stab->sks[idx]);
        stab->sks[idx] = sk;
        /* The slot may have been empty: only drop a previous occupant. */
        if (osk)
            sock_map_unref(osk, &stab->sks[idx]);
        /* Release the lock taken above before returning. */
        raw_spin_unlock_bh(&stab->lock);
        return 0;
    }
    
    /*
     * Attach the map's programs to @sk (abridged excerpt).
     *
     * Reads the program pointers from @progs, obtains or creates the psock
     * for @sk, switches the socket's proto through sock_map_init_proto()
     * and, when both skb programs are present and the parser is not yet
     * enabled, sets up and starts the strparser.
     *
     * "code omitted" comments mark where the article cut code; the lock
     * acquisition matching write_unlock_bh() and the out_progs / out_drop
     * labels are among the omitted parts.
     */
    static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
                 struct sock *sk)
    {
        struct bpf_prog *msg_parser, *skb_parser, *skb_verdict;
        struct sk_psock *psock;
        bool skb_progs;
        int ret;
    
        /* Author's note: these pointers are assigned in sock_map_prog_update(). */
        skb_verdict = READ_ONCE(progs->skb_verdict);
        skb_parser = READ_ONCE(progs->skb_parser);
        skb_progs = skb_parser && skb_verdict;
        /* ... code omitted ... */
    
        msg_parser = READ_ONCE(progs->msg_parser);
        /* ... code omitted ... */
    
        psock = sock_map_psock_get_checked(sk);
        if (IS_ERR(psock)) {
            ret = PTR_ERR(psock);
            goto out_progs;
        }
        /* ... code omitted ... */
        /* Author's note: creates the psock and ties it to sk (psock->sk = sk). */
        psock = sk_psock_init(sk, map->numa_node);
        /* ... code omitted ... */
        /* Author's note: mainly replaces sk->sk_prot, swapping the socket's ops for the BPF ones. */
        ret = sock_map_init_proto(sk, psock);
        if (ret < 0)
            goto out_drop;
    
    
        if (skb_progs && !psock->parser.enabled) {
            ret = sk_psock_init_strp(sk, psock);    /* install the strparser callbacks */
            if (ret) {
                write_unlock_bh(&sk->sk_callback_lock);
                goto out_drop;
            }
            psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
            psock_set_prog(&psock->progs.skb_parser, skb_parser);
            /* Override sk's data_ready (data-arrival wakeup) callback. */
            sk_psock_start_strp(sk, psock);
        }
    
        return 0;
    }
    
    /*
     * Redirect @sk's data_ready and write_space callbacks to the psock
     * handlers, saving the original data_ready in the parser state.
     * Does nothing if the parser is already enabled.
     */
    void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
    {
        struct sk_psock_parser *state = &psock->parser;

        if (!state->enabled) {
            /* Save the original callback before overriding it. */
            state->saved_data_ready = sk->sk_data_ready;
            sk->sk_data_ready = sk_psock_strp_data_ready;
            sk->sk_write_space = sk_psock_write_space;
            state->enabled = true;
        }
    }
    
    int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
    {
        static const struct strp_callbacks cb = {
            .rcv_msg    = sk_psock_strp_read,
            .read_sock_done    = sk_psock_strp_read_done,
            .parse_msg    = sk_psock_strp_parse,
        };
    
        psock->parser.enabled = false;
        return strp_init(&psock->parser.strp, sk, &cb);
    }
    
    设置strparser cb 回调函数
    /*
     * Initialise @strp for socket @sk with the callbacks in @cb (abridged
     * excerpt; dashed lines mark code the article omitted).
     *
     * Zeroes @strp, records @sk, copies the callbacks -- substituting the
     * strparser defaults for lock, unlock, read_sock_done and abort_parser
     * when @cb leaves them unset -- and initialises the timer and work
     * items.  Returns 0.
     */
    int strp_init(struct strparser *strp, struct sock *sk,
              const struct strp_callbacks *cb)
    {
    --------------------
        /* The sk (sock) arg determines the mode of the stream parser.
         *
         * If the sock is set then the strparser is in receive callback mode.
         * The upper layer calls strp_data_ready to kick receive processing
         * and strparser calls the read_sock function on the socket to
         * get packets.
         *
         * If the sock is not set then the strparser is in general mode.
         * The upper layer calls strp_process for each skb to be parsed.
         */
    ---------------
        memset(strp, 0, sizeof(*strp));
    
        strp->sk = sk;
    
        /* rcv_msg and parse_msg have no fallback; they are copied as given. */
        strp->cb.lock = cb->lock ? : strp_sock_lock;
        strp->cb.unlock = cb->unlock ? : strp_sock_unlock;
        strp->cb.rcv_msg = cb->rcv_msg;
        strp->cb.parse_msg = cb->parse_msg;
        strp->cb.read_sock_done = cb->read_sock_done ? : default_read_sock_done;
        strp->cb.abort_parser = cb->abort_parser ? : strp_abort_strp;
    
        INIT_DELAYED_WORK(&strp->msg_timer_work, strp_msg_timeout);
        INIT_WORK(&strp->work, strp_work);
    
        return 0;
    }
    
    /*
     * Fill @prot with the BPF variants of @base.
     *
     * The BASE entry is a copy of @base with unhash, close, recvmsg and
     * stream_memory_read overridden; the TX entry is a copy of the BASE
     * entry with sendmsg and sendpage overridden as well.
     */
    static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
                       struct proto *base)
    {
        struct proto *base_cfg = &prot[TCP_BPF_BASE];
        struct proto *tx_cfg = &prot[TCP_BPF_TX];

        *base_cfg = *base;
        base_cfg->unhash = sock_map_unhash;
        base_cfg->close = sock_map_close;
        base_cfg->recvmsg = tcp_bpf_recvmsg;
        base_cfg->stream_memory_read = tcp_bpf_stream_read;

        /* TX starts from the finished BASE entry, so this copy must come last. */
        *tx_cfg = *base_cfg;
        tx_cfg->sendmsg = tcp_bpf_sendmsg;
        tx_cfg->sendpage = tcp_bpf_sendpage;
    }
    
    
    /*
     * Pick the BPF proto for @sk: indexed by address family (IPv6 vs IPv4)
     * and by whether @psock has a msg_parser program (TX vs BASE).
     *
     * When the psock has no saved proto yet, the socket's current proto is
     * checked first; ERR_PTR(-EINVAL) is returned if that check fails.
     */
    struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock)
    {
        int family;
        int config;

        if (sk->sk_family == AF_INET6)
            family = TCP_BPF_IPV6;
        else
            family = TCP_BPF_IPV4;

        if (psock->progs.msg_parser)
            config = TCP_BPF_TX;
        else
            config = TCP_BPF_BASE;

        if (!psock->sk_proto) {
            struct proto *cur = READ_ONCE(sk->sk_prot);

            if (tcp_bpf_assert_proto_ops(cur))
                return ERR_PTR(-EINVAL);

            tcp_bpf_check_v6_needs_rebuild(sk, cur);
        }

        return &tcp_bpf_prots[family][config];
    }
    
    static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock)
    {
        struct proto *prot;
    
        switch (sk->sk_type) {
        case SOCK_STREAM:
            prot = tcp_bpf_get_proto(sk, psock);
            break;
    
        case SOCK_DGRAM:
            prot = udp_bpf_get_proto(sk, psock);
            break;
    
    
        sk_psock_update_proto(sk, psock, prot);
        return 0;
    }
    From now on, each time our socket sd receives a packet,
    prog_parser and prog_verdict are called
    
    
    /* Parser program: returns the full skb length, i.e. treats each skb as one whole message. */
    SEC("prog_parser")
    int _prog_parser(struct __sk_buff *skb)
    {
        return skb->len;
    }
    
    /* Verdict program: returns bpf_sk_redirect_map()'s result for the socket at index 0 of sock_map, with flags 0. */
    SEC("prog_verdict")
    int _prog_verdict(struct __sk_buff *skb)
    {
        uint32_t idx = 0;
        return bpf_sk_redirect_map(skb, &sock_map, idx, 0);
    }
    
    

    bpf_sk_redirect_map tells the kernel: for the received packet, please oh please redirect it from a receive queue of some socket, to a transmit queue of the socket living in sock_map under index 0. In our case, these are the same sockets! Here we achieved exactly what the echo server is supposed to do, but purely in eBPF.

    /*
     * Helper prototype for bpf_sk_redirect_map: not GPL-only, returns an
     * integer, and takes (ctx pointer, const map pointer, anything, anything).
     */
    const struct bpf_func_proto bpf_sk_redirect_map_proto = {
        .func           = bpf_sk_redirect_map,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type    = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_CONST_MAP_PTR,
        .arg3_type      = ARG_ANYTHING,
        .arg4_type      = ARG_ANYTHING,
    };
    
    /*
     * sk_msg redirect helper: look up the socket stored under @key in @map
     * and record it, together with @flags, as the redirect target of @msg.
     *
     * Returns SK_DROP when @flags has bits other than BPF_F_INGRESS, when
     * no socket is found, or when redirecting to that socket is not
     * allowed; otherwise SK_PASS.
     *
     * NOTE(review): this is the sk_msg variant; the programs shown earlier
     * in the article call bpf_sk_redirect_map.
     */
    BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg,
           struct bpf_map *, map, u32, key, u64, flags)
    {
        struct sock *target;

        if (unlikely(flags & ~(BPF_F_INGRESS)))
            return SK_DROP;

        target = __sock_map_lookup_elem(map, key);
        if (unlikely(!target))
            return SK_DROP;
        if (unlikely(!sock_map_redirect_allowed(target)))
            return SK_DROP;

        msg->flags = flags;
        msg->sk_redir = target;
        return SK_PASS;
    }


    参考学习:
    eBPF学习用例:
    Linux 内核观测技术 BPF书籍
    https://davidlovezoe.club/wordpress/archives/862
    http://arthurchiao.art/blog/cilium-life-of-a-packet-pod-to-service-zh/
    https://switch-router.gitee.io/blog/strparser/

    https://davidlovezoe.club/wordpress/archives/963

    https://patchwork.ozlabs.org/project/netdev/patch/20170816053247.15445.69312.stgit@john-Precision-Tower-5810/

    https://jishuin.proginn.com/p/763bfbd2bc4e

    https://blog.csdn.net/pwl999/article/details/82884882

    https://github.com/zoidbergwill/awesome-ebpf

    https://patchwork.ozlabs.org/project/netdev/patch/20170816053247.15445.69312.stgit@john-Precision-Tower-5810/

    https://switch-router.gitee.io/blog/strparser/

    https://blogs.oracle.com/linux/notes-on-bpf-1

    总结:

    • eBPF程序处理截获报文的例子:psock,psock 使用 strparser,将数据包的控制权转移到 eBPF 处理程序,用户可以在 eBPF 程序里完成网络报文的重定向;sockmap 建立在 psock 之上,而 psock 的底层则是 strparser

    strparser 的工作原理

    核心数据结构:struct strparser 是 strparser 框架的核心数据结构,它绑定(attach)一个 TCP sock 结构 sk 和一组回调函数 cb
    /* Core strparser state (abridged): the socket being parsed and the callback set. */
    struct strparser {
        struct sock *sk;        /* socket this parser is attached to */
        // code omitted ....
        struct strp_callbacks cb;   /* callback set */
    };

    回调函数一共有以下六个:

    /*
     * Callbacks supplied by a strparser user.
     *
     * parse_msg:  called by the framework when data arrives; used to extract
     *             the length of the next application-layer message, since one
     *             TCP segment may carry more than one message.
     * rcv_msg:    called by the framework after parse_msg, with the message.
     * read_sock_done, abort_parser, lock, unlock:
     *             additional hooks; their exact contracts are not shown
     *             here - see include/net/strparser.h.
     */
    struct strp_callbacks {
        int (*parse_msg)(struct strparser *strp, struct sk_buff *skb);
        void (*rcv_msg)(struct strparser *strp, struct sk_buff *skb);
        int (*read_sock_done)(struct strparser *strp, int err);
        void (*abort_parser)(struct strparser *strp, int err);
        void (*lock)(struct strparser *strp);
        void (*unlock)(struct strparser *strp);
    };

    parse_msg() 在 strparser 收到报文时被框架调用。它用于从报文中提取下一个应用层消息(message)的长度。一个 TCP 报文里可能不止一个应用层消息,而 parse_msg() 就是提供给使用者去识别各个消息的手段

    strparser 截获报文

    正常情况下,内核 TCP 层处理报文后,会调用 sock->sk_data_ready(sk) , 它的默认动作是 wake up 一个用户态进程.

    /*
     * Excerpt: notify the socket that data is ready by invoking its
     * sk_data_ready callback.
     */
    void tcp_data_ready(struct sock *sk)
    {
        const struct tcp_sock *tp = tcp_sk(sk);
        // code omitted
        sk->sk_data_ready(sk);    /* per-socket function pointer, so it can be replaced */
    }
    我们期望报文能进入 strparser,但报文显然不会平白无故地进入 strparser,因此,我们需要在报文的上送路径上动一些手脚:替换掉 sk->sk_data_ready 函数
    static int tls_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len){
        // code omitted
        tsk->saved_sk_data_ready = tsk->socket->sk->sk_data_ready;
        tsk->saved_sk_write_space = tsk->socket->sk->sk_write_space;sk_write_space
        tsk->socket->sk->sk_data_ready = tls_data_ready; 
        tsk->socket->sk->sk_write_space = tls_write_space;
        tsk->socket->sk->sk_user_data = tsk;     
        // code omitted
    }

    在 psock 的例子中, sk_psock_strp_data_ready() 被赋值到 sk->sk_data_ready

    /*
     * Excerpt: enable the stream parser for a psock by saving the socket's
     * current sk_data_ready callback and installing the psock callbacks.
     */
    void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
    {
        struct sk_psock_parser *parser = &psock->parser;
        // code omitted
        parser->saved_data_ready = sk->sk_data_ready;    /* remember the previous callback */
        sk->sk_data_ready = sk_psock_strp_data_ready;
        sk->sk_write_space = sk_psock_write_space;
        parser->enabled = true;
    }

    替换之后,当有 TCP 报文准备上送时,用户定义的 sk->sk_data_ready 函数就会被调用,在该函数中,KTLS/psock 需要调用框架函数 strp_data_ready() 将报文转交给 strparser 框架。

    对 KTLS

    /*
     * sk_data_ready replacement for KTLS: look up the software RX context
     * of the socket's TLS context and pass control to its strparser.
     */
    static void tls_data_ready(struct sock *sk)
    {
        struct tls_sw_context_rx *rx_ctx = tls_sw_ctx_rx(tls_get_ctx(sk));

        strp_data_ready(&rx_ctx->strp);
    }
    View Code

    对 psock

    /*
     * sk_data_ready replacement for psock: inside an RCU read-side section,
     * fetch the socket's psock and, if there is one, call strp_data_ready()
     * on its parser while holding sk->sk_callback_lock via write_lock_bh().
     */
    static void sk_psock_strp_data_ready(struct sock *sk)
    {
        struct sk_psock *ps;

        rcu_read_lock();
        ps = sk_psock(sk);
        if (unlikely(!ps))
            goto out;

        write_lock_bh(&sk->sk_callback_lock);
        strp_data_ready(&ps->parser.strp);
        write_unlock_bh(&sk->sk_callback_lock);
    out:
        rcu_read_unlock();
    }
    strparser 处理报文

    strparser 框架拿到报文之后,通常会依次调用用户设置的 parse_msg 和 rcv_msg 回调函数,用户在回调函数里决定报文应该何去何从

    strp_data_ready
      |- strp_read_sock
        |- tcp_read_sock
           |- strp_recv
             |- __strp_recv
               |- strp->cb.parse_msg(strp, head)
               ...
               |- strp->cb.rcv_msg(strp, head);
    比如对 KTLS, 就是将报文上送给应用层(AF_KTLS socket)
    
    /*
     * Excerpt of the KTLS message handler: take the tls_sock stored in the
     * parser socket's sk_user_data and queue the skb on it.
     */
    static void tls_queue(struct strparser *strp, struct sk_buff *skb)
    {
        struct tls_sock *tsk;
        
        // code omitted 
        tsk = strp->sk->sk_user_data;    /* the value tls_bind stored there */
        // code omitted 
        
        /* NOTE(review): `ret` is presumably declared in the omitted code. */
        ret = sock_queue_rcv_skb((struct sock *)tsk, skb);
        // code omitted 
    }

    而对于 psock, 则是运行 eBPF 程序,得到动作(verdict)。

    /*
     * Message handler for psock: run the attached skb_verdict eBPF program
     * on the message and apply the resulting verdict.
     *
     * When no verdict program is attached, the verdict stays at its initial
     * value, __SK_DROP.
     */
    static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
    {
        struct sk_psock *psock = sk_psock_from_strp(strp);
        struct bpf_prog *prog;
        int ret = __SK_DROP;

        rcu_read_lock();
        prog = READ_ONCE(psock->progs.skb_verdict);
        if (likely(prog)) {
            skb_orphan(skb);
            /* Clear any stale redirect target before running the program. */
            tcp_skb_bpf_redirect_clear(skb);
            /* A program that redirects returns SK_PASS. */
            ret = sk_psock_bpf_run(psock, prog, skb);
            /* Combine the program's return value with the redirect target it set. */
            ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
        }
        rcu_read_unlock();
        sk_psock_verdict_apply(psock, skb, ret);
    }

    strparser 这个框架并不限定如何处理报文,而只是在内核层面提供给了用户一个提前处理 TCP 报文的时机和一组回调函数,用户通过不同的回调函数可以实现不同的逻辑。

    https://switch-router.gitee.io/blog/strparser/-----------------------------------------*************************------------------------------------------------------





  • 相关阅读:
    CTFHUB-技能树-Web-文件上传
    CTFHUB-技能树-Web-XSS-反射型
    一个算法习题
    python2和python3如何共存并都可用
    beacon帧字段结构最全总结(一)——beacon基本结构
    WiFi基础知识整理
    无线 WIFI 的13个信道频率范围
    Wifi模块的工作原理
    WIFI无线协议802.11a/b/g/n/ac的演变以及区别
    HTTP网络通讯协议详解
  • 原文地址:https://www.cnblogs.com/codestack/p/13947183.html
Copyright © 2011-2022 走看看