zoukankan      html  css  js  c++  java
  • PostgreSQL在何处处理 sql查询之六十四

    Path的定义:

    /*
     * Type "Path" is used as-is for sequential-scan paths, as well as some other
     * simple plan types that we don't need any extra information in the path for.
     * For other path types it is the first component of a larger struct.
     *
     * "pathtype" is the NodeTag of the Plan node we could build from this Path.
     * It is partially redundant with the Path's NodeTag, but allows us to use
     * the same Path type for multiple Plan types when there is no need to
     * distinguish the Plan type during path processing.
     *
     * "param_info", if not NULL, links to a ParamPathInfo that identifies outer
     * relation(s) that provide parameter values to each scan of this path.
     * That means this path can only be joined to those rels by means of nestloop
     * joins with this path on the inside.    Also note that a parameterized path
     * is responsible for testing all "movable" joinclauses involving this rel
     * and the specified outer rel(s).
     *
     * "rows" is the same as parent->rows in simple paths, but in parameterized
     * paths and UniquePaths it can be less than parent->rows, reflecting the
     * fact that we've filtered by extra join conditions or removed duplicates.
     *
     * "pathkeys" is a List of PathKey nodes (see above), describing the sort
     * ordering of the path's output rows.
     */
    typedef struct Path
    {
        NodeTag        type;
    
        NodeTag        pathtype;        /* tag identifying scan/join method */
    
        RelOptInfo *parent;            /* the relation this path can build */
        ParamPathInfo *param_info;    /* parameterization info, or NULL if none */
    
        /* estimated size/costs for path (see costsize.c for more info) */
        double        rows;            /* estimated number of result tuples */
        Cost        startup_cost;    /* cost expended before fetching any tuples */
        Cost        total_cost;        /* total cost (assuming all tuples fetched) */
    
        List       *pathkeys;        /* sort ordering of path's output */
        /* pathkeys is a List of PathKey nodes; see above */
    } Path;

    hash join 的path计算是在这里:

    /*
     * hash_inner_and_outer
     *      Create hashjoin join paths by explicitly hashing both the outer and
     *      inner keys of each available hash clause.
     *
     * 'joinrel' is the join relation
     * 'outerrel' is the outer join relation
     * 'innerrel' is the inner join relation
     * 'restrictlist' contains all of the RestrictInfo nodes for restriction
     *        clauses that apply to this join
     * 'jointype' is the type of join to do
     * 'sjinfo' is extra info about the join for selectivity estimation
     * 'semifactors' contains valid data if jointype is SEMI or ANTI
     * 'param_source_rels' are OK targets for parameterization of result paths
     */
    static void
    hash_inner_and_outer(PlannerInfo *root,
                         RelOptInfo *joinrel,
                         RelOptInfo *outerrel,
                         RelOptInfo *innerrel,
                         List *restrictlist,
                         JoinType jointype,
                         SpecialJoinInfo *sjinfo,
                         SemiAntiJoinFactors *semifactors,
                         Relids param_source_rels)
    {
        bool        isouterjoin = IS_OUTER_JOIN(jointype);
        List       *hashclauses;
        ListCell   *l;
    
        /*
         * We need to build only one hashclauses list for any given pair of outer
         * and inner relations; all of the hashable clauses will be used as keys.
         *
         * Scan the join's restrictinfo list to find hashjoinable clauses that are
         * usable with this pair of sub-relations.
         */
        hashclauses = NIL;
        foreach(l, restrictlist)
        {
            RestrictInfo *restrictinfo = (RestrictInfo *) lfirst(l);
    
            /*
             * If processing an outer join, only use its own join clauses for
             * hashing.  For inner joins we need not be so picky.
             */
            if (isouterjoin && restrictinfo->is_pushed_down)
                continue;
    
            if (!restrictinfo->can_join ||
                restrictinfo->hashjoinoperator == InvalidOid)
                continue;            /* not hashjoinable */
    
            /*
             * Check if clause has the form "outer op inner" or "inner op outer".
             */
            if (!clause_sides_match_join(restrictinfo, outerrel, innerrel))
                continue;            /* no good for these input relations */
    
            hashclauses = lappend(hashclauses, restrictinfo);
        }
    
    
        /* If we found any usable hashclauses, make paths */
        if (hashclauses)
        {
            /*
             * We consider both the cheapest-total-cost and cheapest-startup-cost
             * outer paths.  There's no need to consider any but the
             * cheapest-total-cost inner path, however.
             */
            Path       *cheapest_startup_outer = outerrel->cheapest_startup_path;
            Path       *cheapest_total_outer = outerrel->cheapest_total_path;
            Path       *cheapest_total_inner = innerrel->cheapest_total_path;
    
            /* Unique-ify if need be; we ignore parameterized possibilities */
            if (jointype == JOIN_UNIQUE_OUTER)
            {
    
                fprintf(stderr,"add_paths_to_joinrel--------1\n");
    
                cheapest_total_outer = (Path *)
                    create_unique_path(root, outerrel,
                                       cheapest_total_outer, sjinfo);
                Assert(cheapest_total_outer);
                jointype = JOIN_INNER;
                try_hashjoin_path(root,
                                  joinrel,
                                  jointype,
                                  sjinfo,
                                  semifactors,
                                  param_source_rels,
                                  cheapest_total_outer,
                                  cheapest_total_inner,
                                  restrictlist,
                                  hashclauses);
                /* no possibility of cheap startup here */
            }
            else if (jointype == JOIN_UNIQUE_INNER)
            {
    
                fprintf(stderr,"add_paths_to_joinrel--------1\n");
    
                cheapest_total_inner = (Path *)
                    create_unique_path(root, innerrel,
                                       cheapest_total_inner, sjinfo);
                Assert(cheapest_total_inner);
                jointype = JOIN_INNER;
                try_hashjoin_path(root,
                                  joinrel,
                                  jointype,
                                  sjinfo,
                                  semifactors,
                                  param_source_rels,
                                  cheapest_total_outer,
                                  cheapest_total_inner,
                                  restrictlist,
                                  hashclauses);
                if (cheapest_startup_outer != cheapest_total_outer)
                    try_hashjoin_path(root,
                                      joinrel,
                                      jointype,
                                      sjinfo,
                                      semifactors,
                                      param_source_rels,
                                      cheapest_startup_outer,
                                      cheapest_total_inner,
                                      restrictlist,
                                      hashclauses);
            }
            else
            {
    
                fprintf(stderr,"add_paths_to_joinrel--------3\n");
    
                /*
                 * For other jointypes, we consider the cheapest startup outer
                 * together with the cheapest total inner, and then consider
                 * pairings of cheapest-total paths including parameterized ones.
                 * There is no use in generating parameterized paths on the basis
                 * of possibly cheap startup cost, so this is sufficient.
                 */
                ListCell   *lc1;
                ListCell   *lc2;
    
                try_hashjoin_path(root,
                                  joinrel,
                                  jointype,
                                  sjinfo,
                                  semifactors,
                                  param_source_rels,
                                  cheapest_startup_outer,
                                  cheapest_total_inner,
                                  restrictlist,
                                  hashclauses);
    
                foreach(lc1, outerrel->cheapest_parameterized_paths)
                {
                    Path       *outerpath = (Path *) lfirst(lc1);
    
                    /*
                     * We cannot use an outer path that is parameterized by the
                     * inner rel.
                     */
                    if (bms_overlap(PATH_REQ_OUTER(outerpath), innerrel->relids))
                        continue;
    
                    foreach(lc2, innerrel->cheapest_parameterized_paths)
                    {
                        Path       *innerpath = (Path *) lfirst(lc2);
    
                        /*
                         * We cannot use an inner path that is parameterized by
                         * the outer rel, either.
                         */
                        if (bms_overlap(PATH_REQ_OUTER(innerpath),
                                        outerrel->relids))
                            continue;
    
                        if (outerpath == cheapest_startup_outer &&
                            innerpath == cheapest_total_inner)
                            continue;        /* already tried it */
    
                        try_hashjoin_path(root,
                                          joinrel,
                                          jointype,
                                          sjinfo,
                                          semifactors,
                                          param_source_rels,
                                          outerpath,
                                          innerpath,
                                          restrictlist,
                                          hashclauses);
                    }
                }
            }
        }
    }

    对我的查询,进行简化:

    postgres=# select * from sales s inner join customers c on c.cust_id = s.cust_id;
     cust_id |   item   | cust_id | cust_name 
    ---------+----------+---------+-----------
           2 | camera   |       2 | John Doe
           3 | computer |       3 | Jane Doe
           3 | monitor  |       3 | Jane Doe
    (3 rows)
    
    postgres=# 

    简化后:

    static void
    hash_inner_and_outer(PlannerInfo *root,
                         RelOptInfo *joinrel,
                         RelOptInfo *outerrel,
                         RelOptInfo *innerrel,
                         List *restrictlist,
                         JoinType jointype,
                         SpecialJoinInfo *sjinfo,
                         SemiAntiJoinFactors *semifactors,
                         Relids param_source_rels)
    {
        ...
        hashclauses = NIL;
        foreach(l, restrictlist)
        {
            ...
            hashclauses = lappend(hashclauses, restrictinfo);
        }
    
    
        /* If we found any usable hashclauses, make paths */
        if (hashclauses)
        {
            ...
            Path       *cheapest_startup_outer = outerrel->cheapest_startup_path;
            Path       *cheapest_total_outer = outerrel->cheapest_total_path;
            Path       *cheapest_total_inner = innerrel->cheapest_total_path;
    
            /* Unique-ify if need be; we ignore parameterized possibilities */
            if (jointype == JOIN_UNIQUE_OUTER)
            {
               ...
            }
            else if (jointype == JOIN_UNIQUE_INNER)
            {
               ...
            }
            else
            {
    /*
                 * For other jointypes, we consider the cheapest startup outer
                 * together with the cheapest total inner, and then consider
                 * pairings of cheapest-total paths including parameterized ones.
                 * There is no use in generating parameterized paths on the basis
                 * of possibly cheap startup cost, so this is sufficient.
                 */
                ListCell   *lc1;
                ListCell   *lc2;
    
                try_hashjoin_path(root,
                                  joinrel,
                                  jointype,
                                  sjinfo,
                                  semifactors,
                                  param_source_rels,
                                  cheapest_startup_outer,
                                  cheapest_total_inner,
                                  restrictlist,
                                  hashclauses);
    
               ...
            }
        }
    }

    而 try_hashjoin_path是如何作呢?

    /*
     * try_hashjoin_path
     *      Consider a hash join path; if it appears useful, push it into
     *      the joinrel's pathlist via add_path().
     */
    static void
    try_hashjoin_path(PlannerInfo *root,
                      RelOptInfo *joinrel,
                      JoinType jointype,
                      SpecialJoinInfo *sjinfo,
                      SemiAntiJoinFactors *semifactors,
                      Relids param_source_rels,
                      Path *outer_path,
                      Path *inner_path,
                      List *restrict_clauses,
                      List *hashclauses)
    {
        Relids        required_outer;
        JoinCostWorkspace workspace;
    
        /*
         * Check to see if proposed path is still parameterized, and reject if the
         * parameterization wouldn't be sensible.
         */
        required_outer = calc_non_nestloop_required_outer(outer_path,
                                                          inner_path);
        if (required_outer &&
            !bms_overlap(required_outer, param_source_rels))
        {
            /* Waste no memory when we reject a path here */
            bms_free(required_outer);
            return;
        }
    
        /*
         * See comments in try_nestloop_path().  Also note that hashjoin paths
         * never have any output pathkeys, per comments in create_hashjoin_path.
         */
        initial_cost_hashjoin(root, &workspace, jointype, hashclauses,
                              outer_path, inner_path,
                              sjinfo, semifactors);
    
        if (add_path_precheck(joinrel,
                              workspace.startup_cost, workspace.total_cost,
                              NIL, required_outer))
        {
            add_path(joinrel, (Path *)
                     create_hashjoin_path(root,
                                          joinrel,
                                          jointype,
                                          &workspace,
                                          sjinfo,
                                          semifactors,
                                          outer_path,
                                          inner_path,
                                          restrict_clauses,
                                          required_outer,
                                          hashclauses));
        }
        else
        {
            /* Waste no memory when we reject a path here */
            bms_free(required_outer);
        }
    }

    接着看 create_hashjoin_path函数:

    /*
     * create_hashjoin_path
     *      Creates a pathnode corresponding to a hash join between two relations.
     *
     * 'joinrel' is the join relation
     * 'jointype' is the type of join required
     * 'workspace' is the result from initial_cost_hashjoin
     * 'sjinfo' is extra info about the join for selectivity estimation
     * 'semifactors' contains valid data if jointype is SEMI or ANTI
     * 'outer_path' is the cheapest outer path
     * 'inner_path' is the cheapest inner path
     * 'restrict_clauses' are the RestrictInfo nodes to apply at the join
     * 'required_outer' is the set of required outer rels
     * 'hashclauses' are the RestrictInfo nodes to use as hash clauses
     *        (this should be a subset of the restrict_clauses list)
     */
    HashPath *
    create_hashjoin_path(PlannerInfo *root,
                         RelOptInfo *joinrel,
                         JoinType jointype,
                         JoinCostWorkspace *workspace,
                         SpecialJoinInfo *sjinfo,
                         SemiAntiJoinFactors *semifactors,
                         Path *outer_path,
                         Path *inner_path,
                         List *restrict_clauses,
                         Relids required_outer,
                         List *hashclauses)
    {
        HashPath   *pathnode = makeNode(HashPath);
    
        pathnode->jpath.path.pathtype = T_HashJoin;
        pathnode->jpath.path.parent = joinrel;
        pathnode->jpath.path.param_info =
            get_joinrel_parampathinfo(root,
                                      joinrel,
                                      outer_path,
                                      inner_path,
                                      sjinfo,
                                      required_outer,
                                      &restrict_clauses);
    
        /*
         * A hashjoin never has pathkeys, since its output ordering is
         * unpredictable due to possible batching.    XXX If the inner relation is
         * small enough, we could instruct the executor that it must not batch,
         * and then we could assume that the output inherits the outer relation's
         * ordering, which might save a sort step.    However there is considerable
         * downside if our estimate of the inner relation size is badly off. For
         * the moment we don't risk it.  (Note also that if we wanted to take this
         * seriously, joinpath.c would have to consider many more paths for the
         * outer rel than it does now.)
         */
        pathnode->jpath.path.pathkeys = NIL;
        pathnode->jpath.jointype = jointype;
        pathnode->jpath.outerjoinpath = outer_path;
        pathnode->jpath.innerjoinpath = inner_path;
        pathnode->jpath.joinrestrictinfo = restrict_clauses;
        pathnode->path_hashclauses = hashclauses;
        /* final_cost_hashjoin will fill in pathnode->num_batches */
    
        final_cost_hashjoin(root, pathnode, workspace, sjinfo, semifactors);
    
        return pathnode;
    }

    再往下看 ,final_cost_hashjoin:

    /*
     * final_cost_hashjoin
     *      Final estimate of the cost and result size of a hashjoin path.
     *
     * Note: the numbatches estimate is also saved into 'path' for use later
     *
     * 'path' is already filled in except for the rows and cost fields and
     *        num_batches
     * 'workspace' is the result from initial_cost_hashjoin
     * 'sjinfo' is extra info about the join for selectivity estimation
     * 'semifactors' contains valid data if path->jointype is SEMI or ANTI
     */
    void
    final_cost_hashjoin(PlannerInfo *root, HashPath *path,
                        JoinCostWorkspace *workspace,
                        SpecialJoinInfo *sjinfo,
                        SemiAntiJoinFactors *semifactors)
    {
        Path       *outer_path = path->jpath.outerjoinpath;
        Path       *inner_path = path->jpath.innerjoinpath;
        double        outer_path_rows = outer_path->rows;
        double        inner_path_rows = inner_path->rows;
        List       *hashclauses = path->path_hashclauses;
        Cost        startup_cost = workspace->startup_cost;
        Cost        run_cost = workspace->run_cost;
        int            numbuckets = workspace->numbuckets;
        int            numbatches = workspace->numbatches;
        Cost        cpu_per_tuple;
        QualCost    hash_qual_cost;
        QualCost    qp_qual_cost;
        double        hashjointuples;
        double        virtualbuckets;
        Selectivity innerbucketsize;
        ListCell   *hcl;
    
        /* Mark the path with the correct row estimate */
        if (path->jpath.path.param_info)
            path->jpath.path.rows = path->jpath.path.param_info->ppi_rows;
        else
            path->jpath.path.rows = path->jpath.path.parent->rows;
    
        /*
         * We could include disable_cost in the preliminary estimate, but that
         * would amount to optimizing for the case where the join method is
         * disabled, which doesn't seem like the way to bet.
         */
        if (!enable_hashjoin)
            startup_cost += disable_cost;
    
        /* mark the path with estimated # of batches */
        path->num_batches = numbatches;
    
        /* and compute the number of "virtual" buckets in the whole join */
        virtualbuckets = (double) numbuckets *(double) numbatches;
    
        /*
         * Determine bucketsize fraction for inner relation.  We use the smallest
         * bucketsize estimated for any individual hashclause; this is undoubtedly
         * conservative.
         *
         * BUT: if inner relation has been unique-ified, we can assume it's good
         * for hashing.  This is important both because it's the right answer, and
         * because we avoid contaminating the cache with a value that's wrong for
         * non-unique-ified paths.
         */
        if (IsA(inner_path, UniquePath))
            innerbucketsize = 1.0 / virtualbuckets;
        else
        {
            innerbucketsize = 1.0;
            foreach(hcl, hashclauses)
            {
                RestrictInfo *restrictinfo = (RestrictInfo *) lfirst(hcl);
                Selectivity thisbucketsize;
    
                Assert(IsA(restrictinfo, RestrictInfo));
    
                /*
                 * First we have to figure out which side of the hashjoin clause
                 * is the inner side.
                 *
                 * Since we tend to visit the same clauses over and over when
                 * planning a large query, we cache the bucketsize estimate in the
                 * RestrictInfo node to avoid repeated lookups of statistics.
                 */
                if (bms_is_subset(restrictinfo->right_relids,
                                  inner_path->parent->relids))
                {
                    /* righthand side is inner */
                    thisbucketsize = restrictinfo->right_bucketsize;
                    if (thisbucketsize < 0)
                    {
                        /* not cached yet */
                        thisbucketsize =
                            estimate_hash_bucketsize(root,
                                               get_rightop(restrictinfo->clause),
                                                     virtualbuckets);
                        restrictinfo->right_bucketsize = thisbucketsize;
                    }
                }
                else
                {
                    Assert(bms_is_subset(restrictinfo->left_relids,
                                         inner_path->parent->relids));
                    /* lefthand side is inner */
                    thisbucketsize = restrictinfo->left_bucketsize;
                    if (thisbucketsize < 0)
                    {
                        /* not cached yet */
                        thisbucketsize =
                            estimate_hash_bucketsize(root,
                                                get_leftop(restrictinfo->clause),
                                                     virtualbuckets);
                        restrictinfo->left_bucketsize = thisbucketsize;
                    }
                }
    
                if (innerbucketsize > thisbucketsize)
                    innerbucketsize = thisbucketsize;
            }
        }
    
        /*
         * Compute cost of the hashquals and qpquals (other restriction clauses)
         * separately.
         */
        cost_qual_eval(&hash_qual_cost, hashclauses, root);
        cost_qual_eval(&qp_qual_cost, path->jpath.joinrestrictinfo, root);
        qp_qual_cost.startup -= hash_qual_cost.startup;
        qp_qual_cost.per_tuple -= hash_qual_cost.per_tuple;
    
        /* CPU costs */
    
        if (path->jpath.jointype == JOIN_SEMI || path->jpath.jointype == JOIN_ANTI)
        {
            double        outer_matched_rows;
            Selectivity inner_scan_frac;
    
            /*
             * SEMI or ANTI join: executor will stop after first match.
             *
             * For an outer-rel row that has at least one match, we can expect the
             * bucket scan to stop after a fraction 1/(match_count+1) of the
             * bucket's rows, if the matches are evenly distributed.  Since they
             * probably aren't quite evenly distributed, we apply a fuzz factor of
             * 2.0 to that fraction.  (If we used a larger fuzz factor, we'd have
             * to clamp inner_scan_frac to at most 1.0; but since match_count is
             * at least 1, no such clamp is needed now.)
             */
            outer_matched_rows = rint(outer_path_rows * semifactors->outer_match_frac);
            inner_scan_frac = 2.0 / (semifactors->match_count + 1.0);
    
            startup_cost += hash_qual_cost.startup;
            run_cost += hash_qual_cost.per_tuple * outer_matched_rows *
                clamp_row_est(inner_path_rows * innerbucketsize * inner_scan_frac) * 0.5;
    
            /*
             * For unmatched outer-rel rows, the picture is quite a lot different.
             * In the first place, there is no reason to assume that these rows
             * preferentially hit heavily-populated buckets; instead assume they
             * are uncorrelated with the inner distribution and so they see an
             * average bucket size of inner_path_rows / virtualbuckets.  In the
             * second place, it seems likely that they will have few if any exact
             * hash-code matches and so very few of the tuples in the bucket will
             * actually require eval of the hash quals.  We don't have any good
             * way to estimate how many will, but for the moment assume that the
             * effective cost per bucket entry is one-tenth what it is for
             * matchable tuples.
             */
            run_cost += hash_qual_cost.per_tuple *
                (outer_path_rows - outer_matched_rows) *
                clamp_row_est(inner_path_rows / virtualbuckets) * 0.05;
    
            /* Get # of tuples that will pass the basic join */
            if (path->jpath.jointype == JOIN_SEMI)
                hashjointuples = outer_matched_rows;
            else
                hashjointuples = outer_path_rows - outer_matched_rows;
        }
        else
        {
            /*
             * The number of tuple comparisons needed is the number of outer
             * tuples times the typical number of tuples in a hash bucket, which
             * is the inner relation size times its bucketsize fraction.  At each
             * one, we need to evaluate the hashjoin quals.  But actually,
             * charging the full qual eval cost at each tuple is pessimistic,
             * since we don't evaluate the quals unless the hash values match
             * exactly.  For lack of a better idea, halve the cost estimate to
             * allow for that.
             */
            startup_cost += hash_qual_cost.startup;
            run_cost += hash_qual_cost.per_tuple * outer_path_rows *
                clamp_row_est(inner_path_rows * innerbucketsize) * 0.5;
    
            /*
             * Get approx # tuples passing the hashquals.  We use
             * approx_tuple_count here because we need an estimate done with
             * JOIN_INNER semantics.
             */
            hashjointuples = approx_tuple_count(root, &path->jpath, hashclauses);
        }
    
        /*
         * For each tuple that gets through the hashjoin proper, we charge
         * cpu_tuple_cost plus the cost of evaluating additional restriction
         * clauses that are to be applied at the join.    (This is pessimistic since
         * not all of the quals may get evaluated at each tuple.)
         */
        startup_cost += qp_qual_cost.startup;
        cpu_per_tuple = cpu_tuple_cost + qp_qual_cost.per_tuple;
        run_cost += cpu_per_tuple * hashjointuples;
    
        path->jpath.path.startup_cost = startup_cost;
        path->jpath.path.total_cost = startup_cost + run_cost;
    }
  • 相关阅读:
    .Net操作Sharepoint常用方法(1) 判断List或Library是否存在
    Aspose.Cells读取Excel获取DataTable方法
    Nginx系统学习笔记(1)Say Hello
    老刘Hadoop学习笔记(6)Hadoop 单机模式 伪分布式 完全分布式区别
    老刘Hadoop学习笔记(5)安装JDK
    老刘Hadoop学习笔记(4)JDK和Open JDK区别
    老刘Hadoop学习笔记(3)新建用户及用户组
    老刘Hadoop学习笔记(2)修改虚拟机hostname及hosts文件
    老刘Hadoop学习笔记(1)开始学习
    MySQL中变量为空时不执行where过滤
  • 原文地址:https://www.cnblogs.com/gaojian/p/3134078.html
Copyright © 2011-2022 走看看