PostgreSQL 源碼解讀(232)- 查詢#125(NOT IN實現#3)

發布時間:2020-08-11 00:16:21

本節介紹了PostgreSQL在實現含有NOT IN的查詢語句時,ExecMaterial函數所依賴的部分函數。



/* ----------------
 *        SubPlanState node
 * ----------------
typedef struct SubPlanState
    NodeTag        type;
    SubPlan    *subplan;        /* expression plan node */
    struct PlanState *planstate;    /* subselect plan's state tree */
    struct PlanState *parent;    /* parent plan node's state tree */
    ExprState  *testexpr;        /* 組合表達式狀態;state of combining expression */
    List       *args;            /* 參數表達式狀態;states of argument expression(s) */
    HeapTuple    curTuple;        /* subplan最近的元組;copy of most recent tuple from subplan */
    Datum        curArray;        /* most recent array from ARRAY() subplan */
    /* these are used when hashing the subselect's output: */
    TupleDesc    descRight;        /* 投影后的子查詢描述符;subselect desc after projection */
    ProjectionInfo *projLeft;    /* for projecting lefthand exprs */
    ProjectionInfo *projRight;    /* for projecting subselect output */
    TupleHashTable hashtable;    /* hash table for no-nulls subselect rows */
    TupleHashTable hashnulls;    /* hash table for rows with null(s) */
    bool        havehashrows;    /* true if hashtable is not empty */
    bool        havenullrows;    /* true if hashnulls is not empty */
    MemoryContext hashtablecxt; /* memory context containing hash tables */
    MemoryContext hashtempcxt;    /* temp memory context for hash tables */
    ExprContext *innerecontext; /* econtext for computing inner tuples */
    AttrNumber *keyColIdx;        /* control data for hash tables */
    Oid           *tab_eq_funcoids;    /* equality func oids for table
                                     * datatype(s) */
    Oid           *tab_collations; /* collations for hash and comparison */
    FmgrInfo   *tab_hash_funcs; /* hash functions for table datatype(s) */
    FmgrInfo   *tab_eq_funcs;    /* equality functions for table datatype(s) */
    FmgrInfo   *lhs_hash_funcs; /* hash functions for lefthand datatype(s) */
    FmgrInfo   *cur_eq_funcs;    /* equality functions for LHS vs. table */
    ExprState  *cur_eq_comp;    /* equality comparator for LHS vs. table */
} SubPlanState;


 * SubPlan - executable expression node for a subplan (sub-SELECT)
 * The planner replaces SubLink nodes in expression trees with SubPlan
 * nodes after it has finished planning the subquery.  SubPlan references
 * a sub-plantree stored in the subplans list of the toplevel PlannedStmt.
 * (We avoid a direct link to make it easier to copy expression trees
 * without causing multiple processing of the subplan.)
 * 查詢規劃器在完成子查詢的規劃后使用SubPlan節點替換表達式樹中的SubLink節點。
 * SubPlan引用了存儲在頂層PlannedStmt中的subplans鏈表中的sub-plantree。
 * (避免使用直接鏈接,使得拷貝表達式樹更容易,且不會導致subplan被重復處理)
 * In an ordinary subplan, testexpr points to an executable expression
 * (OpExpr, an AND/OR tree of OpExprs, or RowCompareExpr) for the combining
 * operator(s); the left-hand arguments are the original lefthand expressions,
 * and the right-hand arguments are PARAM_EXEC Param nodes representing the
 * outputs of the sub-select.  (NOTE: runtime coercion functions may be
 * inserted as well.)  This is just the same expression tree as testexpr in
 * the original SubLink node, but the PARAM_SUBLINK nodes are replaced by
 * suitably numbered PARAM_EXEC nodes.
 * 常規情況下,testexpr指向用于組合操作符的可執行表達式(OpExpr、OpExprs的AND/OR樹或者RowCompareExpr);
 * 左參數是原始的左表達式,右參數是PARAM_EXEC參數節點用以表示子查詢的輸出(注意:也可能插入運行時類型轉換函數)。
 * 與原始SubLink節點的testexpr具有相同的表達式樹,但PARAM_SUBLINK節點則使用合適的已編號PARAM_EXEC節點替代。
 * If the sub-select becomes an initplan rather than a subplan, the executable
 * expression is part of the outer plan's expression tree (and the SubPlan
 * node itself is not, but rather is found in the outer plan's initPlan
 * list).  In this case testexpr is NULL to avoid duplication.
 * 如果子查詢成了initplan而不是subplan,可執行的表達式是外層plan表達式樹的一部分
 * (SubPlan節點本身則不在其中,而是位于外層plan的initPlan鏈表中)。
 * 這種情況下,testexpr為NULL以避免重復。
 * The planner also derives lists of the values that need to be passed into
 * and out of the subplan.  Input values are represented as a list "args" of
 * expressions to be evaluated in the outer-query context (currently these
 * args are always just Vars, but in principle they could be any expression).
 * The values are assigned to the global PARAM_EXEC params indexed by parParam
 * (the parParam and args lists must have the same ordering).  setParam is a
 * list of the PARAM_EXEC params that are computed by the sub-select, if it
 * is an initplan; they are listed in order by sub-select output column
 * position.  (parParam and setParam are integer Lists, not Bitmapsets,
 * because their ordering is significant.)
 * 規劃器還派生了需要傳入和傳出子計劃的值的鏈表。
 * 輸入值表示為表達式的“args”鏈表,在外層查詢上下文中進行求值。
 * (目前這些args總是Vars,但原則上它們可以是任意表達式)
 * 這些值賦給以parParam為索引的全局PARAM_EXEC參數(parParam和args鏈表的順序必須一致)。
 * setParam是PARAM_EXEC參數鏈表,通過子查詢(如為initplan)計算所得。
 * 它們按子查詢輸出列的位置進行排序組織為鏈表形式。
 * (parParam和setParam是整型鏈表,而不是Bitmapsets,因為它們的順序是有意義的)
 * Also, the planner computes startup and per-call costs for use of the
 * SubPlan.  Note that these include the cost of the subquery proper,
 * evaluation of the testexpr if any, and any hashtable management overhead.
 * 同時,規劃器計算使用SubPlan的啟動成本和每次調用的成本。注意:這些成本包括子查詢本身的成本、testexpr(如有)的求值成本以及哈希表管理開銷。
typedef struct SubPlan
    Expr        xpr;//表達(dá)式
    /* Fields copied from original SubLink: */
    SubLinkType subLinkType;    /* see above */
    /* The combining operators, transformed to an executable expression: */
    Node       *testexpr;        /* OpExpr or RowCompareExpr expression tree */
    List       *paramIds;        /* 參數(shù)IDs;IDs of Params embedded in the above */
    /* Identification of the Plan tree to use: */
    //Plan tree標(biāo)識(shí)
    int            plan_id;        /* Index (from 1) in PlannedStmt.subplans */
    /* Identification of the SubPlan for EXPLAIN and debugging purposes: */
    char       *plan_name;        /* A name assigned during planning */
    /* Extra data useful for determining subplan's output type: */
    Oid            firstColType;    /* subplan結(jié)果的第一個(gè)列類型;Type of first column of subplan result */
    int32        firstColTypmod; /* 第一列的Typmod;Typmod of first column of subplan result */
    Oid            firstColCollation;    /* 第一列的Collation;Collation of first column of subplan
                                     * result */
    /* Information about execution strategy: */
    bool        useHashTable;    /* 是否使用哈希表存儲(chǔ)子查詢輸出;true to store subselect output in a hash
                                 * table (implies we are doing "IN") */
    bool        unknownEqFalse; /* 如結果為UNKNOWN時允許返回FALSE則為T,可大幅簡化null值處理;true if it's okay to return FALSE when the
                                 * spec result is UNKNOWN; this allows much
                                 * simpler handling of null values */
    bool        parallel_safe;    /* 是否并行安全?is the subplan parallel-safe? */
    /* Note: parallel_safe does not consider contents of testexpr or args */
    /* Information for passing params into and out of the subselect: */
    /* setParam and parParam are lists of integers (param IDs) */
    //setParam和parParam是整型鏈表(param IDs)
    List       *setParam;        /* initplan subqueries have to set these
                                 * Params for parent plan */
    List       *parParam;        /* indices of input Params from parent plan */
    List       *args;            /* 以parParam值進(jìn)行傳遞的表達(dá)式;exprs to pass as parParam values */
    /* Estimated execution costs: */
    Cost        startup_cost;    /* one-time setup cost */
    Cost        per_call_cost;    /* cost for each subplan evaluation */
} SubPlan;


 * SubLink
 * A SubLink represents a subselect appearing in an expression, and in some
 * cases also the combining operator(s) just above it.  The subLinkType
 * indicates the form of the expression represented:
 *    ALL_SUBLINK            (lefthand) op ALL (SELECT ...)
 *    ANY_SUBLINK            (lefthand) op ANY (SELECT ...)
 *    ROWCOMPARE_SUBLINK    (lefthand) op (SELECT ...)
 *    EXPR_SUBLINK        (SELECT with single targetlist item ...)
 *    MULTIEXPR_SUBLINK    (SELECT with multiple targetlist items ...)
 *    ARRAY_SUBLINK        ARRAY(SELECT with single targetlist item ...)
 *    CTE_SUBLINK            WITH query (never actually part of an expression)
 *  SubLink表示在表達式中出現的子查詢,在某些情況下還包括緊鄰其上的組合操作符。
 *  subLinkType表示表達式的形式:
 *    ALL_SUBLINK            (lefthand) op ALL (SELECT ...)
 *    ANY_SUBLINK            (lefthand) op ANY (SELECT ...)
 *    ROWCOMPARE_SUBLINK    (lefthand) op (SELECT ...)
 *    EXPR_SUBLINK        (SELECT with single targetlist item ...)
 *    MULTIEXPR_SUBLINK    (SELECT with multiple targetlist items ...)
 *    ARRAY_SUBLINK        ARRAY(SELECT with single targetlist item ...)
 *    CTE_SUBLINK            WITH query (never actually part of an expression) 
 * For ALL, ANY, and ROWCOMPARE, the lefthand is a list of expressions of the
 * same length as the subselect's targetlist.  ROWCOMPARE will *always* have
 * a list with more than one entry; if the subselect has just one target
 * then the parser will create an EXPR_SUBLINK instead (and any operator
 * above the subselect will be represented separately).
 * ROWCOMPARE, EXPR, and MULTIEXPR require the subselect to deliver at most
 * one row (if it returns no rows, the result is NULL).
 * ALL, ANY, and ROWCOMPARE require the combining operators to deliver boolean
 * results.  ALL and ANY combine the per-row results using AND and OR
 * semantics respectively.
 * ARRAY requires just one target column, and creates an array of the target
 * column's type using any number of rows resulting from the subselect.
 * 對于ALL,ANY和ROWCOMPARE,左操作數是與子查詢目標鏈表長度一致的表達式鏈表。
 * ROWCOMPARE的鏈表總是有超過一個條目;如果子查詢剛好只有一個目標列,那么解析器會創建EXPR_SUBLINK
 * (同時所有在子查詢之上的操作符會單獨表示)
 * ROWCOMPARE, EXPR, 和MULTIEXPR要求子查詢至多輸出一行(如返回0行,則結果為NULL)。
 * ALL,ANY和ROWCOMPARE要求組合操作符輸出布爾型結果。
 * ALL/ANY分別使用AND/OR語義來組合每一行的結果。
 * ARRAY要求只有一個目標列,并使用子查詢返回的任意行數創建該列類型的數組。
 * SubLink is classed as an Expr node, but it is not actually executable;
 * it must be replaced in the expression tree by a SubPlan node during
 * planning.
 * SubLink歸類為Expr節點,但實際上并不是可執行的,必須在計劃階段通過SubPlan替代。
 * NOTE: in the raw output of gram.y, testexpr contains just the raw form
 * of the lefthand expression (if any), and operName is the String name of
 * the combining operator.  Also, subselect is a raw parsetree.  During parse
 * analysis, the parser transforms testexpr into a complete boolean expression
 * that compares the lefthand value(s) to PARAM_SUBLINK nodes representing the
 * output columns of the subselect.  And subselect is transformed to a Query.
 * This is the representation seen in saved rules and in the rewriter.
 * 注意:在gram.y的裸輸出中,testexpr只包含左表達式的裸形式(如有),operName是組合操作符的字符串名稱。
 * 同時,子查詢是裸parsetree。在解析分析期間,
 * 解析器轉換testexpr為完整的布爾表達式用于比較左操作數值與PARAM_SUBLINK節點所代表的子查詢輸出列值。
 * 子查詢會轉換為Query結構體。
 * 這是在已存儲的規則和重寫器中所見的表示形式。
 * In EXISTS, EXPR, MULTIEXPR, and ARRAY SubLinks, testexpr and operName
 * are unused and are always null.
 * 在EXISTS/EXPR/MULTIEXPR/ARRAY SubLinks中,testexpr和operName未被使用,且總是NULL值。
 * subLinkId is currently used only for MULTIEXPR SubLinks, and is zero in
 * other SubLinks.  This number identifies different multiple-assignment
 * subqueries within an UPDATE statement's SET list.  It is unique only
 * within a particular targetlist.  The output column(s) of the MULTIEXPR
 * are referenced by PARAM_MULTIEXPR Params appearing elsewhere in the tlist.
 * subLinkId當前只用于MULTIEXPR,在其他SubLinks中取值為0。
 * 該數字標識了在UPDATE語句SET鏈表中不同的多重賦值子查詢。
 * 只有在特定的targetlist內是唯一的。
 * MULTIEXPR的輸出列由出現在tlist其他地方的PARAM_MULTIEXPR參數引用。
 * The CTE_SUBLINK case never occurs in actual SubLink nodes, but it is used
 * in SubPlans generated for WITH subqueries.
 * CTE_SUBLINK不會出現在實際的SubLink節點中,但用于WITH子查詢所產生的SubPlans中。
typedef enum SubLinkType
    CTE_SUBLINK                    /* 僅用于SubPlans中;for SubPlans only */
} SubLinkType;


typedef struct SubLink
    Expr        xpr;
    SubLinkType subLinkType;    /* see above */
    int            subLinkId;        /* ID (1..n); 0 if not MULTIEXPR */
    Node       *testexpr;        /* outer-query test for ALL/ANY/ROWCOMPARE */
    List       *operName;        /* originally specified operator name */
    Node       *subselect;        /* subselect as Query* or raw parsetree */
    int            location;        /* token location, or -1 if unknown */
} SubLink;


/* ----------------
 *     MaterialState information
 *        materialize nodes are used to materialize the results
 *        of a subplan into a temporary file.
 *        materialize節點用于物化subplan的結果為臨時文件。
 *        ss.ss_ScanTupleSlot refers to output of underlying plan.
 *        ss.ss_ScanTupleSlot指向underlying plan的輸出(subplan)
 * ----------------
typedef struct MaterialState
    ScanState    ss;                /* its first field is NodeTag */
    int            eflags;            /* 傳遞給tuplestore的capability標(biāo)記;capability flags to pass to tuplestore */
    bool        eof_underlying; /* 已經(jīng)到達(dá)underlying plan的末尾?reached end of underlying plan? */
    Tuplestorestate *tuplestorestate;
} MaterialState;



/* ----------------------------------------------------------------
 *        ExecMaterial
 *        As long as we are at the end of the data collected in the tuplestore,
 *        we collect one new row from the subplan on each call, and stash it
 *        aside in the tuplestore before returning it.  The tuplestore is
 *        only read if we are asked to scan backwards, rescan, or mark/restore.
 *      只要處于tuplestore已收集數據的末尾,就會在每次調用時從subplan中收集一條新行,
 *      并在返回之前將其保存在tuplestore中。
 *      只有在往后掃描、重新掃描或標記/恢復時tuplestore才會被讀取。
 * ----------------------------------------------------------------
static TupleTableSlot *            /* 從subplan中返回的結(jié)果;result tuple from subplan */
ExecMaterial(PlanState *pstate)
    MaterialState *node = castNode(MaterialState, pstate);//物化節(jié)點(diǎn)
    EState       *estate;//運(yùn)行期狀態(tài)
    ScanDirection dir;//掃描方向
    bool        forward;//是否往前掃描
    Tuplestorestate *tuplestorestate;//Tuplestorestate結(jié)構(gòu)體指針
    bool        eof_tuplestore;//是否完成?
    TupleTableSlot *slot;//存儲(chǔ)元組的slot
     * get state info from node
     * 從物化節點中獲取相關信息
    estate = node->ss.ps.state;
    dir = estate->es_direction;//方向
    forward = ScanDirectionIsForward(dir);//是否往前掃描
    tuplestorestate = node->tuplestorestate;
     * If first time through, and we need a tuplestore, initialize it.
     * 第一次執行且需要tuplestore時,對其進行初始化
    if (tuplestorestate == NULL && node->eflags != 0)
        tuplestorestate = tuplestore_begin_heap(true, false, work_mem);
        tuplestore_set_eflags(tuplestorestate, node->eflags);
        if (node->eflags & EXEC_FLAG_MARK)
             * Allocate a second read pointer to serve as the mark. We know it
             * must have index 1, so needn't store that.
             * 分配用于mark的讀指針
            int            ptrno PG_USED_FOR_ASSERTS_ONLY;
            ptrno = tuplestore_alloc_read_pointer(tuplestorestate,
            Assert(ptrno == 1);
        node->tuplestorestate = tuplestorestate;
     * If we are not at the end of the tuplestore, or are going backwards, try
     * to fetch a tuple from tuplestore.
     * 如果不在tuplestore的末尾或者正在往后掃描,嘗試從tuplestore中提取一個元組
    eof_tuplestore = (tuplestorestate == NULL) ||
    if (!forward && eof_tuplestore)
        if (!node->eof_underlying)
             * When reversing direction at tuplestore EOF, the first
             * gettupleslot call will fetch the last-added tuple; but we want
             * to return the one before that, if possible. So do an extra
             * fetch.
             * 在EOF處反轉方向,第一次的gettupleslot調用會提取最后添加的元組;
             * 但如可能,希望返回在此之前的元組,因此執行一次額外的提取操作。
            if (!tuplestore_advance(tuplestorestate, forward))
                return NULL;    /* the tuplestore must be empty */
        eof_tuplestore = false;
     * If we can fetch another tuple from the tuplestore, return it.
     * 如能從tuplestore中提取另外一個(gè)tuple,返回
    slot = node->ss.ps.ps_ResultTupleSlot;
    if (!eof_tuplestore)
        if (tuplestore_gettupleslot(tuplestorestate, forward, false, slot))
            return slot;
        if (forward)
            eof_tuplestore = true;
     * If necessary, try to fetch another row from the subplan.
     * 如需要(tuplestore末尾),嘗試從subplan中提取另外一行
     * Note: the eof_underlying state variable exists to short-circuit further
     * subplan calls.  It's not optional, unfortunately, because some plan
     * node types are not robust about being called again when they've already
     * returned NULL.
    if (eof_tuplestore && !node->eof_underlying)
        PlanState  *outerNode;
        TupleTableSlot *outerslot;
         * We can only get here with forward==true, so no need to worry about
         * which direction the subplan will go.
        outerNode = outerPlanState(node);
        outerslot = ExecProcNode(outerNode);
        if (TupIsNull(outerslot))
            node->eof_underlying = true;
            return NULL;
         * Append a copy of the returned tuple to tuplestore.  NOTE: because
         * the tuplestore is certainly in EOF state, its read position will
         * move forward over the added tuple.  This is what we want.
         * 追加返回元組的拷貝到tuplestore中。
         * 注意:因為tuplestore當前處于EOF狀態,其讀取位置會前移并越過新添加的tuple,這正是我們所期望的。
        if (tuplestorestate)
            tuplestore_puttupleslot(tuplestorestate, outerslot);
        ExecCopySlot(slot, outerslot);
        return slot;
     * Nothing left ...
    return ExecClearTuple(slot);


 * tuplestore_begin_heap
 * Create a new tuplestore; other types of tuple stores (other than
 * "heap" tuple stores, for heap tuples) are possible, but not presently
 * implemented.
 * 創建新的tuplestore:目前僅實現了heap tuples。
 * randomAccess: if true, both forward and backward accesses to the
 * tuple store are allowed.
 * randomAccess : 如為T,支持往前和往后訪問。
 * interXact: if true, the files used for on-disk storage persist beyond the
 * end of the current transaction.  NOTE: It's the caller's responsibility to
 * create such a tuplestore in a memory context and resource owner that will
 * also survive transaction boundaries, and to ensure the tuplestore is closed
 * when it's no longer wanted.
 * interXact : 如為T,磁盤上的存儲文件在當前事務結束后也會一直保持。
 * 注意:調用者有責任在能跨事務邊界存活的內存上下文和資源擁有者中創建tuplestore,并確保不再使用時關閉tuplestore。
 * maxKBytes: how much data to store in memory (any data beyond this
 * amount is paged to disk).  When in doubt, use work_mem.
 * maxKBytes:有多少數據需要存儲到內存中(超過此大小的數據會分頁到磁盤上)。
 * 如不確定,則使用work_mem。
Tuplestorestate *
tuplestore_begin_heap(bool randomAccess, bool interXact, int maxKBytes)
    Tuplestorestate *state;
    int            eflags;
     * This interpretation of the meaning of randomAccess is compatible with
     * the pre-8.3 behavior of tuplestores.
    eflags = randomAccess ?
    state = tuplestore_begin_common(eflags, interXact, maxKBytes);
    state->copytup = copytup_heap;
    state->writetup = writetup_heap;
    state->readtup = readtup_heap;
    return state;
 *        tuplestore_begin_xxx
 * Initialize for a tuple store operation.
 * 初始化tuplestore
static Tuplestorestate *
tuplestore_begin_common(int eflags, bool interXact, int maxKBytes)
    Tuplestorestate *state;
    state = (Tuplestorestate *) palloc0(sizeof(Tuplestorestate));
    state->status = TSS_INMEM;
    state->eflags = eflags;
    state->interXact = interXact;
    state->truncated = false;
    state->allowedMem = maxKBytes * 1024L;
    state->availMem = state->allowedMem;
    state->myfile = NULL;
    state->context = CurrentMemoryContext;
    state->resowner = CurrentResourceOwner;
    state->memtupdeleted = 0;
    state->memtupcount = 0;
    state->tuples = 0;
     * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD;
     * see comments in grow_memtuples().
    state->memtupsize = Max(16384 / sizeof(void *),
                            ALLOCSET_SEPARATE_THRESHOLD / sizeof(void *) + 1);
    state->growmemtuples = true;
    state->memtuples = (void **) palloc(state->memtupsize * sizeof(void *));
    USEMEM(state, GetMemoryChunkSpace(state->memtuples));
    state->activeptr = 0;
    state->readptrcount = 1;
    state->readptrsize = 8;        /* arbitrary */
    state->readptrs = (TSReadPointer *)
        palloc(state->readptrsize * sizeof(TSReadPointer));
    state->readptrs[0].eflags = eflags;
    state->readptrs[0].eof_reached = false;
    state->readptrs[0].current = 0;
    return state;


 * tuplestore_advance - exported function to adjust position without fetching
 * We could optimize this case to avoid palloc/pfree overhead, but for the
 * moment it doesn't seem worthwhile.
tuplestore_advance(Tuplestorestate *state, bool forward)
    void       *tuple;
    bool        should_free;
    tuple = tuplestore_gettuple(state, forward, &should_free);
    if (tuple)
        if (should_free)
        return true;
        return false;


 * tuplestore_gettupleslot - exported function to fetch a MinimalTuple
 * 提取MinimalTuple
 * If successful, put tuple in slot and return true; else, clear the slot
 * and return false.
 * 如成功,則把元組塞進slot中并返回T,否則清空slot返回F
 * If copy is true, the slot receives a copied tuple (allocated in current
 * memory context) that will stay valid regardless of future manipulations of
 * the tuplestore's state.  If copy is false, the slot may just receive a
 * pointer to a tuple held within the tuplestore.  The latter is more
 * efficient but the slot contents may be corrupted if additional writes to
 * the tuplestore occur.  (If using tuplestore_trim, see comments therein.)
 * 如copy為T,則slot會接收拷貝之后的元組,獨立于tuplestore的狀態.
 * 如copy為F,則slot可能接收到tuplestore中的元組指針.
 * 后者更高效,但如果之后對tuplestore有額外寫入,slot的內容可能被破壞.
tuplestore_gettupleslot(Tuplestorestate *state, bool forward,
                        bool copy, TupleTableSlot *slot)
    MinimalTuple tuple;
    bool        should_free;
    tuple = (MinimalTuple) tuplestore_gettuple(state, forward, &should_free);
    if (tuple)
        if (copy && !should_free)
            tuple = heap_copy_minimal_tuple(tuple);
            should_free = true;
        ExecStoreMinimalTuple(tuple, slot, should_free);
        return true;
        return false;


 * Fetch the next tuple in either forward or back direction.
 * Returns NULL if no more tuples.  If should_free is set, the
 * caller must pfree the returned tuple when done with it.
 * 往前/后返回下一個元組。
 * 如無更多元組,返回NULL。如should_free被置為true,調用者必須在處理完畢后pfree返回的元組
 * Backward scan is only allowed if randomAccess was set true or
 * EXEC_FLAG_BACKWARD was specified to tuplestore_set_eflags().
 * 僅在randomAccess設置為T或者通過tuplestore_set_eflags()指定EXEC_FLAG_BACKWARD時才允許往后掃描。
static void *
tuplestore_gettuple(Tuplestorestate *state, bool forward,
                    bool *should_free)
    TSReadPointer *readptr = &state->readptrs[state->activeptr];//讀取指針
    unsigned int tuplen;
    void       *tup;
    Assert(forward || (readptr->eflags & EXEC_FLAG_BACKWARD));
    switch (state->status)
        case TSS_INMEM://內(nèi)存中
            *should_free = false;
            if (forward)
                if (readptr->eof_reached)
                    return NULL;
                if (readptr->current < state->memtupcount)
                    /* We have another tuple, so return it */
                    return state->memtuples[readptr->current++];
                readptr->eof_reached = true;
                return NULL;
                 * if all tuples are fetched already then we return last
                 * tuple, else tuple before last returned.
                if (readptr->eof_reached)
                    readptr->current = state->memtupcount;
                    readptr->eof_reached = false;
                    if (readptr->current <= state->memtupdeleted)
                        return NULL;
                    readptr->current--; /* last returned tuple */
                if (readptr->current <= state->memtupdeleted)
                    return NULL;
                return state->memtuples[readptr->current - 1];
        case TSS_WRITEFILE://寫文件
            /* Skip state change if we'll just return NULL */
            if (readptr->eof_reached && forward)
                return NULL;
             * Switch from writing to reading.
             * 從寫切換至讀
                        &state->writepos_file, &state->writepos_offset);
            if (!readptr->eof_reached)
                if (BufFileSeek(state->myfile,
                                readptr->file, readptr->offset,
                                SEEK_SET) != 0)
                             errmsg("could not seek in tuplestore temporary file: %m")));
            state->status = TSS_READFILE;
            /* FALLTHROUGH */
        case TSS_READFILE:
            *should_free = true;
            if (forward)
                if ((tuplen = getlen(state, true)) != 0)
                    tup = READTUP(state, tuplen);
                    return tup;
                    readptr->eof_reached = true;
                    return NULL;
             * Backward.
             * 往后讀
             * if all tuples are fetched already then we return last tuple,
             * else tuple before last returned.
             * 如果所有元組均已提取,則返回最后一個元組,否則返回上次所返回元組的前一個元組
             * Back up to fetch previously-returned tuple's ending length
             * word. If seek fails, assume we are at start of file.
             * 回退以提取先前已返回元組的結束長度字,如seek失敗,假定處于文件的開始位置.
            if (BufFileSeek(state->myfile, 0, -(long) sizeof(unsigned int),
                            SEEK_CUR) != 0)
                /* even a failed backwards fetch gets you out of eof state */
                readptr->eof_reached = false;
                return NULL;
            tuplen = getlen(state, false);
            if (readptr->eof_reached)
                readptr->eof_reached = false;
                /* We will return the tuple returned before returning NULL */
                 * Back up to get ending length word of tuple before it.
                 * 回退以獲取其前一個元組的結束長度字
                if (BufFileSeek(state->myfile, 0,
                                -(long) (tuplen + 2 * sizeof(unsigned int)),
                                SEEK_CUR) != 0)
                     * If that fails, presumably the prev tuple is the first
                     * in the file.  Back up so that it becomes next to read
                     * in forward direction (not obviously right, but that is
                     * what in-memory case does).
                    if (BufFileSeek(state->myfile, 0,
                                    -(long) (tuplen + sizeof(unsigned int)),
                                    SEEK_CUR) != 0)
                                 errmsg("could not seek in tuplestore temporary file: %m")));
                    return NULL;
                tuplen = getlen(state, false);
             * Now we have the length of the prior tuple, back up and read it.
             * Note: READTUP expects we are positioned after the initial
             * length word of the tuple, so back up to that point.
             * 已獲得前一個元組的長度,回退并讀取之.
            if (BufFileSeek(state->myfile, 0,
                            -(long) tuplen,
                            SEEK_CUR) != 0)
                         errmsg("could not seek in tuplestore temporary file: %m")));
            tup = READTUP(state, tuplen);
            return tup;
            elog(ERROR, "invalid tuplestore state");
            return NULL;        /* keep compiler quiet */



[pg12@localhost ~]$ psql -d testdb
Timing is on.
Expanded display is used automatically.
psql (12.0)
Type "help" for help.
[local]:5432 pg12@testdb=# 
[local]:5432 pg12@testdb=# select * from tbl;
 id | value 
  1 |     2
(1 row)
Time: 2.678 ms
[local]:5432 pg12@testdb=# select count(*) from t_big_null;
(1 row)
Time: 679.972 ms
[local]:5432 pg12@testdb=# analyze tbl;
Time: 64.442 ms
[local]:5432 pg12@testdb=# analyze t_big_null;
Time: 434.702 ms
[local]:5432 pg12@testdb=# 
[local]:5432 pg12@testdb=# select pg_backend_pid();
(1 row)
Time: 1.990 ms
[local]:5432 pg12@testdb=# select * from tbl a where a.id not in (select b.id from t_big_null b);


(gdb) b ExecMaterial
Breakpoint 1 at 0x720edb: file nodeMaterial.c, line 41.
(gdb) c
Breakpoint 1, ExecMaterial (pstate=0x1230128) at nodeMaterial.c:41
41        MaterialState *node = castNode(MaterialState, pstate);


(gdb) n
54        estate = node->ss.ps.state;
55        dir = estate->es_direction;
56        forward = ScanDirectionIsForward(dir);
57        tuplestorestate = node->tuplestorestate;
62        if (tuplestorestate == NULL && node->eflags != 0)
64            tuplestorestate = tuplestore_begin_heap(true, false, work_mem);
65            tuplestore_set_eflags(tuplestorestate, node->eflags);
66            if (node->eflags & EXEC_FLAG_MARK)
78            node->tuplestorestate = tuplestorestate;
85        eof_tuplestore = (tuplestorestate == NULL) ||
86            tuplestore_ateof(tuplestorestate);
85        eof_tuplestore = (tuplestorestate == NULL) ||
88        if (!forward && eof_tuplestore)
(gdb) p eof_tuplestore
$1 = false


(gdb) n
107        slot = node->ss.ps.ps_ResultTupleSlot;
108        if (!eof_tuplestore)
110            if (tuplestore_gettupleslot(tuplestorestate, forward, false, slot))
(gdb) step
tuplestore_gettupleslot (state=0x3069c18, forward=true, copy=false, slot=0x30687a8)
    at tuplestore.c:1084
1084        tuple = (MinimalTuple) tuplestore_gettuple(state, forward, &should_free);


(gdb) step
tuplestore_gettuple (state=0x3069c18, forward=true, should_free=0x7ffd18474ff7)
    at tuplestore.c:906
906        TSReadPointer *readptr = &state->readptrs[state->activeptr];


(gdb) n
910        Assert(forward || (readptr->eflags & EXEC_FLAG_BACKWARD));
(gdb) p *readptr
$2 = {eflags = 2, eof_reached = false, current = 0, file = 2139062143, 
  offset = 9187201950435737471}


(gdb) n
912        switch (state->status)
(gdb) p *state
$3 = {status = TSS_INMEM, eflags = 2, backward = false, interXact = false, 
  truncated = false, availMem = 4177896, allowedMem = 4194304, tuples = 0, myfile = 0x0, 
  context = 0x3067da0, resowner = 0x2fa62c8, copytup = 0xaba7bd <copytup_heap>, 
  writetup = 0xaba811 <writetup_heap>, readtup = 0xaba9d9 <readtup_heap>, 
  memtuples = 0x3051e90, memtupdeleted = 0, memtupcount = 0, memtupsize = 2048, 
  growmemtuples = true, readptrs = 0x3077f70, activeptr = 0, readptrcount = 1, 
  readptrsize = 8, writepos_file = 0, writepos_offset = 0}
(gdb) p state->status


(gdb) n
915                *should_free = false;
(gdb) n
916                if (forward)
918                    if (readptr->eof_reached)
920                    if (readptr->current < state->memtupcount)
(gdb) p readptr->current
$5 = 0
(gdb) p state->memtupcount
$6 = 0
(gdb) n
925                    readptr->eof_reached = true;
926                    return NULL;
1062    }


(gdb) n
tuplestore_gettupleslot (state=0x3069c18, forward=true, copy=false, slot=0x30687a8)
    at tuplestore.c:1086
1086        if (tuple)
1098            ExecClearTuple(slot);
1099            return false;


(gdb) n
1101    }
ExecMaterial (pstate=0x3068158) at nodeMaterial.c:112
112            if (forward)
113                eof_tuplestore = true;


(gdb) n
124        if (eof_tuplestore && !node->eof_underlying)
(gdb) p node->eof_underlying
$7 = false
(gdb) n
133            outerNode = outerPlanState(node);
#define innerPlanState(node)        (((PlanState *)(node))->righttree)
#define outerPlanState(node)        (((PlanState *)(node))->lefttree)
134            outerslot = ExecProcNode(outerNode);
(gdb) p outerNode
$8 = (PlanState *) 0x3068270
(gdb) p *outerNode
$9 = {type = T_SeqScanState, plan = 0x3037628, state = 0x3067eb8, 
  ExecProcNode = 0x6f802a <ExecProcNodeFirst>, ExecProcNodeReal = 0x72b904 <ExecSeqScan>, 
  instrument = 0x0, worker_instrument = 0x0, worker_jit_instrument = 0x0, qual = 0x0, 
  lefttree = 0x0, righttree = 0x0, initPlan = 0x0, subPlan = 0x0, chgParam = 0x0, 
  ps_ResultTupleDesc = 0x3068578, ps_ResultTupleSlot = 0x0, ps_ExprContext = 0x3068388, 
  ps_ProjInfo = 0x0, scandesc = 0x7fab449cae98, 
  scanops = 0xc3e780 <TTSOpsBufferHeapTuple>, outerops = 0x0, innerops = 0x0, 
  resultops = 0xc3e780 <TTSOpsBufferHeapTuple>, scanopsfixed = true, 
  outeropsfixed = false, inneropsfixed = false, resultopsfixed = true, scanopsset = true, 
  outeropsset = false, inneropsset = false, resultopsset = true}
(gdb) p *outerNode->state
$10 = {type = T_EState, es_direction = ForwardScanDirection, es_snapshot = 0x2f9cd10, 
  es_crosscheck_snapshot = 0x0, es_range_table = 0x3042130, 
  es_range_table_array = 0x3068108, es_range_table_size = 2, es_relations = 0x3068130, 
  es_rowmarks = 0x0, es_plannedstmt = 0x3042438, 
  es_sourceText = 0x2f74d88 "select * from tbl a where a.id not in (select b.id from t_big_null b);", es_junkFilter = 0x0, es_output_cid = 0, es_result_relations = 0x0, 
  es_num_result_relations = 0, es_result_relation_info = 0x0, 
  es_root_result_relations = 0x0, es_num_root_result_relations = 0, 
  es_partition_directory = 0x0, es_tuple_routing_result_relations = 0x0, 
  es_trig_target_relations = 0x0, es_param_list_info = 0x0, 
  es_param_exec_vals = 0x30680d0, es_queryEnv = 0x0, es_query_cxt = 0x3067da0, 
  es_tupleTable = 0x3068540, es_processed = 0, es_top_eflags = 16, es_instrument = 0, 
  es_finished = false, es_exprcontexts = 0x3068448, es_subplanstates = 0x3068950, 
  es_auxmodifytables = 0x0, es_per_tuple_exprcontext = 0x0, es_epq_active = 0x0, 
  es_use_parallel_mode = false, es_query_dsa = 0x0, es_jit_flags = 25, es_jit = 0x0, 
  es_jit_worker_instr = 0x0}
(gdb) p ((PlanState *)node)->righttree
$21 = (struct PlanState *) 0x0

回過頭來看執行計劃,Materialize Node的lefttree是Seq Scan on public.t_big_null b,righttree為NULL。

[local]:5432 pg12@testdb=# explain verbose select * from tbl a where a.id not in (select b.id from t_big_null b);
                                         QUERY PLAN                                        
--------------------------------------------------------------------------------------------
 Seq Scan on public.tbl a  (cost=0.00..129156.33 rows=1 width=8)
   Output: a.id, a.value
   Filter: (NOT (SubPlan 1))
   SubPlan 1
     ->  Materialize  (cost=0.00..233310.68 rows=9999979 width=4)
           Output: b.id
           ->  Seq Scan on public.t_big_null b  (cost=0.00..144247.79 rows=9999979 width=4)
                 Output: b.id
(8 rows)
Time: 7.681 ms


(gdb) n
135            if (TupIsNull(outerslot))
(gdb) p *outerslot
$16 = {type = T_TupleTableSlot, tts_flags = 16, tts_nvalid = 0, 
  tts_ops = 0xc3e780 <TTSOpsBufferHeapTuple>, tts_tupleDescriptor = 0x7fab449cae98, 
  tts_values = 0x30684f0, tts_isnull = 0x30684f8, tts_mcxt = 0x3067da0, tts_tid = {
    ip_blkid = {bi_hi = 0, bi_lo = 0}, ip_posid = 1}, tts_tableOid = 49155}
(gdb) p *outerslot->tts_values
$17 = 0
(gdb) p outerslot->tts_values[1]
$18 = 0
(gdb) p outerslot->tts_values[0]
$19 = 0
(gdb) p *outerslot->tts_tupleDescriptor
$20 = {natts = 1, tdtypeid = 49157, tdtypmod = -1, tdrefcount = 2, constr = 0x0, 
  attrs = 0x7fab449caeb0}


(gdb) p *node
$22 = {ss = {ps = {type = T_MaterialState, plan = 0x3040a60, state = 0x3067eb8, 
      ExecProcNode = 0x720ecf <ExecMaterial>, ExecProcNodeReal = 0x720ecf <ExecMaterial>, 
      instrument = 0x0, worker_instrument = 0x0, worker_jit_instrument = 0x0, qual = 0x0, 
      lefttree = 0x3068270, righttree = 0x0, initPlan = 0x0, subPlan = 0x0, 
      chgParam = 0x0, ps_ResultTupleDesc = 0x3068690, ps_ResultTupleSlot = 0x30687a8, 
      ps_ExprContext = 0x0, ps_ProjInfo = 0x0, scandesc = 0x3068578, 
      scanops = 0xc3e720 <TTSOpsMinimalTuple>, outerops = 0x0, innerops = 0x0, 
      resultops = 0xc3e720 <TTSOpsMinimalTuple>, scanopsfixed = true, 
      outeropsfixed = false, inneropsfixed = false, resultopsfixed = true, 
      scanopsset = true, outeropsset = false, inneropsset = false, resultopsset = true}, 
    ss_currentRelation = 0x0, ss_currentScanDesc = 0x0, ss_ScanTupleSlot = 0x3068868}, 
  eflags = 2, eof_underlying = false, tuplestorestate = 0x3069c18}
(gdb) n
146            if (tuplestorestate)
147                tuplestore_puttupleslot(tuplestorestate, outerslot);
(gdb) p outerslot->tts_values[0]
$23 = 0
(gdb) n
149            ExecCopySlot(slot, outerslot);
(gdb) p outerslot->tts_values[0]
$24 = 0
(gdb) n
150            return slot;
(gdb) p outerslot->tts_values[0]
$25 = 0
(gdb) p slot->tts_values[0]
$26 = 0
(gdb) n
157    }


(gdb) n
ExecProcNodeFirst (node=0x3068158) at execProcnode.c:446
446    }
(gdb) c
Breakpoint 1, ExecMaterial (pstate=0x3068158) at nodeMaterial.c:41
41        MaterialState *node = castNode(MaterialState, pstate);
(gdb) n
54        estate = node->ss.ps.state;
55        dir = estate->es_direction;
56        forward = ScanDirectionIsForward(dir);
57        tuplestorestate = node->tuplestorestate;
62        if (tuplestorestate == NULL && node->eflags != 0)
85        eof_tuplestore = (tuplestorestate == NULL) ||
86            tuplestore_ateof(tuplestorestate);
85        eof_tuplestore = (tuplestorestate == NULL) ||
88        if (!forward && eof_tuplestore)
107        slot = node->ss.ps.ps_ResultTupleSlot;
108        if (!eof_tuplestore)
124        if (eof_tuplestore && !node->eof_underlying)
133            outerNode = outerPlanState(node);
(gdb) p eof_tuplestore
$27 = true
(gdb) n
134            outerslot = ExecProcNode(outerNode);
135            if (TupIsNull(outerslot))
146            if (tuplestorestate)
147                tuplestore_puttupleslot(tuplestorestate, outerslot);
149            ExecCopySlot(slot, outerslot);
150            return slot;
(gdb) p slot->tts_values[0]
$28 = 2


[local]:5432 pg12@testdb=# select * from tbl a where a.id not in (select b.id from t_big_null b);
 id | value 
----+-------
(0 rows)
Time: 3633462.666 ms (01:00:33.463) --> 包括了debug的時間,實際時間是5s左右
[local]:5432 pg12@testdb=# 
[local]:5432 pg12@testdb=# select * from tbl a where a.id not in (select b.id from t_big_null b);
 id | value 
----+-------
(0 rows)
Time: 6.480 ms --> 第2+次就快很多
[local]:5432 pg12@testdb=#




