溫馨提示×

您好,登錄后才能下訂單哦!

密碼登錄×
登錄注冊(cè)×
其他方式登錄
點(diǎn)擊 登錄注冊(cè) 即表示同意《億速云用戶服務(wù)條款》

PostgreSQL 源碼解讀(193)- 查詢#109(排序#2 - ExecSort)

發(fā)布時(shí)間:2020-08-11 11:11:23 來源:ITPUB博客 閱讀:179 作者:husthxd 欄目:關(guān)系型數(shù)據(jù)庫(kù)

本節(jié)繼續(xù)介紹排序的實(shí)現(xiàn),上一節(jié)介紹了ExecInitSort,本節(jié)主要介紹排序的實(shí)現(xiàn)函數(shù)ExecSort.

一、數(shù)據(jù)結(jié)構(gòu)

SortState
排序運(yùn)行期狀態(tài)信息


/* ----------------
 *     SortState information
 *     排序運(yùn)行期狀態(tài)信息
 * ----------------
 */
typedef struct SortState
{
    //基類
    ScanState    ss;                /* its first field is NodeTag */
    //是否需要隨機(jī)訪問排序輸出?
    bool        randomAccess;    /* need random access to sort output? */
    //結(jié)果集是否存在邊界?
    bool        bounded;        /* is the result set bounded? */
    //如存在邊界,需要多少個(gè)元組?
    int64        bound;            /* if bounded, how many tuples are needed */
    //是否已完成排序?
    bool        sort_Done;        /* sort completed yet? */
    //是否使用有界值?
    bool        bounded_Done;    /* value of bounded we did the sort with */
    //使用的有界值?
    int64        bound_Done;        /* value of bound we did the sort with */
    //tuplesort.c的私有狀態(tài)
    void       *tuplesortstate; /* private state of tuplesort.c */
    //是否worker?
    bool        am_worker;        /* are we a worker? */
    //每個(gè)worker對(duì)應(yīng)一個(gè)條目
    SharedSortInfo *shared_info;    /* one entry per worker */
} SortState;
/* ----------------
 *     Shared memory container for per-worker sort information
 *     per-worker排序信息的共享內(nèi)存容器
 * ----------------
 */
typedef struct SharedSortInfo
{
    //worker個(gè)數(shù)?
    int            num_workers;
    //排序機(jī)制
    TuplesortInstrumentation sinstrument[FLEXIBLE_ARRAY_MEMBER];
} SharedSortInfo;

TuplesortInstrumentation
報(bào)告排序統(tǒng)計(jì)的數(shù)據(jù)結(jié)構(gòu).


/*
 * Data structures for reporting sort statistics.  Note that
 * TuplesortInstrumentation can't contain any pointers because we
 * sometimes put it in shared memory.
 * 報(bào)告排序統(tǒng)計(jì)的數(shù)據(jù)結(jié)構(gòu).
 * 注意TuplesortInstrumentation不能包含指針因?yàn)橛袝r(shí)候會(huì)把該結(jié)構(gòu)體放在共享內(nèi)存中.
 */
typedef enum
{
    SORT_TYPE_STILL_IN_PROGRESS = 0,//仍然在排序中
    SORT_TYPE_TOP_N_HEAPSORT,//TOP N 堆排序
    SORT_TYPE_QUICKSORT,//快速排序
    SORT_TYPE_EXTERNAL_SORT,//外部排序
    SORT_TYPE_EXTERNAL_MERGE//外部排序后的合并
} TuplesortMethod;//排序方法
typedef enum
{
    SORT_SPACE_TYPE_DISK,//需要用上磁盤
    SORT_SPACE_TYPE_MEMORY//使用內(nèi)存
} TuplesortSpaceType;
typedef struct TuplesortInstrumentation
{
    //使用的排序算法
    TuplesortMethod sortMethod; /* sort algorithm used */
    //排序使用空間類型
    TuplesortSpaceType spaceType;    /* type of space spaceUsed represents */
    //空間消耗(以K為單位)
    long        spaceUsed;        /* space consumption, in kB */
} TuplesortInstrumentation;

二、源碼解讀

ExecSort使用tuplesort排序從outer子樹節(jié)點(diǎn)獲取的元組,在內(nèi)存或者臨時(shí)文件中緩存結(jié)果.在初次調(diào)用后,后續(xù)的每次調(diào)用返回一行.
實(shí)現(xiàn)邏輯不復(fù)雜,從outer plan中獲取所有元組,調(diào)用tuplesort進(jìn)行排序,如work_mem大小足夠則在內(nèi)存中存儲(chǔ)否則在磁盤中使用臨時(shí)文件存儲(chǔ).


/* ----------------------------------------------------------------
 *        ExecSort
 *
 *        Sorts tuples from the outer subtree of the node using tuplesort,
 *        which saves the results in a temporary file or memory. After the
 *        initial call, returns a tuple from the file with each call.
 *        使用tuplesort排序outer子樹節(jié)點(diǎn)獲取的元組,
 *          在內(nèi)存或者臨時(shí)文件中緩存結(jié)果.
 *        在初次調(diào)用后,后續(xù)的每次調(diào)用返回一行.
 *
 *        Conditions:
 *          -- none.
 *          -- 無
 *
 *        Initial States:
 *          -- the outer child is prepared to return the first tuple.
 *        初始狀態(tài):
 *          -- outer子節(jié)點(diǎn)已準(zhǔn)備返回第一個(gè)元組.
 * ----------------------------------------------------------------
 */
static TupleTableSlot *
ExecSort(PlanState *pstate)
{
    //排序運(yùn)行狀態(tài)
    SortState  *node = castNode(SortState, pstate);
    EState       *estate;//運(yùn)行狀態(tài)
    ScanDirection dir;//掃描方向
    Tuplesortstate *tuplesortstate;//元組排序狀態(tài)
    TupleTableSlot *slot;//元組slot
    CHECK_FOR_INTERRUPTS();
    /*
     * get state info from node
     * 從節(jié)點(diǎn)中獲取運(yùn)行狀態(tài)
     */
    SO1_printf("ExecSort: %s\n",
               "entering routine");
    estate = node->ss.ps.state;
    dir = estate->es_direction;
    tuplesortstate = (Tuplesortstate *) node->tuplesortstate;
    /*
     * If first time through, read all tuples from outer plan and pass them to
     * tuplesort.c. Subsequent calls just fetch tuples from tuplesort.
     * 如果第一輪,從outer plan中讀取所有元組并傳遞給tuplesort.c.
     * 后續(xù)的調(diào)用只是從tuplesort中提取元組.
     */
    if (!node->sort_Done)
    {
        //-------------- 首次,需要進(jìn)行排序
        Sort       *plannode = (Sort *) node->ss.ps.plan;
        PlanState  *outerNode;
        TupleDesc    tupDesc;
        SO1_printf("ExecSort: %s\n",
                   "sorting subplan");
        /*
         * Want to scan subplan in the forward direction while creating the
         * sorted data.
         * 在創(chuàng)建結(jié)果排序數(shù)據(jù)時(shí),向前掃描子計(jì)劃
         */
        //設(shè)置掃描方向
        estate->es_direction = ForwardScanDirection;
        /*
         * Initialize tuplesort module.
         * 初始化tuplesort模塊
         */
        SO1_printf("ExecSort: %s\n",
                   "calling tuplesort_begin");
        //獲取outer plan運(yùn)行狀態(tài)
        outerNode = outerPlanState(node);
        //獲取結(jié)果類型(元組描述符)
        tupDesc = ExecGetResultType(outerNode);
        //執(zhí)行tuplesort_begin_heap
        tuplesortstate = tuplesort_begin_heap(tupDesc,
                                              plannode->numCols,
                                              plannode->sortColIdx,
                                              plannode->sortOperators,
                                              plannode->collations,
                                              plannode->nullsFirst,
                                              work_mem,
                                              NULL, node->randomAccess);
        if (node->bounded)
            //節(jié)點(diǎn)有界,則設(shè)置邊界
            tuplesort_set_bound(tuplesortstate, node->bound);
        node->tuplesortstate = (void *) tuplesortstate;
        /*
         * Scan the subplan and feed all the tuples to tuplesort.
         * 掃描子計(jì)劃并把所有元組都發(fā)給tuplesort
         */
        for (;;)
        {
            //從outer plan中獲取元組
            slot = ExecProcNode(outerNode);
            if (TupIsNull(slot))
                break;//直至全部獲取完畢
            //排序
            tuplesort_puttupleslot(tuplesortstate, slot);
        }
        /*
         * Complete the sort.
         * 完成排序
         */
        tuplesort_performsort(tuplesortstate);
        /*
         * restore to user specified direction
         * 恢復(fù)用戶指定的掃描方向
         */
        estate->es_direction = dir;
        /*
         * finally set the sorted flag to true
         * 最后,設(shè)置已排序標(biāo)記為T
         */
        node->sort_Done = true;
        node->bounded_Done = node->bounded;
        node->bound_Done = node->bound;
        if (node->shared_info && node->am_worker)
        {
            TuplesortInstrumentation *si;
            Assert(IsParallelWorker());
            Assert(ParallelWorkerNumber <= node->shared_info->num_workers);
            si = &node->shared_info->sinstrument[ParallelWorkerNumber];
            tuplesort_get_stats(tuplesortstate, si);
        }
        SO1_printf("ExecSort: %s\n", "sorting done");
    }
    SO1_printf("ExecSort: %s\n",
               "retrieving tuple from tuplesort");
    /*
     * Get the first or next tuple from tuplesort. Returns NULL if no more
     * tuples.  Note that we only rely on slot tuple remaining valid until the
     * next fetch from the tuplesort.
     * 在tuplesort中獲取第一個(gè)/下一個(gè)元組.
     * 如無更多的元組,返回NULL.
     * 注意我們會(huì)一直保持存儲(chǔ)在slot中的元組可用直至從tuplesort中提取下一個(gè)元組.
     */
    slot = node->ss.ps.ps_ResultTupleSlot;
    (void) tuplesort_gettupleslot(tuplesortstate,
                                  ScanDirectionIsForward(dir),
                                  false, slot, NULL);
    return slot;
}

三、跟蹤分析

創(chuàng)建數(shù)據(jù)表,插入測(cè)試數(shù)據(jù)


drop table  if exists t_sort;
create table t_sort(bh varchar(20),c1 int,c2 int,c3 int,c4 int,c5 int,c6 int);
insert into t_sort select 'GZ01',col,col,col,col,col,col from generate_series(1,100000) as col;
testdb=# explain (verbose,analyze) select * from t_sort order by c1,c2;
                                                        QUERY PLAN                                                         
---------------------------------------------------------------------------------------------------------------------------
 Sort  (cost=8172.55..8308.71 rows=54464 width=82) (actual time=173.447..225.213 rows=100000 loops=1)
   Output: bh, c1, c2, c3, c4, c5, c6
   Sort Key: t_sort.c1, t_sort.c2
   Sort Method: external merge  Disk: 4120kB
   ->  Seq Scan on public.t_sort  (cost=0.00..1280.64 rows=54464 width=82) (actual time=0.092..55.257 rows=100000 loops=1)
         Output: bh, c1, c2, c3, c4, c5, c6
 Planning Time: 4.648 ms
 Execution Time: 238.227 ms
(8 rows)

測(cè)試腳本


testdb=# select * from t_sort order by c1,c2;

跟蹤調(diào)試


(gdb) b ExecSort
Breakpoint 1 at 0x711909: file nodeSort.c, line 42.
(gdb) c
Continuing.
Breakpoint 1, ExecSort (pstate=0x17a29b0) at nodeSort.c:42
42        SortState  *node = castNode(SortState, pstate);
(gdb)

輸入?yún)?shù)


(gdb) p *pstate
$1 = {type = T_SortState, plan = 0x179e540, state = 0x17a2798, ExecProcNode = 0x7118fd <ExecSort>, 
  ExecProcNodeReal = 0x7118fd <ExecSort>, instrument = 0x0, worker_instrument = 0x0, worker_jit_instrument = 0x0, 
  qual = 0x0, lefttree = 0x17a2ac8, righttree = 0x0, initPlan = 0x0, subPlan = 0x0, chgParam = 0x0, 
  ps_ResultTupleSlot = 0x17a3900, ps_ExprContext = 0x0, ps_ProjInfo = 0x0, scandesc = 0x17a2e50}

左樹節(jié)點(diǎn)是SeqScan,亦即outer node


(gdb) p *pstate->lefttree
$2 = {type = T_SeqScanState, plan = 0x17a8960, state = 0x17a2798, ExecProcNode = 0x6e0670 <ExecProcNodeFirst>, 
  ExecProcNodeReal = 0x710589 <ExecSeqScan>, instrument = 0x0, worker_instrument = 0x0, worker_jit_instrument = 0x0, 
  qual = 0x0, lefttree = 0x0, righttree = 0x0, initPlan = 0x0, subPlan = 0x0, chgParam = 0x0, 
  ps_ResultTupleSlot = 0x17a3268, ps_ExprContext = 0x17a2be0, ps_ProjInfo = 0x0, scandesc = 0x7faf6c0ec160}

獲取節(jié)點(diǎn)的運(yùn)行狀態(tài)&掃描方向


(gdb) n
48        CHECK_FOR_INTERRUPTS();
(gdb) 
56        estate = node->ss.ps.state;
(gdb) 
57        dir = estate->es_direction;
(gdb) 
58        tuplesortstate = (Tuplesortstate *) node->tuplesortstate;
(gdb) 
(gdb) n
65        if (!node->sort_Done)
(gdb) 
(gdb) p *estate
$3 = {type = T_EState, es_direction = ForwardScanDirection, es_snapshot = 0x17698a0, es_crosscheck_snapshot = 0x0, 
  es_range_table = 0x17a8e58, es_plannedstmt = 0x16a7e80, es_sourceText = 0x16a6d78 "select * from t_sort order by c1,c2;", 
  es_junkFilter = 0x0, es_output_cid = 0, es_result_relations = 0x0, es_num_result_relations = 0, 
  es_result_relation_info = 0x0, es_root_result_relations = 0x0, es_num_root_result_relations = 0, 
  es_tuple_routing_result_relations = 0x0, es_trig_target_relations = 0x0, es_trig_tuple_slot = 0x0, 
  es_trig_oldtup_slot = 0x0, es_trig_newtup_slot = 0x0, es_param_list_info = 0x0, es_param_exec_vals = 0x0, 
  es_queryEnv = 0x0, es_query_cxt = 0x17a2680, es_tupleTable = 0x17a2e18, es_rowMarks = 0x0, es_processed = 0, 
  es_lastoid = 0, es_top_eflags = 16, es_instrument = 0, es_finished = false, es_exprcontexts = 0x17a2ca0, 
  es_subplanstates = 0x0, es_auxmodifytables = 0x0, es_per_tuple_exprcontext = 0x0, es_epqTuple = 0x0, 
  es_epqTupleSet = 0x0, es_epqScanDone = 0x0, es_use_parallel_mode = false, es_query_dsa = 0x0, es_jit_flags = 0, 
  es_jit = 0x0, es_jit_worker_instr = 0x0}
(gdb) p dir
$4 = ForwardScanDirection
(gdb) p *tuplesortstate
Cannot access memory at address 0x0

未完成排序,進(jìn)入排序邏輯,設(shè)置掃描方向


(gdb) n
67            Sort       *plannode = (Sort *) node->ss.ps.plan;
(gdb) n
78            estate->es_direction = ForwardScanDirection;
(gdb) 
86            outerNode = outerPlanState(node);
(gdb) p *plannode
$5 = {plan = {type = T_Sort, startup_cost = 12434.82023721841, total_cost = 12684.82023721841, plan_rows = 100000, 
    plan_width = 29, parallel_aware = false, parallel_safe = true, plan_node_id = 0, targetlist = 0x17a8fc8, qual = 0x0, 
    lefttree = 0x17a8960, righttree = 0x0, initPlan = 0x0, extParam = 0x0, allParam = 0x0}, numCols = 2, 
  sortColIdx = 0x17a89f8, sortOperators = 0x17a8a18, collations = 0x17a8a38, nullsFirst = 0x17a8a58}
(gdb)

獲取結(jié)果類型并調(diào)用tuplesort_begin_heap獲取排序狀態(tài)tuplesortstate


(gdb) n
87            tupDesc = ExecGetResultType(outerNode);
(gdb) 
96                                                  NULL, node->randomAccess);
(gdb) p *tupDesc
$6 = {natts = 7, tdtypeid = 2249, tdtypmod = -1, tdhasoid = false, tdrefcount = -1, constr = 0x0, attrs = 0x17a2e70}
(gdb) n
89            tuplesortstate = tuplesort_begin_heap(tupDesc,
(gdb) 
97            if (node->bounded)
(gdb) 
99            node->tuplesortstate = (void *) tuplesortstate;
(gdb)

循環(huán)獲取元組并調(diào)用tuplesort_puttupleslot(下一節(jié)介紹),放到待排序中


(gdb) n
107                slot = ExecProcNode(outerNode);
(gdb) 
109                if (TupIsNull(slot))
(gdb) 
112                tuplesort_puttupleslot(tuplesortstate, slot);
(gdb) 
113            }
(gdb) 
107                slot = ExecProcNode(outerNode);
(gdb)

設(shè)置斷點(diǎn),完成循環(huán),并執(zhí)行tuplesort_performsort(下一節(jié)介紹)完成排序


(gdb) b nodeSort.c:118
Breakpoint 2 at 0x711a72: file nodeSort.c, line 118.
(gdb) c
Continuing.
Breakpoint 2, ExecSort (pstate=0x17a29b0) at nodeSort.c:118
118            tuplesort_performsort(tuplesortstate);
(gdb)

設(shè)置相關(guān)其他標(biāo)記


(gdb) n
123            estate->es_direction = dir;
(gdb) 
128            node->sort_Done = true;
(gdb) 
129            node->bounded_Done = node->bounded;
(gdb) 
130            node->bound_Done = node->bound;
(gdb) 
131            if (node->shared_info && node->am_worker)
(gdb) 
151        slot = node->ss.ps.ps_ResultTupleSlot;
(gdb) p *node
$7 = {ss = {ps = {type = T_SortState, plan = 0x179e540, state = 0x17a2798, ExecProcNode = 0x7118fd <ExecSort>, 
      ExecProcNodeReal = 0x7118fd <ExecSort>, instrument = 0x0, worker_instrument = 0x0, worker_jit_instrument = 0x0, 
      qual = 0x0, lefttree = 0x17a2ac8, righttree = 0x0, initPlan = 0x0, subPlan = 0x0, chgParam = 0x0, 
      ps_ResultTupleSlot = 0x17a3900, ps_ExprContext = 0x0, ps_ProjInfo = 0x0, scandesc = 0x17a2e50}, 
    ss_currentRelation = 0x0, ss_currentScanDesc = 0x0, ss_ScanTupleSlot = 0x17a33a8}, randomAccess = false, 
  bounded = false, bound = 0, sort_Done = true, bounded_Done = false, bound_Done = 0, tuplesortstate = 0x17ac7b8, 
  am_worker = false, shared_info = 0x0}
(gdb)

完成排序,獲取元組


(gdb) p node->tuplesortstate
$8 = (void *) 0x17ac7b8
(gdb) n
152        (void) tuplesort_gettupleslot(tuplesortstate,
(gdb) 
155        return slot;
(gdb) p *slot
$9 = {type = T_TupleTableSlot, tts_isempty = false, tts_shouldFree = false, tts_shouldFreeMin = false, tts_slow = false, 
  tts_tuple = 0x17a3940, tts_tupleDescriptor = 0x17a34e8, tts_mcxt = 0x17a2680, tts_buffer = 0, tts_nvalid = 0, 
  tts_values = 0x17a3960, tts_isnull = 0x17a3998, tts_mintuple = 0x1bc07f8, tts_minhdr = {t_len = 56, t_self = {ip_blkid = {
        bi_hi = 0, bi_lo = 0}, ip_posid = 0}, t_tableOid = 0, t_data = 0x1bc07f0}, tts_off = 0, 
  tts_fixedTupleDescriptor = true}
(gdb)

下一次調(diào)用,直接提取元組


(gdb) c
Continuing.
Breakpoint 1, ExecSort (pstate=0x17a29b0) at nodeSort.c:42
42        SortState  *node = castNode(SortState, pstate);
(gdb) n
48        CHECK_FOR_INTERRUPTS();
(gdb) 
56        estate = node->ss.ps.state;
(gdb) 
57        dir = estate->es_direction;
(gdb) 
58        tuplesortstate = (Tuplesortstate *) node->tuplesortstate;
(gdb) 
65        if (!node->sort_Done)
(gdb) 
151        slot = node->ss.ps.ps_ResultTupleSlot;
(gdb) 
152        (void) tuplesort_gettupleslot(tuplesortstate,
(gdb)

四、參考資料

N/A

向AI問一下細(xì)節(jié)

免責(zé)聲明:本站發(fā)布的內(nèi)容(圖片、視頻和文字)以原創(chuàng)、轉(zhuǎn)載和分享為主,文章觀點(diǎn)不代表本網(wǎng)站立場(chǎng),如果涉及侵權(quán)請(qǐng)聯(lián)系站長(zhǎng)郵箱:is@yisu.com進(jìn)行舉報(bào),并提供相關(guān)證據(jù),一經(jīng)查實(shí),將立刻刪除涉嫌侵權(quán)內(nèi)容。

AI