本节继续介绍排序的实现,上一节介绍了ExecInitSort,本节主要介绍排序的实现函数ExecSort.

一、数据结构

SortState
排序运行期状态信息

/* ---------------- * SortState information * 排序运行期状态信息 * ---------------- */typedef struct SortState{ //基类 ScanState ss; /* its first field is NodeTag */ //是否需要随机访问排序输出? bool randomAccess; /* need random access to sort output? */ //结果集是否存在边界? bool bounded; /* is the result set bounded? */ //如存在边界,需要多少个元组? int64 bound; /* if bounded, how many tuples are needed */ //是否已完成排序? bool sort_Done; /* sort completed yet? */ //是否使用有界值? bool bounded_Done; /* value of bounded we did the sort with */ //使用的有界值? int64 bound_Done; /* value of bound we did the sort with */ //tuplesort.c的私有状态 void *tuplesortstate; /* private state of tuplesort.c */ //是否worker? bool am_worker; /* are we a worker? */ //每个worker对应一个条目 SharedSortInfo *shared_info; /* one entry per worker */} SortState;/* ---------------- * Shared memory container for per-worker sort information * per-worker排序信息的共享内存容器 * ---------------- */typedef struct SharedSortInfo{ //worker个数? int num_workers; //排序机制 TuplesortInstrumentation sinstrument[FLEXIBLE_ARRAY_MEMBER];} SharedSortInfo;

TuplesortInstrumentation
报告排序统计的数据结构.

/* * Data structures for reporting sort statistics. Note that * TuplesortInstrumentation can't contain any pointers because we * sometimes put it in shared memory. * 报告排序统计的数据结构. * 注意TuplesortInstrumentation不能包含指针因为有时候会把该结构体放在共享内存中. */typedef enum{ SORT_TYPE_STILL_IN_PROGRESS = 0,//仍然在排序中 SORT_TYPE_TOP_N_HEAPSORT,//TOP N 堆排序 SORT_TYPE_QUICKSORT,//快速排序 SORT_TYPE_EXTERNAL_SORT,//外部排序 SORT_TYPE_EXTERNAL_MERGE//外部排序后的合并} TuplesortMethod;//排序方法typedef enum{ SORT_SPACE_TYPE_DISK,//需要用上磁盘 SORT_SPACE_TYPE_MEMORY//使用内存} TuplesortSpaceType;typedef struct TuplesortInstrumentation{ //使用的排序算法 TuplesortMethod sortMethod; /* sort algorithm used */ //排序使用空间类型 TuplesortSpaceType spaceType; /* type of space spaceUsed represents */ //空间消耗(以K为单位) long spaceUsed; /* space consumption, in kB */} TuplesortInstrumentation;二、源码解读

ExecSort使用tuplesort排序从outer子树节点获取的元组,在内存或者临时文件中缓存结果.在初次调用后,后续的每次调用返回一行.
实现逻辑不复杂,从outer plan中获取所有元组,调用tuplesort进行排序,如work_mem大小足够则在内存中存储否则在磁盘中使用临时文件存储.

/* ---------------------------------------------------------------- * ExecSort * * Sorts tuples from the outer subtree of the node using tuplesort, * which saves the results in a temporary file or memory. After the * initial call, returns a tuple from the file with each call. * 使用tuplesort排序outer子树节点获取的元组, * 在内存或者临时文件中缓存结果. * 在初次调用后,后续的每次调用返回一行. * * Conditions: * -- none. * -- 无 * * Initial States: * -- the outer child is prepared to return the first tuple. * 初始状态: * -- outer子节点已准备返回第一个元组. * ---------------------------------------------------------------- */static TupleTableSlot *ExecSort(PlanState *pstate){ //排序运行状态 SortState *node = castNode(SortState, pstate); EState *estate;//运行状态 ScanDirection dir;//扫描方向 Tuplesortstate *tuplesortstate;//元组排序状态 TupleTableSlot *slot;//元组slot CHECK_FOR_INTERRUPTS(); /* * get state info from node * 从节点中获取运行状态 */ SO1_printf("ExecSort: %s\n", "entering routine"); estate = node->ss.ps.state; dir = estate->es_direction; tuplesortstate = (Tuplesortstate *) node->tuplesortstate; /* * If first time through, read all tuples from outer plan and pass them to * tuplesort.c. Subsequent calls just fetch tuples from tuplesort. * 如果第一轮,从outer plan中读取所有元组并传递给tuplesort.c. * 后续的调用只是从tuplesort中提取元组. */ if (!node->sort_Done) { //-------------- 首次,需要进行排序 Sort *plannode = (Sort *) node->ss.ps.plan; PlanState *outerNode; TupleDesc tupDesc; SO1_printf("ExecSort: %s\n", "sorting subplan"); /* * Want to scan subplan in the forward direction while creating the * sorted data. * 在创建结果排序数据时,向前扫描子计划 */ //设置扫描方向 estate->es_direction = ForwardScanDirection; /* * Initialize tuplesort module. * 初始化tuplesort模块 */ SO1_printf("ExecSort: %s\n", "calling tuplesort_begin"); //获取outer plan运行状态 outerNode = outerPlanState(node); //获取结果类型(元组描述符) tupDesc = ExecGetResultType(outerNode); //执行tuplesort_begin_heap tuplesortstate = tuplesort_begin_heap(tupDesc, plannode->numCols, plannode->sortColIdx, plannode->sortOperators, plannode->collations, plannode->nullsFirst, work_mem, NULL, node->randomAccess); if (node->bounded) //节点有界,则设置边界 tuplesort_set_bound(tuplesortstate, node->bound); node->tuplesortstate = (void *) tuplesortstate; /* * Scan the subplan and feed all the tuples to tuplesort. * 扫描子计划并把所有元组都发给tuplesort */ for (;;) { //从outer plan中获取元组 slot = ExecProcNode(outerNode); if (TupIsNull(slot)) break;//直至全部获取完毕 //排序 tuplesort_puttupleslot(tuplesortstate, slot); } /* * Complete the sort. * 完成排序 */ tuplesort_performsort(tuplesortstate); /* * restore to user specified direction * 恢复用户指定的扫描方向 */ estate->es_direction = dir; /* * finally set the sorted flag to true * 最后,设置已排序标记为T */ node->sort_Done = true; node->bounded_Done = node->bounded; node->bound_Done = node->bound; if (node->shared_info && node->am_worker) { TuplesortInstrumentation *si; Assert(IsParallelWorker()); Assert(ParallelWorkerNumber <= node->shared_info->num_workers); si = &node->shared_info->sinstrument[ParallelWorkerNumber]; tuplesort_get_stats(tuplesortstate, si); } SO1_printf("ExecSort: %s\n", "sorting done"); } SO1_printf("ExecSort: %s\n", "retrieving tuple from tuplesort"); /* * Get the first or next tuple from tuplesort. Returns NULL if no more * tuples. Note that we only rely on slot tuple remaining valid until the * next fetch from the tuplesort. * 在tuplesort中获取第一个/下一个元组. * 如无更多的元组,返回NULL. * 注意我们会一直保持存储在slot中的元组可用直至从tuplesort中提取下一个元组. */ slot = node->ss.ps.ps_ResultTupleSlot; (void) tuplesort_gettupleslot(tuplesortstate, ScanDirectionIsForward(dir), false, slot, NULL); return slot;}三、跟踪分析

创建数据表,插入测试数据

drop table if exists t_sort;create table t_sort(bh varchar(20),c1 int,c2 int,c3 int,c4 int,c5 int,c6 int);insert into t_sort select 'GZ01',col,col,col,col,col,col from generate_series(1,100000) as col;testdb=# explain (verbose,analyze) select * from t_sort order by c1,c2; QUERY PLAN --------------------------------------------------------------------------------------------------------------------------- Sort (cost=8172.55..8308.71 rows=54464 width=82) (actual time=173.447..225.213 rows=100000 loops=1) Output: bh, c1, c2, c3, c4, c5, c6 Sort Key: t_sort.c1, t_sort.c2 Sort Method: external merge Disk: 4120kB -> Seq Scan on public.t_sort (cost=0.00..1280.64 rows=54464 width=82) (actual time=0.092..55.257 rows=100000 loops=1) Output: bh, c1, c2, c3, c4, c5, c6 Planning Time: 4.648 ms Execution Time: 238.227 ms(8 rows)

测试脚本

testdb=# select * from t_sort order by c1,c2;

跟踪调试

(gdb) b ExecSortBreakpoint 1 at 0x711909: file nodeSort.c, line 42.(gdb) cContinuing.Breakpoint 1, ExecSort (pstate=0x17a29b0) at nodeSort.c:4242 SortState *node = castNode(SortState, pstate);(gdb)

输入参数

(gdb) p *pstate$1 = {type = T_SortState, plan = 0x179e540, state = 0x17a2798, ExecProcNode = 0x7118fd <ExecSort>, ExecProcNodeReal = 0x7118fd <ExecSort>, instrument = 0x0, worker_instrument = 0x0, worker_jit_instrument = 0x0, qual = 0x0, lefttree = 0x17a2ac8, righttree = 0x0, initPlan = 0x0, subPlan = 0x0, chgParam = 0x0, ps_ResultTupleSlot = 0x17a3900, ps_ExprContext = 0x0, ps_ProjInfo = 0x0, scandesc = 0x17a2e50}

左树节点是SeqScan,亦即outer node

(gdb) p *pstate->lefttree$2 = {type = T_SeqScanState, plan = 0x17a8960, state = 0x17a2798, ExecProcNode = 0x6e0670 <ExecProcNodeFirst>, ExecProcNodeReal = 0x710589 <ExecSeqScan>, instrument = 0x0, worker_instrument = 0x0, worker_jit_instrument = 0x0, qual = 0x0, lefttree = 0x0, righttree = 0x0, initPlan = 0x0, subPlan = 0x0, chgParam = 0x0, ps_ResultTupleSlot = 0x17a3268, ps_ExprContext = 0x17a2be0, ps_ProjInfo = 0x0, scandesc = 0x7faf6c0ec160}

获取节点的运行状态&扫描方向

(gdb) n48 CHECK_FOR_INTERRUPTS();(gdb) 56 estate = node->ss.ps.state;(gdb) 57 dir = estate->es_direction;(gdb) 58 tuplesortstate = (Tuplesortstate *) node->tuplesortstate;(gdb) (gdb) n65 if (!node->sort_Done)(gdb) (gdb) p *estate$3 = {type = T_EState, es_direction = ForwardScanDirection, es_snapshot = 0x17698a0, es_crosscheck_snapshot = 0x0, es_range_table = 0x17a8e58, es_plannedstmt = 0x16a7e80, es_sourceText = 0x16a6d78 "select * from t_sort order by c1,c2;", es_junkFilter = 0x0, es_output_cid = 0, es_result_relations = 0x0, es_num_result_relations = 0, es_result_relation_info = 0x0, es_root_result_relations = 0x0, es_num_root_result_relations = 0, es_tuple_routing_result_relations = 0x0, es_trig_target_relations = 0x0, es_trig_tuple_slot = 0x0, es_trig_oldtup_slot = 0x0, es_trig_newtup_slot = 0x0, es_param_list_info = 0x0, es_param_exec_vals = 0x0, es_queryEnv = 0x0, es_query_cxt = 0x17a2680, es_tupleTable = 0x17a2e18, es_rowMarks = 0x0, es_processed = 0, es_lastoid = 0, es_top_eflags = 16, es_instrument = 0, es_finished = false, es_exprcontexts = 0x17a2ca0, es_subplanstates = 0x0, es_auxmodifytables = 0x0, es_per_tuple_exprcontext = 0x0, es_epqTuple = 0x0, es_epqTupleSet = 0x0, es_epqScanDone = 0x0, es_use_parallel_mode = false, es_query_dsa = 0x0, es_jit_flags = 0, es_jit = 0x0, es_jit_worker_instr = 0x0}(gdb) p dir$4 = ForwardScanDirection(gdb) p *tuplesortstateCannot access memory at address 0x0

未完成排序,进入排序逻辑,设置扫描方向

(gdb) n67 Sort *plannode = (Sort *) node->ss.ps.plan;(gdb) n78 estate->es_direction = ForwardScanDirection;(gdb) 86 outerNode = outerPlanState(node);(gdb) p *plannode$5 = {plan = {type = T_Sort, startup_cost = 12434.82023721841, total_cost = 12684.82023721841, plan_rows = 100000, plan_width = 29, parallel_aware = false, parallel_safe = true, plan_node_id = 0, targetlist = 0x17a8fc8, qual = 0x0, lefttree = 0x17a8960, righttree = 0x0, initPlan = 0x0, extParam = 0x0, allParam = 0x0}, numCols = 2, sortColIdx = 0x17a89f8, sortOperators = 0x17a8a18, collations = 0x17a8a38, nullsFirst = 0x17a8a58}(gdb)

获取结果类型并调用tuplesort_begin_heap获取排序状态tuplesortstate

(gdb) n87 tupDesc = ExecGetResultType(outerNode);(gdb) 96 NULL, node->randomAccess);(gdb) p *tupDesc$6 = {natts = 7, tdtypeid = 2249, tdtypmod = -1, tdhasoid = false, tdrefcount = -1, constr = 0x0, attrs = 0x17a2e70}(gdb) n89 tuplesortstate = tuplesort_begin_heap(tupDesc,(gdb) 97 if (node->bounded)(gdb) 99 node->tuplesortstate = (void *) tuplesortstate;(gdb)

循环获取元组并调用tuplesort_puttupleslot(下一节介绍),放到待排序中

(gdb) n107 slot = ExecProcNode(outerNode);(gdb) 109 if (TupIsNull(slot))(gdb) 112 tuplesort_puttupleslot(tuplesortstate, slot);(gdb) 113 }(gdb) 107 slot = ExecProcNode(outerNode);(gdb)

设置断点,完成循环,并执行tuplesort_performsort(下一节介绍)完成排序

(gdb) b nodeSort.c:118Breakpoint 2 at 0x711a72: file nodeSort.c, line 118.(gdb) cContinuing.Breakpoint 2, ExecSort (pstate=0x17a29b0) at nodeSort.c:118118 tuplesort_performsort(tuplesortstate);(gdb)

设置相关其他标记

(gdb) n123 estate->es_direction = dir;(gdb) 128 node->sort_Done = true;(gdb) 129 node->bounded_Done = node->bounded;(gdb) 130 node->bound_Done = node->bound;(gdb) 131 if (node->shared_info && node->am_worker)(gdb) 151 slot = node->ss.ps.ps_ResultTupleSlot;(gdb) p *node$7 = {ss = {ps = {type = T_SortState, plan = 0x179e540, state = 0x17a2798, ExecProcNode = 0x7118fd <ExecSort>, ExecProcNodeReal = 0x7118fd <ExecSort>, instrument = 0x0, worker_instrument = 0x0, worker_jit_instrument = 0x0, qual = 0x0, lefttree = 0x17a2ac8, righttree = 0x0, initPlan = 0x0, subPlan = 0x0, chgParam = 0x0, ps_ResultTupleSlot = 0x17a3900, ps_ExprContext = 0x0, ps_ProjInfo = 0x0, scandesc = 0x17a2e50}, ss_currentRelation = 0x0, ss_currentScanDesc = 0x0, ss_ScanTupleSlot = 0x17a33a8}, randomAccess = false, bounded = false, bound = 0, sort_Done = true, bounded_Done = false, bound_Done = 0, tuplesortstate = 0x17ac7b8, am_worker = false, shared_info = 0x0}(gdb)

完成排序,获取元组

(gdb) p node->tuplesortstate$8 = (void *) 0x17ac7b8(gdb) n152 (void) tuplesort_gettupleslot(tuplesortstate,(gdb) 155 return slot;(gdb) p *slot$9 = {type = T_TupleTableSlot, tts_isempty = false, tts_shouldFree = false, tts_shouldFreeMin = false, tts_slow = false, tts_tuple = 0x17a3940, tts_tupleDescriptor = 0x17a34e8, tts_mcxt = 0x17a2680, tts_buffer = 0, tts_nvalid = 0, tts_values = 0x17a3960, tts_isnull = 0x17a3998, tts_mintuple = 0x1bc07f8, tts_minhdr = {t_len = 56, t_self = {ip_blkid = { bi_hi = 0, bi_lo = 0}, ip_posid = 0}, t_tableOid = 0, t_data = 0x1bc07f0}, tts_off = 0, tts_fixedTupleDescriptor = true}(gdb)

下一次调用,直接提取元组

(gdb) cContinuing.Breakpoint 1, ExecSort (pstate=0x17a29b0) at nodeSort.c:4242 SortState *node = castNode(SortState, pstate);(gdb) n48 CHECK_FOR_INTERRUPTS();(gdb) 56 estate = node->ss.ps.state;(gdb) 57 dir = estate->es_direction;(gdb) 58 tuplesortstate = (Tuplesortstate *) node->tuplesortstate;(gdb) 65 if (!node->sort_Done)(gdb) 151 slot = node->ss.ps.ps_ResultTupleSlot;(gdb) 152 (void) tuplesort_gettupleslot(tuplesortstate,(gdb)四、参考资料

N/A