PostgreSQL 源码解读（194）- 查询#110(排序#3 - 实现)

2025-04-09 技术教程

本节继续介绍排序的实现,上一节介绍了ExecSort,本节介绍该函数中调用的排序的具体实现函数,本节是第一部分,包括tuplesort_begin_heap/tuplesort_puttupleslot.

一、数据结构

SortState
排序运行期状态信息

/* ---------------- * SortState information * 排序运行期状态信息 * ---------------- */typedef struct SortState{ //基类 ScanState ss; /* its first field is NodeTag */ //是否需要随机访问排序输出? bool randomAccess; /* need random access to sort output? */ //结果集是否存在边界? bool bounded; /* is the result set bounded? */ //如存在边界,需要多少个元组? int64 bound; /* if bounded, how many tuples are needed */ //是否已完成排序? bool sort_Done; /* sort completed yet? */ //是否使用有界值? bool bounded_Done; /* value of bounded we did the sort with */ //使用的有界值? int64 bound_Done; /* value of bound we did the sort with */ //tuplesort.c的私有状态 void *tuplesortstate; /* private state of tuplesort.c */ //是否worker? bool am_worker; /* are we a worker? */ //每个worker对应一个条目 SharedSortInfo *shared_info; /* one entry per worker */} SortState;/* ---------------- * Shared memory container for per-worker sort information * per-worker排序信息的共享内存容器 * ---------------- */typedef struct SharedSortInfo{ //worker个数? int num_workers; //排序机制 TuplesortInstrumentation sinstrument[FLEXIBLE_ARRAY_MEMBER];} SharedSortInfo;

TuplesortInstrumentation
报告排序统计的数据结构.

/* * Data structures for reporting sort statistics. Note that * TuplesortInstrumentation can't contain any pointers because we * sometimes put it in shared memory. * 报告排序统计的数据结构. * 注意TuplesortInstrumentation不能包含指针因为有时候会把该结构体放在共享内存中. */typedef enum{ SORT_TYPE_STILL_IN_PROGRESS = 0,//仍然在排序中 SORT_TYPE_TOP_N_HEAPSORT,//TOP N 堆排序 SORT_TYPE_QUICKSORT,//快速排序 SORT_TYPE_EXTERNAL_SORT,//外部排序 SORT_TYPE_EXTERNAL_MERGE//外部排序后的合并} TuplesortMethod;//排序方法typedef enum{ SORT_SPACE_TYPE_DISK,//需要用上磁盘 SORT_SPACE_TYPE_MEMORY//使用内存} TuplesortSpaceType;typedef struct TuplesortInstrumentation{ //使用的排序算法 TuplesortMethod sortMethod; /* sort algorithm used */ //排序使用空间类型 TuplesortSpaceType spaceType; /* type of space spaceUsed represents */ //空间消耗(以K为单位) long spaceUsed; /* space consumption, in kB */} TuplesortInstrumentation;二、源码解读

tuplesort_begin_heap和tuplesort_puttupleslot都是准备工作,把tuple放到数组(堆)中,为后续的实际排序实现作准备.
tuplesort_begin_heap

Tuplesortstate *tuplesort_begin_heap(TupleDesc tupDesc,//元组描述符 int nkeys, //排序键个数 AttrNumber *attNums,//属性编号 Oid *sortOperators, //排序操作符 Oid *sortCollations,//排序规则 bool *nullsFirstFlags,//标记 int workMem, //内存大小 SortCoordinate coordinate,//协调器 bool randomAccess)//是否随机访问{ //获取排序状态 Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, randomAccess); MemoryContext oldcontext; int i; oldcontext = MemoryContextSwitchTo(state->sortcontext); AssertArg(nkeys > 0);#ifdef TRACE_SORT if (trace_sort) elog(LOG, "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c", nkeys, workMem, randomAccess ? 't' : 'f');#endif state->nKeys = nkeys; TRACE_POSTGRESQL_SORT_START(HEAP_SORT, false, /* no unique check */ nkeys, workMem, randomAccess, PARALLEL_SORT(state)); //设置运行状态 state->comparetup = comparetup_heap; state->copytup = copytup_heap; state->writetup = writetup_heap; state->readtup = readtup_heap; //假定不需要拷贝元组描述符 state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ state->abbrevNext = 10; /* Prepare SortSupport data for each column */ //为每一列准备SortSupport数据(分配内存空间) state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData)); for (i = 0; i < nkeys; i++) { //------- 遍历排序键 //排序键 SortSupport sortKey = state->sortKeys + i; AssertArg(attNums[i] != 0); AssertArg(sortOperators[i] != 0); //设置SortSupport sortKey->ssup_cxt = CurrentMemoryContext; sortKey->ssup_collation = sortCollations[i]; sortKey->ssup_nulls_first = nullsFirstFlags[i]; sortKey->ssup_attno = attNums[i]; /* Convey if abbreviation optimization is applicable in principle */ //缩写优化是否原则上可用? sortKey->abbreviate = (i == 0); //设置 PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey); } /* * The "onlyKey" optimization cannot be used with abbreviated keys, since * tie-breaker comparisons may be required. Typically, the optimization * is only of value to pass-by-value types anyway, whereas abbreviated * keys are typically only of value to pass-by-reference types. * 对于缩写优化"onlyKey"优化不能使用,因为可能需要tie-breaker比较. * 典型的,优化器只对按值传递类型有价值,而缩写建通常只对引用传递类型有价值. */ if (nkeys == 1 && !state->sortKeys->abbrev_converter) state->onlyKey = state->sortKeys; MemoryContextSwitchTo(oldcontext); return state;}

tuplesort_puttupleslot
接收一个元组(一行)

/* * Accept one tuple while collecting input data for sort. * 接收一个元组(一行) * * Note that the input data is always copied; the caller need not save it. * 注意输入数据通常会被拷贝,调用者不需要保存这些数据. */voidtuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot){ MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); SortTuple stup; /* * Copy the given tuple into memory we control, and decrease availMem. * Then call the common code. * 拷贝给定的元组在我们控制的内存中,同时减少可用内存. * 然后调用puttuple_common函数. */ //#define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup)) COPYTUP(state, &stup, (void *) slot); puttuple_common(state, &stup); MemoryContextSwitchTo(oldcontext);}/* * Shared code for tuple and datum cases. * 元组和datum共享的代码. */static voidputtuple_common(Tuplesortstate *state, SortTuple *tuple){ Assert(!LEADER(state)); switch (state->status) { case TSS_INITIAL://初始化 /* * Save the tuple into the unsorted array. First, grow the array * as needed. Note that we try to grow the array when there is * still one free slot remaining --- if we fail, there'll still be * room to store the incoming tuple, and then we'll switch to * tape-based operation. * 在未排序的数组中存储元组. * 首先,如需要则扩展数组大小.注意在只有1个slot剩下来的时候才尝试扩展数组. * 假如扩展失败,仍有空间可用于存储输入的元组,然后将切换至tape-based(基于磁带)操作. */ if (state->memtupcount >= state->memtupsize - 1) { (void) grow_memtuples(state); Assert(state->memtupcount < state->memtupsize); } state->memtuples[state->memtupcount++] = *tuple;//在数组中保存元组 /* * Check if it's time to switch over to a bounded heapsort. We do * so if the input tuple count exceeds twice the desired tuple * count (this is a heuristic for where heapsort becomes cheaper * than a quicksort), or if we've just filled workMem and have * enough tuples to meet the bound. * 检查是否切换至有界heapsort. * 在输入元组个数超出期望元组个数两倍的情况下执行该检查 * (在堆排序比快速排序成本更低时所获得的洞见), * 或者如果我们已经填充了workMem并且有足够的元组满足bound时也执行该检查. * * Note that once we enter TSS_BOUNDED state we will always try to * complete the sort that way. In the worst case, if later input * tuples are larger than earlier ones, this might cause us to * exceed workMem significantly. * 注意我们一旦进入TSS_BOUNDED状态,将尝试按指定的方式完成排序. * 在最坏的情况下,如果后续的输入元组比早前的要大,这可能会导致内存大小会超出workMem. */ if (state->bounded && (state->memtupcount > state->bound * 2 || (state->memtupcount > state->bound && LACKMEM(state)))) {#ifdef TRACE_SORT if (trace_sort) elog(LOG, "switching to bounded heapsort at %d tuples: %s", state->memtupcount, pg_rusage_show(&state->ru_start));#endif //切换至堆排序 make_bounded_heap(state); return; } /* * Done if we still fit in available memory and have array slots. * 如果内存空间可用并且数组有位置存储,则已完成任务! */ if (state->memtupcount < state->memtupsize && !LACKMEM(state)) return; /* * Nope; time to switch to tape-based operation. * 切换至tape-base操作 */ inittapes(state, true); /* * Dump all tuples. * dump所有元组 */ dumptuples(state, false); break; case TSS_BOUNDED://有界堆排序 /* * We don't want to grow the array here, so check whether the new * tuple can be discarded before putting it in. This should be a * good speed optimization, too, since when there are many more * input tuples than the bound, most input tuples can be discarded * with just this one comparison. Note that because we currently * have the sort direction reversed, we must check for <= not >=. * 不希望在这里扩展数组,因此检查在放到数组前检查是否可用废弃某些新元组. * 这将是一个很好的性能优化,因为存在比边界要大得多的输入元组, * 大多数输入元组会通过对比被废弃. */ if (COMPARETUP(state, tuple, &state->memtuples[0]) <= 0) { /* new tuple <= top of the heap, so we can discard it */ // 新元组 <= 堆顶,可以废弃 free_sort_tuple(state, tuple); CHECK_FOR_INTERRUPTS(); } else { /* discard top of heap, replacing it with the new tuple */ //废弃堆顶,使用新元组替换 free_sort_tuple(state, &state->memtuples[0]); tuplesort_heap_replace_top(state, tuple); } break; case TSS_BUILDRUNS://构建运行期信息 /* * Save the tuple into the unsorted array (there must be space) * 保存元组到未排序数组中(存在空闲空间) */ state->memtuples[state->memtupcount++] = *tuple; /* * If we are over the memory limit, dump all tuples. * 如果已超出内存限制,则dump所有元组 */ dumptuples(state, false); break; default: elog(ERROR, "invalid tuplesort state"); break; }}三、跟踪分析

N/A

四、参考资料

N/A