导航：首页 > 数据库 >

PostgreSQL 源码解读（194）- 查询#110(排序#3 - 实现)

发表于：2025-11-14 作者：千家信息网编辑

千家信息网最后更新 2025年11月14日，本节继续介绍排序的实现,上一节介绍了ExecSort,本节介绍该函数中调用的排序的具体实现函数,本节是第一部分,包括tuplesort_begin_heap/tuplesort_puttupleslo

千家信息网最后更新 2025年11月14日PostgreSQL 源码解读（194）- 查询#110(排序#3 - 实现)

本节继续介绍排序的实现,上一节介绍了ExecSort,本节介绍该函数中调用的排序的具体实现函数,本节是第一部分,包括tuplesort_begin_heap/tuplesort_puttupleslot.

一、数据结构

SortState
排序运行期状态信息

/* ---------------- *     SortState information *     排序运行期状态信息 * ---------------- */typedef struct SortState{    //基类    ScanState    ss;                /* its first field is NodeTag */    //是否需要随机访问排序输出?    bool        randomAccess;    /* need random access to sort output? */    //结果集是否存在边界?    bool        bounded;        /* is the result set bounded? */    //如存在边界,需要多少个元组?    int64        bound;            /* if bounded, how many tuples are needed */    //是否已完成排序?    bool        sort_Done;        /* sort completed yet? */    //是否使用有界值?    bool        bounded_Done;    /* value of bounded we did the sort with */    //使用的有界值?    int64        bound_Done;        /* value of bound we did the sort with */    //tuplesort.c的私有状态    void       *tuplesortstate; /* private state of tuplesort.c */    //是否worker?    bool        am_worker;        /* are we a worker? */    //每个worker对应一个条目    SharedSortInfo *shared_info;    /* one entry per worker */} SortState;/* ---------------- *     Shared memory container for per-worker sort information *     per-worker排序信息的共享内存容器 * ---------------- */typedef struct SharedSortInfo{    //worker个数?    int            num_workers;    //排序机制    TuplesortInstrumentation sinstrument[FLEXIBLE_ARRAY_MEMBER];} SharedSortInfo;

TuplesortInstrumentation
报告排序统计的数据结构.

/* * Data structures for reporting sort statistics.  Note that * TuplesortInstrumentation can't contain any pointers because we * sometimes put it in shared memory. * 报告排序统计的数据结构. * 注意TuplesortInstrumentation不能包含指针因为有时候会把该结构体放在共享内存中. */typedef enum{    SORT_TYPE_STILL_IN_PROGRESS = 0,//仍然在排序中    SORT_TYPE_TOP_N_HEAPSORT,//TOP N 堆排序    SORT_TYPE_QUICKSORT,//快速排序    SORT_TYPE_EXTERNAL_SORT,//外部排序    SORT_TYPE_EXTERNAL_MERGE//外部排序后的合并} TuplesortMethod;//排序方法typedef enum{    SORT_SPACE_TYPE_DISK,//需要用上磁盘    SORT_SPACE_TYPE_MEMORY//使用内存} TuplesortSpaceType;typedef struct TuplesortInstrumentation{    //使用的排序算法    TuplesortMethod sortMethod; /* sort algorithm used */    //排序使用空间类型    TuplesortSpaceType spaceType;    /* type of space spaceUsed represents */    //空间消耗(以K为单位)    long        spaceUsed;        /* space consumption, in kB */} TuplesortInstrumentation;

二、源码解读

tuplesort_begin_heap和tuplesort_puttupleslot都是准备工作,把tuple放到数组(堆)中,为后续的实际排序实现作准备.
tuplesort_begin_heap

Tuplesortstate *tuplesort_begin_heap(TupleDesc tupDesc,//元组描述符                     int nkeys, //排序键个数                     AttrNumber *attNums,//属性编号                     Oid *sortOperators, //排序操作符                     Oid *sortCollations,//排序规则                     bool *nullsFirstFlags,//标记                     int workMem, //内存大小                     SortCoordinate coordinate,//协调器                      bool randomAccess)//是否随机访问{    //获取排序状态    Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,                                                   randomAccess);    MemoryContext oldcontext;    int            i;    oldcontext = MemoryContextSwitchTo(state->sortcontext);    AssertArg(nkeys > 0);#ifdef TRACE_SORT    if (trace_sort)        elog(LOG,             "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c",             nkeys, workMem, randomAccess ? 't' : 'f');#endif    state->nKeys = nkeys;    TRACE_POSTGRESQL_SORT_START(HEAP_SORT,                                false,    /* no unique check */                                nkeys,                                workMem,                                randomAccess,                                PARALLEL_SORT(state));    //设置运行状态    state->comparetup = comparetup_heap;    state->copytup = copytup_heap;    state->writetup = writetup_heap;    state->readtup = readtup_heap;    //假定不需要拷贝元组描述符    state->tupDesc = tupDesc;    /* assume we need not copy tupDesc */    state->abbrevNext = 10;    /* Prepare SortSupport data for each column */    //为每一列准备SortSupport数据(分配内存空间)    state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData));    for (i = 0; i < nkeys; i++)    {        //------- 遍历排序键        //排序键        SortSupport sortKey = state->sortKeys + i;        AssertArg(attNums[i] != 0);        AssertArg(sortOperators[i] != 0);        //设置SortSupport        sortKey->ssup_cxt = CurrentMemoryContext;        sortKey->ssup_collation = sortCollations[i];        sortKey->ssup_nulls_first = nullsFirstFlags[i];        sortKey->ssup_attno = attNums[i];        /* Convey if abbreviation optimization is applicable in principle */        //缩写优化是否原则上可用?        sortKey->abbreviate = (i == 0);        //设置        PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey);    }    /*     * The "onlyKey" optimization cannot be used with abbreviated keys, since     * tie-breaker comparisons may be required.  Typically, the optimization     * is only of value to pass-by-value types anyway, whereas abbreviated     * keys are typically only of value to pass-by-reference types.     * 对于缩写优化"onlyKey"优化不能使用,因为可能需要tie-breaker比较.     * 典型的,优化器只对按值传递类型有价值,而缩写建通常只对引用传递类型有价值.     */    if (nkeys == 1 && !state->sortKeys->abbrev_converter)        state->onlyKey = state->sortKeys;    MemoryContextSwitchTo(oldcontext);    return state;}

tuplesort_puttupleslot
接收一个元组(一行)

/* * Accept one tuple while collecting input data for sort. * 接收一个元组(一行) * * Note that the input data is always copied; the caller need not save it. * 注意输入数据通常会被拷贝,调用者不需要保存这些数据. */voidtuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot){    MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);    SortTuple    stup;    /*     * Copy the given tuple into memory we control, and decrease availMem.     * Then call the common code.     * 拷贝给定的元组在我们控制的内存中,同时减少可用内存.     * 然后调用puttuple_common函数.     */    //#define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup))    COPYTUP(state, &stup, (void *) slot);    puttuple_common(state, &stup);    MemoryContextSwitchTo(oldcontext);}/* * Shared code for tuple and datum cases. * 元组和datum共享的代码. */static voidputtuple_common(Tuplesortstate *state, SortTuple *tuple){    Assert(!LEADER(state));    switch (state->status)    {        case TSS_INITIAL://初始化            /*             * Save the tuple into the unsorted array.  First, grow the array             * as needed.  Note that we try to grow the array when there is             * still one free slot remaining --- if we fail, there'll still be             * room to store the incoming tuple, and then we'll switch to             * tape-based operation.             * 在未排序的数组中存储元组.             * 首先,如需要则扩展数组大小.注意在只有1个slot剩下来的时候才尝试扩展数组.             * 假如扩展失败,仍有空间可用于存储输入的元组,然后将切换至tape-based(基于磁带)操作.             */            if (state->memtupcount >= state->memtupsize - 1)            {                (void) grow_memtuples(state);                Assert(state->memtupcount < state->memtupsize);            }            state->memtuples[state->memtupcount++] = *tuple;//在数组中保存元组            /*             * Check if it's time to switch over to a bounded heapsort. We do             * so if the input tuple count exceeds twice the desired tuple             * count (this is a heuristic for where heapsort becomes cheaper             * than a quicksort), or if we've just filled workMem and have             * enough tuples to meet the bound.             * 检查是否切换至有界heapsort.             * 在输入元组个数超出期望元组个数两倍的情况下执行该检查             * (在堆排序比快速排序成本更低时所获得的洞见),             *   或者如果我们已经填充了workMem并且有足够的元组满足bound时也执行该检查.             *             * Note that once we enter TSS_BOUNDED state we will always try to             * complete the sort that way.  In the worst case, if later input             * tuples are larger than earlier ones, this might cause us to             * exceed workMem significantly.             * 注意我们一旦进入TSS_BOUNDED状态,将尝试按指定的方式完成排序.             * 在最坏的情况下,如果后续的输入元组比早前的要大,这可能会导致内存大小会超出workMem.             */            if (state->bounded &&                (state->memtupcount > state->bound * 2 ||                 (state->memtupcount > state->bound && LACKMEM(state))))            {#ifdef TRACE_SORT                if (trace_sort)                    elog(LOG, "switching to bounded heapsort at %d tuples: %s",                         state->memtupcount,                         pg_rusage_show(&state->ru_start));#endif                //切换至堆排序                make_bounded_heap(state);                return;            }            /*             * Done if we still fit in available memory and have array slots.             * 如果内存空间可用并且数组有位置存储,则已完成任务!             */            if (state->memtupcount < state->memtupsize && !LACKMEM(state))                return;            /*             * Nope; time to switch to tape-based operation.             * 切换至tape-base操作             */            inittapes(state, true);            /*             * Dump all tuples.             * dump所有元组             */            dumptuples(state, false);            break;        case TSS_BOUNDED://有界堆排序            /*             * We don't want to grow the array here, so check whether the new             * tuple can be discarded before putting it in.  This should be a             * good speed optimization, too, since when there are many more             * input tuples than the bound, most input tuples can be discarded             * with just this one comparison.  Note that because we currently             * have the sort direction reversed, we must check for <= not >=.             * 不希望在这里扩展数组,因此检查在放到数组前检查是否可用废弃某些新元组.             * 这将是一个很好的性能优化,因为存在比边界要大得多的输入元组,             *   大多数输入元组会通过对比被废弃.             */            if (COMPARETUP(state, tuple, &state->memtuples[0]) <= 0)            {                /* new tuple <= top of the heap, so we can discard it */                // 新元组 <= 堆顶,可以废弃                free_sort_tuple(state, tuple);                CHECK_FOR_INTERRUPTS();            }            else            {                /* discard top of heap, replacing it with the new tuple */                //废弃堆顶,使用新元组替换                free_sort_tuple(state, &state->memtuples[0]);                tuplesort_heap_replace_top(state, tuple);            }            break;        case TSS_BUILDRUNS://构建运行期信息            /*             * Save the tuple into the unsorted array (there must be space)             * 保存元组到未排序数组中(存在空闲空间)             */            state->memtuples[state->memtupcount++] = *tuple;            /*             * If we are over the memory limit, dump all tuples.             * 如果已超出内存限制,则dump所有元组             */            dumptuples(state, false);            break;        default:            elog(ERROR, "invalid tuplesort state");            break;    }}