none
Performance improvement for std::list::sort. RRS feed

  • General discussion

  • In the transition from VS 2013 to VS 2015, std::list::sort() was changed from a bottom up merge sort using an array of lists to top down merge sort using iterators. The switch to using iterators eliminated allocations issues such as a caller passed list with no default allocator. Although the no default allocator issue could be handled with the initial declaration of all members of the array of lists with _Myt(get_allocator()), with the switch to iterators, the merge logic which is implemented via splice() to move nodes within a list would be operating only on the caller's list, providing exception safety: if the compare throws an exception, the caller's list would be reordered, but all there, instead of parts of it scattered within the array of list.

    The performance issue with the top down strategy is this line:

    iterator _Mid = _STD next(_First, static_cast<difference_type>(_Size >> 1));
    The code is recursively scanning lists and sub-lists in order to split them into two parts. If the nodes are within cache boundaries, it is not much of a performance hit, but if the list is large and the nodes are randomly scattered, top down strategy takes about 40% longer.

    The switch to top down strategy wasn't needed, as the prior bottom up strategy can be converted to using an array of iterators to keep track of runs within the caller's list, using the same merge via splice logic to move nodes within the callers list. It turned out to be relatively straight forward as all merges involved adjacent runs, and the array only needs to store the iterators to the start of runs, as a prior (non-empty) entry or a local variable will contain the iterator to the end of a run. 

    Example standalone code:

    template <typename T>
    typename std::list<T>::iterator Merge(std::list<T> &ll,
                        typename std::list<T>::iterator li,
                        typename std::list<T>::iterator ri,
                        typename std::list<T>::iterator ei);
    
    // iterator array size
    #define ASZ 32
    
    template <typename T>
    void SortList(std::list<T> &ll)
    {
        if (ll.size() < 2)                  // return if nothing to do
            return;
        std::list<T>::iterator ai[ASZ];     // array of iterators
        std::list<T>::iterator li;          // left   iterator
        std::list<T>::iterator ri;          // right  iterator
        std::list<T>::iterator ei;          // end    iterator
        size_t i;
        for (i = 0; i < ASZ; i++)           // "clear" array
            ai[i] = ll.end();
        // merge nodes into array
        for (ei = ll.begin(); ei != ll.end();) {
            ri = ei++;
            for (i = 0; (i < ASZ) && ai[i] != ll.end(); i++) {
                ri = Merge(ll, ai[i], ri, ei);
                ai[i] = ll.end();
            }
            if(i == ASZ)
                i--;
            ai[i] = ri;
        }
        // merge array into single list
        ei = ll.end();                              
        for(i = 0; (i < ASZ) && ai[i] == ei; i++);
        ri = ai[i++];
        while(1){
            for( ; (i < ASZ) && ai[i] == ei; i++);
            if (i == ASZ)
                break;
            li = ai[i++];
            ri = Merge(ll, li, ri, ei);
        }
    }
    
    template <typename T>
    typename std::list<T>::iterator Merge(std::list<T> &ll,
                                 typename std::list<T>::iterator li,
                                 typename std::list<T>::iterator ri,
                                 typename std::list<T>::iterator ei)
    {
        std::list<T>::iterator ni;
        (*ri < *li) ? ni = ri : ni = li;
        while(1){
            if(*ri < *li){
                ll.splice(li, ll, ri++);
                if(ri == ei)
                    return ni;
            } else {
                if(++li == ri)
                    return ni;
            }
        }
    }


    Example replacement for _Sort() in VS 2019's version of <list>. The merge code is the same, but moved into a separate function _Merge().

    private:
        template <class _Pr2>
        iterator _Merge(_Pr2 _Pred, iterator _First, iterator _Mid, iterator _Last){
            iterator _Newfirst = _First;
            for (bool _Initial_loop = true;;
                _Initial_loop       = false) { // [_First, _Mid) and [_Mid, _Last) are sorted and non-empty
                if (_DEBUG_LT_PRED(_Pred, *_Mid, *_First)) { // consume _Mid
                    if (_Initial_loop) {
                        _Newfirst = _Mid; // update return value
                    }
                    splice(_First, *this, _Mid++);
                    if (_Mid == _Last) {
                        return _Newfirst; // exhausted [_Mid, _Last); done
                    }
                }
                else { // consume _First
                    ++_First;
                    if (_First == _Mid) {
                        return _Newfirst; // exhausted [_First, _Mid); done
                    }
                }
            }
        }
    
        template <class _Pr2>
        void _Sort(iterator _First, iterator _Last, _Pr2 _Pred,
            size_type _Size) { // order [_First, _Last), using _Pred, return new first
                               // _Size must be distance from _First to _Last
            if (_Size < 2) {
                return;        // nothing to do
            }
            const size_t _ASZ = 32;         // array size
            iterator _Ai[_ASZ];             // array of iterators to runs
            iterator _Mi;                   // middle   iterator
            iterator _Li;                   // last     iterator
            size_t _I;                      // index to _Ai
            for (_I = 0; _I < _ASZ; _I++)   // "empty" array
                _Ai[_I] = _Last;            //   _Ai[] == _Last => empty entry
            // merge nodes into array
            for (_Li = _First; _Li != _Last;) {
                _Mi = _Li++;
                for (_I = 0; (_I < _ASZ) && _Ai[_I] != _Last; _I++) {
                    _Mi = _Merge(_Pass_fn(_Pred), _Ai[_I], _Mi, _Li);
                    _Ai[_I] = _Last;
                }
                if (_I == _ASZ)
                    _I--;
                _Ai[_I] = _Mi;
            }
            // merge array runs into single run
            for (_I = 0; _I < _ASZ && _Ai[_I] == _Last; _I++);
            _Mi = _Ai[_I++];
            while (1) {
                for (; _I < _ASZ && _Ai[_I] == _Last; _I++);
                if (_I == _ASZ)
                    break;
                _Mi = _Merge(_Pass_fn(_Pred), _Ai[_I++], _Mi, _Last);
            }
        }

    Test and benchmark code, using include "listx.h" instead of <list>. 

    #include <ctime>
    #include <iostream>
    #include "listx.h"     // copy of <list>

    typedef unsigned long long uint64_t;

    static clock_t ctTimeStart;             // clock values
    static clock_t ctTimeStop;

    #define COUNT (4*1024*1024-1)   // number of values to sort

    int main(void)
    {
    uint64_t r = COUNT;             // random number

        std::list<uint64_t> ll(COUNT);
        std::list<uint64_t>::iterator it;
        // fill list with random values
        for(it = ll.begin(); it != ll.end(); it++){
            r  = (((uint64_t)((rand()>>4) & 0xff)) << 0);
            r += (((uint64_t)((rand()>>4) & 0xff)) << 8);
            r += (((uint64_t)((rand()>>4) & 0xff)) <<16);
            r += (((uint64_t)((rand()>>4) & 0xff)) <<24);
            r += (((uint64_t)((rand()>>4) & 0xff)) <<32);
            r += (((uint64_t)((rand()>>4) & 0xff)) <<40);
            r += (((uint64_t)((rand()>>4) & 0xff)) <<48);
            r += (((uint64_t)((rand()>>4) & 0xff)) <<56);
            *it = r;
        }

        ctTimeStart = clock();
        ll.sort();
        ctTimeStop = clock();
        std::cout << "# of ticks " << ctTimeStop - ctTimeStart << std::endl;

        it = ll.begin();
        r = *it;
        while (++it != ll.end()) {
            if (r > *it) {
                std::cout << "error" << std::endl;
                break;
            }
            r = *it;
        }

        // refill scrambled list with random values
        for (it = ll.begin(); it != ll.end(); it++) {
            r =  (((uint64_t)((rand() >> 4) & 0xff)) <<  0);
            r += (((uint64_t)((rand() >> 4) & 0xff)) <<  8);
            r += (((uint64_t)((rand() >> 4) & 0xff)) << 16);
            r += (((uint64_t)((rand() >> 4) & 0xff)) << 24);
            r += (((uint64_t)((rand() >> 4) & 0xff)) << 32);
            r += (((uint64_t)((rand() >> 4) & 0xff)) << 40);
            r += (((uint64_t)((rand() >> 4) & 0xff)) << 48);
            r += (((uint64_t)((rand() >> 4) & 0xff)) << 56);
            *it = r;
        }

        ctTimeStart = clock();
        ll.sort();
        ctTimeStop = clock();
        std::cout << "# of ticks " << ctTimeStop - ctTimeStart << std::endl;

        it = ll.begin();
        r = *it;
        while (++it != ll.end()) {
            if (r > *it) {
                std::cout << "error" << std::endl;
                break;
            }
            r = *it;
        }

        return(0);
    }

    • Edited by rcgldrc Friday, May 10, 2019 1:52 AM
    Friday, May 10, 2019 1:11 AM

All replies