Home > OS >  pipeline stall optimize :: no branch programming
pipeline stall optimize :: no branch programming

Time:09-27

I've been studying pipeline stalls caused by branch mispredictions, so I rewrote some of my code to avoid the stalls and run faster. But I can't tell whether this optimization really matters or makes things worse. I don't know much about assembly or CPUs.

I've added the disassembly of my code below. So, am I optimizing the program correctly? Is it faster than before? If I optimize code like this, what problems could it cause?

// before
switch (i - '0')
{
    case 0: a.f1(); break;
    case 1: a.f2(); break;
    case 2: a.f3(); break;
    case 3: a.f4(); break;
}

///asm with 12 cases
switch (i - '0')
00007FF620461434  movsx       ecx,byte ptr [rax]  
00007FF620461437  add         ecx,0FFFFFFD0h            
00007FF62046143A  cmp         ecx,0Bh                   
00007FF62046143D  ja          main+185h (07FF6204614D5h)    
00007FF620461443  movsxd      rcx,ecx                   
00007FF620461446  mov         edx,dword ptr [r11+rcx*4+1614h]       
00007FF62046144E  add         rdx,r11                   
00007FF620461451  jmp         rdx                   


// asm with 4 cases
    64:             switch (i - '0')
00007FF6927413A5  movsx       eax,byte ptr [rdx]  
00007FF6927413A8  sub         eax,30h  
00007FF6927413AB  je          main+110h (07FF6927413E0h)  
00007FF6927413AD  sub         eax,1  
00007FF6927413B0  je          main+104h (07FF6927413D4h)  
00007FF6927413B2  sub         eax,1  
00007FF6927413B5  je          main+0F8h (07FF6927413C8h)  
00007FF6927413B7  cmp         eax,1  
00007FF6927413BA  jne         main+11Ah (07FF6927413EAh)  
    69:             case 3: a.f4(); break;
00007FF6927413BC  lea         rcx,[a]  
00007FF6927413C1  call        OBJ::f4 (07FF6927412C0h)  
00007FF6927413C6  jmp         main+11Ah (07FF6927413EAh)  
    68:             case 2: a.f3(); break;
00007FF6927413C8  lea         rcx,[a]  
00007FF6927413CD  call        OBJ::f3 (07FF6927412B0h)  
00007FF6927413D2  jmp         main+11Ah (07FF6927413EAh)  
    67:             case 1: a.f2(); break;
00007FF6927413D4  lea         rcx,[a]  
00007FF6927413D9  call        OBJ::f2 (07FF6927412A0h)  
00007FF6927413DE  jmp         main+11Ah (07FF6927413EAh)  
    65:             {
    66:             case 0: a.f1(); break;
00007FF6927413E0  lea         rcx,[a]  
00007FF6927413E5  call        OBJ::f1 (07FF692741290h)
//after
static decltype(&OBJ::f1) func[4] = { &OBJ::f1, &OBJ::f2, &OBJ::f3, &OBJ::f4 };
(a.*func[i - '0'])();


// asm
    61:             static decltype(&OBJ::f1) func[4] = { &OBJ::f1, &OBJ::f2, &OBJ::f3, &OBJ::f4 };
    62:             (a.*func[i - '0'])();
00007FF71D7213B9  movsx       rax,byte ptr [rbx]  
00007FF71D7213BD  lea         rcx,[a]  
00007FF71D7213C2  call        qword ptr [r13+rax*8-180h] 

I'm using MSVC. This code is in the main loop. Below is my test code; the input is:

12031023012031020310230120301203012031020310231023012030123012030120301230203323030200201023022210100123002010100120310230120310203102301203012030120310203102310230120301230120301203012302033230302002010230222101001230020101001203102301203102031023012030120301203102031023102301203012301203012030123020332303020020102302221010012300201010012031023012031020310230120301203012031020310231023012030123012030120301230203323030200201023022210100123002010100120310230120310203102301203012030120310203102310230120301230120301203012302033230302002010230222101001230020101001203102301203102031023012030120301203102031023102301203012301203012030123020332303020020102302221010012300201010012031023012031020310230120301203012031020310231023012030123012030120301230203323030200201023022210100123002010100120310230120310203102301203012030120310203102310230120301230120301203012302033230302002010230222101001230020101001203102301203102031023012030120301203102031023102301203012301203012030123020332303020020102302221010012300201010012031023012031020310230120301203012031020310231023012030123012030120301230203323030200201023022210100123002010100120310230120310203102301203012030120310203102310230120301230120301203012302033230302002010230222101001230020101001203102301203102031023012030120301203102031023102301203012301203012030123020332303020020102302221010012300201010012031023012031020310230120301203012031020310231023012030123012030120301230203323030200201023022210100123002010100120310230120310203102301203012030120310203102310230120301230120301203012302033230302002010230222101001230020101001203102301203102031023012030120301203102031023102301203012301203012030123020332303020020102302221010012300201010012031023012031020310230120301203012031020310231023012030123012030120301230203323030200201023022210100123002010100120310230120310203102301203012030120310203102310230120301230120301203012302033230302002010230222101001230020101001203102301203102031023012030120301203102031023102301203012301203012030123020332
30302002010230222101001230020101001203102301203102031023012030120301203102031023102301203012301203012030123020332303020020102302221010012300201010012031023012031020310230120301203012031020310231023012030123012030120301230203323030200201023022210100123002010100120310230120310203102301203012030120310203102310230120301230120301203012302033230302002010230222101001230020101001203102301203102031023012030120301203102031023102301203012301203012030123020332303020020102302221010012300201010012031023012031020310230120301203012031020310231023012030123012030120301230203323030200201023022210100123002010100120310230120310203102301203012030120310203102310230120301230120301203012302033230302002010230222101001230020101001203102301203102031023012030120301203102031023102301203012301203012030123020332303020020102302221010012300201010012031023012031020310230120301203012031020310231023012030123012030120301230203323030200201023022210100123002010100120310230120310203102301203012030120310203102310230120301230120301203012302033230302002010230222101001230020101001203102301203102031023012030120301203102031023102301203012301203012030123020332303020020102302221010012300201010012031023012031020310230120301203012031020310231023012030123012030120301230203323030200201023022210100123002010100120310230120310203102301203012030120310203102310230120301230120301203012302033230302002010230222101001230020101001203102301203102031023012030120301203102031023102301203012301203012030123020332303020020102302221010012300201010012031023012031020310230120301203012031020310231023012030123012030120301230203323030200201023022210100123002010100120310230120310203102301203012030120310203102310230120301230120301203012302033230302002010230222101001230020101001203102301203102031023012030120301203102031023102301203012301203012030123020332303020020102302221010012300201010012031023012031020310230120301203012031020310231023012030123012030120301230203323030200201023022210100123002010100120310230120310203102301203012030120310203102
31023012030123012030120301230203323030200201023022210100123002010100120310230120310203102301203012030120310203102310230120301230120301203012302033230302002010230222101001230020101001203102301203102031023012030120301203102031023102301203012301203012030123020332303020020102302221010012300201010012031023012031020310230120301203012031020310231023012030123012030120301230203323030200201023022210100123002010100120310230120310203102301203012030120310203102310230120301230120301203012302033230302002010230222101001230020101001203102301203102031023012030120301203102031023102301203012301203012030123020332303020020102302221010012300201010012031023012031020310230120301203012031020310231023012030123012030120301230203323030200201023022210100123002010100120310230120310203102301203012030120310203102310230120301230120301203012302033230302002010230222101001230020101001203102301203102031023012030120301203102031023102301203012301203012030123020332303020020102302221010012300201010012031023012031020310230120301203012031020310231023012030123012030120301230203323030200201023022210100123002010100120310230120310203102301203012030120310203102310230120301230120301203012302033230302002010230222101001230020101001203102301203102031023012030120301203102031023102301203012301203012030123020332303020020102302221010012300201010012031023012031020310230120301203012031020310231023012030123012030120301230203323030200201023022210100123002010100120310230120310203102301203012030120310203102310230120301230120301203012302033230302002010230222101001230020101001203102301203102031023012030120301203102031023102301203012301203012030123020332303020020102302221010012300201010012031023012031020310230120301203012031020310231023012030123012030120301230203323030200201023022210100123002010100120310230120310203102301203012030120310203102310230120301230120301203012302033230302002010230222101001230020101001203102301203102031023012030120301203102031023102301203012301203012030123020332303020020102302221010012300201010012031023012
03102031023012030120301203102031023102301203012301203012030123020332303020020102302221010012300201010012031023012031020310230120301203012031020310231023012030123012030120301230203323030200201023022210100123002010100120310230120310203102301203012030120310203102310230120301230120301203012302033230302002010230222101001230020101001203102301203102031023012030120301203102031023102301203012301203012030123020332303020020102302221010012300201010012031023012031020310230120301203012031020310231023012030123012030120301230203323030200201023022210100123002010100120310230120310203102301203012030120310203102310230120301230120301203012302033230302002010230222101001230020101001203102301203102031023012030120301203102031023102301203012301203012030123020332303020020102302221010012300201010012031023012031020310230120301203012031020310231023012030123012030120301230203323030200201023022210100123002010100120310230120310203102301203012030120310203102310230120301230120301203012302033230302002010230222101001230020101001203102301203102031023012030120301203102031023102301203012301203012030123020332303020020102302221010012300201010012031023012031020310230120301203012031020310231023012030123012030120301230203323030200201023022210100123002010100120310230120310203102301203012030120310203102310230120301230120301203012302033230302002010230222101001230020101001203102301203102031023012030120301203102031023102301203012301203012030123020332303020020102302221010012300201010012031023012031020310230120301203012031020310231023012030123012030120301230203323030200201023022210100123002010100120310230120310203102301203012030120310203102310230120301230120301203012302033230302002010230222101001230020101001203102301203102031023012030120301203102031023102301203012301203012030123020332303020020102302221010012300201010012031023012031020310230120301203012031020310231023012030123012030120301230203323030200201023022210100123002010100120310230120310203102301203012030120310203102310230120301230120301203012302033230302002010
230222101001230020101001203102301203102031023012030120301203102031023102301203012301203012030123020332303020020102302221010012300201010012031023012031020310230120301203012031020310231023012030123012030120301230203323030200201023022210100123002010100120310230120310203102301203012030120310203102310230120301230120301203012302033230302002010230222101001230020101001203102301203102031023012030120301203102031023102301203012301203012030123020332303020020102302221010012300201010012031023012031020310230120301203012031020310231023012030123012030120301230203323030200201023022210100123002010100120310230120310203102301203012030120310203102310230120301230120301203012302033230302002010230222101001230020101001203102301203102031023012030120301203102031023102301203012301203012030123020332303020020102302221010012300201010012031023012031020310230120301203012031020310231023012030123012030120301230203323030200201023022210100123002010100120310230120310203102301203012030120310203102310230120301230120301203012302033230302002010230222101001230020101001203102301203102031023012030120301203102031023102301203012301203012030123020332303020020102302221010012300201010012031023012031020310230120301203012031020310231023012030123012030120301230203323030200201023022210100123002010100120310230120310203102301203012030120310203102310230120301230120301203012302033230302002010230222101001230020101001203102301203102031023012030120301203102031023102301203012301203012030123020332303020020102302221010012300201010012031023012031020310230120301203012031020310231023012030123012030120301230203323030200201023022210100123002010100
#include <chrono>
#include <iostream>
#include <string>
#include <string_view>

using clk = std::chrono::high_resolution_clock;
using namespace std::chrono;
using namespace std::literals::string_view_literals;

namespace timer {
    // Clock reading captured by the most recent start(); end() measures against it.
    static clk::time_point StopWatch;

    /// Record the current clock reading as the beginning of the timed interval.
    inline void start() { StopWatch = clk::now(); }

    /// Print "<mess> : <elapsed>" followed by a newline to std::cout, where
    /// <elapsed> is the time since the last start(), converted to whole
    /// milliseconds. The clock is sampled before any output is written.
    inline void end(const std::string_view mess = ""sv)
    {
        const auto elapsed = duration_cast<milliseconds>(clk::now() - StopWatch);
        std::cout << mess << " : " << elapsed << '\n';
    }
}

// control //
// Build-time switches for the experiment:
//   noBranch - main() dispatches through a pointer-to-member table instead of a switch.
//   noInline - OBJ::f1..f4 are marked "never inline" so each dispatch is a real call.
#define noBranch
#define noInline
// control //


#ifdef noInline
// __declspec(noinline) is MSVC-specific; use the GCC/Clang attribute on those
// compilers and expand to nothing on anything else so the file still builds.
#  if defined(_MSC_VER)
#    define INLINE __declspec(noinline)
#  elif defined(__GNUC__) || defined(__clang__)
#    define INLINE __attribute__((noinline))
#  else
#    define INLINE
#  endif
#else
#  define INLINE
#endif

/// Minimal call target for the dispatch benchmark: each member function adds a
/// distinct constant to `x`, so the caller can observe which one actually ran.
class OBJ {
public:
    size_t x = 0;

    // The operators are `+=`: the `+` had been stripped from the listing
    // (leaving `x  = 13`). For a freshly constructed object (x == 0) the
    // observable result is the same either way.
    INLINE void f1() {
        x += 13;
    }
    INLINE void f2() {
        x += 23;
    }
    INLINE void f3() {
        x += 18;
    }
    INLINE void f4() {
        x += 15;
    }
};

/// Benchmark driver.
///
/// Reads one whitespace-delimited token from stdin, then replays it 1'000'000
/// times: for every character a fresh OBJ is created, the member function
/// selected by (character - '0') is called on it, and the resulting `x` is
/// added to a running total. Prints the total, then the elapsed time through
/// timer::end().
///
/// With noBranch defined, returns 1 before timing starts if the token contains
/// a character outside '0'..'3', because such a character would index past the
/// end of the dispatch table.
int main()
{
    size_t sum = 0;
    std::string in;
    std::cin >> in;

#ifdef noBranch
    // Validate once, outside the timed region, so the hot loop stays free of
    // range checks while the table lookup below can never go out of bounds.
    for (const char c : in) {
        if (c < '0' || c > '3') {
            std::cerr << "input must consist only of the digits 0-3\n";
            return 1;
        }
    }
#endif

    timer::start();
    // `++q` restored: with an empty increment expression this loop never ends.
    for (size_t q = 0; q < 1'000'000; ++q) {
        for (const auto i : in) {
            OBJ a;
#ifdef noBranch
            // Table of the four targets, indexed by digit value.
            static decltype(&OBJ::f1) func[4] = { &OBJ::f1, &OBJ::f2, &OBJ::f3, &OBJ::f4 };
            (a.*func[i - '0'])();
#else
            switch (i - '0')
            {
            case 0: a.f1(); break;
            case 1: a.f2(); break;
            case 2: a.f3(); break;
            case 3: a.f4(); break;
            }
#endif
            // Accumulate (not overwrite) so every call contributes to the
            // printed result and cannot be discarded as dead code.
            sum += a.x;
        }
    }
    std::cout << "sum" << sum << std::endl;
    timer::end();
}

CodePudding user response:

A call through a pointer is also a branch, so branch misprediction applies here as well.

A C++ pointer-to-member-function is a complex construct that is hard for the compiler to optimize.

A lookup table would have given you truly branchless code only if you looked up data, not code, and made no branches based on that data.

  • Related