代码之家 › 专栏 › 技术社区 › halivingston

通过avx指令对间接访问进行矢量化

avx512 simd compiler-optimization vectorization c++

halivingston · 技术社区 · 6 年前

我最近被介绍到矢量指令(理论上)并且对如何使用它们来加速我的应用程序感到兴奋。

我想改进的一个方面是一个非常热的循环:

__declspec(noinline) void pleaseVectorize(int* arr, int* someGlobalArray, int* output)
{
    for (int i = 0; i < 16; ++i)
    {
        auto someIndex = arr[i];
        output[i] = someGlobalArray[someIndex];
    }

    for (int i = 0; i < 16; ++i)
    {
         if (output[i] == 1)
         {
             return i;
         }
    }

    return -1;
}

当然,三大编译器(msvc、gcc、clang)都拒绝将其矢量化。我能理解为什么,但我想得到确认。

如果我必须手动将其矢量化,它将是:

(1)vectorload“arr”,这会带来16个4字节的整数,比如说zmm0

(2)从ZMM0(0…3)指向的地址中的16个内存负载进入ZMM1(0…3),从ZMM0(4…7)指向的地址加载到ZMM1[4…7 ]等等。

(3)比较ZMM0和ZMM1

(4)矢量popcnt进入输出,找出最有效位,基本上除以8得到匹配的索引。

首先,向量指令能做这些事情吗?像他们这样做“收集”操作,即从地址指向ZMM0的负载?

以下是Clang产生的结果:

0000000000400530 <_Z5superPiS_S_>:
  400530:       48 63 07                movslq (%rdi),%rax
  400533:       8b 04 86                mov    (%rsi,%rax,4),%eax
  400536:       89 02                   mov    %eax,(%rdx)
  400538:       48 63 47 04             movslq 0x4(%rdi),%rax
  40053c:       8b 04 86                mov    (%rsi,%rax,4),%eax
  40053f:       89 42 04                mov    %eax,0x4(%rdx)
  400542:       48 63 47 08             movslq 0x8(%rdi),%rax
  400546:       8b 04 86                mov    (%rsi,%rax,4),%eax
  400549:       89 42 08                mov    %eax,0x8(%rdx)
  40054c:       48 63 47 0c             movslq 0xc(%rdi),%rax
  400550:       8b 04 86                mov    (%rsi,%rax,4),%eax
  400553:       89 42 0c                mov    %eax,0xc(%rdx)
  400556:       48 63 47 10             movslq 0x10(%rdi),%rax
  40055a:       8b 04 86                mov    (%rsi,%rax,4),%eax
  40055d:       89 42 10                mov    %eax,0x10(%rdx)
  400560:       48 63 47 14             movslq 0x14(%rdi),%rax
  400564:       8b 04 86                mov    (%rsi,%rax,4),%eax
  400567:       89 42 14                mov    %eax,0x14(%rdx)
  40056a:       48 63 47 18             movslq 0x18(%rdi),%rax
  40056e:       8b 04 86                mov    (%rsi,%rax,4),%eax
  400571:       89 42 18                mov    %eax,0x18(%rdx)
  400574:       48 63 47 1c             movslq 0x1c(%rdi),%rax
  400578:       8b 04 86                mov    (%rsi,%rax,4),%eax
  40057b:       89 42 1c                mov    %eax,0x1c(%rdx)
  40057e:       48 63 47 20             movslq 0x20(%rdi),%rax
  400582:       8b 04 86                mov    (%rsi,%rax,4),%eax
  400585:       89 42 20                mov    %eax,0x20(%rdx)
  400588:       48 63 47 24             movslq 0x24(%rdi),%rax
  40058c:       8b 04 86                mov    (%rsi,%rax,4),%eax
  40058f:       89 42 24                mov    %eax,0x24(%rdx)
  400592:       48 63 47 28             movslq 0x28(%rdi),%rax
  400596:       8b 04 86                mov    (%rsi,%rax,4),%eax
  400599:       89 42 28                mov    %eax,0x28(%rdx)
  40059c:       48 63 47 2c             movslq 0x2c(%rdi),%rax
  4005a0:       8b 04 86                mov    (%rsi,%rax,4),%eax
  4005a3:       89 42 2c                mov    %eax,0x2c(%rdx)
  4005a6:       48 63 47 30             movslq 0x30(%rdi),%rax
  4005aa:       8b 04 86                mov    (%rsi,%rax,4),%eax
  4005ad:       89 42 30                mov    %eax,0x30(%rdx)
  4005b0:       48 63 47 34             movslq 0x34(%rdi),%rax
  4005b4:       8b 04 86                mov    (%rsi,%rax,4),%eax
  4005b7:       89 42 34                mov    %eax,0x34(%rdx)
  4005ba:       48 63 47 38             movslq 0x38(%rdi),%rax
  4005be:       8b 04 86                mov    (%rsi,%rax,4),%eax
  4005c1:       89 42 38                mov    %eax,0x38(%rdx)
  4005c4:       48 63 47 3c             movslq 0x3c(%rdi),%rax
  4005c8:       8b 04 86                mov    (%rsi,%rax,4),%eax
  4005cb:       89 42 3c                mov    %eax,0x3c(%rdx)
  4005ce:       c3                      retq
  4005cf:       90                      nop

1 回复 | 直到 6 年前

Peter Cordes Steve Bohrer 6 年前

你对它如何工作的想法很接近,只是你想 bit-scan / find-first-set-bit (x86 bsf或 TZCNT )比较位图的,而不是填充计数( 数位集)。

AVX2/AVX512有 vpgatherdd 它确实使用了一个有符号的32位标度索引向量。它几乎不值得在哈斯韦尔使用,在布罗德韦尔改进,在天空湖非常好。( http://agner.org/optimize/ ,并查看中的其他链接 the x86 tag wiki ,比如英特尔的优化手册,它有一个关于收集性能的章节。SIMD比较和bitscan比较便宜;单UOP和完全流水线。

GCC8.1可以自动矢量化你的聚集, 如果它可以证明你的输入没有重叠 output 函数ARG . 有时内联后可能,但对于非内嵌版本,您可以承诺这一点 int * __restrict output . 或者如果你 输出 本地临时参数,而不是函数arg。(一般规则:通过非 _restrict 指针通常会禁止自动矢量化,特别是当它是 char* 它可以别名任何东西。)

gcc和clang从不将搜索循环矢量化;只有在进入循环之前可以计算行程计数的循环 . 但是 ICC CAN ;它执行标量收集并存储结果(即使 output[] 是本地的所以不是有要做到这一点,作为运行该函数的副作用),然后使用simd压缩比较+位扫描。

Compiler output for a __restrict version . 请注意,在为Skylake-AVX512进行调整时,GCC8.1和ICC默认避免512位向量。512位向量可以限制最大turbo,并且总是在端口1上的向量alu处于管道中时将其关闭,因此在该函数只是大型程序的一小部分的情况下,使用avx512或avx2与256位向量一起使用是有意义的。(编译器不知道这个函数在程序中是超热的。)

如果 输出[ ] 一个本地的、更好的代码Gen策略可能会在收集时进行比较,所以早期的点击跳过剩余的负载。完全标量(CLAN和MSVC)的编译器都错过了这一优化。事实上,它们甚至存储在本地数组中,即使CLAN大多不重读(保持寄存器中的结果)。在第一个循环中编写带有compare的源代码可以获得更好的标量代码。(取决于聚集的缓存未命中与非simd搜索的分支预测失误,标量可能是一个好的策略。尤其是如果前几个元素中的命中率是常见的。当前的采集硬件不能利用来自同一高速缓存行的多个元素,因此硬限制仍然是每个时钟周期加载的2个元素。但是,如果您的数据主要在缓存中处于热状态,则使用宽向量加载来为索引提供聚集将显著降低加载端口/缓存访问压力。)

编译程序能够已经自动矢量化了 限制,限制 你的代码版本是这样的。(gcc管理收集部分,icc管理simd比较部分)

;; Windows x64 calling convention: rcx,rdx, r8,r9
; but of course you'd actually inline this
; only uses ZMM16..31, so vzeroupper not required

vmovdqu32   zmm16, [rcx/arr]   ; You def. want to reach an alignment boundary if you can for ZMM loads, vmovdqa32 will enforce that

kxnorw      k1, k0,k0      ; k1 = -1.  k0 false dep is likely not a problem.
  ; optional: vpxord  xmm17, xmm17, xmm17   ; break merge-masking false dep
vpgatherdd  zmm17{k1}, [rdx + zmm16 * 4]    ; GlobalArray + scaled-vector-index
; sets k1 = 0 when done

vmovdqu32   [r8/output], zmm17

vpcmpd      k1, zmm17, zmm31, 0    ; 0->EQ.  Outside the loop, do zmm31=set1_epi32(1)
                                   ; k1 = compare bitmap
kortestw    k1, k1
jz         .not_found      ; early check for not-found

kmovw       edx, k1

           ; tzcnt doesn't have a false dep on the output on Skylake
           ; so no AVX512 CPUs need to worry about that HSW/BDW issue
tzcnt       eax, edx       ; bit-scan for the first (lowest-address) set element
                           ; input=0 produces output=32
      ; or avoid the branch and let 32 be the not-found return value.
      ; or do a branchless kortestw / cmov if -1 is directly useful without branching
ret

.not_found:
   mov eax, -1
   ret

你可以自己用内部函数来实现 :

英特尔指令集参考手册(HTML摘录 http://felixcloutier.com/x86/index.html )包括每个指令的C/C++内部名称,或在 https://software.intel.com/sites/landingpage/IntrinsicsGuide/

我改变了 输出 类型到 __m512i . 如果不手动对调用方进行矢量化,则可以将其改回数组。 你 一定地 希望此函数内联。

#include <immintrin.h>

//__declspec(noinline)  // I *hope* this was just to see the stand-alone asm version
                        // but it means the output array can't optimize away at all

//static inline
int find_first_1(const int *__restrict arr, const int *__restrict someGlobalArray, __m512i *__restrict output)
{
    __m512i vindex = _mm512_load_si512(arr);
    __m512i gather = _mm512_i32gather_epi32(vindex, someGlobalArray, 4);  // indexing by 4-byte int
    *output = gather;  

    __mmask16 cmp = _mm512_cmpeq_epi32_mask(gather, _mm512_set1_epi32(1));
       // Intrinsics make masks freely convert to integer
       // even though it costs a `kmov` instruction either way.
    int onepos =  _tzcnt_u32(cmp);
    if (onepos >= 16){
        return -1;
    }
    return onepos;
}

所有4个x86编译器都生成与我建议的类似的asm( see it on the Godbolt compiler explorer 当然,他们必须实际实现。 set1_epi32(1) 向量常量,或使用(广播)内存操作数。Clang实际上使用了 {1to16} 用于比较的常数的广播负载: vpcmpeqd k0, zmm1, dword ptr [rip + .LCPI0_0]{1to16} . (当然,当内联到一个循环中时,它们会做出不同的选择。) mov eax,1 / vpbroadcastd zmm0, eax .

GCC81-O3-三月=SkyLAK-AVX512有两个冗余 mov eax, -1 说明:一个给一个 kmov 对于集合,另一个用于返回值的东西。愚蠢的编译器应该保持它的周围,并使用不同的寄存器 1 .

它们都使用zmm0..15,因此无法避免 vzeroupper . (xmm16.31不能用遗留的sse访问,所以 the SSE/AVX transition penalty problem that vzeroupper solves 如果你使用的唯一向量寄存器是y/ZMM16…31,则不存在。VZououpPER可能仍然有很小的可能优势,比如当YMM或ZMM RGS的上半部已知为零时,更便宜的上下文切换。 Is it useful to use VZEROUPPER if your program+libraries contain no SSE instructions? )如果你打算使用它,没有理由避免xmm0..15。

哦,在windows调用约定中,xmm6..15是保留的调用。(不是ymm/zmm,只是低128位),所以如果xmm0..5 regs用完了,zmm16..31是一个不错的选择。