
Pycuda: best way to perform a batch of matrix inversions (CUBLAS or MAGMA)

  • youpilat13 Ty Petrice · asked 6 years ago

    I am looking for a solution to perform matrix inversions from Python. I think CUBLAS or MAGMA should offer a way to perform these operations in a batched or concurrent mode, since each matrix is independent of all the others.

    I think the computation described here should be an ideal candidate for a GPU.

    The domain is discretized as (integ_prec, integ_prec), and the kernel performs a 4x4 matrix inversion for each given global item, as sketched just below.
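
    For illustration, here is a minimal numpy sketch of the workload I have in mind (the value of integ_prec and the random test matrices are placeholders):

    import numpy as np

    integ_prec = 64                                    # placeholder grid resolution
    # one independent 4x4 matrix per (i, j) item of the integration grid
    mats = np.random.rand(integ_prec, integ_prec, 4, 4).astype(np.float32)
    mats += 4.0 * np.eye(4, dtype=np.float32)          # keep the matrices well conditioned
    # CPU reference: numpy can invert the whole stack of matrices at once
    invs = np.linalg.inv(mats.reshape(-1, 4, 4)).reshape(mats.shape)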

    I also tried the batch_solver code provided by NVIDIA developers, but I could not get it to work.

    UPDATE 1

    Compiling the code from the NVIDIA developers (version BatchedSolver_v1_1) produces the following warnings:

    $ make
    nvcc -O3  -arch=sm_35 -DKEPLER2  -o example_batch_solver example.c solve.cu inverse.cu
    In file included from solve.cu:41:
    ./operations.h:31:2: warning: 'OPERATIONS_H_' is used as a header guard here, followed by #define of a different macro [-Wheader-guard]
    #if !defined(OPERATIONS_H_)
     ^~
    ./operations.h:32:9: note: 'OPERATIONS_SOLVE_H_' is defined here; did you mean 'OPERATIONS_H_'?
    #define OPERATIONS_SOLVE_H_
            ^~~~~~~~~~~~~~~~~~~
            OPERATIONS_H_
    1 warning generated.
    In file included from solve.cu:41:
    ./operations.h:31:2: warning: 'OPERATIONS_H_' is used as a header guard here, followed by #define of a different macro [-Wheader-guard]
    #if !defined(OPERATIONS_H_)
     ^~
    ./operations.h:32:9: note: 'OPERATIONS_SOLVE_H_' is defined here; did you mean 'OPERATIONS_H_'?
    #define OPERATIONS_SOLVE_H_
            ^~~~~~~~~~~~~~~~~~~
            OPERATIONS_H_
    1 warning generated.
    In file included from inverse.cu:44:
    ./operations.h:31:2: warning: 'OPERATIONS_H_' is used as a header guard here, followed by #define of a different macro [-Wheader-guard]
    #if !defined(OPERATIONS_H_)
     ^~
    ./operations.h:32:9: note: 'OPERATIONS_SOLVE_H_' is defined here; did you mean 'OPERATIONS_H_'?
    #define OPERATIONS_SOLVE_H_
            ^~~~~~~~~~~~~~~~~~~
            OPERATIONS_H_
    1 warning generated.
    
    In file included from inverse.cu:44:
    ./operations.h:31:2: warning: 'OPERATIONS_H_' is used as a header guard here, followed by #define of a different macro [-Wheader-guard]
    #if !defined(OPERATIONS_H_)
     ^~
    ./operations.h:32:9: note: 'OPERATIONS_SOLVE_H_' is defined here; did you mean 'OPERATIONS_H_'?
    #define OPERATIONS_SOLVE_H_
            ^~~~~~~~~~~~~~~~~~~
            OPERATIONS_H_
    1 warning generated.
    

    Unfortunately, the execution results are not good:

    Non-batched matrix inversion
    
            3.000000   1.000000   1.000000             nan  -19945373249087470322107824313046586886748897396355850773313316907920980812816123986073723926411981165664742747916855157931798956499818437291518879567207778108249202114071816066955302634366146096749274721347289725502062211559628338200162202651585616465674552041292175081655027073691104118308864.000000  -25949369271932562088528097628985580835309378491979298170251656488819244813241392783541154149164125403081303093429316785499097407170772831834462454013755392.000000
    etc ...
    

    So, to get rid of these warnings, I replaced the macro OPERATIONS_SOLVE_H_ with OPERATIONS_H_ in the operations.h file. The warnings no longer appear during compilation, but execution still produces wrong results (same as above).

    I am running this Batchsolver on MacOS 10.13.5 with NVIDIA driver 387.10.10.10.35.106.

  •  Robert Crovella · answered 5 years ago

    As mentioned in the comments, numpy functions generally cannot be used from pycuda kernel code (or CUDA kernel code, or numba CUDA kernels).

    CUBLAS provides a batched matrix inversion function, but it is not currently exposed in either the pyculib cublas interface or the scikit-cuda cublas interface.

    We could proceed to implement our own interface to it (for example, using python ctypes), as referenced here; a rough sketch of that approach follows.
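
    What follows is a minimal, untested sketch of such a ctypes wrapper, assuming a Linux-style libcublas.so on the loader path and float32 matrices stored contiguously, 16 values per matrix. It illustrates the approach rather than being production code:

    import ctypes
    import numpy as np
    import pycuda.autoinit
    import pycuda.driver as cuda

    _libcublas = ctypes.cdll.LoadLibrary("libcublas.so")   # assumption: library is on the loader path

    def matinv_batched_4x4(h_mats):
        """Invert a stack of 4x4 float32 matrices via cublasSmatinvBatched."""
        n_mat = h_mats.shape[0]
        a_flat = np.ascontiguousarray(h_mats, dtype=np.float32)
        d_a = cuda.mem_alloc(a_flat.nbytes)
        d_ainv = cuda.mem_alloc(a_flat.nbytes)
        cuda.memcpy_htod(d_a, a_flat)
        # cublas expects device arrays of device pointers, one pointer per 4x4 matrix (64 bytes each)
        ptrs_a = np.array([int(d_a) + 64 * i for i in range(n_mat)], dtype=np.uint64)
        ptrs_ainv = np.array([int(d_ainv) + 64 * i for i in range(n_mat)], dtype=np.uint64)
        d_ptrs_a = cuda.mem_alloc(ptrs_a.nbytes)
        d_ptrs_ainv = cuda.mem_alloc(ptrs_ainv.nbytes)
        cuda.memcpy_htod(d_ptrs_a, ptrs_a)
        cuda.memcpy_htod(d_ptrs_ainv, ptrs_ainv)
        d_info = cuda.mem_alloc(4 * n_mat)
        handle = ctypes.c_void_p()
        _libcublas.cublasCreate_v2(ctypes.byref(handle))
        # note: for inversion, row-major input simply yields row-major output (inv(A^T) == inv(A)^T)
        status = _libcublas.cublasSmatinvBatched(
            handle, ctypes.c_int(4),
            ctypes.c_void_p(int(d_ptrs_a)), ctypes.c_int(4),
            ctypes.c_void_p(int(d_ptrs_ainv)), ctypes.c_int(4),
            ctypes.c_void_p(int(d_info)), ctypes.c_int(n_mat))
        if status != 0:
            raise RuntimeError("cublasSmatinvBatched returned status %d" % status)
        out = np.empty_like(a_flat)
        cuda.memcpy_dtoh(out, d_ainv)
        _libcublas.cublasDestroy_v2(handle)
        return out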

    What follows is first a realization of this in CUDA. The function inv4x4 is an adaptation of the previous code, allotting 16 threads per matrix (one per matrix element) and using that code as a model. Each thread is responsible for computing one element of the result matrix. First we compare it against CUBLAS matinvBatched for performance:

    $ cat t411.cu
    #include <iostream>
    #include <cublas_v2.h>
    #include <cstdlib>
    // 4x4 matrix inversion
    // https://stackoverflow.com/questions/1148309/inverting-a-4x4-matrix
    
    // assumes warp size is 32
    // assumes block size is multiple of warp size
    // therefore assumes number of matrices to be inverted (n) is even
    // 16 threads per matrix to invert
    
    const unsigned block_size = 256;
    typedef float mt;
    
    #include <time.h>
    #include <sys/time.h>
    #define USECPSEC 1000000ULL
    
    long long dtime_usec(unsigned long long start){
    
      timeval tv;
      gettimeofday(&tv, 0);
      return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
    }
    
    __device__ unsigned pat[3][16];
    const unsigned hpat[3][16] = {
    { 0x0EB51FA5, 0x1EB10FA1, 0x0E711F61, 0x1A710B61, 0x1EB40FA4, 0x0EB01FA0, 0x1E700F60, 0x0A701B60, 0x0DB41F94, 0x1DB00F90, 0x0D701F50, 0x19700B50, 0x1DA40E94, 0x0DA01E90, 0x1D600E50, 0x09601A50},
    { 0x1E790F69, 0x0E391F29, 0x1E350F25, 0x0A351B25, 0x0E781F68, 0x1E380F28, 0x0E341F24, 0x1A340B24, 0x1D780F58, 0x0D381F18, 0x1D340F14, 0x09341B14, 0x0D681E58, 0x1D280E18, 0x0D241E14, 0x19240A14},
    { 0x0A7D1B6D, 0x1A3D0B2D, 0x063D172D, 0x16390729, 0x1A7C0B6C, 0x0A3C1B2C, 0x163C072C, 0x06381728, 0x097C1B5C, 0x193C0B1C, 0x053C171C, 0x15380718, 0x196C0A5C, 0x092C1A1C, 0x152C061C, 0x05281618}};
    
    __device__ unsigned getoff(unsigned &off){
      unsigned ret = off & 0x0F;
      off = off >> 4;
      return ret;
    }
    
    const unsigned tmsk = 0xFFFFFFFF;
    // in-place is acceptable (i.e. out == in)
    // T = float or double only
    template <typename T>
    __global__ void inv4x4(const T * __restrict__ in, T * __restrict__ out, const size_t n){
    
      __shared__ T si[block_size];
      size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
      if (idx < n*16){
        si[threadIdx.x] = in[idx];
        unsigned lane = threadIdx.x & 15;
        unsigned sibase = threadIdx.x & 0x03F0;
        __syncwarp();
        unsigned off = pat[0][lane];
        T a,b;
        a  = si[sibase + getoff(off)];
        a *= si[sibase + getoff(off)];
        a *= si[sibase + getoff(off)];
        if (!getoff(off)) a = -a;
        b  = si[sibase + getoff(off)];
        b *= si[sibase + getoff(off)];
        b *= si[sibase + getoff(off)];
        if (getoff(off)) a += b;
        else a -=b;
        off = pat[1][lane];
        b  = si[sibase + getoff(off)];
        b *= si[sibase + getoff(off)];
        b *= si[sibase + getoff(off)];
        if (getoff(off)) a += b;
        else a -=b;
        b  = si[sibase + getoff(off)];
        b *= si[sibase + getoff(off)];
        b *= si[sibase + getoff(off)];
        if (getoff(off)) a += b;
        else a -=b;
        off = pat[2][lane];
        b  = si[sibase + getoff(off)];
        b *= si[sibase + getoff(off)];
        b *= si[sibase + getoff(off)];
        if (getoff(off)) a += b;
        else a -=b;
        b  = si[sibase + getoff(off)];
        b *= si[sibase + getoff(off)];
        b *= si[sibase + getoff(off)];
        if (getoff(off)) a += b;
        else a -=b;
        T det = si[sibase + (lane>>2)]*a;
        det += __shfl_down_sync(tmsk, det, 4, 16); // first add
        det += __shfl_down_sync(tmsk, det, 8, 16); // second add
        det =  __shfl_sync(tmsk, det, 0, 16); // broadcast
        out[idx] = a / det;
      }
    }
    
    size_t nr = 2048;
    int main(int argc, char *argv[]){
      if (argc > 1) nr = atoi(argv[1]);
    
      const mt m1[] = {1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0, 3.0, 1.0, 0.0, 1.0, 0.0, 2.0, 1.0};
      const mt i1[] = {-3.0, -0.5, 1.5, 1.0, 1.0, 0.25, -0.25, -0.5, 3.0, 0.25, -1.25, -0.5, -3.0, 0.0, 1.0, 1.0};
      const mt m2[] = {1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0};
      const mt i2[] = {1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0};
    
      mt *h_d, *d_d;
      h_d = (mt *)malloc(nr*2*16*sizeof(mt));
      cudaMalloc(&d_d, nr*2*16*sizeof(mt));
      cudaMemcpyToSymbol(pat, hpat, 3*16*sizeof(unsigned));
      for (int i = 0; i < nr; i++){
        memcpy(h_d+i*16*2, m1, sizeof(m1));
        memcpy(h_d+i*16*2+16, m2, sizeof(m2));}
      cudaMemcpy(d_d, h_d, nr*2*16*sizeof(mt), cudaMemcpyHostToDevice);
      long long t = dtime_usec(0);
      inv4x4<<<nr*2*16/block_size, block_size>>>(d_d, d_d, nr*2);
      cudaDeviceSynchronize();
      t = dtime_usec(t);
      cudaMemcpy(h_d, d_d, nr*2*16*sizeof(mt), cudaMemcpyDeviceToHost);
      for (int i = 0; i < 2; i++){
        for (int j = 0; j < 16; j++) std::cout << h_d[i*16 + j] << ",";
        std::cout << std::endl;
        for (int j = 0; j < 16; j++) std::cout << ((i==0)?i1[j]:i2[j]) << ",";
        std::cout << std::endl;}
      std::cout << "kernel time: " << t << " microseconds" << std::endl;
      cudaError_t err = cudaGetLastError();
      if (err != cudaSuccess) std::cout << cudaGetErrorString(err) << std::endl;
      //cublas
      for (int i = 0; i < nr; i++){
        memcpy(h_d+i*16*2, m1, sizeof(m1));
        memcpy(h_d+i*16*2+16, m2, sizeof(m2));}
      cudaMemcpy(d_d, h_d, nr*2*16*sizeof(mt), cudaMemcpyHostToDevice);
      cublasHandle_t h;
      cublasStatus_t cs = cublasCreate(&h);
      if (cs != CUBLAS_STATUS_SUCCESS) std::cout << "cublas create error" << std::endl;
      mt **A, **Ai, *Aid, **Ap, **Aip;
      A  = (mt **)malloc(nr*2*sizeof(mt *));
      Ai = (mt **)malloc(nr*2*sizeof(mt *));
      cudaMalloc(&Aid, nr*2*16*sizeof(mt));
      cudaMalloc(&Ap,  nr*2*sizeof(mt *));
      cudaMalloc(&Aip, nr*2*sizeof(mt *));
      for (int i = 0; i < nr*2; i++) A[i]  =  d_d + 16*i;
      for (int i = 0; i < nr*2; i++) Ai[i] =  Aid + 16*i;
      cudaMemcpy(Ap, A, nr*2*sizeof(mt *), cudaMemcpyHostToDevice);
      cudaMemcpy(Aip, Ai, nr*2*sizeof(mt *), cudaMemcpyHostToDevice);
      int *info;
      cudaMalloc(&info, nr*2*sizeof(int));
      t = dtime_usec(0);
      cs = cublasSmatinvBatched(h, 4,  Ap, 4, Aip, 4, info, nr*2);
      if (cs != CUBLAS_STATUS_SUCCESS) std::cout << "cublas matinv error" << std::endl;
      cudaDeviceSynchronize();
      t = dtime_usec(t);
      cudaMemcpy(h_d, Aid, nr*2*16*sizeof(mt), cudaMemcpyDeviceToHost);
      for (int i = 0; i < 2; i++){
        for (int j = 0; j < 16; j++) std::cout << h_d[i*16 + j] << ",";
        std::cout << std::endl;
        for (int j = 0; j < 16; j++) std::cout << ((i==0)?i1[j]:i2[j]) << ",";
        std::cout << std::endl;}
      std::cout << "cublas time: " << t << " microseconds" << std::endl;
      err = cudaGetLastError();
      if (err != cudaSuccess) std::cout << cudaGetErrorString(err) << std::endl;
      return 0;
    }
    $ nvcc -o t411 t411.cu -lcublas
    $ ./t411
    -3,-0.5,1.5,1,1,0.25,-0.25,-0.5,3,0.25,-1.25,-0.5,-3,-0,1,1,
    -3,-0.5,1.5,1,1,0.25,-0.25,-0.5,3,0.25,-1.25,-0.5,-3,0,1,1,
    1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,
    1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,
    kernel time: 49 microseconds
    -3,-0.5,1.5,1,1,0.25,-0.25,-0.5,3,0.25,-1.25,-0.5,-3,0,1,1,
    -3,-0.5,1.5,1,1,0.25,-0.25,-0.5,3,0.25,-1.25,-0.5,-3,0,1,1,
    1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,
    1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,
    cublas time: 95 microseconds
    $
    

    We can see that the code appears to provide correct results for the two test matrices inverted, and the overall time to invert 4096 matrices on a Tesla P100 is about 50us, roughly 2x faster than CUBLAS. Next, the same functionality is wrapped up for use from pycuda:

    $ cat t10.py
    import numpy as np
    import pycuda.driver as cuda
    from pycuda.compiler import SourceModule
    import pycuda.autoinit
    # kernel
    kernel = SourceModule("""
    
    __device__ unsigned getoff(unsigned &off){
      unsigned ret = off & 0x0F;
      off = off >> 4;
      return ret;
    }
    
    const int block_size = 256;
    const unsigned tmsk = 0xFFFFFFFF;
    // in-place is acceptable (i.e. out == in)
    // T = float or double only
    typedef float T;
    __global__ void inv4x4(const T * __restrict__ in, T * __restrict__ out, const size_t n, const unsigned * __restrict__ pat){
    
      __shared__ T si[block_size];
      size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
      if (idx < n*16){
        si[threadIdx.x] = in[idx];
        unsigned lane = threadIdx.x & 15;
        unsigned sibase = threadIdx.x & 0x03F0;
        __syncwarp();
        unsigned off = pat[lane];
        T a,b;
        a  = si[sibase + getoff(off)];
        a *= si[sibase + getoff(off)];
        a *= si[sibase + getoff(off)];
        if (!getoff(off)) a = -a;
        b  = si[sibase + getoff(off)];
        b *= si[sibase + getoff(off)];
        b *= si[sibase + getoff(off)];
        if (getoff(off)) a += b;
        else a -=b;
        off = pat[lane+16];
        b  = si[sibase + getoff(off)];
        b *= si[sibase + getoff(off)];
        b *= si[sibase + getoff(off)];
        if (getoff(off)) a += b;
        else a -=b;
        b  = si[sibase + getoff(off)];
        b *= si[sibase + getoff(off)];
        b *= si[sibase + getoff(off)];
        if (getoff(off)) a += b;
        else a -=b;
        off = pat[lane+32];
        b  = si[sibase + getoff(off)];
        b *= si[sibase + getoff(off)];
        b *= si[sibase + getoff(off)];
        if (getoff(off)) a += b;
        else a -=b;
        b  = si[sibase + getoff(off)];
        b *= si[sibase + getoff(off)];
        b *= si[sibase + getoff(off)];
        if (getoff(off)) a += b;
        else a -=b;
        T det = si[sibase + (lane>>2)]*a;
        det += __shfl_down_sync(tmsk, det, 4, 16); // first add
        det += __shfl_down_sync(tmsk, det, 8, 16); // second add
        det =  __shfl_sync(tmsk, det, 0, 16); // broadcast
        out[idx] = a / det;
      }
    }
    
    """)
    # python function for inverting 4x4 matrices
    # n should be an even number
    def gpuinv4x4(inp, n):
        # internal constants not to be modified
        hpat = ( 0x0EB51FA5, 0x1EB10FA1, 0x0E711F61, 0x1A710B61, 0x1EB40FA4, 0x0EB01FA0, 0x1E700F60, 0x0A701B60, 0x0DB41F94, 0x1DB00F90, 0x0D701F50, 0x19700B50, 0x1DA40E94, 0x0DA01E90, 0x1D600E50, 0x09601A50, 0x1E790F69, 0x0E391F29, 0x1E350F25, 0x0A351B25, 0x0E781F68, 0x1E380F28, 0x0E341F24, 0x1A340B24, 0x1D780F58, 0x0D381F18, 0x1D340F14, 0x09341B14, 0x0D681E58, 0x1D280E18, 0x0D241E14, 0x19240A14, 0x0A7D1B6D, 0x1A3D0B2D, 0x063D172D, 0x16390729, 0x1A7C0B6C, 0x0A3C1B2C, 0x163C072C, 0x06381728, 0x097C1B5C, 0x193C0B1C, 0x053C171C, 0x15380718, 0x196C0A5C, 0x092C1A1C, 0x152C061C, 0x05281618)
        # Convert parameters into numpy array
        inpd = np.array(inp, dtype=np.float32)
        hpatd = np.array(hpat, dtype=np.uint32)
        output = np.empty((n*16), dtype= np.float32)
        # Get kernel function
        matinv4x4 = kernel.get_function("inv4x4")
        # Define block, grid and compute
        blockDim = (256,1,1) # do not change
        gridDim = ((n//16)+1,1,1) # integer division so this also works under python 3
        # Kernel function
        matinv4x4 (
            cuda.In(inpd), cuda.Out(output), np.uint64(n), cuda.In(hpatd),
            block=blockDim, grid=gridDim)
        return output
    #example/test case
    inp = (1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0, 3.0, 1.0, 0.0, 1.0, 0.0, 2.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0)
    n = 2
    result = gpuinv4x4(inp, n)
    print(result)
    $ python t10.py
    [-3.   -0.5   1.5   1.    1.    0.25 -0.25 -0.5   3.    0.25 -1.25 -0.5  -3.
     -0.    1.    1.    1.    0.    0.    0.    0.    1.    0.    0.    0.    0.
      1.    0.    0.    0.    0.    1.  ]
    $
    

    I suspect that if inverting these matrices were the only thing you needed to do in CUDA, it would not be an interesting or attractive use case. I expect that the cost of transferring the data to the device and returning the results would outweigh any speed-up benefit of using the GPU versus ordinary numpy. However, I have not tested or benchmarked a numpy case; a rough baseline is sketched below.
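
    A minimal sketch of how such a numpy baseline could be measured (the test data is made up and the timing has not been verified):

    import time
    import numpy as np

    n = 4096
    # random, well-conditioned 4x4 test matrices
    mats = np.random.rand(n, 4, 4).astype(np.float32) + 4.0 * np.eye(4, dtype=np.float32)
    t0 = time.time()
    invs = np.linalg.inv(mats)          # numpy inverts the whole batch on the CPU
    t1 = time.time()
    print("numpy batched 4x4 inversion: %.1f microseconds" % ((t1 - t0) * 1e6))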

    Note the use of __syncwarp(): it ensures that the shared-memory values written by each thread are visible to the other threads in the same warp before they are read, and it requires CUDA 9.0 or newer.

    Also note that the code expects an even number of matrices to invert. If you do not have an even number, pad the input with arbitrary values up to the next even number of matrices, for example with the small helper sketched below.
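
    A small illustrative helper (the name pad_to_even is hypothetical) that pads the flattened input with one identity matrix when the count is odd; the extra 16 output values can simply be discarded afterwards:

    import numpy as np

    def pad_to_even(flat, n):
        """Pad a flattened batch of 4x4 matrices with one identity matrix if n is odd."""
        if n % 2 == 0:
            return np.asarray(flat, dtype=np.float32), n
        ident = np.eye(4, dtype=np.float32).ravel()
        return np.concatenate([np.asarray(flat, dtype=np.float32), ident]), n + 1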

    Also note that the code simply assumes the matrices are invertible. There is no test to check whether they are; if, for example, the computed determinant is zero, the matrix is not invertible (by this method) and the result will typically be NaN due to division by zero. A simple pre-check is sketched below.
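
    An illustrative host-side pre-check (the helper name check_invertible and the tolerance are assumptions) that flags near-singular matrices before calling gpuinv4x4:

    import numpy as np

    def check_invertible(inp, n, tol=1e-6):
        """Raise if any of the n 4x4 matrices in the flattened input is (near) singular."""
        mats = np.asarray(inp, dtype=np.float32).reshape(n, 4, 4)
        dets = np.linalg.det(mats)
        bad = np.flatnonzero(np.abs(dets) < tol)
        if bad.size:
            raise ValueError("matrices at indices %s are (near) singular" % bad)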

    It is not clear what the purpose here is, so this example should not be construed as suggesting that general matrix inversion is a good idea, or that it is the appropriate solution for a particular problem.

    Perhaps a more pythonic approach to inversion of dense matrices on the GPU would be to use cupy; a brief sketch follows.
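
    A minimal sketch of that approach, assuming a recent cupy version whose cupy.linalg.inv accepts a stacked batch of matrices (older versions may require looping over the matrices):

    import cupy as cp

    n = 4096
    # random, well-conditioned 4x4 test matrices generated directly on the GPU
    mats = cp.random.rand(n, 4, 4, dtype=cp.float32) + 4.0 * cp.eye(4, dtype=cp.float32)
    invs = cp.linalg.inv(mats)      # batched inversion on the GPU
    result = cp.asnumpy(invs)       # copy the result back to the host if needed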