代码之家  ›  专栏  ›  技术社区  ›  A. Coorhp

mexcuda在.cu文件中的delete[]()处具有断点

  •  -1
  • A. Coorhp  · 技术社区  · 8 年前

    我在查找内存分配错误时遇到了一些困难。我目前在GeForce GT 630上使用Visual Studio 2013、Matlab 2015b和CUDA 7.0,我是GPU编程、CUDA和mex的新手。

    当我用mexcuda从Matlab调用我的代码时,直到我将带有colIndexStepSize的小部分添加到.cu文件中,一切都正常。程序正常运行,直到删除。在通知我已经到达这里的断点后,Matlab崩溃了。

    当我删除有问题的代码行时,一切都会再次顺利运行。

    我很确定我的内存处理有问题,但我simpy找不到bug。下面是产生问题的代码:

    #include <cuda_runtime.h>
    #include <cuda.h>
    #include <cusparse.h>
    #include <device_launch_parameters.h>
    #include <curand.h>
    
    #include <vector>
    
    // Test-Makro : (Funktionieren die Zugriffe auf die GPU?)
    #define gpuErrchk(ans){gpuAssert((ans), __FILE__, __LINE__);}
    
    inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true){
        if (code != cudaSuccess){
            fprintf(stderr, "GPUassert: %s%s%d\n", cudaGetErrorString(code), file, line);
        }
    }
    
    
    __global__ void startEndIndex(int *ergArray, int *first, int *last, float *dxmax, unsigned int *length){
    
        unsigned int index = threadIdx.x + blockIdx.x*blockDim.x;
    
        if (index < *length){
    
            first[index] = (*dxmax)*ergArray[index];
            last[index] = (*dxmax)*ergArray[index + 1] - 1;
        }
    }
    
    
    
    void rotateOSSARTrechnung(std::vector<float> *detektor, SparseMatrix<float, float, float> *systemMatrix_coo, Volumen<float, float, float> *volumen, unsigned int iterationen, std::vector<float> *deltaBIterationN, std::vector<float> *matdVoxelGrid, float projektionen,float dxmax, float detZellen, unsigned int threads_max_n, unsigned int threads_max_m, unsigned int threads_max_nnz){
    
    unsigned int nnz = (unsigned int)systemMatrix_coo->nnz;
    unsigned int n = (unsigned int)systemMatrix_coo->columnNumber; 
    unsigned int mNeu = detZellen; 
    
    float *measuredValues = 0; measuredValues = new float[mNeu](); 
    
    float *volumeN = 0; volumeN =new float[n]();
    float *volumeAlt = 0; volumeAlt = new float[n](); 
    
    float *initValuesM = 0; initValuesM = new float[mNeu]();
    
    float *volumeNInitZero = 0; volumeNInitZero = new float[n]();
    float *initValuesMInitZero = 0; initValuesMInitZero = new float[mNeu]();
    int *cooRowHostPtr=0; cooRowHostPtr = new int[nnz]();
    int *cooColHostPtr=0; cooColHostPtr = new int[nnz]();
    float *cooValuesHostPtr = 0; cooValuesHostPtr = new float[nnz]();
    
    unsigned int *colIndex = 0; colIndex = new unsigned int[nnz]();
    float *valIndex = 0; valIndex = new float[nnz]();
    unsigned int *colIndexStepSize = 0; colIndexStepSize = new unsigned int[n]();
    
    for (unsigned int i = 0; i < n; i++){
        colIndexStepSize[i] = nnz;
    }
    
    unsigned int length = matdVoxelGrid->size();
    int *ergArray = 0; ergArray = new int[length+1]();
    
    int *first = 0; first = new int[length]();
    int *last = 0; last = new int[length]();
    
    int *cooHostColRot = 0; cooHostColRot = new int[nnz]();
    
    
    int *d_cooColPtr;
    int *d_cooRowPtr;
    unsigned int *d_nnz;
    
    int *d_colIndexPtr;
    float *d_valIndexPtr;
    unsigned int *d_colIndexStepSizePtr;
    
    float *d_cooValuesPtr;
    float *d_measuredValues;
    
    float *d_volume_alt;
    float *d_volume_neu;
    
    int *d_ergArray; 
    
    float *d_dxmax;
    unsigned int *d_length;
    unsigned int *d_size;
    
    int *d_first;
    int *d_last;
    
    int *d_cooColRotPtr; 
    
    unsigned int *d_count;
    
    gpuErrchk(cudaMalloc((void**)&d_cooRowPtr, nnz*sizeof(int)));;
    gpuErrchk(cudaMalloc((void**)&d_cooColPtr, nnz*sizeof(int)));
    gpuErrchk(cudaMalloc((void**)&d_cooValuesPtr, nnz*sizeof(float)));
    
    gpuErrchk(cudaMalloc((void**)&d_measuredValues, mNeu*sizeof(float)));
    gpuErrchk(cudaMalloc((void**)&d_volume_alt, n*sizeof(float)));
    gpuErrchk(cudaMalloc((void**)&d_volume_neu, n*sizeof(float)));
    
    gpuErrchk(cudaMalloc((void**)&d_nnz, sizeof(unsigned int)));
    
    gpuErrchk(cudaMalloc((void**)&d_colIndexPtr, (nnz)*sizeof(int)));
    gpuErrchk(cudaMalloc((void**)&d_valIndexPtr, (nnz)*sizeof(float)));
    gpuErrchk(cudaMalloc((void**)&d_colIndexStepSizePtr, n*sizeof(unsigned int)));
    
    gpuErrchk(cudaMalloc((void**)&d_ergArray, (length+1)*sizeof(int)));
    
    gpuErrchk(cudaMalloc((void**)&d_dxmax, sizeof(float)));
    gpuErrchk(cudaMalloc((void**)&d_length, sizeof(unsigned int)));
    gpuErrchk(cudaMalloc((void**)&d_size, sizeof(unsigned int)));
    
    gpuErrchk(cudaMalloc((void**)&d_first, length*sizeof(int)));
    gpuErrchk(cudaMalloc((void**)&d_last, length*sizeof(int)));
    
    gpuErrchk(cudaMalloc((void**)&d_cooColRotPtr, nnz*sizeof(int)));
    
    gpuErrchk(cudaMalloc((void**)&d_count, sizeof(unsigned int)));
    
    for (unsigned int i = 0; i < nnz; i++){
        cooRowHostPtr[i] = systemMatrix_coo->cooRowInd->at(i);
        cooColHostPtr[i] = systemMatrix_coo->cooColInd->at(i);
        cooValuesHostPtr[i] = systemMatrix_coo->cooValues->at(i);
    }
    
    for (unsigned int j = 0; j < n; j++){
        volumen->setValueAtElement(j, (float)cooColHostPtr[j]);
    }
    
    gpuErrchk(cudaMemcpy(d_nnz, &nnz, sizeof(unsigned int), cudaMemcpyHostToDevice));
    
    gpuErrchk(cudaMemcpy(d_dxmax, &dxmax, sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_length, &length, sizeof(unsigned int), cudaMemcpyHostToDevice));
    
    // (Initialwerte sind immer gleich)
    gpuErrchk(cudaMemcpy(d_cooRowPtr, cooRowHostPtr, nnz*sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_cooValuesPtr, cooValuesHostPtr, nnz*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_cooColPtr, cooColHostPtr, nnz*sizeof(int), cudaMemcpyHostToDevice));
    
    gpuErrchk(cudaMemcpy(d_valIndexPtr, cooValuesHostPtr, nnz*sizeof(float), cudaMemcpyHostToDevice));
    
    unsigned int threads_nnz = threads_max_nnz;
    
    unsigned int thread_length = length;
    unsigned int block_length = 1;
    
    unsigned int index = 0;
    
    for (unsigned int s = 0; s < length; s++){
        for (unsigned int t = 0; t <= s; t++){
            index = s + 1;
            ergArray[index] += (int)matdVoxelGrid->at(t);
        }
    }
    
    gpuErrchk(cudaMemcpy(d_ergArray, ergArray, (length+1)*sizeof(int), cudaMemcpyHostToDevice));
    
    startEndIndex <<< block_length, thread_length >>>(d_ergArray, d_first, d_last, d_dxmax, d_length);
    
    gpuErrchk(cudaMemcpy(first, d_first, length*sizeof(int), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(last, d_last, length*sizeof(int), cudaMemcpyDeviceToHost));
    
    for (unsigned int j = 0; j < length; j++){
        volumen->setValueAtElement(j, (float)first[j]);
    }
    
    
    for (unsigned int j = 0; j < length; j++){
        volumen->setValueAtElement(j, (float)last[j]);
    }
    
    
    unsigned int size = 0;
    
    for (unsigned int iter = 0; iter < iterationen; iter++){
        for (unsigned int proj = 1; proj <= projektionen; proj++){
    
            unsigned int begin1 = (proj - 1)*mNeu;
            unsigned int end1 = proj*mNeu;
    
            for (unsigned int j = begin1; j < end1; j++){
                measuredValues[j] = detektor->at(j);
            }
    
            gpuErrchk(cudaMemcpy(d_measuredValues, measuredValues, mNeu*sizeof(float), cudaMemcpyHostToDevice));
    
            for (unsigned int u = 0; u < length; u++){
                size = ceil(matdVoxelGrid->at(u)* (proj - 1) * dxmax / projektionen);
                gpuErrchk(cudaMemcpy(d_size, &size, sizeof(unsigned int), cudaMemcpyHostToDevice));
    
                gpuErrchk(cudaMemcpy(d_count, &u, sizeof(unsigned int), cudaMemcpyHostToDevice));
    
                if (proj > 1){
    
                    for (unsigned int i = 0; i < nnz; i++) {//(first[u] <= cooCols[index] <= last[u]){
    
                        if (first[u] <= cooColHostPtr[i] && cooColHostPtr[i] <= last[u]){
                            cooHostColRot[i] = first[u] + (int)(cooColHostPtr[i] + size) % (last[u] - first[u] + 1);// (int)(cooColHostPtr[i] + size) % (last[u]); // (int)(first[u] + ((int)(cooColHostPtr[i] + dxmax) % (last[u] - first[u] + 1)));
                        }
                    }
                }
                else{
                    for (unsigned int i = 0; i < nnz; i++) {
                        cooHostColRot[i] = cooColHostPtr[i];
                    }
                }
            }
    
    
    // --------- troubling code starts HERE ----------------
            unsigned int wert = 0, index = 0;
            for (unsigned int i = 0; i < nnz; i++){
                index = cooHostColRot[i];
                wert = colIndexStepSize[index];
    
                if (wert >= i){
                    colIndexStepSize[index] = i;
                }
            }
    
            for (unsigned int j = 0; j < n; j++){
                volumen->setValueAtElement(j, colIndexStepSize[j]);
            }
    
                gpuErrchk(cudaMemcpy(d_colIndexStepSizePtr, colIndexStepSize, n*sizeof(unsigned int), cudaMemcpyHostToDevice));
    
        // --------- troubling code ends HERE ----------------
    
                 gpuErrchk(cudaMemcpy(d_colIndexPtr, cooHostColRot, nnz*sizeof(int), cudaMemcpyHostToDevice));
    
    
            }
        }
    
    
        cudaFree(d_cooRowPtr);
        cudaFree(d_cooColPtr);
        cudaFree(d_cooValuesPtr);
        cudaFree(d_measuredValues);
        cudaFree(d_volume_alt);
        cudaFree(d_volume_neu);
        cudaFree(d_colCount);
        cudaFree(d_rowCount);
        cudaFree(d_ergSumCol);
        cudaFree(d_ergSumRow);
        cudaFree(d_ergMult);
        cudaFree(d_nnz);
        cudaFree(d_faktor);
        cudaFree(d_colIndexPtr);
        cudaFree(d_valIndexPtr);
        cudaFree(d_ergSumNNZforCol);
        cudaFree(d_colIndexStepSizePtr);
        cudaFree(d_deltaB);
    
        cudaFree(d_ergArray);
        cudaFree(d_dxmax);
        cudaFree(d_length);
        cudaFree(d_size);
        cudaFree(d_first);
        cudaFree(d_last);
        cudaFree(d_cooColRotPtr);
        cudaFree(d_count);
    
        delete[](ergArray); ergArray = NULL;
        delete[](measuredValues); measuredValues = NULL;
        delete[](cooColHostPtr); cooColHostPtr = NULL;
        delete[](cooRowHostPtr); cooRowHostPtr = NULL;
        delete[](cooValuesHostPtr); cooValuesHostPtr = NULL;
        delete[](volumeN); volumeN = NULL;
        delete[](ergArray); ergArray = NULL;
        delete[](initValuesM); initValuesM = NULL;
        delete[](colIndex); colIndex = NULL;
        delete[](valIndex); valIndex = NULL;
        delete[](volumeAlt); volumeAlt = NULL;
        delete[](volumeNInitZero); volumeNInitZero = NULL;
        delete[](initValuesMInitZero); initValuesMInitZero = NULL;
        delete[](colIndexStepSize); colIndexStepSize = NULL;
        delete[](deltaBArray); deltaBArray = NULL;
    
        delete[](first); first = NULL;
        delete[](last); last = NULL;
    
        delete[](cooHostColRot); cooHostColRot = NULL;
    
        deltaB->~vector();
        deltaB = NULL;
    

    }

    如果有人看到我犯了什么错误,请告诉我,我愿意接受任何建议。

    提前谢谢! 顺致敬意,

    编辑: @AnderBiguri是对的,我对数组进行了越界访问 测量值 。以下是问题代码的更正部分:

            for (unsigned int j = 0; j < mNeu; j++){
                measuredValues[j] = detektor->at((proj-1)*mNeu+j);
            }
    

    测量值 只有mNeu元素长,但我确实在这一点后面访问了一些元素。

    所以,非常感谢您的帮助!

    1 回复  |  直到 8 年前
        1
  •  0
  •   A. Coorhp    8 年前

    @AnderBiguri是对的,我对数组measuredValues进行了越界访问。以下是问题代码的更正部分:

        for (unsigned int j = 0; j < mNeu; j++){
            measuredValues[j] = detektor->at((proj-1)*mNeu+j);
        }
    

    我只需要调整for循环和向量访问的边界,以适应数组的边界。

    再次感谢!