代码之家  ›  专栏  ›  技术社区  ›  JuMoGar

如何在没有竞争条件或错误共享的情况下使用OpenMP并行化此函数?

  •  0
  • JuMoGar  · 技术社区  · 6 年前

    __inline static
    void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float * dataSetMatrix, int * clusterAssignmentCurrent, float *clustersCentroID) {
        int * clusterMemberCount = (int *) calloc (numClusters,sizeof(float));
    
        // sum all points
        // for every point
        for (int i = 0; i < numObjs; ++i) {
            // which cluster is it in?
            int activeCluster = clusterAssignmentCurrent[i];
    
            // update count of members in that cluster
            ++clusterMemberCount[activeCluster];
    
            // sum point coordinates for finding centroid
            for (int j = 0; j < numCoords; ++j)
                clustersCentroID[activeCluster*numCoords + j] += dataSetMatrix[i*numCoords + j];
        }
    
    
        // now divide each coordinate sum by number of members to find mean/centroid
        // for each cluster
        for (int i = 0; i < numClusters; ++i) {
            if (clusterMemberCount[i] != 0)
                // for each coordinate
                for (int j = 0; j < numCoords; ++j)
                    clustersCentroID[i*numCoords + j] /= clusterMemberCount[i];  /// XXXX will divide by zero here for any empty clusters!
        }
    

    你知道我怎样才能做到吗?

    谢谢您。

    3 回复  |  直到 6 年前
        1
  •  1
  •   Zulan    6 年前

    这很直接

    // sum all points
    // for every point
    for (int i = 0; i < numObjs; ++i) {
        // which cluster is it in?
        int activeCluster = clusterAssignmentCurrent[i];
    
        // update count of members in that cluster
        ++clusterMemberCount[activeCluster];
    
        // sum point coordinates for finding centroid
        #pragma omp parallel for
        for (int j = 0; j < numCoords; ++j)
            clustersCentroID[activeCluster*numCoords + j] += dataSetMatrix[i*numCoords + j];
    }
    

    clustersCentroID . 您可以安全地假设默认计划不会显示明显的错误共享,它通常有足够大的块。只是 不要 试试像这样的 schedule(static,1) .

    外部循环不容易并行化。你可以减少 clusterMemberCount 群集成员计数

    #pragma omp parallel // note NO for
    for (int i = 0; i < numObjs; ++i) {
        int activeCluster = clusterAssignmentCurrent[i];
        // ensure that exactly one thread works on each cluster
        if (activeCluster % omp_num_threads() != omp_get_thread_num()) continue;
    

    只有在简单的解决方案不能产生足够的性能时才执行此操作。

    另一个循环也很简单

    #pragma omp parallel for
    for (int i = 0; i < numClusters; ++i) {
        if (clusterMemberCount[i] != 0)
            // for each coordinate
            for (int j = 0; j < numCoords; ++j)
                clustersCentroID[i*numCoords + j] /= clusterMemberCount[i];
    }
    

        2
  •  1
  •   Brice    6 年前

    对于 numCoords , numObjs numClusters 因为并行化的最佳方式取决于此。尤其是, 努姆科德 很重要的一点是,看看在坐标上对内环进行并行化/矢量化是否有意义;比如,您采用的是三维坐标还是1000维?

    另一次尝试的缺点是 if 第一个循环中的语句(不利于性能)、静态调度(可能的负载不平衡),但每个线程的相邻部分递增 clusterMemberCount clustersCentroID

    #ifdef _OPENMP
       #include <omp.h>
    #else
       #define omp_get_num_threads() 1
       #define omp_get_thread_num() 0
    #endif
    
    
    __inline static
    void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float * dataSetMatrix, int * clusterAssignmentCurrent, float *clustersCentroID) {
        int * clusterMemberCount = (int *) calloc (numClusters,sizeof(float));
        // sum all points
        // for every point
    
        #pragma omp parallel
        {
            int nbOfThreads = omp_get_num_threads();
            int thisThread = omp_get_thread_num();
            // Schedule for the first step : process only cluster with ID in the [from , to[ range
            int clustFrom = (thisThread*numClusters)/nbOfThreads;
            int clustTo   = (thisThread+1 == nbOfThreads) ? numClusters : ((thisThread+1)*numClusters)/nbOfThreads;
    
            // Each thread will loop through all values of numObjs but only process them depending on activeCluster
            // The loop is skipped only if the thread was assigned no cluster
            if (clustTo>clustFrom){
                for (int i = 0; i < numObjs; ++i) {
                    // which cluster is it in?
                    int activeCluster = clusterAssignmentCurrent[i];
    
                    if (activeCluster>=clustFrom && activeCluster<clustTo){
                        // update count of members in that cluster
                        ++clusterMemberCount[activeCluster];
    
                        // sum point coordinates for finding centroid
                        for (int j = 0; j < numCoords; ++j)
                            clustersCentroID[activeCluster*numCoords + j] += dataSetMatrix[i*numCoords + j];
                    }
                }
            }
    
            #pragma omp barrier
    
            // now divide each coordinate sum by number of members to find mean/centroid
            // for each cluster
            #pragma omp for // straightforward
            for (int i = 0; i < numClusters; ++i) {
                if (clusterMemberCount[i] != 0)
                    // for each coordinate
                    for (int j = 0; j < numCoords; ++j)
                        clustersCentroID[i*numCoords + j] /= clusterMemberCount[i];  /// XXXX will divide by zero here for any empty clusters!
            }
        }
        free(clusterMemberCount);
    }
    
        3
  •  -1
  •   Aki Suihkonen    6 年前

    添加到我的评论: ++clusterMemberCount[activeCluster] 形成一个直方图,当两个线程试图更新同一项(或共享缓存线的相邻项)时会出现问题。这需要作为一个连续的部分从循环中取出,或者必须通过为每个线程提供一个单独的直方图副本来并行化,然后合并。

    您可以很容易地将此部分与第一个并行循环分离。

    // Make the histogram
    for (int i = 0; i < numObjs; ++i) {
        int activeCluster = clusterAssignmentCurrent[i];
        ++clusterMemberCount[activeCluster];
    }
    

    // parallel processing
    #pragma omp parallel for
    for (int i = 0; i < numObjs; ++i) {
        int activeCluster = clusterAssignmentCurrent[i];
        for (int j = 0; j < numCoords; ++j)
            clustersCentroID[activeCluster*numCoords + j] += dataSetMatrix[i*numCoords + j];
    }
    

    第二次潜在的错误分享是 numCoords * sizeof(clustersCentroID[0]) % 64 != 0 假设是64字节缓存线。这可以通过将clustersCentroID过度分配到64字节的整数倍来缓解。

    // Loop for numCoords, but index by numCoordsX
    for (int j = 0; j < numCoords; ++j)
        clustersCentroID[activeCluster*numCoordsX + j] += dataSetMatrix[i*numCoords + j];