代码之家  ›  专栏  ›  技术社区  ›  ezekiel

C与Python/numpy的数学表现较差

  •  0
  • ezekiel  · 技术社区  · 7 年前


    我发现python/numpy的性能比我的C代码高出了10000x以上,这显然是不对的,那么我的C代码有什么问题导致它的性能如此差呢?(即使用-O3或-Ofast编译)

    python:

    import time
    import numpy as np
    
    t0 = time.time()
    m1 = np.random.rand(2000, 2000)
    m2 = np.random.rand(2000, 2000)
    t1 = time.time()
    m3 = m1 @ m2
    t2 = time.time()
    print('creation time: ', t1 - t0, ' \n multiplication time: ', t2 - t1)
    

    C:

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>
    
    int main(void) {
    
        clock_t t0=clock(), t1, t2;
    
        // create matrices and allocate memory  
        int m_size = 2000;
        int i, j, k;
        double running_sum;
        double *m1[m_size], *m2[m_size], *m3[m_size];
        double f_rand_max = (double)RAND_MAX;
        for(i = 0; i < m_size; i++) {
            m1[i] = (double *)malloc(sizeof(double)*m_size);
            m2[i] = (double *)malloc(sizeof(double)*m_size);
            m3[i] = (double *)malloc(sizeof(double)*m_size);
        }
        // populate with random numbers 0 - 1
        for (i=0; i < m_size; i++)
            for (j=0; j < m_size; j++) {
                m1[i][j] = (double)rand() / f_rand_max;
                m2[i][j] = (double)rand() / f_rand_max;
            }
        t1 = clock();
    
        // multiply together
        for (i=0; i < m_size; i++)
            for (j=0; j < m_size; j++) {
                running_sum = 0;
                for (k = 0; k < m_size; k++)
                    running_sum += m1[i][k] * m2[k][j];
                m3[i][j] = running_sum;
            }
    
        t2 = clock();
    
        float t01 = ((float)(t1 - t0) / CLOCKS_PER_SEC );
        float t12 = ((float)(t2 - t1) / CLOCKS_PER_SEC );
        printf("creation time: %f", t01 );
        printf("\nmultiplication time: %f", t12 );
        return 0;
    }
    

    编辑:已经修改了python,使其做了一个适当的点积,以微秒的分辨率缩小了差距,并使用了可比较的双精度数据类型,而不是最初发布的浮点数据类型。

    输出:

    $ gcc -O3 -march=native bench.c
    $ ./a.out
    creation time: 0.092651
    multiplication time: 139.945068
    $ python3 bench.py
    creation time: 0.1473407745361328
    multiplication time: 0.329038143157959
    

    编辑:修改C代码以转置第二个矩阵,以实现更高效的访问模式,差距更加缩小

    修改后的乘法代码:

    // transpose m2 in order to capitalise on cache efficiencies
    // store transposed matrix in m3 for now
    for (i=0; i < m_size; i++)
        for (j=0; j < m_size; j++)
            m3[j][i] = m2[i][j];
    // swap the pointers
    void *mtemp = *m3;
    *m3 = *m2;
    *m2 = mtemp;
    
    
    // multiply together
    for (i=0; i < m_size; i++)
        for (j=0; j < m_size; j++) {
            running_sum = 0;
            for (k = 0; k < m_size; k++)
                running_sum += m1[i][k] * m2[j][k];
            m3[i][j] = running_sum;
        }
    

    $ gcc -O3 -march=native bench2.c
    $ ./a.out
    creation time: 0.107767
    multiplication time: 10.843431
    $ python3 bench.py
    creation time: 0.1488208770751953
    multiplication time: 0.3335080146789551
    

    编辑:使用-0fast编译,我确信这是一个公平的比较,将差异降低到刚刚超过一个数量级(对numpy有利)。

    $ gcc -Ofast -march=native bench2.c
    $ ./a.out
    creation time: 0.098201
    multiplication time: 4.766985
    $ python3 bench.py
    creation time:  0.13812589645385742
    multiplication time:  0.3441300392150879
    

    for m_size = 10000
    
    $ gcc -Ofast -march=native bench3.c # indexed by arr[ i * m_size + j ]
    $ ./a.out
    creation time: 1.280863
    multiplication time: 626.327820
    $ gcc -Ofast -march=native bench2.c # indexed by art[I][j]
    $ ./a.out
    creation time: 2.410230
    multiplication time: 708.979980
    $ python3 bench.py
    creation time:  3.8284950256347656
    multiplication time:  39.06089973449707
    

    最新代码bench3。c:

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>
    
    int main(void) {
    
        clock_t t0, t1, t2;
    
        t0 = clock();
        // create matrices and allocate memory
        int m_size = 10000;
        int i, j, k, x, y;
        double running_sum;
        double *m1 = (double *)malloc(sizeof(double)*m_size*m_size),
                    *m2 = (double *)malloc(sizeof(double)*m_size*m_size),
                    *m3 = (double *)malloc(sizeof(double)*m_size*m_size);
        double f_rand_max = (double)RAND_MAX;
    
        // populate with random numbers 0 - 1
        for (i=0; i < m_size; i++) {
            x = i * m_size;
            for (j=0; j < m_size; j++)
                m1[x + j] = ((double)rand()) / f_rand_max;
              m2[x + j] = ((double)rand()) / f_rand_max;
                m3[x + j] = ((double)rand()) / f_rand_max;
        }
        t1 = clock();
    
        // transpose m2 in order to capitalise on cache efficiencies
        // store transposed matrix in m3 for now
        for (i=0; i < m_size; i++)
            for (j=0; j < m_size; j++)
                m3[j*m_size + i] = m2[i * m_size + j];
        // swap the pointers
        double *mtemp = m3;
        m3 = m2;
        m2 = mtemp;
    
    
        // multiply together
        for (i=0; i < m_size; i++) {
            x = i * m_size;
            for (j=0; j < m_size; j++) {
                running_sum = 0;
                y = j * m_size;
                for (k = 0; k < m_size; k++)
                    running_sum += m1[x + k] * m2[y + k];
                m3[x + j] = running_sum;
            }
        }
    
        t2 = clock();
    
        float t01 = ((float)(t1 - t0) / CLOCKS_PER_SEC );
        float t12 = ((float)(t2 - t1) / CLOCKS_PER_SEC );
        printf("creation time: %f", t01 );
        printf("\nmultiplication time: %f", t12 );
        return 0;
    }
    
    1 回复  |  直到 7 年前
        1
  •  1
  •   ezekiel    7 年前

    结论:X10000差异的原始荒谬因素主要是由于错误地将Python/numpy中的元素乘法与C代码进行比较,并且没有使用所有可用的优化进行编译,并且使用效率极低的内存访问模式编写,该模式可能没有利用缓存。 “公平”比较(即正确但效率极低的单线程算法,使用Ofast编译)得出的性能因子差为x350