我发现python/numpy的性能比我的C代码高出了10000x以上,这显然是不对的,那么我的C代码有什么问题导致它的性能如此差呢?(即使用-O3或-Ofast编译)
python:
import time
import numpy as np
t0 = time.time()
m1 = np.random.rand(2000, 2000)
m2 = np.random.rand(2000, 2000)
t1 = time.time()
m3 = m1 @ m2
t2 = time.time()
print('creation time: ', t1 - t0, ' \n multiplication time: ', t2 - t1)
C:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
int main(void) {
clock_t t0=clock(), t1, t2;
// create matrices and allocate memory
int m_size = 2000;
int i, j, k;
double running_sum;
double *m1[m_size], *m2[m_size], *m3[m_size];
double f_rand_max = (double)RAND_MAX;
for(i = 0; i < m_size; i++) {
m1[i] = (double *)malloc(sizeof(double)*m_size);
m2[i] = (double *)malloc(sizeof(double)*m_size);
m3[i] = (double *)malloc(sizeof(double)*m_size);
}
// populate with random numbers 0 - 1
for (i=0; i < m_size; i++)
for (j=0; j < m_size; j++) {
m1[i][j] = (double)rand() / f_rand_max;
m2[i][j] = (double)rand() / f_rand_max;
}
t1 = clock();
// multiply together
for (i=0; i < m_size; i++)
for (j=0; j < m_size; j++) {
running_sum = 0;
for (k = 0; k < m_size; k++)
running_sum += m1[i][k] * m2[k][j];
m3[i][j] = running_sum;
}
t2 = clock();
float t01 = ((float)(t1 - t0) / CLOCKS_PER_SEC );
float t12 = ((float)(t2 - t1) / CLOCKS_PER_SEC );
printf("creation time: %f", t01 );
printf("\nmultiplication time: %f", t12 );
return 0;
}
编辑:已经修改了python,使其做了一个适当的点积,以微秒的分辨率缩小了差距,并使用了可比较的双精度数据类型,而不是最初发布的浮点数据类型。
输出:
$ gcc -O3 -march=native bench.c
$ ./a.out
creation time: 0.092651
multiplication time: 139.945068
$ python3 bench.py
creation time: 0.1473407745361328
multiplication time: 0.329038143157959
编辑:修改C代码以转置第二个矩阵,以实现更高效的访问模式,差距更加缩小
修改后的乘法代码:
// transpose m2 in order to capitalise on cache efficiencies
// store transposed matrix in m3 for now
for (i=0; i < m_size; i++)
for (j=0; j < m_size; j++)
m3[j][i] = m2[i][j];
// swap the pointers
void *mtemp = *m3;
*m3 = *m2;
*m2 = mtemp;
// multiply together
for (i=0; i < m_size; i++)
for (j=0; j < m_size; j++) {
running_sum = 0;
for (k = 0; k < m_size; k++)
running_sum += m1[i][k] * m2[j][k];
m3[i][j] = running_sum;
}
$ gcc -O3 -march=native bench2.c
$ ./a.out
creation time: 0.107767
multiplication time: 10.843431
$ python3 bench.py
creation time: 0.1488208770751953
multiplication time: 0.3335080146789551
编辑:使用-0fast编译,我确信这是一个公平的比较,将差异降低到刚刚超过一个数量级(对numpy有利)。
$ gcc -Ofast -march=native bench2.c
$ ./a.out
creation time: 0.098201
multiplication time: 4.766985
$ python3 bench.py
creation time: 0.13812589645385742
multiplication time: 0.3441300392150879
for m_size = 10000
$ gcc -Ofast -march=native bench3.c # indexed by arr[ i * m_size + j ]
$ ./a.out
creation time: 1.280863
multiplication time: 626.327820
$ gcc -Ofast -march=native bench2.c # indexed by art[I][j]
$ ./a.out
creation time: 2.410230
multiplication time: 708.979980
$ python3 bench.py
creation time: 3.8284950256347656
multiplication time: 39.06089973449707
最新代码bench3。c:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
int main(void) {
clock_t t0, t1, t2;
t0 = clock();
// create matrices and allocate memory
int m_size = 10000;
int i, j, k, x, y;
double running_sum;
double *m1 = (double *)malloc(sizeof(double)*m_size*m_size),
*m2 = (double *)malloc(sizeof(double)*m_size*m_size),
*m3 = (double *)malloc(sizeof(double)*m_size*m_size);
double f_rand_max = (double)RAND_MAX;
// populate with random numbers 0 - 1
for (i=0; i < m_size; i++) {
x = i * m_size;
for (j=0; j < m_size; j++)
m1[x + j] = ((double)rand()) / f_rand_max;
m2[x + j] = ((double)rand()) / f_rand_max;
m3[x + j] = ((double)rand()) / f_rand_max;
}
t1 = clock();
// transpose m2 in order to capitalise on cache efficiencies
// store transposed matrix in m3 for now
for (i=0; i < m_size; i++)
for (j=0; j < m_size; j++)
m3[j*m_size + i] = m2[i * m_size + j];
// swap the pointers
double *mtemp = m3;
m3 = m2;
m2 = mtemp;
// multiply together
for (i=0; i < m_size; i++) {
x = i * m_size;
for (j=0; j < m_size; j++) {
running_sum = 0;
y = j * m_size;
for (k = 0; k < m_size; k++)
running_sum += m1[x + k] * m2[y + k];
m3[x + j] = running_sum;
}
}
t2 = clock();
float t01 = ((float)(t1 - t0) / CLOCKS_PER_SEC );
float t12 = ((float)(t2 - t1) / CLOCKS_PER_SEC );
printf("creation time: %f", t01 );
printf("\nmultiplication time: %f", t12 );
return 0;
}