Following mdurant, here is a benchmark:
import numpy as np
import dask
from dask.distributed import Client, Lock
import time


@dask.delayed
def locked_load(fn):
    # Serialize reads across workers with a distributed lock.
    lock = Lock('numpy-read')
    lock.acquire()
    out = np.load(fn)
    lock.release()
    return out


@dask.delayed
def unlocked_load(fn):
    # Read with no coordination between workers.
    return np.load(fn)


def work(arr_size, n_parts, use_lock=True):
    if use_lock:
        f = locked_load
    else:
        f = unlocked_load
    # Write n_parts identical arrays to disk, then load them back in parallel.
    x = np.arange(arr_size, dtype=np.int64)  # np.int was removed in recent NumPy
    for i in range(n_parts):
        np.save('%d.npy' % i, x)
    d = [f('%d.npy' % i) for i in range(n_parts)]
    return dask.compute(*d)


def main():
    client = Client()
    with open("lock_time.txt", "a") as fh:
        n_parts_list = [20, 100]
        arr_size_list = [1_000_000, 5_000_000, 10_000_000]
        for n_part in n_parts_list:
            for arr_size in arr_size_list:
                for use_lock in [True, False]:
                    st = time.time()
                    work(arr_size, n_part, use_lock)
                    en = time.time()
                    fh.write("%d %d %s %s\n" % (
                        n_part, arr_size, use_lock, str(en - st))
                    )
                    fh.flush()
    client.close()


if __name__ == '__main__':
    main()
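One possible refinement, not part of the benchmark above: distributed.Lock can be used as a context manager, which guarantees the lock is released even if np.load raises. A minimal sketch of locked_load rewritten that way:

import numpy as np
import dask
from dask.distributed import Lock


@dask.delayed
def locked_load(fn):
    # The context manager releases the lock even if np.load fails.
    with Lock('numpy-read'):
        return np.load(fn)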
Results (the machine has 16 GB of RAM):
+--------+----------+----------+----------+
| n_part | arr_size | use_lock | time (s) |
+--------+----------+----------+----------+
| 20     | 1000000  | True     | 0.97     |
| 20     | 1000000  | False    | 0.89     |
| 20     | 5000000  | True     | 7.52     |
| 20     | 5000000  | False    | 6.80     |
| 20     | 10000000 | True     | 16.70    |
| 20     | 10000000 | False    | 15.78    |
| 100    | 1000000  | True     | 3.76     |
| 100    | 1000000  | False    | 6.88     |
| 100    | 5000000  | True     | 43.22    |
| 100    | 5000000  | False    | 38.96    |
| 100    | 10000000 | True     | 291.34   |
| 100    | 10000000 | False    | 389.34   |
+--------+----------+----------+----------+
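A table like the one above can be assembled from lock_time.txt, whose lines hold n_part, arr_size, use_lock and the elapsed time. A post-processing sketch, assuming pandas is available (this step is not part of the original benchmark):

import pandas as pd

# Read the space-separated log written by main().
df = pd.read_csv(
    "lock_time.txt",
    sep=" ",
    names=["n_part", "arr_size", "use_lock", "time"],
)
# Compare locked vs unlocked timings side by side.
print(df.pivot_table(index=["n_part", "arr_size"], columns="use_lock", values="time"))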