I'm dealing with a big dataset and basically want to do this:
# Ten million uniform random samples, shifted so that roughly half are negative.
test = np.random.rand(10_000_000) - 0.5
def test0(test):
return [0 if c<0 else c for c in test]
which is doing this:
def test1(test):
for i,dat in enumerate(test):
if dat<0:
test[i] = 0
else:
test[i] = dat
return test
Is there a way to modify test0 to skip the else branch so it works like this:
def test1(test):
for i,dat in enumerate(test):
if dat<0: test[i] = 0
return test
Thanks in advance!
CodePudding user response:
Just use one of these, which seem to be the fastest options:
(1) test.clip(0) # THANKS TO @u12-forward
OR
(2) test[test < 0] = 0
depending on how you test it.
When you execute each method 1000 times, approach number 2 is the fastest; when you measure a single function execution, option number 1 is the fastest.
test:
import numpy as np
import timeit
from copy import copy
from functools import partial
def create_data(size=10_000_000):
    """Return ``size`` uniform random floats shifted down by 0.5.

    Args:
        size: Number of samples to generate. Defaults to 10**7, the size
            used by the benchmark.

    Returns:
        A one-dimensional NumPy array of ``np.random.rand(size) - 0.5``.
    """
    return np.random.rand(size) - 0.5
def func1(data):
    """Set every negative element of ``data`` to 0 in place.

    Uses boolean-mask assignment; returns ``None``.
    """
    data[data < 0] = 0
def func2(data):
    """Set every negative element of ``data`` to 0 in place.

    Uses ``np.putmask`` with the mask ``data < 0``; returns ``None``.
    """
    np.putmask(data, data < 0, 0)
def func3(data):
    """Compute the element-wise maximum of ``data`` and 0.

    The result is a new array that is discarded (the call exists only to be
    timed); ``data`` is not modified and ``None`` is returned.
    """
    np.maximum(data, 0)
def func4(data):
    """Clip ``data`` to a lower bound of 0.

    The result is a new array that is discarded (the call exists only to be
    timed); ``data`` is not modified and ``None`` is returned.
    """
    data.clip(0)
def func5(data):
    """Select 0 where ``data`` is negative and ``data`` elsewhere.

    The result of ``np.where`` is a new array that is discarded (the call
    exists only to be timed); ``data`` is not modified and ``None`` is returned.
    """
    np.where(data < 0, 0, data)
if __name__ == '__main__':
    # Benchmark the five clamping variants, n_loops calls each.
    n_loops = 1000
    test = create_data()
    # NOTE: each Timer is bound to ONE copy of the data that is reused for all
    # n_loops calls. The in-place variants (func1, func2) therefore only find
    # negative values on their first call; later calls run on clamped data.
    t1 = timeit.Timer(partial(func1, copy(test)))
    t2 = timeit.Timer(partial(func2, copy(test)))
    t3 = timeit.Timer(partial(func3, copy(test)))
    t4 = timeit.Timer(partial(func4, copy(test)))
    # Was bound to func4, so the "where" line actually reported clip timings.
    t5 = timeit.Timer(partial(func5, copy(test)))
    print(f"func1 (x[x < 0]): timeit {t1.timeit(n_loops)} num test loops {n_loops}")
    print(f"func2 (putmask): timeit {t2.timeit(n_loops)} num test loops {n_loops}")
    print(f"func3 (maximum): timeit {t3.timeit(n_loops)} num test loops {n_loops}")
    print(f"func4 (clip): timeit {t4.timeit(n_loops)} num test loops {n_loops}")
    print(f"func5 (where): timeit {t5.timeit(n_loops)} num test loops {n_loops}")
test results:
func1 (x[x < 0]): timeit 7.2177265440000005 num test loops 1000
func2 (putmask): timeit 13.913492435999999 num test loops 1000
func3 (maximum): timeit 23.065230873999997 num test loops 1000
func4 (clip): timeit 22.768682354000006 num test loops 1000
func5 (where): timeit 23.844607757999995 num test loops 1000
EDIT:
A different approach to test data[data < 0] = 0 vs np.where(data < 0, 0, data):
import numpy as np
from time import perf_counter as clock
z = np.random.rand(10**7) - 0.5

# A fresh copy is taken on every iteration so each round starts from data that
# still contains negatives; the cost of the copy is included in both timings.

# np.where variant: builds a new array, which is discarded.
start = clock()
for _ in range(100):
    a = z.copy()
    np.where(a < 0, 0, a)
print(clock() - start)

# Boolean-mask variant: overwrites the negatives of the copy in place.
start = clock()
for _ in range(100):
    a = z.copy()
    a[a < 0] = 0
print(clock() - start)
test result:
7.9247566030000005
8.021165436000002
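test3 (note: `timeit func1` below times only the evaluation of the name `func1`, not a call to it, which is why every result is about 16 ns; use `timeit func1()` to benchmark the functions themselves):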
In [1]: import numpy as np
...: from copy import copy
...:
...:
...:
...: test = np.random.rand(int(1e7))-0.5
...:
...:
...: def func1():
...: data = copy(test)
...: data[data < 0] = 0
...:
...:
...: def func2():
...: data = copy(test)
...: np.putmask(data, data < 0, 0)
...:
...:
...: def func3():
...: data = copy(test)
...: np.maximum(data, 0)
...:
...:
...: def func4():
...: data = copy(test)
...: data.clip(0)
...:
...:
...: def func5():
...: data = copy(test)
...: np.where(data < 0, 0, data)
...:
In [2]: timeit func1
16.9 ns ± 0.117 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)
In [3]: timeit func2
15.8 ns ± 0.184 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)
In [4]: timeit func3
22.1 ns ± 0.287 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)
In [5]: timeit func4
15.6 ns ± 0.0594 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)
In [6]: timeit func5
16.2 ns ± 0.187 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)
CodePudding user response:
Use np.ndarray.clip, e.g. test.clip(min=0):
>>> test.clip(0)
array([0. , 0.11819274, 0.36379089, ..., 0. , 0.13401746,
0. ])
>>>
https://gist.github.com/axil/af6c4adb8c5634ff39ed9f3da1efaa90