Faster for loop with only if in python-CodePudding

I'm dealing with a big dataset and want to basically this:

test = np.random.rand(int(1e7))-0.5
def test0(test):
    return [0 if c<0 else c for c in test]

which is doing this:

def test1(test):
    for i,dat in enumerate(test):
        if dat<0: 
            test[i] = 0
        else:
            test[i] = dat
    return test

Is there a way to modify test0 to skip the else request so i works like this:

def test1(test):
    for i,dat in enumerate(test):
        if dat<0: test[i] = 0
    return test

Thanks in advance!

CodePudding user response：

just do which seems to be fastest option:

(1) test.clip(0) # THANKS TO @u12-forward

OR 

(2) test[test < 0] = 0

depending on how you test it.

when you execute each method 1000 times then approach number 2 is fastest. when you measure single function execution then option number 1 is fastest.

test:

import numpy as np
import timeit
from copy import copy
from functools import partial


def create_data():
    return np.random.rand(int(1e7))-0.5


def func1(data):
    data[data < 0] = 0


def func2(data):
    np.putmask(data, data < 0, 0)


def func3(data):
    np.maximum(data, 0)


def func4(data):
    data.clip(0)


def func5(data):
    np.where(data < 0, 0, data)


if __name__ == '__main__':
    n_loops = 1000
    test = create_data()

    t1 = timeit.Timer(partial(func1, copy(test)))
    t2 = timeit.Timer(partial(func2, copy(test)))
    t3 = timeit.Timer(partial(func3, copy(test)))
    t4 = timeit.Timer(partial(func4, copy(test)))
    t5 = timeit.Timer(partial(func4, copy(test)))

    print(f"func1 (x[x < 0]): timeit {t1.timeit(n_loops)} num test loops {n_loops}")
    print(f"func2 (putmask): timeit {t2.timeit(n_loops)} num test loops {n_loops}")
    print(f"func3 (maximum): timeit {t3.timeit(n_loops)} num test loops {n_loops}")
    print(f"func4 (clip): timeit {t4.timeit(n_loops)} num test loops {n_loops}")
    print(f"func5 (where): timeit {t5.timeit(n_loops)} num test loops {n_loops}")

test results:

func1 (x[x < 0]): timeit 7.2177265440000005 num test loops 1000
func2 (putmask): timeit 13.913492435999999 num test loops 1000
func3 (maximum): timeit 23.065230873999997 num test loops 1000
func4 (clip): timeit 22.768682354000006 num test loops 1000
func5 (where): timeit 23.844607757999995 num test loops 1000

EDIT:

different approach to test data[data < 0] = 0 vs np.were(data < 0, 0, data):

import numpy as np
from time import perf_counter as clock


z = np.random.rand(10**7) - 0.5

start = clock()
for i in range(100):
    a = z.copy()
    np.where(a<0, 0, a)
print(clock() - start)


start = clock()
for i in range(100):
    a = z.copy()
    a[a<0] = 0
print(clock() - start)

test result:

7.9247566030000005
8.021165436000002

test3:

In [1]: import numpy as np
   ...: from copy import copy
   ...:
   ...:
   ...:
   ...: test = np.random.rand(int(1e7))-0.5
   ...:
   ...:
   ...: def func1():
   ...:     data = copy(test)
   ...:     data[data < 0] = 0
   ...:
   ...:
   ...: def func2():
   ...:     data = copy(test)
   ...:     np.putmask(data, data < 0, 0)
   ...:
   ...:
   ...: def func3():
   ...:     data = copy(test)
   ...:     np.maximum(data, 0)
   ...:
   ...:
   ...: def func4():
   ...:     data = copy(test)
   ...:     data.clip(0)
   ...:
   ...:
   ...: def func5():
   ...:     data = copy(test)
   ...:     np.where(data < 0, 0, data)
   ...:

In [2]: timeit func1
16.9 ns ± 0.117 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)

In [3]: timeit func2
15.8 ns ± 0.184 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)

In [4]: timeit func3
22.1 ns ± 0.287 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)

In [5]: timeit func4
15.6 ns ± 0.0594 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)

In [6]: timeit func5
16.2 ns ± 0.187 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)

CodePudding user response：

Use np.ndarray.clip like test.clip(min=0):

>>> test.clip(0)
array([0.        , 0.11819274, 0.36379089, ..., 0.        , 0.13401746,
       0.        ])
>>>

Documentation of

https://gist.github.com/axil/af6c4adb8c5634ff39ed9f3da1efaa90