I have an array whose input data is 1028* 24* 24*16. When I run the code below, it works very slowly. How can I speed this up? thanks
(I want to get 3*3 matrices from a large array.)
import itertools,math,time,random
import numpy as np
start=time.time()
def ls(x):
x_p = x / np.sum(x)
return np.max(x_p)
inputs = np.random.rand(1028, 24, 24, 16)
b, r, c, ch = inputs.shape[0], inputs.shape[1], inputs.shape[2], inputs.shape[3]
inputs = np.transpose(inputs, (0, 3, 1, 2))
inputs = np.reshape(inputs, (b*ch, r, c))
s=2
r=3
num_r=9
num_c=9
ke = np.zeros((b*ch, num_r, num_c), dtype=np.float32)
for i in range(num_r):
for j in range(num_c):
outs = np.array(list(map(ls, inputs[:, i*s:i*s r, j*s:j*s r])))
ke[:, i, j] = outs
ke = np.reshape(ke, (b, ch, num_r, num_c))
ke = np.transpose(ke, (0, 2, 3, 1))
print(time.time() - start)
CodePudding user response:
The original code can be wrapped (with some polishing) into the following function:
import numpy as np
def foo0(arr, s=2, r=3, nr=9, nc=9):
forward_axes = (0, 3, 1, 2)
backward_axes = (0, 2, 3, 1)
b, r, c, ch = arr.shape
arr = np.transpose(arr, forward_axes)
arr = np.reshape(arr, (b * ch, r, c))
result = np.zeros((b * ch, nr, nc), dtype=np.float32)
for i in range(nr):
for j in range(nc):
result[:, i, j] = np.fromiter(
map(
lambda x: np.max(x / np.sum(x)),
arr[:, i * s:i * s r, j * s:j * s r]),
dtype=np.float32)
result = np.reshape(result, (b, ch, nr, nc))
result = np.transpose(result, backward_axes)
return result
While the explicit looping suggests that Numba could be applied here to get some low-hanging fruit, unfortunately the function cannot be readily decorated without interaction with Python object, thus greatly reducing the speed-up.
Fortunately, the core computation can be readily vectorized and, as long as nr
and nc
are small enough, this optimization is sufficient:
def foo1(arr, s=2, r=3, nr=9, nc=9):
forward_axes = (0, 3, 1, 2)
backward_axes = (0, 2, 3, 1)
b, r, c, ch = arr.shape
arr = np.transpose(arr, forward_axes)
arr = np.reshape(arr, (b * ch, r, c))
result = np.zeros((b * ch, nr, nc), dtype=np.float32)
for i in range(nr):
for j in range(nc):
x = arr[:, i * s:i * s r, j * s:j * s r]
result[:, i, j] = np.max(x, (-1, -2)) / np.sum(x, (-1, -2))
result = np.reshape(result, (b, ch, nr, nc))
result = np.transpose(result, backward_axes)
return result
Note that max(x / k)
is the same as max(x) / k
but the number of divisions are greatly reduced.
Actually, transposing and reshaping, while it may help with the computation speed, it is not really necessary:
def foo2(arr, s=2, r=3, nr=9, nc=9):
b, r, c, ch = arr.shape
result = np.zeros((b, nr, nc, ch), dtype=np.float32)
for i in range(nr):
for j in range(nc):
x = arr[:, i * s:i * s r, j * s:j * s r, :]
result[:, i, j, :] = np.max(x, (1, 2)) / np.sum(x, (1, 2))
return result
The above is simpler to translate in Numba, but the speed gain for small nr
/nc
would be minimal (compared to the partially vectorized approach):
import numba as nb
@nb.njit
def sum_nb(arr):
result = 0
for x in arr:
result = x
return result
@nb.njit
def max_nb(arr):
result = arr[0]
for x in arr[1:]:
if x > result:
result = x
return result
@nb.njit
def _sum_max(arr):
b, r, c, ch = arr.shape
res = np.empty((b, ch), dtype=arr.dtype)
for i in range(b):
for j in range(ch):
x = arr[i, :, :, j].ravel()
res[i, j] = max_nb(x) / sum_nb(x)
return res
@nb.njit
def foo3(arr, s=2, r=3, nr=9, nc=9):
b, r, c, ch = arr.shape
result = np.zeros((b, nr, nc, ch), dtype=np.float32)
for i in range(nr):
for j in range(nc):
result[:, i, j, :] = _sum_max(arr[:, i * s:i * s r, j * s:j * s r, :])
return result
Another option would be to keep the Numba-incompatible code outside of the main looping:
@nb.njit(fastmath=True)
def _foo4(arr, result, s, r, nr, nc):
bch, nr, nc = result.shape
for i in range(nr):
for j in range(nc):
for k in range(bch):
x = arr[k, i * s:i * s r, j * s:j * s r].ravel()
result[k, i, j] = max_nb(x) / sum_nb(x)
return result
def foo4(arr, s=2, r=3, nr=9, nc=9):
forward_axes = (0, 3, 1, 2)
backward_axes = (0, 2, 3, 1)
b, r, c, ch = arr.shape
arr = np.transpose(arr, forward_axes)
arr = np.reshape(arr, (b * ch, r, c))
result = np.empty((b * ch, nr, nc))
result = _foo4(arr, result, s, r, nr, nc)
result = np.reshape(result, (b, ch, nr, nc))
result = np.transpose(result, backward_axes)
return result
but again the speed gain would be minimal.
Note that a fully vectorized approach is unlikely to be efficient, because the objects inside the main loops are jagged.
To get some ideas of the relative speed:
funcs = foo0, foo1, foo2, foo3, foo4
arr = np.random.rand(100, 24, 24, 16)
timeds_n = {}
for p in range(1):
n = 10 ** p
k = 3
arr = np.random.rand(100, 24, 24, 16)
print(f"N = {arr.size}")
base = funcs[0](arr)
timeds_n[n] = []
for func in funcs:
res = func(arr)
timed = %timeit -r 1 -n 1 -q -o func(arr)
timeds_n[n].append(timed.best)
print(f"{func.__name__:>24} {np.allclose(base, res)} {timed.best:.9f}")
N = 921600
foo0 True 1.757508748
foo1 True 0.095540081
foo2 True 0.179208341
foo3 True 0.160671403
foo4 True 0.155691721
CodePudding user response:
I think the issue is mainly the function ls which should be vectorized and the list / map that takes you time
import itertools,math,time,random
import numpy as np
start=time.time()
def ls(x):
x_p = x / np.sum(np.sum(x, axis=1), axis=1)[:,None,None]
return np.max(np.max(x_p,axis=1),axis=1)
inputs = np.random.rand(1028, 24, 24, 16)
b, r, c, ch = inputs.shape[0], inputs.shape[1], inputs.shape[2], inputs.shape[3]
inputs = np.transpose(inputs, (0, 3, 1, 2))
inputs = np.reshape(inputs, (b*ch, r, c))
s=2
r=3
num_r=9
num_c=9
ke = np.zeros((b*ch, num_r, num_c), dtype=np.float32)
for i in range(num_r):
print(i)
for j in range(num_c):
# outs = np.array(list(map(ls, inputs[:, i*s:i*s r, j*s:j*s r])))
outs = ls(inputs[:, i*s:i*s r, j*s:j*s r])
ke[:, i, j] = outs
ke = np.reshape(ke, (b, ch, num_r, num_c))
ke = np.transpose(ke, (0, 2, 3, 1))
print(time.time() - start)