faster processing with matrix-CodePudding

I have an array whose input data is 1028* 24* 24*16. When I run the code below, it works very slowly. How can I speed this up? thanks

(I want to get 3*3 matrices from a large array.)

import itertools,math,time,random
import numpy as np

start=time.time()

def ls(x):
     x_p = x / np.sum(x)         
     return np.max(x_p)

inputs = np.random.rand(1028, 24, 24, 16)
b, r, c, ch = inputs.shape[0], inputs.shape[1], inputs.shape[2], inputs.shape[3]

inputs = np.transpose(inputs, (0, 3, 1, 2)) 
inputs = np.reshape(inputs, (b*ch, r, c))

s=2
r=3
num_r=9
num_c=9
ke = np.zeros((b*ch, num_r, num_c), dtype=np.float32)

for i in range(num_r):
     for j in range(num_c):
         outs = np.array(list(map(ls, inputs[:, i*s:i*s r, j*s:j*s r])))
         ke[:, i, j] = outs

ke = np.reshape(ke, (b, ch, num_r, num_c))
ke = np.transpose(ke, (0, 2, 3, 1))

print(time.time() - start)

CodePudding user response：

The original code can be wrapped (with some polishing) into the following function:

import numpy as np


def foo0(arr, s=2, r=3, nr=9, nc=9):
    forward_axes = (0, 3, 1, 2)
    backward_axes = (0, 2, 3, 1)
    b, r, c, ch = arr.shape
    arr = np.transpose(arr, forward_axes) 
    arr = np.reshape(arr, (b * ch, r, c))
    result = np.zeros((b * ch, nr, nc), dtype=np.float32)
    for i in range(nr):
        for j in range(nc):
            result[:, i, j] = np.fromiter(
                map(
                    lambda x: np.max(x / np.sum(x)),
                    arr[:, i * s:i * s   r, j * s:j * s   r]),
                dtype=np.float32)
    result = np.reshape(result, (b, ch, nr, nc))
    result = np.transpose(result, backward_axes)
    return result

While the explicit looping suggests that Numba could be applied here to get some low-hanging fruit, unfortunately the function cannot be readily decorated without interaction with Python object, thus greatly reducing the speed-up. Fortunately, the core computation can be readily vectorized and, as long as nr and nc are small enough, this optimization is sufficient:

def foo1(arr, s=2, r=3, nr=9, nc=9):
    forward_axes = (0, 3, 1, 2)
    backward_axes = (0, 2, 3, 1)
    b, r, c, ch = arr.shape
    arr = np.transpose(arr, forward_axes) 
    arr = np.reshape(arr, (b * ch, r, c))
    result = np.zeros((b * ch, nr, nc), dtype=np.float32)
    for i in range(nr):
        for j in range(nc):
            x = arr[:, i * s:i * s   r, j * s:j * s   r]
            result[:, i, j] = np.max(x, (-1, -2)) / np.sum(x, (-1, -2))
    result = np.reshape(result, (b, ch, nr, nc))
    result = np.transpose(result, backward_axes)
    return result

Note that max(x / k) is the same as max(x) / k but the number of divisions are greatly reduced.

Actually, transposing and reshaping, while it may help with the computation speed, it is not really necessary:

def foo2(arr, s=2, r=3, nr=9, nc=9):
    b, r, c, ch = arr.shape
    result = np.zeros((b, nr, nc, ch), dtype=np.float32)
    for i in range(nr):
        for j in range(nc):
            x = arr[:, i * s:i * s   r, j * s:j * s   r, :]
            result[:, i, j, :] = np.max(x, (1, 2)) / np.sum(x, (1, 2))
    return result

The above is simpler to translate in Numba, but the speed gain for small nr/nc would be minimal (compared to the partially vectorized approach):

import numba as nb


@nb.njit
def sum_nb(arr):
    result = 0
    for x in arr:
        result  = x
    return result


@nb.njit
def max_nb(arr):
    result = arr[0]
    for x in arr[1:]:
        if x > result:
            result = x
    return result


@nb.njit
def _sum_max(arr):
    b, r, c, ch = arr.shape
    res = np.empty((b, ch), dtype=arr.dtype)
    for i in range(b):
        for j in range(ch):
            x = arr[i, :, :, j].ravel()
            res[i, j] = max_nb(x) / sum_nb(x)
    return res


@nb.njit
def foo3(arr, s=2, r=3, nr=9, nc=9):
    b, r, c, ch = arr.shape
    result = np.zeros((b, nr, nc, ch), dtype=np.float32)
    for i in range(nr):
        for j in range(nc):
            result[:, i, j, :] = _sum_max(arr[:, i * s:i * s   r, j * s:j * s   r, :])
    return result

Another option would be to keep the Numba-incompatible code outside of the main looping:

@nb.njit(fastmath=True)
def _foo4(arr, result, s, r, nr, nc):
    bch, nr, nc = result.shape
    for i in range(nr):
        for j in range(nc):
            for k in range(bch):
                x = arr[k, i * s:i * s   r, j * s:j * s   r].ravel()
                result[k, i, j] = max_nb(x) / sum_nb(x)
    return result


def foo4(arr, s=2, r=3, nr=9, nc=9):
    forward_axes = (0, 3, 1, 2)
    backward_axes = (0, 2, 3, 1)
    b, r, c, ch = arr.shape
    arr = np.transpose(arr, forward_axes) 
    arr = np.reshape(arr, (b * ch, r, c))
    result = np.empty((b * ch, nr, nc))
    result = _foo4(arr, result, s, r, nr, nc)
    result = np.reshape(result, (b, ch, nr, nc))
    result = np.transpose(result, backward_axes)
    return result

but again the speed gain would be minimal.

Note that a fully vectorized approach is unlikely to be efficient, because the objects inside the main loops are jagged.

To get some ideas of the relative speed:

funcs = foo0, foo1, foo2, foo3, foo4
arr = np.random.rand(100, 24, 24, 16)


timeds_n = {}
for p in range(1):
    n = 10 ** p
    k = 3
    arr = np.random.rand(100, 24, 24, 16)
    print(f"N = {arr.size}")
    base = funcs[0](arr)
    timeds_n[n] = []
    for func in funcs:
        res = func(arr)
        timed = %timeit -r 1 -n 1 -q -o func(arr)
        timeds_n[n].append(timed.best)
        print(f"{func.__name__:>24}  {np.allclose(base, res)}  {timed.best:.9f}")

N = 921600
                    foo0  True  1.757508748
                    foo1  True  0.095540081
                    foo2  True  0.179208341
                    foo3  True  0.160671403
                    foo4  True  0.155691721

CodePudding user response：

I think the issue is mainly the function ls which should be vectorized and the list / map that takes you time

import itertools,math,time,random
import numpy as np

start=time.time()

def ls(x):
     x_p = x / np.sum(np.sum(x, axis=1), axis=1)[:,None,None]
     return np.max(np.max(x_p,axis=1),axis=1)

inputs = np.random.rand(1028, 24, 24, 16)
b, r, c, ch = inputs.shape[0], inputs.shape[1], inputs.shape[2], inputs.shape[3]

inputs = np.transpose(inputs, (0, 3, 1, 2))
inputs = np.reshape(inputs, (b*ch, r, c))

s=2
r=3
num_r=9
num_c=9
ke = np.zeros((b*ch, num_r, num_c), dtype=np.float32)

for i in range(num_r):
    print(i)
    for j in range(num_c):
        # outs = np.array(list(map(ls, inputs[:, i*s:i*s r, j*s:j*s r])))
        outs =  ls(inputs[:, i*s:i*s r, j*s:j*s r])
        ke[:, i, j] = outs

ke = np.reshape(ke, (b, ch, num_r, num_c))
ke = np.transpose(ke, (0, 2, 3, 1))

print(time.time() - start)