Using multiprocessing to speed up filling of a numpy array-CodePudding

I am trying to solve a problem that involves minimizing a certain function. The function contains a numpy array whose elements are filled by calling a function. The array generation is taking a huge amount of time, and I mean that. The array that I'm generating is defined below

def cov_p(Phi, l_mat, s_f, sigma_11, sigma_22, N_p):
    ans = np.zeros([N_p, N_p, 2, 2], dtype=complex)
    for i in range(N_p):
        for j in range(N_p):
            ans[i][j] = np.linalg.multi_dot([L_0, R_final(
                Phi, l_mat, s_f, (i-j)*tstep), L_0.T]) np.array([[sigma_11, 0], [0, sigma_22]])*I(i, j)

    return ans.transpose(0, 2, 1, 3).reshape(2*N_p, -1)

where L_0 and I are defined below

L_0 = np.eye(2)


def I(m, p):
    if m == p:
        return 1
    return 0

As can be seen , each element references the function R_final, which basically returns a 2 by 2 complex matrix. Then I concatenate all those 2 by 2 matrices to form a matrix of size 2N_p by 2N_p. R_final is defined below.

def R_x(Phi, l_mat, s_f, t, i, j):
    real_integral = quad(S_xr, -np.inf, np.inf,
                         args=(Phi, l_mat, s_f, t, i, j), limit=10000)
    imag_integral = quad(S_xc, -np.inf, np.inf,
                         args=(Phi, l_mat, s_f, t, i, j), limit=10000)
    return real_integral[0] 1j*imag_integral[0]

def R_final(Phi, l_mat, s_f, t):
    return np.array([[R_x(Phi, l_mat, s_f, t, i, j) for j in range(2)] for i in range(2)])

where S_xr, S_xc are shown below

def S_xr(omega, Phi, l_mat, s_f, t, i, j):
    h_x = np.real(np.linalg.multi_dot(
        [Phi, np.linalg.inv(-1j*omega*np.linalg.inv(l_mat) np.eye(np.shape(l_mat)[0])), Phi.T]))   #Taking only the real value. Check again!!!

    ans = np.exp(1j*omega*t)*np.linalg.multi_dot([np.conjugate(h_x), s_f, h_x.T])
    return np.real(ans[i][j])

def S_xc(omega, Phi, l_mat, s_f, t, i, j):
    h_x = np.real(np.linalg.multi_dot(
        [Phi, np.linalg.inv(-1j*omega*np.linalg.inv(l_mat) np.eye(np.shape(l_mat)[0])), Phi.T]))   #Taking only the real value. Check again!!!

    ans = np.exp(1j*omega*t)*np.linalg.multi_dot([np.conjugate(h_x), s_f, h_x.T])
    return np.imag(ans[i][j])

Try calculating cov_p for the following values of listed parameters.

phi = np.array([[-0.0529255  0.00662948j, -0.0529255 -0.00662948j,
        -0.03050694-0.00190298j, -0.03050694 0.00190298j],
       [-0.04149906 0.00171591j, -0.04149906-0.00171591j,
         0.01974404-0.00194719j,  0.01974404 0.00194719j]])
lamb_mat = np.array([[-1.00390867  6.28783994j,  0.          0.j        ,
         0.          0.j        ,  0.          0.j        ],
       [ 0.          0.j        , -1.00390867 -6.28783994j,
         0.          0.j        ,  0.          0.j        ],
       [ 0.          0.j        ,  0.          0.j        ,
        -0.25859133 12.09860357j,  0.          0.j        ],
       [ 0.          0.j        ,  0.          0.j        ,
         0.          0.j        , -0.25859133-12.09860357j]])
S_f = np.array([[100,0],[0,100]])
tstep = 0.1
sigma_11 = 0.3
sigma_22 = 0.4
#Try finding cov_p for the following set of inputs
cov_p(phi, lamb_mat, S_f, sigma_11, sigma_22, 10)

What I want to ask is how I can speed up filling of cov_p by dividing the operation into multiple processes. It's actually the R_final matrices to the right and bottom of cov_p which are taking time. Any help will be appreciated.

CodePudding user response：

You can easily parallelize the outermost loop using Pool of multiprocessing. The idea is to split the ans array in the last dimension and merge it once each part has been computed. Note that the tstep variable needs to be sent to processes. Here is the resulting code containing mainly the modified parts:

import numpy as np
from scipy.integrate import quad
from multiprocessing import Pool

# [...] -- Code of the following unchanged function here
# I, R_x, R_final, S_xr, S_xc

def cov_p_work(params):
    Phi, l_mat, s_f, sigma_11, sigma_22, N_p, tstep, i = params
    ans = np.zeros([N_p, 2, 2], dtype=complex)
    L_0 = np.eye(2)
    for j in range(N_p):
        ans[j] = np.linalg.multi_dot([L_0, R_final(
            Phi, l_mat, s_f, (i-j)*tstep), L_0.T]) np.array([[sigma_11, 0], [0, sigma_22]])*I(i, j)
    return ans

def cov_p(Phi, l_mat, s_f, sigma_11, sigma_22, N_p, tstep):
    with Pool() as p:
        params = [(Phi, l_mat, s_f, sigma_11, sigma_22, N_p, tstep, i) for i in range(N_p)]
        result = p.map(cov_p_work, params)
    ans = np.array(result)
    return ans.transpose(0, 2, 1, 3).reshape(2*N_p, -1)

if __name__ == '__main__':
    # [...] -- Unchanged parameters here

    #Try finding cov_p for the following set of inputs
    cov_p(phi, lamb_mat, S_f, sigma_11, sigma_22, 10, tstep)

On my 6-core machine, this is about 5 times faster (96.9 seconds --> 20.9 seconds). Note that it barely scale with more core since N_p is quite small here (which can cause some imperfect load balancing).

CodePudding user response：

The only potential optimisation I see here is to run a wrapper around quad as a discrete process. This is because quad calls S_xr and S_xc so you can't really optimise those functions.

With the values you've shown, the entire program runs in less than 50 seconds on my machine. How it ran for 6 days on your machine is a mystery. Perhaps you're using different parameters to those shown in your question. Anyway, here's the reworked version in its entirety:

import numpy as np
from scipy.integrate import quad
from multiprocessing import Pool

phi = np.array([[-0.0529255   0.00662948j, -0.0529255 - 0.00662948j,
                 -0.03050694-0.00190298j, -0.03050694 0.00190298j],
                [-0.04149906 0.00171591j, -0.04149906-0.00171591j,
                 0.01974404-0.00194719j,  0.01974404 0.00194719j]])
lamb_mat = np.array([[-1.00390867   6.28783994j,  0.   0.j,
                      0.   0.j,  0.   0.j],
                     [0.   0.j, -1.00390867 - 6.28783994j,
                      0.   0.j,  0.   0.j],
                     [0.   0.j,  0.   0.j,
                      -0.25859133 12.09860357j,  0.   0.j],
                     [0.   0.j,  0.   0.j,
                      0.   0.j, -0.25859133-12.09860357j]])

L_0 = np.eye(2)
S_f = np.array([[100, 0], [0, 100]])
tstep = 0.1
sigma_11 = 0.3
sigma_22 = 0.4
quadProc = None

def S_xr(omega, Phi, l_mat, s_f, t, i, j):
    h_x = np.real(np.linalg.multi_dot(
        [Phi, np.linalg.inv(-1j*omega*np.linalg.inv(l_mat) np.eye(np.shape(l_mat)[0])), Phi.T]))  # Taking only the real value. Check again!!!

    ans = np.exp(1j*omega*t) * np.linalg.multi_dot([np.conjugate(h_x), s_f, h_x.T])
    return np.real(ans[i][j])


def S_xc(omega, Phi, l_mat, s_f, t, i, j):
    h_x = np.real(np.linalg.multi_dot(
        [Phi, np.linalg.inv(-1j*omega*np.linalg.inv(l_mat) np.eye(np.shape(l_mat)[0])), Phi.T]))  # Taking only the real value. Check again!!!

    ans = np.exp(1j*omega*t) * np.linalg.multi_dot([np.conjugate(h_x), s_f, h_x.T])
    return np.imag(ans[i][j])

def quadWrapper(S_x, Phi, l_mat, s_f, t, i, j):
    return S_x, quad(S_x, -np.inf, np.inf, args=(Phi, l_mat, s_f, t, i, j), limit=10_000)

def R_x(Phi, l_mat, s_f, t, i, j):
    args = [(S_xr, Phi, l_mat, s_f, t, i, j), (S_xc, Phi, l_mat, s_f, t, i, j)]
    results = None

    def callback(n):
        nonlocal results
        results = n

    quadProc.starmap_async(func=quadWrapper, iterable=args, callback=callback).wait()

    for f, i in results:
        if f == S_xr:
            real_integral = i
        else:
            imag_integral = i

    return real_integral[0] 1j*imag_integral[0]


def R_final(Phi, l_mat, s_f, t):
    return np.array([[R_x(Phi, l_mat, s_f, t, i, j) for j in range(2)] for i in range(2)])


def cov_p(Phi, l_mat, s_f, sigma_11, sigma_22, N_p):
    global quadProc
    with Pool() as quadProc:
        ans = np.zeros([N_p, N_p, 2, 2], dtype=complex)
        for i in range(N_p):
            for j in range(N_p):
                ans[i][j] = np.linalg.multi_dot([L_0, R_final(
                    Phi, l_mat, s_f, (i-j)*tstep), L_0.T]) np.array([[sigma_11, 0], [0, sigma_22]])*(i==j)

    return ans.transpose(0, 2, 1, 3).reshape(2*N_p, -1)

import time
if __name__ == '__main__':
    start = time.perf_counter()
    print(cov_p(phi, lamb_mat, S_f, sigma_11, sigma_22, 10))
    end = time.perf_counter()
    print(f'Duration={end-start:.2f}s')