I want to populate a huge sparse matrix in python, with the aim to implement Crank-Nicolson numerical method in 2D.
I did it by lopping through all the interior nodes using two nested for loops, but it is extremely slow. Because it is a nested for loop with matrices, I thought of Numba, but it doesn't work with sparse matrices. I cannot convert the matrix to a dense format before passing it as an argument to a numba-jitted function, because of memory issues.
I want to ask what shall I look for in order to make the function populating the A
matrix below quicker?
I tried with scipy.sparse.diags
, but I again ended up using two nested for loops, just as in the (naive) code below.
The problem is that the k
value is computed from i
and j
, and I don't know how to manipulate it without the double for loop.
The code which populates the A
matrix with the double for loop is:
# @njit
def populate_A_matrix_b_vector(psi_gr, timeste, A, b, sys_float_info_min):
# Fill all the interior nodes
# -------------------------------------
# Utilities:
two_lambd = 2 * gl.lambd
oneover4_times_1over_deltazsq = 0.25 / (gl.delta_z**2)
oneover2_times_1over_deltazsq = 0.5 / (gl.delta_z**2)
# For the interior nodes:
for i in range(1, gl.N_z_divs):
for j in range(1, gl.N_epsilon_divs):
k = j*(gl.N_z_divs 1) i
ratio = 1 / ( (2*gl.lambd**2) * (j*gl.delta_epsilon)**two_lambd )
A[k, k] = (-1/gl.delta_time) - ratio * (j**2 - (gl.lambd-0.5)**2 * 0.5) - oneover2_times_1over_deltazsq - \
0.5 * Voft_OFF_imag(i, j, gl.m)
A[k, k 1] = oneover4_times_1over_deltazsq
A[k, k-1] = oneover4_times_1over_deltazsq
A[k, k (gl.N_z_divs 1)] = ratio * ( j**2*0.5 - (gl.lambd-1)*j*0.5 )
A[k, k - (gl.N_z_divs 1)] = ratio * ( j**2*0.5 (gl.lambd-1)*j*0.5 )
b[k] = psi_gr[i, j, timeste] * ( (-1/gl.delta_time) ratio*j**2 - (gl.lambd-0.5)**2*0.5 oneover2_times_1over_deltazsq \
0.5 * Voft_OFF_imag(i, j, gl.m) ) \
psi_gr[i 1, j, timeste] * (-oneover4_times_1over_deltazsq) \
psi_gr[i-1, j, timeste] * (-oneover4_times_1over_deltazsq) \
psi_gr[i, j 1, timeste] * ( -ratio * (j**2 * 0.5 - (gl.lambd-1) * j * 0.5) ) \
psi_gr[i, j-1, timeste] * ( -ratio * (j**2 * 0.5 (gl.lambd-1) * j * 0.5) )
if (i % 50 == 0):
print("we have done iter" str(i*gl.N_epsilon_divs j) "out of" str(gl.N_z_divs*gl.N_epsilon_divs) "iters")
# Fill boundaries
# -------------------------
# Bottom boundary except corners: j = 0; i = 1, 2, 3, ..., gl.N_z_divs - 1
j = 0
for i in range(1, gl.N_z_divs):
k = j*(gl.N_z_divs 1) i
A[k, k] = 1
if ( np.real(psi_gr[i, 2, timeste]) >= sys_float_info_min and np.imag(psi_gr[i, 2, timeste]) >= sys_float_info_min):
alpha = psi_gr[i, 1, timeste] / psi_gr[i, 2, timeste]
if (np.arctan(np.imag(alpha) / np.real(alpha))) < 0:
alpha = np.abs(alpha)
else:
alpha = 0.0
# A[k, k (gl.N_z_divs 1)] = alpha
A[k, k (gl.N_z_divs 1)] = 0.0
# Top boundary except corners: j = gl.N_epsilon_divs; i = 1, 2, 3, ..., gl.N_z_divs - 1
j = gl.N_epsilon_divs
for i in range(1, gl.N_z_divs):
k = j*(gl.N_z_divs 1) i
A[k, k] = 1
if (np.real(psi_gr[i, gl.N_epsilon_divs-2, timeste])>= sys_float_info_min and np.imag(psi_gr[i, gl.N_epsilon_divs-2, timeste]) >= sys_float_info_min):
alpha = psi_gr[i, gl.N_epsilon_divs-1, timeste] / psi_gr[i, gl.N_epsilon_divs-2, timeste]
if (np.arctan(np.imag(alpha) / np.real(alpha))) < 0:
alpha = np.abs(alpha)
else:
alpha = 0.0
# A[k, k-(gl.N_z_divs 1)] = alpha
A[k, k-(gl.N_z_divs 1)] = 0.0
# Left boundary except corners: i = 0; j = 1, 2, 3, ..., gl.N_epsilon_divs - 1
i = 0
for j in range(1, gl.N_epsilon_divs):
k = j*(gl.N_z_divs 1) i
A[k, k] = 1
if (np.real(psi_gr[2, j, timeste]) >= sys_float_info_min and np.imag(psi_gr[2, j, timeste]) >= sys_float_info_min):
alpha = psi_gr[1, j, timeste] / psi_gr[2, j, timeste]
if (np.arctan(np.imag(alpha) / np.real(alpha))) < 0:
alpha = np.abs(alpha)
else:
alpha = 0.0
# A[k, k 1] = alpha
A[k, k 1] = 0.0
# Right boundary except corners: i = N_z_divs; j = 1, 2, 3, ..., gl.N_epsilon_divs - 1
i = gl.N_z_divs
for j in range(1, gl.N_epsilon_divs):
k = j*(gl.N_z_divs 1) i
A[k, k] = 1
if (np.real(psi_gr[gl.N_z_divs-2, j, timeste]) >= sys_float_info_min and np.imag(psi_gr[gl.N_z_divs-2, j, timeste]) >= sys_float_info_min):
alpha = psi_gr[gl.N_z_divs-1, j, timeste] / psi_gr[gl.N_z_divs-2, j, timeste]
if (np.arctan(np.imag(alpha) / np.real(alpha))) < 0:
alpha = np.abs(alpha)
else:
alpha = 0.0
A[k, k-1] = alpha
# A[k, k-1] = 0.0
# Bottom left corner: i = j = 0
i = 0; j = 0
k = j*(gl.N_z_divs 1) i
A[k, k] = 1
if (np.real(psi_gr[2, 0, timeste]) >= sys_float_info_min and np.imag(psi_gr[2, 0, timeste]) >= sys_float_info_min):
alpha1 = psi_gr[1, 0, timeste] / psi_gr[2, 0, timeste]
if (np.arctan(np.imag(alpha1) / np.real(alpha1))) < 0:
alpha1 = np.abs(alpha1)
else:
alpha1 = 0.0
if (np.real(psi_gr[0, 2, timeste]) >= sys_float_info_min and np.imag(psi_gr[0, 2, timeste]) >= sys_float_info_min):
alpha2 = psi_gr[0, 1, timeste] / psi_gr[0, 2, timeste]
if (np.arctan(np.imag(alpha2) / np.real(alpha2))) < 0:
alpha2 = np.abs(alpha2)
else:
alpha2 = 0.0
# A[k, k 1] = - 0.5 * alpha1
# A[k, k (gl.N_z_divs 1)] = - 0.5 * alpha2
A[k, k 1] = 0.0
A[k, k (gl.N_z_divs 1)] = 0.0
# Bottom right corner: i = N_z_divs; j = 0
i = gl.N_z_divs; j = 0
k = j*(gl.N_z_divs 1) i
A[k, k] = 1
if (np.real(psi_gr[gl.N_z_divs-2, 0, timeste]) >= sys_float_info_min and np.imag(psi_gr[gl.N_z_divs-2, 0, timeste]) >= sys_float_info_min):
alpha1 = psi_gr[gl.N_z_divs-1, 0, timeste] / psi_gr[gl.N_z_divs-2, 0, timeste]
if (np.arctan(np.imag(alpha1) / np.real(alpha1))) < 0:
alpha1 = np.abs(alpha1)
else:
alpha1 = 0.0
if (np.real(psi_gr[gl.N_z_divs, 2, timeste]) >= sys_float_info_min and np.imag(psi_gr[gl.N_z_divs, 2, timeste]) >= sys_float_info_min):
alpha2 = psi_gr[gl.N_z_divs, 1, timeste] / psi_gr[gl.N_z_divs, 2, timeste]
if (np.arctan(np.imag(alpha2) / np.real(alpha2))) < 0:
alpha2 = np.abs(alpha2)
else:
alpha2 = 0.0
# A[k, k-1] = - 0.5 * alpha1
# A[k, k (gl.N_z_divs 1)] = - 0.5 * alpha2
A[k, k-1] = 0.0
A[k, k (gl.N_z_divs 1)] = 0.0
# Top left corner: i = 0; j = N_epsilon_divs
i = 0; j = gl.N_epsilon_divs
k = j*(gl.N_z_divs 1) i
A[k, k] = 1
if (np.real(psi_gr[2, gl.N_epsilon_divs, timeste]) >= sys_float_info_min and np.imag(psi_gr[2, gl.N_epsilon_divs, timeste]) >= sys_float_info_min):
alpha1 = psi_gr[1, gl.N_epsilon_divs, timeste] / psi_gr[2, gl.N_epsilon_divs, timeste]
if (np.arctan(np.imag(alpha1) / np.real(alpha1))) < 0:
alpha1 = np.abs(alpha1)
else:
alpha1 = 0.0
if (np.real(psi_gr[0, gl.N_epsilon_divs-2, timeste]) >= sys_float_info_min and np.imag(psi_gr[0, gl.N_epsilon_divs-2, timeste]) >= sys_float_info_min):
alpha2 = psi_gr[0, gl.N_epsilon_divs-1, timeste] / psi_gr[0, gl.N_epsilon_divs-2, timeste]
if (np.arctan(np.imag(alpha2) / np.real(alpha2))) < 0:
alpha2 = np.abs(alpha2)
else:
alpha2 = 0.0
# A[k, k 1] = - 0.5 * alpha1
# A[k, k - (gl.N_z_divs 1)] = - 0.5 * alpha2
A[k, k 1] = 0.0
A[k, k - (gl.N_z_divs 1)] = 0.0
# Top right corner: i = N_z_divs; j = N_epsilon_divs
i = gl.N_z_divs; j = gl.N_epsilon_divs
k = j*(gl.N_z_divs 1) i
A[k, k] = 1
if (np.real(psi_gr[gl.N_z_divs-2, gl.N_epsilon_divs, timeste]) >= sys_float_info_min and np.imag(psi_gr[gl.N_z_divs-2, gl.N_epsilon_divs, timeste]) >= sys_float_info_min):
alpha1 = psi_gr[gl.N_z_divs-1, gl.N_epsilon_divs, timeste] / psi_gr[gl.N_z_divs-2, gl.N_epsilon_divs, timeste]
if (np.arctan(np.imag(alpha1) / np.real(alpha1))) < 0:
alpha1 = np.abs(alpha1)
else:
alpha1 = 0.0
if (np.real(psi_gr[gl.N_z_divs, gl.N_epsilon_divs-2, timeste]) >= sys_float_info_min and np.imag(psi_gr[gl.N_z_divs, gl.N_epsilon_divs-2, timeste]) >= sys_float_info_min):
alpha2 = psi_gr[gl.N_z_divs, gl.N_epsilon_divs-1, timeste] / psi_gr[gl.N_z_divs, gl.N_epsilon_divs-2, timeste]
if (np.arctan(np.imag(alpha2) / np.real(alpha2))) < 0:
alpha2 = np.abs(alpha2)
else:
alpha2 = 0.0
# A[k, k-1] = - 0.5 * alpha1
# A[k, k -(gl.N_z_divs 1)] = - 0.5 * alpha2
A[k, k-1] = 0.0
A[k, k -(gl.N_z_divs 1)] = 0.0
return A, b
@njit
def Voft_OFF_imag(i, j, m):
# this is the same as Voft_OFF() above, because there is no modification due to the imag. time propagation (delta_time --> -1j*delta_time) appearing in Voft_OFF (as there is no E-field turned on)
ii = i - gl.K/2
term1 = -1 / sqrt( (j*gl.delta_epsilon)**(2*gl.lambd) (ii*gl.delta_z)**2 ) # - 1 / sqrt(...)
term2 = m**2 / ( 2*(j*gl.delta_epsilon)**(2*gl.lambd) )
return (term1 term2)
and I use it as:
def solve_for_1timestep_imag(psi_gr, timeste):
size = (gl.N_z_divs 1) * (gl.N_epsilon_divs 1)
A = sparse.dok_matrix( (size, size), dtype=np.complex64)
# b = sparse.dok_array((size, 1), dtype=np.complex64)
b = np.zeros( (size, 1), dtype=np.complex64)
sys_float_info_min = sys.float_info.min
A, b = populate_A_matrix_b_vector(psi_gr, timeste, A, b, sys_float_info_min)
...
...
The gl.py
file is:
import numpy as np
Energy_1s_au = np.float64(-0.5) # -13.6 eV in atomic units of energy
m = np.float64(0.0) # magnetic Q number
omega = np.float64(0.2)
E0 = np.float64(0.3)
N_timesteps_imag = np.int32(120)
lambd = np.float64(1.5)
delta_time = np.float64(0.01)
N_epsilon_divs = np.int32(200)
N_z_divs = np.int32(500)
K = N_z_divs # equal to the variable N_z_divs
delta_epsilon = np.float64(0.1)
delta_z = np.float64(0.1)
z_max = (N_z_divs/2) * delta_z
epsilon_range = np.linspace(0.0, N_epsilon_divs*delta_epsilon, N_epsilon_divs 1)
z_range = np.linspace(-z_max, z_max, N_z_divs 1)
That function populating the A
matrix is the bottleneck of my code and I would greatly appreciate any help on how to make it at least a little bit faster.
Thank you!
CodePudding user response:
Scipy sparse matrices are pretty slow. This is especially true for the DOK matrices using inefficient hash-tables internally. However, reading/Setting a specific cell of a sparse matrices in a loop is insanely slow (eg. each access takes 10~15 us on my machine). You should avoid that like the plague.
One solution to solve this problem is to create an array of indices and values and write the values to the space matrix in vectorized way. The computation of the indices/values can be optimized with Numba. Here is an example:
@nb.njit
def gen_indexed_values(psi_gr, timeste, b):
two_lambd = 2 * gl.lambd
oneover4_times_1over_deltazsq = 0.25 / (gl.delta_z**2)
oneover2_times_1over_deltazsq = 0.5 / (gl.delta_z**2)
res_size = (gl.N_z_divs-1) * (gl.N_epsilon_divs-1)
A_idx = np.empty(res_size, dtype=np.int32)
A_values = np.empty((3, res_size), dtype=np.complex64)
cur = 0
for i in range(1, gl.N_z_divs):
for j in range(1, gl.N_epsilon_divs):
k = j*(gl.N_z_divs 1) i
ratio = 1 / ( (2*gl.lambd**2) * (j*gl.delta_epsilon)**two_lambd )
A_idx[cur] = k
A_values[0, cur] = (-1/gl.delta_time) - ratio * (j**2 - (gl.lambd-0.5)**2 * 0.5) - oneover2_times_1over_deltazsq - \
0.5 * Voft_OFF_imag(i, j, gl.m)
A_values[1, cur] = ratio * ( j**2*0.5 - (gl.lambd-1)*j*0.5 )
A_values[2, cur] = ratio * ( j**2*0.5 (gl.lambd-1)*j*0.5 )
b[k] = psi_gr[i, j, timeste] * ( (-1/gl.delta_time) ratio*j**2 - (gl.lambd-0.5)**2*0.5 oneover2_times_1over_deltazsq \
0.5 * Voft_OFF_imag(i, j, gl.m) ) \
psi_gr[i 1, j, timeste] * (-oneover4_times_1over_deltazsq) \
psi_gr[i-1, j, timeste] * (-oneover4_times_1over_deltazsq) \
psi_gr[i, j 1, timeste] * ( -ratio * (j**2 * 0.5 - (gl.lambd-1) * j * 0.5) ) \
psi_gr[i, j-1, timeste] * ( -ratio * (j**2 * 0.5 (gl.lambd-1) * j * 0.5) )
cur = 1
return A_idx, A_values
def populate_A_matrix_b_vector(psi_gr, timeste, A, b, sys_float_info_min):
# Utilities:
two_lambd = 2 * gl.lambd
oneover4_times_1over_deltazsq = 0.25 / (gl.delta_z**2)
oneover2_times_1over_deltazsq = 0.5 / (gl.delta_z**2)
# For the interior nodes:
A_idx, A_values = gen_indexed_values(psi_gr, timeste, b)
A[A_idx, A_idx] = A_values[0,:]
A[A_idx, A_idx 1] = oneover4_times_1over_deltazsq
A[A_idx, A_idx-1] = oneover4_times_1over_deltazsq
A[A_idx, A_idx (gl.N_z_divs 1)] = A_values[1,:]
A[A_idx, A_idx - (gl.N_z_divs 1)] = A_values[2,:]
# [...]
This is about 22 times faster on my machine. Using another kind of sparse matrix may help to improve performance.