Home > database >  Looking for alternatives to increase performance in a FOR python loop
Looking for alternatives to increase performance in a FOR python loop

Time:10-27

I'm currently running this five FOR loops to meet certain conditions and fill another dataframe column. Both dataframe sizes approximately 500 row so basically each loop i run I'm iterating over 250k combinations. Although it's working as expected, its taking too much time to execute but i didnt find any alternatives yet.

for x, linha_calc in calculos.iterrows():
    for z, linha_opf in opf.iterrows():
        if (linha_calc['E'] == linha_opf['E']) & (
                linha_calc['F'] == linha_opf['F']) & (
                linha_calc['G'] == linha_opf['G']) & (
                linha_opf['H'] == 'C') & (
                linha_opf['I'] == 'OFC'):
            calculos.at[x, 'J'] = 1
            break

for x, linha_calc in calculos.iterrows():
    for z, linha_opf in opf.iterrows():
        if (linha_calc['K'] == linha_opf['K']) & (
                linha_calc['L'] == linha_opf['L']) & (
                linha_calc['M'] == linha_opf['M']) & (
                linha_opf['N'] == 'V') & (
                linha_opf['O'] == 'OFV'):
            calculos.at[x, 'P'] = 1
            break

for x, linha_calc in calculos.iterrows():
    for z, linha_opf in opf.iterrows():
        if (linha_calc['Q'] == linha_opf['Q']) & (
                linha_calc['R'] == linha_opf['R']) & (
                linha_calc['S'] == linha_opf['S']) & (
                linha_opf['T'] == 'C') & (
                linha_opf['U'] == 'OFV'):
            calculos.at[x, 'V'] = 1
            break

for x, linha_calc in calculos.iterrows():
    for z, linha_opf in opf.iterrows():
        if (linha_calc['X'] == linha_opf['X']) & (
                linha_calc['Y'] == linha_opf['Y']) & (
                linha_calc['Z'] == linha_opf['Z']) & (
                linha_opf['W'] == 'V') & (
                linha_opf['W1'] == 'OFC'):
            calculos.at[x, 'W2'] = 1
            break

for x, linha_calc in calculos.iterrows():
    for z, linha_opf in opf.iterrows():
        if (linha_calc['AA'] == linha_opf['AA']) & (
                linha_calc['BB'] == linha_opf['BB']) & (
                linha_calc['CC'] == linha_opf['CC']) & (
                linha_opf['DD'] == 'V') & (
                linha_opf['EE'] == 'OFC'):
            calculos.at[x, 'FF'] = 1
            break

for x, linha_calc in calculos.iterrows():
    for z, linha_opf in opf.iterrows():
        if (linha_calc['A'] == linha_opf['A']) & (
                linha_calc['B'] == linha_opf['B']) & (
                linha_opf['C'] != 0):
            calculos.at[x, 'D'] = 1
            break
    enter code here

CodePudding user response:

You could probably combine the loops, then set a bool value for each condition, replacing the breaks.

cond1, cond2, ... = False, False, ...
for x, linha_calc in calculos.iterrows():
    for z, linha_opf in opf.iterrows(): 
        if (linha_calc['E'] == linha_opf['E']) & (
            linha_calc['F'] == linha_opf['F']) & (
            linha_calc['G'] == linha_opf['G']) & (
            linha_opf['H'] == 'C') & (
            linha_opf['I'] == 'OFC') & not cond1:
        calculos.at[x, 'J'] = 1
        cond1 = True
        
        if (linha_calc['K'] == linha_opf['K']) & (
            linha_calc['L'] == linha_opf['L']) & (
            linha_calc['M'] == linha_opf['M']) & (
            linha_opf['N'] == 'V') & (
            linha_opf['O'] == 'OFV') & not cond2:
        calculos.at[x, 'P'] = 1
        cond2 = True
        ...
        ...
        ...

        if cond1 and con2 and ... condX:
            break

This will save you looping through the document for each case, X times.

CodePudding user response:

You can speed up for loop using zip rather than iterrows reference

  • iterrrows has a lot of overhead which slows the looping
  • we can also create a utility function to handle the different for loops

Code

import pandas as pd

# Utility function to handle the various for loops
def update(calculos, opf, input_columns, values, update_column):
    '''
        Uses zip to iterate over rows of calculos and opf
    '''
    # Mask of rows to change
    mask = [False]*calculos.shape[0]
    
    if isinstance(values, list):
        n = len(values)
        values = tuple(values)   # to allow comparison to tuple later
    else:
        n = 1       # values is scalar
        
    for x, calc in enumerate(zip(*(calculos[col] for col in input_columns))): # Iterate over rows of calculos
        calc = calc[:-n]   # drop last n elements (i.e. not used in comparison)

        for opf_ in zip(*(opf[col] for col in input_columns)):               # Iterate over rows of opf
            
            # Compare calc and calc_ without last n elements and compare last n elements to values
            if calc == opf_[:-n]:
                if ((n > 1 and opf_[-n:] == values) or 
                    (n == 1 and opf_[-1] != values)):   # second condition for scalar value (e.g. not 0)
                    mask[x] = 1
                    break

        calculos[update_column] = np.where(mask, 1, calculos[update_column])   # 1 where Mask is true, unchanged otherwise

Usage

import pandas as pd
import numpy as np

from random import randint, seed
import timeit

seed(1234)
N = 5            # number of rows
M = 100             # range of values i.e. 0 to M

# Generate data
data = {"E":[randint(0, M) for _ in range(N)],
        "F":[randint(0, M) for _ in range(N)], 
        "G":[randint(0, M) for _ in range(N)],
        "H":['C' if i%2==0 else 'X' for i in range(N)],
        "I":['OFC' if i%2==0 else 'X' for i in range(N)],
        "J":[randint(2, M) for _ in range(N)],
        "K":[randint(0, M) for _ in range(N)],
        "L":[randint(0, M) for _ in range(N)], 
        "M":[randint(0, M) for _ in range(N)],
        "N":['C' if i%2==0 else 'X' for i in range(N)],
        "O":['OFC' if i%2==0 else 'X' for i in range(N)],
        "P":[randint(2, M) for _ in range(N)],
        "Q":[randint(0, M) for _ in range(N)],
        "R":[randint(0, M) for _ in range(N)], 
        "S":[randint(0, M) for _ in range(N)],
        "T":['C' if i%2==0 else 'X' for i in range(N)],
        "U":['OFC' if i%2==0 else 'X' for i in range(N)],
        "V":[randint(2, M) for _ in range(N)],
        "A":[randint(0, M) for _ in range(N)],
        "B":['C' if i%2==0 else 'X' for i in range(N)],
        "C":['OFC' if i%2==0 else 'X' for i in range(N)],
        "D":[randint(2, M) for _ in range(N)]}

calculos = pd.DataFrame(data)
opf = pd.DataFrame(data)

# Use update for posted for loops with parameters as follows.
update(calculos, opf, ['E', 'F', 'G', 'H', 'I'], ['C', 'OFC'], 'J')
update(calculos, opf, ['K', 'L', 'M', 'N', 'O'], ['V', 'OFV'], 'P')
update(calculos, opf, ['Q', 'R', 'S', 'T', 'U'], ['C', 'OFV'], 'V')
update(calculos, opf, ['A', 'B', 'C'], 0, 'D')   # use scalar value of 0

Output

E   F   G   H   I   J   K   L   M   N   ... Q   R   S   T   U   V   A   B   C   D
0   40  37  44  C   OFC 8   11  67  24  C   ... 33  25  36  C   OFC 32  18  C   OFC 1
1   10  95  70  X   X   63  40  7   100 X   ... 56  9   15  X   X   15  44  X   X   1
2   25  85  33  C   OFC 66  31  96  35  C   ... 58  42  63  C   OFC 63  4   C   OFC 1
3   34  13  95  X   X   22  42  82  23  X   ... 17  42  63  X   X   34  21  X   X   1
4   7   76  51  C   OFC 9   63  17  65  C   ... 11  15  24  C   OFC 42  51  C   OFC 1
5 rows × 22 columns

Performance

Summary: ~120X speed up (timing first for loop only) Timing:

  • Posted Method ~ 2 minutes
  • Revised method using update function: ~1 second

Data Generation

N = 1000    # Number of rows
M = 100
seed(1234)   # random seed to make results repeatable

data = {"E":[randint(0, M) for _ in range(N)],
        "F":[randint(0, M) for _ in range(N)], 
        "G":[randint(0, M) for _ in range(N)],
        "H":['C' if i%2==0 else 'X' for i in range(N)],
        "I":['OFC' if i%2==0 else 'X' for i in range(N)],
        "J":[randint(2, M) for _ in range(N)]}

calculos = pd.DataFrame(data)
opf = pd.DataFrame(data)

Using Update

%timeit update(calculos, opf, ['E', 'F', 'G', 'H', 'I'], ['C', 'OFC'], 'J')

Result: 1.09 s ± 29.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Posted Method

%%timeit
for x, linha_calc in calculos.iterrows():
    for z, linha_opf in opf.iterrows():
        if (linha_calc['E'] == linha_opf['E']) & (
                linha_calc['F'] == linha_opf['F']) & (
                linha_calc['G'] == linha_opf['G']) & (
                linha_opf['H'] == 'C') & (
                linha_opf['I'] == 'OFC'):
            calculos.at[x, 'J'] = 1
            break

Result: 2min 51s ± 6.96 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
rows
for rows in linha_calc_rows:
  • Related