I'm currently running this five FOR loops to meet certain conditions and fill another dataframe column. Both dataframe sizes approximately 500 row so basically each loop i run I'm iterating over 250k combinations. Although it's working as expected, its taking too much time to execute but i didnt find any alternatives yet.
for x, linha_calc in calculos.iterrows():
for z, linha_opf in opf.iterrows():
if (linha_calc['E'] == linha_opf['E']) & (
linha_calc['F'] == linha_opf['F']) & (
linha_calc['G'] == linha_opf['G']) & (
linha_opf['H'] == 'C') & (
linha_opf['I'] == 'OFC'):
calculos.at[x, 'J'] = 1
break
for x, linha_calc in calculos.iterrows():
for z, linha_opf in opf.iterrows():
if (linha_calc['K'] == linha_opf['K']) & (
linha_calc['L'] == linha_opf['L']) & (
linha_calc['M'] == linha_opf['M']) & (
linha_opf['N'] == 'V') & (
linha_opf['O'] == 'OFV'):
calculos.at[x, 'P'] = 1
break
for x, linha_calc in calculos.iterrows():
for z, linha_opf in opf.iterrows():
if (linha_calc['Q'] == linha_opf['Q']) & (
linha_calc['R'] == linha_opf['R']) & (
linha_calc['S'] == linha_opf['S']) & (
linha_opf['T'] == 'C') & (
linha_opf['U'] == 'OFV'):
calculos.at[x, 'V'] = 1
break
for x, linha_calc in calculos.iterrows():
for z, linha_opf in opf.iterrows():
if (linha_calc['X'] == linha_opf['X']) & (
linha_calc['Y'] == linha_opf['Y']) & (
linha_calc['Z'] == linha_opf['Z']) & (
linha_opf['W'] == 'V') & (
linha_opf['W1'] == 'OFC'):
calculos.at[x, 'W2'] = 1
break
for x, linha_calc in calculos.iterrows():
for z, linha_opf in opf.iterrows():
if (linha_calc['AA'] == linha_opf['AA']) & (
linha_calc['BB'] == linha_opf['BB']) & (
linha_calc['CC'] == linha_opf['CC']) & (
linha_opf['DD'] == 'V') & (
linha_opf['EE'] == 'OFC'):
calculos.at[x, 'FF'] = 1
break
for x, linha_calc in calculos.iterrows():
for z, linha_opf in opf.iterrows():
if (linha_calc['A'] == linha_opf['A']) & (
linha_calc['B'] == linha_opf['B']) & (
linha_opf['C'] != 0):
calculos.at[x, 'D'] = 1
break
enter code here
CodePudding user response:
You could probably combine the loops, then set a bool value for each condition, replacing the breaks
.
cond1, cond2, ... = False, False, ...
for x, linha_calc in calculos.iterrows():
for z, linha_opf in opf.iterrows():
if (linha_calc['E'] == linha_opf['E']) & (
linha_calc['F'] == linha_opf['F']) & (
linha_calc['G'] == linha_opf['G']) & (
linha_opf['H'] == 'C') & (
linha_opf['I'] == 'OFC') & not cond1:
calculos.at[x, 'J'] = 1
cond1 = True
if (linha_calc['K'] == linha_opf['K']) & (
linha_calc['L'] == linha_opf['L']) & (
linha_calc['M'] == linha_opf['M']) & (
linha_opf['N'] == 'V') & (
linha_opf['O'] == 'OFV') & not cond2:
calculos.at[x, 'P'] = 1
cond2 = True
...
...
...
if cond1 and con2 and ... condX:
break
This will save you looping through the document for each case, X times.
CodePudding user response:
You can speed up for loop using zip rather than iterrows reference
- iterrrows has a lot of overhead which slows the looping
- we can also create a utility function to handle the different for loops
Code
import pandas as pd
# Utility function to handle the various for loops
def update(calculos, opf, input_columns, values, update_column):
'''
Uses zip to iterate over rows of calculos and opf
'''
# Mask of rows to change
mask = [False]*calculos.shape[0]
if isinstance(values, list):
n = len(values)
values = tuple(values) # to allow comparison to tuple later
else:
n = 1 # values is scalar
for x, calc in enumerate(zip(*(calculos[col] for col in input_columns))): # Iterate over rows of calculos
calc = calc[:-n] # drop last n elements (i.e. not used in comparison)
for opf_ in zip(*(opf[col] for col in input_columns)): # Iterate over rows of opf
# Compare calc and calc_ without last n elements and compare last n elements to values
if calc == opf_[:-n]:
if ((n > 1 and opf_[-n:] == values) or
(n == 1 and opf_[-1] != values)): # second condition for scalar value (e.g. not 0)
mask[x] = 1
break
calculos[update_column] = np.where(mask, 1, calculos[update_column]) # 1 where Mask is true, unchanged otherwise
Usage
import pandas as pd
import numpy as np
from random import randint, seed
import timeit
seed(1234)
N = 5 # number of rows
M = 100 # range of values i.e. 0 to M
# Generate data
data = {"E":[randint(0, M) for _ in range(N)],
"F":[randint(0, M) for _ in range(N)],
"G":[randint(0, M) for _ in range(N)],
"H":['C' if i%2==0 else 'X' for i in range(N)],
"I":['OFC' if i%2==0 else 'X' for i in range(N)],
"J":[randint(2, M) for _ in range(N)],
"K":[randint(0, M) for _ in range(N)],
"L":[randint(0, M) for _ in range(N)],
"M":[randint(0, M) for _ in range(N)],
"N":['C' if i%2==0 else 'X' for i in range(N)],
"O":['OFC' if i%2==0 else 'X' for i in range(N)],
"P":[randint(2, M) for _ in range(N)],
"Q":[randint(0, M) for _ in range(N)],
"R":[randint(0, M) for _ in range(N)],
"S":[randint(0, M) for _ in range(N)],
"T":['C' if i%2==0 else 'X' for i in range(N)],
"U":['OFC' if i%2==0 else 'X' for i in range(N)],
"V":[randint(2, M) for _ in range(N)],
"A":[randint(0, M) for _ in range(N)],
"B":['C' if i%2==0 else 'X' for i in range(N)],
"C":['OFC' if i%2==0 else 'X' for i in range(N)],
"D":[randint(2, M) for _ in range(N)]}
calculos = pd.DataFrame(data)
opf = pd.DataFrame(data)
# Use update for posted for loops with parameters as follows.
update(calculos, opf, ['E', 'F', 'G', 'H', 'I'], ['C', 'OFC'], 'J')
update(calculos, opf, ['K', 'L', 'M', 'N', 'O'], ['V', 'OFV'], 'P')
update(calculos, opf, ['Q', 'R', 'S', 'T', 'U'], ['C', 'OFV'], 'V')
update(calculos, opf, ['A', 'B', 'C'], 0, 'D') # use scalar value of 0
Output
E F G H I J K L M N ... Q R S T U V A B C D
0 40 37 44 C OFC 8 11 67 24 C ... 33 25 36 C OFC 32 18 C OFC 1
1 10 95 70 X X 63 40 7 100 X ... 56 9 15 X X 15 44 X X 1
2 25 85 33 C OFC 66 31 96 35 C ... 58 42 63 C OFC 63 4 C OFC 1
3 34 13 95 X X 22 42 82 23 X ... 17 42 63 X X 34 21 X X 1
4 7 76 51 C OFC 9 63 17 65 C ... 11 15 24 C OFC 42 51 C OFC 1
5 rows × 22 columns
Performance
Summary: ~120X speed up (timing first for loop only) Timing:
- Posted Method ~ 2 minutes
- Revised method using update function: ~1 second
Data Generation
N = 1000 # Number of rows
M = 100
seed(1234) # random seed to make results repeatable
data = {"E":[randint(0, M) for _ in range(N)],
"F":[randint(0, M) for _ in range(N)],
"G":[randint(0, M) for _ in range(N)],
"H":['C' if i%2==0 else 'X' for i in range(N)],
"I":['OFC' if i%2==0 else 'X' for i in range(N)],
"J":[randint(2, M) for _ in range(N)]}
calculos = pd.DataFrame(data)
opf = pd.DataFrame(data)
Using Update
%timeit update(calculos, opf, ['E', 'F', 'G', 'H', 'I'], ['C', 'OFC'], 'J')
Result: 1.09 s ± 29.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Posted Method
%%timeit
for x, linha_calc in calculos.iterrows():
for z, linha_opf in opf.iterrows():
if (linha_calc['E'] == linha_opf['E']) & (
linha_calc['F'] == linha_opf['F']) & (
linha_calc['G'] == linha_opf['G']) & (
linha_opf['H'] == 'C') & (
linha_opf['I'] == 'OFC'):
calculos.at[x, 'J'] = 1
break
Result: 2min 51s ± 6.96 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
rows
for rows in linha_calc_rows: