This code works but is too slow, any ideas for improvement would be appreciated. Numpy arrays?, other?
Estatus=Vigentes[['UUID','Estatus']]
MOV_10 = MOV_09.copy()
MOV_10['Estatus'] = ""
for i in range(0, len(MOV_10[['UUID']])):
u = MOV_10.loc[i][0]
w = MOV_10.loc[i][1]
tempu = Estatus.loc[Estatus['UUID'] == u]
tempw = Estatus.loc[Estatus['UUID'] == w]
try:
if w == 'N/A':
MOV_10.loc[i, 'Estatus'] = int(tempu.iloc[0, 1])
else:
MOV_10.loc[i, 'Estatus'] = int(tempu.iloc[0, 1]) \
* int(tempw.iloc[0, 1])
except IndexError:
MOV_10.loc[i, 'Estatus'] = 0
#Estatus table, Mov_09 Table, Mov_10 Table, expected result
UUID | Estatus | |
---|---|---|
0 | a | 0 |
1 | b | 1 |
2 | x | 1 |
3 | y | 1 |
UUID | UIID_2 | estatus | |
---|---|---|---|
0 | a | x | |
1 | b | y |
UUID | UIID_2 | estatus | |
---|---|---|---|
0 | a | x | 0*1 |
1 | b | y | 1*1 |
CodePudding user response:
You should be able to do much better than your existing method. I assume your existing data structure is a pandas dataframe. If so, it's very straightforward swap to use vector operations for a lot of the calculations. This approach should also scale much better than your approach.
uuid_index = Estatus.set_index('UUID').rename(columns={'Estatus': 'val'})
out = pd.DataFrame({ 'UUID': MOV_09.UUID.values, 'UIID2': MOV_09.UIID2.values }).join(uuid_index, on=['UUID']).join(uuid_index, on=['UIID2'], rsuffix='_uiid2')
out['Estatus'] = 0
out.loc[out.val_uiid2 != 0, 'Estatus'] = out.val / out.val_uiid2
using this approach gives the following performance improvement for a dataset of with 1000 entries in the MOV_09 table:
Method | Time | Relative |
---|---|---|
Original | 8.066573400050402 | 193.82507958031653 |
Swapping to joining dataframes | 0.04161780001595616 | 1.0 |
I have attached the test code below:
import pandas as pd
import numpy as np
import random
import timeit
# generate test data
random.seed(1)
iterations = 10
uuid_count = 1000
mov_count = 1000
uuid_values = [(hex(i), random.random(), random.randint(0, 1)) for i in range(uuid_count)]
uuid_values.sort(key=lambda x: x[1])
def rand_uuid():
return uuid_values[random.randint(0, uuid_count - 1)][0]
mov_values = set()
for i in range(mov_count):
uuid = rand_uuid()
while not ((uiid2 := rand_uuid()) and not (pair := (uuid, uiid2)) in mov_values): pass
mov_values.add(pair)
Estatus = pd.DataFrame({
'UUID': [v[0] for v in uuid_values],
'Estatus': [v[2] for v in uuid_values],
})
MOV_09 = pd.DataFrame({
'UUID': [t[0] for t in mov_values],
'UIID2': [t[1] for t in mov_values],
})
# base method
def method0():
MOV_10 = MOV_09.copy()
MOV_10['Estatus'] = ""
for i in range(0, len(MOV_10[['UUID']])):
u = MOV_10.loc[i][0]
w = MOV_10.loc[i][1]
tempu = Estatus.loc[Estatus['UUID'] == u]
tempw = Estatus.loc[Estatus['UUID'] == w]
try:
if w == 'N/A':
MOV_10.loc[i, 'Estatus'] = int(tempu.iloc[0, 1])
else:
MOV_10.loc[i, 'Estatus'] = int(tempu.iloc[0, 1]) \
* int(tempw.iloc[0, 1])
except IndexError:
MOV_10.loc[i, 'Estatus'] = 0
return MOV_10
# updated method
def method1():
uuid_index = Estatus.set_index('UUID').rename(columns={'Estatus': 'val'})
out = pd.DataFrame({ 'UUID': MOV_09.UUID.values, 'UIID2': MOV_09.UIID2.values }).join(uuid_index, on=['UUID']).join(uuid_index, on=['UIID2'], rsuffix='_uiid2')
out['Estatus'] = 0
out.loc[out.val_uiid2 != 0, 'Estatus'] = out.val / out.val_uiid2
return out[['UUID', 'UIID2', 'Estatus']]
m0 = method0()
m0['Estatus'] = m0.Estatus.astype(np.int64)
pd.testing.assert_frame_equal(m0, method1())
t0 = timeit.timeit(lambda: method0(), number=iterations)
t1 = timeit.timeit(lambda: method1(), number=iterations)
tmin = min((t0, t1))
print(f'| Method | Time | Relative |')
print(f'|------------------ |----------------------|')
print(f'| Original | {t0} | {t0 / tmin} |')
print(f'| Swap to joining dataframes | {t1} | {t1 / tmin} |')