This is my sample data:
Inventory is tracked per Product.
Customer  Product  Quantity  Inventory
1         A        100       800
2         A        1000      800
3         A        700       800
4         A        50        800
5         B        20        100
6         B        50        100
7         B        40        100
8         B        30        100
Code required to create this data:
import pandas as pd

data = {
    'Customer': [1, 2, 3, 4, 5, 6, 7, 8],
    'Product': ['A', 'A', 'A', 'A', 'B', 'B', 'B', 'B'],
    'Quantity': [100, 1000, 700, 50, 20, 50, 40, 30],
    'Inventory': [800, 800, 800, 800, 100, 100, 100, 100]
}
df = pd.DataFrame(data)
I need to create a new column called Available to Promise. For each row it is calculated by subtracting the order quantity from the previous available-to-promise value, and the subtraction only happens if the previously available amount is greater than or equal to the order quantity; otherwise the previous value is kept unchanged.
Here is my expected output:
Customer  Product  Quantity  Inventory  Available to Promise
1         A        100       800        700  (800 - 100 = 700)
2         A        1000      800        700  (1000 greater than 700, so keep the same value)
3         A        700       800        0    (700 - 700 = 0)
4         A        50        800        0    (50 greater than 0)
5         B        20        100        80   (100 - 20 = 80)
6         B        50        100        30   (80 - 50 = 30)
7         B        40        100        30   (40 greater than 30)
8         B        30        100        0    (30 - 30 = 0)
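In other words, per product the calculation is a running counter, something like this (a minimal sketch of the rule, not my actual code):
def atp_for_product(quantities, inventory):
    # walk the orders in row order, keeping a running available-to-promise value
    atp = inventory
    out = []
    for q in quantities:
        if q <= atp:      # only fulfil the order if enough is available
            atp -= q
        out.append(atp)   # a too-large order leaves the value unchanged
    return out

atp_for_product([100, 1000, 700, 50], 800)  # -> [700, 700, 0, 0]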
I have achieved this using a for loop with iterrows in Python pandas.
This is my code:
import numpy as np

master_df = df[['Product', 'Inventory']].drop_duplicates()
master_df['free'] = df['Inventory']
df['available_to_promise'] = np.nan

for i, row in df.iterrows():
    if i % 100 == 0:  # print progress every 100 rows
        print(i)
    try:
        # current free stock for this row's product
        available = (
            master_df[row['Product'] == master_df['Product']]['free']
            .reset_index(drop=True)
            .iloc[0]
        )
        if available - row['Quantity'] >= 0:
            df.at[i, 'available_to_promise'] = available - row['Quantity']
            a = (
                master_df.loc[row['Product'] == master_df['Product']]
                .reset_index()['index']
                .iloc[0]
            )
            master_df.at[a, 'free'] = available - row['Quantity']
        else:
            df.at[i, 'available_to_promise'] = available
    except Exception as e:
        print(i)
        print(e)
        print(df.columns)

df = df.fillna(0)
Because for loops are so slow in Python, this loop takes so much time to execute on large inputs that my AWS Lambda function is failing.
Can you help me optimize this code by introducing a better alternative to this loop, one that can execute in a few seconds?
CodePudding user response:
I have a bit of a solution. It is not fully vectorized, because it still uses a loop, but it has the advantage of being simpler and easy to optimize.
import pandas as pd
import numpy as np

def func_no_jit(quant, inv):
    # walk the orders of one product, tracking the remaining stock
    stock = inv[0]
    n = len(quant)
    out = np.zeros((n,), dtype=np.int64)
    for i in range(n):
        if stock > 0 and quant[i] <= stock:
            stock -= quant[i]
        out[i] = stock  # a too-large order leaves the stock unchanged
    return out

# sort=False keeps the groups in row order, so the exploded values line up
# with the original rows (the rows are already contiguous per product)
res = (
    df.groupby('Product', sort=False)
    .apply(lambda x: func_no_jit(x['Quantity'].values, x['Inventory'].values))
    .explode()
)
df["Promise"] = res.to_numpy()
A possible solution is to use numba. When I used it, it cut the processing time roughly in half for a dataframe of 100_000 elements; it has no real effect on small dataframes, though.
from numba import njit

@njit
def func(quant, inv):
    # same logic as func_no_jit, compiled by Numba
    stock = inv[0]
    n = len(quant)
    out = np.zeros((n,), dtype=np.int64)
    for i in range(n):
        if stock > 0 and quant[i] <= stock:
            stock -= quant[i]
        out[i] = stock
    return out
See the results here:
In [11]: big_df
Out[11]:
Customer Product Quantity Inventory
0 0 I 328 282
1 1 A 668 874
2 2 H 51 496
3 3 A 561 526
4 4 H 143 421
... ... ... ... ...
99995 99995 D 43 392
99996 99996 F 162 540
99997 99997 C 565 902
99998 99998 H 633 936
99999 99999 A 731 810
[100000 rows x 4 columns]
In [12]: %timeit big_df.groupby('Product').apply(lambda x: func_no_jit(x["Quantity"].values, x["Inventory"].values)).explode()
33.3 ms ± 102 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

In [13]: %timeit big_df.groupby('Product').apply(lambda x: func(x["Quantity"].values, x["Inventory"].values)).explode()
12.5 ms ± 21.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
CodePudding user response:
You're doing a lot of manipulation of the two dataframes you have, and I think that might be the cause of the speed issue.
I would use a dict to keep track of the available inventory.
I'm actually curious what the speed comparison would be if you applied this to a large dataframe...
import pandas as pd

data = {
    'Customer': [1, 2, 3, 4, 5, 6, 7, 8],
    'Product': ['A', 'A', 'A', 'A', 'B', 'B', 'B', 'B'],
    'Quantity': [100, 1000, 700, 50, 20, 50, 40, 30],
    'Inventory': [800, 800, 800, 800, 100, 100, 100, 100]
}
df = pd.DataFrame(data)
df["Available to Promise"] = 0

# track the remaining availability per product in a dict;
# None marks a product that has not been seen yet
available = dict.fromkeys(set(df.Product))

for idx, row in df.iterrows():
    if available[row.Product] is None:
        # first row for this product: start from the full inventory
        available[row.Product] = row.Inventory
    if row.Quantity <= available[row.Product]:
        # enough stock, so fulfil the order
        available[row.Product] -= row.Quantity
    # a too-large order leaves the available amount unchanged
    df.at[idx, "Available to Promise"] = available[row.Product]

print(df)
output
Customer Product Quantity Inventory Available to Promise
0 1 A 100 800 700
1 2 A 1000 800 700
2 3 A 700 800 0
3 4 A 50 800 0
4 5 B 20 100 80
5 6 B 50 100 30
6 7 B 40 100 30
7 8 B 30 100 0
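For anyone who wants to run that speed comparison, here is one way to build a larger test frame (a sketch; the product list, sizes, and value ranges are arbitrary assumptions, and the rows are sorted so each product's orders stay contiguous, which the groupby/Numba approaches in the other answers assume):
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n = 100_000
products = list("ABCDEFGHIJ")

big_df = pd.DataFrame({
    "Customer": np.arange(n),
    "Product": rng.choice(products, size=n),
    "Quantity": rng.integers(1, 1000, size=n),
})
# one fixed inventory level per product
inventory = dict(zip(products, rng.integers(100, 1000, size=len(products))))
big_df["Inventory"] = big_df["Product"].map(inventory)
# keep each product's rows contiguous
big_df = big_df.sort_values("Product", kind="stable", ignore_index=True)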
CodePudding user response:
I am not sure it is simple to write vectorized, performant code that replicates the desired logic.
However, it is relatively simple to write it in a way that is easy to accelerate with Numba.
Firstly, let us write your code as a (pure) function of the dataframe, returning the values to eventually put in df["Available to Promise"].
Afterwards, it is easy to incorporate its result into the original dataframe with:
df["Available to Promise"] = calc_avail_OP(df)
The OP's code, save for exception handling and printing (and incorporation into the original dataframe, as just discussed), is equivalent to the following:
import numpy as np
import pandas as pd

def calc_avail_OP(df):
    temp_df = df[["Product", "Inventory"]].drop_duplicates()
    temp_df["free"] = df["Inventory"]
    result = np.zeros(len(df), dtype=df["Inventory"].dtype)
    for i, row in df.iterrows():
        available = (
            temp_df[row["Product"] == temp_df["Product"]]["free"]
            .reset_index(drop=True)
            .iloc[0]
        )
        if available - row["Quantity"] >= 0:
            result[i] = available - row["Quantity"]
            a = (
                temp_df.loc[row["Product"] == temp_df["Product"]]
                .reset_index()["index"]
                .iloc[0]
            )
            temp_df.at[a, "free"] = available - row["Quantity"]
        else:
            result[i] = available
    return result
Now, the same can be achieved with a few scalar temporary variables on native NumPy objects, and this can be accelerated with Numba:
import numba as nb

@nb.njit
def _calc_avail_nb(products, quantities, stocks):
    n = len(products)
    avails = np.empty(n, dtype=stocks.dtype)
    last_product = products[0]
    avail = stocks[0]
    for i in range(n):
        if products[i] != last_product:
            # new product block: reset the running availability
            last_product = products[i]
            avail = stocks[i]
        qty = quantities[i]
        if avail >= qty:
            avail -= qty
        avails[i] = avail
    return avails

def calc_avail_nb(df):
    return _calc_avail_nb(
        df["Product"].to_numpy(dtype="U"),
        df["Quantity"].to_numpy(),
        df["Inventory"].to_numpy(),
    )
On the test dataframe, the two get to the same result:
data = {
    'Customer': [1, 2, 3, 4, 5, 6, 7, 8],
    'Product': ['A', 'A', 'A', 'A', 'B', 'B', 'B', 'B'],
    'Quantity': [100, 1000, 700, 50, 20, 50, 40, 30],
    'Inventory': [800, 800, 800, 800, 100, 100, 100, 100]
}
df = pd.DataFrame(data)
res_OP = calc_avail_OP(df)
res_nb = calc_avail_nb(df)
print(np.allclose(res_OP, res_nb))
# True
but with some 200x speed improvement:
%timeit -n 16 -r 16 calc_avail_OP(df)
# 16 loops, best of 16: 11.7 ms per loop
%timeit -n 16 -r 16 calc_avail_nb(df)
# 16 loops, best of 16: 56.8 µs per loop
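One practical caveat when timing Numba-compiled functions: with a bare @nb.njit (no explicit signature) the first call triggers JIT compilation, so the function should be called once before benchmarking. The %timeit runs above already benefit from this, because the earlier calc_avail_nb(df) call compiled the function; a fresh session would need an explicit warm-up:
# warm up the JIT once so compilation time is not measured
_ = calc_avail_nb(df)
%timeit -n 16 -r 16 calc_avail_nb(df)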