Python pandas iterating rows with complicated calculation faster way of current code-CodePudding

I've implemented a kind of object stability calculator in pandas, but its performance is horrible. Can someone help me speed it up, please?

def calculate_stability(ind, df, sidx, max_k):
    """Return the stability score of object ``ind`` over its first ``max_k`` neighbours.

    Column ``ind`` of ``sidx`` is read as an ordered list of row labels of
    ``df`` (presumably an argsort of a distance matrix -- confirm against the
    caller). ``ind`` itself is dropped from that list, then the first
    ``max_k`` remaining entries are walked in order while counting how many
    share the ``"Class"`` value of ``ind``. The largest prefix length in which
    strictly more than half of the entries share that class is recorded.

    Args:
        ind: Row label of the object in ``df`` and column position in ``sidx``.
        df: DataFrame with a ``"Class"`` column.
        sidx: 2-D NumPy array whose column ``ind`` holds neighbour labels.
        max_k: Number of neighbours to inspect.

    Returns:
        The recorded prefix length divided by ``max_k`` (``0.0`` when no
        prefix has a same-class majority).

    Raises:
        IndexError: If fewer than ``max_k`` neighbours remain after removing
            ``ind``.
        ZeroDivisionError: If ``max_k`` is 0.
    """
    neighbours = sidx[:, ind]
    neighbours = neighbours[neighbours != ind]

    # The object's own class does not change inside the loop; look it up once.
    own_class = df.at[ind, "Class"]

    d = 0  # running count of same-class neighbours seen so far
    last_crtit_obj_count = 0

    for j in range(max_k):
        if own_class == df.at[neighbours[j], "Class"]:
            d = d + 1
        # j + 1 neighbours have been inspected at this point.
        if d / (j + 1) > 1 / 2:
            last_crtit_obj_count = j + 1

    print(f'\t Object {ind} = {last_crtit_obj_count / max_k}')
    return last_crtit_obj_count / max_k

df.iloc was very slow. That's why I changed to df.at.

Code is here

I need a vectorized version of the loop.

CodePudding user response：

Here is the version without the loop:

def calculate_stability(ind, df, sidx, max_k):
    """Return the stability score of object ``ind`` without a Python-level loop.

    Column ``ind`` of ``sidx`` is read as an ordered list of row labels of
    ``df`` (presumably an argsort of a distance matrix -- confirm against the
    caller). ``ind`` itself is dropped and at most the first ``max_k``
    remaining labels are kept. The score is the largest prefix length in which
    strictly more than half of the neighbours share the ``"Class"`` value of
    ``ind``, divided by ``max_k``.

    Args:
        ind: Row label of the object in ``df`` and column position in ``sidx``.
        df: DataFrame with a ``"Class"`` column.
        sidx: 2-D NumPy array whose column ``ind`` holds neighbour labels.
        max_k: Maximum number of neighbours to inspect.

    Returns:
        The score as a float; ``0.0`` when no prefix has a same-class majority
        (including when no neighbours are selected).
    """
    neighbours = sidx[:, ind]
    neighbours = neighbours[neighbours != ind][:max_k]

    classes = df["Class"]
    # Explicit label-based lookups; converting to NumPy keeps the arithmetic
    # below purely positional, with no Series index alignment involved.
    same_class = (classes.loc[neighbours] == classes.at[ind]).to_numpy()

    # `d[i]` is the number of same-class neighbours among the first i + 1:
    d = same_class.cumsum()
    # `j` holds the prefix lengths 1..len(d), i.e. the loop's `range` + 1:
    j = np.arange(1, len(d) + 1)
    # prefix lengths at which same-class neighbours are a strict majority:
    crtit_objs = j[d / j > 1 / 2]
    # the last such length is `last_crtit_obj_count`; return a float in the
    # empty case too, so the result type does not depend on the data
    result = crtit_objs[-1] / max_k if crtit_objs.size else 0.0
    print(f"\t Object {ind} = {result}")
    return result