How to optimize the following algorithm that iterates over a DataFrame of a few million rows? - CodePudding

I have the following algorithm that iterates over a DataFrame with a few million rows. It takes a long time to finish. Do you have any suggestions for speeding it up?

def k_nn_averaging(df: pd.DataFrame, k: int = 15, use_abs_value: bool = False,
                   columns: "list[str] | None" = None) -> pd.DataFrame:
    """Perturb every row using one randomly chosen nearby row.

    For each row position ``i`` a neighbour ``j`` is picked with ``choice``
    from the positions ``i - k .. i + k`` (clipped to the frame, ``i`` itself
    excluded) and a factor ``f`` is drawn with ``uniform(0, 1)``. The selected
    columns of the row then become ``current + f * (current - neighbour)``,
    or ``current + f * abs(current - neighbour)`` when ``use_abs_value`` is
    true. The same ``j`` and ``f`` are used for all selected columns of a row.

    Args:
        df: Input frame. It is not modified.
        k: Number of rows on each side that count as neighbours.
        use_abs_value: Use the absolute difference instead of the signed one.
        columns: Columns to modify. Defaults to ``helper.modifiable_columns``.

    Returns:
        A copy of ``df`` in which the selected columns are floats holding the
        perturbed values. Rows, index and all other columns are unchanged.

    Raises:
        ValueError: If ``k`` is smaller than 1 (no row could have a neighbour).
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    if columns is None:
        columns = helper.modifiable_columns
    columns = list(columns)

    # Work on the copy only, so the caller's frame keeps its original dtypes.
    df_averaged = df.copy()
    df_averaged[columns] = df_averaged[columns].astype(float)

    # A plain array makes every access positional; mixing iloc reads with
    # label-based .loc writes would append rows on a non-default index.
    values = df_averaged[columns].to_numpy()
    result = values.copy()
    num_rows = values.shape[0]

    for i in range(num_rows):
        first = max(i - k, 0)
        last = min(i + k, num_rows - 1)  # inclusive, so the window is symmetric
        neighbours = [j for j in range(first, last + 1) if j != i]
        if not neighbours:
            # Only possible for a single-row frame: leave the row as it is.
            continue
        neighbour_values = values[choice(neighbours)]
        factor = uniform(0, 1)
        difference = values[i] - neighbour_values
        if use_abs_value:
            difference = abs(difference)
        result[i] = values[i] + factor * difference

    df_averaged[columns] = result
    return df_averaged

CodePudding user response:

The first thing to try is vectorizing the loop. Here is a modified version that replaces the per-row Python loop with NumPy operations:

import pandas as pd
import numpy as np

def k_nn_averaging(df: pd.DataFrame, k: int = 15, use_abs_value: bool = False,
                   columns: "list[str] | None" = None) -> pd.DataFrame:
    """Perturb every row using one randomly chosen nearby row, without a row loop.

    For each row position ``i`` one neighbour ``j`` is drawn uniformly from
    the positions ``i - k .. i + k`` (clipped to the frame, ``i`` itself
    excluded) together with one factor ``f`` from ``np.random.uniform``. The
    selected columns of the row become ``current + f * (current - neighbour)``,
    or ``current + f * abs(current - neighbour)`` when ``use_abs_value`` is
    true. All draws are made at once with NumPy, so the work and the extra
    memory grow linearly with the number of rows and do not depend on ``k``.

    Args:
        df: Input frame. It is not modified.
        k: Number of rows on each side that count as neighbours.
        use_abs_value: Use the absolute difference instead of the signed one.
        columns: Columns to modify. Defaults to ``helper.modifiable_columns``.

    Returns:
        A copy of ``df`` in which the selected columns are floats holding the
        perturbed values. Rows, index and all other columns are unchanged.

    Raises:
        ValueError: If ``k`` is smaller than 1 (no row could have a neighbour).
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    if columns is None:
        columns = helper.modifiable_columns
    modifiable_columns = list(columns)

    df_averaged = df.copy()
    df_averaged[modifiable_columns] = df_averaged[modifiable_columns].astype(float)
    num_rows = df_averaged.shape[0]
    if num_rows < 2:
        # No row has a neighbour, so there is nothing to average against.
        return df_averaged

    current_values = df_averaged[modifiable_columns].to_numpy()
    rows = np.arange(num_rows)

    # Inclusive window bounds per row, clipped to the frame.
    first = np.maximum(rows - k, 0)
    last = np.minimum(rows + k, num_rows - 1)
    # Window size minus the row itself; at least 1 because num_rows >= 2.
    neighbour_counts = last - first

    # Draw an offset in [0, neighbour_counts) for every row. The clamp only
    # guards against floating-point rounding at the upper edge.
    offsets = np.floor(np.random.uniform(size=num_rows) * neighbour_counts).astype(np.intp)
    offsets = np.minimum(offsets, neighbour_counts - 1)

    # Map the offset onto the window while stepping over the row itself.
    candidates = first + offsets
    selected_neighbour_indices = np.where(candidates >= rows, candidates + 1, candidates)

    # One factor per row, broadcast across the modified columns.
    factors = np.random.uniform(size=(num_rows, 1))

    differences = current_values - current_values[selected_neighbour_indices]
    if use_abs_value:
        differences = np.abs(differences)

    df_averaged[modifiable_columns] = current_values + factors * differences
    return df_averaged

I think this will be much faster than the original script.