Creating new columns using data from a nested column object-CodePudding

It's kinda hard to show the dataframe in here, So I uploaded an example csv on the drive.

Here's the link, https://docs.google.com/spreadsheets/d/19m5hAo3wk9VbBNOyPQ96HhsIrjX20aT8NFwDZkkJRAE/edit?usp=sharing

I want to create n number of bids and asks columns in the dataframe using the data from a dictionary that's inside the depth column of the csv file.

I tried doing that in a for loop...

Here's my code,

total_depth_in_columns = 100

def restructure_data(df):
    
    # Created those new columns

    for i in range (total_depth_in_columns):
        df['bid_price_' str(i 1)] = 0
        df['bid_volume_' str(i 1)] = 0
        df['ask_price_' str(i 1)] = 0
        df['ask_volume_' str(i 1)] = 0


        
    for i in range(len(df)):
        depth = df['depth'].iloc[i]

               
        # Added data to those columns
            
        for j in range (total_depth_in_columns):
            
            df['bid_price_' str(j 1)].iloc[i] = depth['bids'][j][0]
            df['bid_volume_' str(j 1)].iloc[i] = depth['bids'][j][1]
            df['ask_price_' str(j 1)].iloc[i] = depth['asks'][j][0]
            df['ask_volume_' str(j 1)].iloc[i] = depth['asks'][j][1]
          
    return df

It works fine when I have around 100 data. But when running this code on 10000 data, it takes hours to process it. Is there any faster way to do these kind of operations using pandas?

CodePudding user response：

It's slow to loop through a dataframe.

Suggested Code

def restructure_data(df):
    # Depth column is string dictionary.
    #   - Convert to dictionary using ast.literal
    #   - Concatenate list of bids and asks
    df['bids-asks'] = df["depth"].astype('str').apply(lambda x: p['bids']   p['asks'] if (p:=ast.literal_eval(x)) else x)

    # Names of new columns
    columns = [f"{name}{j}" for j in range(1, (len(df['bids-asks'][0])//4) 1) for name in ["bid_price", "bid_volume", "ask_price", "ask_volume"]]

    # Split bids-asks column into columns
    split_df = pd.DataFrame(df['bids-asks'].tolist(), columns=columns)

    # concat df and split_df
    df = pd.concat([df, split_df], axis=1)

    # Drop bids-asks column
    df.drop('bids-asks', axis=1, inplace=True)
    
    return df

Usage

# Read CSV file into Dataframe
df = pd.read_csv('brr.csv', index_col=[0])
df = restructure_data(df)
# Display header
print(df.head())

Output

Close   depth   bid_price1  bid_volume1 ask_price1  ask_volume1 bid_price2  bid_volume2 ask_price2  ask_volume2 ... ask_price498    ask_volume498   bid_price499    bid_volume499   ask_price499    ask_volume499   bid_price500    bid_volume500   ask_price500    ask_volume500
0   7.245   {'lastUpdateId': 1787277998027, 'E': 165985484...   [7.2450, 652]   [7.2440, 545]   [7.2430, 2643]  [7.2420, 1308]  [7.2410, 2201]  [7.2400, 4207]  [7.2390, 4878]  [7.2380, 5604]  ... [8.2930, 12]    [8.2940, 619]   [8.2950, 16]    [8.2960, 33]    [8.2970, 10]    [8.2980, 13]    [8.2990, 2] [8.3000, 1884]  [8.3010, 3] [8.3020, 1]
1   7.246   {'lastUpdateId': 1787278048764, 'E': 165985485...   [7.2450, 842]   [7.2440, 1121]  [7.2430, 1825]  [7.2420, 2728]  [7.2410, 3047]  [7.2400, 5431]  [7.2390, 4790]  [7.2380, 5526]  ... [8.2930, 12]    [8.2940, 619]   [8.2950, 16]    [8.2960, 33]    [8.2970, 10]    [8.2980, 13]    [8.2990, 2] [8.3000, 1884]  [8.3010, 3] [8.3020, 1]
2   7.247   {'lastUpdateId': 1787278510622, 'E': 165985487...   [7.2460, 861]   [7.2450, 1298]  [7.2440, 1636]  [7.2430, 3669]  [7.2420, 4127]  [7.2410, 3330]  [7.2400, 3865]  [7.2390, 4951]  ... [8.2940, 619]   [8.2950, 16]    [8.2960, 33]    [8.2970, 10]    [8.2980, 13]    [8.2990, 2] [8.3000, 1884]  [8.3010, 3] [8.3020, 1] [8.3030, 9]
3   7.246   {'lastUpdateId': 1787278719415, 'E': 165985488...   [7.2460, 809]   [7.2450, 1872]  [7.2440, 1230]  [7.2430, 3774]  [7.2420, 3498]  [7.2410, 3744]  [7.2400, 3906]  [7.2390, 5100]  ... [8.2940, 619]   [8.2950, 16]    [8.2960, 33]    [8.2970, 10]    [8.2980, 13]    [8.2990, 2] [8.3000, 1884]  [8.3010, 3] [8.3020, 1] [8.3030, 9]
4   7.247   {'lastUpdateId': 1787278800142, 'E': 165985488...   [7.2470, 840]   [7.2460, 2145]  [7.2450, 1797]  [7.2440, 2720]  [7.2430, 6232]  [7.2420, 3448]  [7.2410, 2882]  [7.2400, 3198]  ... [8.2950, 16]    [8.2960, 33]    [8.2970, 10]    [8.2980, 13]    [8.2990, 2] [8.3000, 1884]  [8.3010, 3] [8.3020, 1] [8.3030, 9] [8.3040, 7]
5 rows × 2002 columns