Home > Mobile >  From multiples CSV to Dataframe columns with calculs
From multiples CSV to Dataframe columns with calculs

Time:10-02

I got 10 csvfiles like this :

Text

I want to add 10 columns in my dataframe with a vwap calculation. I tried to create the columns and then to concatenate it into the dataframe but it doesn't work at all. I tried a lot of things, the main problem is that i can't create new columns with calculated rows :

import pandas as pd
import os
import glob
from IPython.display import display, HTML
import csv
# use glob to get all the csv files 
# in the folder

path = os.getcwd()
csv_files = glob.glob(os.path.join("*.csv"))

""" 
#To change the name of every columns
liste1 = []
header_list = []
for f in csv_files:
    liste1.append(f)
header_list = [a.strip(".csv") for a in liste1]
 """
def add(f):
    df = pd.read_csv(f, header=0)
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    df = df.groupby(pd.Grouper(key = "timestamp", freq = "h")).agg("mean").reset_index()
    price = df["price"]
    amount = df["amount"]
    return df.assign(vwap  = (price * amount).cumsum() / amount.cumsum())

for f in csv_files:
    df = pd.read_csv(f, header=0)
    df2 = pd.concat(add(f))
    df2.to_csv(r"C:\Users\vion1\Ele\Engie\Sorbonne\resultat\resultat_projet_4.csv", encoding='utf-8', index=False, mode = "a")

Thanks for your help

The traceback :

TypeError                                 
Traceback (most recent call last) ~\AppData\Local\Temp/ipykernel_16732/557098648.py in <module>
     31 for f in csv_files:
     32     df = pd.read_csv(f, header=0)
---> 33     df2 = pd.concat(add(f))
     34     df2.to_csv(r"C:\Users\vion1\Ele\Engie\Sorbonne\resultat\resultat_projet_4.csv", encoding='utf-8', index=False, mode = "a")
     35 

~\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
    309                     stacklevel=stacklevel,
    310                 )
--> 311             return func(*args, **kwargs)
    312 
    313         return wrapper

~\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\reshape\concat.py in concat(objs, axis, join, ignore_index, keys, levels, names, verify_integrity, sort, copy)
    292     ValueError: Indexes have overlapping values: ['a']
    293     """
--> 294     op = _Concatenator(
    295         objs,
    296         axis=axis,

~\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\reshape\concat.py in __init__(self, objs, axis, join, keys, levels, names, ignore_index, verify_integrity, copy, sort)
    327     ):
    328         if isinstance(objs, (ABCSeries, ABCDataFrame, str)):
--> 329             raise TypeError(
    330                 "first argument must be an iterable of pandas "
    331                 f'objects, you passed an object of type "{type(objs).__name__}"'

TypeError: first argument must be an iterable of pandas objects, you passed an object of type "DataFrame"

CodePudding user response:

If need only aggregate values in ouput:

def add(df):
    #Removed read_csv 
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    df = df.groupby(pd.Grouper(key = "timestamp", freq = "h")).agg("mean").reset_index()
    price = df["price"]
    amount = df["amount"]
    return (price * amount).cumsum() / amount.cumsum()

out = []
for f in csv_files:
    df = pd.read_csv(f, header=0)
    #added aggregate DataFrame with new column to list of DataFrames
    out.append(add(df))
    
#joined all dfs together
df2 = pd.concat(out, ignore_index=True, axis=1)  
#removed append mode
df2.to_csv(r"C:\Users\vion1\Ele\Engie\Sorbonne\resultat\resultat_projet_4.csv", 
             encoding='utf-8')
  • Related