I got 10 csvfiles like this :
I want to add 10 columns in my dataframe with a vwap calculation. I tried to create the columns and then to concatenate it into the dataframe but it doesn't work at all. I tried a lot of things, the main problem is that i can't create new columns with calculated rows :
import pandas as pd
import os
import glob
from IPython.display import display, HTML
import csv
# use glob to get all the csv files
# in the folder
path = os.getcwd()
csv_files = glob.glob(os.path.join("*.csv"))
"""
#To change the name of every columns
liste1 = []
header_list = []
for f in csv_files:
liste1.append(f)
header_list = [a.strip(".csv") for a in liste1]
"""
def add(f):
df = pd.read_csv(f, header=0)
df["timestamp"] = pd.to_datetime(df["timestamp"])
df = df.groupby(pd.Grouper(key = "timestamp", freq = "h")).agg("mean").reset_index()
price = df["price"]
amount = df["amount"]
return df.assign(vwap = (price * amount).cumsum() / amount.cumsum())
for f in csv_files:
df = pd.read_csv(f, header=0)
df2 = pd.concat(add(f))
df2.to_csv(r"C:\Users\vion1\Ele\Engie\Sorbonne\resultat\resultat_projet_4.csv", encoding='utf-8', index=False, mode = "a")
Thanks for your help
The traceback :
TypeError
Traceback (most recent call last) ~\AppData\Local\Temp/ipykernel_16732/557098648.py in <module>
31 for f in csv_files:
32 df = pd.read_csv(f, header=0)
---> 33 df2 = pd.concat(add(f))
34 df2.to_csv(r"C:\Users\vion1\Ele\Engie\Sorbonne\resultat\resultat_projet_4.csv", encoding='utf-8', index=False, mode = "a")
35
~\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
312
313 return wrapper
~\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\reshape\concat.py in concat(objs, axis, join, ignore_index, keys, levels, names, verify_integrity, sort, copy)
292 ValueError: Indexes have overlapping values: ['a']
293 """
--> 294 op = _Concatenator(
295 objs,
296 axis=axis,
~\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\reshape\concat.py in __init__(self, objs, axis, join, keys, levels, names, ignore_index, verify_integrity, copy, sort)
327 ):
328 if isinstance(objs, (ABCSeries, ABCDataFrame, str)):
--> 329 raise TypeError(
330 "first argument must be an iterable of pandas "
331 f'objects, you passed an object of type "{type(objs).__name__}"'
TypeError: first argument must be an iterable of pandas objects, you passed an object of type "DataFrame"
CodePudding user response:
If need only aggregate values in ouput:
def add(df):
#Removed read_csv
df["timestamp"] = pd.to_datetime(df["timestamp"])
df = df.groupby(pd.Grouper(key = "timestamp", freq = "h")).agg("mean").reset_index()
price = df["price"]
amount = df["amount"]
return (price * amount).cumsum() / amount.cumsum()
out = []
for f in csv_files:
df = pd.read_csv(f, header=0)
#added aggregate DataFrame with new column to list of DataFrames
out.append(add(df))
#joined all dfs together
df2 = pd.concat(out, ignore_index=True, axis=1)
#removed append mode
df2.to_csv(r"C:\Users\vion1\Ele\Engie\Sorbonne\resultat\resultat_projet_4.csv",
encoding='utf-8')