I'm trying to merge the duplicates in the resulting .csv but unable to get the desired result.
My below code works just fine.
def inputCsv():
r = requests.get(endpoint, headers=headers, params=params)
with open("input.csv", "w") as f:
f.writelines(r.text.splitlines(True))
df = pd.read_csv("input.csv")
return df
def outputCsv():
with open("secret.json", "w") as file:
auth = ssm.get_parameter(Name="/something/something/creds", WithDecryption=True)
file.write(str(auth['Parameter']['Value']))
os.environ["creds"] = "secret.json"
rows = []
with open(rb'output.csv', 'w', newline='') as out_file:
timestamp = datetime.now()
df = getCsv()
if 'Name' in df.columns:
df.rename(columns = {"Name": "team", "Total": "cost"}, inplace = True)
df.insert(0, 'date',timestamp)
df.insert(1, 'resource_type', "pod")
df.insert(2, 'resource_name', "kubernetes")
df.insert(3, 'cluster_name', "eks-cluster")
df.drop(["CPU", "GPU", "RAM", "PV", "Network", "LoadBalancer", "External", "Shared", "Efficiency"], axis=1, inplace=True)
df['team'] = df['team'].map(squads).fillna(df['team'])
df.to_csv(out_file, index=False)
But the resulting output looks like below because of duplicates inserted via df.map
function.
date,resource_type,resource_name,cluster_name,team,cost
2022-12-30 14:56:08.383080,pod,kubernetes,eks-cluster,billing,0.201
2022-12-30 14:56:08.383080,pod,kubernetes,eks-cluster,sre-infra-and-release,0.238
2022-12-30 14:56:08.383080,pod,kubernetes,eks-cluster,sre-infra-and-release,0.008
2022-12-30 14:56:08.383080,pod,kubernetes,eks-cluster,sre-infra-and-release,0.836
2022-12-30 14:56:08.383080,pod,kubernetes,eks-cluster,growth,0.513
2022-12-30 14:56:08.383080,pod,kubernetes,eks-cluster,sre-observability,3.633
2022-12-30 14:56:08.383080,pod,kubernetes,eks-cluster,order-platform,1.963
2022-12-30 14:56:08.383080,pod,kubernetes,eks-cluster,menu,0.46
2022-12-30 14:56:08.383080,pod,kubernetes,eks-cluster,ncr,3.291
2022-12-30 14:56:08.383080,pod,kubernetes,eks-cluster,order-platform,4.846
2022-12-30 14:56:08.383080,pod,kubernetes,eks-cluster,grocery-affordability,0.171
I want that the duplicated sre-infra-and-release
to be merged with single row with cost being sum together.
tried to use this below but some how is not working, the resultant file still had the duplicated rows.
df.groupby(['team']).sum()
CodePudding user response:
Hi You need to group all columns except cost, and then sum on cost. something like -
df.groupby(['date','resource_type','resource_name','cluster_name','team'])['cost'].sum()
Hope this works for you.
CodePudding user response:
import pandas as pd
df = pd.read_csv('file.txt')
df.groupby(["date","resource_type","resource_name","cluster_name","team"]).agg({"cost": sum}).reset_index()
with file.txt:
date,resource_type,resource_name,cluster_name,team,cost
2022-12-30 14:56:08.383080,pod,kubernetes,eks-cluster,billing,0.201
2022-12-30 14:56:08.383080,pod,kubernetes,eks-cluster,sre-infra-and-release,0.238
2022-12-30 14:56:08.383080,pod,kubernetes,eks-cluster,sre-infra-and-release,0.008
2022-12-30 14:56:08.383080,pod,kubernetes,eks-cluster,sre-infra-and-release,0.836
2022-12-30 14:56:08.383080,pod,kubernetes,eks-cluster,growth,0.513
2022-12-30 14:56:08.383080,pod,kubernetes,eks-cluster,sre-observability,3.633
2022-12-30 14:56:08.383080,pod,kubernetes,eks-cluster,order-platform,1.963
2022-12-30 14:56:08.383080,pod,kubernetes,eks-cluster,menu,0.46
2022-12-30 14:56:08.383080,pod,kubernetes,eks-cluster,ncr,3.291
2022-12-30 14:56:08.383080,pod,kubernetes,eks-cluster,order-platform,4.846
2022-12-30 14:56:08.383080,pod,kubernetes,eks-cluster,grocery-affordability,0.171
Works for me. The result is:
date resource_type resource_name cluster_name team cost
0 2022-12-30 14:56:08.383080 pod kubernetes eks-cluster billing 0.201
1 2022-12-30 14:56:08.383080 pod kubernetes eks-cluster grocery-affordability 0.171
2 2022-12-30 14:56:08.383080 pod kubernetes eks-cluster growth 0.513
3 2022-12-30 14:56:08.383080 pod kubernetes eks-cluster menu 0.460
4 2022-12-30 14:56:08.383080 pod kubernetes eks-cluster ncr 3.291
5 2022-12-30 14:56:08.383080 pod kubernetes eks-cluster order-platform 6.809
6 2022-12-30 14:56:08.383080 pod kubernetes eks-cluster sre-infra-and-release 1.082
7 2022-12-30 14:56:08.383080 pod kubernetes eks-cluster sre-observability 3.633