I have the following list of dicts. I would like to group and aggregate by year and month and sum the values. The challenge being in my current output, I could only output the start_date and I would like to somehow show the end date which will be the last start date of the dict item in the list for that month and year. This is what I have tried so far.
from itertools import groupby
u = [
{
"start_date": "2018-12-01",
"val": 5436,
},
{
"start_date": "2018-12-02",
"val": 5376,
},
{
"start_date": "2018-12-03",
"val": 5608,
},
{
"start_date": "2018-12-04",
"val": 5671,
},
{
"start_date": "2018-12-05",
"val": 5800,
},
{
"start_date": "2018-12-06",
"val": 5834,
},
{
"start_date": "2018-12-07",
"val": 5800,
},
{
"start_date": "2018-12-08",
"val": 5658,
},
{
"start_date": "2018-12-09",
"val": 5709,
"year_month": "2018-12"
},
{
"start_date": "2018-12-10",
"val": 5673,
},
{
"start_date": "2018-12-11",
"val": 5661,
},
{
"start_date": "2018-12-12",
"val": 5889,
},
{
"start_date": "2018-12-13",
"val": 6197,
},
{
"start_date": "2018-12-14",
"val": 6163,
},
{
"start_date": "2018-12-15",
"val": 6376,
},
{
"start_date": "2018-12-16",
"val": 5762,
},
{
"start_date": "2018-12-17",
"val": 5733,
},
{
"start_date": "2018-12-18",
"val": 6207,
},
{
"start_date": "2018-12-19",
"val": 5600,
},
{
"start_date": "2018-12-20",
"val": 5815,
},
{
"start_date": "2018-12-21",
"val": 5533,
},
{
"start_date": "2018-12-22",
"val": 5346,
},
{
"start_date": "2018-12-23",
"val": 5478,
},
{
"start_date": "2018-12-24",
"val": 5276,
},
{
"start_date": "2018-12-25",
"val": 5205,
},
{
"start_date": "2018-12-26",
"val": 5246,
},
{
"start_date": "2018-12-27",
"val": 5234,
},
{
"start_date": "2018-12-28",
"val": 5233,
},
{
"start_date": "2018-12-29",
"val": 5102,
},
{
"start_date": "2018-12-30",
"val": 5240,
},
{
"start_date": "2018-12-31",
"val": 5248,
},
{
"start_date": "2021-12-04",
"val": 808,
}
]
def yss():
result = []
for k, v in groupby(u, key=lambda x: x['start_date'][:7]):
result.append({'date': k '-01', 'value': sum(int(d['val']) for d in v)})
return result
yss()
Output is -
[{'date': '2018-12-01', 'value': 174109},
{'date': '2021-12-01', 'value': 808}]
I want the output to be
[{'start_date':'2018-12-01', 'end_date': '2018-12-31','value':174109}, {'start_date':'2021-12-04','end_date':'2021-12-04','value':808}]
CodePudding user response:
When you groupby
, save the grouped data to a list of tuples and use this to construct your result:
result = []
for k, v in groupby(u, key=lambda x: x['start_date'][:7]):
date_values = [(d["start_date"], d["val"]) for d in v]
result.append({'start_date': min(d[0] for d in date_values),
'end_date': max(d[0] for d in date_values),
'value': sum(d[1] for d in date_values)})
>>> result
[{'start_date': '2018-12-01', 'end_date': '2018-12-31', 'value': 174109},
{'start_date': '2021-12-04', 'end_date': '2021-12-04', 'value': 808}]
Alternatively, with pandas
using Named Aggregation:
import pandas as pd
df = pd.DataFrame(u)
result = (df.groupby(pd.to_datetime(df['start_date']).dt.to_period('M'))
.agg(start_date=("start_date", "min"),
end_date=("start_date","max"),
value=("val","sum"))
.to_dict(orient='records')
)
>>> result
[{'start_date': '2018-12-01', 'end_date': '2018-12-31', 'value': 174109},
{'start_date': '2021-12-04', 'end_date': '2021-12-04', 'value': 808}]
CodePudding user response:
This should work:
import pandas as pd
df = pd.DataFrame(u)
# Convert start date
df["start_date"] = pd.to_datetime(df['start_date'])
df["year"] = df["start_date"].apply(lambda x: x.year)
df["month"] = df["start_date"].apply(lambda x: x.month)
result = []
for (year, month), group in df.groupby(["year", "month"]):
start_date = group["start_date"].min()
end_date = group["start_date"].max()
value = group["val"].sum()
result.append({"start_date": start_date, "end_date": end_date, "value": value})
CodePudding user response:
You can use pandas
:
import pandas as pd
df = pd.DataFrame(u)[['start_date','val']]
df['start_date'] = pd.to_datetime(df['start_date'])
df['year_month'] = df['start_date'].dt.to_period('M')
out = (df.groupby('year_month').agg({'start_date':['first','last'],'val':'sum'})
.droplevel(0, axis=1).rename({'first':'start_date', 'last':'end_date', 'sum':'value'}, axis=1)
.reset_index(drop=True))
out[['start_date','end_date']] = out[['start_date','end_date']].apply(lambda x: x.dt.strftime('%y-%m-%d'), axis=1)
out = list(out.T.to_dict().values())
Output:
[{'start_date': '18-12-01', 'end_date': '18-12-31', 'value': 174109},
{'start_date': '21-12-04', 'end_date': '21-12-04', 'value': 808}]
CodePudding user response:
Here it is as a single nested comprehension, just for funsies:
>>> [(
... lambda p=[(d["start_date"], d["val"]) for d in g]: {
... "start_date": p[0][0],
... "end_date": p[-1][0],
... "value": sum(v for _, v in p)
... }
... )() for _, g in groupby(
... sorted(u, key=lambda d: d["start_date"]),
... key=lambda d: d["start_date"][:7]
... )]
[{'start_date': '2018-12-01', 'end_date': '2018-12-31', 'value': 174109}, {'start_date': '2021-12-04', 'end_date': '2021-12-04', 'value': 808}]