I would like to fill in the missing dates of a Pandas DataFrame, but instead of filling them based only on the date column, I would like to do it based on more than one column; in this case, the source column as well.
The example is the following:
Original
date_found source count_unique_uuids count_unique_uuids_raw
2021-05-13 source_1 20 20
2021-05-14 source_2 1829 1829
2021-05-14 source_3 2245 2245
2021-05-14 source_1 40 40
2021-05-15 source_1 903 903
2021-05-16 source_2 20 20
2021-05-18 source_3 89 89
Desired dataset
date_found source count_unique_uuids count_unique_uuids_raw
2021-05-13 source_1 20 20
2021-05-13 source_2 0 0
2021-05-13 source_3 0 0
2021-05-14 source_1 40 40
2021-05-14 source_2 1829 1829
2021-05-14 source_3 2245 2245
2021-05-15 source_1 903 903
2021-05-15 source_2 0 0
2021-05-15 source_3 0 0
2021-05-16 source_1 0 0
2021-05-16 source_2 20 20
2021-05-16 source_3 0 0
2021-05-17 source_1 0 0
2021-05-17 source_2 0 0
2021-05-17 source_3 0 0
2021-05-18 source_1 0 0
2021-05-18 source_2 0 0
2021-05-18 source_3 89 89
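For reference, here is a minimal sketch that rebuilds the original frame shown above (values copied from the table):

import pandas as pd

df = pd.DataFrame({
    'date_found': ['2021-05-13', '2021-05-14', '2021-05-14', '2021-05-14',
                   '2021-05-15', '2021-05-16', '2021-05-18'],
    'source': ['source_1', 'source_2', 'source_3', 'source_1',
               'source_1', 'source_2', 'source_3'],
    'count_unique_uuids': [20, 1829, 2245, 40, 903, 20, 89],
    'count_unique_uuids_raw': [20, 1829, 2245, 40, 903, 20, 89],
})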
I was using reindex and resample as references to build the dataset:

Reindex: Add missing dates to pandas dataframe
Resample: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.resample.html
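For a single source, the reindex reference boils down to something like this (just a sketch, filtering to source_1 as an example):

# Sketch of the single-source case from the reindex reference;
# 'source_1' is only an example value.
one_source = df[df['source'] == 'source_1'].copy()
one_source['date_found'] = pd.to_datetime(one_source['date_found'])

full_range = pd.date_range(one_source['date_found'].min(),
                           one_source['date_found'].max())

one_source = (one_source.set_index('date_found')
              .reindex(full_range)
              .rename_axis('date_found')
              .reset_index())
one_source['source'] = 'source_1'   # restore the label on the inserted rows
one_source = one_source.fillna(0)   # zero out the count columns on the inserted rows

The part I can't get right is doing this for every source at once over the full date range.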
I'm trying the code below, but I'm getting this error:

ValueError: StringArray requires a sequence of strings or pandas.NA
def add_missing_dates(df: pd.DataFrame) -> pd.DataFrame:
    df['date_found'] = pd.to_datetime(df['date_found'], format='%Y-%m-%d')
    min_date = df['date_found'].min()
    max_date = df['date_found'].max()
    (df.set_index(['date_found'])
       .groupby(['source'], as_index=False, group_keys=False)
       .apply(lambda x: x.reindex(pd.date_range(min_date, max_date)))
       .reset_index().rename(columns={'index': 'date_found'})
       .fillna(0)
    )
    return df


def add_dates_to_source(df: pd.DataFrame, source: str = 'source') -> pd.DataFrame:
    sources = df[source].tolist()
    dfs_to_concat = []
    for source_value in sources:
        filtered_df = df.loc[df[source] == source_value]
        df_ = add_missing_dates(filtered_df)
        df_[source] = source_value
        dfs_to_concat.append(df_)
    return pd.concat(dfs_to_concat)
To run:
df = add_dates_to_source(df)
CodePudding user response:
Here is one way to do it:
# make the date column of type datetime
df['date_found'] = pd.to_datetime(df['date_found'])

# find the min date
min_date = df['date_found'].min()

# find the max date
max_date = df['date_found'].max()

df2 = (df.set_index(['date_found'])                                       # set index to date, to allow for reindex
         .groupby(['source'], as_index=False, group_keys=False)           # group on source
         .apply(lambda x: x.reindex(pd.date_range(min_date, max_date)))   # reindex on the date range
         .reset_index()                                                    # reset index
         .rename(columns={'index': 'date_found'})                          # rename the column
      )

df2['source'] = df2['source'].ffill()   # fill the source labels on the inserted rows
df2 = df2.fillna(0)                      # zero out the remaining NaNs
df2
df2
date_found source count_unique_uuids count_unique_uuids_raw
0 2021-05-13 source_1 20.0 20.0
1 2021-05-14 source_1 40.0 40.0
2 2021-05-15 source_1 903.0 903.0
3 2021-05-16 source_1 0.0 0.0
4 2021-05-17 source_1 0.0 0.0
5 2021-05-18 source_1 0.0 0.0
6 2021-05-13 source_1 0.0 0.0
7 2021-05-14 source_2 1829.0 1829.0
8 2021-05-15 source_2 0.0 0.0
9 2021-05-16 source_2 20.0 20.0
10 2021-05-17 source_2 0.0 0.0
11 2021-05-18 source_2 0.0 0.0
12 2021-05-13 source_2 0.0 0.0
13 2021-05-14 source_3 2245.0 2245.0
14 2021-05-15 source_3 0.0 0.0
15 2021-05-16 source_3 0.0 0.0
16 2021-05-17 source_3 0.0 0.0
17 2021-05-18 source_3 89.0 89.0
CodePudding user response:
This is what I came up with (it doesn't look nice, but it seems to work as expected):
min_date = df['date_found'].min()
max_date = df['date_found'].max()
dates = (pd.date_range(min_date, max_date).to_frame()
           .reset_index(drop=True)
           .set_axis(['date_found'], axis=1))

res = (dates.merge(df['source'].drop_duplicates(), how='cross')
            .merge(df, how='left')
            .fillna(0))
print(res)
'''
date_found source count_unique_uuids count_unique_uuids_raw
0 2021-05-13 source_1 20.0 20.0
1 2021-05-13 source_2 0.0 0.0
2 2021-05-13 source_3 0.0 0.0
3 2021-05-14 source_1 40.0 40.0
4 2021-05-14 source_2 1829.0 1829.0
5 2021-05-14 source_3 2245.0 2245.0
6 2021-05-15 source_1 903.0 903.0
7 2021-05-15 source_2 0.0 0.0
8 2021-05-15 source_3 0.0 0.0
9 2021-05-16 source_1 0.0 0.0
10 2021-05-16 source_2 20.0 20.0
11 2021-05-16 source_3 0.0 0.0
12 2021-05-17 source_1 0.0 0.0
13 2021-05-17 source_2 0.0 0.0
14 2021-05-17 source_3 0.0 0.0
15 2021-05-18 source_1 0.0 0.0
16 2021-05-18 source_2 0.0 0.0
17 2021-05-18 source_3 89.0 89.0
'''
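In other words: the cross merge builds every (date_found, source) combination over the full date range, the second (left) merge attaches the observed counts to those combinations, and anything with no match comes back as NaN and is then filled with 0.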
CodePudding user response:
One option is complete from pyjanitor, which exposes the missing rows:
# pip install pyjanitor
import pandas as pd
import janitor
df['date_found'] = pd.to_datetime(df['date_found'])

# build a dictionary to contain the new dates;
# the key of the dictionary must exist in the dataframe
new_dates = {"date_found": pd.date_range(df.date_found.min(),
                                         df.date_found.max(),
                                         freq='D')}

df.complete(new_dates, 'source', fill_value=0)
date_found source count_unique_uuids count_unique_uuids_raw
0 2021-05-13 source_1 20 20
1 2021-05-13 source_2 0 0
2 2021-05-13 source_3 0 0
3 2021-05-14 source_1 40 40
4 2021-05-14 source_2 1829 1829
5 2021-05-14 source_3 2245 2245
6 2021-05-15 source_1 903 903
7 2021-05-15 source_2 0 0
8 2021-05-15 source_3 0 0
9 2021-05-16 source_1 0 0
10 2021-05-16 source_2 20 20
11 2021-05-16 source_3 0 0
12 2021-05-17 source_1 0 0
13 2021-05-17 source_2 0 0
14 2021-05-17 source_3 0 0
15 2021-05-18 source_1 0 0
16 2021-05-18 source_2 0 0
17 2021-05-18 source_3 89 89
Another option, using Pandas only:
min_date = df['date_found'].min()
max_date = df['date_found'].max()
dates = pd.date_range(min_date,max_date)
source = df.source.unique()
combinations = pd.MultiIndex.from_product([dates, source],
                                          names=['date_found', 'source'])
combinations = pd.DataFrame([], index=combinations)

(combinations
 .merge(df, how='outer', on=['date_found', 'source'])
 .fillna(0, downcast='infer')
)
date_found source count_unique_uuids count_unique_uuids_raw
0 2021-05-13 source_1 20 20
1 2021-05-13 source_2 0 0
2 2021-05-13 source_3 0 0
3 2021-05-14 source_1 40 40
4 2021-05-14 source_2 1829 1829
5 2021-05-14 source_3 2245 2245
6 2021-05-15 source_1 903 903
7 2021-05-15 source_2 0 0
8 2021-05-15 source_3 0 0
9 2021-05-16 source_1 0 0
10 2021-05-16 source_2 20 20
11 2021-05-16 source_3 0 0
12 2021-05-17 source_1 0 0
13 2021-05-17 source_2 0 0
14 2021-05-17 source_3 0 0
15 2021-05-18 source_1 0 0
16 2021-05-18 source_2 0 0
17 2021-05-18 source_3 89 89
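If you prefer to stay closer to the reindex idea from the question, the same result can also be sketched by reindexing on the full MultiIndex instead of merging (reusing min_date, max_date and source from above, and assuming date_found has already been converted to datetime):

# Sketch: reindex on the full (date, source) MultiIndex instead of merging.
full_index = pd.MultiIndex.from_product([pd.date_range(min_date, max_date), source],
                                        names=['date_found', 'source'])

out = (df.set_index(['date_found', 'source'])
         .reindex(full_index, fill_value=0)
         .reset_index())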