Home > Software engineering >  Add missing dates for every value of another column
Add missing dates for every value of another column

Time:11-09

I would like to fill the missing values of the dates of a Pandas dataframe, but instead of filling the missing date based only on the date column, I would like to do it based on more than 1 column. In this case, the column source.

The example is the following

Original

date_found  source        count_unique_uuids    count_unique_uuids_raw
2021-05-13  source_1      20                    20
2021-05-14  source_2      1829                  1829
2021-05-14  source_3      2245                  2245
2021-05-14  source_1      40                    40
2021-05-15  source_1      903                   903
2021-05-16  source_2      20                    20
2021-05-18  source_3      89                    89

Desired dataset

    date_found  source        count_unique_uuids    count_unique_uuids_raw
    2021-05-13  source_1      20                    20
    2021-05-13  source_2      0                     0
    2021-05-13  source_3      0                     0
    2021-05-14  source_1      40                    40
    2021-05-14  source_2      1829                  1829
    2021-05-14  source_3      2245                  2245 
    2021-05-15  source_1      903                   903
    2021-05-15  source_2      0                     0
    2021-05-15  source_3      0                     0
    2021-05-16  source_1      0                     0
    2021-05-16  source_2      20                    20
    2021-05-16  source_3      0                     0
    2021-05-17  source_1      0                     0
    2021-05-17  source_2      0                     0
    2021-05-17  source_3      0                     0
    2021-05-18  source_1      0                     0
    2021-05-18  source_2      0                     0
    2021-05-18  source_3      89                    89

I was using reindex and resample as a reference to build the dataset

Reindex: Add missing dates to pandas dataframe

Resample: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.resample.html

I'm trying the following code

But getting an error ValueError: StringArray requires a sequence of strings or pandas.NA

def add_missing_dates(df: pd.DataFrame) -> pd.DataFrame:
    df['date_found'] = pd.to_datetime(df['date_found'], format='%Y-%m-%d')
    min_date = df['date_found'].min()
    max_date = df['date_found'].max()

    (df.set_index(['date_found'])
     .groupby(['source'], as_index=False, group_keys=False)
     .apply(lambda x: x.reindex(pd.date_range(min_date, max_date)))
     .reset_index().rename(columns={'index': 'date_found'})
     .fillna(0)
     )
    return df


def add_dates_to_source(df: pd.DataFrame, source: str = 'source') -> pd.DataFrame:
    sources = df[source].tolist()
    dfs_to_concat = []

    for source_value in sources:
        filtered_df = df.loc[df[source] == source_value]
        df_ = add_missing_dates(filtered_df)
        df_[source] = source_value
        dfs_to_concat.append(df_)

    return pd.concat(dfs_to_concat)

To run

 df = add_dates_to_source(df)

CodePudding user response:

here is one way to do it

# make the date as of type datetime
df['date_found']=pd.to_datetime(df['date_found'])

#find the min_date
min_date = df['date_found'].min()

#find the max date
max_date = df['date_found'].max()


df2=(df.set_index(['date_found'])      # set index to date, to allow for reindex
 .groupby(['source'],as_index=False, group_keys=False) # group on source
 .apply(lambda x: x.reindex(pd.date_range(min_date, max_date))) # reindex on the date range
 .reset_index() # reset index
 .rename(columns={'index': 'date_found'}) # rename the column
)
df2['source'].ffill(inplace=True)
df2.fillna(0, inplace=True)
df2

    date_found  source  count_unique_uuids  count_unique_uuids_raw
0   2021-05-13  source_1    20.0    20.0
1   2021-05-14  source_1    40.0    40.0
2   2021-05-15  source_1    903.0   903.0
3   2021-05-16  source_1    0.0     0.0
4   2021-05-17  source_1    0.0     0.0
5   2021-05-18  source_1    0.0     0.0
6   2021-05-13  source_1    0.0     0.0
7   2021-05-14  source_2    1829.0  1829.0
8   2021-05-15  source_2    0.0     0.0
9   2021-05-16  source_2    20.0    20.0
10  2021-05-17  source_2    0.0     0.0
11  2021-05-18  source_2    0.0     0.0
12  2021-05-13  source_2    0.0     0.0
13  2021-05-14  source_3    2245.0  2245.0
14  2021-05-15  source_3    0.0     0.0
15  2021-05-16  source_3    0.0     0.0
16  2021-05-17  source_3    0.0     0.0
17  2021-05-18  source_3    89.0    89.0

CodePudding user response:

this is what i came to (doesn't look nice but seems to work as expected):

min_date = df['date_found'].min()
max_date = df['date_found'].max()

dates = (pd.date_range(min_date,max_date).to_frame()
         .reset_index(drop=True)
         .set_axis(['date_found'],axis=1))

res = (dates.merge(df['source'].drop_duplicates(),how='cross')
       .merge(df,how='left').fillna(0))

print(res)
'''
   date_found    source  count_unique_uuids  count_unique_uuids_raw
0  2021-05-13  source_1                20.0                    20.0
1  2021-05-13  source_2                 0.0                     0.0
2  2021-05-13  source_3                 0.0                     0.0
3  2021-05-14  source_1                40.0                    40.0
4  2021-05-14  source_2              1829.0                  1829.0
5  2021-05-14  source_3              2245.0                  2245.0
6  2021-05-15  source_1               903.0                   903.0
7  2021-05-15  source_2                 0.0                     0.0
8  2021-05-15  source_3                 0.0                     0.0
9  2021-05-16  source_1                 0.0                     0.0
10 2021-05-16  source_2                20.0                    20.0
11 2021-05-16  source_3                 0.0                     0.0
12 2021-05-17  source_1                 0.0                     0.0
13 2021-05-17  source_2                 0.0                     0.0
14 2021-05-17  source_3                 0.0                     0.0
15 2021-05-18  source_1                 0.0                     0.0
16 2021-05-18  source_2                 0.0                     0.0
17 2021-05-18  source_3                89.0                    89.0
​

CodePudding user response:

One option is with complete from pyjanitor, to expose missing rows:

# pip install pyjanitor
import pandas as pd
import janitor
df['date_found'] = pd.to_datetime(df['date_found'])

# build a dictionary to contain the new dates
# the key of the dictionary must exist in the dataframe
new_dates = {"date_found":pd.date_range(df.date_found.min(), 
                                        df.date_found.max(), 
                                        freq='D')}

df.complete(new_dates, 'source', fill_value=0)
   date_found    source  count_unique_uuids  count_unique_uuids_raw
0  2021-05-13  source_1                  20                      20
1  2021-05-13  source_2                   0                       0
2  2021-05-13  source_3                   0                       0
3  2021-05-14  source_1                  40                      40
4  2021-05-14  source_2                1829                    1829
5  2021-05-14  source_3                2245                    2245
6  2021-05-15  source_1                 903                     903
7  2021-05-15  source_2                   0                       0
8  2021-05-15  source_3                   0                       0
9  2021-05-16  source_1                   0                       0
10 2021-05-16  source_2                  20                      20
11 2021-05-16  source_3                   0                       0
12 2021-05-17  source_1                   0                       0
13 2021-05-17  source_2                   0                       0
14 2021-05-17  source_3                   0                       0
15 2021-05-18  source_1                   0                       0
16 2021-05-18  source_2                   0                       0
17 2021-05-18  source_3                  89                      89

Another option, using Pandas only:

min_date = df['date_found'].min()
max_date = df['date_found'].max()

dates = pd.date_range(min_date,max_date)
source = df.source.unique()
combinations = pd.MultiIndex.from_product([dates, source], 
                                          names = ['date_found', 'source'])
combinations = pd.DataFrame([], index = combinations)
(combinations
.merge(df, how = 'outer', on = ['date_found', 'source'])
.fillna(0, downcast='infer')
)
   date_found    source  count_unique_uuids  count_unique_uuids_raw
0  2021-05-13  source_1                  20                      20
1  2021-05-13  source_2                   0                       0
2  2021-05-13  source_3                   0                       0
3  2021-05-14  source_1                  40                      40
4  2021-05-14  source_2                1829                    1829
5  2021-05-14  source_3                2245                    2245
6  2021-05-15  source_1                 903                     903
7  2021-05-15  source_2                   0                       0
8  2021-05-15  source_3                   0                       0
9  2021-05-16  source_1                   0                       0
10 2021-05-16  source_2                  20                      20
11 2021-05-16  source_3                   0                       0
12 2021-05-17  source_1                   0                       0
13 2021-05-17  source_2                   0                       0
14 2021-05-17  source_3                   0                       0
15 2021-05-18  source_1                   0                       0
16 2021-05-18  source_2                   0                       0
17 2021-05-18  source_3                  89                      89
  • Related