Adding rows for missing month where keep all some of the other column is same as it is and the rest-CodePudding

I have data frame in pandas that have missing month, so I would like to add missing months where with one column values as o and other as it is. For instance my initial data looks like,


import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.randint(0,30,size=10),
                 columns=["Random"],
                 index=pd.date_range("20180101", periods=10))
df = df.reset_index()
df['Date']=['2018-01-01', '2018-02-01', '2018-04-01', '2018-05-01', '2018-07-01', '2018-02-01', '2018-04-01', '2018-03-01', '2018-04-01', '2018-08-01']
df['Random']=[44,65,78,32,55,61,78,85,91,56]
df['Home'] = ['843','843','843','843','843', '976','976','976','976','976',]
df['Mat'] = ['327837', '327837', '327837','327837','327837', 'eid78', 'eid78','eid78','eid78','eid78',]
df.loc[5, 'index'] = pd.to_datetime('2018-01-02')
df['id'] = df.Mat.astype(str)   '_'   df.Home.astype(str)   '_'   df.Date.astype(str)
df['Date']= pd.to_datetime(df['Date'])
df['Date'] = df['Date'].dt.to_period('m')
df.drop('index', axis=1, inplace=True)
df

Here the earliest month is 2018-01 and latest is 2018-08(but it can be different), What I want is add rows for each missing month in the range, values for the 'Random' column should be 0 for those newly added months and keep other columns as it is. The expected output should be

import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.randint(0,48,size=16),
                 columns=["Random"],
                 index=pd.date_range("20180101", periods=16))
df = df.reset_index()
df['Date']=['2018-01-01', '2018-02-01', '2018-03-01', '2018-04-01', '2018-05-01', '2018-06-01', '2018-07-01', '2018-08-01', '2018-01-01', '2018-02-01', '2018-03-01', '2018-04-01', '2018-05-01', '2018-06-01', '2018-07-01', '2018-08-01']
df['Random']=[44,65,0,78,32,0, 0, 55, 0, 61, 0, 78, 85, 0, 91,56]
df['Home'] = ['843','843','843','843','843', '843', '843', '843', '976', '976', '976','976','976','976','976','976',]
df['Mat'] = ['327837', '327837', '327837','327837','327837', '327837','327837','327837', 'eid78', 'eid78','eid78','eid78', 'eid78','eid78','eid78','eid78',]
#df.loc[5, 'index'] = pd.to_datetime('2018-01-02')
df['id'] = df.Mat.astype(str)   '_'   df.Home.astype(str)   '_'   df.Date.astype(str)
df['Date']= pd.to_datetime(df['Date'])
df['Date'] = df['Date'].dt.to_period('m')
df.drop('index', axis=1, inplace=True)
df

I have tried

df.set_index(['Date', 'Mat', 'home', 'id']).unstack().fillna(0).stack().reset_index()

but it doesn't seems like work.

CodePudding user response：

Here's a way to do what your question asks:

dates = list(pd.Series(pd.date_range(
    df.Date.min().to_timestamp(), df.Date.max().to_timestamp(), freq='MS')).dt.to_period('m'))
info = df[['Mat','Home']].drop_duplicates().set_index(['Mat','Home'])
info = info.assign(Date=[dates for _ in info.index]).explode('Date').reset_index()

origCols = df.columns
df = info.join(df.set_index(['Mat', 'Home', 'Date']), on=['Mat', 'Home', 'Date'])
df.Random = df.Random.fillna(0).astype(int)
df.loc[df.id.isna(),'id'] = df[df.id.isna()].apply(lambda x: '_'.join(
    [x.Mat, x.Home, str(x.Date.to_timestamp().strftime("%Y-%m-%d"))]), axis=1)
df = df.reindex(columns=origCols)

Input:

   Random     Date Home     Mat                     id
0      44  2018-01  843  327837  327837_843_2018-01-01
1      65  2018-02  843  327837  327837_843_2018-02-01
2      78  2018-04  843  327837  327837_843_2018-04-01
3      32  2018-05  843  327837  327837_843_2018-05-01
4      55  2018-07  843  327837  327837_843_2018-07-01
5      61  2018-02  976   eid78   eid78_976_2018-02-01
6      78  2018-04  976   eid78   eid78_976_2018-04-01
7      85  2018-03  976   eid78   eid78_976_2018-03-01
8      91  2018-04  976   eid78   eid78_976_2018-04-01
9      56  2018-08  976   eid78   eid78_976_2018-08-01

Output:

    Random     Date Home     Mat                     id
0       44  2018-01  843  327837  327837_843_2018-01-01
1       65  2018-02  843  327837  327837_843_2018-02-01
2        0  2018-03  843  327837  327837_843_2018-03-01
3       78  2018-04  843  327837  327837_843_2018-04-01
4       32  2018-05  843  327837  327837_843_2018-05-01
5        0  2018-06  843  327837  327837_843_2018-06-01
6       55  2018-07  843  327837  327837_843_2018-07-01
7        0  2018-08  843  327837  327837_843_2018-08-01
8        0  2018-01  976   eid78   eid78_976_2018-01-01
9       61  2018-02  976   eid78   eid78_976_2018-02-01
10      85  2018-03  976   eid78   eid78_976_2018-03-01
11      78  2018-04  976   eid78   eid78_976_2018-04-01
11      91  2018-04  976   eid78   eid78_976_2018-04-01
12       0  2018-05  976   eid78   eid78_976_2018-05-01
13       0  2018-06  976   eid78   eid78_976_2018-06-01
14       0  2018-07  976   eid78   eid78_976_2018-07-01
15      56  2018-08  976   eid78   eid78_976_2018-08-01

CodePudding user response：

One option is with complete from pyjanitor :

# pip install pyjanitor
import pandas as pd
import janitor

# build a sequence of all the expected periods
new_dates = {'Date': pd.period_range(df.Date.min(), df.Date.max(), freq='m')}

(df
.complete('Home', new_dates, 
          fill_value = {'Random':0})
.assign(Mat = lambda df: df.Mat.ffill(), 
        id = lambda df: df['id'].ffill()
        )
)

 Random     Date Home     Mat                     id
0       44  2018-01  843  327837  327837_843_2018-01-01
1       65  2018-02  843  327837  327837_843_2018-02-01
2        0  2018-03  843  327837  327837_843_2018-02-01
3       78  2018-04  843  327837  327837_843_2018-04-01
4       32  2018-05  843  327837  327837_843_2018-05-01
5        0  2018-06  843  327837  327837_843_2018-05-01
6       55  2018-07  843  327837  327837_843_2018-07-01
7        0  2018-08  843  327837  327837_843_2018-07-01
8        0  2018-01  976  327837  327837_843_2018-07-01
9       61  2018-02  976   eid78   eid78_976_2018-02-01
10      85  2018-03  976   eid78   eid78_976_2018-03-01
11      78  2018-04  976   eid78   eid78_976_2018-04-01
12      91  2018-04  976   eid78   eid78_976_2018-04-01
13       0  2018-05  976   eid78   eid78_976_2018-04-01
14       0  2018-06  976   eid78   eid78_976_2018-04-01
15       0  2018-07  976   eid78   eid78_976_2018-04-01
16      56  2018-08  976   eid78   eid78_976_2018-08-01

complete offers an efficient abstraction. You can however run it with just pd.MultiIndex.from_product and pd.join:

names = pd.MultiIndex.from_product([df.Home.unique(),
                                    pd.period_range(df.Date.min(),
                                                    df.Date.max(), 
                                                    freq='m')
                                    ], names = ['Home', 'Date'])
names = pd.DataFrame([], index = names)
(names
.join(df.set_index(['Home', 'Date']), how = 'outer')
.fillna({'Random':0}, downcast='infer')
.ffill()
.reset_index()
)
   Home     Date  Random     Mat                     id
0   843  2018-01      44  327837  327837_843_2018-01-01
1   843  2018-02      65  327837  327837_843_2018-02-01
2   843  2018-03       0  327837  327837_843_2018-02-01
3   843  2018-04      78  327837  327837_843_2018-04-01
4   843  2018-05      32  327837  327837_843_2018-05-01
5   843  2018-06       0  327837  327837_843_2018-05-01
6   843  2018-07      55  327837  327837_843_2018-07-01
7   843  2018-08       0  327837  327837_843_2018-07-01
8   976  2018-01       0  327837  327837_843_2018-07-01
9   976  2018-02      61   eid78   eid78_976_2018-02-01
10  976  2018-03      85   eid78   eid78_976_2018-03-01
11  976  2018-04      78   eid78   eid78_976_2018-04-01
12  976  2018-04      91   eid78   eid78_976_2018-04-01
13  976  2018-05       0   eid78   eid78_976_2018-04-01
14  976  2018-06       0   eid78   eid78_976_2018-04-01
15  976  2018-07       0   eid78   eid78_976_2018-04-01
16  976  2018-08      56   eid78   eid78_976_2018-08-01