I have time-series data. My task is to average some of values in the feature columns based on the sequences in target column, and then replace those values in feature columns by averaged value for each sequence. So, for example, in the table below, we have a sequence of three 1s in target, we take average of values in this sequence for feature 1 and then for feature 2 and after that we replace those values with our mean values. And that should be done for each sequence in target.
ID | time(s) | feature 1 | feature 2 | target |
---|---|---|---|---|
abc | 500 | 2.56789 | 91.12834 | 0 |
abc | 1000 | 2.45678 | 91.23452 | 1 |
abc | 1500 | 2.36589 | 91.54398 | 1 |
abc | 2000 | 2.56428 | 91.32348 | 1 |
abc | 2500 | 2.25869 | 91.79322 | 0 |
cba | 500 | 5.36589 | 93.54398 | 1 |
cba | 1000 | 5.56428 | 93.32348 | 1 |
cba | 1500 | 5.25869 | 94.79322 | 0 |
To do this, I made an algorithm based on indexing.
def averaging(input_df: pd.DataFrame):
output_df = input_df.copy()
target_index = []
for _, sub_df in input_df.groupby('ID'):
_index = sub_df.index
_targets = sub_df['target'].tolist()
before = 1 if _targets[0] == 1 else 0
tmp = []
if before:
tmp.append(_index[0])
for i, flag in enumerate(_targets):
if flag == 1 and before == 0:
tmp.append(_index[i])
elif flag == 0 and before == 1:
tmp.append(_index[i])
target_index.append(tmp)
tmp = []
before = flag
if tmp:
tmp.append(_index[-1] 1)
target_index.append(tmp)
output_df['target_id'] = 0
for i, (indexi, indexj) in enumerate(target_index):
output_df.iloc[indexi:indexj]['target_id'] = i 1
targetid2avg = output_df.loc[output_df['target_id']!=0].groupby('target_id')[['feature1', 'feature2']].mean()
targetid2avg.columns = ['target_feature1', 'target_feature2']
output_df = output_df.merge(targetid2avg, on='target_id', how='left')
output_df.loc[output_df['target_id']!=0, ['feature1', 'feature2']] = output_df.loc[output_df['target_id']!=0, ['target_feature1', 'target_feature2']].values
output_df = output_df.drop(columns=['target_feature1', 'target_feature2'], axis=1)
return output_df
However, something goes wrong when I start indexing the target_id column in output_df and instead of indexes there, I keep getting empty column (zeros). Tried to figure out what the problem is but nothing helped. Does anyone know what is wrong here or how to solve this problem in another way?
Will really appreciate any help.
CodePudding user response:
The problem with your code is contained in this bit:
output_df['target_id'] = 0
for i, (indexi, indexj) in enumerate(target_index):
output_df.iloc[indexi:indexj]['target_id'] = i 1
When you run this part, you'll get a SettingWithCopyWarning
:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
output_df.iloc[indexi:indexj]['target_id'] = i 1
What this means essentially, is that you aren't updating the actual DataFrame, you are updating a copy of a slice.
Indeed, after running this line, you'll find that the values in column target_id
are still all zeros:
print(output_df.target_id.sum()) # 0
To fix this you could use the following line instead:
output_df['target_id'] = 0
for i, (indexi, indexj) in enumerate(target_index):
# output_df.iloc[indexi:indexj]['target_id'] = i 1
output_df.iloc[indexi:indexj,range(len(output_df.columns))[-1]] = i 1
Once you have corrected this line, your function should generate the expected outcome. To be sure, the last bit could also simply be:
for i, (indexi, indexj) in enumerate(target_index):
means = output_df.loc[indexi:indexj-1,['feature1', 'feature2']].mean(axis=0)
output_df.loc[indexi:indexj-1,['feature1', 'feature2']] = \
output_df.loc[indexi:indexj-1,['feature1', 'feature2']].assign(**means)
return output_df
Same result. Finally, you could do all of this a bit quicker actually. E.g. using something like this:
def averaging_new(input_df: pd.DataFrame):
output_df = input_df.copy()
output_df['target_id'] = 0
zeros = output_df.target != 0
sequences = (zeros != zeros.shift()).cumsum()
features = ['feature1','feature2']
gb = output_df[zeros].groupby([sequences, 'ID'])[features].transform('mean')
gb['target_id'] = output_df[zeros].groupby([sequences, 'ID'])[features].ngroup() 1
output_df.loc[gb.index, features] = gb[features]
output_df.loc[gb.index, 'target_id'] = gb['target_id']
return output_df
output_df = averaging_new(input_df)
output_df
ID time(s) feature1 feature2 target target_id
0 abc 500 2.567890 91.128340 0 0
1 abc 1000 2.462317 91.367327 1 1
2 abc 1500 2.462317 91.367327 1 1
3 abc 2000 2.462317 91.367327 1 1
4 abc 2500 2.258690 91.793220 0 0
5 cba 500 5.465085 93.433730 1 2
6 cba 1000 5.465085 93.433730 1 2
7 cba 1500 5.258690 94.793220 0 0
# comparing that result with your function corrected as well as your function refactored:
def averaging_corrected(input_df: pd.DataFrame):
output_df = input_df.copy()
target_index = []
for _, sub_df in input_df.groupby('ID'):
_index = sub_df.index
_targets = sub_df['target'].tolist()
before = 1 if _targets[0] == 1 else 0
tmp = []
if before:
tmp.append(_index[0])
for i, flag in enumerate(_targets):
if flag == 1 and before == 0:
tmp.append(_index[i])
elif flag == 0 and before == 1:
tmp.append(_index[i])
target_index.append(tmp)
tmp = []
before = flag
if tmp:
tmp.append(_index[-1] 1)
target_index.append(tmp)
output_df['target_id'] = 0
for i, (indexi, indexj) in enumerate(target_index):
# output_df.iloc[indexi:indexj]['target_id'] = i 1
output_df.iloc[indexi:indexj,range(len(output_df.columns))[-1]] = i 1
targetid2avg = output_df.loc[output_df['target_id']!=0].groupby('target_id')[['feature1', 'feature2']].mean()
targetid2avg.columns = ['target_feature1', 'target_feature2']
output_df = output_df.merge(targetid2avg, on='target_id', how='left')
output_df.loc[output_df['target_id']!=0, ['feature1', 'feature2']] = output_df.loc[output_df['target_id']!=0, ['target_feature1', 'target_feature2']].values
output_df = output_df.drop(columns=['target_feature1', 'target_feature2'], axis=1)
return output_df
def averaging_refactored(input_df: pd.DataFrame):
output_df = input_df.copy()
target_index = []
for _, sub_df in input_df.groupby('ID'):
_index = sub_df.index
_targets = sub_df['target'].tolist()
before = 1 if _targets[0] == 1 else 0
tmp = []
if before:
tmp.append(_index[0])
for i, flag in enumerate(_targets):
if flag == 1 and before == 0:
tmp.append(_index[i])
elif flag == 0 and before == 1:
tmp.append(_index[i])
target_index.append(tmp)
tmp = []
before = flag
if tmp:
tmp.append(_index[-1] 1)
target_index.append(tmp)
output_df['target_id'] = 0
for i, (indexi, indexj) in enumerate(target_index):
output_df.iloc[indexi:indexj,range(len(output_df.columns))[-1]] = i 1
means = output_df.loc[indexi:indexj-1,['feature1', 'feature2']].mean(axis=0)
output_df.loc[indexi:indexj-1,['feature1', 'feature2']] = \
output_df.loc[indexi:indexj-1,['feature1', 'feature2']].assign(**means)
return output_df
output_df2 = averaging_corrected(input_df)
output_df3 = averaging_refactored(input_df)
dfs = [output_df, output_df2, output_df3]
all(x.equals(dfs[0]) for x in dfs)
# True
N.B. in averaging_new
and averaging_refactored
, I'm not actually using the column target_id
to fill the feature columns. Just adding it for the comparison.