import pandas as pd
import numpy as np
def impute_row_median(
    s: pd.Series,
    threshold: float,
) -> pd.Series:
    """Impute NaNs in a vector with its median when the NaN share is small enough.

    Parameters
    ----------
    s : pd.Series
        Vector of values, possibly containing NaNs.
    threshold : float
        Maximum percentage (0-100) of NaNs for imputation to happen; at or
        above this share the series is returned unchanged.

    Returns
    -------
    pd.Series
        A new series with NaNs replaced by the median when the condition
        holds, otherwise the input series. The input is never mutated
        (the original assigned through a boolean mask in place, which
        silently modified the caller's data).
    """
    nan_mask = s.isna()
    # mean() of a boolean mask is the NaN fraction; strict '<' as in the original.
    if nan_mask.any() and (nan_mask.mean() * 100) < threshold:
        # fillna returns a fresh Series instead of writing into the input.
        return s.fillna(s.median(skipna=True))
    return s
# Demo frame: 1000x5 uniforms on [0, 1); masking values below 0.5 leaves ~50% NaN.
df = pd.DataFrame(np.random.uniform(0, 1, size=(1000, 5)))
df = df.mask(df < 0.5)
# Row-wise apply is a Python-level loop over all 1000 rows — hence the slowness.
df.apply(impute_row_median, axis=1, threshold=80) # slow
The following apply is pretty slow (I didn't use timeit since I have nothing to compare it to). My usual approach would be to avoid apply in favour of vectorised functions like np.where, but in this case I can't see a way to do that. Does anyone have any suggestions? Thank you!
CodePudding user response:
To count the percentage of missing values, use mean on the boolean mask; then combine the 2D mask with the 1D per-row mask in numpy by broadcasting, and replace the missing values with DataFrame.mask:
threshold = 80

# Boolean frame of missing cells, plus a per-row flag marking rows whose
# NaN percentage is strictly under the threshold.
nan_cells = df.isna()
row_eligible = nan_cells.mean(axis=1) * 100 < threshold
# Broadcast the 1D row flag against the 2D cell mask; where both hold,
# fill from the row medians (aligned along the index via axis=0).
df1 = df.mask(nan_cells & row_eligible.to_numpy()[:, None],
              df.median(axis=1, skipna=True), axis=0)
A similar idea works with numpy.where:
# Same logic expressed through numpy.where on the underlying arrays.
nan_cells = df.isna()
row_eligible = (nan_cells.mean(axis=1) * 100 < threshold).to_numpy()
row_medians = df.median(axis=1, skipna=True).to_numpy()
# Broadcast the per-row flag and the per-row medians across the columns.
arr = np.where(nan_cells.to_numpy() & row_eligible[:, None],
               row_medians[:, None],
               df.to_numpy())
df1 = pd.DataFrame(arr, index=df.index, columns=df.columns)
threshold = 80

# Pure-numpy variant: work on the raw ndarray and rebuild the frame at the end.
values = df.to_numpy()
missing = np.isnan(values)
# Rows whose share of NaNs is strictly under the threshold get imputed.
eligible = missing.mean(axis=1) * 100 < threshold
medians = np.nanmedian(values, axis=1)[:, None]
arr = np.where(missing & eligible[:, None], medians, values)
df1 = pd.DataFrame(arr, index=df.index, columns=df.columns)
print (df1.equals(df.apply(impute_row_median, axis=1, threshold=80)))
True
Performance comparison (10k rows, 50 columns):
# Benchmark fixture: seeded 10k x 50 frame of uniforms, ~50% masked to NaN.
np.random.seed(2023)
df = pd.DataFrame(np.random.uniform(0, 1, size=(10000, 50)))
df = df.mask(df < 0.5)
In [130]: %timeit df.apply(impute_row_median, axis=1, threshold=80)
2.12 s ± 370 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [131]: %%timeit
...: a = df.to_numpy()
...:
...: mask = np.isnan(a)
...: m = np.mean(mask, axis=1) * 100 < threshold
...: arr = np.where(mask & m[:, None], np.nanmedian(a, axis=1)[:, None], df)
...:
...: df1 = pd.DataFrame(arr, index=df.index, columns=df.columns)
...:
29.5 ms ± 330 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
In [132]: %%timeit
...: threshold = 80
...:
...: mask = df.isna()
...: m = mask.mean(axis=1) * 100 < threshold
...: df1 = df.mask(mask & m.to_numpy()[:, None],df.median(axis=1, skipna=True),axis=0)
...:
18.6 ms ± 118 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
In [133]: %%timeit
...: mask = df.isna()
...: m = mask.mean(axis=1) * 100 < threshold
...: arr = np.where(mask & m.to_numpy()[:, None],
...: df.median(axis=1, skipna=True).to_numpy()[:, None],
...: df)
...:
...: df1 = pd.DataFrame(arr, index=df.index, columns=df.columns)
...:
...:
10.2 ms ± 230 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)