I am trying to impute the NaN values in a tensor with the mean of the corresponding column of that tensor. I know that this can easily be done using, for example, SimpleImputer() in sklearn; however, I want to implement all of my feature engineering in Keras or TensorFlow so that I can add it as a Lambda layer in a neural network.
I currently have a function like this; however, I am getting an error:
# Convert the training data to a tensor (df_train is presumably a pandas
# DataFrame -- confirm against the caller).
s = tf.convert_to_tensor(df_train)
# Intended to replace the NaN entries of `tensor` with its mean.
def impute_mean(tensor):
# Cast to float32 before running the NaN check.
tensor = tf.dtypes.cast(tensor, tf.float32)
# NOTE: per the traceback below, this tensorflow_transform analyzer ends up
# calling tf.compat.v1.placeholder, which raises RuntimeError under eager
# execution. This is the line the reported error comes from.
mean = tft.mean(tensor)
# NOTE: `mean` is passed inside the tf.math.is_nan(...) call, so tf.where
# receives a single argument instead of (condition, x, y).
tensor = tf.where(tf.math.is_nan(tensor, mean))
return tensor
d = impute_mean(s)
d
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_37096\4009563778.py in <module>
12 return tensor
13
---> 14 d = impute_mean(s)
15 d
~\AppData\Local\Temp\ipykernel_37096\4009563778.py in impute_mean(tensor)
8 def impute_mean(tensor):
9 tensor = tf.dtypes.cast(tensor, tf.float32)
---> 10 mean = tft.mean(tensor)
11 tensor = tf.where(tf.math.is_nan(tensor, mean))
12 return tensor
~\Anaconda3\envs\DemandForecastEnv\lib\site-packages\tensorflow_transform\common.py in wrapped_fn(*args, **kwargs)
71 collection.append(collections.Counter())
72 collection[0][fn.__name__] = 1
---> 73 return fn(*args, **kwargs)
74 else:
75 return fn(*args, **kwargs)
~\Anaconda3\envs\DemandForecastEnv\lib\site-packages\tensorflow_transform\analyzers.py in mean(x, reduce_instance_dims, name, output_dtype)
842 """
843 with tf.compat.v1.name_scope(name, 'mean'):
--> 844 return _mean_and_var(x, reduce_instance_dims, output_dtype)[0]
845
846
~\Anaconda3\envs\DemandForecastEnv\lib\site-packages\tensorflow_transform\analyzers.py in _mean_and_var(x, reduce_instance_dims, output_dtype)
909 x_mean, x_var = _apply_cacheable_combiner(
910 WeightedMeanAndVarCombiner(output_dtype.as_numpy_dtype, output_shape),
--> 911 *combine_inputs)
912
913 return x_mean, x_var
~\Anaconda3\envs\DemandForecastEnv\lib\site-packages\tensorflow_transform\analyzers.py in _apply_cacheable_combiner(combiner, *tensor_inputs)
170 outputs_value_nodes = apply_cacheable_combine_operation(
171 combiner, *tensor_inputs)
--> 172 return tuple(map(analyzer_nodes.wrap_as_tensor, outputs_value_nodes)) # pytype: disable=bad-return-type
173
174
~\Anaconda3\envs\DemandForecastEnv\lib\site-packages\tensorflow_transform\analyzer_nodes.py in wrap_as_tensor(output_value_node)
320 return bind_future_as_tensor(
321 output_value_node,
--> 322 analyzer_def.output_tensor_infos[output_value_node.value_index])
323
324
~\Anaconda3\envs\DemandForecastEnv\lib\site-packages\tensorflow_transform\analyzer_nodes.py in bind_future_as_tensor(future, tensor_info, name)
310 return _bind_future_as_tensor_v2(future, tensor_info, name)
311 else:
--> 312 return _bind_future_as_tensor_v1(future, tensor_info, name)
313
314
~\Anaconda3\envs\DemandForecastEnv\lib\site-packages\tensorflow_transform\analyzer_nodes.py in _bind_future_as_tensor_v1(future, tensor_info, name)
140 name: Optional[str] = None) -> tf.Tensor:
141 """Bind a future value as a tensor to a TF1 graph."""
--> 142 result = tf.compat.v1.placeholder(tensor_info.dtype, tensor_info.shape, name)
143 is_asset_filepath = tensor_info.temporary_asset_info is not None
144 tf.compat.v1.add_to_collection(TENSOR_REPLACEMENTS,
~\Anaconda3\envs\DemandForecastEnv\lib\site-packages\tensorflow\python\ops\array_ops.py in placeholder(dtype, shape, name)
3341 """
3342 if context.executing_eagerly():
-> 3343 raise RuntimeError("tf.placeholder() is not compatible with "
3344 "eager execution.")
3345
RuntimeError: tf.placeholder() is not compatible with eager execution.
CodePudding user response:
The problem is that tf.where() must be given the boolean condition first, followed by the two tensors x and y to select from.
Let's have a look.
s = tf.constant([np.nan, 4.0, np.nan])

# tf.where(condition, x, y) takes values from x where condition is True and
# from y elsewhere, so the condition is computed once and reused.
is_valid = tf.math.is_finite(s)

# NaNs are replaced by 0 before averaging, so this mean divides by the total
# number of elements (4.0 / 3 here), not by the number of valid entries.
mean = tf.reduce_mean(tf.where(is_valid, s, [0.0]), axis=-1)

# Keep the valid entries and fill every missing one with the mean.
imputed_s = tf.where(is_valid, s, mean)
print(imputed_s)
Output
<tf.Tensor: shape=(1, 3), dtype=float32, numpy=array([[1.3333334, 4. , 1.3333334]], dtype=float32)>
For behaviour like sklearn's imputer (fit the column means on one tensor, then use them to fill another), try this:
# `tensor` is the data the column means are learned from; `x` is the data
# whose missing entries get filled with those means.
tensor = tf.constant([[4, 6, 3], [1, np.nan, 0], [1, 3, 0]])
x = tf.constant([[np.nan, 0, np.nan], [0, np.nan, 11], [np.nan, np.nan, 8]])

# "fit" step: per-column mean of `tensor`, ignoring non-finite entries.
is_valid = tf.math.is_finite(tensor)
# Zero out the missing entries so they do not contribute to the column sums.
compute_mask = tf.where(is_valid, tensor, 0.0)
# Divide each column sum by the number of valid entries in that same column.
# The count must come from `tensor` itself, not from an unrelated tensor.
compute_mean_mask = tf.math.divide(
    tf.reduce_sum(compute_mask, axis=0),
    tf.reduce_sum(tf.cast(is_valid, dtype=tf.float32), axis=0),
)

# "transform" step: fill the missing entries of `x` with the fitted means.
tf.where(tf.math.is_finite(x), x, compute_mean_mask)
Output:
<tf.Tensor: shape=(3, 3), dtype=float32, numpy=
array([[ 2. , 0. , 1. ],
[ 0. , 4.5, 11. ],
[ 2. , 4.5, 8. ]], dtype=float32)>
CodePudding user response:
Replicating sklearn.impute.SimpleImputer in TensorFlow:
sklearn:
from sklearn.impute import SimpleImputer

# Data the per-column means are learned from.
s = np.array([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]])
# Data to transform: each NaN is replaced by the learned mean of its column.
X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]

# fit() returns the fitted estimator, so construction and fitting are chained.
imp_mean = SimpleImputer(strategy='mean', missing_values=np.nan).fit(s)
print(imp_mean.transform(X))
#output
[[ 7. 2. 3. ]
[ 4. 3.5 6. ]
[10. 3.5 9. ]]
Tensorflow:
# compute imputer mean ("fit" step) from the training data `s`
is_missing = tf.math.is_nan(s)
# Zero out the missing entries so they do not contribute to the column sums.
mask = tf.where(is_missing, 0., s)
# Count the observed entries per column explicitly. Clipping the values to
# [0, 1] only works as a count when every observed value is >= 1; zeros,
# fractions and negatives would be miscounted.
mask_norm = tf.reduce_sum(
    tf.cast(tf.math.logical_not(is_missing), mask.dtype), axis=0)
imp_mean = tf.math.divide(tf.reduce_sum(mask, axis=0), mask_norm)
# transform: fill the missing entries of `X` with the fitted column means
tf.where(tf.math.is_nan(X), imp_mean, X)
#output
[[ 7. , 2. , 3. ],
[ 4. , 3.5, 6. ],
[10. , 3.5, 9. ]],
2.09 ms ± 120 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)