I want to replace the empty values in the dataframe using random already existing values, while maintaining the weights so that the correlation does not suffer and the data is not lost.
def nan_fill_random(column_name, nan):
for i in range(len(column_name)):
if column_name[i] == nan:
column_name[i] = random.choice(column_name[column_name != nan])
else:
continue
I wrote a function, but it periodically throws a KeyError
: and the value has different numbers, I assume indexes. Also, when you restart the cell, it can either disappear or be updated.
nan_fill_random(data['education'], 'unknown')
Here is the error
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
W:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3360 try:
-> 3361 return self._engine.get_loc(casted_key)
3362 except KeyError as err:
W:\ProgramData\Anaconda3\lib\site-packages\pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
W:\ProgramData\Anaconda3\lib\site-packages\pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
KeyError: 14563
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_4720/2723938638.py in <module>
----> 1 nan_fill_random(data['education'], 'unknown')
~\AppData\Local\Temp/ipykernel_4720/1980306790.py in nan_fill_random(column_name, nan)
2 for i in range(len(column_name)):
3 if column_name[i] == nan:
----> 4 column_name[i] = random.choice(column_name[column_name != nan])
5 else:
6 continue
W:\ProgramData\Anaconda3\lib\random.py in choice(self, seq)
344 """Choose a random element from a non-empty sequence."""
345 # raises IndexError if seq is empty
--> 346 return seq[self._randbelow(len(seq))]
347
348 def shuffle(self, x, random=None):
W:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in __getitem__(self, key)
940
941 elif key_is_scalar:
--> 942 return self._get_value(key)
943
944 if is_hashable(key):
W:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in _get_value(self, label, takeable)
1049
1050 # Similar to Index.get_value, but we do not fall back to positional
-> 1051 loc = self.index.get_loc(label)
1052 return self.index._get_values_for_loc(self, loc, label)
1053
W:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3361 return self._engine.get_loc(casted_key)
3362 except KeyError as err:
-> 3363 raise KeyError(key) from err
3364
3365 if is_scalar(key) and isna(key) and not self.hasnans:
KeyError: 14563
CodePudding user response:
def nan_fill_random(column_name, nan):
list_values = set(column_name)
try :
list_values.remove(nan)
except :
return(column_name)
column_name = column_name.apply(lambda x: x if x != nan else random.choice(list(list_values)))
return(column_name)
CodePudding user response:
You seem to be getting an error whenever your column only contains nan values. Also, please avoid calling nan something that isn't strictly nan for clarity.
What you could do is providing some other default values from which you can randomly pick whenever the column doesn't have admissible values you can choose from. Something like:
def nan_fill_random(column_name, to_replace, default_values):
for i in range(len(column_name)):
if column_name[i] == to_replace:
try:
column_name[i] = random.choice(column_name[column_name != to_replace])
except IndexError:
column_name[i] = random.choice(default_values)
Please note that the function could be optimized, I just tried to propose something pretty much aligned with the original one.
Edit: sorry, I missed something, while the above still stands IMO.
The KeyError is because you're accessing an index that doesn't exist. You should use column_name.iloc[i]
instead of column_name[i]
.
You can reproduce this error with:
l = pd.Series(["a", "b", "c"], index=[2,4,5])
for i in range(len(l)):
print(l.iloc[i])
print(l[i]) # error