I need to replace several values from several columns in pandas, but I'm having issues with replacing them. I'm not sure why it's not working; I think I've checked everything to ensure I'm not doing something wrong.
data = {'gender': {0: np.nan, 1: 'Male', 2: np.nan, 3: 'Male', 4: 'Female', 5: 'Male', 6: 'Male', 7: 'Female', 8: 'Male', 9: 'UnSpec', 10: 'Male'}, 'birthyear': {0: '#VALUE!', 1: '1991', 2: '#VALUE!', 3: '1982', 4: '1976', 5: '1971', 6: '1984', 7: '1973', 8: '1972', 9: '#VALUE!', 10: '1974'}, 'race': {0: np.nan, 1: 'Caucasian', 2: np.nan, 3: np.nan, 4: 'Hispanic', 5: 'Caucasian', 6: 'Caucasian', 7: 'Hispanic', 8: 'Caucasian', 9: np.nan, 10: 'Caucasian'}, 'degree': {0: np.nan, 1: "Bachelor's", 2: np.nan, 3: "Bachelor's", 4: "Bachelor's", 5: "Bachelor's", 6: 'Doctorate', 7: "Bachelor's", 8: "Master's", 9: np.nan, 10: "Master's"}}
data = pd.DataFrame(data)
def get_replace_dict():
replace_dict = {
## Gender
re.compile("^male", re.IGNORECASE): "Man",
re.compile("^female", re.IGNORECASE): "Woman",
# re.compile("^null|unspec|^notsay|^not.*relev.*|^np.nan", re.IGNORECASE): "",
# Degree
re.compile(".*associa.*", re.IGNORECASE): "Associates",
re.compile(".*bache.*", re.IGNORECASE): "Bachelors",
re.compile(".*doctorate.*", re.IGNORECASE): "Doctorate/Professional",
re.compile(".*high\\s school.*", re.IGNORECASE): "High School",
re.compile(".*master.*", re.IGNORECASE): "Masters",
re.compile("^proff.*", re.IGNORECASE): "Doctorate/Professional",
}
return(replace_dict)
replace_dict = get_replace_dict()
replaced_data = data.replace(to_replace=replace_dict, value=None, regex=True)
print("replaced: ", replaced_data )
But it doesn't show any replaced values. So what am I not doing correctly?
CodePudding user response:
It seems not supported, but you can try the (?i)
extenstion to ignore case.
def get_replace_dict():
replace_dict = {
"(?i)^male": "Man",
"(?i)^female": "Woman",
## Gender
"(?i)^male": "Man",
"(?i)^female": "Woman",
# "^null|unspec|^notsay|^not.*relev.*|^np.nan": "",
# Degree
"(?i).*associa.*": "Associates",
"(?i).*bache.*": "Bachelors",
"(?i).*doctorate.*": "Doctorate/Professional",
"(?i).*high\\s school.*": "High School",
"(?i).*master.*": "Masters",
"(?i)^proff.*": "Doctorate/Professional",
}
return replace_dict
replace_dict = get_replace_dict()
replace_data = data.replace(to_replace=replace_dict, regex=True)
print(replace_data)
gender birthyear race degree
0 NaN #VALUE! NaN NaN
1 Man 1991 Caucasian Bachelors
2 NaN #VALUE! NaN NaN
3 Man 1982 NaN Bachelors
4 Woman 1976 Hispanic Bachelors
5 Man 1971 Caucasian Bachelors
6 Man 1984 Caucasian Doctorate/Professional
7 Woman 1973 Hispanic Bachelors
8 Man 1972 Caucasian Masters
9 UnSpec #VALUE! NaN NaN
10 Man 1974 Caucasian Masters
CodePudding user response:
According to the example in the documentation https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.replace.html, df.replace(regex={r'^ba.$': 'new', 'foo': 'xyz'}), the regular expression dictionary is assigned to regex=, not to_replace=, so replace_data = data.replace(to_replace=replace_dict, regex=True) must be replaced with replaced_data = data.replace(regex=replace_dict)