I'm trying to write a function to fix some boilerplate code in Jupyter notebooks. An example I'm looking at is:
# boilerplate
df_cases.drop( columns=df_cases.columns[8:], inplace=True )
df_cases.columns = [ 'State', 'Country', 'Latitude', 'Longitude',
'January', 'February', 'March', 'April' ]
df_deaths.drop( columns=df_deaths.columns[8:], inplace=True )
df_deaths.columns = [ 'State', 'Country', 'Latitude', 'Longitude',
'January', 'February', 'March', 'April' ]
df_recoveries.drop( columns=df_recoveries.columns[8:], inplace=True )
df_recoveries.columns = [ 'State', 'Country', 'Latitude', 'Longitude',
'January', 'February', 'March', 'April' ]
# fixed
def drop_unneeded_columns ( df ):
df.drop( columns=df.columns[8:], inplace=True )
df.columns = [ 'State', 'Country', 'Latitude', 'Longitude',
'January', 'February', 'March', 'April' ]
But I'm having an issue trying to fix this code:
#boilerplate
df['Age'] = df['Age'].replace( '-', np.nan )
df['Age'] = df['Age'].astype( float )
df['BA'] = df['BA'].replace( '-', np.nan )
df['BA'] = df['BA'].astype( float )
df['OBP'] = df['OBP'].replace( '-', np.nan )
df['OBP'] = df['OBP'].astype( float )
df['SLG'] = df['SLG'].replace( '-', np.nan )
df['SLG'] = df['SLG'].astype( float )
df['OPS'] = df['OPS'].replace( '-', np.nan )
df['OPS'] = df['OPS'].astype( float )
df['OPS '] = df['OPS '].replace( '-', np.nan )
df['OPS '] = df['OPS '].astype( float )
However, all we have is the following code: which is not really working well because we aren't sure on how to reference the columns:
# fixed? not working
def replace_hyphens( df ):
df.replace( '-', np.nan )
df.astype( float )
df.columns = [ 'Age', 'BA', 'OBP', 'SLG',
'OPS', 'OPS ']
Does someone know how we can fix this function?
CodePudding user response:
You don't seem to assign the DataFrame's return values to anything. Without specifying inplace=True
, the result of e.g. df.replace('-', np.nan)
is computed, but not assigned.
You could try something like this:
def replace_hyphens(df: pd.DataFrame) -> pd.DataFrame:
return df.replace('-', np.nan).astype(float)
Regarding the columns, you could for instance apply str.replace
to a subset of them. First you specify the relevant columns with the DataFrame.loc
method, then apply a different function (e.g. a lambda expression) and call DataFrame.apply
. Note that there is probably a more efficient way, and you might get in trouble trying to cast to float for columns which do not contain '-' strings, so you will likely have to build in a check before you call astype(float)
.
def replace_hyphens(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
df.loc[:, cols] = df.loc[:, cols].apply(
lambda s: s.replace('-', np.nan)
).astype(float)
return df
df = replace_hyphens(df,
cols=['Age', 'BA', 'OBP', 'SLG',
'OPS', 'OPS ']
)
With crude check:
def replace_hyphens(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
df.loc[:, cols] = df.loc[:, cols].apply(
lambda s: s.replace('-', np.nan)
for col in cols:
try:
df[:, col] = df[:, col].astype(float)
except TypeError as err:
print(f"Couldn't cast column to float: {err!r}")
# Error handling
return df
But I can hardly imagine there is not a better way to do this available if you take the time to look through some of Panda's documentation.
CodePudding user response:
If you want to change only the -
in the specified columns and use the 'dict-like to_replace' in the documentation, here's an example.
First the set-up:
import pandas as pd
import numpy as np
df = pd.DataFrame({'A': [5, 6, "-", 8, 9],
'B': [5, 6, "-", 8, 9],
'C': [5, 6, "-", 8, 9]})
df #show initial dataframe
Gives
A B C
0 5 5 5
1 6 6 6
2 - - -
3 8 8 8
4 9 9 9
Then to alter:
def replace_hyphens( df, col_list):
df = df.replace({k: '-' for k in col_list}, np.nan)
df[col_list] = df[col_list].astype(float)
return df
df = replace_hyphens(df,['A','C'])
df #show result
Column B will remain untouched:
A B C
0 5.0 5 5.0
1 6.0 6 6.0
2 NaN - NaN
3 8.0 8 8.0
4 9.0 9 9.0
Alternatively, you could iterate on the columns replacing the -
only in each column one at time, like so:
def replace_hyphens( df, col_list):
for c in col_list:
df[c] = df[c].replace('-', np.nan)
df[c] = df[c].astype(float)
return df
df = replace_hyphens(df,['A','C'])