Say I have a DataFrame that looks like the following:
# Sample frame: row 0 has an empty-string name and row 2 has a missing age.
df = pd.DataFrame(
    dict(
        Name=['', 'Dave', 'Tom'],
        Age=[20, 30, None],
        Job=['singer', 'teacher', 'student'],
    )
)
and I'd like to write a function that helps me filter out the rows I want for analysis:
def filt(df, name=False, age=False, job=False) -> pd.DataFrame:
    """Return the rows of ``df`` whose selected columns are not empty.

    Each flag switches on a "must not be empty" check for one column:
    ``name`` -> ``Name``, ``age`` -> ``Age``, ``job`` -> ``Job``.  A value
    counts as empty when it is missing (NaN/None) or an empty string.

    Args:
        df: Frame with ``Name``, ``Age`` and ``Job`` columns.  It is taken as
            a parameter (matching the call ``filt(df, name=True, age=True)``)
            instead of being read from an outer scope: assigning to ``df``
            inside the function would make it a local name and raise
            ``UnboundLocalError`` on first use.
        name: Require a non-empty ``Name``.
        age: Require a non-empty ``Age``.
        job: Require a non-empty ``Job``.

    Returns:
        A new DataFrame holding only the rows that pass every enabled check;
        with no flag set, all rows are returned.
    """
    # One table instead of one ``if`` per keyword: supporting another column
    # means adding a single entry here.
    requested = {'Name': name, 'Age': age, 'Job': job}

    keep = pd.Series(True, index=df.index)
    for column, enabled in requested.items():
        if enabled:
            values = df[column]
            # notna() alone would let the empty-string placeholder through.
            keep &= values.notna() & values.ne('')
    return df[keep]
# Expected Usage: return the dataframe where "Name" and "Age" should not be empty.
filt(df, name=True, age=True)
In the real world I would have at least 10 kwargs, varying from case to case, and this code seems redundant. What is the conventional way to handle such parameters without writing so many if-else statements?
EDIT: Sorry for the confusion; this is not only about filtering out NA values. There are also cases like "filter job == 'student'", where the default for such a parameter would be to keep all values.
# i.e. what could the function possibly look like?
def filt(df, name=False, job=False, age=False,
         selected_name=None, selected_job=None):
    """Filter ``df`` by any combination of non-empty and membership checks.

    Args:
        df: Frame with ``Name``, ``Age`` and ``Job`` columns.
        name: Keep only rows whose ``Name`` is not empty.
        job: Keep only rows whose ``Job`` is not empty.
        age: Keep only rows whose ``Age`` is not empty.
        selected_name: Names to keep (a single string or an iterable of
            strings).  ``None`` or an empty collection keeps every name.
        selected_job: Jobs to keep, with the same conventions as
            ``selected_name``.

    Returns:
        A new DataFrame with the rows that satisfy every requested
        condition.  "Empty" means missing (NaN/None) or an empty string.
    """
    def non_empty(column):
        values = df[column]
        # ``!= ''`` alone is True for NaN, so missing values need notna().
        return values.notna() & values.ne('')

    def one_of(column, wanted):
        # A bare string is one value, not an iterable of characters.
        if isinstance(wanted, str):
            wanted = [wanted]
        return df[column].isin(list(wanted))

    # Collect the masks in a list.  Keying a dict by the flag values cannot
    # work: every enabled flag is the same key ``True``, so only the last
    # condition would survive.
    conditions = []
    for enabled, column in ((name, 'Name'), (age, 'Age'), (job, 'Job')):
        if enabled:
            conditions.append(non_empty(column))
    for wanted, column in ((selected_name, 'Name'), (selected_job, 'Job')):
        if wanted:
            conditions.append(one_of(column, wanted))

    # AND the masks together, however many there are (including none).
    keep = pd.Series(True, index=df.index)
    for condition in conditions:
        keep &= condition
    return df[keep]
filt(df,name=True, age=True, selected_job=['teacher']) #yeah...for some reason name=True does not seems to work...`
CodePudding user response:
Use DataFrame.dropna
and pass a list of column names (the `subset` argument) to remove rows with missing values in those columns. Because the data here contains empty strings, I first replace them with missing values:
def filt(df, cols) -> pd.DataFrame:
    """Drop the rows of ``df`` that have a missing value in any of ``cols``.

    Args:
        df: Frame to filter.  Empty strings are not treated as missing;
            replace them with NaN beforehand if they should be dropped.
        cols: A list-like of column labels, or a single column label.

    Returns:
        A new DataFrame without the rows that are missing a value in one of
        the given columns.

    Raises:
        KeyError: if a label in ``cols`` is not a column of ``df``.
    """
    # Wrap a lone label explicitly so the call behaves the same whether or
    # not the installed pandas accepts a scalar ``subset``.
    if not pd.api.types.is_list_like(cols):
        cols = [cols]
    return df.dropna(subset=list(cols))
df = filt(df.replace('', np.nan), ['Name', 'Age'])
print (df)
Name Age Job
1 Dave 30.0 teacher
If you need to pass only one column name:
def filt(df, col) -> pd.DataFrame:
    """Drop the rows of ``df`` that have a missing value in column ``col``.

    Args:
        df: Frame to filter.  Empty strings are not treated as missing;
            replace them with NaN beforehand if they should be dropped.
        col: A single column label.  A list of labels is accepted as well
            and is used as-is instead of being nested in another list.

    Returns:
        A new DataFrame without the rows that are missing a value in the
        given column(s).

    Raises:
        KeyError: if ``col`` is not a column of ``df``.
    """
    subset = col if isinstance(col, list) else [col]
    return df.dropna(subset=subset)
df = filt(df.replace('', np.nan), 'Name')
print (df)
Name Age Job
1 Dave 30.0 teacher
2 Tom NaN student
Another idea, if you need to pass the columns as keyword attributes: use a **attrs
parameter and then process the resulting dictionary - e.g. here str.title is used
to match the column names:
def filt(df, **attrs) -> pd.DataFrame:
    """Drop rows with missing values in the columns switched on by keyword.

    Every keyword with a truthy value selects one column, e.g.
    ``filt(df, name=True, age=True)`` drops the rows where ``Name`` or
    ``Age`` is missing.  Keywords with a falsy value are ignored.

    A keyword is matched to a column by its title-cased form first
    (``name`` -> ``Name``); if no such column exists, the match is retried
    case-insensitively so frames whose columns are not title-cased work too.

    Empty strings are not treated as missing; replace them with NaN before
    calling if they should be dropped.

    Returns:
        A new DataFrame without the rows that are missing a value in one of
        the selected columns.

    Raises:
        KeyError: if a truthy keyword matches no column of ``df``.
    """
    by_folded_label = {str(label).casefold(): label for label in df.columns}

    cols = []
    for key, enabled in attrs.items():
        if not enabled:
            continue
        titled = key.title()
        if titled in df.columns:
            cols.append(titled)
        elif key.casefold() in by_folded_label:
            cols.append(by_folded_label[key.casefold()])
        else:
            raise KeyError(f'no column of df matches keyword {key!r}')
    return df.dropna(subset=cols)
df = filt(df.replace('', np.nan), name=True, age=True)
print (df)
Name Age Job
1 Dave 30.0 teacher
EDIT: In your own solution, combine the list of conditions like this:
return df[np.logical_and.reduce(x)]
CodePudding user response:
You can do this:
In [142]: import numpy as np
In [155]: def filt(colname) -> pd.DataFrame:
...: '''
...: Return the rows of the global df whose value in column colname is
...: neither NaN nor an empty string (empty strings are replaced by NaN
...: before the notna() check).
...: If colname is not a column of df, a message string is returned
...: instead of a DataFrame.
...: '''
...: if colname in df:
...: return df[df[f'{colname}'].replace('', np.nan).notna()]
...: return f'{colname} does not exist in df'
...:
In [148]: filt('Age')
Out[148]:
Name Age Job
0 20.0 singer
1 Dave 30.0 teacher
In [149]: filt('Job')
Out[149]:
Name Age Job
0 20.0 singer
1 Dave 30.0 teacher
2 Tom NaN student
In [151]: filt('Name')
Out[151]:
Name Age Job
1 Dave 30.0 teacher
2 Tom NaN student
In [157]: filt('name')
Out[157]: 'name does not exist in df'