I want to create a new column SURV
in the clin
dataframe based on clin["days_to_death"]
values, whereby:
- 'lts' if NA or more than or equal to 2*365
- 'non-lts' if condition not met (i.e., less than 2*365)
My code below labeled all the values as 'lts', even when less than 2*365.
clin
dataframe:
clin = pd.DataFrame([[1, '45', '44', 1, nan],
[1, '121', '68', 0, nan],
[1, '466', '47', 0, '90'],
[1, '357', '54', 1, '80'],
[1, '108', '72', 1, '60'],
[1, '254', '51', 0, '80'],
[1, '138', '78', 1, '80'],
[0, nan, '67', 0, '60'],
[0, nan, '61', 0, '80'],
[0, nan, '60', 0, '100'],
[0, nan, '23', 1, '80'],
[0, nan, '45', 1, '80'],
[1, '83', '75', 1, '60'],
[1, '114', '58', 0, nan],
[0, nan, '45', 1, '100'],
[0, nan, '63', 0, '40'],
[1, '159', '64', 1, '80'],
[0, nan, '64', 0, '40'],
[0, nan, '65', 0, '80'],
[0, nan, '53', 1, nan],
[0, nan, '58', 1, nan],
[0, nan, '76', 0, '100'],
[0, nan, '60', 0, nan],
[0, nan, '21', 1, '90'],
[1, '57', '78', 1, nan],
[1, '95', '79', 0, nan],
[1, '78', '53', 1, '60'],
[1, '444', '64', 0, '70'],
[0, nan, '30', 1, '100'],
[1, '454', '60', 1, nan],
[1, '98', '56', 1, '80'],
[1, '62', '59', 0, '80'],
[1, '460', '49', 0, '100'],
[1, '364', '70', 0, '60'],
[1, '29', '49', 0, nan],
[1, '88', '60', 0, '40'],
[1, '485', '60', 1, '60'],
[1, '42', '52', 0, '80'],
[1, '975', '58', 1, '80'],
[0, nan, '57', 1, '80'],
[0, nan, '36', 0, nan],
[1, '202', '47', 1, '40'],
[1, '523', '52', 1, '100'],
[1, '244', '76', 1, '80'],
[1, '575', '62', 0, '60'],
[1, '144', '58', 0, nan],
[1, '368', '72', 1, '60'],
[1, '54', '83', 0, nan],
[1, '684', '53', 1, '80'],
[1, '428', '56', 0, '80'],
[1, '511', '77', 1, '80'],
[1, '455', '56', 0, '80'],
[0, nan, '39', 0, nan],
[0, nan, '40', 0, '100'],
[0, nan, '74', 0, '60'],
[1, '270', '72', 1, nan],
[1, '577', '78', 1, nan],
[0, nan, '47', 0, nan],
[1, '593', '69', 1, nan],
[1, '36', '72', 0, nan],
[1, '585', '66', 0, nan],
[1, '460', '59', 1, nan],
[1, '379', '73', 1, nan],
[0, nan, '61', 0, nan],
[0, nan, '69', 0, nan],
[0, nan, '54', 1, '70'],
[1, '105', '67', 0, '100'],
[0, nan, '73', 1, '70'],
[0, nan, '51', 0, '90'],
[0, nan, '58', 0, '100'],
[0, nan, '77', 0, '100'],
[0, nan, '55', 1, '80'],
[1, '146', '76', 0, '100'],
[1, '138', '68', 0, '80'],
[1, '535', '58', 0, '80'],
[1, '94', '85', 1, '80'],
[1, '111', '76', 1, '80'],
[1, '279', '70', 1, '80'],
[1, '1458', '50', 1, '80'],
[1, '77', '66', 1, '80'],
[1, '1121', '52', 1, '80'],
[1, '508', '57', 0, '80'],
[1, '100', '74', 0, '80'],
[1, '82', '78', 0, '60'],
[1, '519', '63', 0, '80'],
[1, '254', '64', 1, '80'],
[1, '638', '60', 0, '80'],
[1, '147', '66', 0, '60'],
[1, '153', '74', 0, '80'],
[1, '727', '54', 0, '60'],
[1, '1048', '58', 0, '80'],
[1, '567', '44', 1, '80'],
[0, nan, '49', 1, '80'],
[1, '180', '68', 0, nan],
[1, '191', '72', 1, nan],
[0, nan, '51', 0, nan],
[1, '625', '55', 0, nan],
[1, '1448', '63', 1, '60'],
[1, '375', '68', 0, nan],
[1, '399', '65', 0, '100'],
[1, '317', '62', 0, '80'],
[1, '225', '72', 1, nan],
[1, '360', '47', 0, '100'],
[1, '603', '31', 0, '100'],
[1, '717', '39', 1, nan],
[1, '414', '81', 1, nan],
[0, nan, '49', 1, '100'],
[1, '164', '58', 0, '80'],
[1, '3667', '64', 1, nan],
[1, '224', '76', 1, nan],
[1, '24', '61', 0, '40'],
[1, '1537', '21', 0, nan],
[1, '666', '48', 1, nan],
[1, '141', '51', 0, '80'],
[0, nan, '43', 0, '80'],
[0, nan, '59', 0, '80'],
[0, nan, '74', 0, '60'],
[0, nan, '65', 1, '60'],
[0, nan, '57', 0, '80'],
[0, nan, '65', 1, '80'],
[1, '454', '72', 0, '80'],
[1, '343', '71', 0, '70'],
[1, '544', '52', 0, '70'],
[0, nan, '66', 1, nan],
[0, nan, '42', 0, '100'],
[0, nan, '72', 0, '80'],
[1, '713', '53', 0, '70'],
[1, '335', '62', 1, '90'],
[0, nan, '52', 0, '80'],
[1, '157', '63', 0, nan],
[0, nan, '47', 1, '70'],
[1, '388', '67', 0, '90'],
[1, '165', '60', 0, nan],
[1, '346', '57', 0, '80'],
[1, '165', '71', 1, '0'],
[1, '114', '73', 0, '0'],
[1, '49', '64', 0, '40'],
[0, nan, '33', 0, nan],
[0, nan, '50', 1, nan]], columns=['vital_status', 'days_to_death', 'age_at_initial_pathologic_diagnosis',
'gender', 'karnofsky_performance_score'], index="bcr_patient_barcode")
My attempt:
import numpy as np
import pandas as pd
# Survival info
def survival(clin):
if np.where(clin["days_to_death"],np.nan,1):
val = "lts"
elif clin["days_to_death"].astype(int) >= 2*365:
val = "lts"
else:
val = "non-lts"
return val
clin['SURV'] = clin.apply(survival, axis=1)
Then I want to drop the NAs in all the columns except for the days_to_death
column.
CodePudding user response:
To create clin['SURV']
column as you want, you can use numpy.select
like below:
vals = clin['days_to_death'].astype(np.float32)
condlist = [vals>=2*365, vals<2*365]
choicelist = ['Its', 'nonIts']
clin['SURV'] = np.select(condlist, choicelist, 'Its')
To remove row that contain np.nan
you can use dropna(subset=[...])
like below:
clin = clin.dropna(subset=['vital_status', 'age_at_initial_pathologic_diagnosis','gender', 'karnofsky_performance_score'])
CodePudding user response:
Use .loc
clin.loc[clin["days_to_death"].isna() | (clin["days_to_death"].map(float) >= 2*365), 'SURV'] = 'lts'
clin.loc[clin["SURV"].isna(), 'SURV'] = 'non-lts'
Then remove nans
clin = clin.dropna(subset=['vital_status', 'age_at_initial_pathologic_diagnosis', 'gender', 'karnofsky_performance_score'])