I have the following data.frame, in which I am trying to create a new field according to the rules below.
tibble [212 × 9] (S3: tbl_df/tbl/data.frame)
$ Observation : num [1:212] 1 2 3 4 5 6 7 8 9 10 ...
$ Gender : Factor w/ 2 levels "0","1": 2 2 1 1 1 2 2 2 1 1 ...
$ Education : Factor w/ 3 levels "Bachelors","Masters",..: 2 2 3 1 2 3 3 1 2 2 ...
$ Salary : num [1:212] 64233855 7955556 97531875 89785395 6956943 ...
$ Graduation : Date[1:212], format: "2015-09-22" "2020-06-15" "2008-05-07" ...
$ License : logi [1:212] TRUE FALSE TRUE FALSE FALSE TRUE ...
$ Expenses : num [1:212] 3356768 247988 274816 2447352 4069344 ...
$ Satisfaction: Factor w/ 5 levels "1","2","3","4",..: 3 1 4 3 5 2 3 2 4 3 ...
$ Stress : Factor w/ 2 levels "No","Yes": 2 2 1 1 2 2 1 2 1 1 ...
Rules
- a. Increase the current Salary by 15% for females whose Satisfaction score is either 2 or 3 and whose Stress is 'Yes'.
- b. Increase the current Salary by 7.5% for males whose Satisfaction score is either 1 or 2 and whose Stress is 'No'.
- c. For the rest of the rows, do nothing.
I have tried to write an if statement to solve it, but I get an error message:
Error: unexpected '&' in: " if (df$Gender[j]=='female') &"
# Asker's original attempt, reproduced verbatim. It does not parse: the
# parenthesis after 'female' ends the `if` condition, so the `&` that starts
# the following line is unexpected (this is the error quoted above).
for (j in 1:i[1]) {
if (df$Gender[j]=='female')
& df$Satisfaction[j] 2 | 3 & df$Stress[j] == 'Yes'
df$SalaryNew[j] <- df$Salary[j]*1.15
else if (df$Gender[j]=='male'), & df$Satisfaction[j] 2 | 3 & df$Stress[j] == 'No'
df$SalaryNew[j] <- df$Salary[j]*1.075
else
df$SalaryNew[j] <- df$Salary[j]
}
Structure
# dput() of a 10-row sample with the same nine columns as the full data.
structure(
  list(
    Observation = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
    Gender = structure(
      c(2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L),
      levels = c("0", "1"),
      class = "factor"
    ),
    Education = structure(
      c(2L, 2L, 3L, 1L, 2L, 3L, 3L, 1L, 2L, 2L),
      levels = c("Bachelors", "Masters", "PhD"),
      class = "factor"
    ),
    Salary = c(
      64233855, 7955556, 97531875, 89785395, 6956943,
      12445419, 54293295, 109647195, 113335215, 8171793
    ),
    Graduation = structure(
      c(16700, 18428, 14006, 11782, 15333, 13879, 18873, 19085, 13067, 13529),
      class = "Date"
    ),
    License = c(
      TRUE, FALSE, TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, FALSE
    ),
    Expenses = c(
      3356768, 247988, 274816, 2447352, 4069344,
      244264, 3398872, 2901072, 3346736, 2358584
    ),
    Satisfaction = structure(
      c(3L, 1L, 4L, 3L, 5L, 2L, 3L, 2L, 4L, 3L),
      levels = c("1", "2", "3", "4", "5"),
      class = "factor"
    ),
    Stress = structure(
      c(2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L),
      levels = c("No", "Yes"),
      class = "factor"
    )
  ),
  row.names = c(NA, -10L),
  class = c("tbl_df", "tbl", "data.frame")
)
CodePudding user response:
There are some problems with your syntax, such as where the parentheses close and how some operators are used, so I wrote it in a way that makes it easier to see where each parenthesis closes and how each operator is used:
# Add SalaryNew: Salary raised by 15% (rule a) or 7.5% (rule b), otherwise
# unchanged. The rules are evaluated on whole columns instead of row by row,
# so no loop or scalar `if` is needed and a zero-row data frame works too.

# NOTE(review): the sample data encodes Gender as "0"/"1" rather than
# "female"/"male" -- set these two labels to whichever codes apply.
female <- "female"
male <- "male"

# Rule a: female, Satisfaction 2 or 3, Stress "Yes".
# `%in%` compares against the factor labels and yields FALSE (not NA) for
# missing values, so such rows simply keep their salary.
rule_a <-
  (df$Gender %in% female) &
  (df$Satisfaction %in% c("2", "3")) &
  (df$Stress %in% "Yes")

# Rule b: male, Satisfaction 1 or 2, Stress "No".
rule_b <-
  (df$Gender %in% male) &
  (df$Satisfaction %in% c("1", "2")) &
  (df$Stress %in% "No")

# The multiplier is 1 unless a rule matches; the two rules cannot overlap
# because they require different Gender values.
raise <- rep(1, nrow(df))
raise[rule_a] <- 1.15
raise[rule_b] <- 1.075

df$SalaryNew <- df$Salary * raise