Home > OS >  Replacing levels of a factor in R - Variable 'income' expressed as ranges in different currencies
Replacing levels of a factor in R - Variable 'income' expressed as ranges in different currencies

Time:03-14

I am working with survey data where observations have been collected across 7 different countries. Since I am not interested in studying country-specific differences, I'd like to reduce the levels of the factor for my variable "income" by using a single currency. This is an example of the levels I have.

800 Pounds or less
800 – 1,600 Pounds
1,600 – 2,400 Pounds
2,400 – 3,200 Pounds
3,200 – 4,000 Pounds
4,000 – 4,800 Pounds
4,800 – 5,600 Pounds
5,600 Pounds or more

And the same for Euro and CHF. What I am trying to do is to convert everything to Euro. I've tried with:

# Recode the income brackets of all currencies to the Euro brackets.
# Work on plain strings so values can be overwritten freely; the column is
# turned back into a factor at the end.
data$demo_income <- as.character(data$demo_income)

# One statement per bracket: every Pound / CHF / Euro label of that bracket is
# replaced by the Euro label. The comparison is an exact string match, so each
# literal below has to equal the stored value character for character
# (including the dash and the thousands separator).
data$demo_income[data$demo_income == "800 Pounds or less" | data$demo_income == "1.000 CHF or less" |data$demo_income ==  "1,000 Euro or less"] <- "1,000 Euro or less"
data$demo_income[data$demo_income == "800 - 1,600 Pounds" | data$demo_income == "1.000 - 2.000 CHF" | data$demo_income == "1,000 - 2,000 Euro"] <- "1,000 - 2,000 Euro"
data$demo_income[data$demo_income == "1,600 - 2,400 Pounds" | data$demo_income == "2.000 - 3.000 CHF" | data$demo_income == "2,000 - 3,000 Euro"] <- "2,000 - 3,000 Euro"
data$demo_income[data$demo_income == "2,400 - 3,200 Pounds" | data$demo_income == "3.000 - 4.000 CHF"|data$demo_income ==  "3,000 - 4,000 Euro"] <- "3,000 - 4,000 Euro"
data$demo_income[data$demo_income == "3,200 - 4,000 Pounds" | data$demo_income == "4.000 - 5.000 CHF"|data$demo_income ==  "4,000 - 5,000 Euro"] <- "4,000 - 5,000 Euro"
data$demo_income[data$demo_income == "4,000 - 4,800 Pounds" | data$demo_income == "5.000 - 6.000 CHF"|data$demo_income ==  "5,000 - 6,000 Euro"] <- "5,000 - 6,000 Euro"
data$demo_income[data$demo_income == "4,800 - 5,600 Pounds" | data$demo_income == "6.000 - 7.000 CHF"|data$demo_income ==  "6,000 - 7,000 Euro"] <- "6,000 - 7,000 Euro"
data$demo_income[data$demo_income =="5,600 Pounds or more" | data$demo_income =="7.000 CHF or more"|data$demo_income ==  "7,000 Euro or more"] <- "7,000 Euro or more"

# Inspect the resulting counts, then convert the column back to a factor.
table(data$demo_income)
data$demo_income <- as.factor(data$demo_income)

But it does not work. I also tried recode_factor or

# Merge the Pound and CHF brackets into the matching Euro bracket.
# In the list form of `levels<-`, each name is a new level and each value is
# the vector of old levels merged into it. The old-level strings have to match
# the existing levels character for character (including the dash).
# The top CHF bracket is "7.000 CHF or more"; "6.000 - 7.000 CHF" belongs
# only to the "6,000 - 7,000 Euro" bracket.
levels(WB.Data$demo_income) <- list(
  "1,000 Euro or less" = c("800 Pounds or less", "1.000 CHF or less", "1,000 Euro or less"),
  "1,000 - 2,000 Euro" = c("800 - 1,600 Pounds", "1.000 - 2.000 CHF", "1,000 - 2,000 Euro"),
  "2,000 - 3,000 Euro" = c("1,600 - 2,400 Pounds", "2.000 - 3.000 CHF", "2,000 - 3,000 Euro"),
  "3,000 - 4,000 Euro" = c("2,400 - 3,200 Pounds", "3.000 - 4.000 CHF", "3,000 - 4,000 Euro"),
  "4,000 - 5,000 Euro" = c("3,200 - 4,000 Pounds", "4.000 - 5.000 CHF", "4,000 - 5,000 Euro"),
  "5,000 - 6,000 Euro" = c("4,000 - 4,800 Pounds", "5.000 - 6.000 CHF", "5,000 - 6,000 Euro"),
  "6,000 - 7,000 Euro" = c("4,800 - 5,600 Pounds", "6.000 - 7.000 CHF", "6,000 - 7,000 Euro"),
  "7,000 Euro or more" = c("5,600 Pounds or more", "7.000 CHF or more", "7,000 Euro or more")
)

But nothing.

CodePudding user response:

forcats::fct_collapse() is well-suited to this:

library(dplyr)
library(forcats)

# fct_collapse() operates on a factor (or character vector), not on a data
# frame, so it is applied to the column inside mutate().
# Each argument is `new level = c(old levels)`; the old-level strings have to
# match the existing levels exactly, including the dash character.
data <- data %>%
  mutate(
    demo_income = fct_collapse(
      demo_income,
      "1,000 Euro or less" = c("800 Pounds or less", "1.000 CHF or less", "1,000 Euro or less"),
      "1,000 - 2,000 Euro" = c("800 - 1,600 Pounds", "1.000 - 2.000 CHF", "1,000 - 2,000 Euro"),
      "2,000 - 3,000 Euro" = c("1,600 - 2,400 Pounds", "2.000 - 3.000 CHF", "2,000 - 3,000 Euro"),
      "3,000 - 4,000 Euro" = c("2,400 - 3,200 Pounds", "3.000 - 4.000 CHF", "3,000 - 4,000 Euro"),
      "4,000 - 5,000 Euro" = c("3,200 - 4,000 Pounds", "4.000 - 5.000 CHF", "4,000 - 5,000 Euro"),
      "5,000 - 6,000 Euro" = c("4,000 - 4,800 Pounds", "5.000 - 6.000 CHF", "5,000 - 6,000 Euro"),
      "6,000 - 7,000 Euro" = c("4,800 - 5,600 Pounds", "6.000 - 7.000 CHF", "6,000 - 7,000 Euro"),
      "7,000 Euro or more" = c("5,600 Pounds or more", "7.000 CHF or more", "7,000 Euro or more")
    )
  )

CodePudding user response:

It seems that your data and your code use different characters for the dash, so the corresponding entries are not matched in R. For example, comparing the 800 – 1,600 Pounds from your example levels against the 800 - 1,600 Pounds from your code gives FALSE, because the left side uses an en dash (–, U+2013), whereas the right side uses the ASCII hyphen-minus (-, U+002D), which is slightly shorter:

"800 – 1,600 Pounds" == "800 - 1,600 Pounds"
[1] FALSE

If the same character is used, your code should work. E.g.:

Example data:

# Example data: one row per income bracket and currency (8 brackets x 3
# currencies), all written with the ASCII hyphen-minus.
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 data.frame() keeps
# strings as character by default, in which case levels() would return NULL.
data <- data.frame(
  demo_income = c(
    "800 Pounds or less",
    "800 - 1,600 Pounds",
    "1,600 - 2,400 Pounds",
    "2,400 - 3,200 Pounds",
    "3,200 - 4,000 Pounds",
    "4,000 - 4,800 Pounds",
    "4,800 - 5,600 Pounds",
    "5,600 Pounds or more",

    "1.000 CHF or less",
    "1.000 - 2.000 CHF",
    "2.000 - 3.000 CHF",
    "3.000 - 4.000 CHF",
    "4.000 - 5.000 CHF",
    "5.000 - 6.000 CHF",
    "6.000 - 7.000 CHF",
    "7.000 CHF or more",

    "1,000 Euro or less",
    "1,000 - 2,000 Euro",
    "2,000 - 3,000 Euro",
    "3,000 - 4,000 Euro",
    "4,000 - 5,000 Euro",
    "5,000 - 6,000 Euro",
    "6,000 - 7,000 Euro",
    "7,000 Euro or more"
  ),
  stringsAsFactors = TRUE
)
# Number of distinct income levels before recoding.
length(levels(data$demo_income))
24

After applying your code (but using the same "-" character as the data):

# Same recoding as in the question, written with %in% (set membership) in
# place of the chained == comparisons. Recode on plain strings first; the
# column is converted back to a factor at the end.
data$demo_income <- as.character(data$demo_income)

# Each step replaces every label of one bracket with its Euro label.
data$demo_income[
  data$demo_income %in% c("800 Pounds or less", "1.000 CHF or less", "1,000 Euro or less")
] <- "1,000 Euro or less"
data$demo_income[
  data$demo_income %in% c("800 - 1,600 Pounds", "1.000 - 2.000 CHF", "1,000 - 2,000 Euro")
] <- "1,000 - 2,000 Euro"
data$demo_income[
  data$demo_income %in% c("1,600 - 2,400 Pounds", "2.000 - 3.000 CHF", "2,000 - 3,000 Euro")
] <- "2,000 - 3,000 Euro"
data$demo_income[
  data$demo_income %in% c("2,400 - 3,200 Pounds", "3.000 - 4.000 CHF", "3,000 - 4,000 Euro")
] <- "3,000 - 4,000 Euro"
data$demo_income[
  data$demo_income %in% c("3,200 - 4,000 Pounds", "4.000 - 5.000 CHF", "4,000 - 5,000 Euro")
] <- "4,000 - 5,000 Euro"
data$demo_income[
  data$demo_income %in% c("4,000 - 4,800 Pounds", "5.000 - 6.000 CHF", "5,000 - 6,000 Euro")
] <- "5,000 - 6,000 Euro"
data$demo_income[
  data$demo_income %in% c("4,800 - 5,600 Pounds", "6.000 - 7.000 CHF", "6,000 - 7,000 Euro")
] <- "6,000 - 7,000 Euro"
data$demo_income[
  data$demo_income %in% c("5,600 Pounds or more", "7.000 CHF or more", "7,000 Euro or more")
] <- "7,000 Euro or more"

# Show the counts per bracket, restore the factor type, and count the levels.
table(data$demo_income)
data$demo_income <- as.factor(data$demo_income)
length(levels(data$demo_income))

1,000 - 2,000 Euro 1,000 Euro or less 2,000 - 3,000 Euro 3,000 - 4,000 Euro 
                 3                  3                  3                  3 
4,000 - 5,000 Euro 5,000 - 6,000 Euro 6,000 - 7,000 Euro 7,000 Euro or more 
                 3                  3                  3                  3 
8
  •  Tags:  
  • r
  • Related