Home > Mobile >  Add a column based on two different columns, each with multiple different names
Add a column based on two different columns, each with multiple different names

Time:10-07

I have a dataset that includes a stage number and a machine number - a small portion is reproduced below. However, in actuality, the full dataset includes 38 stages and is over 1 million rows long.

stage <- c("Stg1",  "Stg1","Stg1","Stg1","Stg1","Stg1","Stg1","Stg1","Stg1","Stg1","Stg1","Stg1",   "Stg2", "Stg2", "Stg2","Stg2","Stg2","Stg2","Stg2","Stg2","Stg2","Stg2","Stg10","Stg10","Stg10")
machine <- c("132H", "132H","132H", "132H", "132H", "212H", "212H", "212H", "212H", "212H", "217H", "217H", "132H", "132H", "212H", "212H", "212H", "212H", "212H", "217H", "217H", "217H", "132H", "132H", "132H")

df <- data.frame(stage,machine)

head(df)
  stage machine
1  Stg1    132H
2  Stg1    132H
3  Stg1    132H
4  Stg1    132H
5  Stg1    132H
6  Stg1    212H

My goal is to create a new column that will sequentially assign numbers to grouped stages and machines. Ultimately, the code that will produce an output like this:

Stage   Machine JobStage
Stg1    132H    1
Stg1    132H    1
Stg1    132H    1
Stg1    132H    1
Stg1    132H    1
Stg1    212H    2
Stg1    212H    2
Stg1    212H    2
Stg1    212H    2
Stg1    212H    2
Stg1    217H    3
Stg1    217H    3
Stg2    132H    4
Stg2    132H    4
Stg2    212H    5
Stg2    212H    5
Stg2    212H    5
Stg2    212H    5
Stg2    212H    5
Stg2    217H    6
Stg2    217H    6
Stg2    217H    6
Stg10   132H    7
Stg10   132H    7
Stg10   132H    7

I am aware you can do something like this for each stage and each machine, but it is time consuming especially for a large dataset:

df$JobStage[df$stage == "Stg1" & df$machine == "132H"] <- 1
df$JobStage[df$stage == "Stg1" & df$machine == "212H"] <- 2
...

I was trying to use dplyr with group_by() and mutate(), but I am not sure how to capture the different stages and machines properly and assign it a number. I know that unique() doesn't work for character values, but maybe the code would be something like this:

df %>% group_by(stage, machine) %>% mutate(JobStage = unique(stage) & unique(machine))

Any help would be very appreciated. Thank you.

CodePudding user response:

Here is an option with unite and match i.e. unite the columns 'stage', 'machine' to create a new pasted column and get the index by matching the elements with the unique values

library(dplyr)
library(tidyr)
df %>% 
   unite(JobStage, stage, machine, remove = FALSE) %>% 
   mutate(JobStage = match(JobStage, unique(JobStage)))

-output

    JobStage stage machine
1         1  Stg1    132H
2         1  Stg1    132H
3         1  Stg1    132H
4         1  Stg1    132H
5         1  Stg1    132H
6         2  Stg1    212H
7         2  Stg1    212H
8         2  Stg1    212H
9         2  Stg1    212H
10        2  Stg1    212H
11        3  Stg1    217H
12        3  Stg1    217H
13        4  Stg2    132H
14        4  Stg2    132H
15        5  Stg2    212H
16        5  Stg2    212H
17        5  Stg2    212H
18        5  Stg2    212H
19        5  Stg2    212H
20        6  Stg2    217H
21        6  Stg2    217H
22        6  Stg2    217H

CodePudding user response:

One option:

library(tidyverse)
df %>% 
  mutate(stag_mach = str_c(stage, machine),
         JobStage = as.integer(factor(stag_mach))) %>% 
  select(-stag_mach)
#    stage machine JobStage
# 1   Stg1    132H        1
# 2   Stg1    132H        1
# 3   Stg1    132H        1
# 4   Stg1    132H        1
# 5   Stg1    132H        1
# 6   Stg1    212H        2
# 7   Stg1    212H        2
# 8   Stg1    212H        2
# 9   Stg1    212H        2
# 10  Stg1    212H        2
# 11  Stg1    217H        3
# 12  Stg1    217H        3
# 13  Stg2    132H        4
# 14  Stg2    132H        4
# 15  Stg2    212H        5
# 16  Stg2    212H        5
# 17  Stg2    212H        5
# 18  Stg2    212H        5
# 19  Stg2    212H        5
# 20  Stg2    217H        6
# 21  Stg2    217H        6
# 22  Stg2    217H        6

EDIT: And if you want to keep the order:

df %>% 
  mutate(stag_mach = str_c(stage, machine),
         JobStage = as.integer(fct_inorder(stag_mach))) %>% 
  select(-stag_mach)
#    stage machine JobStage
# 1   Stg1    132H        1
# 2   Stg1    132H        1
# 3   Stg1    132H        1
# 4   Stg1    132H        1
# 5   Stg1    132H        1
# 6   Stg1    212H        2
# 7   Stg1    212H        2
# 8   Stg1    212H        2
# 9   Stg1    212H        2
# 10  Stg1    212H        2
# 11  Stg1    217H        3
# 12  Stg1    217H        3
# 13  Stg2    132H        4
# 14  Stg2    132H        4
# 15  Stg2    212H        5
# 16  Stg2    212H        5
# 17  Stg2    212H        5
# 18  Stg2    212H        5
# 19  Stg2    212H        5
# 20  Stg2    217H        6
# 21  Stg2    217H        6
# 22  Stg2    217H        6
# 23 Stg10    132H        7
# 24 Stg10    132H        7
# 25 Stg10    132H        7
  • Related