Home > Software design >  New column for next occurrence of a variable in R
New column for next occurrence of a variable in R

Time:11-26

Note:

Please note that I have tried the following to solve my problem before posting:

to try to solve my problem, but haven't been successful

Problem

Suppose I have the following data that shows the way that items have flowed from a start to an end

> run = c(1, 2, 3, 3, 4, 5, 5, 5, 6, 7, 7, 7, 8, 9, 10, 10, 11)
> start_location = c("A", "C", "A", "B", "A", "B", "C", "A", "B", "C", "B", "A", "A", "A", "A", "B", "C")
> end_location = c("B", "B", "B", "C", "C", "C", "A", "C", "A", "B", "A", "C", "B", "C", "B", "C", "B")
> df = data.frame(run, start_site, end_site)
> df
   run start_site end_site
1    1          A        B
2    2          A        C
3    3          A        B
4    3          B        C
5    4          A        C
6    5          B        C
7    5          C        A
8    5          A        C
9    6          B        A
10   7          C        B
11   7          B        A
12   7          A        C
13   8          A        B
14   9          A        C
15  10          A        B
16  10          B        C
17  11          C        B

I would like to convert the data into a "wide" format that looks like the following, with a new column for every instance of a stage by the run.

> # Desired result
      run  first_location second_location third_location fourth_location
 [1,] "1"  "A"            "B"             NA             NA             
 [2,] "2"  "C"            "B"             NA             NA             
 [3,] "3"  "A"            "B"             "C"            NA             
 [4,] "4"  "A"            "C"             NA             NA             
 [5,] "5"  "B"            "C"             "A"            "C"            
 [6,] "6"  "C"            "A"             NA             NA             
 [7,] "7"  "C"            "B"             "A"            "C"            
 [8,] "8"  "A"            "B"             NA             NA             
 [9,] "9"  "A"            "C"             NA             NA             
[10,] "10" "A"            "B"             "C"            NA             
[11,] "11" "C"            "B"             NA             NA     

Attempted Solution

I have tried the following but I haven't got the desired result. I have more columns than I need.

> library(dplyr)
> library(tidyr)
>
> # Unsuccessful attempt
> df_long = melt(df, id.vars=c("run"))
> df_long %>%
  select(!variable) %>%
  group_by(run) %>%
  dplyr::mutate(rn = paste0("location_",row_number())) %>%
  spread(rn, value)

# A tibble: 11 x 7
# Groups:   run [11]
     run location_1 location_2 location_3 location_4 location_5 location_6
   <dbl> <chr>      <chr>      <chr>      <chr>      <chr>      <chr>     
 1     1 A          B          NA         NA         NA         NA        
 2     2 A          C          NA         NA         NA         NA        
 3     3 A          B          B          C          NA         NA        
 4     4 A          C          NA         NA         NA         NA        
 5     5 B          C          A          C          A          C         
 6     6 B          A          NA         NA         NA         NA        
 7     7 C          B          A          B          A          C         
 8     8 A          B          NA         NA         NA         NA        
 9     9 A          C          NA         NA         NA         NA        
10    10 A          B          B          C          NA         NA        
11    11 C          B          NA         NA         NA         NA    

Can someone help me figure out my mistake and help me get the desired output please?

Thank you for looking at my post.

CodePudding user response:

The rle and tidyr::unnest_wider based solution.

run = c(1, 2, 3, 3, 4, 5, 5, 5, 6, 7, 7, 7, 8, 9, 10, 10, 11)
start_location = c("A", "C", "A", "B", "A", "B", "C", "A", "B", "C", "B", "A", "A", "A", "A", "B", "C")
end_location = c("B", "B", "B", "C", "C", "C", "A", "C", "A", "B", "A", "C", "B", "C", "B", "C", "B")
df = data.frame(run = run, from = start_location, to = end_location)

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(tidyr)

df %>% group_by(run) %>% 
  summarise(location = list(rle(as.vector(t(cbind(from, to))))$values)) %>%
  unnest_wider(location, names_sep = "_")
#> # A tibble: 11 × 5
#>      run location_1 location_2 location_3 location_4
#>    <dbl> <chr>      <chr>      <chr>      <chr>     
#>  1     1 A          B          <NA>       <NA>      
#>  2     2 C          B          <NA>       <NA>      
#>  3     3 A          B          C          <NA>      
#>  4     4 A          C          <NA>       <NA>      
#>  5     5 B          C          A          C         
#>  6     6 B          A          <NA>       <NA>      
#>  7     7 C          B          A          C         
#>  8     8 A          B          <NA>       <NA>      
#>  9     9 A          C          <NA>       <NA>      
#> 10    10 A          B          C          <NA>      
#> 11    11 C          B          <NA>       <NA>

Created on 2022-11-25 with reprex v2.0.2

CodePudding user response:

First you group by the run, then you pivot the end locations wider, and lastly you summarize the first non-NA value for each location to get rid of the overlap.

library(tidyverse)

df |>
  group_by(run) |>
  mutate(location = paste("location", row_number() 1, sep= "_")) |>
  pivot_wider(names_from = location, values_from = end_location) |>
  rename(location_1 = start_location) |>
  summarise(across(starts_with("location"), ~first(.[!is.na(.)])))
#> # A tibble: 11 x 5
#>      run location_1 location_2 location_3 location_4
#>    <dbl> <chr>      <chr>      <chr>      <chr>     
#>  1     1 A          B          <NA>       <NA>      
#>  2     2 C          B          <NA>       <NA>      
#>  3     3 A          B          C          <NA>      
#>  4     4 A          C          <NA>       <NA>      
#>  5     5 B          C          A          C         
#>  6     6 B          A          <NA>       <NA>      
#>  7     7 C          B          A          C         
#>  8     8 A          B          <NA>       <NA>      
#>  9     9 A          C          <NA>       <NA>      
#> 10    10 A          B          C          <NA>      
#> 11    11 C          B          <NA>       <NA>

Here is another option that I think is a little simpler:

df |>
  group_by(run) |>
  summarise(location = list(c(first(start_location), end_location))) |>
  unnest_wider(location, names_sep = "_")
#> # A tibble: 11 x 5
#>      run location_1 location_2 location_3 location_4
#>    <dbl> <chr>      <chr>      <chr>      <chr>     
#>  1     1 A          B          <NA>       <NA>      
#>  2     2 C          B          <NA>       <NA>      
#>  3     3 A          B          C          <NA>      
#>  4     4 A          C          <NA>       <NA>      
#>  5     5 B          C          A          C         
#>  6     6 B          A          <NA>       <NA>      
#>  7     7 C          B          A          C         
#>  8     8 A          B          <NA>       <NA>      
#>  9     9 A          C          <NA>       <NA>      
#> 10    10 A          B          C          <NA>      
#> 11    11 C          B          <NA>       <NA>
  • Related