R: Make a new column filled from value in a row-CodePudding

I have multiple dataframes all structured as below but with thousands of observations.

(df <- data.frame(
col1 = c("Elem_A", "String", "String", "String", "Elem_A", "String", "String", "Elem_A", "String", "String", "String", "String"), 
col2 = c("DOI_1", "String", "String", "String", "DOI_2", "String", "String", "DOI_3", "String", "String", "String", "String")))

#>     col1   col2
#> 1 Elem_A  DOI_1
#> 2 String String
#> 3 String String
#> 4 String String
#> 5 Elem_A  DOI_2
#> 6 String String
#> 7 String String
#> 8 Elem_A  DOI_3
#> 9 String String
#> 10 String String
#> 11 String String
#> 12 String String

I am wanting to structure it as below, pulling specifically the value that begins with "DOI" into a new column and filling that value down until it reaches the row with the next "DOI" value.

(df <- data.frame(
col1 = c("Elem_A", "String", "String", "String", 
    "Elem_A", "String", "String", "Elem_A", "String", "String", "String", "String", "String", "String", "String"), 
col2 = c("DOI_1", 
    "String", "String", "String", "DOI_2", "String", "String", 
    "DOI_3", "String", "String", "String", "String", "String", "String", "String"), 
col3 = c("DOI_1", "DOI_1", "DOI_1", "DOI_1", 
    "DOI_2", "DOI_2", "DOI_2", "DOI_3", "DOI_3", "DOI_3", "DOI_3", "DOI_3")))

#>     col1   col2  col3
#> 1 Elem_A  DOI_1 DOI_1
#> 2 String String DOI_1
#> 3 String String DOI_1
#> 4 String String DOI_1
#> 5 Elem_A  DOI_2 DOI_2
#> 6 String String DOI_2
#> 7 String String DOI_2
#> 8 Elem_A  DOI_3 DOI_3
#> 9 String String DOI_3
#> 10 String String DOI_3
#> 11 String String DOI_3
#> 12 String String DOI_3

I was thinking I should somehow incorporate str_detect but the issue is that sometimes "DOI" is also the beginning of some of the "Strings" values within the same column that the "DOI" values are in.

CodePudding user response：

We can use str_detect with case_when/ifelse to retrieve the row element and then use fill to fill the NA values with the previous non-NA

library(dplyr)
library(tidyr)
library(stringr)
df <- df %>%
   mutate(col3 = case_when(str_detect(col2, "DOI_") ~ col2)) %>% 
   fill(col3)

-output

df
 col1   col2  col3
1  Elem_A  DOI_1 DOI_1
2  String String DOI_1
3  String String DOI_1
4  String String DOI_1
5  Elem_A  DOI_2 DOI_2
6  String String DOI_2
7  String String DOI_2
8  Elem_A  DOI_3 DOI_3
9  String String DOI_3
10 String String DOI_3
11 String String DOI_3
12 String String DOI_3

If 'DOI_\\d ' is a substring, then use str_extract to extract the substring

df <- df %>%
        mutate(col3 = str_extract(col2, "DOI_\\d ")) %>%
        fill(col3)

-output

df
 col1   col2  col3
1  Elem_A  DOI_1 DOI_1
2  String String DOI_1
3  String String DOI_1
4  String String DOI_1
5  Elem_A  DOI_2 DOI_2
6  String String DOI_2
7  String String DOI_2
8  Elem_A  DOI_3 DOI_3
9  String String DOI_3
10 String String DOI_3
11 String String DOI_3
12 String String DOI_3

CodePudding user response：

With Base R way

s <- unlist(gregexpr("DOI_\\d " , df$col2))

df$col3 <- unlist(Map(\(x,y) rep(df$col2[x] ,length.out = y   1) ,
                  which(s > -1) , rle(s)$lengths[which(rle(s)$values == -1)]))

output

     col1   col2  col3
1  Elem_A  DOI_1 DOI_1
2  String String DOI_1
3  String String DOI_1
4  String String DOI_1
5  Elem_A  DOI_2 DOI_2
6  String String DOI_2
7  String String DOI_2
8  Elem_A  DOI_3 DOI_3
9  String String DOI_3
10 String String DOI_3
11 String String DOI_3
12 String String DOI_3