Home > Software engineering >  Splitting multiple columns using mutate - dplyr
Splitting multiple columns using mutate - dplyr

Time:07-05

I'm trying to split multiple columns, but struggling to do this efficiently.

I have a df:

> dput(df)
structure(list(UNIQUE_PATIENT_ID = c("DIS-1101-1001-E1", "DIS-1101-1002-E1", 
"DIS-1101-1003-E1", "DIS-1101-1004-E1", "DIS-1101-1004-E2", "DIS-1101-1005-E1", 
"DIS-1101-1006-E1", "DIS-1101-1007-E1", "DIS-1101-1008-E1", "DIS-1101-1009-E1", 
"DIS-1101-1010-E1", "DIS-1101-1011-E1", "DIS-1101-1012-E1", "DIS-1101-1013-E1", 
"DIS-1101-1014-E1", "DIS-1101-1015-E1", "DIS-1101-1016-E1", "DIS-1101-1017-E1", 
"DIS-1101-1018-E1", "DIS-1101-1019-E1", "DIS-1101-1020-E1", "DIS-1101-1021-E1", 
"DIS-1101-1022-E1", "DIS-1101-1023-E1", "DIS-1101-1025-E1", "DIS-1101-1026-E1", 
"DIS-1101-1027-E1", "DIS-1101-1028-E1", "DIS-1101-1029-E1", "DIS-1101-1030-E1", 
"DIS-1101-1031-E1", "DIS-1101-1032-E1", "DIS-1101-1033-E1", "DIS-1101-1034-E1", 
"DIS-1101-1035-E1", "DIS-1101-1036-E1", "DIS-1101-1037-E1", "DIS-1101-1038-E1", 
"DIS-1101-1039-E1", "DIS-1101-1040-E1", "DIS-1101-1041-E1", "DIS-1101-1042-E1", 
"DIS-1101-1042-E2", "DIS-1101-1043-E1", "DIS-1101-1044-E1", "DIS-1101-1045-E1", 
"DIS-1101-1046-E1", "DIS-1101-1047-E1", "DIS-1101-1048-E1", "DIS-1101-1049-E1"
), `TIME_POINT_1=HAEMOGLOBIN` = c("BASELINE=123.00;FIRST=117.00", 
"FIRST=92.00", "FIRST=111.00", "BASELINE=125.00;FIRST=113.00", 
"FIRST=124.00", "NULL=NULL", "NULL=NULL", "FIRST=77.00", "FIRST=132.00", 
"FIRST=123.00", "BASELINE=119.00;FIRST=93.00", "BASELINE=163.00;FIRST=141.00", 
"FIRST=137.00", "NULL=NULL", "NULL=NULL", "FIRST=137.00", "FIRST=127.00", 
"BASELINE=NULL;FIRST=101.00;SECOND=105.00", "FIRST=120.00", "FIRST=145.00", 
"FIRST=126.00", "BASELINE=93.00;FIRST=82.00", "BASELINE=114.00;FIRST=95.00", 
"BASELINE=137.00;FIRST=125.00", "BASELINE=157.00;FIRST=112.00", 
"NULL=NULL", "FIRST=127.00", "FIRST=147.00", "BASELINE=80.00;FIRST=92.00", 
"BASELINE=129.00;FIRST=131.00", "FIRST=108.00", "BASELINE=92.00;FIRST=80.00", 
"BASELINE=118.00;FIRST=107.00", "NULL=NULL", "NULL=NULL", "BASELINE=NULL;FIRST=89.00", 
"NULL=NULL", "BASELINE=NULL;FIRST=101.00", "FIRST=151.00", "FIRST=106.00", 
"BASELINE=81.00;FIRST=87.00", "FIRST=94.00;SECOND=85.00", "FIRST=85.00", 
"FIRST=118.00;SECOND=116.00", "FIRST=101.00;SECOND=103.00", "FIRST=114.00", 
"BASELINE=NULL;FIRST=129.00", "BASELINE=123.00;FIRST=105.00", 
"NULL=NULL", "NULL=NULL"), `TIME_POINT_2=WHITE_CELLS` = c("BASELINE=2.90;FIRST=2.40", 
"FIRST=14.80", "FIRST=11.00", "BASELINE=6.90;FIRST=7.30", "FIRST=4.50", 
"NULL=NULL", "NULL=NULL", "FIRST=6.10", "FIRST=7.70", "FIRST=16.20", 
"BASELINE=7.40;FIRST=5.30", "BASELINE=12.80;FIRST=10.30", "FIRST=14.50", 
"NULL=NULL", "NULL=NULL", "FIRST=12.80", "FIRST=3.70", "BASELINE=4.20;FIRST=4.70;SECOND=6.50", 
"FIRST=16.60", "FIRST=9.30", "FIRST=15.90", "BASELINE=31.00;FIRST=31.70", 
"BASELINE=3.70;FIRST=7.00", "BASELINE=14.10;FIRST=7.20", "BASELINE=21.80;FIRST=31.20", 
"NULL=NULL", "FIRST=9.20", "FIRST=5.90", "BASELINE=2.20;FIRST=0.50", 
"BASELINE=10.50;FIRST=10.60", "FIRST=12.00", "BASELINE=14.70;FIRST=11.80", 
"BASELINE=13.40;FIRST=13.00", "NULL=NULL", "NULL=NULL", "BASELINE=NULL;FIRST=18.60", 
"NULL=NULL", "BASELINE=4.30;FIRST=5.90", "FIRST=7.60", "FIRST=3.10", 
"BASELINE=10.10;FIRST=11.40", "FIRST=12.00;SECOND=21.30", "FIRST=9.80", 
"FIRST=29.50;SECOND=4.80", "FIRST=4.60;SECOND=9.90", "FIRST=18.90", 
"BASELINE=15.00;FIRST=13.00", "BASELINE=12.80;FIRST=10.60", "NULL=NULL", 
"NULL=NULL")), row.names = c(NA, -50L), class = c("tbl_df", "tbl", 
"data.frame"))

I want this output:

dput(output)
structure(list(UNIQUE_PATIENT_ID = c("DIS-1101-1001-E1", "DIS-1101-1002-E1", 
"DIS-1101-1003-E1", "DIS-1101-1004-E1", "DIS-1101-1004-E2", "DIS-1101-1005-E1", 
"DIS-1101-1006-E1", "DIS-1101-1007-E1", "DIS-1101-1008-E1", "DIS-1101-1009-E1", 
"DIS-1101-1010-E1", "DIS-1101-1011-E1", "DIS-1101-1012-E1", "DIS-1101-1013-E1", 
"DIS-1101-1014-E1", "DIS-1101-1015-E1", "DIS-1101-1016-E1", "DIS-1101-1017-E1", 
"DIS-1101-1018-E1", "DIS-1101-1019-E1", "DIS-1101-1020-E1", "DIS-1101-1021-E1", 
"DIS-1101-1022-E1", "DIS-1101-1023-E1", "DIS-1101-1025-E1", "DIS-1101-1026-E1", 
"DIS-1101-1027-E1", "DIS-1101-1028-E1", "DIS-1101-1029-E1", "DIS-1101-1030-E1", 
"DIS-1101-1031-E1", "DIS-1101-1032-E1", "DIS-1101-1033-E1", "DIS-1101-1034-E1", 
"DIS-1101-1035-E1", "DIS-1101-1036-E1", "DIS-1101-1037-E1", "DIS-1101-1038-E1", 
"DIS-1101-1039-E1", "DIS-1101-1040-E1", "DIS-1101-1041-E1", "DIS-1101-1042-E1", 
"DIS-1101-1042-E2", "DIS-1101-1043-E1", "DIS-1101-1044-E1", "DIS-1101-1045-E1", 
"DIS-1101-1046-E1", "DIS-1101-1047-E1", "DIS-1101-1048-E1", "DIS-1101-1049-E1"
), Haemoglobin_BASELINE = c("123.00", NA, NA, "125.00", NA, NA, 
NA, NA, NA, NA, "119.00", "163.00", NA, NA, NA, NA, NA, "NULL", 
NA, NA, NA, "93.00", "114.00", "137.00", "157.00", NA, NA, NA, 
"80.00", "129.00", NA, "92.00", "118.00", NA, NA, "NULL", NA, 
"NULL", NA, NA, "81.00", NA, NA, NA, NA, NA, "NULL", "123.00", 
NA, NA), Haemoglobin_FIRST = c("117.00", "92.00", "111.00", "113.00", 
"124.00", NA, NA, "77.00", "132.00", "123.00", "93.00", "141.00", 
"137.00", NA, NA, "137.00", "127.00", "101.00", "120.00", "145.00", 
"126.00", "82.00", "95.00", "125.00", "112.00", NA, "127.00", 
"147.00", "92.00", "131.00", "108.00", "80.00", "107.00", NA, 
NA, "89.00", NA, "101.00", "151.00", "106.00", "87.00", "94.00", 
"85.00", "118.00", "101.00", "114.00", "129.00", "105.00", NA, 
NA), Haemoglobin_SECOND = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, "105.00", NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, "85.00", NA, "116.00", "103.00", NA, NA, NA, NA, NA), 
    WBC_BASELINE = c("2.90", NA, NA, "6.90", NA, NA, NA, NA, 
    NA, NA, "7.40", "12.80", NA, NA, NA, NA, NA, "4.20", NA, 
    NA, NA, "31.00", "3.70", "14.10", "21.80", NA, NA, NA, "2.20", 
    "10.50", NA, "14.70", "13.40", NA, NA, "NULL", NA, "4.30", 
    NA, NA, "10.10", NA, NA, NA, NA, NA, "15.00", "12.80", NA, 
    NA), WBC_FIRST = c("2.40", "14.80", "11.00", "7.30", "4.50", 
    NA, NA, "6.10", "7.70", "16.20", "5.30", "10.30", "14.50", 
    NA, NA, "12.80", "3.70", "4.70", "16.60", "9.30", "15.90", 
    "31.70", "7.00", "7.20", "31.20", NA, "9.20", "5.90", "0.50", 
    "10.60", "12.00", "11.80", "13.00", NA, NA, "18.60", NA, 
    "5.90", "7.60", "3.10", "11.40", "12.00", "9.80", "29.50", 
    "4.60", "18.90", "13.00", "10.60", NA, NA), WBC_SECOND = c(NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, "6.50", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "21.30", NA, 
    "4.80", "9.90", NA, NA, NA, NA, NA)), row.names = c(NA, -50L
), class = "data.frame")

I can achieve this, but my code is inefficient (as I am doing this on each blood column):

#haemoglobin
haem_ID <- df %>% select(UNIQUE_PATIENT_ID, `TIME_POINT_1=HAEMOGLOBIN`)

haem_sep <- haem_ID %>%
  rename(text = `TIME_POINT_1=HAEMOGLOBIN`) %>%
  separate_rows(text, sep = ';') %>%
  separate(text, c('text', 'value'), sep = '=') %>%
  pivot_wider(names_from = text, values_from = value)

tmp <- haem_sep %>% select(-UNIQUE_PATIENT_ID)
colnames(tmp) <- paste("Haemoglobin", colnames(tmp), sep = "_")
tmp2 <- haem_ID %>% select(UNIQUE_PATIENT_ID)
cbind(tmp2, tmp) -> tmp3
tmp3 %<>% select(-Haemoglobin_NULL)
haem <- tmp3


#white cells
a <- df %>% select(UNIQUE_PATIENT_ID, `TIME_POINT_2=WHITE_CELLS`)

b <- a %>%
  rename(text = `TIME_POINT_2=WHITE_CELLS`) %>%
  separate_rows(text, sep = ';') %>%
  separate(text, c('text', 'value'), sep = '=') %>%
  pivot_wider(names_from = text, values_from = value)

tmp <- b %>% select(-UNIQUE_PATIENT_ID)
colnames(tmp) <- paste("WBC", colnames(tmp), sep = "_")
tmp2 <- a %>% select(UNIQUE_PATIENT_ID)
cbind(tmp2, tmp) -> tmp3
tmp3 %<>% select(-WBC_NULL)
WBC <- tmp3

output <- merge(haem, WBC)

Pls can someone help me achieve the same but more effectively please? In reality I have 20 blood tests and not just 2, so hard-coding it is not ideal.

UPDATE

Upon the kind responses, I've tried running the suggested code but I'm getting an error and I can't figure out why.

input data

structure(list(UNIQUE_PATIENT_ID = c("DIS-1104-1021-E1", "DIS-1107-1010-E1", 
"DIS-1124-1108-E1"), `TIME_POINT_1=HAEMOGLOBIN` = c("BASELINE=98.00;FIRST=90.00;SECOND=81.00;FIRST=85.00", 
"FIRST=120.00;NULL=120.00;NULL=118.00;SECOND=109.00;NULL=186.00", 
"BASELINE=78.00;FIRST=NULL;FIRST=99.00"), `TIME_POINT_2=WHITE_CELLS` = c("BASELINE=0.01;FIRST=NULL;SECOND=0.01;FIRST=0.00", 
"FIRST=15.40;NULL=16.10;NULL=14.50;SECOND=11.40;NULL=2.20;NULL=5.00", 
"BASELINE=85.00;FIRST=NULL;FIRST=209.00"), `TIME_POINT_3=PLATELETS` = c("BASELINE=28.00;FIRST=28.00;SECOND=12.00;FIRST=21.00", 
"FIRST=200.00;NULL=186.00;NULL=203.00;SECOND=243.00;NULL=145.00;NULL=457.00", 
"BASELINE=186.00;FIRST=NULL"), `TIME_POINT_4=NEUTROPHILS` = c("BASELINE=0.00;FIRST=0.00;SECOND=0.00", 
"FIRST=13.00;NULL=13.20;NULL=11.10;SECOND=7.70;NULL=1.30;NULL=2.20", 
"BASELINE=7.10;FIRST=2.40;FIRST=NULL"), `TIME_POINT_5=LYMPHOCYTES` = c("BASELINE=0.00;FIRST=0.00;SECOND=0.00", 
"FIRST=0.80;NULL=1.60;NULL=1.40;SECOND=1.70;NULL=0.40", "BASELINE=76.80;FIRST=135.00;FIRST=NULL"
), `TIME_POINT_6=MONOCYTES` = c("BASELINE=0.00;FIRST=0.00;SECOND=0.00", 
"FIRST=1.50;NULL=1.20;NULL=1.90;SECOND=2.00;NULL=0.40;NULL=0.90", 
"BASELINE=1.40;FIRST=4.20;FIRST=NULL"), `TIME_POINT_7=EOSINOPHILS` = c("BASELINE=0.00;FIRST=0.00;SECOND=0.00", 
"FIRST=0.10;NULL=0.00;SECOND=0.00;NULL=0.10;NULL=0.30", "BASELINE=0.10;FIRST=0.40;FIRST=NULL"
), `TIME_POINT_8=FIBRINOGEN` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL", 
"FIRST=6.90;NULL=6.60;NULL=7.10;SECOND=6.60;NULL=6.00;NULL=NULL", 
"BASELINE=6.70;FIRST=NULL"), `TIME_POINT_9=PT` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL", 
"FIRST=19.20;NULL=17.20;NULL=16.10;SECOND=14.40;NULL=14.90;NULL=NULL", 
"BASELINE=13.40;FIRST=NULL"), `TIME_POINT_10=UREA` = c("BASELINE=4.30;FIRST=4.30;SECOND=5.10;FIRST=4.50", 
"FIRST=3.80;NULL=3.70;NULL=4.40;SECOND=2.80;NULL=2.90;NULL=4.10", 
"BASELINE=5.10;FIRST=NULL;FIRST=7.20"), `TIME_POINT_11=CREATININE` = c("BASELINE=67.00;FIRST=55.00;SECOND=61.00;FIRST=60.00", 
"FIRST=44.00;NULL=42.00;NULL=45.00;SECOND=42.00;NULL=33.00", 
"BASELINE=65.00;FIRST=NULL;FIRST=72.00"), `TIME_POINT_12=ALT` = c("BASELINE=9.00;FIRST=11.00;SECOND=8.00;FIRST=5.00", 
"FIRST=20.00;NULL=NULL;NULL=19.00;SECOND=16.00;NULL=16.00;NULL=21.00", 
"BASELINE=64.00;FIRST=NULL;FIRST=92.00"), `TIME_POINT_13=BILIRUBIN` = c("BASELINE=3.00;FIRST=5.00;SECOND=3.00;FIRST=NULL", 
"FIRST=20.00;NULL=NULL;NULL=11.00;SECOND=10.00;NULL=5.00", "BASELINE=7.00;FIRST=NULL;FIRST=10.00"
), `TIME_POINT_14=ALBUMIN` = c("BASELINE=42.00;FIRST=39.00;SECOND=35.00;FIRST=36.00", 
"FIRST=42.00;NULL=NULL;NULL=37.00;SECOND=35.00;NULL=35.00;NULL=39.00", 
"BASELINE=31.00;FIRST=NULL;FIRST=40.00"), `TIME_POINT_15=VITAMIN_D` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL", 
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_16=CRP` = c("BASELINE=NULL;FIRST=172.00;SECOND=116.00;FIRST=212.00", 
"FIRST=427.00;NULL=420.00;NULL=338.00;SECOND=215.00;NULL=87.00;NULL=64.00", 
"BASELINE=199.00;FIRST=NULL;FIRST=1.00"), `TIME_POINT_17=PCT` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL", 
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_18=FERRITIN` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL", 
"FIRST=NULL;NULL=NULL;NULL=378.00;SECOND=NULL", "BASELINE=809.00;FIRST=432.00;FIRST=NULL"
), `TIME_POINT_19=TROP_T` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL", 
"FIRST=NULL;NULL=NULL;NULL=3.00;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_20=D_DIMER` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL", 
"FIRST=NULL;NULL=1.85;NULL=1.65;SECOND=NULL;NULL=NULL", "BASELINE=749.00;FIRST=486.00;FIRST=NULL"
), `TIME_POINT_21=INR` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL", 
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_22=BNP_NT` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL", 
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_23=LDH` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL", 
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=254.00;FIRST=NULL"
), `TIME_POINT_24=CK` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL", 
"FIRST=NULL;NULL=NULL;NULL=29.00;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_25=SCD_25` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL", 
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_26=BASE_EXCESS` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL", 
"FIRST=3.10;NULL=1.50;NULL=NULL;SECOND=NULL", "BASELINE=-3.80;FIRST=NULL"
), `TIME_POINT_27=LACTATE` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL", 
"FIRST=0.90;NULL=1.30;NULL=NULL;SECOND=NULL", "BASELINE=0.60;FIRST=NULL"
)), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"
))

Code:

output <- input |> 
  separate_rows(-UNIQUE_PATIENT_ID, sep = ";") |> 
  pivot_longer(-UNIQUE_PATIENT_ID) |> 
  separate(value, into = c("timepoint", "value"), sep = "=") |> 
  mutate(name = str_remove(name, ".*=")) |> 
  pivot_wider(names_from = c("name", "timepoint")) |> 
  select(-ends_with("NULL"))

Error message:

Error in `fn()`:
! In row 2, can't recycle input of size 5 to size 6.
Run `rlang::last_error()` to see where the error occurred.

Thank you!

CodePudding user response:

The last select line is optional depending on how you want the columns ordered and whether or not you wish to keep the NULL columns:

df |> 
  separate_rows(-UNIQUE_PATIENT_ID, sep = ";") |> 
  pivot_longer(-UNIQUE_PATIENT_ID) |> 
  separate(value, into = c("timepoint", "value"), sep = "=") |> 
  mutate(name = str_remove(name, ".*=")) |> 
  pivot_wider(names_from = c("name", "timepoint")) |> 
  select(UNIQUE_PATIENT_ID, sort(tidyselect::peek_vars()), -contains("NULL"))

Using the example data with syntactic columns names, i.e. using a "." instead of "="), the following reprex may be run in a clean R session:

library(tidyverse)

df <- tribble(
  ~UNIQUE_PATIENT_ID,                  ~TIME_POINT_1.HAEMOGLOBIN,              ~TIME_POINT_2.WHITE_CELLS,
  "DIS-1101-1001-E1",             "BASELINE=123.00;FIRST=117.00",             "BASELINE=2.90;FIRST=2.40",
  "DIS-1101-1002-E1",                              "FIRST=92.00",                          "FIRST=14.80",
  "DIS-1101-1003-E1",                             "FIRST=111.00",                          "FIRST=11.00",
  "DIS-1101-1004-E1",             "BASELINE=125.00;FIRST=113.00",             "BASELINE=6.90;FIRST=7.30",
  "DIS-1101-1004-E2",                             "FIRST=124.00",                           "FIRST=4.50",
  "DIS-1101-1005-E1",                                "NULL=NULL",                            "NULL=NULL",
  "DIS-1101-1006-E1",                                "NULL=NULL",                            "NULL=NULL",
  "DIS-1101-1007-E1",                              "FIRST=77.00",                           "FIRST=6.10",
  "DIS-1101-1008-E1",                             "FIRST=132.00",                           "FIRST=7.70",
  "DIS-1101-1009-E1",                             "FIRST=123.00",                          "FIRST=16.20",
  "DIS-1101-1010-E1",              "BASELINE=119.00;FIRST=93.00",             "BASELINE=7.40;FIRST=5.30",
  "DIS-1101-1011-E1",             "BASELINE=163.00;FIRST=141.00",           "BASELINE=12.80;FIRST=10.30",
  "DIS-1101-1012-E1",                             "FIRST=137.00",                          "FIRST=14.50",
  "DIS-1101-1013-E1",                                "NULL=NULL",                            "NULL=NULL",
  "DIS-1101-1014-E1",                                "NULL=NULL",                            "NULL=NULL",
  "DIS-1101-1015-E1",                             "FIRST=137.00",                          "FIRST=12.80",
  "DIS-1101-1016-E1",                             "FIRST=127.00",                           "FIRST=3.70",
  "DIS-1101-1017-E1", "BASELINE=NULL;FIRST=101.00;SECOND=105.00", "BASELINE=4.20;FIRST=4.70;SECOND=6.50",
  "DIS-1101-1018-E1",                             "FIRST=120.00",                          "FIRST=16.60",
  "DIS-1101-1019-E1",                             "FIRST=145.00",                           "FIRST=9.30",
  "DIS-1101-1020-E1",                             "FIRST=126.00",                          "FIRST=15.90",
  "DIS-1101-1021-E1",               "BASELINE=93.00;FIRST=82.00",           "BASELINE=31.00;FIRST=31.70",
  "DIS-1101-1022-E1",              "BASELINE=114.00;FIRST=95.00",             "BASELINE=3.70;FIRST=7.00",
  "DIS-1101-1023-E1",             "BASELINE=137.00;FIRST=125.00",            "BASELINE=14.10;FIRST=7.20",
  "DIS-1101-1025-E1",             "BASELINE=157.00;FIRST=112.00",           "BASELINE=21.80;FIRST=31.20",
  "DIS-1101-1026-E1",                                "NULL=NULL",                            "NULL=NULL",
  "DIS-1101-1027-E1",                             "FIRST=127.00",                           "FIRST=9.20",
  "DIS-1101-1028-E1",                             "FIRST=147.00",                           "FIRST=5.90",
  "DIS-1101-1029-E1",               "BASELINE=80.00;FIRST=92.00",             "BASELINE=2.20;FIRST=0.50",
  "DIS-1101-1030-E1",             "BASELINE=129.00;FIRST=131.00",           "BASELINE=10.50;FIRST=10.60",
  "DIS-1101-1031-E1",                             "FIRST=108.00",                          "FIRST=12.00",
  "DIS-1101-1032-E1",               "BASELINE=92.00;FIRST=80.00",           "BASELINE=14.70;FIRST=11.80",
  "DIS-1101-1033-E1",             "BASELINE=118.00;FIRST=107.00",           "BASELINE=13.40;FIRST=13.00",
  "DIS-1101-1034-E1",                                "NULL=NULL",                            "NULL=NULL",
  "DIS-1101-1035-E1",                                "NULL=NULL",                            "NULL=NULL",
  "DIS-1101-1036-E1",                "BASELINE=NULL;FIRST=89.00",            "BASELINE=NULL;FIRST=18.60",
  "DIS-1101-1037-E1",                                "NULL=NULL",                            "NULL=NULL",
  "DIS-1101-1038-E1",               "BASELINE=NULL;FIRST=101.00",             "BASELINE=4.30;FIRST=5.90",
  "DIS-1101-1039-E1",                             "FIRST=151.00",                           "FIRST=7.60",
  "DIS-1101-1040-E1",                             "FIRST=106.00",                           "FIRST=3.10",
  "DIS-1101-1041-E1",               "BASELINE=81.00;FIRST=87.00",           "BASELINE=10.10;FIRST=11.40",
  "DIS-1101-1042-E1",                 "FIRST=94.00;SECOND=85.00",             "FIRST=12.00;SECOND=21.30",
  "DIS-1101-1042-E2",                              "FIRST=85.00",                           "FIRST=9.80",
  "DIS-1101-1043-E1",               "FIRST=118.00;SECOND=116.00",              "FIRST=29.50;SECOND=4.80",
  "DIS-1101-1044-E1",               "FIRST=101.00;SECOND=103.00",               "FIRST=4.60;SECOND=9.90",
  "DIS-1101-1045-E1",                             "FIRST=114.00",                          "FIRST=18.90",
  "DIS-1101-1046-E1",               "BASELINE=NULL;FIRST=129.00",           "BASELINE=15.00;FIRST=13.00",
  "DIS-1101-1047-E1",             "BASELINE=123.00;FIRST=105.00",           "BASELINE=12.80;FIRST=10.60",
  "DIS-1101-1048-E1",                                "NULL=NULL",                            "NULL=NULL",
  "DIS-1101-1049-E1",                                "NULL=NULL",                            "NULL=NULL"
  )

df |> 
  separate_rows(-UNIQUE_PATIENT_ID, sep = ";") |> 
  pivot_longer(-UNIQUE_PATIENT_ID) |> 
  separate(value, into = c("timepoint", "value"), sep = "=") |> 
  mutate(name = str_remove(name, ".*\\.")) |> 
  pivot_wider(names_from = c("name", "timepoint")) |> 
  select(UNIQUE_PATIENT_ID, sort(tidyselect::peek_vars()), -contains("NULL"))

#> # A tibble: 50 × 7
#>    UNIQUE_PATIENT_ID HAEMOGLOBIN_BASELINE HAEMOGLOBIN_FIRST HAEMOGLOBIN_SECOND
#>    <chr>             <chr>                <chr>             <chr>             
#>  1 DIS-1101-1001-E1  123.00               117.00            <NA>              
#>  2 DIS-1101-1002-E1  <NA>                 92.00             <NA>              
#>  3 DIS-1101-1003-E1  <NA>                 111.00            <NA>              
#>  4 DIS-1101-1004-E1  125.00               113.00            <NA>              
#>  5 DIS-1101-1004-E2  <NA>                 124.00            <NA>              
#>  6 DIS-1101-1005-E1  <NA>                 <NA>              <NA>              
#>  7 DIS-1101-1006-E1  <NA>                 <NA>              <NA>              
#>  8 DIS-1101-1007-E1  <NA>                 77.00             <NA>              
#>  9 DIS-1101-1008-E1  <NA>                 132.00            <NA>              
#> 10 DIS-1101-1009-E1  <NA>                 123.00            <NA>              
#> # … with 40 more rows, and 3 more variables: WHITE_CELLS_BASELINE <chr>,
#> #   WHITE_CELLS_FIRST <chr>, WHITE_CELLS_SECOND <chr>

Created on 2022-07-04 by the reprex package (v2.0.1)

** UPDATE FOR YOUR WIDER SAMPLE2 DATASET **

To handle the varying number of separators across columns I've suggested a function to separate_rows iteratively by column. This overcomes the separate_rows error in the wider dataset that wasn't an issue in the sample dataset.

Using pattern matching, I've also extracted only strings referencing "BASELINE", "FIRST" or "SECOND" where these are associated with a value.

And as @harre notes, because some columns contain multiple, e.g. FIRSTs, I've subsequently taken the first of these using slice_head before using pivot_wider. You could instead take the last using slice_tail or take the mean etc. depending on the requirement.

You'll need to test further on your extended dataset, but hope this helps!

library(tidyverse)

df <- structure(list(UNIQUE_PATIENT_ID = c(
  "DIS-1104-1021-E1", "DIS-1107-1010-E1",
  "DIS-1124-1108-E1"
), `TIME_POINT_1=HAEMOGLOBIN` = c(
  "BASELINE=98.00;FIRST=90.00;SECOND=81.00;FIRST=85.00",
  "FIRST=120.00;NULL=120.00;NULL=118.00;SECOND=109.00;NULL=186.00",
  "BASELINE=78.00;FIRST=NULL;FIRST=99.00"
), `TIME_POINT_2=WHITE_CELLS` = c(
  "BASELINE=0.01;FIRST=NULL;SECOND=0.01;FIRST=0.00",
  "FIRST=15.40;NULL=16.10;NULL=14.50;SECOND=11.40;NULL=2.20;NULL=5.00",
  "BASELINE=85.00;FIRST=NULL;FIRST=209.00"
), `TIME_POINT_3=PLATELETS` = c(
  "BASELINE=28.00;FIRST=28.00;SECOND=12.00;FIRST=21.00",
  "FIRST=200.00;NULL=186.00;NULL=203.00;SECOND=243.00;NULL=145.00;NULL=457.00",
  "BASELINE=186.00;FIRST=NULL"
), `TIME_POINT_4=NEUTROPHILS` = c(
  "BASELINE=0.00;FIRST=0.00;SECOND=0.00",
  "FIRST=13.00;NULL=13.20;NULL=11.10;SECOND=7.70;NULL=1.30;NULL=2.20",
  "BASELINE=7.10;FIRST=2.40;FIRST=NULL"
), `TIME_POINT_5=LYMPHOCYTES` = c(
  "BASELINE=0.00;FIRST=0.00;SECOND=0.00",
  "FIRST=0.80;NULL=1.60;NULL=1.40;SECOND=1.70;NULL=0.40", "BASELINE=76.80;FIRST=135.00;FIRST=NULL"
), `TIME_POINT_6=MONOCYTES` = c(
  "BASELINE=0.00;FIRST=0.00;SECOND=0.00",
  "FIRST=1.50;NULL=1.20;NULL=1.90;SECOND=2.00;NULL=0.40;NULL=0.90",
  "BASELINE=1.40;FIRST=4.20;FIRST=NULL"
), `TIME_POINT_7=EOSINOPHILS` = c(
  "BASELINE=0.00;FIRST=0.00;SECOND=0.00",
  "FIRST=0.10;NULL=0.00;SECOND=0.00;NULL=0.10;NULL=0.30", "BASELINE=0.10;FIRST=0.40;FIRST=NULL"
), `TIME_POINT_8=FIBRINOGEN` = c(
  "BASELINE=NULL;FIRST=NULL;SECOND=NULL",
  "FIRST=6.90;NULL=6.60;NULL=7.10;SECOND=6.60;NULL=6.00;NULL=NULL",
  "BASELINE=6.70;FIRST=NULL"
), `TIME_POINT_9=PT` = c(
  "BASELINE=NULL;FIRST=NULL;SECOND=NULL",
  "FIRST=19.20;NULL=17.20;NULL=16.10;SECOND=14.40;NULL=14.90;NULL=NULL",
  "BASELINE=13.40;FIRST=NULL"
), `TIME_POINT_10=UREA` = c(
  "BASELINE=4.30;FIRST=4.30;SECOND=5.10;FIRST=4.50",
  "FIRST=3.80;NULL=3.70;NULL=4.40;SECOND=2.80;NULL=2.90;NULL=4.10",
  "BASELINE=5.10;FIRST=NULL;FIRST=7.20"
), `TIME_POINT_11=CREATININE` = c(
  "BASELINE=67.00;FIRST=55.00;SECOND=61.00;FIRST=60.00",
  "FIRST=44.00;NULL=42.00;NULL=45.00;SECOND=42.00;NULL=33.00",
  "BASELINE=65.00;FIRST=NULL;FIRST=72.00"
), `TIME_POINT_12=ALT` = c(
  "BASELINE=9.00;FIRST=11.00;SECOND=8.00;FIRST=5.00",
  "FIRST=20.00;NULL=NULL;NULL=19.00;SECOND=16.00;NULL=16.00;NULL=21.00",
  "BASELINE=64.00;FIRST=NULL;FIRST=92.00"
), `TIME_POINT_13=BILIRUBIN` = c(
  "BASELINE=3.00;FIRST=5.00;SECOND=3.00;FIRST=NULL",
  "FIRST=20.00;NULL=NULL;NULL=11.00;SECOND=10.00;NULL=5.00", "BASELINE=7.00;FIRST=NULL;FIRST=10.00"
), `TIME_POINT_14=ALBUMIN` = c(
  "BASELINE=42.00;FIRST=39.00;SECOND=35.00;FIRST=36.00",
  "FIRST=42.00;NULL=NULL;NULL=37.00;SECOND=35.00;NULL=35.00;NULL=39.00",
  "BASELINE=31.00;FIRST=NULL;FIRST=40.00"
), `TIME_POINT_15=VITAMIN_D` = c(
  "BASELINE=NULL;FIRST=NULL;SECOND=NULL",
  "FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_16=CRP` = c(
  "BASELINE=NULL;FIRST=172.00;SECOND=116.00;FIRST=212.00",
  "FIRST=427.00;NULL=420.00;NULL=338.00;SECOND=215.00;NULL=87.00;NULL=64.00",
  "BASELINE=199.00;FIRST=NULL;FIRST=1.00"
), `TIME_POINT_17=PCT` = c(
  "BASELINE=NULL;FIRST=NULL;SECOND=NULL",
  "FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_18=FERRITIN` = c(
  "BASELINE=NULL;FIRST=NULL;SECOND=NULL",
  "FIRST=NULL;NULL=NULL;NULL=378.00;SECOND=NULL", "BASELINE=809.00;FIRST=432.00;FIRST=NULL"
), `TIME_POINT_19=TROP_T` = c(
  "BASELINE=NULL;FIRST=NULL;SECOND=NULL",
  "FIRST=NULL;NULL=NULL;NULL=3.00;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_20=D_DIMER` = c(
  "BASELINE=NULL;FIRST=NULL;SECOND=NULL",
  "FIRST=NULL;NULL=1.85;NULL=1.65;SECOND=NULL;NULL=NULL", "BASELINE=749.00;FIRST=486.00;FIRST=NULL"
), `TIME_POINT_21=INR` = c(
  "BASELINE=NULL;FIRST=NULL;SECOND=NULL",
  "FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_22=BNP_NT` = c(
  "BASELINE=NULL;FIRST=NULL;SECOND=NULL",
  "FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_23=LDH` = c(
  "BASELINE=NULL;FIRST=NULL;SECOND=NULL",
  "FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=254.00;FIRST=NULL"
), `TIME_POINT_24=CK` = c(
  "BASELINE=NULL;FIRST=NULL;SECOND=NULL",
  "FIRST=NULL;NULL=NULL;NULL=29.00;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_25=SCD_25` = c(
  "BASELINE=NULL;FIRST=NULL;SECOND=NULL",
  "FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_26=BASE_EXCESS` = c(
  "BASELINE=NULL;FIRST=NULL;SECOND=NULL",
  "FIRST=3.10;NULL=1.50;NULL=NULL;SECOND=NULL", "BASELINE=-3.80;FIRST=NULL"
), `TIME_POINT_27=LACTATE` = c(
  "BASELINE=NULL;FIRST=NULL;SECOND=NULL",
  "FIRST=0.90;NULL=1.30;NULL=NULL;SECOND=NULL", "BASELINE=0.60;FIRST=NULL"
)), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"))



pattern <- c("FIRST=\\d{2,3}.\\d{0,2}|SECOND=\\d{2,3}.\\d{0,2}|BASELINE=\\d{2,3}.\\d{0,2}")

# Function to iteratively separate by column
separate_iteratively <- function(col){
  separate_rows(df2, col, sep = ";") |> 
    select(1, col)
}

# Extract only if BASELINE, FIRST or SECOND with an associate number
df2 <- df |> 
  mutate(across(-UNIQUE_PATIENT_ID, ~ str_extract_all(., pattern) |> 
                  map_chr(~str_c(., collapse = ";"))))

# Separate & rearrange
df3 <- map_dfr(2:ncol(df2), separate_iteratively) |> 
  pivot_longer(-UNIQUE_PATIENT_ID) |> 
  filter(!is.na(value) & !value == "") |> 
  separate(value, into = c("timepoint", "value"), sep = "=") |> 
  mutate(name = str_remove(name, ".*=")) |> 
  arrange(UNIQUE_PATIENT_ID, name, timepoint) |> 
  group_by(UNIQUE_PATIENT_ID, name, timepoint) |> 
  slice_head(n = 1) |> # Or use slice_tail
  pivot_wider(names_from = c("name", "timepoint"))|> 
  select(UNIQUE_PATIENT_ID, sort(tidyselect::peek_vars()))

df3
#> # A tibble: 3 × 35
#> # Groups:   UNIQUE_PATIENT_ID [3]
#>   UNIQUE_PATIENT_ID ALBUMIN_BASELINE ALBUMIN_FIRST ALBUMIN_SECOND ALT_BASELINE
#>   <chr>             <chr>            <chr>         <chr>          <chr>       
#> 1 DIS-1104-1021-E1  42.00            39.00         35.00          <NA>        
#> 2 DIS-1107-1010-E1  <NA>             42.00         35.00          <NA>        
#> 3 DIS-1124-1108-E1  31.00            40.00         <NA>           64.00       
#> # … with 30 more variables: ALT_FIRST <chr>, ALT_SECOND <chr>,
#> #   BILIRUBIN_FIRST <chr>, BILIRUBIN_SECOND <chr>, CREATININE_BASELINE <chr>,
#> #   CREATININE_FIRST <chr>, CREATININE_SECOND <chr>, CRP_BASELINE <chr>,
#> #   CRP_FIRST <chr>, CRP_SECOND <chr>, D_DIMER_BASELINE <chr>,
#> #   D_DIMER_FIRST <chr>, FERRITIN_BASELINE <chr>, FERRITIN_FIRST <chr>,
#> #   HAEMOGLOBIN_BASELINE <chr>, HAEMOGLOBIN_FIRST <chr>,
#> #   HAEMOGLOBIN_SECOND <chr>, LDH_FIRST <chr>, LYMPHOCYTES_BASELINE <chr>, …


# Just to illustrate the issue with varying separators
tribble(~v1, ~v2,
        "a;a", "b;b",
        "a;a", "b;b;b",
        ) |> 
  separate_rows(everything(), sep = ";")
#> Error in `fn()`:
#> ! In row 2, can't recycle input of size 2 to size 3.

# Works
tribble(~v1, ~v2,
        "a;a", "b;b",
        "a;a", "b;b",
) |> 
  separate_rows(everything(), sep = ";")
#> # A tibble: 4 × 2
#>   v1    v2   
#>   <chr> <chr>
#> 1 a     b    
#> 2 a     b    
#> 3 a     b    
#> 4 a     b

Created on 2022-07-04 by the reprex package (v2.0.1)

CodePudding user response:

Building on my comments above and @Carl's implementation, you you could utilize the nested list structure directly using values_fn, which might save you from some lines of code. I am using first() from dplyr to take the first element, but you could as well use other functions depending on what you prefer, ~ mean(as.numeric(.)) to take the mean etc. There is no varying number of separators when we pivot_longer before using separate_rows, when I run the data.

df |> 
    pivot_longer(-UNIQUE_PATIENT_ID) |> 
    separate_rows(-UNIQUE_PATIENT_ID, sep = ";") |> 
    separate(value, into = c("timepoint", "value"), sep = "=") |> 
    mutate(name = str_remove(name, ".*=")) |>
    filter(!is.na(value) & value != "NULL" & timepoint != "NULL") |>
    pivot_wider(names_from = c("name", "timepoint"), values_fn = ~ first(.)) |>
    select(UNIQUE_PATIENT_ID, sort(tidyselect::peek_vars()))
  • Related