I'm trying to split multiple columns, but struggling to do this efficiently.
I have a df:
> dput(df)
structure(list(UNIQUE_PATIENT_ID = c("DIS-1101-1001-E1", "DIS-1101-1002-E1",
"DIS-1101-1003-E1", "DIS-1101-1004-E1", "DIS-1101-1004-E2", "DIS-1101-1005-E1",
"DIS-1101-1006-E1", "DIS-1101-1007-E1", "DIS-1101-1008-E1", "DIS-1101-1009-E1",
"DIS-1101-1010-E1", "DIS-1101-1011-E1", "DIS-1101-1012-E1", "DIS-1101-1013-E1",
"DIS-1101-1014-E1", "DIS-1101-1015-E1", "DIS-1101-1016-E1", "DIS-1101-1017-E1",
"DIS-1101-1018-E1", "DIS-1101-1019-E1", "DIS-1101-1020-E1", "DIS-1101-1021-E1",
"DIS-1101-1022-E1", "DIS-1101-1023-E1", "DIS-1101-1025-E1", "DIS-1101-1026-E1",
"DIS-1101-1027-E1", "DIS-1101-1028-E1", "DIS-1101-1029-E1", "DIS-1101-1030-E1",
"DIS-1101-1031-E1", "DIS-1101-1032-E1", "DIS-1101-1033-E1", "DIS-1101-1034-E1",
"DIS-1101-1035-E1", "DIS-1101-1036-E1", "DIS-1101-1037-E1", "DIS-1101-1038-E1",
"DIS-1101-1039-E1", "DIS-1101-1040-E1", "DIS-1101-1041-E1", "DIS-1101-1042-E1",
"DIS-1101-1042-E2", "DIS-1101-1043-E1", "DIS-1101-1044-E1", "DIS-1101-1045-E1",
"DIS-1101-1046-E1", "DIS-1101-1047-E1", "DIS-1101-1048-E1", "DIS-1101-1049-E1"
), `TIME_POINT_1=HAEMOGLOBIN` = c("BASELINE=123.00;FIRST=117.00",
"FIRST=92.00", "FIRST=111.00", "BASELINE=125.00;FIRST=113.00",
"FIRST=124.00", "NULL=NULL", "NULL=NULL", "FIRST=77.00", "FIRST=132.00",
"FIRST=123.00", "BASELINE=119.00;FIRST=93.00", "BASELINE=163.00;FIRST=141.00",
"FIRST=137.00", "NULL=NULL", "NULL=NULL", "FIRST=137.00", "FIRST=127.00",
"BASELINE=NULL;FIRST=101.00;SECOND=105.00", "FIRST=120.00", "FIRST=145.00",
"FIRST=126.00", "BASELINE=93.00;FIRST=82.00", "BASELINE=114.00;FIRST=95.00",
"BASELINE=137.00;FIRST=125.00", "BASELINE=157.00;FIRST=112.00",
"NULL=NULL", "FIRST=127.00", "FIRST=147.00", "BASELINE=80.00;FIRST=92.00",
"BASELINE=129.00;FIRST=131.00", "FIRST=108.00", "BASELINE=92.00;FIRST=80.00",
"BASELINE=118.00;FIRST=107.00", "NULL=NULL", "NULL=NULL", "BASELINE=NULL;FIRST=89.00",
"NULL=NULL", "BASELINE=NULL;FIRST=101.00", "FIRST=151.00", "FIRST=106.00",
"BASELINE=81.00;FIRST=87.00", "FIRST=94.00;SECOND=85.00", "FIRST=85.00",
"FIRST=118.00;SECOND=116.00", "FIRST=101.00;SECOND=103.00", "FIRST=114.00",
"BASELINE=NULL;FIRST=129.00", "BASELINE=123.00;FIRST=105.00",
"NULL=NULL", "NULL=NULL"), `TIME_POINT_2=WHITE_CELLS` = c("BASELINE=2.90;FIRST=2.40",
"FIRST=14.80", "FIRST=11.00", "BASELINE=6.90;FIRST=7.30", "FIRST=4.50",
"NULL=NULL", "NULL=NULL", "FIRST=6.10", "FIRST=7.70", "FIRST=16.20",
"BASELINE=7.40;FIRST=5.30", "BASELINE=12.80;FIRST=10.30", "FIRST=14.50",
"NULL=NULL", "NULL=NULL", "FIRST=12.80", "FIRST=3.70", "BASELINE=4.20;FIRST=4.70;SECOND=6.50",
"FIRST=16.60", "FIRST=9.30", "FIRST=15.90", "BASELINE=31.00;FIRST=31.70",
"BASELINE=3.70;FIRST=7.00", "BASELINE=14.10;FIRST=7.20", "BASELINE=21.80;FIRST=31.20",
"NULL=NULL", "FIRST=9.20", "FIRST=5.90", "BASELINE=2.20;FIRST=0.50",
"BASELINE=10.50;FIRST=10.60", "FIRST=12.00", "BASELINE=14.70;FIRST=11.80",
"BASELINE=13.40;FIRST=13.00", "NULL=NULL", "NULL=NULL", "BASELINE=NULL;FIRST=18.60",
"NULL=NULL", "BASELINE=4.30;FIRST=5.90", "FIRST=7.60", "FIRST=3.10",
"BASELINE=10.10;FIRST=11.40", "FIRST=12.00;SECOND=21.30", "FIRST=9.80",
"FIRST=29.50;SECOND=4.80", "FIRST=4.60;SECOND=9.90", "FIRST=18.90",
"BASELINE=15.00;FIRST=13.00", "BASELINE=12.80;FIRST=10.60", "NULL=NULL",
"NULL=NULL")), row.names = c(NA, -50L), class = c("tbl_df", "tbl",
"data.frame"))
I want this output:
dput(output)
structure(list(UNIQUE_PATIENT_ID = c("DIS-1101-1001-E1", "DIS-1101-1002-E1",
"DIS-1101-1003-E1", "DIS-1101-1004-E1", "DIS-1101-1004-E2", "DIS-1101-1005-E1",
"DIS-1101-1006-E1", "DIS-1101-1007-E1", "DIS-1101-1008-E1", "DIS-1101-1009-E1",
"DIS-1101-1010-E1", "DIS-1101-1011-E1", "DIS-1101-1012-E1", "DIS-1101-1013-E1",
"DIS-1101-1014-E1", "DIS-1101-1015-E1", "DIS-1101-1016-E1", "DIS-1101-1017-E1",
"DIS-1101-1018-E1", "DIS-1101-1019-E1", "DIS-1101-1020-E1", "DIS-1101-1021-E1",
"DIS-1101-1022-E1", "DIS-1101-1023-E1", "DIS-1101-1025-E1", "DIS-1101-1026-E1",
"DIS-1101-1027-E1", "DIS-1101-1028-E1", "DIS-1101-1029-E1", "DIS-1101-1030-E1",
"DIS-1101-1031-E1", "DIS-1101-1032-E1", "DIS-1101-1033-E1", "DIS-1101-1034-E1",
"DIS-1101-1035-E1", "DIS-1101-1036-E1", "DIS-1101-1037-E1", "DIS-1101-1038-E1",
"DIS-1101-1039-E1", "DIS-1101-1040-E1", "DIS-1101-1041-E1", "DIS-1101-1042-E1",
"DIS-1101-1042-E2", "DIS-1101-1043-E1", "DIS-1101-1044-E1", "DIS-1101-1045-E1",
"DIS-1101-1046-E1", "DIS-1101-1047-E1", "DIS-1101-1048-E1", "DIS-1101-1049-E1"
), Haemoglobin_BASELINE = c("123.00", NA, NA, "125.00", NA, NA,
NA, NA, NA, NA, "119.00", "163.00", NA, NA, NA, NA, NA, "NULL",
NA, NA, NA, "93.00", "114.00", "137.00", "157.00", NA, NA, NA,
"80.00", "129.00", NA, "92.00", "118.00", NA, NA, "NULL", NA,
"NULL", NA, NA, "81.00", NA, NA, NA, NA, NA, "NULL", "123.00",
NA, NA), Haemoglobin_FIRST = c("117.00", "92.00", "111.00", "113.00",
"124.00", NA, NA, "77.00", "132.00", "123.00", "93.00", "141.00",
"137.00", NA, NA, "137.00", "127.00", "101.00", "120.00", "145.00",
"126.00", "82.00", "95.00", "125.00", "112.00", NA, "127.00",
"147.00", "92.00", "131.00", "108.00", "80.00", "107.00", NA,
NA, "89.00", NA, "101.00", "151.00", "106.00", "87.00", "94.00",
"85.00", "118.00", "101.00", "114.00", "129.00", "105.00", NA,
NA), Haemoglobin_SECOND = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, "105.00", NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, "85.00", NA, "116.00", "103.00", NA, NA, NA, NA, NA),
WBC_BASELINE = c("2.90", NA, NA, "6.90", NA, NA, NA, NA,
NA, NA, "7.40", "12.80", NA, NA, NA, NA, NA, "4.20", NA,
NA, NA, "31.00", "3.70", "14.10", "21.80", NA, NA, NA, "2.20",
"10.50", NA, "14.70", "13.40", NA, NA, "NULL", NA, "4.30",
NA, NA, "10.10", NA, NA, NA, NA, NA, "15.00", "12.80", NA,
NA), WBC_FIRST = c("2.40", "14.80", "11.00", "7.30", "4.50",
NA, NA, "6.10", "7.70", "16.20", "5.30", "10.30", "14.50",
NA, NA, "12.80", "3.70", "4.70", "16.60", "9.30", "15.90",
"31.70", "7.00", "7.20", "31.20", NA, "9.20", "5.90", "0.50",
"10.60", "12.00", "11.80", "13.00", NA, NA, "18.60", NA,
"5.90", "7.60", "3.10", "11.40", "12.00", "9.80", "29.50",
"4.60", "18.90", "13.00", "10.60", NA, NA), WBC_SECOND = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, "6.50", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "21.30", NA,
"4.80", "9.90", NA, NA, NA, NA, NA)), row.names = c(NA, -50L
), class = "data.frame")
I can achieve this, but my code is inefficient (as I am doing this on each blood column):
#haemoglobin
haem_ID <- df %>% select(UNIQUE_PATIENT_ID, `TIME_POINT_1=HAEMOGLOBIN`)
haem_sep <- haem_ID %>%
rename(text = `TIME_POINT_1=HAEMOGLOBIN`) %>%
separate_rows(text, sep = ';') %>%
separate(text, c('text', 'value'), sep = '=') %>%
pivot_wider(names_from = text, values_from = value)
tmp <- haem_sep %>% select(-UNIQUE_PATIENT_ID)
colnames(tmp) <- paste("Haemoglobin", colnames(tmp), sep = "_")
tmp2 <- haem_ID %>% select(UNIQUE_PATIENT_ID)
cbind(tmp2, tmp) -> tmp3
tmp3 %<>% select(-Haemoglobin_NULL)
haem <- tmp3
#white cells
a <- df %>% select(UNIQUE_PATIENT_ID, `TIME_POINT_2=WHITE_CELLS`)
b <- a %>%
rename(text = `TIME_POINT_2=WHITE_CELLS`) %>%
separate_rows(text, sep = ';') %>%
separate(text, c('text', 'value'), sep = '=') %>%
pivot_wider(names_from = text, values_from = value)
tmp <- b %>% select(-UNIQUE_PATIENT_ID)
colnames(tmp) <- paste("WBC", colnames(tmp), sep = "_")
tmp2 <- a %>% select(UNIQUE_PATIENT_ID)
cbind(tmp2, tmp) -> tmp3
tmp3 %<>% select(-WBC_NULL)
WBC <- tmp3
output <- merge(haem, WBC)
Pls can someone help me achieve the same but more effectively please? In reality I have 20 blood tests and not just 2, so hard-coding it is not ideal.
UPDATE
Upon the kind responses, I've tried running the suggested code but I'm getting an error and I can't figure out why.
input data
structure(list(UNIQUE_PATIENT_ID = c("DIS-1104-1021-E1", "DIS-1107-1010-E1",
"DIS-1124-1108-E1"), `TIME_POINT_1=HAEMOGLOBIN` = c("BASELINE=98.00;FIRST=90.00;SECOND=81.00;FIRST=85.00",
"FIRST=120.00;NULL=120.00;NULL=118.00;SECOND=109.00;NULL=186.00",
"BASELINE=78.00;FIRST=NULL;FIRST=99.00"), `TIME_POINT_2=WHITE_CELLS` = c("BASELINE=0.01;FIRST=NULL;SECOND=0.01;FIRST=0.00",
"FIRST=15.40;NULL=16.10;NULL=14.50;SECOND=11.40;NULL=2.20;NULL=5.00",
"BASELINE=85.00;FIRST=NULL;FIRST=209.00"), `TIME_POINT_3=PLATELETS` = c("BASELINE=28.00;FIRST=28.00;SECOND=12.00;FIRST=21.00",
"FIRST=200.00;NULL=186.00;NULL=203.00;SECOND=243.00;NULL=145.00;NULL=457.00",
"BASELINE=186.00;FIRST=NULL"), `TIME_POINT_4=NEUTROPHILS` = c("BASELINE=0.00;FIRST=0.00;SECOND=0.00",
"FIRST=13.00;NULL=13.20;NULL=11.10;SECOND=7.70;NULL=1.30;NULL=2.20",
"BASELINE=7.10;FIRST=2.40;FIRST=NULL"), `TIME_POINT_5=LYMPHOCYTES` = c("BASELINE=0.00;FIRST=0.00;SECOND=0.00",
"FIRST=0.80;NULL=1.60;NULL=1.40;SECOND=1.70;NULL=0.40", "BASELINE=76.80;FIRST=135.00;FIRST=NULL"
), `TIME_POINT_6=MONOCYTES` = c("BASELINE=0.00;FIRST=0.00;SECOND=0.00",
"FIRST=1.50;NULL=1.20;NULL=1.90;SECOND=2.00;NULL=0.40;NULL=0.90",
"BASELINE=1.40;FIRST=4.20;FIRST=NULL"), `TIME_POINT_7=EOSINOPHILS` = c("BASELINE=0.00;FIRST=0.00;SECOND=0.00",
"FIRST=0.10;NULL=0.00;SECOND=0.00;NULL=0.10;NULL=0.30", "BASELINE=0.10;FIRST=0.40;FIRST=NULL"
), `TIME_POINT_8=FIBRINOGEN` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=6.90;NULL=6.60;NULL=7.10;SECOND=6.60;NULL=6.00;NULL=NULL",
"BASELINE=6.70;FIRST=NULL"), `TIME_POINT_9=PT` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=19.20;NULL=17.20;NULL=16.10;SECOND=14.40;NULL=14.90;NULL=NULL",
"BASELINE=13.40;FIRST=NULL"), `TIME_POINT_10=UREA` = c("BASELINE=4.30;FIRST=4.30;SECOND=5.10;FIRST=4.50",
"FIRST=3.80;NULL=3.70;NULL=4.40;SECOND=2.80;NULL=2.90;NULL=4.10",
"BASELINE=5.10;FIRST=NULL;FIRST=7.20"), `TIME_POINT_11=CREATININE` = c("BASELINE=67.00;FIRST=55.00;SECOND=61.00;FIRST=60.00",
"FIRST=44.00;NULL=42.00;NULL=45.00;SECOND=42.00;NULL=33.00",
"BASELINE=65.00;FIRST=NULL;FIRST=72.00"), `TIME_POINT_12=ALT` = c("BASELINE=9.00;FIRST=11.00;SECOND=8.00;FIRST=5.00",
"FIRST=20.00;NULL=NULL;NULL=19.00;SECOND=16.00;NULL=16.00;NULL=21.00",
"BASELINE=64.00;FIRST=NULL;FIRST=92.00"), `TIME_POINT_13=BILIRUBIN` = c("BASELINE=3.00;FIRST=5.00;SECOND=3.00;FIRST=NULL",
"FIRST=20.00;NULL=NULL;NULL=11.00;SECOND=10.00;NULL=5.00", "BASELINE=7.00;FIRST=NULL;FIRST=10.00"
), `TIME_POINT_14=ALBUMIN` = c("BASELINE=42.00;FIRST=39.00;SECOND=35.00;FIRST=36.00",
"FIRST=42.00;NULL=NULL;NULL=37.00;SECOND=35.00;NULL=35.00;NULL=39.00",
"BASELINE=31.00;FIRST=NULL;FIRST=40.00"), `TIME_POINT_15=VITAMIN_D` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_16=CRP` = c("BASELINE=NULL;FIRST=172.00;SECOND=116.00;FIRST=212.00",
"FIRST=427.00;NULL=420.00;NULL=338.00;SECOND=215.00;NULL=87.00;NULL=64.00",
"BASELINE=199.00;FIRST=NULL;FIRST=1.00"), `TIME_POINT_17=PCT` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_18=FERRITIN` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;NULL=378.00;SECOND=NULL", "BASELINE=809.00;FIRST=432.00;FIRST=NULL"
), `TIME_POINT_19=TROP_T` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;NULL=3.00;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_20=D_DIMER` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=1.85;NULL=1.65;SECOND=NULL;NULL=NULL", "BASELINE=749.00;FIRST=486.00;FIRST=NULL"
), `TIME_POINT_21=INR` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_22=BNP_NT` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_23=LDH` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=254.00;FIRST=NULL"
), `TIME_POINT_24=CK` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;NULL=29.00;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_25=SCD_25` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_26=BASE_EXCESS` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=3.10;NULL=1.50;NULL=NULL;SECOND=NULL", "BASELINE=-3.80;FIRST=NULL"
), `TIME_POINT_27=LACTATE` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=0.90;NULL=1.30;NULL=NULL;SECOND=NULL", "BASELINE=0.60;FIRST=NULL"
)), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"
))
Code:
output <- input |>
separate_rows(-UNIQUE_PATIENT_ID, sep = ";") |>
pivot_longer(-UNIQUE_PATIENT_ID) |>
separate(value, into = c("timepoint", "value"), sep = "=") |>
mutate(name = str_remove(name, ".*=")) |>
pivot_wider(names_from = c("name", "timepoint")) |>
select(-ends_with("NULL"))
Error message:
Error in `fn()`:
! In row 2, can't recycle input of size 5 to size 6.
Run `rlang::last_error()` to see where the error occurred.
Thank you!
CodePudding user response:
The last select
line is optional depending on how you want the columns ordered and whether or not you wish to keep the NULL columns:
df |>
separate_rows(-UNIQUE_PATIENT_ID, sep = ";") |>
pivot_longer(-UNIQUE_PATIENT_ID) |>
separate(value, into = c("timepoint", "value"), sep = "=") |>
mutate(name = str_remove(name, ".*=")) |>
pivot_wider(names_from = c("name", "timepoint")) |>
select(UNIQUE_PATIENT_ID, sort(tidyselect::peek_vars()), -contains("NULL"))
Using the example data with syntactic columns names, i.e. using a "." instead of "="), the following reprex may be run in a clean R session:
library(tidyverse)
df <- tribble(
~UNIQUE_PATIENT_ID, ~TIME_POINT_1.HAEMOGLOBIN, ~TIME_POINT_2.WHITE_CELLS,
"DIS-1101-1001-E1", "BASELINE=123.00;FIRST=117.00", "BASELINE=2.90;FIRST=2.40",
"DIS-1101-1002-E1", "FIRST=92.00", "FIRST=14.80",
"DIS-1101-1003-E1", "FIRST=111.00", "FIRST=11.00",
"DIS-1101-1004-E1", "BASELINE=125.00;FIRST=113.00", "BASELINE=6.90;FIRST=7.30",
"DIS-1101-1004-E2", "FIRST=124.00", "FIRST=4.50",
"DIS-1101-1005-E1", "NULL=NULL", "NULL=NULL",
"DIS-1101-1006-E1", "NULL=NULL", "NULL=NULL",
"DIS-1101-1007-E1", "FIRST=77.00", "FIRST=6.10",
"DIS-1101-1008-E1", "FIRST=132.00", "FIRST=7.70",
"DIS-1101-1009-E1", "FIRST=123.00", "FIRST=16.20",
"DIS-1101-1010-E1", "BASELINE=119.00;FIRST=93.00", "BASELINE=7.40;FIRST=5.30",
"DIS-1101-1011-E1", "BASELINE=163.00;FIRST=141.00", "BASELINE=12.80;FIRST=10.30",
"DIS-1101-1012-E1", "FIRST=137.00", "FIRST=14.50",
"DIS-1101-1013-E1", "NULL=NULL", "NULL=NULL",
"DIS-1101-1014-E1", "NULL=NULL", "NULL=NULL",
"DIS-1101-1015-E1", "FIRST=137.00", "FIRST=12.80",
"DIS-1101-1016-E1", "FIRST=127.00", "FIRST=3.70",
"DIS-1101-1017-E1", "BASELINE=NULL;FIRST=101.00;SECOND=105.00", "BASELINE=4.20;FIRST=4.70;SECOND=6.50",
"DIS-1101-1018-E1", "FIRST=120.00", "FIRST=16.60",
"DIS-1101-1019-E1", "FIRST=145.00", "FIRST=9.30",
"DIS-1101-1020-E1", "FIRST=126.00", "FIRST=15.90",
"DIS-1101-1021-E1", "BASELINE=93.00;FIRST=82.00", "BASELINE=31.00;FIRST=31.70",
"DIS-1101-1022-E1", "BASELINE=114.00;FIRST=95.00", "BASELINE=3.70;FIRST=7.00",
"DIS-1101-1023-E1", "BASELINE=137.00;FIRST=125.00", "BASELINE=14.10;FIRST=7.20",
"DIS-1101-1025-E1", "BASELINE=157.00;FIRST=112.00", "BASELINE=21.80;FIRST=31.20",
"DIS-1101-1026-E1", "NULL=NULL", "NULL=NULL",
"DIS-1101-1027-E1", "FIRST=127.00", "FIRST=9.20",
"DIS-1101-1028-E1", "FIRST=147.00", "FIRST=5.90",
"DIS-1101-1029-E1", "BASELINE=80.00;FIRST=92.00", "BASELINE=2.20;FIRST=0.50",
"DIS-1101-1030-E1", "BASELINE=129.00;FIRST=131.00", "BASELINE=10.50;FIRST=10.60",
"DIS-1101-1031-E1", "FIRST=108.00", "FIRST=12.00",
"DIS-1101-1032-E1", "BASELINE=92.00;FIRST=80.00", "BASELINE=14.70;FIRST=11.80",
"DIS-1101-1033-E1", "BASELINE=118.00;FIRST=107.00", "BASELINE=13.40;FIRST=13.00",
"DIS-1101-1034-E1", "NULL=NULL", "NULL=NULL",
"DIS-1101-1035-E1", "NULL=NULL", "NULL=NULL",
"DIS-1101-1036-E1", "BASELINE=NULL;FIRST=89.00", "BASELINE=NULL;FIRST=18.60",
"DIS-1101-1037-E1", "NULL=NULL", "NULL=NULL",
"DIS-1101-1038-E1", "BASELINE=NULL;FIRST=101.00", "BASELINE=4.30;FIRST=5.90",
"DIS-1101-1039-E1", "FIRST=151.00", "FIRST=7.60",
"DIS-1101-1040-E1", "FIRST=106.00", "FIRST=3.10",
"DIS-1101-1041-E1", "BASELINE=81.00;FIRST=87.00", "BASELINE=10.10;FIRST=11.40",
"DIS-1101-1042-E1", "FIRST=94.00;SECOND=85.00", "FIRST=12.00;SECOND=21.30",
"DIS-1101-1042-E2", "FIRST=85.00", "FIRST=9.80",
"DIS-1101-1043-E1", "FIRST=118.00;SECOND=116.00", "FIRST=29.50;SECOND=4.80",
"DIS-1101-1044-E1", "FIRST=101.00;SECOND=103.00", "FIRST=4.60;SECOND=9.90",
"DIS-1101-1045-E1", "FIRST=114.00", "FIRST=18.90",
"DIS-1101-1046-E1", "BASELINE=NULL;FIRST=129.00", "BASELINE=15.00;FIRST=13.00",
"DIS-1101-1047-E1", "BASELINE=123.00;FIRST=105.00", "BASELINE=12.80;FIRST=10.60",
"DIS-1101-1048-E1", "NULL=NULL", "NULL=NULL",
"DIS-1101-1049-E1", "NULL=NULL", "NULL=NULL"
)
df |>
separate_rows(-UNIQUE_PATIENT_ID, sep = ";") |>
pivot_longer(-UNIQUE_PATIENT_ID) |>
separate(value, into = c("timepoint", "value"), sep = "=") |>
mutate(name = str_remove(name, ".*\\.")) |>
pivot_wider(names_from = c("name", "timepoint")) |>
select(UNIQUE_PATIENT_ID, sort(tidyselect::peek_vars()), -contains("NULL"))
#> # A tibble: 50 × 7
#> UNIQUE_PATIENT_ID HAEMOGLOBIN_BASELINE HAEMOGLOBIN_FIRST HAEMOGLOBIN_SECOND
#> <chr> <chr> <chr> <chr>
#> 1 DIS-1101-1001-E1 123.00 117.00 <NA>
#> 2 DIS-1101-1002-E1 <NA> 92.00 <NA>
#> 3 DIS-1101-1003-E1 <NA> 111.00 <NA>
#> 4 DIS-1101-1004-E1 125.00 113.00 <NA>
#> 5 DIS-1101-1004-E2 <NA> 124.00 <NA>
#> 6 DIS-1101-1005-E1 <NA> <NA> <NA>
#> 7 DIS-1101-1006-E1 <NA> <NA> <NA>
#> 8 DIS-1101-1007-E1 <NA> 77.00 <NA>
#> 9 DIS-1101-1008-E1 <NA> 132.00 <NA>
#> 10 DIS-1101-1009-E1 <NA> 123.00 <NA>
#> # … with 40 more rows, and 3 more variables: WHITE_CELLS_BASELINE <chr>,
#> # WHITE_CELLS_FIRST <chr>, WHITE_CELLS_SECOND <chr>
Created on 2022-07-04 by the reprex package (v2.0.1)
** UPDATE FOR YOUR WIDER SAMPLE2 DATASET **
To handle the varying number of separators across columns I've suggested a function to separate_rows
iteratively by column. This overcomes the separate_rows
error in the wider dataset that wasn't an issue in the sample dataset.
Using pattern matching, I've also extracted only strings referencing "BASELINE", "FIRST" or "SECOND" where these are associated with a value.
And as @harre notes, because some columns contain multiple, e.g. FIRSTs, I've subsequently taken the first of these using slice_head
before using pivot_wider
. You could instead take the last using slice_tail
or take the mean etc. depending on the requirement.
You'll need to test further on your extended dataset, but hope this helps!
library(tidyverse)
df <- structure(list(UNIQUE_PATIENT_ID = c(
"DIS-1104-1021-E1", "DIS-1107-1010-E1",
"DIS-1124-1108-E1"
), `TIME_POINT_1=HAEMOGLOBIN` = c(
"BASELINE=98.00;FIRST=90.00;SECOND=81.00;FIRST=85.00",
"FIRST=120.00;NULL=120.00;NULL=118.00;SECOND=109.00;NULL=186.00",
"BASELINE=78.00;FIRST=NULL;FIRST=99.00"
), `TIME_POINT_2=WHITE_CELLS` = c(
"BASELINE=0.01;FIRST=NULL;SECOND=0.01;FIRST=0.00",
"FIRST=15.40;NULL=16.10;NULL=14.50;SECOND=11.40;NULL=2.20;NULL=5.00",
"BASELINE=85.00;FIRST=NULL;FIRST=209.00"
), `TIME_POINT_3=PLATELETS` = c(
"BASELINE=28.00;FIRST=28.00;SECOND=12.00;FIRST=21.00",
"FIRST=200.00;NULL=186.00;NULL=203.00;SECOND=243.00;NULL=145.00;NULL=457.00",
"BASELINE=186.00;FIRST=NULL"
), `TIME_POINT_4=NEUTROPHILS` = c(
"BASELINE=0.00;FIRST=0.00;SECOND=0.00",
"FIRST=13.00;NULL=13.20;NULL=11.10;SECOND=7.70;NULL=1.30;NULL=2.20",
"BASELINE=7.10;FIRST=2.40;FIRST=NULL"
), `TIME_POINT_5=LYMPHOCYTES` = c(
"BASELINE=0.00;FIRST=0.00;SECOND=0.00",
"FIRST=0.80;NULL=1.60;NULL=1.40;SECOND=1.70;NULL=0.40", "BASELINE=76.80;FIRST=135.00;FIRST=NULL"
), `TIME_POINT_6=MONOCYTES` = c(
"BASELINE=0.00;FIRST=0.00;SECOND=0.00",
"FIRST=1.50;NULL=1.20;NULL=1.90;SECOND=2.00;NULL=0.40;NULL=0.90",
"BASELINE=1.40;FIRST=4.20;FIRST=NULL"
), `TIME_POINT_7=EOSINOPHILS` = c(
"BASELINE=0.00;FIRST=0.00;SECOND=0.00",
"FIRST=0.10;NULL=0.00;SECOND=0.00;NULL=0.10;NULL=0.30", "BASELINE=0.10;FIRST=0.40;FIRST=NULL"
), `TIME_POINT_8=FIBRINOGEN` = c(
"BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=6.90;NULL=6.60;NULL=7.10;SECOND=6.60;NULL=6.00;NULL=NULL",
"BASELINE=6.70;FIRST=NULL"
), `TIME_POINT_9=PT` = c(
"BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=19.20;NULL=17.20;NULL=16.10;SECOND=14.40;NULL=14.90;NULL=NULL",
"BASELINE=13.40;FIRST=NULL"
), `TIME_POINT_10=UREA` = c(
"BASELINE=4.30;FIRST=4.30;SECOND=5.10;FIRST=4.50",
"FIRST=3.80;NULL=3.70;NULL=4.40;SECOND=2.80;NULL=2.90;NULL=4.10",
"BASELINE=5.10;FIRST=NULL;FIRST=7.20"
), `TIME_POINT_11=CREATININE` = c(
"BASELINE=67.00;FIRST=55.00;SECOND=61.00;FIRST=60.00",
"FIRST=44.00;NULL=42.00;NULL=45.00;SECOND=42.00;NULL=33.00",
"BASELINE=65.00;FIRST=NULL;FIRST=72.00"
), `TIME_POINT_12=ALT` = c(
"BASELINE=9.00;FIRST=11.00;SECOND=8.00;FIRST=5.00",
"FIRST=20.00;NULL=NULL;NULL=19.00;SECOND=16.00;NULL=16.00;NULL=21.00",
"BASELINE=64.00;FIRST=NULL;FIRST=92.00"
), `TIME_POINT_13=BILIRUBIN` = c(
"BASELINE=3.00;FIRST=5.00;SECOND=3.00;FIRST=NULL",
"FIRST=20.00;NULL=NULL;NULL=11.00;SECOND=10.00;NULL=5.00", "BASELINE=7.00;FIRST=NULL;FIRST=10.00"
), `TIME_POINT_14=ALBUMIN` = c(
"BASELINE=42.00;FIRST=39.00;SECOND=35.00;FIRST=36.00",
"FIRST=42.00;NULL=NULL;NULL=37.00;SECOND=35.00;NULL=35.00;NULL=39.00",
"BASELINE=31.00;FIRST=NULL;FIRST=40.00"
), `TIME_POINT_15=VITAMIN_D` = c(
"BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_16=CRP` = c(
"BASELINE=NULL;FIRST=172.00;SECOND=116.00;FIRST=212.00",
"FIRST=427.00;NULL=420.00;NULL=338.00;SECOND=215.00;NULL=87.00;NULL=64.00",
"BASELINE=199.00;FIRST=NULL;FIRST=1.00"
), `TIME_POINT_17=PCT` = c(
"BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_18=FERRITIN` = c(
"BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;NULL=378.00;SECOND=NULL", "BASELINE=809.00;FIRST=432.00;FIRST=NULL"
), `TIME_POINT_19=TROP_T` = c(
"BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;NULL=3.00;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_20=D_DIMER` = c(
"BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=1.85;NULL=1.65;SECOND=NULL;NULL=NULL", "BASELINE=749.00;FIRST=486.00;FIRST=NULL"
), `TIME_POINT_21=INR` = c(
"BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_22=BNP_NT` = c(
"BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_23=LDH` = c(
"BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=254.00;FIRST=NULL"
), `TIME_POINT_24=CK` = c(
"BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;NULL=29.00;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_25=SCD_25` = c(
"BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_26=BASE_EXCESS` = c(
"BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=3.10;NULL=1.50;NULL=NULL;SECOND=NULL", "BASELINE=-3.80;FIRST=NULL"
), `TIME_POINT_27=LACTATE` = c(
"BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=0.90;NULL=1.30;NULL=NULL;SECOND=NULL", "BASELINE=0.60;FIRST=NULL"
)), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"))
pattern <- c("FIRST=\\d{2,3}.\\d{0,2}|SECOND=\\d{2,3}.\\d{0,2}|BASELINE=\\d{2,3}.\\d{0,2}")
# Function to iteratively separate by column
separate_iteratively <- function(col){
separate_rows(df2, col, sep = ";") |>
select(1, col)
}
# Extract only if BASELINE, FIRST or SECOND with an associate number
df2 <- df |>
mutate(across(-UNIQUE_PATIENT_ID, ~ str_extract_all(., pattern) |>
map_chr(~str_c(., collapse = ";"))))
# Separate & rearrange
df3 <- map_dfr(2:ncol(df2), separate_iteratively) |>
pivot_longer(-UNIQUE_PATIENT_ID) |>
filter(!is.na(value) & !value == "") |>
separate(value, into = c("timepoint", "value"), sep = "=") |>
mutate(name = str_remove(name, ".*=")) |>
arrange(UNIQUE_PATIENT_ID, name, timepoint) |>
group_by(UNIQUE_PATIENT_ID, name, timepoint) |>
slice_head(n = 1) |> # Or use slice_tail
pivot_wider(names_from = c("name", "timepoint"))|>
select(UNIQUE_PATIENT_ID, sort(tidyselect::peek_vars()))
df3
#> # A tibble: 3 × 35
#> # Groups: UNIQUE_PATIENT_ID [3]
#> UNIQUE_PATIENT_ID ALBUMIN_BASELINE ALBUMIN_FIRST ALBUMIN_SECOND ALT_BASELINE
#> <chr> <chr> <chr> <chr> <chr>
#> 1 DIS-1104-1021-E1 42.00 39.00 35.00 <NA>
#> 2 DIS-1107-1010-E1 <NA> 42.00 35.00 <NA>
#> 3 DIS-1124-1108-E1 31.00 40.00 <NA> 64.00
#> # … with 30 more variables: ALT_FIRST <chr>, ALT_SECOND <chr>,
#> # BILIRUBIN_FIRST <chr>, BILIRUBIN_SECOND <chr>, CREATININE_BASELINE <chr>,
#> # CREATININE_FIRST <chr>, CREATININE_SECOND <chr>, CRP_BASELINE <chr>,
#> # CRP_FIRST <chr>, CRP_SECOND <chr>, D_DIMER_BASELINE <chr>,
#> # D_DIMER_FIRST <chr>, FERRITIN_BASELINE <chr>, FERRITIN_FIRST <chr>,
#> # HAEMOGLOBIN_BASELINE <chr>, HAEMOGLOBIN_FIRST <chr>,
#> # HAEMOGLOBIN_SECOND <chr>, LDH_FIRST <chr>, LYMPHOCYTES_BASELINE <chr>, …
# Just to illustrate the issue with varying separators
tribble(~v1, ~v2,
"a;a", "b;b",
"a;a", "b;b;b",
) |>
separate_rows(everything(), sep = ";")
#> Error in `fn()`:
#> ! In row 2, can't recycle input of size 2 to size 3.
# Works
tribble(~v1, ~v2,
"a;a", "b;b",
"a;a", "b;b",
) |>
separate_rows(everything(), sep = ";")
#> # A tibble: 4 × 2
#> v1 v2
#> <chr> <chr>
#> 1 a b
#> 2 a b
#> 3 a b
#> 4 a b
Created on 2022-07-04 by the reprex package (v2.0.1)
CodePudding user response:
Building on my comments above and @Carl's implementation, you you could utilize the nested list structure directly using values_fn
, which might save you from some lines of code. I am using first()
from dplyr
to take the first element, but you could as well use other functions depending on what you prefer, ~ mean(as.numeric(.))
to take the mean etc. There is no varying number of separators when we pivot_longer
before using separate_rows
, when I run the data.
df |>
pivot_longer(-UNIQUE_PATIENT_ID) |>
separate_rows(-UNIQUE_PATIENT_ID, sep = ";") |>
separate(value, into = c("timepoint", "value"), sep = "=") |>
mutate(name = str_remove(name, ".*=")) |>
filter(!is.na(value) & value != "NULL" & timepoint != "NULL") |>
pivot_wider(names_from = c("name", "timepoint"), values_fn = ~ first(.)) |>
select(UNIQUE_PATIENT_ID, sort(tidyselect::peek_vars()))