I have a df in wide format that has columns for start and end times for negative and positive controls (total 4 columns). I want to condense those four columns into 3 columns with one being "control" that is binary with a "1" indicating it is control and a "0" indicating it is an experiment, the second being "start_time", and the third being "end_time". Please see my dput for MRE and preferably I would like to see how this is done with the tidyverse
data <- structure(list(region = c("el_valle", "el_valle", "el_valle",
"el_valle", "el_valle", "el_valle", "el_valle", "el_valle", "el_cope",
"el_cope", "el_cope", "el_cope", "el_cope", "el_cope", "el_cope",
"el_cope", "el_cope", "santa_fe", "santa_fe", "santa_fe", "santa_fe",
"santa_fe", "santa_fe", "el_cope", "el_cope", "el_cope", "el_cope",
"el_cope", "el_cope"), site = c("jordinal", "jordinal", "jordinal",
"jordinal", "jordinal", "rio_maria", "rio_maria", "rio_maria",
"rio_blanco", "rio_blanco", "rio_blanco", "guabal", "guabal",
"guabal", "rio_tigrero", "rio_tigrero", "rio_tigrero", "altos_de piedra",
"altos_de piedra", "altos_de piedra", "altos_de piedra", "altos_de piedra",
"altos_de piedra", "sophia_stream", "sophia_stream", "sophia_stream",
"medina", "medina", "medina"), start_date = c("2022-07-21T00:00:00 00:00",
"2022-07-21T00:00:00 00:00", "2022-07-21T00:00:00 00:00", "2022-07-21T00:00:00 00:00",
"2022-07-21T00:00:00 00:00", "2022-07-21T00:00:00 00:00", "2022-07-21T00:00:00 00:00",
"2022-07-21T00:00:00 00:00", "2022-07-23T00:00:00 00:00", "2022-07-23T00:00:00 00:00",
"2022-07-23T00:00:00 00:00", "2022-07-24T00:00:00 00:00", "2022-07-24T00:00:00 00:00",
"2022-07-24T00:00:00 00:00", "2022-07-26T00:00:00 00:00", "2022-07-26T00:00:00 00:00",
"2022-07-26T00:00:00 00:00", "2022-07-28T00:00:00 00:00", "2022-07-28T00:00:00 00:00",
"2022-07-28T00:00:00 00:00", "2022-07-28T00:00:00 00:00", "2022-07-28T00:00:00 00:00",
"2022-07-28T00:00:00 00:00", "2022-07-29T00:00:00 00:00", "2022-07-29T00:00:00 00:00",
"2022-07-29T00:00:00 00:00", "2022-07-29T00:00:00 00:00", "2022-07-29T00:00:00 00:00",
"2022-07-29T00:00:00 00:00"), geometry = c("POINT (-80.0864405 8.6514853)",
"POINT (-80.0865047 8.6514879)", "POINT (-80.0864767 8.6514699)",
"POINT (-80.0864365 8.6514033)", "POINT (-80.59016347153077 8.657592487131978)",
"POINT (-80.0723613 8.6433237)", "POINT (-80.0724193 8.6433145)",
"POINT (-80.59012719701079 8.657653855819484)", "POINT (-80.59084727642541 8.65736553296408)",
"POINT (-80.59008739454525 8.65762533073169)", "POINT (-80.5900603404817 8.657645618088532)",
"POINT (-80.59006448025009 8.65775089241877)", "POINT (-80.5900978883977 8.657643924266186)",
"POINT (-80.59013662686417 8.657665177897595)", "POINT (-119.81285845363512 39.537657157446034)",
"POINT (-119.81285845363512 39.537657157446034)", "POINT (-119.81285845363512 39.537657157446034)",
"POINT (-119.81306464514674 39.53745922338899)", "POINT (-119.81306464514674 39.53745922338899)",
"POINT (-119.81306464514674 39.53745922338899)", "POINT (-119.81306464514674 39.53745922338899)",
"POINT (-119.81306464514674 39.53745922338899)", "POINT (-119.81306464514674 39.53745922338899)",
"POINT (-119.81300112116044 39.53757812923485)", "POINT (-119.81300112116044 39.53757812923485)",
"POINT (-119.8130185476632 39.53759315771993)", "POINT (-119.81295059817982 39.53742649195985)",
"POINT (-119.81295059817982 39.53742649195985)", "POINT (-119.8129264843882 39.53773406660491)"
), id_edna_sample = c("220721_edna_001", "220721_edna_002", "220721_edna_003",
"220721_edna_004", "220721_edna_007", "220721_005", "220721_006",
"220721_007", "220723_001", "220723_002", "220723_003", "220724_001",
"220724_002", "220724_003", "220726_eDNA_001", "220726_eDNA_002",
"220726_eDNA_003", "220728_eDNA_001", "220728_eDNA_002", "220728_eDNA_003",
"220728_eDNA_004", "220728_eDNA_005", "220728_eDNA_006", "220729_eDNA_001",
"220729_eDNA_002", "220729_eDNA_005", "220729_eDNA_003", "220729_eDNA_004",
"220729_eDNA_005"), transect_meters_sampled_m = c(0, 0, 0, 0,
0, 10, 10, 0, 200, 200, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, NA), start_time_negative_control_edna_filtering = structure(c(NA,
NA, NA, NA, 67800, NA, NA, 67800, NA, NA, 67680, NA, NA, 65760,
NA, NA, 62700, NA, NA, NA, NA, 68340, 68340, NA, NA, 45600, NA,
NA, 45600), class = c("hms", "difftime"), units = "secs"), finish_time_negative_control_edna_filtering = structure(c(NA,
NA, NA, NA, 68520, NA, NA, 68520, NA, NA, 67920, NA, NA, 66360,
NA, NA, 63000, NA, NA, NA, NA, 68640, 68640, NA, NA, 46020, NA,
NA, 46020), class = c("hms", "difftime"), units = "secs"), start_time_sample_edna_filtering = structure(c(39180,
40020, 40800, 41940, NA, 54480, 55860, NA, 65280, 66480, NA,
63300, 64380, NA, 61500, 60300, NA, 66900, 66900, 67680, 67680,
NA, NA, 42720, 43560, NA, 44280, 44940, NA), class = c("hms",
"difftime"), units = "secs"), finish_time_sample_edna_filtering = structure(c(39540,
40380, 41400, 42480, NA, 54720, 56280, NA, 65520, 66840, NA,
63660, 64980, NA, 62160, 60660, NA, 67500, 67500, 68160, 68160,
NA, NA, 43140, 43920, NA, 44700, 45360, NA), class = c("hms",
"difftime"), units = "secs"), volume_of_filtered_sample_ml = c(1000,
1000, 1000, NA, NA, 1000, NA, NA, 1000, 1000, NA, 1000, 1000,
1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000,
1000, 1000, 1000, 1000, NA), edn_comments = c("Jordinal A right. 5 and 0.45 filters. Slightly less than 1L. 10:11 am took sample",
"Jordinal A left.", "Jordinal B right. 10:58 am took sample",
"Jordinal B left. 10:07 took sample", NA, "Taken at 12.47 transect A right",
"Transect A left taken at 12:47", NA, "Taken at 18:30", "Taken at 18.30",
NA, NA, NA, NA, "Left 5 and 0.45 um", "Right 5 and 0.45 um",
"Negative 5 and 0.45 um", "Left 5um", "Left 0.45um", "Right 5um",
"Right 0.45um", "Negative 5um", "Negative 0.45um", "Left 0.45 and 5um",
"Right 0.45 and 5um", "Negative 5 and 0.45 um", "Left 0.45 and 5 um",
"Right 0.45 and 5 um", "Negative 5 and 0.45um")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -29L))
CodePudding user response:
We can use the names_pattern
and its ".value"
special name. I'll end in %>% str()
so that we can see the real data since it's a bit wide as-is. (For clarity: do not use %>% str()
in production, it is only for display purposes here.)
library(tidyr)
pivot_longer(
data,
c(start_time_negative_control_edna_filtering, finish_time_negative_control_edna_filtering,
start_time_sample_edna_filtering, finish_time_sample_edna_filtering),
names_pattern = "(start|finish)_time(.*)_edna_filtering",
names_to = c(".value", "control")) %>%
str()
# tibble [58 x 11] (S3: tbl_df/tbl/data.frame)
# $ region : chr [1:58] "el_valle" "el_valle" "el_valle" "el_valle" ...
# $ site : chr [1:58] "jordinal" "jordinal" "jordinal" "jordinal" ...
# $ start_date : chr [1:58] "2022-07-21T00:00:00 00:00" "2022-07-21T00:00:00 00:00" "2022-07-21T00:00:00 00:00" "2022-07-21T00:00:00 00:00" ...
# $ geometry : chr [1:58] "POINT (-80.0864405 8.6514853)" "POINT (-80.0864405 8.6514853)" "POINT (-80.0865047 8.6514879)" "POINT (-80.0865047 8.6514879)" ...
# $ id_edna_sample : chr [1:58] "220721_edna_001" "220721_edna_001" "220721_edna_002" "220721_edna_002" ...
# $ transect_meters_sampled_m : num [1:58] 0 0 0 0 0 0 0 0 0 0 ...
# $ volume_of_filtered_sample_ml: num [1:58] 1000 1000 1000 1000 1000 1000 NA NA NA NA ...
# $ edn_comments : chr [1:58] "Jordinal A right. 5 and 0.45 filters. Slightly less than 1L. 10:11 am took sample" "Jordinal A right. 5 and 0.45 filters. Slightly less than 1L. 10:11 am took sample" "Jordinal A left." "Jordinal A left." ...
# $ control : chr [1:58] "_negative_control" "_sample" "_negative_control" "_sample" ...
# $ start : 'hms' num [1:58] NA 10:53:00 NA 11:07:00 ...
# ..- attr(*, "units")= chr "secs"
# $ finish : 'hms' num [1:58] NA 10:59:00 NA 11:13:00 ...
# ..- attr(*, "units")= chr "secs"
I didn't finish the control
column into a binary yet, I thought I'd keep it as the extract string so you can see better what's going on with the data. The added step to get that will be
... %>%
mutate(control = grepl("control", control))
(assuming dplyr
as well).