I need to detect (among other things) the first occurrence of a non-"F" code in a patient's list, after the first "F" code occurrence. The code below seems to do this correctly, but it has proven far too inefficient on the server when run against a data set of one million observations.
The final data set should have one row per ID (no duplicates), with a variable holding the number of non-F codes found after the first F code (nhosp) and the first non-F code found after the first F-code appearance in the DAIGNOSTICO variable (ficd). For example, ID 1555 has the codes R155, F180, T124, F708 in date order: the first F code is F180, so ficd is T124 and nhosp is 1 (T124 is the only non-F code after F180).
- How can I improve this, both in terms of complexity and speed? A tidyverse pipe is preferred.
This is what the result should look like:
# A tibble: 7 × 6
# Groups:   ID [7]
      ID DAIGNOSTICO data_entrada data_saida nhosp ficd
   <dbl> <chr>       <date>       <date>     <dbl> <chr>
1   1555 F180        1930-04-05   2005-03-15     1 T124
2   1234 F100        1980-04-01   2005-03-02     2 O155
3  16666 F120        1990-06-05   2005-03-18     0 <NA>
4 123456 F145        2001-03-07   2005-03-11     2 T123
5 177778 F155        2001-04-13   2005-03-22     2 G123
6 166666 F125        2002-03-12   2005-03-19     2 W345
7  12345 F150        2002-06-03   2005-03-07     4 K709
This is my current code:
library(readr)
library(dplyr)
library(tidyr)
simulation <- read_csv("SIMULADO.txt", col_types = cols(
  data_entrada = col_date("%d/%m/%Y"),
  data_saida   = col_date("%d/%m/%Y")
))
simulation <- as.data.frame(simulation)
simulation[, "nhosp"] <- 0
oldpos <- 1

for (i in 1:nrow(simulation)) {
  if (grepl("F", simulation[i, "DAIGNOSTICO"])) {                               # has an F code?
    oldpos <- i
    clin <- 0
    simulation[i, "hasF"] <- TRUE
  } else {
    simulation[i, "hasF"] <- FALSE
  }
  if (simulation[i, "ID"] == simulation[oldpos, "ID"]) {                        # same person?
    if (simulation[oldpos, "hasF"] == TRUE) {                                   # did he/she have an F code?
      simulation[i, "hasF"] <- TRUE
      if (simulation[i, "data_entrada"] > simulation[oldpos, "data_entrada"]) { # is it subsequent?
        if (!grepl("F", simulation[i, "DAIGNOSTICO"])) {                        # non-F?
          simulation[i, "hasC"]  <- TRUE
          clin <- 1
          simulation[i, "ficd"]  <- simulation[i, "DAIGNOSTICO"]
          simulation[i, "nhosp"] <- clin
          first_cc <- simulation[i, "DAIGNOSTICO"]
        }
      }
    }
  }
}
dt1 <- simulation %>%
  arrange(data_entrada) %>%
  group_by(ID) %>%
  select(ficd) %>%
  drop_na() %>%
  slice(1)

dt2 <- simulation %>%
  arrange(data_entrada) %>%
  group_by(ID) %>%
  filter(hasF == TRUE) %>%
  mutate(nhosp = cumsum(nhosp),
         nhosp = max(nhosp)) %>%
  select(-ficd, -hasF, -hasC) %>%
  distinct(ID, .keep_all = TRUE) %>%
  full_join(dt1, by = "ID")

dt2
And this is an example data set, with some errors included to check the robustness of the code (a small sketch for reading it directly into R follows the data):
ID, DAIGNOSTICO, data_entrada, data_saida
123490, O100, 01/04/1980, 02/03/2005
123490, O100, 01/04/1981, 02/03/2005
123491, O101, 01/04/1980, 02/03/2005
123491, O101, 01/04/1981, 02/03/2005
1234, F100, 01/04/1980, 02/03/2005
1234, O155, 02/04/1980, 03/03/2005
1234, G123, 05/05/1982, 04/03/2005
12345, T124, 01/06/2002, 05/03/2005
12345, Y124, 02/06/2002, 06/03/2005
12345, F150, 03/06/2002, 07/03/2005
12345, K709, 04/06/2002, 08/03/2005
12345, Y709, 05/06/2002, 09/03/2005
12345, F150, 03/06/2002, 07/03/2005
12345, K710, 06/06/2002, 08/03/2005
12345, K711, 07/06/2002, 10/03/2005
12345, F150, 08/06/2002, 07/03/2005
123456, F145, 07/03/2001, 11/03/2005
123456, T123, 08/03/2001, 12/03/2005
123456, P123, 09/03/2001, 13/03/2005
1555 ,R155, 04/04/1930, 14/03/2005
1555 ,F180, 05/04/1930, 15/03/2005
1555 ,T124, 06/04/1930, 16/03/2005
1555 ,F708, 07/04/1930, 17/03/2005
16666 ,F120, 05/06/1990, 18/03/2005
166666, F125, 12/03/2002, 19/03/2005
166666, W345, 13/03/2002, 20/03/2005
166666, L123, 14/03/2002, 21/03/2005
177778, F155, 13/04/2001, 22/03/2005
177778, G123, 14/04/2001, 23/03/2005
177778, F190, 15/04/2001, 24/03/2005
177778, E124, 16/04/2001, 25/03/2005
177779, G155, 13/04/2001, 22/03/2005
177779, G123, 14/04/2001, 23/03/2005
177779, G190, 15/04/2001, 24/03/2005
177779, E124, 16/04/2001, 25/03/2005
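For quick testing, here is a minimal sketch of how a sample like this can be read straight from a string rather than from SIMULADO.txt (an assumption for illustration: readr >= 2.0, where read_csv() accepts literal input wrapped in I(); only a few of the rows above are included):

library(readr)

# A few of the sample rows above, pasted into a literal string.
sample_csv <- "ID,DAIGNOSTICO,data_entrada,data_saida
1234, F100, 01/04/1980, 02/03/2005
1234, O155, 02/04/1980, 03/03/2005
1555 ,R155, 04/04/1930, 14/03/2005
1555 ,F180, 05/04/1930, 15/03/2005
1555 ,T124, 06/04/1930, 16/03/2005"

# trim_ws = TRUE (the default) strips the stray spaces around the fields.
df <- read_csv(I(sample_csv), col_types = cols(
  ID           = col_double(),
  DAIGNOSTICO  = col_character(),
  data_entrada = col_date("%d/%m/%Y"),
  data_saida   = col_date("%d/%m/%Y")
))
df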
Answer:
You could use
library(dplyr)
library(stringr)

df %>%
  group_by(ID) %>%
  filter(cumsum(str_detect(DAIGNOSTICO, "^F")) > 0) %>%
  mutate(nhosp = sum(str_detect(DAIGNOSTICO, "^[^F]")),
         ficd = lead(DAIGNOSTICO)) %>%
  filter(str_detect(DAIGNOSTICO, "^F")) %>%
  slice(1) %>%
  ungroup()
This returns
# A tibble: 7 x 6
      ID DAIGNOSTICO data_entrada data_saida nhosp ficd
   <dbl> <chr>       <chr>        <chr>      <int> <chr>
1   1234 F100        01/04/1980   02/03/2005     2 O155
2   1555 F180        05/04/1930   15/03/2005     1 T124
3  12345 F150        03/06/2002   07/03/2005     4 K709
4  16666 F120        05/06/1990   18/03/2005     0 NA
5 123456 F145        07/03/2001   11/03/2005     2 T123
6 166666 F125        12/03/2002   19/03/2005     2 W345
7 177778 F155        13/04/2001   22/03/2005     2 G123
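The key step is the filter cumsum(str_detect(DAIGNOSTICO, "^F")) > 0: within each ID, the cumulative count of F codes stays at 0 until the first F row and is at least 1 from that row on, so the filter drops everything before the first F code and keeps the rest in a single vectorised pass. A minimal sketch on the codes of ID 1555 from the sample, just to show the flag behaviour:

library(stringr)

codes <- c("R155", "F180", "T124", "F708")  # visit codes of ID 1555, in date order
str_detect(codes, "^F")                     # FALSE  TRUE FALSE  TRUE
cumsum(str_detect(codes, "^F")) > 0         # FALSE  TRUE  TRUE  TRUE -> keep from the first F on

Note that this assumes the rows within each ID are already ordered by data_entrada, as they are in the sample; if that is not guaranteed, adding arrange(ID, data_entrada) before the group_by() should keep the "first F" logic correct.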
Edit
I think there might be a flaw in the above; perhaps
library(dplyr)
library(stringr)

df %>%
  group_by(ID) %>%
  filter(
    cumsum(str_detect(DAIGNOSTICO, "^F")) == 1 |
      !str_detect(DAIGNOSTICO, "^F") & cumsum(str_detect(DAIGNOSTICO, "^F")) > 0
  ) %>%
  mutate(nhosp = sum(str_detect(DAIGNOSTICO, "^[^F]")),
         ficd = lead(DAIGNOSTICO)) %>%
  filter(str_detect(DAIGNOSTICO, "^F")) %>%
  slice(1) %>%
  ungroup()
is a better solution.
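The difference is in the filter. The first version keeps every row from the first F code onwards, so ficd = lead(DAIGNOSTICO) can land on another F row when the first F code is immediately followed by a second one, and ficd would then wrongly hold an F code. The revised condition keeps only the first F row (where the cumulative F count is exactly 1) plus the non-F rows after it, so lead() on the first F row picks up a non-F code (& binds tighter than | in R, so the expression parses as intended). A minimal sketch on a hypothetical sequence where the first F code is followed directly by another:

library(stringr)

codes <- c("T124", "F100", "F150", "K709", "W345")  # hypothetical codes for one ID
isF   <- str_detect(codes, "^F")

cumsum(isF) > 0                            # FALSE  TRUE  TRUE  TRUE  TRUE  (first version)
cumsum(isF) == 1 | !isF & cumsum(isF) > 0  # FALSE  TRUE FALSE  TRUE  TRUE  (revised version)

With the first condition, lead() on F100 returns F150; with the revised one the kept rows are F100, K709, W345, so lead() returns K709. Either way, both versions replace the row-by-row loop with grouped, vectorised operations and should scale to the one-million-row data set far better.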