I'm a newbie at R. The dput of a sample data is given below:
structure(list(fullcode = c("C08J 11/04-11/28", "C22B 7/00-7/04, 19/30, 25/06",
"B01D 53/00-53/96", "F01N 3/00-3/38", "B01D 45/00-51/00", "A01N 25/00-65/00"
), firstdigits = c("C08J", "C22B", "B01D", "F01N", "B01D", "A01N"
), lastdigits = c("11/04-11/28", "7/00-7/04", "53/00-53/96",
"3/00-3/38", "45/00-51/00", "25/00-65/00")), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
You'll see that "lastdigits" are comprised of numerical ranges in the form ab/cd-wx/yz
. What I want to do is separate these ranges into separate rows, one row per integer. For example, I want to separate the row value 25/00-65/00
into separate rows, each row containing one of 25/00, 25/01, ... 25/99, 26/00, 26/01, ..., 64/98, 64/99, 65/00
in ascending order.
Another example is to separate the row value 53/00-53/96
into separate rows, each containing one of 53/00, 53/01, ..., 53/95, 53/96
.
A colleague suggested the following code, but it doesn't work for my case; I believe the below approach assumed that numbers before the \
sign are fixed:
doubled <- grepl("-", range2$lastdigits)
my_data <-
Map(
f = function(x, y) {
out <- range2[x, ]
if (doubled[[x]]) {
lastdigits <- gsub(
"(\\d )/(\\d )-(\\d )/(\\d )",
"c(\\1, \\2, \\3)", out[["lastdigits"]]
) |>
str2lang() |>
eval()
lastdigits[2:3] <- sort(lastdigits[2:3])
out <- data.frame(
firstdigits = out[["firstdigits"]],
lastdigits = paste(lastdigits[[1]], seq(from = lastdigits[[2]], to = lastdigits[[3]], by = 1),
sep = "/"
)
)
}
out
},
x = seq_len(nrow(range22)),
y = doubled
) |>
do.call(what = rbind)
Anyone have any ideas as to how I can easily find a solution? Thank you!
CodePudding user response:
We may use
library(dplyr)
library(tidyr)
library(purrr)
df1 %>%
separate(lastdigits, into = c('start', 'end'), sep = "-") %>%
mutate(across(start:end, ~ as.numeric(chartr("/", ".", .x))),
lastdigits = map2(start, end, ~ str_replace(sprintf('%0.2f',
seq(.x, .y, by = 0.01)), fixed("."), "/")), start = NULL, end = NULL) %>%
unnest(lastdigits)
-output
# A tibble: 4,768 × 3
fullcode firstdigits lastdigits
<chr> <chr> <chr>
1 C08J 11/04-11/28 C08J 11/04
2 C08J 11/04-11/28 C08J 11/05
3 C08J 11/04-11/28 C08J 11/06
4 C08J 11/04-11/28 C08J 11/07
5 C08J 11/04-11/28 C08J 11/08
6 C08J 11/04-11/28 C08J 11/09
7 C08J 11/04-11/28 C08J 11/10
8 C08J 11/04-11/28 C08J 11/11
9 C08J 11/04-11/28 C08J 11/12
10 C08J 11/04-11/28 C08J 11/13
# … with 4,758 more rows
CodePudding user response:
library(dplyr)
library(tidyr)
library(stringr)
df %>%
mutate(
temp = str_remove_all(lastdigits, "/"),
temp = strsplit(temp, split = "-", fixed = TRUE),
temp = lapply(temp, \(x) as.integer(x[1]):as.integer(x[2]))
) %>%
unnest(temp) %>%
mutate(final = paste0(substr(temp, 1, 2), "/", substr(temp, 3, 4)))
# # A tibble: 4,768 × 5
# fullcode firstdigits lastdigits temp final
# <chr> <chr> <chr> <int> <chr>
# 1 C08J 11/04-11/28 C08J 11/04-11/28 1104 11/04
# 2 C08J 11/04-11/28 C08J 11/04-11/28 1105 11/05
# 3 C08J 11/04-11/28 C08J 11/04-11/28 1106 11/06
# 4 C08J 11/04-11/28 C08J 11/04-11/28 1107 11/07
# 5 C08J 11/04-11/28 C08J 11/04-11/28 1108 11/08
# 6 C08J 11/04-11/28 C08J 11/04-11/28 1109 11/09
# 7 C08J 11/04-11/28 C08J 11/04-11/28 1110 11/10
# 8 C08J 11/04-11/28 C08J 11/04-11/28 1111 11/11
# 9 C08J 11/04-11/28 C08J 11/04-11/28 1112 11/12
# 10 C08J 11/04-11/28 C08J 11/04-11/28 1113 11/13
# # … with 4,758 more rows