I have a string, "1500|3|10000|5"
and I wish to have a numeric vector like so:
[1] 1500 3 10000 5
`strsplit` is much faster than `str_extract_all`. Is `strsplit` the fastest way to do this?
library("tidyverse")
library("microbenchmark")
# Short pipe-delimited input to be converted into a numeric vector.
x <- "1500|3|10000|5"

# stringr: extract every run of one or more digits, flatten the resulting
# list, and convert the pieces to numeric.
# mean ~ 137 microseconds
microbenchmark(
  x |>
    str_extract_all("\\d+") |>
    unlist(use.names = FALSE) |>
    as.double()
)

# base R: split on the pipe character. `split` is a regular expression by
# default, so the pipe is escaped to match it literally.
# mean ~ 15 microseconds
microbenchmark(
  x |>
    strsplit(split = "\\|") |>
    unlist(use.names = FALSE) |>
    as.double()
)
CodePudding user response:
`stringi` seems to be slightly faster; the pipe should also be omitted for maximum speed.
library(stringr)
library(stringi)
set.seed(123)
# Build one long pipe-delimited string from a random permutation of 1:100000.
x <- paste0(sample(100000), collapse = "|")

# Compare five ways of turning the delimited string into a numeric vector.
# The expression names are kept as-is so they match the reported results.
microbenchmark::microbenchmark(
  # stringr: extract each run of one or more digits.
  str_extract = x |>
    str_extract_all("\\d+") |>
    unlist(use.names = FALSE) |>
    as.double(),
  # base R: split on a regular expression matching the literal pipe.
  strsplit = x |>
    strsplit(split = "\\|") |>
    unlist(use.names = FALSE) |>
    as.double(),
  # base R: split on the pipe as a fixed string rather than a regex.
  strsplit_fixed = x |>
    strsplit(split = "|", fixed = TRUE) |>
    unlist(use.names = FALSE) |>
    as.numeric(),
  # stringi: fixed-string split written as nested calls (no pipe operator).
  stringi = as.numeric(stri_split_fixed(x, "|")[[1]]),
  # stringi: the same fixed-string split written with the pipe operator.
  stringi2 = x |>
    stri_split_fixed(pattern = "|") |>
    unlist(use.names = FALSE) |>
    as.numeric()
)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# str_extract 27.5158 27.77085 28.63940 28.01650 28.32090 36.7092 100 c
# strsplit 50.6624 51.16750 52.11587 51.55955 51.98610 59.2446 100 d
# strsplit_fixed 18.9921 19.24650 20.95589 19.40140 19.68805 113.9647 100 b
# stringi 17.8246 18.13970 18.53155 18.31015 18.57825 26.4410 100 a
# stringi2 18.2519 18.64035 19.21868 18.78765 19.20105 27.1056 100 ab
CodePudding user response:
I'm going to assume that this question is because you have a large pipe-delimited file and you need to turn it into a data frame.
If you've already read the file into a character vector `x`:
# Read the pipe-delimited file as raw lines, one element per line.
x <- readLines("mydelimfile.txt")

# base R
# Parse the in-memory lines; header = FALSE keeps the first line as data.
df <- read.delim(text = x, sep = "|", header = FALSE)

# with readr
# I() marks the joined string as literal data. Without it, a string with no
# newline (i.e. a one-line file) would be interpreted as a file path.
df <- readr::read_delim(
  I(paste0(x, collapse = "\n")),
  delim = "|",
  col_names = FALSE
)
But you can also read the file directly into a data frame:
# base R: read the file straight into a data frame. header = FALSE assumes
# the file has no header row, matching how the in-memory lines are parsed;
# the default (header = TRUE) would turn the first data row into column names.
df <- read.delim("mydelimfile.txt", sep = "|", header = FALSE)
# readr equivalent; col_names = FALSE likewise keeps the first line as data.
df <- readr::read_delim("mydelimfile.txt", delim = "|", col_names = FALSE)
CodePudding user response:
Using `fixed = TRUE` in `strsplit` provides better results. On a larger sample, `str_extract_all` performs better than the `strsplit` code in your post.
library(stringr)
set.seed(123)
# Build one long pipe-delimited string from a random permutation of 1:100000.
x <- paste0(sample(100000), collapse = "|")

# Compare regex extraction, regex splitting, and fixed-string splitting.
# The expression names are kept as-is so they match the reported results.
microbenchmark::microbenchmark(
  # stringr: extract each run of one or more digits.
  str_extract = x |>
    str_extract_all("\\d+") |>
    unlist(use.names = FALSE) |>
    as.double(),
  # base R: split on a regular expression matching the literal pipe.
  strsplit = x |>
    strsplit(split = "\\|") |>
    unlist(use.names = FALSE) |>
    as.double(),
  # base R: split on the pipe as a fixed string rather than a regex.
  strsplit_fixed = x |>
    strsplit(split = "|", fixed = TRUE) |>
    unlist(use.names = FALSE) |>
    as.numeric()
)
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# str_extract 27.00734 28.68815 30.62537 29.62420 31.59296 55.36550 100 b
# strsplit 87.71705 91.47075 97.39022 94.99620 101.27776 123.17484 100 c
# strsplit_fixed 17.57684 20.08943 23.03720 21.59174 23.40159 49.83912 100 a