Please have a look at the reprex at the end of the post. I need to read a column as a string, perform several manipulations and then save convert it to a numerical column. The blanks ("") in the string column give me a headache because arrow does not convert them to numerical missing values NA.
Does anybody know how to achieve that? Many thanks
library(tidyverse)
library(arrow)
#> Some features are not enabled in this build of Arrow. Run `arrow_info()` for more information.
#>
#> Attaching package: 'arrow'
#> The following object is masked from 'package:utils':
#>
#> timestamp
df <- tibble(x=rep(c("4000 -", "6000 -", "", "8000 - "), 10),
y=seq(1,10, length=40))
write_csv(df, "test_string.csv")
data <- open_dataset("test_string.csv",
format="csv",
skip=1,
schema=schema(x=string(), y=double()))
data2 <- data |>
mutate(x= sub(" -.*", "", x) ) |>
mutate(x2=as.numeric(x)) |>
collect() ## how to convert the blank to a numeric NA ?
#> Error in `collect()`:
#> ! Invalid: Failed to parse string: '' as a scalar of type double
#> Backtrace:
#> ▆
#> 1. ├─dplyr::collect(mutate(mutate(data, x = sub(" -.*", "", x)), x2 = as.numeric(x)))
#> 2. └─arrow:::collect.arrow_dplyr_query(mutate(mutate(data, x = sub(" -.*", "", x)), x2 = as.numeric(x)))
#> 3. └─base::tryCatch(...)
#> 4. └─base (local) tryCatchList(expr, classes, parentenv, handlers)
#> 5. └─base (local) tryCatchOne(expr, names, parentenv, handlers[[1L]])
#> 6. └─value[[3L]](cond)
#> 7. └─arrow:::augment_io_error_msg(e, call, schema = x$.data$schema)
#> 8. └─rlang::abort(msg, call = call)
sessionInfo()
#> R version 4.2.2 (2022-10-31)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Debian GNU/Linux 11 (bullseye)
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.13.so
#>
#> locale:
#> [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
#> [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
#> [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] arrow_10.0.0 forcats_0.5.2 stringr_1.4.1 dplyr_1.0.10
#> [5] purrr_0.3.5 readr_2.1.3 tidyr_1.2.1 tibble_3.1.8
#> [9] ggplot2_3.4.0 tidyverse_1.3.2
#>
#> loaded via a namespace (and not attached):
#> [1] lubridate_1.9.0 assertthat_0.2.1 digest_0.6.30
#> [4] utf8_1.2.2 R6_2.5.1 cellranger_1.1.0
#> [7] backports_1.4.1 reprex_2.0.2 evaluate_0.17
#> [10] httr_1.4.4 highr_0.9 pillar_1.8.1
#> [13] rlang_1.0.6 googlesheets4_1.0.1 readxl_1.4.1
#> [16] R.utils_2.12.1 R.oo_1.25.0 rmarkdown_2.17
#> [19] styler_1.8.0 googledrive_2.0.0 bit_4.0.4
#> [22] munsell_0.5.0 broom_1.0.1 compiler_4.2.2
#> [25] modelr_0.1.9 xfun_0.34 pkgconfig_2.0.3
#> [28] htmltools_0.5.3 tidyselect_1.2.0 fansi_1.0.3
#> [31] crayon_1.5.2 tzdb_0.3.0 dbplyr_2.2.1
#> [34] withr_2.5.0 R.methodsS3_1.8.2 grid_4.2.2
#> [37] jsonlite_1.8.3 gtable_0.3.1 lifecycle_1.0.3
#> [40] DBI_1.1.3 magrittr_2.0.3 scales_1.2.1
#> [43] vroom_1.6.0 cli_3.4.1 stringi_1.7.8
#> [46] fs_1.5.2 xml2_1.3.3 ellipsis_0.3.2
#> [49] generics_0.1.3 vctrs_0.5.0 tools_4.2.2
#> [52] bit64_4.0.5 R.cache_0.16.0 glue_1.6.2
#> [55] hms_1.1.2 parallel_4.2.2 fastmap_1.1.0
#> [58] yaml_2.3.6 timechange_0.1.1 colorspace_2.0-3
#> [61] gargle_1.2.1 rvest_1.0.3 knitr_1.40
#> [64] haven_2.5.1
Created on 2022-11-07 with reprex v2.0.2
CodePudding user response:
Try using the read_csv
instead of open_dataset
library(readr)
data <- read_csv("test_string.csv")
CodePudding user response:
ifelse
works here when all classes are correct (and not double()
); if_else
enforces this already, so we can use either.
data |>
mutate(x = sub(" -.*", "", x)) |>
mutate(
x = ifelse(x == "", NA_character_, x), # also if_else works
x2 = as.numeric(x)
) |>
collect()
# # A tibble: 40 x 3
# x y x2
# <chr> <dbl> <dbl>
# 1 4000 1 4000
# 2 6000 1.23 6000
# 3 NA 1.46 NA
# 4 8000 1.69 8000
# 5 4000 1.92 4000
# 6 6000 2.15 6000
# 7 NA 2.38 NA
# 8 8000 2.62 8000
# 9 4000 2.85 4000
# 10 6000 3.08 6000
# # ... with 30 more rows