Please have a look at the snippet at the end of the file. I am making my first baby steps with arrow and R to deal with files which are too large to be loaded in memory.
I am trying to reproduce the steps here
https://francoismichonneau.net/2022/10/import-big-csv/
which illustrates how to convert a large tsv file to a parquet file without loading it into memory. I know it is bad practice to use a link pointing to your data, but I have no alternative to provide a minimal example which nevertheless fails.
You can download the test.tsv file (a small extract of a much larger file) here
https://mega.nz/file/V9FnhazJ#YXBvpuRVQ9s3nVkWCQEaWn0g9Ul6pmtXlap6X7Zkz1E
In the reprex I have some errors but I do not understand where they come from and they do not appear compatible with the data. Any help is much appreciated!
library(tidyverse)
library(arrow)
#>
#> Attaching package: 'arrow'
#> The following object is masked from 'package:utils':
#>
#> timestamp
data1<- read_tsv("test.tsv") ## data in memory
#> Rows: 10000 Columns: 29
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: "\t"
#> chr (27): AID_MEASURE_ID, DATE_CREATED, DATE_GRANTED, AA_PUBLISHED_DATE, SER...
#> dbl (2): GRANTED_AMOUNT_FROM_EUR, NOMINAL_AMOUNT_EUR_FROM
#>
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data1
#> # A tibble: 10,000 × 29
#> AID_MEASURE_ID DATE_…¹ DATE_…² AA_PU…³ SERVE…⁴ AM_TI…⁵ AM_TI…⁶ STATUS AM_PR…⁷
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 SA.41416 24/11/… 18/08/… 09/12/… TM-100… NGA Sc… NGA Sc… Publi… N
#> 2 SA.41416 24/11/… 18/08/… 09/12/… TM-100… NGA Sc… NGA Sc… Publi… N
#> 3 SA.41416 24/11/… 18/08/… 09/12/… TM-100… NGA Sc… NGA Sc… Publi… N
#> 4 SA.41416 24/11/… 18/08/… 09/12/… TM-100… NGA Sc… NGA Sc… Publi… N
#> 5 SA.41416 25/11/… 25/08/… 09/12/… TM-100… NGA Sc… NGA Sc… Publi… N
#> 6 SA.41416 25/11/… 29/08/… 09/12/… TM-100… NGA Sc… NGA Sc… Publi… N
#> 7 SA.45235 02/11/… 27/10/… 02/11/… TM-100… 2014–2… Measur… Publi… X
#> 8 SA.45235 02/11/… 27/10/… 02/11/… TM-100… 2014–2… Measur… Publi… X
#> 9 SA.33193 08/11/… 27/10/… 11/01/… TM-100… R&D&I-… R & D … Publi… N
#> 10 SA.42321 16/11/… 01/11/… 01/06/… TM-100… POTENC… POTENT… Publi… X
#> # … with 9,990 more rows, 20 more variables: COFINANCE <chr>, OBJECTIVE <chr>,
#> # OTHER_OBJECTIVE_EN <chr>, AID_INSTRUMENT <chr>,
#> # OTHER_AID_INSTRUMENT_EN <chr>, BENEFICIARY_NAME <chr>,
#> # BENEFICIARY_NAME_ENGLISH <chr>, BENEFICIARY_NATIONAL_ID <chr>,
#> # BENEFICIARY_NAT_ID_TYPE_SD <chr>, BENEFICIARY_TYPE_SD <chr>,
#> # COUNTRY_SD <chr>, REGION_SD <chr>, SECTOR_SD <chr>,
#> # GRANTED_AMOUNT_FROM_EUR <dbl>, NOMINAL_AMOUNT_EUR_FROM <dbl>, …
#> # ℹ Use `print(n = ...)` to see more rows, and `colnames()` to see all variable names
data2 <- open_dataset("test.tsv", format="tsv")
data2
#> FileSystemDataset with 1 csv file
#> AID_MEASURE_ID: string
#> DATE_CREATED: string
#> DATE_GRANTED: string
#> AA_PUBLISHED_DATE: string
#> SERVER_REF: string
#> AM_TITLE: string
#> AM_TITLE_EN: string
#> STATUS: string
#> AM_PROC_TYPE_CD: string
#> COFINANCE: string
#> OBJECTIVE: string
#> OTHER_OBJECTIVE_EN: null
#> AID_INSTRUMENT: string
#> OTHER_AID_INSTRUMENT_EN: string
#> BENEFICIARY_NAME: string
#> BENEFICIARY_NAME_ENGLISH: string
#> BENEFICIARY_NATIONAL_ID: string
#> BENEFICIARY_NAT_ID_TYPE_SD: string
#> BENEFICIARY_TYPE_SD: string
#> COUNTRY_SD: string
#> REGION_SD: string
#> SECTOR_SD: string
#> GRANTED_AMOUNT_FROM_EUR: double
#> NOMINAL_AMOUNT_EUR_FROM: double
#> GRANT_RANGE: string
#> GRANTING_AUTHORITY_NAME: string
#> GRANTING_AUTHORITY_NAME_EN: string
#> NUTS_CD: string
#> GRANTING_AUTHORITY_COUNTRY: string
data3 <- read_tsv_arrow("test.tsv",as_data_frame=F)
##This works
write_dataset(
data3,
format = "parquet",
path = ".",
max_rows_per_file = 1e7
)
print("Done with the first parquet file")
#> [1] "Done with the first parquet file"
## but this does not...why?
write_dataset(
data2,
format = "parquet",
path = ".",
max_rows_per_file = 1e7
)
#> Error: Invalid: In CSV column #11: Row #9107: CSV conversion error to null: invalid value 'SECURITY ARRANGEMENTS'
###Apparently there is an error with the string "Security Arrangements" on line
### 9107. Let us see
data_error <- data1[9107, ] |>
glimpse()
#> Rows: 1
#> Columns: 29
#> $ AID_MEASURE_ID <chr> "SA.33193"
#> $ DATE_CREATED <chr> "27/09/17"
#> $ DATE_GRANTED <chr> "14/09/17"
#> $ AA_PUBLISHED_DATE <chr> "27/09/17"
#> $ SERVER_REF <chr> "TM-10080819"
#> $ AM_TITLE <chr> "R&D&I-scheme Flanders. Prolongation of an …
#> $ AM_TITLE_EN <chr> "R & D & I-scheme Flanders. Extension of an…
#> $ STATUS <chr> "Published"
#> $ AM_PROC_TYPE_CD <chr> "N"
#> $ COFINANCE <chr> "No"
#> $ OBJECTIVE <chr> "Research and development"
#> $ OTHER_OBJECTIVE_EN <chr> NA
#> $ AID_INSTRUMENT <chr> "Direct grant"
#> $ OTHER_AID_INSTRUMENT_EN <chr> NA
#> $ BENEFICIARY_NAME <chr> "INDAVER"
#> $ BENEFICIARY_NAME_ENGLISH <chr> "INDAVER"
#> $ BENEFICIARY_NATIONAL_ID <chr> "0427.973.304"
#> $ BENEFICIARY_NAT_ID_TYPE_SD <chr> "KBO/BCE"
#> $ BENEFICIARY_TYPE_SD <chr> "Only large enterprises"
#> $ COUNTRY_SD <chr> "Belgium"
#> $ REGION_SD <chr> NA
#> $ SECTOR_SD <chr> "E.38.21-Treatment and disposal of non-haza…
#> $ GRANTED_AMOUNT_FROM_EUR <dbl> 644531
#> $ NOMINAL_AMOUNT_EUR_FROM <dbl> NA
#> $ GRANT_RANGE <chr> "Not Range"
#> $ GRANTING_AUTHORITY_NAME <chr> "VLAIO"
#> $ GRANTING_AUTHORITY_NAME_EN <chr> "VLAIO"
#> $ NUTS_CD <chr> NA
#> $ GRANTING_AUTHORITY_COUNTRY <chr> "Belgium"
### but I see nothing about that text on that line
sessionInfo()
#> R version 4.2.1 (2022-06-23)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Debian GNU/Linux 11 (bullseye)
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
#>
#> locale:
#> [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
#> [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
#> [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] arrow_9.0.0.2 forcats_0.5.1 stringr_1.4.0 dplyr_1.0.9
#> [5] purrr_0.3.4 readr_2.1.2 tidyr_1.2.0 tibble_3.1.8
#> [9] ggplot2_3.3.6 tidyverse_1.3.1
#>
#> loaded via a namespace (and not attached):
#> [1] lubridate_1.8.0 assertthat_0.2.1 digest_0.6.29 utf8_1.2.2
#> [5] R6_2.5.1 cellranger_1.1.0 backports_1.4.1 reprex_2.0.1
#> [9] evaluate_0.15 httr_1.4.3 highr_0.9 pillar_1.8.0
#> [13] rlang_1.0.4 readxl_1.4.0 R.utils_2.12.0 R.oo_1.25.0
#> [17] rmarkdown_2.14 styler_1.7.0 bit_4.0.4 munsell_0.5.0
#> [21] broom_1.0.0 compiler_4.2.1 modelr_0.1.8 xfun_0.31
#> [25] pkgconfig_2.0.3 htmltools_0.5.2 tidyselect_1.1.2 fansi_1.0.3
#> [29] crayon_1.5.1 tzdb_0.3.0 dbplyr_2.2.0 withr_2.5.0
#> [33] R.methodsS3_1.8.2 grid_4.2.1 jsonlite_1.8.0 gtable_0.3.0
#> [37] lifecycle_1.0.1 DBI_1.1.3 magrittr_2.0.3 scales_1.2.0
#> [41] vroom_1.5.7 cli_3.3.0 stringi_1.7.8 fs_1.5.2
#> [45] xml2_1.3.3 ellipsis_0.3.2 generics_0.1.3 vctrs_0.4.1
#> [49] tools_4.2.1 bit64_4.0.5 R.cache_0.16.0 glue_1.6.2
#> [53] hms_1.1.1 parallel_4.2.1 fastmap_1.1.0 yaml_2.3.5
#> [57] colorspace_2.0-3 rvest_1.0.2 knitr_1.39 haven_2.5.0
Created on 2022-10-21 by the reprex package (v2.0.1)
CodePudding user response:
When you import data without specifying the data types (the schema), they are being inferred from your file. read_tsv_arrow()
and open_dataset()
use different strategies to do so:
read_tsv_arrow()
does it by looking at the full datasetopen_dataset()
does it by scanning the first few rows of the dataset
The error message you get tells you that the parser cannot convert to null
something that looks like text.
The schema reported by open_dataset()
tells you that the only column with the null
type is OTHER_OBJECTIVE_EN
. If you look at the schema reported by read_tsv_arrow()
, you will see that the OTHER_OBJECTIVE_EN
is a string
. This difference points you to what the error message suggests. There is actually text data in that column but it is further down from what the parser looks at to infer the data types. You cannot rely on the default behavior of open_dataset()
in your case and you need to specify the schema manually:
data <- open_dataset("test.tsv",
format = "tsv",
skip_rows = 1,
schema = schema(
AID_MEASURE_ID = string(),
DATE_CREATED = string(),
DATE_GRANTED = string(),
AA_PUBLISHED_DATE = string(),
SERVER_REF = string(),
AM_TITLE = string(),
AM_TITLE_EN = string(),
STATUS = string(),
AM_PROC_TYPE_CD = string(),
COFINANCE = string(),
OBJECTIVE = string(),
OTHER_OBJECTIVE_EN = string(),
AID_INSTRUMENT = string(),
OTHER_AID_INSTRUMENT_EN = string(),
BENEFICIARY_NAME = string(),
BENEFICIARY_NAME_ENGLISH = string(),
BENEFICIARY_NATIONAL_ID = string(),
BENEFICIARY_NAT_ID_TYPE_SD = string(),
BENEFICIARY_TYPE_SD = string(),
COUNTRY_SD = string(),
REGION_SD = string(),
SECTOR_SD = string(),
GRANTED_AMOUNT_FROM_EUR = double(),
NOMINAL_AMOUNT_EUR_FROM = double(),
GRANT_RANGE = string(),
GRANTING_AUTHORITY_NAME = string(),
GRANTING_AUTHORITY_NAME_EN = string(),
NUTS_CD = string(),
GRANTING_AUTHORITY_COUNTRY = string()
)
)
write_dataset(
data,
format = "parquet",
path = ".",
max_rows_per_file = 1e7
)
Note that when specifying the schema with open_dataset()
you need to skip the first row which contains the column names.
In the error message, the row number with the incorrect data type is off by one because the header is included in the count.